├── README.md
├── checkpoints
│   ├── Pointer Generator
│   │   ├── Low
│   │   │   └── model_best.pth
│   │   └── Medium
│   │       └── model_best.pth
│   ├── Transformer
│   │   ├── Low
│   │   │   └── model_best.pth
│   │   └── Medium
│   │       └── model_best.pth
│   └── english-low
│       └── model_best.pth
├── cover.py
├── data.py
├── data
│   ├── arabic-covered-test
│   ├── arabic-dev
│   ├── arabic-test
│   ├── arabic-train-high
│   ├── arabic-train-low
│   ├── english-covered-dev
│   ├── english-covered-test
│   ├── english-dev
│   ├── english-dev-xx
│   ├── english-test
│   ├── english-test-pred-low
│   ├── english-train-high
│   ├── english-train-low
│   ├── english-train-medium
│   ├── english-train-medium.aug
│   ├── english-train-mini
│   ├── french-covered-dev
│   ├── french-covered-test
│   ├── french-dev
│   ├── french-test
│   ├── french-train-high
│   ├── french-train-low
│   ├── french-train-medium
│   ├── hebrew-covered-test
│   ├── hebrew-dev
│   ├── hebrew-test
│   ├── hebrew-train-high
│   ├── hebrew-train-low
│   ├── hebrew-train-medium
│   ├── irish-covered-dev
│   ├── irish-covered-test
│   ├── irish-dev
│   ├── irish-test
│   ├── irish-train-high
│   ├── irish-train-low
│   ├── irish-train-medium
│   ├── italian-covered-dev
│   ├── italian-covered-test
│   ├── italian-dev
│   ├── italian-test
│   ├── italian-train-high
│   ├── italian-train-low
│   ├── italian-train-medium
│   ├── middle-french-covered-test
│   ├── middle-french-dev
│   ├── middle-french-test
│   ├── middle-french-train-high
│   ├── middle-french-train-low
│   ├── middle-french-train-medium
│   ├── middle-high-german-covered-test
│   ├── middle-high-german-dev
│   ├── middle-high-german-test
│   ├── middle-high-german-train-low
│   ├── middle-high-german-train-medium
│   ├── old-french-dev
│   ├── old-french-test
│   ├── old-french-train-high
│   ├── old-french-train-low
│   ├── old-french-train-medium
│   ├── scottish-gaelic-covered-test
│   ├── scottish-gaelic-dev
│   ├── scottish-gaelic-test
│   ├── scottish-gaelic-train-low
│   ├── scottish-gaelic-train-medium
│   ├── spanish-covered-dev
│   ├── spanish-covered-test
│   ├── spanish-dev
│   ├── spanish-test
│   ├── spanish-train-high
│   ├── spanish-train-low
│   └── spanish-train-medium
├── dataset.py
├── decoder.py
├── evaluate.py
├── generate.py
├── hyperparameter_search.py
├── model_utils.py
├── pointer_generator.py
├── run_models.py
├── tokenizer.py
├── train-original.py
├── train.py
├── transformer.py
├── transformer_baseline-original.py
├── transformer_baseline.py
├── utils.py
└── vocabulary.py
/README.md:
--------------------------------------------------------------------------------
1 | # Pointer-Generator-Transformer-Inflection-2019
2 | Transformer and pointer-generator transformer models for the morphological inflection task
3 | 
4 | **MEDIUM RESOURCE TRAINING FILE - ENGLISH EXAMPLE**
5 | 
6 | *Data augmentation for dataset* - python augment.py --src "data/english-train-medium" --out "data/english-train-medium-aug"
7 | 
8 | *Create vocabulary for dataset* - python vocabulary.py --src "data/english-train-medium" --vocab "data/english-train-medium-vocab"
9 | 
10 | *Train model* - python train.py \
11 | --train "data/english-train-medium" --dev "data/english-dev" --vocab "data/english-train-medium-vocab" --checkpoints-dir "checkpoints" \
12 | --batch-size 128 --epochs 200 --eval-every 1 --resume True \
13 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
14 | --lr 0.001 --beta2 0.98 \
15 | --scheduler warmupinvsqr --patience 10 --min-lr 1e-5 --warmup-steps 4000
16 | 
17 | *Generate predictions with model* - python generate.py \
18 | --model-checkpoint "checkpoints/model_best.pth" \
19 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
20 | --test "data/english-covered-test" \
21 | --vocab "data/english-train-medium-vocab" \
22 | --pred "data/english-test-pred-medium"
23 | 
24 | *Compute accuracy of test set predictions* - python evaluate.py \
25 | --pred "data/english-test-pred-medium" --target "data/english-test"
26 | 
27 | 
28 | **LOW RESOURCE TRAINING FILE - ENGLISH EXAMPLE**
29 | 
30 | *Data augmentation for dataset* - python augment.py --src "data/english-train-low" --out "data/english-train-low-aug"
31 | 
32 | *Create vocabulary for dataset* - python vocabulary.py --src "data/english-train-low" --vocab "data/english-train-low-vocab"
33 | 
34 | *Train model* - python train.py \
35 | --train "data/english-train-low" --dev "data/english-dev" --vocab "data/english-train-low-vocab" --checkpoints-dir "checkpoints" \
36 | --batch-size 128 --epochs 200 --eval-every 1 --resume True \
37 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
38 | --lr 0.001 --beta2 0.98 \
39 | --scheduler warmupinvsqr --patience 10 --min-lr 1e-5 --warmup-steps 4000
40 | 
41 | *Generate predictions with model* - python generate.py \
42 | --model-checkpoint "checkpoints/model_best.pth" \
43 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
44 | --test "data/english-covered-test" \
45 | --vocab "data/english-train-low-vocab" \
46 | --pred "data/english-test-pred-low"
47 | 
48 | *Compute accuracy of test set predictions* - python evaluate.py \
49 | --pred "data/english-test-pred-low" --target "data/english-test"
50 | 
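The accuracy step is, in effect, exact-match accuracy over the predicted target forms. The snippet below is a minimal sketch of that computation, assuming the prediction file uses the same tab-separated lemma / target / features layout as the reference file; the helper names are hypothetical and this is not the repository's evaluate.py.

```python
# Hypothetical sketch (not the repository's evaluate.py): exact-match accuracy
# between a prediction file and a reference file, both assumed to use the
# tab-separated "lemma<TAB>target<TAB>features" layout of the files in data/.

def read_targets(path):
    """Return the target column (second field) of every non-empty line."""
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n").split("\t")[1] for line in f if line.strip()]

def exact_match_accuracy(pred_path, target_path):
    preds = read_targets(pred_path)
    golds = read_targets(target_path)
    assert len(preds) == len(golds), "prediction/reference line counts differ"
    return sum(p == g for p, g in zip(preds, golds)) / len(golds)

# Example (paths as in the README):
# exact_match_accuracy("data/english-test-pred-medium", "data/english-test")
```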
"data/english-train-medium-vocab" \ 22 | --pred "data/english-test-pred-medium" 23 | 24 | *Compute accuracy of test set predictions* - python evaluate.py \ 25 | --pred "data/english-test-pred-medium" --target "data/english-test" 26 | 27 | 28 | **LOW RESOURCE TRAINING FILE - ENGLISH EXAMPLE** 29 | 30 | *Data augmentation for dataset* - python augment.py --src "data/english-train-low" --out "data/english-train-low-aug" 31 | 32 | *Create vocabulary for dataset* - python vocabulary.py --src "data/english-train-low" --vocab "data/english-train-low-vocab" 33 | 34 | *Train model* - python train.py \ 35 | --train "data/english-train-low" --dev "data/english-dev" --vocab "data/english-train-low-vocab" checkpoints-dir "checkpoints" \ 36 | --batch-size 128 --epochs 200 --eval-every 1 --resume True \ 37 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \ 38 | --lr 0.001 --beta2 0.98 \ 39 | --scheduler warmupinvsqr --patience 10 --min-lr 1e-5 --warmup-steps 4000 40 | 41 | *Generate Predictions with model* - python generate.py \ 42 | --model-checkpoint "checkpoints/model_best.pth" \ 43 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \ 44 | --test "data/english-covered-test" \ 45 | --vocab "data/english-train-low-vocab" \ 46 | --pred "data/english-test-pred-low" 47 | 48 | *Compute accuracy of test set predictions* - python evaluate.py \ 49 | --pred "data/english-test-pred-low" --target "data/english-test" 50 | -------------------------------------------------------------------------------- /checkpoints/Pointer Generator/Low/model_best.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/234f578af9cd475d2e6adaf87c6b585e60a9e5e9/checkpoints/Pointer Generator/Low/model_best.pth -------------------------------------------------------------------------------- /checkpoints/Pointer Generator/Medium/model_best.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/234f578af9cd475d2e6adaf87c6b585e60a9e5e9/checkpoints/Pointer Generator/Medium/model_best.pth -------------------------------------------------------------------------------- /checkpoints/Transformer/Low/model_best.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/234f578af9cd475d2e6adaf87c6b585e60a9e5e9/checkpoints/Transformer/Low/model_best.pth -------------------------------------------------------------------------------- /checkpoints/Transformer/Medium/model_best.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/234f578af9cd475d2e6adaf87c6b585e60a9e5e9/checkpoints/Transformer/Medium/model_best.pth -------------------------------------------------------------------------------- /checkpoints/english-low/model_best.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/234f578af9cd475d2e6adaf87c6b585e60a9e5e9/checkpoints/english-low/model_best.pth -------------------------------------------------------------------------------- /cover.py: 
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | 
4 | import data
5 | 
6 | parser = argparse.ArgumentParser(description='Reads conll dev file, and covers the target (test file format)')
7 | parser.add_argument('--data-dir', type=str, default='train',
8 |                     help="Folder of the dataset file")
9 | parser.add_argument('--lang', type=str, default='train',
10 |                     help="Language of the dataset file")
11 | args = parser.parse_args()
12 | 
13 | 
14 | def cover_file(data_dir, language):
15 |     dev_file_path = os.path.join(data_dir, f"{language}.dev")
16 |     covered_dev_file_path = os.path.join(data_dir, f"{language}.covered_dev")
17 |     covered_dev_file = open(covered_dev_file_path, "w", encoding='utf-8')
18 |     dev_morph_list = data.read_morph_file(dev_file_path)
19 |     for lemma, target, feature in dev_morph_list:
20 |         covered_dev_file.write(f"{lemma}\t{feature}\n")
21 |     covered_dev_file.close()
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     # Cover the dev file (keep lemma and features, drop the target)
26 |     cover_file(args.data_dir, args.lang)
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | import re
2 | import tokenizer
3 | 
4 | """Allows reading of conll files. Reads a conll file, and 1) splits it into the components of each example,
5 | and 2) separates it into input tokens and target tokens. conll: lemma-tab-target-tab-features """
6 | 
7 | 
8 | """ READING FILES """
9 | def read_morph_file(morph_file_path):
10 |     """ Reads a conll file, splits it into lines, and splits each line by tabs. Returns a list of lists"""
11 |     # Get all lines in file
12 |     morph_file = open(morph_file_path, 'r', encoding='utf-8')
13 |     lines = morph_file.readlines()
14 |     outputs = []
15 |     # Separate lines to proper format
16 |     for line in lines:
17 |         if line != "\n":
18 |             # Strip '\n' and split
19 |             outputs.append(line.replace("\n", "").split("\t"))
20 |     morph_file.close()
21 |     return outputs
22 | 
23 | 
24 | def clean_word(word):
25 |     """ Strips word of unnecessary symbols """
26 |     word = re.sub("[!@#$']", '', word)
27 |     return word.lower()
28 | 
29 | 
30 | def read_train_file(train_file_path):
31 |     """ Reads a conll train file, and splits it into lists of input lemmas, target forms and input features"""
32 |     lemmas = []
33 |     targets = []
34 |     features = []
35 |     train_morph_list = read_morph_file(train_file_path)
36 | 
37 |     for lemma, target, feature in train_morph_list:
38 |         # Add results to relevant lists
39 |         lemmas.append(lemma)
40 |         features.append(feature)
41 |         targets.append(target)
42 | 
43 |     return lemmas, targets, features
44 | 
45 | 
46 | def read_test_file(test_input_file):
47 |     """ Reads a conll test file, and splits it into lists of input lemmas and input features"""
48 |     lemmas = []
49 |     features = []
50 |     test_morph_list = read_morph_file(test_input_file)
51 | 
52 |     for lemma, feature in test_morph_list:
53 |         # Add results to relevant lists
54 |         lemmas.append(lemma)
55 |         features.append(feature)
56 | 
57 |     return lemmas, features
58 | 
59 | 
60 | def read_train_file_tokens(train_file_path):
61 |     """ Reads a conll train file, and splits it into input tokens and target tokens.
62 | Each input and target is a list of tokens""" 63 | lemmas, targets, features = read_train_file(train_file_path) 64 | # tokenize all three lists, get as list of tokens lists 65 | lemmas_tokens = tokenizer.tokenize_words(lemmas) 66 | targets_tokens = tokenizer.tokenize_words(targets) 67 | features_tokens = tokenizer.tokenize_features(features) 68 | # concatenate feature tokens to lemma tokens 69 | input_tokens = [lemma_tokens + feature_tokens for lemma_tokens, feature_tokens in 70 | zip(lemmas_tokens, features_tokens)] 71 | return input_tokens, targets_tokens 72 | 73 | 74 | def read_test_file_tokens(test_file_path): 75 | """ Reads conll test file. Each input is a list of tokens""" 76 | lemmas, features = read_test_file(test_file_path) 77 | # tokenize all two lists, get as list of tokens lists 78 | lemmas_tokens = tokenizer.tokenize_words(lemmas) 79 | features_tokens = tokenizer.tokenize_features(features) 80 | # concatenate feature tokens to lemma tokens 81 | input_tokens = [lemma_tokens + feature_tokens for lemma_tokens, feature_tokens in 82 | zip(lemmas_tokens, features_tokens)] 83 | return input_tokens 84 | 85 | 86 | """ WRITING FILES """ 87 | def write_morph_file(lemmas, targets, features, out_file_path): 88 | """ Writes tokens as conll file """ 89 | # Get all lines in file 90 | out_file = open(out_file_path, 'w', encoding='utf-8') 91 | for lemma, target, feature in zip(lemmas, targets, features): 92 | out_file.write("%s\t%s\t%s\n" % (lemma, target, feature)) 93 | out_file.close() 94 | -------------------------------------------------------------------------------- /data/arabic-train-low: -------------------------------------------------------------------------------- 1 | مَجَلَّةٌ الْمَجَلَّتَيْنِ N;DU;DEF;GEN 2 | مُسْتَشْفًى مُسْتَشْفَيَاتِ N;PL;PSSD;GEN 3 | قَالَ يُقَلْنَ V;3;PL;FEM;LGSPEC1;PASS 4 | جِيلٌ جِيلَيْنِ N;DU;NDEF;ACC 5 | أَفْلَحَ أَفْلَحُوا V;3;PL;MASC;PST;PRF;IND;ACT 6 | كَالَمَ أُكَالَمُ V;1;SG;IPFV;IND;PASS 7 | الْأَجْوَد أَجْوَدُ ADJ;SG;MASC;NDEF;NOM 8 | اِنْتَفَضَ يَنْتَفِضُو V;3;PL;MASC;SBJV;ACT 9 | قَمْحَةٌ الْقَمْحَتَانِ N;DU;DEF;NOM 10 | كَتِفٌ كَتِف N;SG;NDEF;INFM 11 | اِرْتَجَى تُرْتَجَيَانِ V;2;DU;IPFV;IND;PASS 12 | مَطْعَمٌ مَطْعَمَ N;SG;PSSD;ACC 13 | هُلُوكٌ هُلُوكٍ N;SG;NDEF;GEN 14 | عَاكَسَ يُعَاكَسْنَ V;3;PL;FEM;IPFV;IND;PASS 15 | اِسْتَنْبَأَ تَسْتَنْبِئَا V;3;DU;FEM;LGSPEC1;ACT 16 | أَنْبَأَ يُنْبَأُ V;3;SG;MASC;IPFV;IND;PASS 17 | اِشْتَرَكَ تَشْتَرِكُونَ V;2;PL;MASC;IPFV;IND;ACT 18 | اِحْمَرَّ يَحْمَرِرْنَ V;3;PL;FEM;SBJV;ACT 19 | نَاوَى تُنَاوِيَ V;3;SG;FEM;SBJV;ACT 20 | الثَّانَوِي الثَّانَوِيَّيْنِ ADJ;DU;MASC;DEF;ACC 21 | اِنْتَهَى اِنْتَهَيْتُنَّ V;2;PL;FEM;PST;PRF;IND;ACT 22 | النَّظِيف النَّظِيف ADJ;SG;MASC;DEF;INFM 23 | نُقْصَانٌ النُّقْصَانِ N;SG;DEF;GEN 24 | ضَلَّ تُضَلُّ V;2;SG;MASC;IPFV;IND;PASS 25 | تَرْسِيمٌ تَرْسِيمَ N;SG;PSSD;ACC 26 | قُنْبُلَةٌ الْقُنْبُلَةُ N;SG;DEF;NOM 27 | كَنِيسٌ الْكَنِيسُ N;SG;DEF;NOM 28 | غَرَّ غُرَّ V;2;SG;MASC;IMP;ACT 29 | وَزِيرٌ وَزِيرَيْن N;DU;NDEF;INFM 30 | أَمْكَنَ أَمْكَنَا V;3;DU;MASC;PST;PRF;IND;ACT 31 | اِسْتِضَاءَةٌ اِسْتِضَاءَةً N;SG;NDEF;ACC 32 | نِظَامُ تَشْغِيلٍ نِظَامِ تَشْغِيلِ N;SG;PSSD;GEN 33 | اِسْتَذْنَبَ يَسْتَذْنِبَا V;3;DU;MASC;LGSPEC1;ACT 34 | اِفْتَعَلَ تَفْتَعِلُونَ V;2;PL;MASC;IPFV;IND;ACT 35 | اِبْتَسَمَ يُبْتَسَمَ V;3;SG;MASC;SBJV;PASS 36 | خُلُودٌ خُلُودُ N;SG;PSSD;NOM 37 | اِسْتَنْفَرَ نَسْتَنْفِرُ V;1;PL;IPFV;IND;ACT 38 | تَمْتَمَةٌ تَمْتَمَةِ N;SG;PSSD;GEN 39 | السَّادِس السَّادِسَيْن ADJ;DU;MASC;DEF;INFM 40 | شَلَّالٌ شَلَّالَاتٍ N;PL;NDEF;ACC 41 | كَارِثَةٌ 
الْكَارِثَتَيْنِ N;DU;DEF;ACC 42 | اِنْفِصَالٌ اِنْفِصَالِ N;SG;PSSD;GEN 43 | اِحْتَفَلَ اِحْتَفَلْتَ V;2;SG;MASC;PST;PRF;IND;ACT 44 | اِحْقَوْقَفَ اِحْقَوْقَفْتُمْ V;2;PL;MASC;PST;PRF;IND;ACT 45 | عَذَّبَ يُعَذِّبَانِ V;3;DU;MASC;IPFV;IND;ACT 46 | الطَّنَّان طَنَّانَاتٍ ADJ;PL;FEM;NDEF;GEN 47 | أُمُّ قِرَاءَةٍ أُمَّهَاتِ قِرَاءَةِ N;PL;PSSD;ACC 48 | الصَّدِئ الصَّدِئَتَانِ ADJ;DU;FEM;DEF;NOM 49 | قُونَةٌ الْقُونَتَيْن N;DU;DEF;INFM 50 | نِمْسَاوِيَّةٌ نِمْسَاوِيَّتَيْن N;DU;NDEF;INFM 51 | حِرْفَةٌ الْحِرْفَةُ N;SG;DEF;NOM 52 | إِبْصَارٌ إِبْصَار N;SG;PSSD;INFM 53 | الْأَخِير أَخِير ADJ;SG;MASC;NDEF;INFM 54 | صَوَّبَ صُوِّبْتَ V;2;SG;MASC;PST;PRF;IND;PASS 55 | طَهَّرَ طُهِّرْنَ V;3;PL;FEM;PST;PRF;IND;PASS 56 | مِرْصَادٌ مَرَاصِيد N;PL;NDEF;INFM 57 | رَغِبَ نَرْغَبَ V;1;PL;SBJV;ACT 58 | سَاخِرَةٌ السَّاخِرَةِ N;SG;DEF;GEN 59 | كَاتِدْرَائِيَّةٌ كَاتِدْرَائِيَّتَا N;DU;PSSD;NOM 60 | اِمْتَازَ يَمْتَازَا V;3;DU;MASC;SBJV;ACT 61 | اِسْتَعْمَلَ يَسْتَعْمِلَ V;3;SG;MASC;SBJV;ACT 62 | اِعْتِيَادٌ اِعْتِيَادَاتِ N;PL;PSSD;GEN 63 | دُفُوقٌ دُفُوقٌ N;SG;NDEF;NOM 64 | أَضَاعَ تُضِيعَ V;2;SG;MASC;SBJV;ACT 65 | حَرَّرَ أُحَرِّرُ V;1;SG;IPFV;IND;ACT 66 | الْقُرَيْشِي الْقُرَيْشِيَّات ADJ;PL;FEM;DEF;INFM 67 | سَمَّى يُسَمَّيْنَ V;3;PL;FEM;LGSPEC1;PASS 68 | قُونَةٌ قُونَةٌ N;SG;NDEF;NOM 69 | الْعُمَانِي الْعُمَانِيَّتَيْنِ ADJ;DU;FEM;DEF;GEN 70 | أُسْتُرَالِيَّةٌ أُسْتُرَالِيَّةٌ N;SG;NDEF;NOM 71 | تَكْرِيهٌ تَكْرِيهًا N;SG;NDEF;ACC 72 | شَيْطَانٌ شَيْطَانٍ N;SG;NDEF;GEN 73 | الْمُمْكِن الْمُمْكِنَ ADJ;SG;MASC;DEF;ACC 74 | اِخْرَنْمَسَ نَخْرَنْمِسَ V;1;PL;SBJV;ACT 75 | صَحَّى تُصَحَّيْنَ V;2;PL;FEM;SBJV;PASS 76 | اِنْضَمَّ تَنْضَمُّو V;2;PL;MASC;LGSPEC1;ACT 77 | الْمُنْفَرِد الْمُنْفَرِدَاتُ ADJ;PL;FEM;DEF;NOM 78 | زُوَارَةٌ زُوَارَة N;SG;NDEF;INFM 79 | الزَّاهِر زَاهِرَيْنِ ADJ;DU;MASC;NDEF;ACC 80 | قَمْحٌ الْقَمْحِ N;SG;DEF;GEN 81 | الْمَلَائِكِي مَلَائِكِيِّينَ ADJ;PL;MASC;NDEF;ACC 82 | الْعِمْلَاقِي الْعِمْلَاقِيَّةِ ADJ;SG;FEM;DEF;GEN 83 | مُحِبٌّ مُحِبَّيْ N;DU;PSSD;INFM 84 | جِلْدٌ جِلْدَيْن N;DU;NDEF;INFM 85 | مُخٌّ الْمُخَّيْنِ N;DU;DEF;ACC 86 | أَنْصَفَ تُنْصِفْنَ V;2;PL;FEM;LGSPEC1;ACT 87 | مُثَابَرَةٌ مُثَابَرَةٍ N;SG;NDEF;GEN 88 | اِبْلَاجَّ تَبْلَاجُّونَ V;2;PL;MASC;IPFV;IND;ACT 89 | عَاقَبَ تُعَاقِبَا V;3;DU;FEM;LGSPEC1;ACT 90 | بُلْبُلٌ الْبُلْبُلَ N;SG;DEF;ACC 91 | اِبْتِعَادٌ الِابْتِعَاد N;SG;DEF;INFM 92 | نَمَا تَنْمُوَ V;2;SG;MASC;SBJV;ACT 93 | شِمْبَانْزِي الشِّمْبَانْزِي N;SG;DEF;ACC 94 | اِسْتَعْرَفَ تَسْتَعْرِفُ V;3;SG;FEM;IPFV;IND;ACT 95 | اِحْمَارَّ نَحْمَارَّ V;1;PL;LGSPEC1;ACT 96 | اِسْتِئْذَانٌ الِاسْتِئْذَانُ N;SG;DEF;NOM 97 | خُصْيَةٌ خُصًى N;PL;NDEF;GEN 98 | مَطَارٌ مَطَارَات N;PL;PSSD;INFM 99 | اِسْتَوْلَى يَسْتَوْلِي V;3;SG;MASC;IPFV;IND;ACT 100 | اِسْتَبْدَلَ اِسْتَبْدَلَتْ V;3;SG;FEM;PST;PRF;IND;ACT 101 | -------------------------------------------------------------------------------- /data/english-train-low: -------------------------------------------------------------------------------- 1 | dreep dreep V;NFIN 2 | charcoal charcoal V;NFIN 3 | stodge stodges V;3;SG;PRS 4 | biotransform biotransform V;NFIN 5 | disallow disallowing V;V.PTCP;PRS 6 | precut precut V;V.PTCP;PST 7 | outmanœuvre outmanœuvred V;PST 8 | unsnib unsnibbing V;V.PTCP;PRS 9 | Afghanize Afghanized V;PST 10 | redescribe redescribes V;3;SG;PRS 11 | overspeculate overspeculates V;3;SG;PRS 12 | reënter reënters V;3;SG;PRS 13 | waller wallering V;V.PTCP;PRS 14 | carboxylate carboxylating V;V.PTCP;PRS 15 | imprison imprisoned V;PST 16 | helicopt helicopted V;PST 17 | tut tutted V;V.PTCP;PST 18 | misdoom 
misdooms V;3;SG;PRS 19 | mush mush V;NFIN 20 | billhook billhook V;NFIN 21 | ingrave ingraved V;PST 22 | estheticize estheticize V;NFIN 23 | off-split off-split V;PST 24 | excecate excecating V;V.PTCP;PRS 25 | hegemonise hegemonised V;V.PTCP;PST 26 | overregularize overregularized V;PST 27 | innoculate innoculates V;3;SG;PRS 28 | mopy mopying V;V.PTCP;PRS 29 | unhyphenate unhyphenated V;PST 30 | acetize acetizes V;3;SG;PRS 31 | outjuggle outjuggled V;V.PTCP;PST 32 | reexplain reexplain V;NFIN 33 | homestead homestead V;NFIN 34 | pregame pregamed V;V.PTCP;PST 35 | carabine carabined V;PST 36 | deauthorise deauthorising V;V.PTCP;PRS 37 | sqush squshed V;V.PTCP;PST 38 | gang-rape gang-rapes V;3;SG;PRS 39 | transfect transfect V;NFIN 40 | bemar bemarred V;V.PTCP;PST 41 | intrunk intrunk V;NFIN 42 | atray atrayed V;PST 43 | conculcate conculcated V;PST 44 | jettison jettisoning V;V.PTCP;PRS 45 | equidistribute equidistributes V;3;SG;PRS 46 | postdict postdicted V;V.PTCP;PST 47 | fine fined V;PST 48 | autohide autohides V;3;SG;PRS 49 | nourish nourished V;V.PTCP;PST 50 | jack jacks V;3;SG;PRS 51 | superrefine superrefined V;V.PTCP;PST 52 | begrasp begrasping V;V.PTCP;PRS 53 | autostart autostarted V;V.PTCP;PST 54 | rearrive rearrived V;PST 55 | motorcycle motorcycled V;V.PTCP;PST 56 | countersink countersank V;PST 57 | confessionalise confessionalise V;NFIN 58 | underfang underfang V;NFIN 59 | yabby yabbying V;V.PTCP;PRS 60 | betrail betrails V;3;SG;PRS 61 | whish whish V;NFIN 62 | deconceptualise deconceptualising V;V.PTCP;PRS 63 | bedrink bedrank V;PST 64 | effervesce effervesces V;3;SG;PRS 65 | failcascade failcascading V;V.PTCP;PRS 66 | wok woks V;3;SG;PRS 67 | quahog quahog V;NFIN 68 | jarp jarped V;V.PTCP;PST 69 | plum plumming V;V.PTCP;PRS 70 | reclude recluding V;V.PTCP;PRS 71 | handplant handplanted V;V.PTCP;PST 72 | overvolt overvolting V;V.PTCP;PRS 73 | quicksave quicksaving V;V.PTCP;PRS 74 | sandbathe sandbathes V;3;SG;PRS 75 | roister roistering V;V.PTCP;PRS 76 | mull mulls V;3;SG;PRS 77 | upclock upclocking V;V.PTCP;PRS 78 | subgrant subgrant V;NFIN 79 | insense insenses V;3;SG;PRS 80 | embrown embrowned V;PST 81 | sour soured V;V.PTCP;PST 82 | mesmerize mesmerize V;NFIN 83 | spill spills V;3;SG;PRS 84 | misevaluate misevaluated V;V.PTCP;PST 85 | sepose seposed V;V.PTCP;PST 86 | deem deemed V;V.PTCP;PST 87 | complex complex V;NFIN 88 | rehumanize rehumanizing V;V.PTCP;PRS 89 | unbrutalize unbrutalizes V;3;SG;PRS 90 | inform informs V;3;SG;PRS 91 | drool drooling V;V.PTCP;PRS 92 | radicate radicates V;3;SG;PRS 93 | gap gap V;NFIN 94 | addend addending V;V.PTCP;PRS 95 | derivate derivate V;NFIN 96 | misdevelop misdevelop V;NFIN 97 | drivel drivel V;NFIN 98 | prolapse prolapsed V;V.PTCP;PST 99 | instaure instaures V;3;SG;PRS 100 | fute futing V;V.PTCP;PRS 101 | -------------------------------------------------------------------------------- /data/english-train-mini: -------------------------------------------------------------------------------- 1 | dreep dreep V;NFIN 2 | charcoal charcoal V;NFIN 3 | stodge stodges V;3;SG;PRS 4 | biotransform biotransform V;NFIN 5 | disallow disallowing V;V.PTCP;PRS -------------------------------------------------------------------------------- /data/french-train-low: -------------------------------------------------------------------------------- 1 | enchérir enchérisse V;SBJV;PRS;3;SG 2 | retendre retendissions V;SBJV;PST;1;PL 3 | écharper écharperait V;COND;3;SG 4 | échiner échines V;IND;PRS;2;SG 5 | racornir racornira V;IND;FUT;3;SG 6 | 
recommencer en recommençant V.CVB;PRS 7 | dispatcher dispatchais V;IND;PST;1;SG;IPFV 8 | saler en salant V.CVB;PRS 9 | surexciter surexcitiez V;SBJV;PRS;2;PL 10 | picoter picoteront V;IND;FUT;3;PL 11 | souder en soudant V.CVB;PRS 12 | pèleriner pèlerinant V.PTCP;PRS 13 | recommencer recommences V;SBJV;PRS;2;SG 14 | chlorer chlorer V;NFIN 15 | fuir fuissent V;SBJV;PST;3;PL 16 | dépayser dépaysais V;IND;PST;1;SG;IPFV 17 | pénaliser pénalisais V;IND;PST;2;SG;IPFV 18 | retordre retordisse V;SBJV;PST;1;SG 19 | adopter adopteras V;IND;FUT;2;SG 20 | braver bravez V;IND;PRS;2;PL 21 | livrer livrer V;NFIN 22 | occidentaliser occidentalisassent V;SBJV;PST;3;PL 23 | revitaliser revitalisasse V;SBJV;PST;1;SG 24 | praeparer praeparassiez V;SBJV;PST;2;PL 25 | étalonner étalonneras V;IND;FUT;2;SG 26 | randomiser randomisant V.PTCP;PRS 27 | tergiverser tergiversâtes V;IND;PST;2;PL;PFV 28 | gober gobes V;IND;PRS;2;SG 29 | estudier estudia V;IND;PST;1;SG;PFV 30 | travailler travaillassent V;SBJV;PST;3;PL 31 | enrégimenter enrégimentassent V;SBJV;PST;3;PL 32 | résider résidez V;IND;PRS;2;PL 33 | déliter délitai V;IND;PST;1;SG;PFV 34 | reformer reformer V;NFIN 35 | remixer remixais V;IND;PST;1;SG;IPFV 36 | ouster oustassent V;SBJV;PST;3;PL 37 | aniser anise V;POS;IMP;2;SG 38 | cabasser cabasses V;SBJV;PRS;2;SG 39 | accuser accusions V;IND;PST;1;PL;IPFV 40 | marcher marchiez V;SBJV;PRS;2;PL 41 | engraisser engraisse V;POS;IMP;2;SG 42 | échauder échaudais V;IND;PST;2;SG;IPFV 43 | écurer écure V;POS;IMP;2;SG 44 | tituber titubais V;IND;PST;1;SG;IPFV 45 | larder larde V;IND;PRS;1;SG 46 | couronner couronnerais V;COND;2;SG 47 | mentionner mentionné V.PTCP;PST 48 | disloquer disloquassiez V;SBJV;PST;2;PL 49 | atterrir atterrîtes V;IND;PST;2;PL;PFV 50 | espruver espruve V;POS;IMP;2;SG 51 | importer importions V;IND;PST;1;PL;IPFV 52 | ambler amblons V;IND;PRS;1;PL 53 | eschever escheverai V;IND;FUT;1;SG 54 | receler recela V;IND;PST;3;SG;PFV 55 | biguiner biguinassent V;SBJV;PST;3;PL 56 | avantager avantageai V;IND;PST;1;SG;PFV 57 | advenir advins V;IND;PST;2;SG;PFV 58 | rouvrir rouvre V;IND;PRS;1;SG 59 | espionner espionneraient V;COND;3;PL 60 | lisser lissasses V;SBJV;PST;2;SG 61 | chaperonner chaperonnes V;IND;PRS;2;SG 62 | mugir mugir V;NFIN 63 | décroître décrû V.PTCP;PST 64 | enditer enditera V;IND;FUT;3;SG 65 | accompagner accompagnent V;SBJV;PRS;3;PL 66 | polycopier polycopiai V;IND;PST;1;SG;PFV 67 | coisier coisiez V;IND;PRS;2;PL 68 | tondre tondiez V;IND;PST;2;PL;IPFV 69 | comparer comparent V;SBJV;PRS;3;PL 70 | bêcher bêchais V;IND;PST;2;SG;IPFV 71 | abaisser abaisse V;SBJV;PRS;1;SG 72 | alterner alternassiez V;SBJV;PST;2;PL 73 | catalyser catalyses V;IND;PRS;2;SG 74 | biaiser biaisait V;IND;PST;3;SG;IPFV 75 | mercyer mercyons V;POS;IMP;1;PL 76 | renchérir renchérissions V;SBJV;PST;1;PL 77 | refléter reflèteras V;IND;FUT;2;SG 78 | défriser défrisent V;IND;PRS;3;PL 79 | régurgiter régurgite V;IND;PRS;3;SG 80 | enditer enditast V;SBJV;PST;3;SG 81 | adiouster adioustast V;SBJV;PST;3;SG 82 | gratouiller gratouille V;SBJV;PRS;3;SG 83 | kayaker kayakasses V;SBJV;PST;2;SG 84 | franciser francise V;POS;IMP;2;SG 85 | duveter duveté V.PTCP;PST 86 | tackler tackle V;IND;PRS;1;SG 87 | demorer demora V;IND;PST;3;SG;PFV 88 | dénigrer dénigrerais V;COND;2;SG 89 | présidentialiser présidentialisèrent V;IND;PST;3;PL;PFV 90 | chavirer chavirassiez V;SBJV;PST;2;PL 91 | poireauter poireautaient V;IND;PST;3;PL;IPFV 92 | débâillonner débâillonnez V;POS;IMP;2;PL 93 | comprehendre comprehendre V;NFIN 94 | kifer kiferont 
V;IND;FUT;3;PL 95 | sojorner sojornes V;IND;PRS;2;SG 96 | desfier en desfiant V.CVB;PRS 97 | membrer membrai V;IND;PST;1;SG;PFV 98 | enregistrer enregistraient V;IND;PST;3;PL;IPFV 99 | presupposer presupposois V;IND;PST;1;SG;IPFV 100 | sechier sechiez V;SBJV;PRS;2;PL 101 | -------------------------------------------------------------------------------- /data/hebrew-train-low: -------------------------------------------------------------------------------- 1 | התנשק תתנשׁקו V;2;PL;FUT:MASC 2 | הגיב הגבנו V;1;PL;PST 3 | דרש דרושנה V;2;PL;IMP;FEM 4 | חינך תחנך V;3;SG;FUT;FEM 5 | תיאר תיארתן V;2;PL;PST;FEM 6 | גליל גלילי N;SG;N;SGDEF;PSS1S 7 | התנקש התנקשׁתן V;2;PL;PST;FEM 8 | כיסא כיסאו N;SG;PSSD;PSS2S;MASC 9 | הוסף הוספתם V;2;PL;PST;MASC 10 | התקבל מתקבל V;SG;PRS;MASC 11 | התנפל התנפלתן V;2;PL;PST;FEM 12 | נשמה נשׁמותיו N;PL;PSSD;PSS2S;MASC 13 | טריבונה טריבונה N;SG;NDEF 14 | טופל תטופל V;2;SG;FUT;MASC 15 | תיעב תיעבת V;2;SG;PST;FEM 16 | מולדת מולדתך N;SG;DEF;PSS2S;FEM 17 | שרק שׁרקת V;2;SG;PST;FEM 18 | עולם עולמיהן N;PL;PSSD;PSS2P;FEM 19 | נאהב יאהבו V;3;PL;FUT;MASC 20 | סגר סגרנו V;1;PL;PST 21 | הצלחה הצלחותיכן N;PL;DEF;PSS2P;FEM 22 | שחזור שחזוריי N;PL;N;PLDEF;PSS1S 23 | מרד ימרדו V;3;PL;FUT;MASC 24 | קם קמתן V;2;PL;PST;FEM 25 | פגע פגעת V;2;SG;PST;MASC 26 | סר לסור V;NFIN 27 | היה אהיה V;1;SG;FUT 28 | יכול יוכל V;3;SG;FUT;MASC 29 | פרובינציה פרובינצית N;SG;PSSD 30 | רומם תרומם V;3;SG;FUT;FEM 31 | נאכף נאכפת V;2;SG;PST;MASC 32 | הביא תביאנה V;3;PL;FUT;FEM 33 | מגבת מגבת N;SG;NDEF 34 | הגיע יגיע V;3;SG;FUT;MASC 35 | חנוכייה חנוכיותיה N;PL;PSSD;PSS2S;FEM 36 | סרט סרטכן N;SG;DEF;PSS2P;FEM 37 | חזק חזקה V;3;SG;PST;FEM 38 | ליקק ליקקה V;3;SG;PST;FEM 39 | התפוצץ התפוצצתם V;2;PL;PST;MASC 40 | סדרה סדרתכן N;SG;DEF;PSS2P;FEM 41 | הושיב הושׁבתן V;2;PL;PST;FEM 42 | חזק תחזקו V;2;PL;FUT:MASC 43 | הכשיר הכשׁרנה V;2;PL;IMP;FEM 44 | אלוה אלוהו N;SG;PSSD;PSS2S;MASC 45 | מצב מצביו N;PL;PSSD;PSS2S;MASC 46 | אץ אצה V;3;SG;PST;FEM 47 | קטון תקטנו V;2;PL;FUT:MASC 48 | חדר חדרת V;2;SG;PST;FEM 49 | הוסיף להוסיף V;NFIN 50 | התנדב להתנדב V;NFIN 51 | גילגל יגלגלו V;3;PL;FUT;MASC 52 | עורר תעוררנה V;3;PL;FUT;FEM 53 | גילגל מגלגלת V;SG;PRS;FEM 54 | הרכיב תרכבנה V;3;PL;FUT;FEM 55 | נמס נימס V;1;PL;FUT 56 | האמין האמנו V;1;PL;PST 57 | מסירה מסירותינו N;PL;N;PLDEF;PSS1P 58 | ברח בורחים V;PL;PRS;MASC 59 | כונס כונסת V;2;SG;PST;FEM 60 | לימד ללמד V;NFIN 61 | משיכה משׁיכתן N;SG;PSSD;PSS2P;FEM 62 | התחבא התחבאו V;2;PL;IMP;MASC 63 | תיאר תיארתם V;2;PL;PST;MASC 64 | הוריד הורדנה V;2;PL;IMP;FEM 65 | לקה לקו V;2;PL;IMP;MASC 66 | תיבל תבל V;2;SG;IMP;MASC 67 | מצץ מצצת V;2;SG;PST;FEM 68 | יונה יוניכן N;PL;DEF;PSS2P;FEM 69 | התעורר תתעוררנה V;2;PL;FUT;FEM 70 | כובה אכובה V;1;SG;FUT 71 | הקיף הקיפו V;2;PL;IMP;MASC 72 | נשם נשׁמתן V;2;PL;PST;FEM 73 | זקף זקף V;3;SG;PST;MASC 74 | שריפה שׂרפותיהן N;PL;PSSD;PSS2P;FEM 75 | קטף קטופנה V;2;PL;IMP;FEM 76 | ביקר ביקרתם V;2;PL;PST;MASC 77 | התקנא התקנאנו V;1;PL;PST 78 | הזדמנות הזדמנויותיהם N;PL;PSSD;PSS2P;MASC 79 | השמיד השׁמדת V;2;SG;PST;MASC 80 | זרח זרח V;3;SG;PST;MASC 81 | תרם תרמתי V;1;SG;PST 82 | רשם תרשום V;2;SG;FUT;MASC 83 | גלה גלית V;2;SG;PST;MASC 84 | ורד ורדיך N;PL;DEF;PSS2S;MASC 85 | ירה תירינה V;2;PL;FUT;FEM 86 | פתח תפתחו V;2;PL;FUT:MASC 87 | עישן עשׁנו V;2;PL;IMP;MASC 88 | כתובית כתוביותיהם N;PL;PSSD;PSS2P;MASC 89 | שחיטה שׁחיטתה N;SG;PSSD;PSS2S;FEM 90 | רבה תרבה V;3;SG;FUT;FEM 91 | בטן בטניכן N;PL;DEF;PSS2P;FEM 92 | מכר מוכרים V;PL;PRS;MASC 93 | שמלה שׂמלתי N;SG;N;SGDEF;PSS1S 94 | מלוכה מלוכתנו N;SG;N;SGDEF;PSS1P 95 | שחיטה השׁחיטות N;PL;DEF 96 | שכן שׁכנו V;2;PL;IMP;MASC 
97 | שיקר שיקרתי V;1;SG;PST 98 | נאהד נאהדנו V;1;PL;PST 99 | הגיע הגעת V;2;SG;PST;FEM 100 | מת תמותו V;2;PL;FUT:MASC 101 | -------------------------------------------------------------------------------- /data/irish-train-low: -------------------------------------------------------------------------------- 1 | cruinnigh cruinnídís V;3;PL;IMP 2 | tíogar an tíogair N;GEN;SG;DEF 3 | pailéa-luibheolaíocht pailéa-luibheolaíocht N;DAT;SG 4 | mórghnáth mórghnáth N;NOM;SG 5 | agaigh agaídís V;3;PL;IMP 6 | agairg na hagairge N;GEN;SG;DEF 7 | scoil saothair scoileanna saothair N;DAT;PL 8 | tuathánach tuathánaigh N;GEN;SG 9 | amhrán na hamhráin N;NOM;PL;DEF 10 | ruainne ruainne N;NOM;SG 11 | sácráilteacht leis an tsácráilteacht N;DAT;SG;DEF 12 | néar-raideolaí an néar-raideolaí N;GEN;SG;DEF 13 | donacht donacht N;DAT;SG 14 | gloinigh gloineoidh mé V;1;SG;FUT;IND 15 | dear dhear tú V;2;SG;PST;IND 16 | féile féile N;GEN;SG 17 | fo-éadach fo-éadaí N;NOM;PL 18 | fiafheoil a fhiafheoil N;VOC;SG 19 | lucht luchta N;GEN;SG 20 | tiontaigh dá dtiontaídís V;3;PL;SBJV;PST 21 | fótaidhé-óid na fótaidhé-óidí N;NOM;PL;DEF 22 | réimsetheoiric leis na réimsetheoiricí N;DAT;PL;DEF 23 | baintreach leis na baintreacha N;DAT;PL;DEF 24 | rian a rian N;VOC;SG 25 | éadaitheoireacht an éadaitheoireacht N;NOM;SG;DEF 26 | stoirm na stoirme N;GEN;SG;DEF 27 | Ard-Aighne an Ard-Aighne N;GEN;SG;DEF 28 | paidir paidir N;NOM;SG 29 | leagh leaghfaimid V;1;PL;FUT;IND 30 | fiúntas an fiúntas N;NOM;SG;DEF 31 | ceardaíocht an cheardaíocht N;NOM;SG;DEF 32 | bádóir leis na bádóirí N;DAT;PL;DEF 33 | cnap leis an gcnap N;DAT;SG;DEF 34 | pocadán pocadáin N;DAT;PL 35 | fótagrafaíocht na fótagrafaíochta N;GEN;SG;DEF 36 | Guineach an Ghuinigh N;GEN;SG;DEF 37 | eagar an eagair N;GEN;SG;DEF 38 | tar thiocfaí V;AUTO;COND 39 | mórbhonn mórbhoinn N;DAT;PL 40 | fótónaic fótónaic N;NOM;SG 41 | ráthaigh ráthaígí V;2;PL;IMP 42 | liopard fiaigh leis an liopard fiaigh N;DAT;SG;DEF 43 | bréagiontaise bréagiontaise N;GEN;SG 44 | cóisir bhrídeoige an chóisir bhrídeoige N;NOM;SG;DEF 45 | imaistrigh imaistríonn sé V;3;SG;PRS;IND 46 | clois cloiseann sibh V;2;PL;PRS;IND 47 | stiallfhótagrafaíocht a stiallfhótagrafaíocht N;VOC;SG 48 | lárthosaí leis an lárthosaí N;DAT;SG;DEF 49 | crúca crúcaí N;GEN;PL 50 | foghlaim d'fhoghlaim sibh V;2;PL;PST;IND 51 | luachálaí a luachálaithe N;VOC;PL 52 | folig foligeadh sé V;3;SG;IMP 53 | míolsiorc an mhíolsiorca N;GEN;SG;DEF 54 | dulcaiméir dulcaiméir N;DAT;SG 55 | fiaile fiailí N;DAT;PL 56 | culaith culaith N;DAT;SG 57 | blast leis na blastaí N;DAT;PL;DEF 58 | grúdaigh dá ngrúdaínn V;1;SG;SBJV;PST 59 | grúdaigh grúdaígí V;2;PL;IMP 60 | snas a shnas N;VOC;SG 61 | comhghuaillíocht comhghuaillíocht N;NOM;SG 62 | teorannaigh go dteorannaítear V;AUTO;SBJV;PRS 63 | Fionlainnis na Fionlainnise N;GEN;SG;DEF 64 | siollach an tsiollaigh N;GEN;SG;DEF 65 | greannán greannán N;GEN;PL 66 | treisigh treiseoidh sibh V;2;PL;FUT;IND 67 | gasúr leis an ngasúr N;DAT;SG;DEF 68 | oileánra oileánraí N;NOM;PL 69 | teicnealathach teicnealathach ADJ;DAT;SG;MASC 70 | rí-ulchabhán na rí-ulchabháin N;NOM;PL;DEF 71 | abhrán abhrán N;GEN;PL 72 | croscheann leis an gcroscheann N;DAT;SG;DEF 73 | tilleadh tilleadh N;NOM;SG 74 | troigh na troithe N;NOM;PL;DEF 75 | naomhainmnigh a naomhainmnigh V;REL;PST;IND 76 | mí-oiriúint na mí-oiriúna N;GEN;SG;DEF 77 | léitheoir an léitheora N;GEN;SG;DEF 78 | feirm feirmeacha N;GEN;PL 79 | breathnaigh breathnaíonn tú V;2;SG;PRS;IND 80 | figh fíonn tú V;2;SG;PRS;IND 81 | éalaitheach éalaitheach N;NOM;SG 82 | coigistigh go 
gcoigistítear V;AUTO;SBJV;PRS 83 | sceir sceir N;DAT;SG 84 | léirmheastóir léirmheastóirí N;DAT;PL 85 | rostram a rostraim N;VOC;SG 86 | ráille tuáillí ráillí tuáillí N;DAT;PL 87 | coinnigh coinníonn sé V;3;SG;PRS;IND 88 | aíochtlann leis an aíochtlann N;DAT;SG;DEF 89 | fidil na fidleacha N;NOM;PL;DEF 90 | iompaigh dá n-iompaíteá V;2:SG;SBJV;PST 91 | cairdinéal cairdinéil N;DAT;PL 92 | sreangshúil sreangshúile N;GEN;SG 93 | ainéifeacht na hainéifeachta N;GEN;SG;DEF 94 | oibrigh d'oibreoimis V;1;PL;COND 95 | lobh lobhtar V;AUTO;PRS;IND 96 | beach beacha N;DAT;PL 97 | pá pá N;NOM;SG 98 | socheolaíocht a shocheolaíocht N;VOC;SG 99 | amplóir a amplóra N;VOC;SG 100 | aelus aelusanna N;DAT;PL 101 | -------------------------------------------------------------------------------- /data/italian-train-low: -------------------------------------------------------------------------------- 1 | rinvasare rinvasi V;SBJV;PRS;1;SG 2 | infilzare infilzeremo V;IND;FUT;1;PL 3 | rivedersi mi rivedevo V;IND;PST;1;SG;IPFV 4 | insanguinare insanguinante V.PTCP;PRS 5 | tramare tramate V;POS;IMP;2;PL 6 | scartabellare scartabelliamo V;POS;IMP;1;PL 7 | calzare calzerete V;IND;FUT;2;PL 8 | sfidarsi si sfidi V;POS;IMP;3;SG 9 | recarvisi vi si recheranno V;IND;FUT;3;PL 10 | intrufolare intrufolassi V;SBJV;PST;2;SG 11 | rinchiudere rinchiuderei V;COND;1;SG 12 | impiombare impiombi V;IND;PRS;2;SG 13 | scombinare scombini V;IND;PRS;2;SG 14 | attendersi si attenda V;POS;IMP;3;SG 15 | cazzeggiare cazzeggerei V;COND;1;SG 16 | inzeppare inzeppassi V;SBJV;PST;1;SG 17 | strabiliare strabilerò V;IND;FUT;1;SG 18 | disvolgere disvolga V;SBJV;PRS;1;SG 19 | argomentare argomentasti V;IND;PST;2;SG;PFV 20 | sciogliersi si sciolga V;POS;IMP;3;SG 21 | corcare corcavo V;IND;PST;1;SG;IPFV 22 | umiliare umilierà V;IND;FUT;3;SG 23 | estrarre estraessimo V;SBJV;PST;1;PL 24 | compiere compivi V;IND;PST;2;SG;IPFV 25 | ricoverarsi ti ricoveravi V;IND;PST;2;SG;IPFV 26 | rinsanguare rinsanguavate V;IND;PST;2;PL;IPFV 27 | assistere assisterei V;COND;1;SG 28 | sanare sani V;SBJV;PRS;2;SG 29 | azzerare azzerando V.CVB;PRS 30 | gloglottare gloglottereste V;COND;2;PL 31 | tangere tangi V;POS;IMP;2;SG 32 | invigorire invigorisca V;SBJV;PRS;3;SG 33 | innamorare innamoravi V;IND;PST;2;SG;IPFV 34 | ospedalizzare ospedalizzò V;IND;PST;3;SG;PFV 35 | piegare piegheremmo V;COND;1;PL 36 | formare formino V;POS;IMP;3;PL 37 | rassumere rassumesti V;IND;PST;2;SG;PFV 38 | sterminare sterminavate V;IND;PST;2;PL;IPFV 39 | incuriosirsi si incuriosiscano V;SBJV;PRS;3;PL 40 | sfarinarsi si sfarinano V;IND;PRS;3;PL 41 | metaforizzare metaforizzavamo V;IND;PST;1;PL;IPFV 42 | pavimentare pavimentassero V;SBJV;PST;3;PL 43 | doppiare doppierai V;IND;FUT;2;SG 44 | scempiare scempiasse V;SBJV;PST;3;SG 45 | potabilizzare potabilizzaste V;SBJV;PST;2;PL 46 | disintegrarsi ci disintegreremo V;IND;FUT;1;PL 47 | rabbonire rabbonendo V.CVB;PRS 48 | sagginare sagginasse V;SBJV;PST;3;SG 49 | costudire costudirebbero V;COND;3;PL 50 | scarnire scarnisse V;SBJV;PST;3;SG 51 | simulare simulassero V;SBJV;PST;3;PL 52 | asciolvere asciolverete V;IND;FUT;2;PL 53 | vellutare velluta V;POS;IMP;2;SG 54 | spogliarsi ti spoglierai V;IND;FUT;2;SG 55 | girondolare girondolavi V;IND;PST;2;SG;IPFV 56 | attingersi si attingerà V;IND;FUT;3;SG 57 | ondulare ondulassimo V;SBJV;PST;1;PL 58 | frenarsi ci freneremmo V;COND;1;PL 59 | vendicare vendicaste V;SBJV;PST;2;PL 60 | smetterla la smettano V;POS;IMP;3;PL 61 | disimpegnarsi vi disimpegnerete V;IND;FUT;2;PL 62 | sgonfiare sgonfiaste V;SBJV;PST;2;PL 63 | 
bastire bastissi V;SBJV;PST;2;SG 64 | obliare obliante V.PTCP;PRS 65 | ripudiare ripudia V;POS;IMP;2;SG 66 | trasognare trasogneremo V;IND;FUT;1;PL 67 | concedere concedi V;POS;IMP;2;SG 68 | reimpiantare reimpianterete V;IND;FUT;2;PL 69 | scriversi ti scrivevi V;IND;PST;2;SG;IPFV 70 | ornare ornasse V;SBJV;PST;3;SG 71 | nuclearizzare nuclearizzi V;IND;PRS;2;SG 72 | ibridare ibriderebbe V;COND;3;SG 73 | mansuefarsi vi mansuefacciate V;SBJV;PRS;2;PL 74 | testificare testificassimo V;SBJV;PST;1;PL 75 | addebbiare addebbierete V;IND;FUT;2;PL 76 | smaliziare smalizierà V;IND;FUT;3;SG 77 | crucciarsi mi crucciavo V;IND;PST;1;SG;IPFV 78 | trivellare trivellerai V;IND;FUT;2;SG 79 | smembrare smembra V;POS;IMP;2;SG 80 | carezzarsi vi carezziate V;SBJV;PRS;2;PL 81 | illanguidire illanguidì V;IND;PST;3;SG;PFV 82 | storcere storcente V.PTCP;PRS 83 | depolverizzare depolverizzammo V;IND;PST;1;PL;PFV 84 | refrangere refranga V;POS;IMP;3;SG 85 | interpretare interpreterai V;IND;FUT;2;SG 86 | corteare corteavi V;IND;PST;2;SG;IPFV 87 | addimesticare addimesticavano V;IND;PST;3;PL;IPFV 88 | attendere attendeva V;IND;PST;3;SG;IPFV 89 | ritorcere ritorcemmo V;IND;PST;1;PL;PFV 90 | impuzzolentire impuzzolentiscano V;POS;IMP;3;PL 91 | rombare rombasse V;SBJV;PST;3;SG 92 | intervistare intervistiate V;SBJV;PRS;2;PL 93 | caracollare caracolli V;POS;IMP;3;SG 94 | uccidere uccidereste V;COND;2;PL 95 | apporre apponga V;SBJV;PRS;2;SG 96 | transitare transitassimo V;SBJV;PST;1;PL 97 | risospendere risospesero V;IND;PST;3;PL;PFV 98 | sostantivare sostantiviamo V;POS;IMP;1;PL 99 | subirsi subisciti V;POS;IMP;2;SG 100 | appendersi mi appendo V;IND;PRS;1;SG 101 | -------------------------------------------------------------------------------- /data/middle-french-train-low: -------------------------------------------------------------------------------- 1 | inquieter inquieta V;IND;PST;3;SG;PFV 2 | descharger deschargeons V;IND;PRS;1;PL 3 | racompter racompterent V;IND;PST;3;PL;PFV 4 | fascher faschastes V;IND;PST;2;PL;PFV 5 | accompaigner accompaigneras V;2;SG;IND;FUT 6 | baptizer baptizyez V;IND;PST;2;PL;IPFV;LGSPEC2 7 | expuyser expuyserez V;2;PL;IND;FUT 8 | destabler destablasmes V;IND;PST;1;PL;PFV 9 | brusler bruslé V.PTCP;PST 10 | quester questoyt V;IND;PST;3;SG;IPFV;LGSPEC2 11 | houssepillier houssepillieroient V;3;PL;COND;LGSPEC1 12 | destourner destournas V;IND;PST;2;SG;PFV 13 | superseder supersedastes V;IND;PST;2;PL;PFV 14 | marier mariassions V;1;PL;SBJV;PST;IPFV 15 | explorer explorerions V;1;PL;COND;LGSPEC1 16 | practicquer practicquant V.PTCP;PRS 17 | frapper frappes V;IND;PRS;2;SG 18 | eslancer eslanças V;IND;PST;2;SG;PFV 19 | aracher araché V.PTCP;PST 20 | porter portons V;1;PL;SBJV;PRS 21 | eslancer eslançoyt V;IND;PST;3;SG;IPFV;LGSPEC2 22 | aguyser aguyserai V;1;SG;IND;FUT;LGSPEC1 23 | finir finissions V;1;PL;SBJV;PST;IPFV 24 | desbarater desbaratois V;IND;PST;1;SG;IPFV;LGSPEC1 25 | dampner dampneras V;2;SG;IND;FUT 26 | soucyer soucyoys V;IND;PST;2;SG;IPFV;LGSPEC2 27 | trancher tranchois V;IND;PST;2;SG;IPFV;LGSPEC1 28 | desieuner desieunasmes V;IND;PST;1;PL;PFV 29 | garder gardassent V;3;PL;SBJV;PST;IPFV 30 | engendrer engendrerons V;1;PL;IND;FUT 31 | importuner importuneront V;3;PL;IND;FUT 32 | toucher touchez V;2;PL;SBJV;PRS 33 | cauterizer cauterizoyt V;IND;PST;3;SG;IPFV;LGSPEC2 34 | lever leva V;IND;PST;3;SG;PFV 35 | seiourner seiournons V;IND;PRS;1;PL 36 | preferer preferois V;IND;PST;1;SG;IPFV;LGSPEC1 37 | pourchasser pourchassent V;3;PL;SBJV;PRS 38 | delaisser delaissoys 
V;IND;PST;1;SG;IPFV;LGSPEC2 39 | resider residassions V;1;PL;SBJV;PST;IPFV 40 | herrier herrierez V;2;PL;IND;FUT 41 | cercher cercheroit V;3;SG;COND;LGSPEC1 42 | honnourer honnoure V;IND;PRS;3;SG 43 | delaisser delaisseroyent V;3;PL;COND;LGSPEC2 44 | alonger alongerois V;2;SG;COND;LGSPEC1 45 | chanter chanteryons V;1;PL;COND;LGSPEC2 46 | baptizer baptizons V;1;PL;IMP 47 | mulcter mulctons V;1;PL;SBJV;PRS 48 | estiquier estiquois V;IND;PST;2;SG;IPFV;LGSPEC1 49 | menasser menasseroyt V;3;SG;COND;LGSPEC2 50 | accompaigner accompaigne V;1;SG;SBJV;PRS 51 | desbaucher desbaucheryez V;2;PL;COND;LGSPEC2 52 | mucer muçoient V;IND;PST;3;PL;IPFV;LGSPEC1 53 | laisser laissassiez V;2;PL;SBJV;PST;IPFV 54 | ayder en aydant V.CVB;PRS 55 | esveigler esveigloient V;IND;PST;3;PL;IPFV;LGSPEC1 56 | baisler baisle V;1;SG;SBJV;PRS 57 | tyrer tyrois V;IND;PST;1;SG;IPFV;LGSPEC1 58 | couster cousteroient V;3;PL;COND;LGSPEC1 59 | desadvouer desadvouastes V;IND;PST;2;PL;PFV 60 | confesser confessoys V;IND;PST;2;SG;IPFV;LGSPEC2 61 | excuser excuserois V;1;SG;COND;LGSPEC1 62 | provocquer provocquez V;IND;PRS;2;PL 63 | saluer salues V;IND;PRS;2;SG 64 | arroier arroié V.PTCP;PST 65 | asseurer asseuroit V;IND;PST;3;SG;IPFV;LGSPEC1 66 | affier affieryez V;2;PL;COND;LGSPEC2 67 | obliger obligeasse V;1;SG;SBJV;PST;IPFV 68 | regretter regretterent V;IND;PST;3;PL;PFV 69 | desnouer desnouons V;1;PL;SBJV;PRS 70 | protester protestent V;IND;PRS;3;PL 71 | noter notois V;IND;PST;1;SG;IPFV;LGSPEC1 72 | manouvrer manouvrent V;IND;PRS;3;PL 73 | engendrer engendreriez V;2;PL;COND;LGSPEC1 74 | desplacer desplaciez V;IND;PST;2;PL;IPFV;LGSPEC1 75 | croniquier croniquiez V;2;PL;SBJV;PRS 76 | reietter reiettiez V;IND;PST;2;PL;IPFV;LGSPEC1 77 | communicquer communicqueront V;3;PL;IND;FUT 78 | resister resistoyent V;IND;PST;3;PL;IPFV;LGSPEC2 79 | estudier estudie V;2;SG;IMP 80 | dancer dancerez V;2;PL;IND;FUT 81 | croniquier croniquions V;1;PL;IMP 82 | gaigner gaignent V;IND;PRS;3;PL 83 | despuceller despucelloys V;IND;PST;2;SG;IPFV;LGSPEC2 84 | fraper frapa V;IND;PST;3;SG;PFV 85 | effectuer effectuasses V;2;SG;SBJV;PST;IPFV 86 | brusler brusleryons V;1;PL;COND;LGSPEC2 87 | encliner enclinoys V;IND;PST;1;SG;IPFV;LGSPEC2 88 | marcher marchiez V;IND;PST;2;PL;IPFV;LGSPEC1 89 | visiter visiterent V;IND;PST;3;PL;PFV 90 | soupper souppera V;3;SG;IND;FUT 91 | reguarder reguardoys V;IND;PST;2;SG;IPFV;LGSPEC2 92 | entituler entitulerai V;1;SG;IND;FUT;LGSPEC1 93 | eschapper eschappes V;IND;PRS;2;SG 94 | pourforcer pourforcez V;2;PL;SBJV;PRS 95 | plaider plaidez V;2;PL;IMP 96 | espoventer espoventast V;3;SG;SBJV;PST;IPFV 97 | praesider praesides V;2;SG;SBJV;PRS 98 | pourpenser pourpenses V;IND;PRS;2;SG 99 | noter notons V;1;PL;SBJV;PRS 100 | reiecter reiecterons V;1;PL;IND;FUT 101 | -------------------------------------------------------------------------------- /data/middle-high-german-covered-test: -------------------------------------------------------------------------------- 1 | trinken V;IND;PST;1;SG 2 | vinden V;IND;PRS;3;SG 3 | ziehen V;SBJV;PRS;1;PL 4 | enpfinden V.PTCP;PRS 5 | werden V;SBJV;PRS;3;SG 6 | wahsen V;SBJV;PRS;2;PL 7 | grave N;NOM;SG 8 | triegen V;SBJV;PST;1;SG 9 | binden V;IND;PST;1;PL 10 | werfen V;IND;PST;2;PL 11 | helfen V;SBJV;PRS;2;SG 12 | beginnen V;IND;PRS;1;PL 13 | slahen V;SBJV;PRS;2;SG 14 | engel N;GEN;PL 15 | swimmen V;IMP;2;PL 16 | triegen V;IMP;2;SG 17 | ziehen V;IND;PST;3;SG 18 | burcgrave N;ACC;SG 19 | biegen V;IND;PRS;1;SG 20 | werden V;SBJV;PST;2;SG 21 | binden V;IND;PST;2;SG 22 | swimmen V;IND;PST;2;SG 23 | 
göugrave N;GEN;PL 24 | kiesen V;IND;PST;1;SG 25 | singen V;SBJV;PST;1;SG 26 | vinden V;IMP;2;PL 27 | vinden V;NFIN 28 | werfen V;IND;PST;3;PL 29 | werden V;IND;PRS;2;SG 30 | swimmen V;SBJV;PST;2;PL 31 | vinden V.PTCP;PST 32 | trinken V;SBJV;PRS;1;SG 33 | enpfinden V;IMP;2;SG 34 | werden V;IMP;1;PL 35 | biegen V;SBJV;PST;3;SG 36 | kiesen V;SBJV;PRS;3;SG 37 | ziehen V;SBJV;PST;3;PL 38 | kiesen V;SBJV;PST;1;PL 39 | betriegen V;SBJV;PRS;2;SG 40 | werfen V;SBJV;PST;3;SG 41 | triegen V;SBJV;PST;3;SG 42 | rinnen V;NFIN 43 | wahsen V;SBJV;PRS;3;PL 44 | vinden V;SBJV;PRS;2;PL 45 | kiesen V;SBJV;PRS;2;SG 46 | trinken V;SBJV;PST;3;PL 47 | binden V;IND;PRS;3;SG 48 | trinken V;SBJV;PRS;2;PL 49 | binden V;IND;PST;3;SG 50 | kiesen V;IND;PRS;3;SG 51 | -------------------------------------------------------------------------------- /data/middle-high-german-dev: -------------------------------------------------------------------------------- 1 | biegen buget V;IND;PST;2;PL 2 | helfen hülfe V;SBJV;PST;1;SG 3 | verswinden verswindet V;IND;PRS;2;PL 4 | rinnen runnen V;IND;PST;3;PL 5 | werden würde V;SBJV;PST;3;SG 6 | helfen half V;IND;PST;1;SG 7 | binden binden V;IMP;1;PL 8 | slahen sluogen V;IND;PST;3;PL 9 | sieden siedende V.PTCP;PRS 10 | helfen hülfest V;SBJV;PST;2;SG 11 | verswinden verswindet V;SBJV;PRS;3;SG 12 | triegen truget V;IND;PST;2;PL 13 | binden binde V;SBJV;PRS;1;SG 14 | binden bind V;IMP;2;SG 15 | slahen slüege V;SBJV;PST;3;SG 16 | helfen hëlfende V.PTCP;PRS 17 | binden bindet V;IMP;2;PL 18 | trinken trinket V;IND;PRS;3;SG 19 | verswinden verswindest V;IND;PRS;2;SG 20 | singen sing V;IMP;2;SG 21 | vinden vindest V;SBJV;PRS;2;SG 22 | betriegen betriuc V;IMP;2;SG 23 | ziehen ziehent V;IND;PRS;3;PL 24 | werden wurden V;IND;PST;3;PL 25 | enpfinden enpfinden V;NFIN 26 | swimmen geswummen V.PTCP;PST 27 | sieden sôt V;IND;PST;1;SG 28 | slahen slaget V;IND;PRS;2;PL 29 | trinken trunken V;IND;PST;3;PL 30 | bruoder brüeder N;GEN;PL 31 | singen singet V;IND;PRS;2;PL 32 | rinnen rinnen V;IMP;1;PL 33 | betriegen betrüge V;SBJV;PST;3;SG 34 | enpfinden enpfinde V;SBJV;PRS;1;SG 35 | helfen hilfest V;IND;PRS;2;SG 36 | helfen hëlfen V;IMP;1;PL 37 | slahen slüeget V;SBJV;PST;2;PL 38 | trinken trunken V;IND;PST;1;PL 39 | enpfinden enpfinde V;IND;PRS;1;SG 40 | beginnen beginnet V;SBJV;PRS;2;PL 41 | beginnen begunnet V;IND;PST;2;PL 42 | enpfinden enpfinden V;SBJV;PRS;3;PL 43 | sieden sütet V;SBJV;PST;2;PL 44 | werfen wërfest V;SBJV;PRS;2;SG 45 | verswinden verswünden V;SBJV;PST;1;PL 46 | enpfinden enpfunden V;IND;PST;1;PL 47 | trinken trinkest V;SBJV;PRS;2;SG 48 | werden ward V;IND;PST;1;SG 49 | verswinden verswindest V;SBJV;PRS;2;SG 50 | singen singen V;IND;PRS;1;PL 51 | -------------------------------------------------------------------------------- /data/middle-high-german-test: -------------------------------------------------------------------------------- 1 | trinken trank V;IND;PST;1;SG 2 | vinden vindet V;IND;PRS;3;SG 3 | ziehen ziehen V;SBJV;PRS;1;PL 4 | enpfinden enpfindende V.PTCP;PRS 5 | werden wërdet V;SBJV;PRS;3;SG 6 | wahsen wahset V;SBJV;PRS;2;PL 7 | grave grâve N;NOM;SG 8 | triegen trüge V;SBJV;PST;1;SG 9 | binden bunden V;IND;PST;1;PL 10 | werfen wurfet V;IND;PST;2;PL 11 | helfen hëlfest V;SBJV;PRS;2;SG 12 | beginnen beginnen V;IND;PRS;1;PL 13 | slahen slagest V;SBJV;PRS;2;SG 14 | engel engele N;GEN;PL 15 | swimmen swimmet V;IMP;2;PL 16 | triegen triuc V;IMP;2;SG 17 | ziehen zôch V;IND;PST;3;SG 18 | burcgrave burcgrâven N;ACC;SG 19 | biegen biuge V;IND;PRS;1;SG 20 | werden würdest V;SBJV;PST;2;SG 
21 | binden bünde V;IND;PST;2;SG 22 | swimmen swümme V;IND;PST;2;SG 23 | göugrave göugrâven N;GEN;PL 24 | kiesen kôs V;IND;PST;1;SG 25 | singen sünge V;SBJV;PST;1;SG 26 | vinden vindet V;IMP;2;PL 27 | vinden vinden V;NFIN 28 | werfen wurfen V;IND;PST;3;PL 29 | werden wirdest V;IND;PRS;2;SG 30 | swimmen swümmet V;SBJV;PST;2;PL 31 | vinden gevunden V.PTCP;PST 32 | trinken trinke V;SBJV;PRS;1;SG 33 | enpfinden enpfind V;IMP;2;SG 34 | werden wërden V;IMP;1;PL 35 | biegen büge V;SBJV;PST;3;SG 36 | kiesen kieset V;SBJV;PRS;3;SG 37 | ziehen zühen V;SBJV;PST;3;PL 38 | kiesen küsen V;SBJV;PST;1;PL 39 | betriegen betriegest V;SBJV;PRS;2;SG 40 | werfen würfe V;SBJV;PST;3;SG 41 | triegen trüge V;SBJV;PST;3;SG 42 | rinnen rinnen V;NFIN 43 | wahsen wahsen V;SBJV;PRS;3;PL 44 | vinden vindet V;SBJV;PRS;2;PL 45 | kiesen kiesest V;SBJV;PRS;2;SG 46 | trinken trünken V;SBJV;PST;3;PL 47 | binden bindet V;IND;PRS;3;SG 48 | trinken trinket V;SBJV;PRS;2;PL 49 | binden band V;IND;PST;3;SG 50 | kiesen kiuset V;IND;PRS;3;SG 51 | -------------------------------------------------------------------------------- /data/middle-high-german-train-low: -------------------------------------------------------------------------------- 1 | werden wurdet V;IND;PST;2;PL 2 | triegen trieget V;SBJV;PRS;2;PL 3 | binden binde V;IND;PRS;1;SG 4 | singen singet V;IND;PRS;3;SG 5 | verswinden verswindende V.PTCP;PRS 6 | burcgrave burcgrâven N;DAT;SG 7 | bruoder bruoder N;NOM;SG 8 | betriegen betriugest V;IND;PRS;2;SG 9 | slahen slage V;SBJV;PRS;1;SG 10 | swern sweret V;IND;PRS;3;SG 11 | grave grâven N;NOM;PL 12 | slahen slaget V;SBJV;PRS;2;PL 13 | trinken trinken V;IND;PRS;1;PL 14 | enpfinden enpfunden V;IND;PST;3;PL 15 | werfen wërfet V;SBJV;PRS;2;PL 16 | wahsen wahset V;SBJV;PRS;3;SG 17 | tac tac N;ACC;SG 18 | swern swuoren V;IND;PST;3;PL 19 | tac tage N;NOM;PL 20 | slahen slegest V;IND;PRS;2;SG 21 | betriegen betriegen V;NFIN 22 | helfen hëlfen V;NFIN 23 | werfen würfen V;SBJV;PST;1;PL 24 | vinden vünde V;IND;PST;2;SG 25 | trinken trinket V;IND;PRS;2;PL 26 | werfen geworfen V.PTCP;PST 27 | beginnen beginnet V;IMP;2;PL 28 | slahen slagen V;NFIN 29 | burcgrave burcgrâven N;NOM;PL 30 | ziehen ziehe V;SBJV;PRS;1;SG 31 | ziehen ziehet V;IMP;2;PL 32 | burcgrave burcgrâven N;DAT;PL 33 | swern swuoren V;IND;PST;1;PL 34 | beginnen begünnest V;SBJV;PST;2;SG 35 | swimmen swümme V;SBJV;PST;3;SG 36 | binden bündet V;SBJV;PST;2;PL 37 | betriegen betrugen V;IND;PST;3;PL 38 | engel engele N;DAT;SG 39 | singen singent V;IND;PRS;3;PL 40 | werfen wërfen V;IND;PRS;1;PL 41 | werden wirde V;IND;PRS;1;SG 42 | rinnen runnen V;IND;PST;1;PL 43 | swimmen swimment V;IND;PRS;3;PL 44 | enpfinden enpfand V;IND;PST;1;SG 45 | binden binden V;NFIN 46 | slahen slagen V;SBJV;PRS;3;PL 47 | biegen biegent V;IND;PRS;3;PL 48 | binden bindest V;SBJV;PRS;2;SG 49 | swern sweren V;SBJV;PRS;1;PL 50 | betriegen betrügen V;SBJV;PST;1;PL 51 | rinnen rinnet V;SBJV;PRS;3;SG 52 | swern sweren V;IND;PRS;1;PL 53 | swimmen swimmet V;SBJV;PRS;2;PL 54 | swern swüren V;SBJV;PST;1;PL 55 | beginnen begünne V;IND;PST;2;SG 56 | biegen biegen V;IND;PRS;1;PL 57 | helfen hëlfet V;IMP;2;PL 58 | vinden vünde V;SBJV;PST;1;SG 59 | sieden gesoten V.PTCP;PST 60 | biegen bugen V;IND;PST;3;PL 61 | vinden vind V;IMP;2;SG 62 | helfen hëlfet V;SBJV;PRS;3;SG 63 | swern swernde V.PTCP;PRS 64 | kiesen kiesen V;NFIN 65 | trinken trinkest V;IND;PRS;2;SG 66 | ziehen ziehende V.PTCP;PRS 67 | werfen wërfen V;SBJV;PRS;1;PL 68 | swimmen swimmende V.PTCP;PRS 69 | enpfinden enpfünde V;SBJV;PST;1;SG 70 | sieden sutet 
V;IND;PST;2;PL 71 | überganc überganc N;ACC;SG 72 | sieden süte V;IND;PST;2;SG 73 | sieden sieden V;IND;PRS;1;PL 74 | werden wërden V;SBJV;PRS;3;PL 75 | trinken trinken V;NFIN 76 | werden würden V;SBJV;PST;1;PL 77 | rinnen rinnent V;IND;PRS;3;PL 78 | swern swere V;SBJV;PRS;1;SG 79 | binden bindende V.PTCP;PRS 80 | swern swüre V;SBJV;PST;3;SG 81 | wahsen wahsen V;IND;PRS;1;PL 82 | werfen würfe V;IND;PST;2;SG 83 | betriegen betriegen V;IND;PRS;1;PL 84 | ziehen zühest V;SBJV;PST;2;SG 85 | triegen trüge V;IND;PST;2;SG 86 | helfen hülfen V;SBJV;PST;1;PL 87 | swimmen swamm V;IND;PST;3;SG 88 | überganc übergange N;DAT;SG 89 | triegen trüget V;SBJV;PST;2;PL 90 | swimmen swimme V;IND;PRS;1;SG 91 | enpfinden enpfünde V;SBJV;PST;3;SG 92 | swimmen swimme V;SBJV;PRS;1;SG 93 | swern swüret V;SBJV;PST;2;PL 94 | beginnen beginnest V;IND;PRS;2;SG 95 | binden bindet V;SBJV;PRS;3;SG 96 | beginnen begünnen V;SBJV;PST;3;PL 97 | singen singen V;SBJV;PRS;3;PL 98 | singen singe V;IND;PRS;1;SG 99 | werfen wërfen V;SBJV;PRS;3;PL 100 | singen singest V;IND;PRS;2;SG 101 | -------------------------------------------------------------------------------- /data/middle-high-german-train-medium: -------------------------------------------------------------------------------- 1 | werden wurdet V;IND;PST;2;PL 2 | triegen trieget V;SBJV;PRS;2;PL 3 | binden binde V;IND;PRS;1;SG 4 | singen singet V;IND;PRS;3;SG 5 | verswinden verswindende V.PTCP;PRS 6 | burcgrave burcgrâven N;DAT;SG 7 | bruoder bruoder N;NOM;SG 8 | betriegen betriugest V;IND;PRS;2;SG 9 | slahen slage V;SBJV;PRS;1;SG 10 | swern sweret V;IND;PRS;3;SG 11 | grave grâven N;NOM;PL 12 | slahen slaget V;SBJV;PRS;2;PL 13 | trinken trinken V;IND;PRS;1;PL 14 | enpfinden enpfunden V;IND;PST;3;PL 15 | werfen wërfet V;SBJV;PRS;2;PL 16 | wahsen wahset V;SBJV;PRS;3;SG 17 | tac tac N;ACC;SG 18 | swern swuoren V;IND;PST;3;PL 19 | tac tage N;NOM;PL 20 | slahen slegest V;IND;PRS;2;SG 21 | betriegen betriegen V;NFIN 22 | helfen hëlfen V;NFIN 23 | werfen würfen V;SBJV;PST;1;PL 24 | vinden vünde V;IND;PST;2;SG 25 | trinken trinket V;IND;PRS;2;PL 26 | werfen geworfen V.PTCP;PST 27 | beginnen beginnet V;IMP;2;PL 28 | slahen slagen V;NFIN 29 | burcgrave burcgrâven N;NOM;PL 30 | ziehen ziehe V;SBJV;PRS;1;SG 31 | ziehen ziehet V;IMP;2;PL 32 | burcgrave burcgrâven N;DAT;PL 33 | swern swuoren V;IND;PST;1;PL 34 | beginnen begünnest V;SBJV;PST;2;SG 35 | swimmen swümme V;SBJV;PST;3;SG 36 | binden bündet V;SBJV;PST;2;PL 37 | betriegen betrugen V;IND;PST;3;PL 38 | engel engele N;DAT;SG 39 | singen singent V;IND;PRS;3;PL 40 | werfen wërfen V;IND;PRS;1;PL 41 | werden wirde V;IND;PRS;1;SG 42 | rinnen runnen V;IND;PST;1;PL 43 | swimmen swimment V;IND;PRS;3;PL 44 | enpfinden enpfand V;IND;PST;1;SG 45 | binden binden V;NFIN 46 | slahen slagen V;SBJV;PRS;3;PL 47 | biegen biegent V;IND;PRS;3;PL 48 | binden bindest V;SBJV;PRS;2;SG 49 | swern sweren V;SBJV;PRS;1;PL 50 | betriegen betrügen V;SBJV;PST;1;PL 51 | rinnen rinnet V;SBJV;PRS;3;SG 52 | swern sweren V;IND;PRS;1;PL 53 | swimmen swimmet V;SBJV;PRS;2;PL 54 | swern swüren V;SBJV;PST;1;PL 55 | beginnen begünne V;IND;PST;2;SG 56 | biegen biegen V;IND;PRS;1;PL 57 | helfen hëlfet V;IMP;2;PL 58 | vinden vünde V;SBJV;PST;1;SG 59 | sieden gesoten V.PTCP;PST 60 | biegen bugen V;IND;PST;3;PL 61 | vinden vind V;IMP;2;SG 62 | helfen hëlfet V;SBJV;PRS;3;SG 63 | swern swernde V.PTCP;PRS 64 | kiesen kiesen V;NFIN 65 | trinken trinkest V;IND;PRS;2;SG 66 | ziehen ziehende V.PTCP;PRS 67 | werfen wërfen V;SBJV;PRS;1;PL 68 | swimmen swimmende V.PTCP;PRS 69 | enpfinden 
enpfünde V;SBJV;PST;1;SG 70 | sieden sutet V;IND;PST;2;PL 71 | überganc überganc N;ACC;SG 72 | sieden süte V;IND;PST;2;SG 73 | sieden sieden V;IND;PRS;1;PL 74 | werden wërden V;SBJV;PRS;3;PL 75 | trinken trinken V;NFIN 76 | werden würden V;SBJV;PST;1;PL 77 | rinnen rinnent V;IND;PRS;3;PL 78 | swern swere V;SBJV;PRS;1;SG 79 | binden bindende V.PTCP;PRS 80 | swern swüre V;SBJV;PST;3;SG 81 | wahsen wahsen V;IND;PRS;1;PL 82 | werfen würfe V;IND;PST;2;SG 83 | betriegen betriegen V;IND;PRS;1;PL 84 | ziehen zühest V;SBJV;PST;2;SG 85 | triegen trüge V;IND;PST;2;SG 86 | helfen hülfen V;SBJV;PST;1;PL 87 | swimmen swamm V;IND;PST;3;SG 88 | überganc übergange N;DAT;SG 89 | triegen trüget V;SBJV;PST;2;PL 90 | swimmen swimme V;IND;PRS;1;SG 91 | enpfinden enpfünde V;SBJV;PST;3;SG 92 | swimmen swimme V;SBJV;PRS;1;SG 93 | swern swüret V;SBJV;PST;2;PL 94 | beginnen beginnest V;IND;PRS;2;SG 95 | binden bindet V;SBJV;PRS;3;SG 96 | beginnen begünnen V;SBJV;PST;3;PL 97 | singen singen V;SBJV;PRS;3;PL 98 | singen singe V;IND;PRS;1;SG 99 | werfen wërfen V;SBJV;PRS;3;PL 100 | singen singest V;IND;PRS;2;SG 101 | göu göu N;NOM;SG 102 | tac tagen N;DAT;PL 103 | werden wërde V;SBJV;PRS;1;SG 104 | werden wërdet V;SBJV;PRS;2;PL 105 | werden geworden V.PTCP;PST 106 | swern swer V;IMP;2;SG 107 | betriegen betrieget V;SBJV;PRS;2;PL 108 | werden wërdende V.PTCP;PRS 109 | slahen geslagen V.PTCP;PST 110 | swern sweren V;SBJV;PRS;3;PL 111 | singen singen V;IMP;1;PL 112 | biegen bügest V;SBJV;PST;2;SG 113 | beginnen beginnet V;SBJV;PRS;3;SG 114 | helfen hëlfent V;IND;PRS;3;PL 115 | sieden sieden V;IMP;1;PL 116 | vinden vunden V;IND;PST;3;PL 117 | göu göuwe N;GEN;PL 118 | trinken trinket V;IMP;2;PL 119 | wahsen wuohs V;IND;PST;3;SG 120 | biegen biegen V;IMP;1;PL 121 | göugrave göugrâven N;NOM;PL 122 | ziehen ziehet V;IND;PRS;2;PL 123 | ziehen gezogen V.PTCP;PST 124 | betriegen betrieget V;IMP;2;PL 125 | trinken trünke V;SBJV;PST;3;SG 126 | göugrave göugrâven N;ACC;PL 127 | göugrave göugrâve N;NOM;SG 128 | swern swürest V;SBJV;PST;2;SG 129 | beginnen beginnen V;NFIN 130 | enpfinden enpfindet V;IND;PRS;2;PL 131 | werden wërden V;NFIN 132 | slahen slüegen V;SBJV;PST;1;PL 133 | ziehen zuhet V;IND;PST;2;PL 134 | enpfinden enpfinden V;SBJV;PRS;1;PL 135 | slahen sluogen V;IND;PST;1;PL 136 | swimmen swummen V;IND;PST;1;PL 137 | göu göuwes N;GEN;SG 138 | beginnen begunnen V.PTCP;PST 139 | sieden süten V;SBJV;PST;1;PL 140 | rinnen rinnen V;IND;PRS;1;PL 141 | betriegen betriuge V;IND;PRS;1;SG 142 | slahen slaget V;SBJV;PRS;3;SG 143 | biegen biegen V;SBJV;PRS;3;PL 144 | kiesen kurn V;IND;PST;1;PL 145 | tac tage N;GEN;PL 146 | triegen trugen V;IND;PST;1;PL 147 | enpfinden enpfinden V;IMP;1;PL 148 | kiesen kiesen V;SBJV;PRS;1;PL 149 | slahen slüege V;IND;PST;2;SG 150 | enpfinden enpfindet V;IMP;2;PL 151 | göu göuwe N;DAT;SG 152 | wahsen wüehse V;SBJV;PST;3;SG 153 | rinnen runnet V;IND;PST;2;PL 154 | werden ward V;IND;PST;3;SG 155 | betriegen betrogen V.PTCP;PST 156 | singen süngen V;SBJV;PST;1;PL 157 | werden würde V;IND;PST;2;SG 158 | helfen hëlfen V;IND;PRS;1;PL 159 | triegen triegen V;SBJV;PRS;3;PL 160 | betriegen betriuget V;IND;PRS;3;SG 161 | enpfinden enpfündest V;SBJV;PST;2;SG 162 | swern swere V;IND;PRS;1;SG 163 | wahsen gewahsen V.PTCP;PST 164 | göu göuwen N;DAT;PL 165 | engel engele N;ACC;PL 166 | kiesen kiesen V;IMP;1;PL 167 | enpfinden enpfindet V;SBJV;PRS;2;PL 168 | verswinden verswindent V;IND;PRS;3;PL 169 | trinken trinken V;SBJV;PRS;3;PL 170 | werfen wirf V;IMP;2;SG 171 | ziehen zühen V;SBJV;PST;1;PL 172 | sieden süte 
V;SBJV;PST;1;SG 173 | binden bindet V;IND;PRS;2;PL 174 | verswinden verswünde V;IND;PST;2;SG 175 | swimmen swimmen V;NFIN 176 | rinnen rinne V;SBJV;PRS;1;SG 177 | überganc überganc N;NOM;SG 178 | verswinden verswinden V;NFIN 179 | tac tages N;GEN;SG 180 | kiesen küsen V;SBJV;PST;3;PL 181 | ziehen ziehest V;SBJV;PRS;2;SG 182 | singen singe V;SBJV;PRS;1;SG 183 | wahsen wüehse V;IND;PST;2;SG 184 | burcgrave burcgrâven N;ACC;PL 185 | betriegen betriege V;SBJV;PRS;1;SG 186 | burcgrave burcgrâven N;GEN;PL 187 | ziehen ziehen V;SBJV;PRS;3;PL 188 | singen sungen V;IND;PST;1;PL 189 | singen sünge V;IND;PST;2;SG 190 | rinnen rinnest V;IND;PRS;2;SG 191 | biegen bugen V;IND;PST;1;PL 192 | vinden vinden V;SBJV;PRS;1;PL 193 | singen sunget V;IND;PST;2;PL 194 | wahsen wahs V;IMP;2;SG 195 | singen sünge V;SBJV;PST;3;SG 196 | swimmen swimmet V;IND;PRS;3;SG 197 | werden wurden V;IND;PST;1;PL 198 | rinnen rinnen V;SBJV;PRS;3;PL 199 | grave grâven N;ACC;SG 200 | triegen trouc V;IND;PST;3;SG 201 | biegen bouc V;IND;PST;3;SG 202 | enpfinden enpfindent V;IND;PRS;3;PL 203 | triegen triege V;SBJV;PRS;1;SG 204 | vinden vünden V;SBJV;PST;3;PL 205 | triegen triegen V;SBJV;PRS;1;PL 206 | enpfinden enpfindet V;SBJV;PRS;3;SG 207 | triegen trugen V;IND;PST;3;PL 208 | helfen hëlfet V;IND;PRS;2;PL 209 | vinden vindent V;IND;PRS;3;PL 210 | triegen triegent V;IND;PRS;3;PL 211 | werfen wërfende V.PTCP;PRS 212 | werfen würfen V;SBJV;PST;3;PL 213 | wahsen wuohs V;IND;PST;1;SG 214 | ziehen ziehen V;IMP;1;PL 215 | swimmen swimmen V;IMP;1;PL 216 | enpfinden enpfindet V;IND;PRS;3;SG 217 | vinden vunden V;IND;PST;1;PL 218 | sieden sieden V;NFIN 219 | biegen biuc V;IMP;2;SG 220 | slahen slagen V;SBJV;PRS;1;PL 221 | kiesen kiesent V;IND;PRS;3;PL 222 | beginnen begunnen V;IND;PST;1;PL 223 | sieden suten V;IND;PST;3;PL 224 | helfen hëlfe V;SBJV;PRS;1;SG 225 | biegen biegest V;SBJV;PRS;2;SG 226 | vinden vindet V;IND;PRS;2;PL 227 | triegen triegen V;NFIN 228 | wahsen wahset V;IND;PRS;2;PL 229 | kiesen kieset V;SBJV;PRS;2;PL 230 | swimmen swimm V;IMP;2;SG 231 | swimmen swümmen V;SBJV;PST;1;PL 232 | binden binden V;IND;PRS;1;PL 233 | swern sweren V;IMP;1;PL 234 | biegen bieget V;SBJV;PRS;2;PL 235 | ziehen zühet V;SBJV;PST;2;PL 236 | enpfinden enpfünden V;SBJV;PST;3;PL 237 | göugrave göugrâven N;DAT;PL 238 | sieden süten V;SBJV;PST;3;PL 239 | verswinden verswinde V;IND;PRS;1;SG 240 | kiesen küset V;SBJV;PST;2;PL 241 | vinden vinde V;SBJV;PRS;1;SG 242 | betriegen betrieget V;IND;PRS;2;PL 243 | enpfinden enpfunden V.PTCP;PST 244 | grave grâven N;DAT;PL 245 | trinken trünke V;IND;PST;2;SG 246 | singen singest V;SBJV;PRS;2;SG 247 | werfen wërfent V;IND;PRS;3;PL 248 | überganc übergange N;NOM;PL 249 | swern swern V;NFIN 250 | kiesen kieset V;IND;PRS;2;PL 251 | beginnen beginnende V.PTCP;PRS 252 | verswinden verswindet V;SBJV;PRS;2;PL 253 | triegen triuge V;IND;PRS;1;SG 254 | singen singet V;IMP;2;PL 255 | rinnen rinne V;IND;PRS;1;SG 256 | werfen wirfet V;IND;PRS;3;SG 257 | rinnen rünne V;SBJV;PST;3;SG 258 | beginnen beginnen V;IMP;1;PL 259 | trinken trunket V;IND;PST;2;PL 260 | werden wërden V;IND;PRS;1;PL 261 | swimmen swimmen V;SBJV;PRS;1;PL 262 | rinnen rünnen V;SBJV;PST;1;PL 263 | swimmen swamm V;IND;PST;1;SG 264 | rinnen rann V;IND;PST;1;SG 265 | grave grâven N;DAT;SG 266 | swern swerest V;IND;PRS;2;SG 267 | verswinden verswinde V;SBJV;PRS;1;SG 268 | tac tage N;DAT;SG 269 | werden würde V;SBJV;PST;1;SG 270 | swimmen swimmet V;IND;PRS;2;PL 271 | sieden sütest V;SBJV;PST;2;SG 272 | singen sang V;IND;PST;3;SG 273 | engel engel N;NOM;SG 274 | 
trinken trünken V;SBJV;PST;1;PL 275 | swimmen swümmen V;SBJV;PST;3;PL 276 | burcgrave burcgrâven N;GEN;SG 277 | göugrave göugrâven N;ACC;SG 278 | triegen triugest V;IND;PRS;2;SG 279 | swern swuoret V;IND;PST;2;PL 280 | überganc übergange N;ACC;PL 281 | engel engel N;ACC;SG 282 | singen sungen V;IND;PST;3;PL 283 | swimmen swümme V;SBJV;PST;1;SG 284 | engel engeles N;GEN;SG 285 | swern sweret V;IND;PRS;2;PL 286 | singen gesungen V.PTCP;PST 287 | swimmen swimmet V;SBJV;PRS;3;SG 288 | rinnen gerunnen V.PTCP;PST 289 | werfen wirfe V;IND;PRS;1;SG 290 | rinnen rünnest V;SBJV;PST;2;SG 291 | trinken trinke V;IND;PRS;1;SG 292 | helfen hülfen V;SBJV;PST;3;PL 293 | singen süngen V;SBJV;PST;3;PL 294 | rinnen rinnest V;SBJV;PRS;2;SG 295 | rinnen rünnet V;SBJV;PST;2;PL 296 | verswinden verswand V;IND;PST;3;SG 297 | beginnen begünne V;SBJV;PST;1;SG 298 | kiesen küsest V;SBJV;PST;2;SG 299 | burcgrave burcgrâve N;NOM;SG 300 | kiesen kius V;IMP;2;SG 301 | biegen büget V;SBJV;PST;2;PL 302 | verswinden verswünden V;SBJV;PST;3;PL 303 | ziehen zôch V;IND;PST;1;SG 304 | werden wërden V;SBJV;PRS;1;PL 305 | triegen trügen V;SBJV;PST;1;PL 306 | kiesen küse V;SBJV;PST;1;SG 307 | triegen triegest V;SBJV;PRS;2;SG 308 | enpfinden enpfinden V;IND;PRS;1;PL 309 | singen singet V;SBJV;PRS;2;PL 310 | biegen biegende V.PTCP;PRS 311 | verswinden verswunden V;IND;PST;1;PL 312 | kiesen kieset V;IMP;2;PL 313 | grave grâven N;GEN;SG 314 | rinnen rünne V;SBJV;PST;1;SG 315 | swimmen swimmen V;IND;PRS;1;PL 316 | ziehen ziehet V;SBJV;PRS;2;PL 317 | ziehen zuhen V;IND;PST;1;PL 318 | swern swuor V;IND;PST;3;SG 319 | werden wird V;IMP;2;SG 320 | wahsen wuohset V;IND;PST;2;PL 321 | wahsen wahsent V;IND;PRS;3;PL 322 | bruoder bruodere N;DAT;SG 323 | slahen slagende V.PTCP;PRS 324 | werden würden V;SBJV;PST;3;PL 325 | wahsen wahsen V;IMP;1;PL 326 | göugrave göugrâven N;GEN;SG 327 | betriegen betrouc V;IND;PST;3;SG 328 | betriegen betrüge V;SBJV;PST;1;SG 329 | werfen würfet V;SBJV;PST;2;PL 330 | vinden vinden V;IND;PRS;1;PL 331 | helfen hëlfen V;SBJV;PRS;3;PL 332 | slahen slüegest V;SBJV;PST;2;SG 333 | sieden siudet V;IND;PRS;3;SG 334 | wahsen wahsen V;SBJV;PRS;1;PL 335 | göu göu N;ACC;SG 336 | rinnen rinnet V;IMP;2;PL 337 | kiesen küse V;IND;PST;2;SG 338 | beginnen beginnen V;SBJV;PRS;3;PL 339 | bruoder bruoder N;ACC;SG 340 | helfen hülfet V;SBJV;PST;2;PL 341 | werfen wërfen V;IMP;1;PL 342 | triegen trügest V;SBJV;PST;2;SG 343 | trinken trinken V;SBJV;PRS;1;PL 344 | sieden siude V;IND;PRS;1;SG 345 | tac tage N;ACC;PL 346 | kiesen kurn V;IND;PST;3;PL 347 | verswinden verswinden V;IMP;1;PL 348 | kiesen kiesende V.PTCP;PRS 349 | binden bunden V;IND;PST;3;PL 350 | rinnen rinnende V.PTCP;PRS 351 | trinken trünkest V;SBJV;PST;2;SG 352 | trinken trinket V;SBJV;PRS;3;SG 353 | helfen hülfe V;SBJV;PST;3;SG 354 | ziehen zuhen V;IND;PST;3;PL 355 | biegen biege V;SBJV;PRS;1;SG 356 | binden bünden V;SBJV;PST;3;PL 357 | vinden vand V;IND;PST;1;SG 358 | ziehen ziehet V;SBJV;PRS;3;SG 359 | singen singen V;NFIN 360 | sieden siedet V;SBJV;PRS;3;SG 361 | slahen slage V;IND;PRS;1;SG 362 | bruoder brüedern N;DAT;PL 363 | verswinden verswinden V;SBJV;PRS;3;PL 364 | ziehen ziuch V;IMP;2;SG 365 | slahen sluoget V;IND;PST;2;PL 366 | binden bindet V;SBJV;PRS;2;PL 367 | helfen hilfe V;IND;PRS;1;SG 368 | betriegen betrouc V;IND;PST;1;SG 369 | helfen hülfe V;IND;PST;2;SG 370 | enpfinden enpfünde V;IND;PST;2;SG 371 | slahen sluoc V;IND;PST;1;SG 372 | rinnen rinnet V;SBJV;PRS;2;PL 373 | slahen sluoc V;IND;PST;3;SG 374 | helfen geholfen V.PTCP;PST 375 | sieden sieden 
V;SBJV;PRS;1;PL 376 | beginnen begann V;IND;PST;1;SG 377 | verswinden verswunden V;IND;PST;3;PL 378 | betriegen betrüge V;IND;PST;2;SG 379 | triegen trügen V;SBJV;PST;3;PL 380 | vinden vinde V;IND;PRS;1;SG 381 | singen singet V;SBJV;PRS;3;SG 382 | helfen hilfet V;IND;PRS;3;SG 383 | verswinden verswündet V;SBJV;PST;2;PL 384 | rinnen rinn V;IMP;2;SG 385 | vinden vindest V;IND;PRS;2;SG 386 | vinden vündet V;SBJV;PST;2;PL 387 | überganc überganges N;GEN;SG 388 | wahsen wüehsest V;SBJV;PST;2;SG 389 | betriegen betrieget V;SBJV;PRS;3;SG 390 | sieden suten V;IND;PST;1;PL 391 | binden binden V;SBJV;PRS;3;PL 392 | betriegen betriegen V;SBJV;PRS;1;PL 393 | trinken trank V;IND;PST;3;SG 394 | singen süngest V;SBJV;PST;2;SG 395 | singen singende V.PTCP;PRS 396 | beginnen beginn V;IMP;2;SG 397 | helfen hulfen V;IND;PST;3;PL 398 | singen sang V;IND;PST;1;SG 399 | werfen wërfet V;IND;PRS;2;PL 400 | beginnen begünnen V;SBJV;PST;1;PL 401 | rinnen rinnen V;SBJV;PRS;1;PL 402 | trinken trünket V;SBJV;PST;2;PL 403 | biegen bügen V;SBJV;PST;3;PL 404 | sieden siedest V;SBJV;PRS;2;SG 405 | ziehen ziehen V;IND;PRS;1;PL 406 | verswinden verswand V;IND;PST;1;SG 407 | beginnen beginne V;SBJV;PRS;1;SG 408 | werfen warf V;IND;PST;1;SG 409 | wahsen wahsest V;SBJV;PRS;2;SG 410 | triegen trieget V;IND;PRS;2;PL 411 | bruoder brüeder N;ACC;PL 412 | wahsen wahse V;SBJV;PRS;1;SG 413 | swern swüre V;IND;PST;2;SG 414 | biegen bügen V;SBJV;PST;1;PL 415 | werfen würfest V;SBJV;PST;2;SG 416 | triegen trouc V;IND;PST;1;SG 417 | verswinden verswindet V;IND;PRS;3;SG 418 | beginnen begünne V;SBJV;PST;3;SG 419 | werfen wërfe V;SBJV;PRS;1;SG 420 | bruoder brüeder N;NOM;PL 421 | wahsen wüehsen V;SBJV;PST;3;PL 422 | betriegen betriegen V;IMP;1;PL 423 | vinden vindende V.PTCP;PRS 424 | kiesen kiesen V;IND;PRS;1;PL 425 | trinken trinkende V.PTCP;PRS 426 | beginnen beginnest V;SBJV;PRS;2;SG 427 | ziehen ziuhest V;IND;PRS;2;SG 428 | wahsen wahsende V.PTCP;PRS 429 | binden bindent V;IND;PRS;3;PL 430 | swern swerent V;IND;PRS;3;PL 431 | triegen triegen V;IMP;1;PL 432 | verswinden verswunden V.PTCP;PST 433 | werden wërdent V;IND;PRS;3;PL 434 | binden bindest V;IND;PRS;2;SG 435 | slahen sleget V;IND;PRS;3;SG 436 | werden wërdet V;IMP;2;PL 437 | kiesen kuret V;IND;PST;2;PL 438 | helfen hilf V;IMP;2;SG 439 | rinnen rünne V;IND;PST;2;SG 440 | singen singen V;SBJV;PRS;1;PL 441 | werfen wërfet V;IMP;2;PL 442 | werfen wirfest V;IND;PRS;2;SG 443 | betriegen betriegent V;IND;PRS;3;PL 444 | swimmen swümmest V;SBJV;PST;2;SG 445 | wahsen wahse V;IND;PRS;1;SG 446 | wahsen wahsen V;NFIN 447 | helfen half V;IND;PST;3;SG 448 | swimmen swimmen V;SBJV;PRS;3;PL 449 | verswinden verswinden V;SBJV;PRS;1;PL 450 | werden wërdest V;SBJV;PRS;2;SG 451 | betriegen betriegen V;SBJV;PRS;3;PL 452 | ziehen zühe V;IND;PST;2;SG 453 | swimmen swimmest V;SBJV;PRS;2;SG 454 | trinken trinken V;IMP;1;PL 455 | triegen triegen V;IND;PRS;1;PL 456 | beginnen beginnen V;SBJV;PRS;1;PL 457 | überganc übergangen N;DAT;PL 458 | sieden siudest V;IND;PRS;2;SG 459 | enpfinden enpfand V;IND;PST;3;SG 460 | ziehen ziehen V;NFIN 461 | slahen slac V;IMP;2;SG 462 | vinden vinden V;SBJV;PRS;3;PL 463 | werfen wurfen V;IND;PST;1;PL 464 | engel engelen N;DAT;PL 465 | vinden vündest V;SBJV;PST;2;SG 466 | ziehen zühe V;SBJV;PST;3;SG 467 | betriegen betrugen V;IND;PST;1;PL 468 | enpfinden enpfünden V;SBJV;PST;1;PL 469 | wahsen wüehse V;SBJV;PST;1;SG 470 | trinken trink V;IMP;2;SG 471 | göugrave göugrâven N;DAT;SG 472 | verswinden verswind V;IMP;2;SG 473 | binden gebunden V.PTCP;PST 474 | helfen hulfen 
V;IND;PST;1;PL 475 | vinden vand V;IND;PST;3;SG 476 | trinken trinkent V;IND;PRS;3;PL 477 | ziehen zühe V;SBJV;PST;1;SG 478 | binden binden V;SBJV;PRS;1;PL 479 | slahen slagen V;IMP;1;PL 480 | swern sweret V;SBJV;PRS;3;SG 481 | göu göuwe N;NOM;PL 482 | werfen warf V;IND;PST;3;SG 483 | enpfinden enpfindest V;SBJV;PRS;2;SG 484 | triegen trieget V;IMP;2;PL 485 | verswinden verswünde V;SBJV;PST;3;SG 486 | betriegen betrüget V;SBJV;PST;2;PL 487 | sieden siedent V;IND;PRS;3;PL 488 | enpfinden enpfündet V;SBJV;PST;2;PL 489 | werden wërdet V;IND;PRS;2;PL 490 | slahen slagen V;IND;PRS;1;PL 491 | betriegen betrügest V;SBJV;PST;2;SG 492 | swern swüre V;SBJV;PST;1;SG 493 | beginnen beginnent V;IND;PRS;3;PL 494 | triegen getrogen V.PTCP;PST 495 | binden bünde V;SBJV;PST;1;SG 496 | kiesen kiusest V;IND;PRS;2;SG 497 | helfen hëlfet V;SBJV;PRS;2;PL 498 | vinden vindet V;SBJV;PRS;3;SG 499 | biegen biugest V;IND;PRS;2;SG 500 | triegen triegende V.PTCP;PRS 501 | werden wirdet V;IND;PRS;3;SG 502 | rinnen rünnen V;SBJV;PST;3;PL 503 | biegen gebogen V.PTCP;PST 504 | kiesen kiese V;SBJV;PRS;1;SG 505 | beginnen begünnet V;SBJV;PST;2;PL 506 | verswinden verswündest V;SBJV;PST;2;SG 507 | swern sweret V;IMP;2;PL 508 | grave grâven N;ACC;PL 509 | kiesen kôs V;IND;PST;3;SG 510 | slahen slagent V;IND;PRS;3;PL 511 | biegen bouc V;IND;PST;1;SG 512 | enpfinden enpfundet V;IND;PST;2;PL 513 | kiesen gekorn V.PTCP;PST 514 | sieden siedet V;IMP;2;PL 515 | göu göuwe N;ACC;PL 516 | biegen biegen V;SBJV;PRS;1;PL 517 | verswinden verswundet V;IND;PST;2;PL 518 | biegen biegen V;NFIN 519 | helfen hulfet V;IND;PST;2;PL 520 | beginnen beginnet V;IND;PRS;2;PL 521 | verswinden verswinden V;IND;PRS;1;PL 522 | wahsen wüehsen V;SBJV;PST;1;PL 523 | biegen büge V;SBJV;PST;1;SG 524 | enpfinden enpfindest V;IND;PRS;2;SG 525 | wahsen wüehset V;SBJV;PST;2;PL 526 | engel engele N;NOM;PL 527 | beginnen beginne V;IND;PRS;1;SG 528 | rinnen rinnet V;IND;PRS;3;SG 529 | biegen bieget V;SBJV;PRS;3;SG 530 | verswinden verswindet V;IMP;2;PL 531 | werden würdet V;SBJV;PST;2;PL 532 | kiesen kiesen V;SBJV;PRS;3;PL 533 | triegen trieget V;SBJV;PRS;3;SG 534 | betriegen betrügen V;SBJV;PST;3;PL 535 | swern swüren V;SBJV;PST;3;PL 536 | betriegen betriegende V.PTCP;PRS 537 | kiesen kiuse V;IND;PRS;1;SG 538 | beginnen beginnet V;IND;PRS;3;SG 539 | biegen bieget V;IND;PRS;2;PL 540 | swimmen swummet V;IND;PST;2;PL 541 | binden bünden V;SBJV;PST;1;PL 542 | überganc übergange N;GEN;PL 543 | binden bundet V;IND;PST;2;PL 544 | slahen slüegen V;SBJV;PST;3;PL 545 | swern geswarn V.PTCP;PST 546 | vinden vundet V;IND;PST;2;PL 547 | singen sünget V;SBJV;PST;2;PL 548 | werfen wërfen V;NFIN 549 | biegen büge V;IND;PST;2;SG 550 | tac tac N;NOM;SG 551 | binden band V;IND;PST;1;SG 552 | sieden sieden V;SBJV;PRS;3;PL 553 | wahsen wahset V;IMP;2;PL 554 | sieden siedet V;IND;PRS;2;PL 555 | swern swuor V;IND;PST;1;SG 556 | biegen biuget V;IND;PRS;3;SG 557 | sieden süde V;SBJV;PST;3;SG 558 | sieden sôt V;IND;PST;3;SG 559 | verswinden verswünde V;SBJV;PST;1;SG 560 | ziehen ziuhe V;IND;PRS;1;SG 561 | betriegen betruget V;IND;PST;2;PL 562 | vinden vinden V;IMP;1;PL 563 | slahen slaget V;IMP;2;PL 564 | rinnen rann V;IND;PST;3;SG 565 | sieden siut V;IMP;2;SG 566 | bruoder bruoders N;GEN;SG 567 | swern swerest V;SBJV;PRS;2;SG 568 | binden bündest V;SBJV;PST;2;SG 569 | wahsen wuohsen V;IND;PST;1;PL 570 | wahsen wehset V;IND;PRS;3;SG 571 | slahen slüege V;SBJV;PST;1;SG 572 | grave grâven N;GEN;PL 573 | triegen triuget V;IND;PRS;3;SG 574 | vinden vünden V;SBJV;PST;1;PL 575 | trinken getrunken 
V.PTCP;PST 576 | vinden vünde V;SBJV;PST;3;SG 577 | wahsen wuohsen V;IND;PST;3;PL 578 | werfen würfe V;SBJV;PST;1;SG 579 | kiesen küse V;SBJV;PST;3;SG 580 | binden bünde V;SBJV;PST;3;SG 581 | trinken trünke V;SBJV;PST;1;SG 582 | wahsen wehsest V;IND;PRS;2;SG 583 | werfen wërfet V;SBJV;PRS;3;SG 584 | swern sweret V;SBJV;PRS;2;PL 585 | ziehen ziuhet V;IND;PRS;3;SG 586 | swimmen swimmest V;IND;PRS;2;SG 587 | biegen bieget V;IMP;2;PL 588 | swimmen swummen V;IND;PST;3;PL 589 | beginnen begunnen V;IND;PST;3;PL 590 | sieden siede V;SBJV;PRS;1;SG 591 | rinnen rinnet V;IND;PRS;2;PL 592 | beginnen begann V;IND;PST;3;SG 593 | helfen hëlfen V;SBJV;PRS;1;PL 594 | sieden siedet V;SBJV;PRS;2;PL 595 | -------------------------------------------------------------------------------- /data/old-french-train-low: -------------------------------------------------------------------------------- 1 | chevalchier chevalchast V;3;SG;SBJV;PST;IPFV 2 | chever cheve V;2;SG;IMP 3 | esbaneier esbaneiissiez V;2;PL;SBJV;PST;IPFV;LGSPEC3 4 | coarder coardeve V;IND;PST;1;SG;IPFV;LGSPEC4 5 | travailler travaillez V;IND;PRS;2;PL 6 | atirier atiroies V;IND;PST;2;SG;IPFV;LGSPEC1 7 | larmoiier larmoieves V;IND;PST;2;SG;IPFV;LGSPEC5 8 | nettoiier nettis V;2;SG;SBJV;PRS;LGSPEC1 9 | sivre sevroie V;1;SG;COND;LGSPEC1 10 | desfigurer desfigureriiez V;2;PL;COND;LGSPEC1 11 | desturber desturber V;NFIN 12 | descupler descupleras V;2;SG;IND;FUT 13 | trichier trichastes V;IND;PST;2;PL;PFV 14 | tirer tireriens V;1;PL;COND;LGSPEC2 15 | laier lai V;1;SG;SBJV;PRS 16 | barguigner barguignoe V;IND;PST;1;SG;IPFV;LGSPEC3 17 | desmembrer desmembras V;IND;PST;2;SG;PFV 18 | laiier laions V;1;PL;IMP 19 | estriver estrivereient V;3;PL;COND;LGSPEC2 20 | aduster adustes V;IND;PRS;2;SG 21 | preier prei V;1;SG;SBJV;PRS;LGSPEC2 22 | descloer descloeras V;2;SG;IND;FUT 23 | plungier plunjoie V;IND;PST;1;SG;IPFV;LGSPEC1 24 | doctriner doctrinassent V;3;PL;SBJV;PST;IPFV 25 | duner dunront V;3;PL;IND;FUT;LGSPEC1 26 | umilier umilïereit V;3;SG;COND;LGSPEC2 27 | plongier plongeriiez V;2;PL;COND;LGSPEC1 28 | chulchier chulcheriens V;1;PL;COND;LGSPEC2 29 | tumer tumeroie V;1;SG;COND;LGSPEC1 30 | governer governeie V;IND;PST;1;SG;IPFV;LGSPEC2 31 | alaitier alaiteriiez V;2;PL;COND;LGSPEC1 32 | abrier abriot V;IND;PST;3;SG;IPFV;LGSPEC3 33 | enfrener enfreneroiz V;2;PL;IND;FUT;LGSPEC1 34 | ganchir ganchiroient V;3;PL;COND;LGSPEC1 35 | purgier purgievent V;IND;PST;3;PL;IPFV;LGSPEC4 36 | manecier maneçoies V;IND;PST;2;SG;IPFV;LGSPEC1 37 | amesurer amesuroient V;IND;PST;3;PL;IPFV;LGSPEC1 38 | plaider plaidasse V;1;SG;SBJV;PST;IPFV 39 | lever levai V;IND;PST;1;SG;PFV 40 | alegier alejast V;3;SG;SBJV;PST;IPFV 41 | glotir glotirent V;IND;PST;3;PL;PFV 42 | pourparler pourparlereit V;3;SG;COND;LGSPEC2 43 | estandre estanz V;IND;PRS;2;SG 44 | luisir luiseit V;IND;PST;3;SG;IPFV;LGSPEC2 45 | rompre rompierent V;IND;PST;3;PL;PFV 46 | prepenser prepensent V;3;PL;SBJV;PRS 47 | croire creïssoiz V;2;PL;SBJV;PST;IPFV;LGSPEC4 48 | awaitier awaitasses V;2;SG;SBJV;PST;IPFV 49 | apaier apais V;2;SG;SBJV;PRS 50 | laver les V;2;SG;SBJV;PRS 51 | desafubler desafublereit V;3;SG;COND;LGSPEC2 52 | encreistre encreüsses V;2;SG;SBJV;PST;IPFV 53 | compleindre compleindroit V;3;SG;COND;LGSPEC1 54 | conseller consellerent V;IND;PST;3;PL;PFV 55 | contremander contremandiez V;IND;PST;2;PL;IPFV;LGSPEC2 56 | larmoiier larmïe V;IND;PRS;3;SG;LGSPEC1 57 | manier manïereie V;1;SG;COND;LGSPEC2 58 | despire despire V;NFIN 59 | engraignier engraigneront V;3;PL;IND;FUT 60 | prouver prouveriiez V;2;PL;COND;LGSPEC1 
61 | encountrer encountrissiez V;2;PL;SBJV;PST;IPFV;LGSPEC3 62 | taster tastent V;3;PL;SBJV;PRS 63 | atraire atraiiiez V;IND;PST;2;PL;IPFV;LGSPEC1 64 | adober adobeies V;IND;PST;2;SG;IPFV;LGSPEC2 65 | reclamer reclameient V;IND;PST;3;PL;IPFV;LGSPEC2 66 | sunger sungereient V;3;PL;COND;LGSPEC2 67 | creindre creinsissent V;3;PL;SBJV;PST;IPFV;LGSPEC1 68 | deraisnier deraisnissons V;1;PL;SBJV;PST;IPFV;LGSPEC1 69 | cuntenir en cuntenant V.CVB;PRS 70 | ganchir ganchis V;IND;PST;2;SG;PFV 71 | refaire refaimes V;IND;PRS;1;PL;LGSPEC1 72 | harigoter harigotissoiz V;2;PL;SBJV;PST;IPFV;LGSPEC1 73 | ariver ariviez V;IND;PST;2;PL;IPFV;LGSPEC2 74 | asemler asemleves V;IND;PST;2;SG;IPFV;LGSPEC4 75 | detranprer detranprons V;IND;PRS;1;PL 76 | chastier chastïeroient V;3;PL;COND;LGSPEC1 77 | replorer repleure V;IND;PRS;3;SG 78 | drescer dresceriens V;1;PL;COND;LGSPEC2 79 | amender amendent V;3;PL;SBJV;PRS 80 | antrespargnier antrespargneriez V;2;PL;COND;LGSPEC2 81 | correspondre correspondrai V;1;SG;IND;FUT 82 | comprehender comprehendoie V;IND;PST;1;SG;IPFV;LGSPEC1 83 | desploiier desploiier V;NFIN 84 | garbeler garbele V;2;SG;IMP 85 | aparoir aparu V;IND;PST;3;SG;PFV 86 | destraindre destrainsissez V;2;PL;SBJV;PST;IPFV;LGSPEC2 87 | aparoler aparoleient V;IND;PST;3;PL;IPFV;LGSPEC2 88 | parbolir parbolissez V;2;PL;SBJV;PST;IPFV;LGSPEC2 89 | redoter redoteras V;2;SG;IND;FUT 90 | parbouillir parboulissez V;2;PL;SBJV;PST;IPFV;LGSPEC2 91 | paser pasastes V;IND;PST;2;PL;PFV 92 | ouster ousterons V;1;PL;IND;FUT 93 | removoir removroiz V;2;PL;IND;FUT;LGSPEC1 94 | estouper estoupent V;3;PL;SBJV;PRS 95 | enbuscier enbusçoies V;IND;PST;2;SG;IPFV;LGSPEC1 96 | manier manïevent V;IND;PST;3;PL;IPFV;LGSPEC4 97 | soner sonerez V;2;PL;IND;FUT;LGSPEC3 98 | refaire referiez V;2;PL;COND;LGSPEC2 99 | taxer taxe V;IND;PRS;3;SG 100 | espier espia V;IND;PST;3;SG;PFV 101 | -------------------------------------------------------------------------------- /data/scottish-gaelic-covered-test: -------------------------------------------------------------------------------- 1 | nòsach ADJ;PL;DAT 2 | drùis-mhiannach ADJ;SG;MASC;VOC 3 | drùidhteach ADJ;SG;FEM;NOM 4 | Albannach ADJ;PL;NOM 5 | pròiseil ADJ;SG;MASC;GEN 6 | bòidheach ADJ;PL;DAT 7 | nòsail ADJ;SG;MASC;DAT 8 | adharcach ADJ;PL;NOM 9 | mòrail ADJ;SG;MASC;NOM 10 | nòsail ADJ;SG;FEM;DAT 11 | gruamach ADJ;PL;VOC 12 | rèidh ADJ;SG;MASC;VOC 13 | sliochdmhor ADJ;PL;VOC 14 | mòrail ADJ;SG;FEM;VOC 15 | pronn ADJ;SG;MASC;GEN 16 | fàsaichte ADJ;PL;VOC 17 | leatromach ADJ;PL;GEN 18 | feòlmhor ADJ;SG;FEM;DAT 19 | dualchasach ADJ;SG;MASC;VOC 20 | gobach ADJ;SG;MASC;VOC 21 | dòth V;COND;ACT 22 | cam ADJ;PL;VOC 23 | brèagha ADJ;SG;MASC;DAT 24 | gruamach ADJ;SG;MASC;NOM 25 | sgòideil ADJ;PL;VOC 26 | Manainneach ADJ;SG;MASC;GEN 27 | dealanach ADJ;PL;GEN 28 | sliochdmhor ADJ;PL;DAT 29 | abair V;PST;PASS 30 | cuagach ADJ;SG;MASC;NOM 31 | ioma-ghlac V;COND;ACT 32 | dualchasach ADJ;PL;GEN 33 | cuagach ADJ;SG;FEM;DAT 34 | eagalach ADJ;SG;FEM;VOC 35 | sliochdmhor ADJ;SG;FEM;NOM 36 | drùis-mhiannach ADJ;PL;VOC 37 | nòsach ADJ;SG;FEM;NOM 38 | sliochdmhor ADJ;SG;FEM;VOC 39 | Spàinneach ADJ;SG;FEM;NOM 40 | seann-nòsach ADJ;SG;FEM;DAT 41 | beir V;FUT;ACT 42 | crùb V;PST;ACT 43 | uaibhreach ADJ;SG;FEM;DAT 44 | glac V;PST;PASS 45 | brònach ADJ;SG;FEM;DAT 46 | grinn ADJ;SG;MASC;VOC 47 | nòsail ADJ;PL;NOM 48 | rìomhach ADJ;SG;FEM;GEN 49 | sgèimheach ADJ;PL;NOM 50 | sìolmhor ADJ;SG;MASC;NOM 51 | -------------------------------------------------------------------------------- /data/scottish-gaelic-dev: 
-------------------------------------------------------------------------------- 1 | Breatannach Breatannach ADJ;PL;NOM 2 | rìomhach rìomhach ADJ;SG;MASC;NOM 3 | stiùir stiùirteadh V;COND;PASS 4 | ruadh ruadha ADJ;PL;GEN 5 | blàth bhlàith ADJ;SG;FEM;VOC 6 | uaibhreach uaibhreach ADJ;SG;FEM;NOM 7 | nuadh nuadh ADJ;SG;MASC;NOM 8 | dealanach dhealanich ADJ;SG;MASC;VOC 9 | maiseach maiseach ADJ;PL;NOM 10 | blàth blàtha ADJ;PL;VOC 11 | rèidh rèidh ADJ;SG;MASC;GEN 12 | fàsaichte fàsaichte ADJ;PL;DAT 13 | fàsaichte fhàsaichte ADJ;SG;MASC;VOC 14 | nuadh nuaidh ADJ;SG;MASC;GEN 15 | leatromach leatromaich ADJ;SG;FEM;VOC 16 | stiùir stiùiridh V;FUT;ACT 17 | mòrail mòraile ADJ;PL;GEN 18 | dòth a' dòthadh V;V.PTCP;PRS 19 | Spàinneach Spàinnich ADJ;SG;FEM;GEN 20 | làidir làidir ADJ;SG;FEM;VOC 21 | rèitich rèitich V;PST;ACT 22 | ruig ruigteadh V;COND;PASS 23 | geal geal ADJ;SG;MASC;NOM 24 | eagalach eagalach ADJ;PL;GEN 25 | stiùir stiùir V;PST;ACT 26 | pronn phruinn ADJ;SG;MASC;VOC 27 | cuagach cuagach ADJ;PL;NOM 28 | seann-nòsach sheann-nòsaich ADJ;SG;MASC;GEN 29 | dòth dhòth V;PST;ACT 30 | dealanach dhealanach ADJ;SG;FEM;NOM 31 | maiseach maiseach ADJ;SG;MASC;NOM 32 | sìolmhor shìolmhor ADJ;SG;MASC;DAT 33 | drùidhteach dhrùidhtich ADJ;SG;MASC;VOC 34 | adharcach adharcaich ADJ;SG;FEM;DAT 35 | crùb crùbaidh V;FUT;ACT 36 | crùbach chrùbaich ADJ;SG;MASC;GEN 37 | Albannach Albannach ADJ;SG;MASC;NOM 38 | seann-nòsach seann-nòsach ADJ;PL;VOC 39 | faigh a' faighinn V;V.PTCP;PRS 40 | caidil chaidileadh V;PST;PASS 41 | blàth bhlàith ADJ;SG;MASC;VOC 42 | Cuimreach Chuimrich ADJ;SG;MASC;VOC 43 | Spàinneach Spàinneach ADJ;SG;MASC;NOM 44 | nòsail nòsail ADJ;SG;MASC;NOM 45 | nòsach nòsach ADJ;PL;GEN 46 | fàsaichte fhàsaichte ADJ;SG;FEM;GEN 47 | feòlmhor feòlmhor ADJ;PL;DAT 48 | Breatannach Bhreatannaich ADJ;SG;FEM;GEN 49 | tòisich thòisich V;PST;ACT 50 | Cuimreach Chuimreach ADJ;SG;FEM;NOM 51 | -------------------------------------------------------------------------------- /data/scottish-gaelic-test: -------------------------------------------------------------------------------- 1 | nòsach nòsach ADJ;PL;DAT 2 | drùis-mhiannach dhrùis-mhiannaich ADJ;SG;MASC;VOC 3 | drùidhteach dhrùidhteach ADJ;SG;FEM;NOM 4 | Albannach Albannach ADJ;PL;NOM 5 | pròiseil phròiseil ADJ;SG;MASC;GEN 6 | bòidheach bòidheach ADJ;PL;DAT 7 | nòsail nòsail ADJ;SG;MASC;DAT 8 | adharcach adharcach ADJ;PL;NOM 9 | mòrail mòrail ADJ;SG;MASC;NOM 10 | nòsail nòsail ADJ;SG;FEM;DAT 11 | gruamach gruamach ADJ;PL;VOC 12 | rèidh rèidh ADJ;SG;MASC;VOC 13 | sliochdmhor sliochdmhor ADJ;PL;VOC 14 | mòrail mhòrail ADJ;SG;FEM;VOC 15 | pronn phruinn ADJ;SG;MASC;GEN 16 | fàsaichte fàsaichte ADJ;PL;VOC 17 | leatromach learomach ADJ;PL;GEN 18 | feòlmhor fheòlmhoir ADJ;SG;FEM;DAT 19 | dualchasach dhualchasaich ADJ;SG;MASC;VOC 20 | gobach ghobaich ADJ;SG;MASC;VOC 21 | dòth dhòthadh V;COND;ACT 22 | cam cama ADJ;PL;VOC 23 | brèagha bhrèagha ADJ;SG;MASC;DAT 24 | gruamach gruamach ADJ;SG;MASC;NOM 25 | sgòideil sgòideil ADJ;PL;VOC 26 | Manainneach Mhanainnich ADJ;SG;MASC;GEN 27 | dealanach dealanach ADJ;PL;GEN 28 | sliochdmhor sliochdmhor ADJ;PL;DAT 29 | abair thuirteadh V;PST;PASS 30 | cuagach cuagach ADJ;SG;MASC;NOM 31 | ioma-ghlac dh'ioma-ghlacadh V;COND;ACT 32 | dualchasach dualchasach ADJ;PL;GEN 33 | cuagach chuagaich ADJ;SG;FEM;DAT 34 | eagalach eagalaich ADJ;SG;FEM;VOC 35 | sliochdmhor shliochdmhor ADJ;SG;FEM;NOM 36 | drùis-mhiannach drùis-mhiannach ADJ;PL;VOC 37 | nòsach nòsach ADJ;SG;FEM;NOM 38 | sliochdmhor shliochdmhoire ADJ;SG;FEM;VOC 39 | Spàinneach 
Spàinneach ADJ;SG;FEM;NOM 40 | seann-nòsach sheann-nòsaich ADJ;SG;FEM;DAT 41 | beir beiridh V;FUT;ACT 42 | crùb chrùb V;PST;ACT 43 | uaibhreach uaibhrich ADJ;SG;FEM;DAT 44 | glac ghlacadh V;PST;PASS 45 | brònach bhrònaich ADJ;SG;FEM;DAT 46 | grinn ghrinn ADJ;SG;MASC;VOC 47 | nòsail nòsaile ADJ;PL;NOM 48 | rìomhach rìomhaich ADJ;SG;FEM;GEN 49 | sgèimheach sgèimheach ADJ;PL;NOM 50 | sìolmhor sìolmhor ADJ;SG;MASC;NOM 51 | -------------------------------------------------------------------------------- /data/scottish-gaelic-train-low: -------------------------------------------------------------------------------- 1 | Spàinneach Spàinneach ADJ;PL;VOC 2 | toilich a' toileachadh V;V.PTCP;PRS 3 | rèitich a' rèiteachadh V;V.PTCP;PRS 4 | eagalach eagalaich ADJ;SG;MASC;VOC 5 | uaibhreach uaibhreach ADJ;SG;MASC;NOM 6 | nòsach nòsach ADJ;SG;FEM;VOC 7 | sìolmhor shìolmhor ADJ;SG;FEM;NOM 8 | rèitich rèiticheadh V;PST;PASS 9 | eagalach eagalaich ADJ;SG;FEM;DAT 10 | beulach bheulach ADJ;SG;FEM;NOM 11 | sùgh shùgh V;PST;ACT 12 | gruamach ghruamach ADJ;SG;FEM;NOM 13 | Albannach Albannaich ADJ;SG;MASC;GEN 14 | bacach bacach ADJ;PL;DAT 15 | crùbach chrùbaich ADJ;SG;FEM;DAT 16 | brònach bhrònaich ADJ;SG;MASC;VOC 17 | Breatannach Breatannach ADJ;PL;VOC 18 | geal geala ADJ;PL;GEN 19 | rèidh rèidh ADJ;SG;MASC;NOM 20 | nòsach nòsaich ADJ;SG;MASC;GEN 21 | Albannach Albannach ADJ;PL;VOC 22 | gobach ghobach ADJ;PL;VOC 23 | stiùir stiùireadh V;PST;PASS 24 | seòl sheòl V;PST;ACT 25 | Cuimreach Cuimreach ADJ;PL;VOC 26 | ruadh ruadha ADJ;PL;VOC 27 | sìolmhor sìolmhor ADJ;PL;DAT 28 | gobach ghobaich ADJ;SG;FEM;DAT 29 | nuadh nuaidh ADJ;SG;FEM;DAT 30 | drùis-mhiannach dhrùis-mhiannaich ADJ;SG;FEM;VOC 31 | fàsaichte fhàsaichte ADJ;SG;FEM;VOC 32 | Spàinneach Spàinnich ADJ;SG;MASC;VOC 33 | nuadh nuadha ADJ;PL;NOM 34 | seòl sheòladh V;COND;ACT 35 | gobach ghobach ADJ;SG;FEM;NOM 36 | ioma-ghlac ioma-ghlacaidh V;FUT;ACT 37 | gruamach ghruamach ADJ;SG;FEM;VOC 38 | Manainneach Manainneach ADJ;PL;DAT 39 | crùbach crùbach ADJ;PL;GEN 40 | uaibhreach uaibhreach ADJ;PL;DAT 41 | pronn pronn ADJ;PL;VOC 42 | brònach bhrònach ADJ;SG;MASC;DAT 43 | maiseach mhaisich ADJ;SG;FEM;VOC 44 | ruadh ruadha ADJ;PL;NOM 45 | maiseach mhaisich ADJ;SG;MASC;VOC 46 | seann-nòsach sheann-nòsach ADJ;SG;MASC;DAT 47 | Spàinneach Spàinnich ADJ;SG;MASC;GEN 48 | brònach bhrònach ADJ;SG;FEM;VOC 49 | rèitich rèiticheadh V;COND;ACT 50 | dealanach dhealanaich ADJ;SG;FEM;VOC 51 | fionn fionna ADJ;PL;NOM 52 | grinn ghrinn ADJ;SG;FEM;VOC 53 | abair their V;FUT;ACT 54 | sùgh shùghadh V;PST;PASS 55 | bacach bacach ADJ;SG;MASC;NOM 56 | rìomhach rìomhach ADJ;SG;FEM;NOM 57 | beulach bheulach ADJ;PL;GEN 58 | Manainneach Mhanainnich ADJ;SG;MASC;VOC 59 | beir rug V;PST;ACT 60 | brèagha brèagha ADJ;PL;VOC 61 | beulach bheulach ADJ;PL;VOC 62 | sliochdmhor sliochdmhor ADJ;SG;MASC;NOM 63 | caidil chaidileadhnote V;COND;ACT 64 | grinn grinne ADJ;PL;NOM 65 | searg sheargadh V;PST;PASS 66 | grinn grinn ADJ;SG;MASC;NOM 67 | nòsail nòsail ADJ;SG;FEM;VOC 68 | ligh lighidh V;FUT;ACT 69 | brònach brònach ADJ;PL;VOC 70 | bòidheach bhòidhich ADJ;SG;MASC;GEN 71 | sgèimheach sgèimhich ADJ;SG;MASC;GEN 72 | seasg sheisge ADJ;SG;FEM;DAT 73 | miannach miannach ADJ;PL;NOM 74 | mòrail mòrail ADJ;PL;DAT 75 | sùgh sùghaidh V;FUT;ACT 76 | sgòideil sgòideile ADJ;SG;FEM;GEN 77 | Breatannach Bhreatannaich ADJ;SG;MASC;GEN 78 | geal gheal ADJ;SG;FEM;NOM 79 | crùbach chrùbach ADJ;SG;FEM;NOM 80 | dealanach dealanach ADJ;PL;VOC 81 | pròiseil phròiseil ADJ;SG;FEM;GEN 82 | fionn fionn ADJ;SG;MASC;NOM 83 | làn 
làn ADJ;SG;FEM;VOC 84 | Manainneach Manainneach ADJ;PL;NOM 85 | gruamach ghruamaich ADJ;SG;FEM;DAT 86 | stad stadtadh V;COND;PASS 87 | miannach miannaich ADJ;SG;FEM;GEN 88 | dòth dhòthadh V;PST;PASS 89 | Albannach Albannach ADJ;SG;MASC;DAT 90 | pròiseil phròiseil ADJ;SG;MASC;VOC 91 | dealanach dhealanach ADJ;SG;MASC;DAT 92 | nuadh nuadh ADJ;SG;FEM;VOC 93 | sìolmhor sìolmhor ADJ;PL;VOC 94 | faigh gheibhear V;FUT;PASS 95 | inbheach inbheach ADJ;SG;FEM;NOM 96 | cam chaim ADJ;SG;FEM;VOC 97 | pròiseil pròiseil ADJ;SG;MASC;NOM 98 | tòisich tòisichear V;FUT;PASS 99 | mòrail mhòrail ADJ;SG;MASC;DAT 100 | ruig ruigidh V;FUT;ACT 101 | -------------------------------------------------------------------------------- /data/spanish-train-low: -------------------------------------------------------------------------------- 1 | reiterar no reiteren V;NEG;IMP;3;PL 2 | machucar machucaste V;IND;PST;2;SG;PFV 3 | preguntarse pregúntese V;POS;IMP;3;SG 4 | marcir no marzáis V;NEG;IMP;2;PL 5 | jaquear jaquearíamos V;COND;1;PL 6 | decir dices V;IND;PRS;2;SG 7 | atrasar atrasasen V;SBJV;PST;3;PL 8 | escasear escasearen V;SBJV;FUT;3;PL 9 | comparar comparaba V;IND;PST;1;SG;IPFV 10 | alechugar alechugare V;SBJV;FUT;1;SG 11 | redescubrir redescubriríamos V;COND;1;PL 12 | desempeñar desempeñaremos V;IND;FUT;1;PL 13 | suscitar suscitamos V;IND;PRS;1;PL 14 | expatriar expatriara V;SBJV;PST;1;SG;LGSPEC1 15 | ejercitar ejercitas V;IND;PRS;2;SG 16 | agredir agrediendo V.CVB;PRS 17 | retraducir retradujere V;SBJV;FUT;1;SG 18 | bordar bordas V;IND;PRS;2;SG 19 | cliquear cliquearas V;SBJV;PST;2;SG;LGSPEC1 20 | alocar alocaran V;SBJV;PST;3;PL;LGSPEC1 21 | extrapolar extrapolan V;IND;PRS;3;PL 22 | gratinar gratinaréis V;IND;FUT;2;PL 23 | avanzar no avancéis V;NEG;IMP;2;PL 24 | ofrendar ofrendásemos V;SBJV;PST;1;PL 25 | aojar aojares V;SBJV;FUT;2;SG 26 | editar editáramos V;SBJV;PST;1;PL;LGSPEC1 27 | entablar entablan V;IND;PRS;3;PL 28 | alivianar alivianará V;IND;FUT;3;SG 29 | perdurar perduró V;IND;PST;3;SG;PFV 30 | destorcer destorciese V;SBJV;PST;1;SG 31 | mutar muten V;POS;IMP;3;PL 32 | rebozar no reboce V;NEG;IMP;3;SG 33 | jabonar jabonarás V;IND;FUT;2;SG 34 | expedientar expedientes V;SBJV;PRS;2;SG 35 | endeudar endeudados V.PTCP;PST;MASC;PL 36 | esposar esposare V;SBJV;FUT;1;SG 37 | arrinconar arrinconamos V;IND;PRS;1;PL 38 | esquinar no esquinen V;NEG;IMP;3;PL 39 | reimpulsar reimpulsan V;IND;PRS;3;PL 40 | pisotear pisoteemos V;POS;IMP;1;PL 41 | anticipar no anticipen V;NEG;IMP;3;PL 42 | deshelar desheladas V.PTCP;PST;FEM;PL 43 | estriar estriarían V;COND;3;PL 44 | maniatar maniaten V;POS;IMP;3;PL 45 | indizar indicéis V;SBJV;PRS;2;PL 46 | ablandar ablandaban V;IND;PST;3;PL;IPFV 47 | obligar obligaren V;SBJV;FUT;3;PL 48 | sobar sobo V;IND;PRS;1;SG 49 | despeinarse os despeinaseis V;SBJV;PST;2;PL 50 | despenalizar despenalice V;SBJV;PRS;1;SG 51 | abonar abonaran V;SBJV;PST;3;PL;LGSPEC1 52 | resorber resorbías V;IND;PST;2;SG;IPFV 53 | fertilizar fertilice V;SBJV;PRS;3;SG 54 | destensar destensarías V;COND;2;SG 55 | desaprovechar desaprovechamos V;IND;PST;1;PL;PFV 56 | ficar ficasteis V;IND;PST;2;PL;PFV 57 | clavetear no clavetees V;NEG;IMP;2;SG 58 | fornicar fornico V;IND;PRS;1;SG 59 | valuar valuaren V;SBJV;FUT;3;PL 60 | situar situasen V;SBJV;PST;3;PL 61 | abonarse abónese V;POS;IMP;3;SG 62 | vandalizar no vandalicéis V;NEG;IMP;2;PL 63 | apasionar apasionases V;SBJV;PST;2;SG 64 | inteligir no intelijan V;NEG;IMP;3;PL 65 | sesionar sesionáis V;IND;PRS;2;PL 66 | transitar transitasteis V;IND;PST;2;PL;PFV 67 | pasterizar 
pasterizáramos V;SBJV;PST;1;PL;LGSPEC1 68 | realizar realizará V;IND;FUT;3;SG 69 | adelgazar adelgazaban V;IND;PST;3;PL;IPFV 70 | defenestrar defenestrar V;NFIN 71 | descocer descocido V.PTCP;PST;MASC;SG 72 | bifurcar bifurca V;POS;IMP;2;SG 73 | valer valga V;SBJV;PRS;1;SG 74 | intrigar intrigaban V;IND;PST;3;PL;IPFV 75 | externalizar externalizara V;SBJV;PST;3;SG;LGSPEC1 76 | efectivizar efectivizado V.PTCP;PST;MASC;SG 77 | noquear noqueare V;SBJV;FUT;3;SG 78 | vandalizar vandaliza V;IND;PRS;3;SG 79 | embarazar embarazaba V;IND;PST;3;SG;IPFV 80 | aturar aturarían V;COND;3;PL 81 | voltear volteaseis V;SBJV;PST;2;PL 82 | desunir desunimos V;IND;PST;1;PL;PFV 83 | pochar pochase V;SBJV;PST;3;SG 84 | fechar fecharía V;COND;1;SG 85 | lastimar no lastimen V;NEG;IMP;3;PL 86 | recalibrar no recalibréis V;NEG;IMP;2;PL 87 | franelear no franelees V;NEG;IMP;2;SG 88 | apostar apostada V.PTCP;PST;FEM;SG 89 | formatear formateen V;SBJV;PRS;3;PL 90 | ulcerar ulceremos V;SBJV;PRS;1;PL 91 | terraplenar terraplenó V;IND;PST;3;SG;PFV 92 | deslucir desluzcamos V;SBJV;PRS;1;PL 93 | robar robó V;IND;PST;3;SG;PFV 94 | pasteurizar pasteurizare V;SBJV;FUT;1;SG 95 | equivocarse nos equivocamos V;IND;PRS;1;PL 96 | documentar documentaran V;SBJV;PST;3;PL;LGSPEC1 97 | obstaculizar obstaculizarías V;COND;2;SG 98 | defecar defecara V;SBJV;PST;1;SG;LGSPEC1 99 | realimentar realimentaba V;IND;PST;3;SG;IPFV 100 | ripiar ripiados V.PTCP;PST;MASC;PL 101 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from random import shuffle 2 | import torch 3 | 4 | import data 5 | # import tokenizer 6 | 7 | 8 | def get_train_dataset(train_file_path, tokenizer, max_src_seq_len=30, max_tgt_seq_len=25): 9 | """ 10 | Reads input and output tokens from train set file, and converts tokens to tensors of ids using tokenizer. 11 | """ 12 | # Read input and output tokens from dataset 13 | inputs_tokens, outputs_tokens = data.read_train_file_tokens(train_file_path) 14 | # Pad with sos and eos 15 | inputs_tokens = tokenizer.add_sequence_symbols(inputs_tokens) 16 | outputs_tokens = tokenizer.add_sequence_symbols(outputs_tokens) 17 | # Split target into two targets, for teacher forcing 18 | # ------------- 19 | targets_tokens = [target_tokens[:-1] for target_tokens in outputs_tokens] 20 | # --------For Transformer model from SIGMORPHON 2020 Baseline---------- 21 | # targets_tokens = outputs_tokens 22 | # ------------- 23 | targets_y_tokens = [target_tokens[1:] for target_tokens in outputs_tokens] 24 | 25 | # Get lists of all input ids, target ids and target_y ids, where each sequence padded up to max length 26 | inputs_ids = [ 27 | tokenizer.convert_src_tokens_to_ids(tokenizer.pad_tokens_sequence(input_tokens, max_src_seq_len)) 28 | for input_tokens in inputs_tokens] 29 | targets_ids = [ 30 | tokenizer.convert_tgt_tokens_to_ids(tokenizer.pad_tokens_sequence(target_tokens, max_tgt_seq_len)) 31 | for target_tokens in targets_tokens] 32 | targets_y_ids = [ 33 | tokenizer.convert_tgt_tokens_to_ids(tokenizer.pad_tokens_sequence(target_y_tokens, max_tgt_seq_len)) 34 | for target_y_tokens in targets_y_tokens] 35 | 36 | return inputs_ids, targets_ids, targets_y_ids 37 | 38 | 39 | def get_valid_dataset(valid_file_path, tokenizer): 40 | """ 41 | Reads input and output tokens from valid set file, and converts tokens to tensors of ids using tokenizer. 
42 | """ 43 | # Read input and output tokens from dataset 44 | inputs_tokens, outputs_tokens = data.read_train_file_tokens(valid_file_path) 45 | # Pad with sos and eos 46 | inputs_tokens = tokenizer.add_sequence_symbols(inputs_tokens) 47 | outputs_tokens = tokenizer.add_sequence_symbols(outputs_tokens) 48 | # Get tensors of all input ids and output ids 49 | inputs_ids = tokenizer.get_id_tensors(inputs_tokens, "INPUT") 50 | outputs_ids = tokenizer.get_id_tensors(outputs_tokens, "OUTPUT") 51 | return inputs_ids, outputs_ids 52 | 53 | 54 | def get_test_dataset(test_file_path, tokenizer): 55 | """ 56 | Reads tokens from test set file, and converts tokens to tensors of ids using tokenizer. 57 | Returns the tokens as well, used for prediction. 58 | """ 59 | # Read input tokens from dataset 60 | inputs_tokens = data.read_test_file_tokens(test_file_path) 61 | # Pad with sos and eos 62 | inputs_tokens = tokenizer.add_sequence_symbols(inputs_tokens) 63 | # Get tensors of input ids 64 | inputs_ids = tokenizer.get_id_tensors(inputs_tokens, "INPUT") 65 | return inputs_ids, inputs_tokens 66 | 67 | def shuffle_together(list1, list2, list3): 68 | """Shuffles two lists together""" 69 | zip_list = list(zip(list1, list2, list3)) 70 | shuffle(zip_list) 71 | list1, list2, list3 = zip(*zip_list) 72 | return list1, list2, list3 73 | 74 | def split_to_batches(ids_list, device, batch_size=128): 75 | """ splits list of id sequence into batchs. 76 | Gets list of sequences (list of size seq_len) 77 | returns list of batchs, each batch is a tensor of size N x S (batch_size x seq_len)""" 78 | return [torch.tensor(ids_list[x:x + batch_size], dtype=torch.long, device=device) for x in 79 | range(0, len(ids_list), batch_size)] 80 | 81 | 82 | def get_batches(input_ids, target_ids, target_y_ids, device, batch_size=128): 83 | """ Gets entire dataset, shuffles the data, and splits it to batches. 
84 | Each batch is a tensor of size N x S (batch_size x seq_len).""" 85 | # Shuffle together 86 | shuffled_input_ids, shuffled_target_ids, shuffled_target_y_ids = shuffle_together(input_ids, target_ids, target_y_ids) 87 | # split to batches 88 | input_ids_batches = split_to_batches(shuffled_input_ids, device, batch_size) 89 | target_ids_batches = split_to_batches(shuffled_target_ids, device, batch_size) 90 | target_y_ids_batches = split_to_batches(shuffled_target_y_ids, device, batch_size) 91 | return input_ids_batches, target_ids_batches, target_y_ids_batches 92 | 93 | 94 | class DataLoader(object): 95 | """ Contains all utilities for reading train/valid/test sets """ 96 | def __init__(self, tokenizer, train_file_path=None, valid_file_path=None, test_file_path=None, 97 | device="cpu", batch_size=128, max_src_seq_len=30, max_tgt_seq_len=25): 98 | self.tokenizer = tokenizer 99 | self.device = device 100 | self.batch_size = batch_size 101 | self.max_src_seq_len = max_src_seq_len 102 | self.max_tgt_seq_len = max_tgt_seq_len 103 | # Read train file and get train set 104 | if train_file_path is not None: 105 | train_input_ids, train_target_ids, train_target_y_ids = get_train_dataset(train_file_path, tokenizer, 106 | self.max_src_seq_len, self.max_tgt_seq_len) 107 | self.train_input_ids = train_input_ids 108 | self.train_target_ids = train_target_ids 109 | self.train_target_y_ids = train_target_y_ids 110 | self.train_set_size = len(self.train_input_ids) 111 | else: 112 | self.train_input_ids = None 113 | self.train_target_ids = None 114 | self.train_target_y_ids = None 115 | self.train_set_size = 0 116 | 117 | if valid_file_path is not None: 118 | # Read validation file and get validation set, for checking loss using teacher forcing 119 | valid_input_ids_tf, valid_target_ids_tf, valid_target_y_ids_tf = get_train_dataset(valid_file_path, tokenizer, 120 | self.max_src_seq_len, self.max_tgt_seq_len) 121 | self.valid_input_ids_tf = valid_input_ids_tf 122 | self.valid_target_ids_tf = valid_target_ids_tf 123 | self.valid_target_y_ids_tf = valid_target_y_ids_tf 124 | # Read validation file and get validation set, for evaluation 125 | valid_input_ids, valid_target_ids = get_valid_dataset(valid_file_path, tokenizer) 126 | self.valid_input_ids = valid_input_ids 127 | self.valid_target_ids = valid_target_ids 128 | else: 129 | self.valid_input_ids_tf = None 130 | self.valid_target_ids_tf = None 131 | self.valid_target_y_ids_tf = None 132 | self.valid_input_ids = None 133 | self.valid_target_ids = None 134 | 135 | if test_file_path is not None: 136 | # Read test file and get test set 137 | test_input_ids = get_test_dataset(test_file_path, tokenizer) 138 | self.test_input_ids = test_input_ids 139 | else: 140 | self.test_input_ids = None 141 | 142 | def get_train_set(self): 143 | return get_batches(self.train_input_ids, self.train_target_ids, self.train_target_y_ids, 144 | self.device, batch_size=self.batch_size) 145 | 146 | def get_validation_set_tf(self): 147 | return get_batches(self.valid_input_ids_tf, self.valid_target_ids_tf, self.valid_target_y_ids_tf, 148 | self.device, batch_size=self.batch_size) 149 | 150 | def get_validation_set(self): 151 | return self.valid_input_ids, self.valid_target_ids 152 | 153 | def get_validation_set_len(self): 154 | return len(self.valid_input_ids) 155 | 156 | def get_test_set(self): 157 | return self.test_input_ids 158 | 159 | def get_test_set_len(self): 160 | return len(self.test_input_ids) 161 | 162 | def get_padding_mask(self, batch_tensor): 163 | """" Returns padding 
masks for given batch 164 | Padding masks are ByteTensor where True values are positions that are masked and False values are not. 165 | inputs are of size N x S (batch_size x seq_len) 166 | Returns masks of same size - N x S (batch_size x seq_len) """ 167 | return batch_tensor == self.tokenizer.pad_id 168 | 169 | def get_padding_masks(self, source_batch, target_batch): 170 | """" Returns padding masks for source batch, memory batch, and target batch. """ 171 | src_padding_mask = self.get_padding_mask(source_batch) 172 | mem_padding_mask = src_padding_mask 173 | target_padding_mask = self.get_padding_mask(target_batch) 174 | return src_padding_mask, mem_padding_mask, target_padding_mask 175 | -------------------------------------------------------------------------------- /decoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn.modules import TransformerDecoderLayer 6 | from torch.nn.modules.activation import MultiheadAttention 7 | from torch.nn import Linear, Dropout 8 | from torch.nn import LayerNorm 9 | from torch.nn.modules.transformer import _get_activation_fn, _get_clones 10 | 11 | 12 | class TransformerDecoder(nn.Module): 13 | r"""TransformerDecoder is a stack of N decoder layers 14 | 15 | Args: 16 | decoder_layer: an instance of the TransformerDecoderLayer() class (required). 17 | decoder_final_layer: an instance of the TransformerDecoderLayer() class (required). 18 | num_layers: the number of sub-decoder-layers in the decoder (required). 19 | norm: the layer normalization component (optional). 20 | 21 | Examples:: 22 | decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) 23 | transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) 24 | memory = torch.rand(10, 32, 512) 25 | tgt = torch.rand(20, 32, 512) 26 | out = transformer_decoder(tgt, memory) 27 | """ 28 | 29 | def __init__(self, decoder_layer, decoder_final_layer, num_layers, norm=None): 30 | super(TransformerDecoder, self).__init__() 31 | self.layers = _get_clones(decoder_layer, num_layers - 1) 32 | self.final_layer = decoder_final_layer 33 | self.num_layers = num_layers 34 | self.norm = norm 35 | 36 | def forward(self, tgt, memory, tgt_mask=None, 37 | memory_mask=None, tgt_key_padding_mask=None, 38 | memory_key_padding_mask=None): 39 | r"""Pass the inputs (and mask) through the decoder layer in turn. 40 | 41 | Args: 42 | tgt: the sequence to the decoder (required). 43 | memory: the sequnce from the last layer of the encoder (required). 44 | tgt_mask: the mask for the tgt sequence (optional). 45 | memory_mask: the mask for the memory sequence (optional). 46 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional). 47 | memory_key_padding_mask: the mask for the memory keys per batch (optional). 48 | 49 | Shape: 50 | see the docs in Transformer class. 
51 | """ 52 | output = tgt 53 | # Run through "normal" decoder layer 54 | for i in range(self.num_layers - 1): 55 | output = self.layers[i](output, memory, tgt_mask=tgt_mask, 56 | memory_mask=memory_mask, 57 | tgt_key_padding_mask=tgt_key_padding_mask, 58 | memory_key_padding_mask=memory_key_padding_mask) 59 | # Run through final decoder layer, which outputs the attention weights as well 60 | output, attention_weights = self.final_layer(output, memory, tgt_mask=tgt_mask, 61 | memory_mask=memory_mask, 62 | tgt_key_padding_mask=tgt_key_padding_mask, 63 | memory_key_padding_mask=memory_key_padding_mask) 64 | 65 | if self.norm: 66 | output = self.norm(output) 67 | 68 | return output, attention_weights 69 | 70 | class TransformerDecoderFinalLayer(nn.Module): 71 | r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. 72 | Layer also output attention weights from the multi-head-attn, used for pointer-generator model. 73 | Args: 74 | d_model: the number of expected features in the input (required). 75 | nhead: the number of heads in the multiheadattention models (required). 76 | dim_feedforward: the dimension of the feedforward network model (default=2048). 77 | dropout: the dropout value (default=0.1). 78 | activation: the activation function of intermediate layer, relu or gelu (default=relu). 79 | 80 | Examples:: 81 | decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) 82 | memory = torch.rand(10, 32, 512) 83 | tgt = torch.rand(20, 32, 512) 84 | out, attention = decoder_layer(tgt, memory) 85 | """ 86 | 87 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): 88 | super(TransformerDecoderFinalLayer, self).__init__() 89 | self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) 90 | self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) 91 | # Implementation of Feedforward model 92 | self.linear1 = Linear(d_model, dim_feedforward) 93 | self.dropout = Dropout(dropout) 94 | self.linear2 = Linear(dim_feedforward, d_model) 95 | 96 | self.norm1 = LayerNorm(d_model) 97 | self.norm2 = LayerNorm(d_model) 98 | self.norm3 = LayerNorm(d_model) 99 | self.dropout1 = Dropout(dropout) 100 | self.dropout2 = Dropout(dropout) 101 | self.dropout3 = Dropout(dropout) 102 | 103 | self.activation = _get_activation_fn(activation) 104 | 105 | def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, 106 | tgt_key_padding_mask=None, memory_key_padding_mask=None): 107 | r"""Pass the inputs (and mask) through the decoder layer. 108 | 109 | Args: 110 | tgt: the sequence to the decoder layer (required). 111 | memory: the sequnce from the last layer of the encoder (required). 112 | tgt_mask: the mask for the tgt sequence (optional). 113 | memory_mask: the mask for the memory sequence (optional). 114 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional). 115 | memory_key_padding_mask: the mask for the memory keys per batch (optional). 116 | 117 | Shape: 118 | see the docs in Transformer class. 
119 | """ 120 | tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask, 121 | key_padding_mask=tgt_key_padding_mask)[0] 122 | tgt = tgt + self.dropout1(tgt2) 123 | tgt = self.norm1(tgt) 124 | # Model saves attention weights from multi-head-attn 125 | tgt2, attention_weights = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask, 126 | key_padding_mask=memory_key_padding_mask) 127 | tgt = tgt + self.dropout2(tgt2) 128 | tgt = self.norm2(tgt) 129 | # for backward compatibility 130 | tgt2 = self.linear2(self.dropout(F.relu(self.linear1(tgt)))) 131 | tgt = tgt + self.dropout3(tgt2) 132 | tgt = self.norm3(tgt) 133 | return tgt, attention_weights -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import data 3 | 4 | # Arguments 5 | import utils 6 | 7 | parser = argparse.ArgumentParser(description='Computing accuracy of model predictions compare to target file') 8 | parser.add_argument('--pred', type=str, default='data/pred', 9 | help="File with model predictions (must include folder path)") 10 | parser.add_argument('--target', type=str, default='target', 11 | help="File with gold targets (must include folder path)") 12 | args = parser.parse_args() 13 | 14 | # Log all relevant files 15 | logger = utils.get_logger() 16 | logger.info(f"Target file: {args.target}") 17 | logger.info(f"Prediction file: {args.pred}") 18 | 19 | 20 | """ FUNCTIONS """ 21 | def accuracy(predictions, targets): 22 | """Return fraction of matches between two lists sequentially.""" 23 | correct_count = 0 24 | for prediction, target in zip(predictions, targets): 25 | if prediction == target: 26 | correct_count += 1 27 | return float(100 * correct_count) / len(predictions) 28 | 29 | def evaluate_predictions(pred_file, target_file): 30 | """Compute prediction. words NOT cleaned""" 31 | pred_lines = data.read_morph_file(pred_file) 32 | target_lines = data.read_morph_file(target_file) 33 | predictions = [line[1] for line in pred_lines] 34 | truth = [line[1] for line in target_lines] 35 | total_accuracy = accuracy(predictions, truth) 36 | logger.info(f"Test set. accuracy: {total_accuracy:.2f}\n") 37 | return total_accuracy 38 | 39 | 40 | if __name__ == '__main__': 41 | # Compute accuracy of predictions compare to truth 42 | evaluate_predictions(args.pred, args.target) 43 | 44 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | import dataset 8 | import data 9 | import tokenizer 10 | 11 | # Arguments 12 | import transformer 13 | import utils 14 | 15 | parser = argparse.ArgumentParser(description='Evaluating the transformer over test and validation sets') 16 | parser.add_argument('--model-checkpoint', type=str, default='checkpoints/model_best.pth', 17 | help="the model file to be evaluated. 
Usually is of the form model_X.pth (must include folder path)") 18 | parser.add_argument('--arch', type=str, default='transformer', 19 | help="Architecture type for model: transformer, pointer_generator") 20 | parser.add_argument('--embed-dim', type=int, default=128, 21 | help='Embedding dimension (default: 128)') 22 | parser.add_argument('--fcn-dim', type=int, default=256, 23 | help='Fully-connected network hidden dimension (default: 256)') 24 | parser.add_argument('--num-heads', type=int, default=4, 25 | help='number of attention heads (default: 4)') 26 | parser.add_argument('--num-layers', type=int, default=2, 27 | help='number of layers in encoder and decoder (default: 2)') 28 | parser.add_argument('--dropout', type=float, default=0.2, 29 | help='Dropout probability (default: 0.2)') 30 | parser.add_argument('--test', type=str, default='data', 31 | help="Test file of the dataset (must include folder path)") 32 | parser.add_argument('--vocab', type=str, default='data', 33 | help="Base name of vocabulary files (must include folder path)") 34 | parser.add_argument('--pred', type=str, default='pred', 35 | help="Name of output file containing predictions of the test set (must include folder path)") 36 | args = parser.parse_args() 37 | 38 | """ FILES AND TOKENIZER """ 39 | # Get test and out file path 40 | test_file = args.test 41 | out_file = args.pred 42 | # Get vocabulary paths 43 | src_vocab_file = args.vocab + "-input" 44 | tgt_vocab_file = args.vocab + "-output" 45 | 46 | # Log all relevant files 47 | logger = utils.get_logger() 48 | logger.info(f"Model checkpoint: {args.model_checkpoint}") 49 | logger.info(f"Test file: {test_file}") 50 | logger.info(f"Input vocabulary file: {src_vocab_file}") 51 | logger.info(f"Output vocabulary file: {tgt_vocab_file}") 52 | logger.info(f"Prediction file: {out_file}") 53 | 54 | """ CONSTANTS """ 55 | MAX_SRC_SEQ_LEN = 45 56 | MAX_TGT_SEQ_LEN = 45 57 | 58 | """ MODEL AND DATA LOADER """ 59 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 60 | # Initialize Tokenizer object with input and output vocabulary files 61 | myTokenizer = tokenizer.Tokenizer(src_vocab_file, tgt_vocab_file, device=device) 62 | # Load model from checkpoint in evaluation mode 63 | model = utils.build_model(args.arch, myTokenizer.src_vocab_size, myTokenizer.tgt_vocab_size, args.embed_dim, args.fcn_dim, 64 | args.num_heads, args.num_layers, args.dropout, myTokenizer.src_to_tgt_vocab_conversion_matrix) 65 | model = utils.load_model(model, args.model_checkpoint, logger) 66 | model.to(device) 67 | model.eval() 68 | # Initialize DataLoader object 69 | data_loader = dataset.DataLoader(myTokenizer, train_file_path=None, valid_file_path=None, 70 | test_file_path=test_file, device=device) 71 | 72 | """ FUNCTIONS """ 73 | 74 | 75 | def prdeict_word(src, max_seq_len): 76 | # Add batch dimension 77 | src = src.unsqueeze(dim=0) 78 | src_key_padding_mask = data_loader.get_padding_mask(src) 79 | memory = model.encode(src, src_key_padding_mask) 80 | outputs = torch.zeros(1, max_seq_len, dtype=torch.long, device=device) 81 | outputs[0] = myTokenizer.sos_id 82 | for j in range(1, max_seq_len): 83 | # Compute output of model 84 | tgt_key_padding_mask = data_loader.get_padding_mask(outputs[:, :j]) 85 | out = model.decode(memory, outputs[:, :j], tgt_key_padding_mask, src_key_padding_mask).squeeze() if \ 86 | (model.__class__.__name__ == "Transformer") else \ 87 | model.decode(memory, outputs[:, :j], src, tgt_key_padding_mask, src_key_padding_mask).squeeze() 88 | val, ix = 
out.topk(1) 89 | outputs[0, j] = ix[-1] 90 | if ix[-1] == myTokenizer.eos_id: 91 | break 92 | # Strip off sos and eos tokens 93 | return outputs[0, 1:j] 94 | 95 | 96 | # --------For Transformer model from SIGMORPHON 2020 Baseline---------- 97 | def dummy_mask(seq): 98 | ''' 99 | create dummy mask (all 1) 100 | ''' 101 | if isinstance(seq, tuple): 102 | seq = seq[0] 103 | assert len(seq.size()) == 1 or (len(seq.size()) == 2 and seq.size(1) == 1) 104 | return torch.ones_like(seq, dtype=torch.float) 105 | 106 | 107 | def decode_greedy_transformer(src_sentence, max_len=40, trg_bos=myTokenizer.sos_id, trg_eos=myTokenizer.eos_id): 108 | ''' 109 | src_sentence: [seq_len, 1] 110 | ''' 111 | model.eval() 112 | src_mask = dummy_mask(src_sentence) 113 | src_mask = (src_mask == 0).transpose(0, 1) 114 | enc_hs = model.encode(src_sentence, src_mask) 115 | 116 | output, attns = [trg_bos], [] 117 | 118 | for _ in range(max_len): 119 | output_tensor = torch.tensor(output, device=device).view(len(output), 1) 120 | trg_mask = dummy_mask(output_tensor) 121 | trg_mask = (trg_mask == 0).transpose(0, 1) 122 | 123 | word_logprob = model.decode(enc_hs, src_mask, output_tensor, trg_mask) 124 | word_logprob = word_logprob[-1] 125 | 126 | word = torch.max(word_logprob, dim=1)[1] 127 | if word == trg_eos: 128 | break 129 | output.append(word.item()) 130 | return output[1:] # , attns 131 | # ------------------------------------ 132 | 133 | 134 | def write_predictions_to_file(predictions, test_file_path, out_file_path): 135 | utils.maybe_mkdir(out_file_path) 136 | # Get original input from test file 137 | lemmas, features = data.read_test_file(test_file_path) 138 | # Write all data with predictions to the out file 139 | data.write_morph_file(lemmas, predictions, features, out_file_path) 140 | 141 | 142 | def generate_prediction_file(max_seq_len=MAX_TGT_SEQ_LEN): 143 | """ Generates predictions over the test set and prints output to prediction file.""" 144 | input_ids, input_tokens = data_loader.get_test_set() 145 | predictions = [] 146 | # Go over each example 147 | for i, (data, data_tokens) in tqdm(enumerate(zip(input_ids, input_tokens))): 148 | unkown_tokens = [token for token in data_tokens if token not in myTokenizer.src_vocab] 149 | # Get prediction from model 150 | # ------------------ 151 | pred = prdeict_word(data, max_seq_len) 152 | # Convert from predicted ids to the predicted word 153 | pred_tokens = myTokenizer.convert_tgt_ids_to_tokens(pred.tolist()) 154 | # pred = decode_greedy_transformer(data.unsqueeze(dim=0).transpose(0, 1), max_seq_len) 155 | # pred_tokens = myTokenizer.convert_tgt_ids_to_tokens(list(pred)) 156 | # ------------------ 157 | 158 | # where token is unkown token, copy from the source at the same token location 159 | unkown_idx = 0 160 | for j in range(len(pred_tokens)): 161 | if pred_tokens[j] == myTokenizer.unk and (j < len(data_tokens) - 1): 162 | pred_tokens[j] = data_tokens[j + 1] # account for data token padded with at the beginning 163 | # if pred_tokens[j] == myTokenizer.unk: 164 | # pred_tokens[j] = unkown_tokens[unkown_idx] 165 | # # Increment index, until reaches the end, then stay 166 | # unkown_tokens = min(unkown_tokens + 1, len(unkown_tokens) - 1) 167 | 168 | pred_word = ''.join(pred_tokens) 169 | predictions.append(pred_word) 170 | write_predictions_to_file(predictions, test_file, out_file) 171 | 172 | 173 | if __name__ == '__main__': 174 | # Generate predictions for test set 175 | generate_prediction_file() 176 | logger.info(f"Created prediction file: {out_file}\n") 177 | 
178 | 179 | # def prdeict_word(src, max_seq_len): 180 | # """ 181 | # Predicts target word given source (lemma + features). Predictions generated in greedy manner. 182 | # """ 183 | # # Add batch dimension 184 | # src = src.unsqueeze(dim=0) 185 | # src = src.transpose(0, 1) 186 | # outputs = torch.zeros(1, max_seq_len, dtype=torch.long, device=device) 187 | # outputs[0] = myTokenizer.sos_id 188 | # for j in range(1, max_seq_len): 189 | # trg = outputs[:, :j] 190 | # trg = trg.transpose(0, 1) 191 | # src_mask = (src > 0).float() 192 | # trg_mask = (trg > 0).float() 193 | # # Compute output of model 194 | # out = model(src, src_mask, trg, trg_mask).transpose(0, 1).squeeze() 195 | # val, ix = out.topk(1) 196 | # outputs[0, j] = ix[-1] 197 | # if ix[-1] == myTokenizer.eos_id: 198 | # break 199 | # return outputs[0, :j + 1] 200 | # 201 | -------------------------------------------------------------------------------- /hyperparameter_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import statistics 3 | 4 | TRAIN = "train" 5 | DEV = "dev" 6 | # TEST = "test" 7 | TEST = "dev"# For now, use dev set as test 8 | LANGUAGES = ["english", "french", "irish", "italian", "spanish"] 9 | RESOURCES = ["low"]#"medium"] 10 | MODEL_TYPE = ["transformer"]#, "pointer_generator"] 11 | EPOCHS_PER_RESOURCE = {"low": 800, "medium": 400} 12 | BATCH_SIZE_PER_RESOURCE = {"low": 64, "medium": 128} 13 | EVAL_EVERY = 25 14 | 15 | EMBEDDING_DIMS = [64, 128, 256] 16 | FCN_HIDDEN_DIMS = [64, 128, 256] 17 | NUM_HEADS = [4, 8] 18 | NUM_LAYERS = [2, 3] 19 | # DROPOUT = [0.2, 0.3] 20 | for model in MODEL_TYPE: 21 | for embed_dim in EMBEDDING_DIMS: 22 | for fcn_dim in FCN_HIDDEN_DIMS: 23 | for num_heads in NUM_HEADS: 24 | for num_layers in NUM_LAYERS: 25 | # skip these 26 | if (embed_dim, fcn_dim) == (64, 64): 27 | continue 28 | print(f"embed_dim: {embed_dim}, fcn_dim: {fcn_dim}, num-heads: {num_heads}, num-layers: {num_layers}") 29 | accuracies = [] 30 | hyper_folder = f"embed_dim-{embed_dim}-fcn_dim-{fcn_dim}-heads-{num_heads}-layers-{num_layers}" 31 | for resource in RESOURCES: 32 | for language in LANGUAGES: 33 | print(f"{resource} - {language}") 34 | # Get epoch and batch size 35 | epochs = EPOCHS_PER_RESOURCE[resource] 36 | batch_size = BATCH_SIZE_PER_RESOURCE[resource] 37 | # Set names of relevant files and directories 38 | train_file = f"{language}-{TRAIN}-{resource}" 39 | valid_file = f"{language}-{DEV}" 40 | test_file = f"{language}-{TEST}" 41 | covered_test_file = f"{language}-covered-{TEST}" 42 | pred_file = f"{language}-{resource}-{TEST}-pred" 43 | vocab_file = f"{train_file}-vocab" 44 | 45 | data_folder = "data" 46 | vocab_folder = f"vocab/{language}/{resource}" 47 | checkpoints_folder = f"model-checkpoints-test/{model}/{hyper_folder}/{language}/{resource}" 48 | pred_folder = f"predictions-test/{model}/{hyper_folder}" 49 | logs_folder = f"logs-test/{hyper_folder}" 50 | 51 | # create necessary folders, if they do not exist already 52 | if not os.path.exists(vocab_folder): 53 | os.makedirs(vocab_folder) 54 | if not os.path.exists(checkpoints_folder): 55 | os.makedirs(checkpoints_folder) 56 | if not os.path.exists(pred_folder): 57 | os.makedirs(pred_folder) 58 | if not os.path.exists(logs_folder): 59 | os.makedirs(logs_folder) 60 | 61 | # Create vocabulary 62 | # print(f"python vocabulary.py --src {data_folder}/{train_file} --vocab {data_folder}/{vocab_file}") 63 | # Train model 64 | os.system(f"python train.py --arch {model} --epochs {epochs} --batch-size 
{batch_size} --eval-every {EVAL_EVERY} " + 65 | f"--embed-dim {embed_dim} --fcn-dim {fcn_dim} --num-heads {num_heads} --num-layers {num_layers} " + 66 | f"--train {data_folder}/{train_file} --dev {data_folder}/{valid_file} " + 67 | f"--vocab {vocab_folder}/{vocab_file} --checkpoints-folder {checkpoints_folder}" + 68 | f" > {logs_folder}/train-log-{model}-{resource}-{language}.out") 69 | # Generate predictions for test set 70 | os.system(f"python generate.py --model-checkpoint {checkpoints_folder}/model_best.pth " + 71 | f"--test {data_folder}/{covered_test_file} --vocab {vocab_folder}/{vocab_file} " + 72 | f"--pred {pred_folder}/{pred_file}") 73 | # Evaluate accuracy of prediction file compared to true test set 74 | accuracies.append(os.system(f"python evaluate.py --pred {pred_folder}/{pred_file} " + 75 | f"--target {data_folder}/{test_file}")) 76 | print(f"average accuracy: {statistics.mean(accuracies):.4f}") -------------------------------------------------------------------------------- /model_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | """ MASKS UTILS """ 6 | def _generate_subsequent_mask(src_sz, tgt_sz): 7 | mask = (torch.triu(torch.ones(src_sz, tgt_sz)) == 1).transpose(0, 1) 8 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) 9 | return mask 10 | 11 | 12 | def _generate_square_subsequent_mask(sz): 13 | return _generate_subsequent_mask(sz, sz) 14 | 15 | 16 | """ EMBEDDING UTILS """ 17 | def Embedding(num_embeddings, embedding_dim, padding_idx): 18 | """ Generates embeddings for tokens in vocabulary 19 | Weights initialized with mean=0 and std=sqrt(embedding_dim)""" 20 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 21 | nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) 22 | nn.init.constant_(m.weight[padding_idx], 0) 23 | return m 24 | 25 | 26 | """ POSITIONAL ENCODING UTILS """ 27 | class PositionalEncoding(nn.Module): 28 | """ Adds positional encoding to sequences """ 29 | def __init__(self, embedding_dim, dropout=0.1, max_seq_len=100): 30 | """ Initializes a seq_len x 1 x embedding_dim positional encoding matrix""" 31 | super(PositionalEncoding, self).__init__() 32 | self.dropout = nn.Dropout(p=dropout) 33 | 34 | pe = torch.zeros(max_seq_len, embedding_dim) 35 | position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1) 36 | div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim)) 37 | pe[:, 0::2] = torch.sin(position * div_term) 38 | pe[:, 1::2] = torch.cos(position * div_term) 39 | pe = pe.unsqueeze(0).transpose(0, 1) 40 | self.register_buffer('pe', pe) 41 | 42 | def forward(self, x): 43 | """ Adds positional encoding to the input. 44 | Input of dimensions (seq_len x batch_sz x embedding_dim). 
45 | Adds positional encoding matrix (seq_len x 1 x embedding_dim) to every individual example in batch """ 46 | x = x + self.pe[:x.size(0), :] 47 | return self.dropout(x) -------------------------------------------------------------------------------- /pointer_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.init import xavier_uniform_ 4 | 5 | from model_utils import PositionalEncoding, _generate_square_subsequent_mask, Embedding 6 | from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer 7 | from decoder import TransformerDecoder, TransformerDecoderFinalLayer 8 | 9 | 10 | class PointerGeneratorTransformer(nn.Module): 11 | def __init__(self, src_vocab_size=128, tgt_vocab_size=128, 12 | embedding_dim=128, fcn_hidden_dim=128, 13 | num_heads=4, num_layers=2, dropout=0.2, 14 | src_to_tgt_vocab_conversion_matrix=None): 15 | super(PointerGeneratorTransformer, self).__init__() 16 | 17 | self.src_vocab_size = src_vocab_size 18 | self.tgt_vocab_size = tgt_vocab_size 19 | self.embedding_dim = embedding_dim 20 | self.src_to_tgt_vocab_conversion_matrix = src_to_tgt_vocab_conversion_matrix 21 | self.pos_encoder = PositionalEncoding(embedding_dim) 22 | # Source and target embeddings 23 | self.src_embed = Embedding(self.src_vocab_size, embedding_dim, padding_idx=2) 24 | self.tgt_embed = Embedding(self.tgt_vocab_size, embedding_dim, padding_idx=2) 25 | 26 | # Encoder layers 27 | self.encoder_layer = TransformerEncoderLayer(embedding_dim, num_heads, fcn_hidden_dim, dropout) 28 | self.encoder = TransformerEncoder(self.encoder_layer, num_layers) 29 | 30 | # Decoder layers 31 | self.decoder_layer = TransformerDecoderLayer(embedding_dim, num_heads, fcn_hidden_dim, dropout) 32 | self.decoder_final_layer = TransformerDecoderFinalLayer(embedding_dim, num_heads, fcn_hidden_dim, dropout) 33 | self.decoder = TransformerDecoder(self.decoder_layer, self.decoder_final_layer, num_layers) 34 | 35 | # Final linear layer + softmax. for probability over target vocabulary 36 | self.p_vocab = nn.Sequential( 37 | nn.Linear(self.embedding_dim, self.tgt_vocab_size), 38 | nn.Softmax(dim=-1)) 39 | 40 | # P_gen, probability of generating output 41 | self.p_gen = nn.Sequential( 42 | nn.Linear(self.embedding_dim * 3, 1), 43 | nn.Sigmoid()) 44 | # Context vector 45 | self.c_t = None 46 | 47 | # Initialize masks 48 | self.src_mask = None 49 | self.tgt_mask = None 50 | self.mem_mask = None 51 | # Initialize weights of model 52 | self._reset_parameters() 53 | 54 | def _reset_parameters(self): 55 | """ Initiate parameters in the transformer model. 
""" 56 | for p in self.parameters(): 57 | if p.dim() > 1: 58 | xavier_uniform_(p) 59 | 60 | 61 | def encode(self, src, src_key_padding_mask=None): 62 | """ 63 | Applies embedding, positional encoding and then runs the transformer encoder on the source 64 | :param src: source tokens batch 65 | :param src_key_padding_mask: source padding mask 66 | :return: memory- the encoder hidden states 67 | """ 68 | # Source embedding and positional encoding, changes dimension (N, S) -> (N, S, E) -> (S, N, E) 69 | src_embed = self.src_embed(src).transpose(0, 1) 70 | src_embed = self.pos_encoder(src_embed) 71 | # Pass the source to the encoder 72 | memory = self.encoder(src_embed, mask=self.src_mask, src_key_padding_mask=src_key_padding_mask) 73 | return memory 74 | 75 | def decode(self, memory, tgt, src, tgt_key_padding_mask=None, memory_key_padding_mask=None, has_mask=True): 76 | """ 77 | Applies embedding, positional encoding on target and then runs the transformer encoder on the memory and target. 78 | Also creates square subsequent mask for teacher learning. 79 | :param memory: The encoder hidden states 80 | :param tgt: Target tokens batch 81 | :param tgt_key_padding_mask: target padding mask 82 | :param memory_key_padding_mask: memory padding mask 83 | :param has_mask: Whether to use square subsequent mask for teacher learning 84 | :return: decoder output 85 | """ 86 | # Create target mask for transformer if no appropriate one was created yet, created of size (T, T) 87 | if has_mask: 88 | if self.tgt_mask is None or self.tgt_mask.size(0) != tgt.size(1): 89 | self.tgt_mask = _generate_square_subsequent_mask(tgt.size(1)).to(tgt.device) 90 | else: 91 | self.tgt_mask = None 92 | # Target embedding and positional encoding, changes dimension (N, T) -> (N, T, E) -> (T, N, E) 93 | tgt_embed = self.tgt_embed(tgt).transpose(0, 1) 94 | tgt_embed_pos = self.pos_encoder(tgt_embed) 95 | # Get output of decoder and attention weights. decoder Dimensions stay the same 96 | decoder_output, attention = self.decoder(tgt_embed_pos, memory, tgt_mask=self.tgt_mask, 97 | memory_mask=self.mem_mask, 98 | tgt_key_padding_mask=tgt_key_padding_mask, 99 | memory_key_padding_mask=memory_key_padding_mask) 100 | # Get probability over target vocabulary, (T, N, E) -> (T, N, tgt_vocab_size) 101 | p_vocab = self.p_vocab(decoder_output) 102 | 103 | # ---Compute Pointer Generator probability--- 104 | # Get hidden states of source (easier/more understandable computation). (S, N, E) -> (N, S, E) 105 | hidden_states = memory.transpose(0, 1) 106 | # compute context vectors. (N, T, S) x (N, S, E) -> (N, T, E) 107 | context_vectors = torch.matmul(attention, hidden_states).transpose(0, 1) 108 | total_states = torch.cat((context_vectors, decoder_output, tgt_embed), dim=-1) 109 | # Get probability of generating output. (N, T, 3*E) -> (N, T, 1) 110 | p_gen = self.p_gen(total_states) 111 | # Get probability of copying from input. 
(N, T, 1) 112 | p_copy = 1 - p_gen 113 | 114 | # Get representation of src tokens as one hot encoding 115 | one_hot = torch.zeros(src.size(0), src.size(1), self.src_vocab_size, device=src.device) 116 | one_hot = one_hot.scatter_(dim=-1, index=src.unsqueeze(-1), value=1) 117 | # p_copy from source is sum over all attention weights for each token in source 118 | p_copy_src_vocab = torch.matmul(attention, one_hot) 119 | # convert representation of token from src vocab to tgt vocab 120 | p_copy_tgt_vocab = torch.matmul(p_copy_src_vocab, self.src_to_tgt_vocab_conversion_matrix).transpose(0, 121 | 1) 122 | # Compute final probability 123 | p = torch.add(p_vocab * p_gen, p_copy_tgt_vocab * p_copy) 124 | 125 | # Change back batch and sequence dimensions, from (T, N, tgt_vocab_size) -> (N, T, tgt_vocab_size) 126 | return torch.log(p.transpose(0, 1)) 127 | 128 | def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, 129 | memory_key_padding_mask=None, has_mask=True): 130 | """Take in and process masked source/target sequences. 131 | 132 | Args: 133 | src: the sequence to the encoder (required). 134 | tgt: the sequence to the decoder (required). 135 | src_mask: the additive mask for the src sequence (optional). 136 | tgt_mask: the additive mask for the tgt sequence (optional). 137 | memory_mask: the additive mask for the encoder output (optional). 138 | src_key_padding_mask: the ByteTensor mask for src keys per batch (optional). 139 | tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional). 140 | memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional). 141 | 142 | Shape: 143 | - src: :math:`(S, N, E)`. Starts as (N, S) and changed after embedding 144 | - tgt: :math:`(T, N, E)`. Starts as (N, T) and changed after embedding 145 | - src_mask: :math:`(S, S)`. 146 | - tgt_mask: :math:`(T, T)`. 147 | - memory_mask: :math:`(T, S)`. 148 | - src_key_padding_mask: :math:`(N, S)`. 149 | - tgt_key_padding_mask: :math:`(N, T)`. 150 | - memory_key_padding_mask: :math:`(N, S)`. 151 | 152 | Note: [src/tgt/memory]_mask should be filled with 153 | float('-inf') for the masked positions and float(0.0) else. These masks 154 | ensure that predictions for position i depend only on the unmasked positions 155 | j and are applied identically for each sequence in a batch. 156 | [src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions 157 | that should be masked with float('-inf') and False values will be unchanged. 158 | This mask ensures that no information will be taken from position i if 159 | it is masked, and has a separate mask for each sequence in a batch. 160 | 161 | - output: :math:`(T, N, E)`. 162 | 163 | Note: Due to the multi-head attention architecture in the transformer model, 164 | the output sequence length of a transformer is same as the input sequence 165 | (i.e. target) length of the decode. 166 | 167 | where S is the source sequence length, T is the target sequence length, N is the 168 | batch size, E is the feature number 169 | 170 | Examples: 171 | output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask) 172 | """ 173 | 174 | # Applies embedding, positional encoding and the transformer encoder on the source 175 | memory = self.encode(src, src_key_padding_mask) 176 | # Applies embedding, positional encoding on target and then runs the transformer encoder on the memory and target. 
177 | output = self.decode(memory, tgt, src, tgt_key_padding_mask, memory_key_padding_mask, has_mask) 178 | return output 179 | 180 | 181 | 182 | 183 | # def get_context_vectors_1(self, hidden_states, attention, N, T): 184 | # """ compute context vectors using hidden states and attention over the source """ 185 | # # Replace source and embedding dimension 186 | # hidden_states = hidden_states.transpose(1, 2) 187 | # context_vectors = torch.zeros(N, T, self.embedding_dim).type(torch.float32) 188 | # # Get context vector 189 | # for i in range(N): # go over each data sample i in batch 190 | # h_t_i = hidden_states[i] # all hidden_states - E x S 191 | # for j in range(T): # go over each target token j 192 | # attn_i_j = attention[i][j] # attention over source for target token j - S 193 | # context_vectors[i][j] = torch.mv(h_t_i, attn_i_j) 194 | # context_vectors = context_vectors.transpose(0, 1) 195 | # return context_vectors.type(torch.float32) 196 | 197 | # one_hot_cat = torch.cat(T * [one_hot]).view(N, T, S, self.src_vocab_size) 198 | # one_hot_cat = one_hot.unsqueeze(1).repeat(1, T, 1, 1) 199 | 200 | # def one_hot_encoding(src, src_vocab_size): 201 | # one_hot = torch.zeros(src.size(0), src.size(1), src_vocab_size) 202 | # src_ext = src.unsqueeze_(-1) 203 | # one_hot = one_hot.scatter_(dim=-1, index=src_ext, value=1) 204 | # return one_hot 205 | -------------------------------------------------------------------------------- /run_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | TRAIN = "train" 4 | DEV = "dev" 5 | # TEST = "test" 6 | TEST = "dev"# For now, use dev set as test 7 | LANGUAGES = ["english", "french", "irish", "italian", "spanish"] 8 | RESOURCES = ["low"]#"medium"] 9 | MODEL_TYPE = ["transformer"]#, "pointer_generator"] 10 | EPOCHS_PER_RESOURCE = {"low": 1100, "medium": 400} 11 | BATCH_SIZE_PER_RESOURCE = {"low": 64, "medium": 128} 12 | EVAL_EVERY = 25 13 | for model in MODEL_TYPE: 14 | for resource in RESOURCES: 15 | for language in LANGUAGES: 16 | print(f"{resource} - {language}") 17 | # Get epoch and batch size 18 | epochs = EPOCHS_PER_RESOURCE[resource] 19 | batch_size = BATCH_SIZE_PER_RESOURCE[resource] 20 | # Set names of relevant files and directories 21 | train_file = f"{language}-{TRAIN}-{resource}" 22 | valid_file = f"{language}-{DEV}" 23 | test_file = f"{language}-{TEST}" 24 | covered_test_file = f"{language}-covered-{TEST}" 25 | pred_file = f"{language}-{resource}-{TEST}-pred" 26 | vocab_file = f"{train_file}-vocab" 27 | 28 | data_folder = "data" 29 | vocab_folder = f"vocab/{language}/{resource}" 30 | checkpoints_folder = f"model-checkpoints-test/{model}/{language}/{resource}" 31 | pred_folder = f"predictions-test/{model}" 32 | logs_folder = "logs-test" 33 | 34 | # create necessary folders, if they do not exist already 35 | if not os.path.exists(vocab_folder): 36 | os.makedirs(vocab_folder) 37 | if not os.path.exists(checkpoints_folder): 38 | os.makedirs(checkpoints_folder) 39 | if not os.path.exists(pred_folder): 40 | os.makedirs(pred_folder) 41 | if not os.path.exists(logs_folder): 42 | os.makedirs(logs_folder) 43 | 44 | # Create vocabulary 45 | # print(f"python vocabulary.py --src {data_folder}/{train_file} --vocab {data_folder}/{vocab_file}") 46 | # Train model 47 | os.system(f"python train.py --arch {model} --epochs {epochs} --batch-size {batch_size} --eval-every {EVAL_EVERY} " + 48 | f"--train {data_folder}/{train_file} --dev {data_folder}/{valid_file} " + 49 | f"--vocab 
{vocab_folder}/{vocab_file} --checkpoints-folder {checkpoints_folder}" + 50 | f" >> {logs_folder}/train-log-{model}-{resource}-{language}.out") 51 | # Generate predictions for test set 52 | os.system(f"python generate.py --model-checkpoint {checkpoints_folder}/model_best.pth " + 53 | f"--test {data_folder}/{covered_test_file} --vocab {vocab_folder}/{vocab_file} " + 54 | f"--pred {pred_folder}/{pred_file}") 55 | # Evaluate accuracy of prediction file compared to true test set 56 | os.system(f"python evaluate.py --pred {pred_folder}/{pred_file} " + 57 | f"--target {data_folder}/{test_file}") -------------------------------------------------------------------------------- /tokenizer.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import torch 4 | PAD_ID = 0 5 | SOS_ID = 1 6 | EOS_ID = 2 7 | UNK_ID = 3 8 | 9 | 10 | def load_vocab(vocab_file_path, sos, eos, pad, unk): 11 | """ Loads vocabulary from vocabulary file (created by vocabulary.py)""" 12 | vocab_file = open(vocab_file_path, "r+", encoding='utf-8') 13 | lines = vocab_file.readlines() 14 | 15 | vocab = collections.OrderedDict() 16 | # First, add the special symbols for pad, sos, eos and unk to the vocabulary 17 | vocab[pad] = PAD_ID 18 | vocab[sos] = SOS_ID 19 | vocab[eos] = EOS_ID 20 | vocab[unk] = UNK_ID 21 | # For each valid line, get the token and the index of the line 22 | for index, line in enumerate(lines): 23 | if line != "\n": 24 | token, count = line.replace("\n", "").split("\t") 25 | vocab[token] = index + 4 # first four ids are reserved for the special symbols 26 | 27 | return vocab 28 | 29 | 30 | def convert_by_vocab(vocab, items, unk_val): 31 | """Converts a sequence of tokens or ids using the given vocabulary. 32 | If a token is not in the vocabulary, unk_val is returned as the default. """ 33 | output = [] 34 | for item in items: 35 | output.append(vocab.get(item, unk_val)) 36 | return output 37 | 38 | 39 | def tokenize_words(word_list): 40 | """Splits words into lists of characters""" 41 | return [list(words) for words in word_list] 42 | 43 | 44 | def tokenize_features(features_list): 45 | """Splits features by the separator sign ";" """ 46 | return [connected_features.split(";") for connected_features in features_list] 47 | 48 | 49 | class Tokenizer(object): 50 | """ Tokenizer object. 
Handles tokenizing sentences, converting tokens to ids and vice versa""" 51 | 52 | def __init__(self, src_vocab_file_path, tgt_vocab_file_path, device): 53 | self.sos = "" 54 | self.eos = "" 55 | self.pad = "" 56 | self.unk = "" 57 | self.pad_id = PAD_ID 58 | self.sos_id = SOS_ID 59 | self.eos_id = EOS_ID 60 | self.unk_id = UNK_ID 61 | 62 | self.device = device 63 | 64 | self.src_vocab = load_vocab(src_vocab_file_path, self.sos, 65 | self.eos, self.pad, self.unk) # vocabulary of all token->id in the input 66 | self.inv_src_vocab = {v: k for k, v in self.src_vocab.items()} # reverse vocabulary of input, id->token 67 | self.src_vocab_size = len(self.src_vocab) 68 | self.tgt_vocab = load_vocab(tgt_vocab_file_path, self.sos, 69 | self.eos, self.pad, self.unk) # vocabulary of all token->id in the output 70 | self.inv_tgt_vocab = {v: k for k, v in self.tgt_vocab.items()} # reverse vocabulary of output, id->token 71 | self.tgt_vocab_size = len(self.tgt_vocab) 72 | 73 | self.src_to_tgt_vocab_conversion_matrix = self.get_src_to_tgt_vocab_conversion_matrix() 74 | 75 | def add_sequence_symbols(self, tokens_list): 76 | """ Adds eos and sos symbols to each sequence of tokens""" 77 | return [[self.sos] + tokens + [self.eos] for tokens in tokens_list] 78 | 79 | def convert_src_tokens_to_ids(self, tokens): 80 | """ Converts all given tokens to ids using the input vocabulary""" 81 | return convert_by_vocab(self.src_vocab, tokens, self.unk_id) 82 | 83 | def convert_src_ids_to_tokens(self, ids): 84 | """ Converts all given ids to tokens using the input vocabulary""" 85 | return convert_by_vocab(self.inv_src_vocab, ids, self.unk) 86 | 87 | def convert_tgt_tokens_to_ids(self, tokens): 88 | """ Converts all given tokens to the ids using the output vocabulary""" 89 | return convert_by_vocab(self.tgt_vocab, tokens, self.unk_id) 90 | 91 | def convert_tgt_ids_to_tokens(self, ids): 92 | """ Converts all given tokens to the ids using the output vocabulary""" 93 | return convert_by_vocab(self.inv_tgt_vocab, ids, self.unk) 94 | 95 | def get_id_tensors(self, tokens_list, vocab_type): 96 | """ Gets list of token sequences, and converts each token sequence to tensor of ids, using the tokenizer 97 | device to determine tensor device type, and vocab type is either "INPUT" or "OUTPUT" """ 98 | if vocab_type == "INPUT": 99 | return [torch.tensor(self.convert_src_tokens_to_ids(tokens), dtype=torch.long, device=self.device) 100 | for tokens in tokens_list] 101 | else: 102 | return [torch.tensor(self.convert_tgt_tokens_to_ids(tokens), dtype=torch.long, device=self.device) 103 | for tokens in tokens_list] 104 | 105 | def pad_tokens_sequence(self, tokens, max_seq_len): 106 | """ Pads the token sequence with pad symbols until it reaches the max sequence length. 107 | If Sequence is already at max length, nothing is added. 
""" 108 | padding_len = max_seq_len - len(tokens) 109 | padding = [self.pad] * padding_len 110 | return tokens + padding 111 | 112 | def get_src_to_tgt_vocab_conversion_matrix(self): 113 | # Initialize conversion matrix 114 | src_to_tgt_conversion_matrix = torch.zeros(self.src_vocab_size, self.tgt_vocab_size, device=self.device) 115 | src_vocab_items = self.src_vocab.items() 116 | # Go over all (token, id) items in src vocab 117 | for src_token, src_id in src_vocab_items: 118 | tgt_id = self.tgt_vocab.get(src_token, self.unk_id) 119 | src_to_tgt_conversion_matrix[src_id][tgt_id] = 1 120 | return src_to_tgt_conversion_matrix 121 | 122 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | from tqdm import tqdm 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.optim.lr_scheduler import ReduceLROnPlateau 9 | 10 | import utils 11 | import dataset 12 | import tokenizer 13 | 14 | import transformer_baseline 15 | 16 | # Training parameters 17 | parser = argparse.ArgumentParser(description='Training Transformer and Pointer-Generator for morphological inflection') 18 | parser.add_argument('--train', type=str, default='data', 19 | help="Train file of the dataset (File is located in DATA_FOLDER)") 20 | parser.add_argument('--dev', type=str, default='data', 21 | help="Validation file of the dataset (File is located in DATA_FOLDER)") 22 | parser.add_argument('--vocab', type=str, default='data', 23 | help="Base name of vocabulary files (must include dir path)") 24 | parser.add_argument('--checkpoints-dir', type=str, default='model-checkpoints', 25 | help='Folder to keep checkpoints of model') 26 | parser.add_argument('--resume', default=False, action='store_true', 27 | help="Whether to resume training from a certain checkpoint") 28 | parser.add_argument('--reload', default=False, action='store_true', 29 | help="Whether to reload pretrained model from certain checkpoint") 30 | parser.add_argument('--epochs', type=int, default=100, 31 | help='number of epochs to train (default: 100)') 32 | parser.add_argument('--steps', type=int, default=100, 33 | help='number of batch steps to train (default: 20,000)') 34 | parser.add_argument('--batch-size', type=int, default=128, 35 | help='input batch size for training (default: 128)') 36 | parser.add_argument('--eval-every', type=int, default=1, 37 | help='Evaluate model over validation set every how many epochs (default: 1)') 38 | parser.add_argument('--arch', type=str, default='transformer', 39 | help="Architecture type for model: transformer, pointer_generator") 40 | parser.add_argument('--embed-dim', type=int, default=128, 41 | help='Embedding dimension (default: 128)') 42 | parser.add_argument('--fcn-dim', type=int, default=256, 43 | help='Fully-connected network hidden dimension (default: 256)') 44 | parser.add_argument('--num-heads', type=int, default=4, 45 | help='number of attention heads (default: 4)') 46 | parser.add_argument('--num-layers', type=int, default=2, 47 | help='number of layers in encoder and decoder (default: 2)') 48 | parser.add_argument('--dropout', type=float, default=0.2, 49 | help='Dropout probability (default: 0.2)') 50 | parser.add_argument('--lr', type=float, default=5e-4, 51 | help='learning rate (default: 0.01)') 52 | parser.add_argument('--beta', type=float, default=0.9, 53 | help='beta for Adam optimizer (default: 0.01)') 54 | 
parser.add_argument('--beta2', type=float, default=0.999, 55 | help='beta 2 for Adam optimizer (default: 0.01)') 56 | parser.add_argument('--label-smooth', default=0.1, type=float, 57 | help='label smoothing coeff') 58 | parser.add_argument('--scheduler', type=str, default="ReduceLROnPlateau", 59 | help='Learning rate Scheduler (default: ReduceLROnPlateau)') 60 | parser.add_argument('--patience', default=5, type=int, 61 | help='patience of for early stopping (default: 0)') 62 | parser.add_argument('--min-lr', type=float, default=1e-5, 63 | help='Minimum learning rate (default: 0.01)') 64 | parser.add_argument('--discount-factor', default=0.5, type=float, 65 | help='discount factor of `ReduceLROnPlateau` (default: 0.5)') 66 | parser.add_argument('--patience_reduce', default=0, type=int, 67 | help='patience of `ReduceLROnPlateau` (default: 0)') 68 | parser.add_argument('--warmup-steps', default=4000, type=int, 69 | help='number of warm up steps for scheduler (default: 4000)') 70 | args = parser.parse_args() 71 | 72 | # Get train and validation file paths 73 | train_file = args.train 74 | valid_file = args.dev 75 | # Get vocabulary paths 76 | src_vocab_file = args.vocab + "-input" 77 | tgt_vocab_file = args.vocab + "-output" 78 | # Initialize Tokenizer object with input and output vocabulary files 79 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 80 | myTokenizer = tokenizer.Tokenizer(src_vocab_file, tgt_vocab_file, device) 81 | 82 | """ CONSTANTS """ 83 | MAX_SRC_SEQ_LEN = 45 84 | MAX_TGT_SEQ_LEN = 45 85 | SRC_VOCAB_SIZE = myTokenizer.src_vocab_size 86 | TGT_VOCAB_SIZE = myTokenizer.tgt_vocab_size 87 | # Model Hyperparameters 88 | EMBEDDING_DIM = args.embed_dim 89 | FCN_HIDDEN_DIM = args.fcn_dim 90 | NUM_HEADS = args.num_heads 91 | NUM_LAYERS = args.num_layers 92 | DROPOUT = args.dropout 93 | 94 | """ MODEL AND DATA LOADER """ 95 | model = utils.build_model(args.arch, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, EMBEDDING_DIM, FCN_HIDDEN_DIM, 96 | NUM_HEADS, NUM_LAYERS, DROPOUT, myTokenizer.src_to_tgt_vocab_conversion_matrix) 97 | # --------Transformer model from SIGMORPHON 2020 Baseline---------- 98 | # model = transformer_baseline.Transformer(src_vocab_size=SRC_VOCAB_SIZE, trg_vocab_size=TGT_VOCAB_SIZE, 99 | # embed_dim=EMBEDDING_DIM, nb_heads=NUM_HEADS, 100 | # src_hid_size=FCN_HIDDEN_DIM, src_nb_layers=NUM_LAYERS, 101 | # trg_hid_size=FCN_HIDDEN_DIM, trg_nb_layers=NUM_LAYERS, 102 | # dropout_p=DROPOUT, 103 | # tie_trg_embed=False, src_c2i=None, trg_c2i=None, attr_c2i=None, label_smooth=0.1) 104 | model.to(device) 105 | criterion = nn.NLLLoss(reduction='mean', ignore_index=myTokenizer.pad_id) 106 | optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(args.beta, args.beta2)) 107 | scheduler = ReduceLROnPlateau(optimizer, 'min', min_lr=args.min_lr, factor=args.discount_factor, 108 | patience=args.patience_reduce) \ 109 | if (args.scheduler == "ReduceLROnPlateau") \ 110 | else utils.WarmupInverseSquareRootSchedule(optimizer, args.warmup_steps) 111 | # Initialize DataLoader object 112 | data_loader = dataset.DataLoader(myTokenizer, train_file_path=train_file, valid_file_path=valid_file, 113 | test_file_path=None, device=device, batch_size=args.batch_size, 114 | max_src_seq_len=MAX_SRC_SEQ_LEN, max_tgt_seq_len=MAX_TGT_SEQ_LEN) 115 | 116 | 117 | """ HELPER FUNCTIONS""" 118 | def get_lr(): 119 | if isinstance(scheduler, ReduceLROnPlateau): 120 | return optimizer.param_groups[0]['lr'] 121 | try: 122 | return scheduler.get_last_lr()[0] 123 | except: 124 | return 
scheduler.get_lr()[0] 125 | 126 | 127 | def get_loss(predict, target): 128 | """ 129 | Compute loss 130 | :param predict: SxNxTGT_VOCAB 131 | :param target: SxN 132 | :return: loss 133 | """ 134 | predict = predict.contiguous().view(-1, TGT_VOCAB_SIZE) 135 | # nll_loss = F.nll_loss(predict, target.view(-1), ignore_index=PAD_IDX) 136 | target = target.contiguous().view(-1, 1) 137 | non_pad_mask = target.ne(myTokenizer.pad_id) 138 | nll_loss = -predict.gather(dim=-1, index=target)[non_pad_mask].mean() 139 | smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad_mask].mean() 140 | smooth_loss = smooth_loss / TGT_VOCAB_SIZE 141 | loss = (1. - 142 | args.label_smooth) * nll_loss + args.label_smooth * smooth_loss 143 | return loss 144 | 145 | 146 | """ LOGGING SETTINGS AND LOGGING """ 147 | # Set number of total epoch and min epoch for eval start 148 | MIN_EVAL_STEPS = 4000 149 | steps_per_epoch = int(math.ceil(data_loader.train_set_size / args.batch_size)) 150 | epochs = int(math.ceil(args.steps / steps_per_epoch)) 151 | min_eval_epochs = int(MIN_EVAL_STEPS / steps_per_epoch) 152 | 153 | # Log all model settings 154 | logger = utils.get_logger() 155 | logger.info(f"Training model") 156 | logger.info(f"Arch: {args.arch}, embed_dim: {EMBEDDING_DIM}, fcn_hid_dim: {FCN_HIDDEN_DIM}," 157 | f" num-heads: {NUM_HEADS}, num-layers: {NUM_LAYERS}, dropout: {DROPOUT}, device: {device}") 158 | logger.info(f"Optimizer: Adam, lr: {args.lr}, beta: {args.beta}, beta2: {args.beta2}") 159 | logger.info( 160 | f"Scheduler: {args.scheduler}, patience: {args.patience}, min_lr: {args.min_lr}, warmup steps: {args.warmup_steps}," 161 | f" discount factor: {args.discount_factor}, patience_reduce: {args.patience_reduce}") 162 | logger.info(f"Source vocabulary: Size = {myTokenizer.src_vocab_size}, {myTokenizer.src_vocab}") 163 | logger.info(f"Target vocabulary: Size = {myTokenizer.tgt_vocab_size}, {myTokenizer.tgt_vocab}") 164 | logger.info(f"Training file: {train_file}") 165 | logger.info(f"Validation file: {valid_file}") 166 | logger.info(f"Input vocabulary file: {src_vocab_file}") 167 | logger.info(f"Output vocabulary file: {tgt_vocab_file}") 168 | logger.info(f"Checkpoints dir: {args.checkpoints_dir}") 169 | logger.info(f"Model: {model}") 170 | logger.info(f"Resume training: {args.resume}, reload from pretraining: {args.reload}") 171 | logger.info(f"Steps: {args.steps}, batch size:{args.batch_size},\n" 172 | f"Train set size: {data_loader.train_set_size}, Steps per epoch {steps_per_epoch},\n" 173 | f"Epochs: {epochs}, Eval every :{args.eval_every}") 174 | 175 | # Reload model/ resume training if applicable 176 | if args.resume: 177 | # Resume training from checkpoint 178 | model, optimizer, scheduler, start_epoch, best_valid_accuracy = \ 179 | utils.load_checkpoint(model, optimizer, scheduler, f"{args.checkpoints_dir}/model_best.pth", logger) 180 | best_valid_epoch = start_epoch 181 | else: 182 | # Reload pretrained model from checkpoint 183 | if args.reload: 184 | model = utils.load_model(model, f"{args.checkpoints_dir}/model_best.pth", logger) 185 | start_epoch = 0 186 | # Initialize best validation loss placeholders 187 | best_valid_accuracy = -1.0 188 | best_valid_epoch = 0 189 | 190 | """ FUNCTIONS """ 191 | def train(epoch): 192 | """ Runs full training epoch over the training set, uses teacher forcing in training""" 193 | model.train() 194 | running_loss = 0.0 195 | # Get Training set in batches 196 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_train_set() 197 | # Go over 
each batch 198 | for i, (data, target, target_y) in tqdm(enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches))): 199 | optimizer.zero_grad() 200 | # Get padding masks 201 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target) 202 | # Compute output of model 203 | output = model(data, target, src_pad_mask, target_pad_mask, mem_pad_mask) 204 | # --------------- 205 | # Compute loss 206 | # loss = criterion(output.contiguous().view(-1, TGT_VOCAB_SIZE), target_y.contiguous().view(-1)) 207 | loss = get_loss(output.transpose(0, 1), target_y.transpose(0, 1)) 208 | # ------------- 209 | # Propagate loss and update model parameters 210 | loss.backward() 211 | optimizer.step() 212 | if not isinstance(scheduler, ReduceLROnPlateau): 213 | scheduler.step() 214 | running_loss += loss.item() 215 | # print statistics 216 | logger.info(f"Train Epoch: {epoch}, avg loss: {running_loss / (i + 1):.4f}, lr {get_lr():.6f}") 217 | 218 | 219 | def validation(epoch): 220 | """ Computes loss and accuracy over the validation set, using teacher forcing inputs """ 221 | model.eval() 222 | running_loss = 0 223 | correct_preds = 0 224 | # Get Training set batches 225 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_validation_set_tf() 226 | # Go over each batch 227 | for i, (data, target, target_y) in enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches)): 228 | # Get padding masks 229 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target) 230 | # Compute output of model 231 | output = model(data, target, src_pad_mask, target_pad_mask, mem_pad_mask) 232 | # Get model predictions 233 | predictions = output.topk(1)[1].squeeze() 234 | # Compute accuracy 235 | target_pad_mask = (target_pad_mask == False).int() 236 | predictions = predictions * target_pad_mask 237 | correct_preds += torch.all(torch.eq(predictions, target_y), dim=-1).sum() 238 | # --------------- 239 | # Compute loss 240 | # loss = criterion(output.contiguous().view(-1, TGT_VOCAB_SIZE), target_y.contiguous().view(-1)) 241 | loss = get_loss(output.transpose(0, 1), target_y.transpose(0, 1)) 242 | # ------------- 243 | running_loss += loss.item() 244 | # print statistics 245 | final_loss = running_loss / (i + 1) 246 | if isinstance(scheduler, ReduceLROnPlateau): 247 | scheduler.step(final_loss) 248 | accuracy = float(100 * correct_preds) / data_loader.get_validation_set_len() 249 | logger.info(f"Validation. 
Epoch: {epoch}, avg dev loss: {final_loss:.4f}, accuracy: {accuracy:.2f}%") 250 | return accuracy # final_loss 251 | 252 | 253 | # --------For Transformer model from SIGMORPHON 2020 Baseline---------- 254 | def train_baseline(epoch): 255 | """ Runs full training epoch over the training set, uses teacher forcing in training""" 256 | model.train() 257 | running_loss = 0.0 258 | # Get Training set in batches 259 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_train_set() 260 | # Go over each batch 261 | for i, (data, target, target_y) in enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches)): 262 | optimizer.zero_grad() 263 | # Get padding masks 264 | data = data.transpose(0, 1) 265 | target = target.transpose(0, 1) 266 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target) 267 | src_pad_mask, mem_pad_mask, target_pad_mask = (src_pad_mask == False).float(), ( 268 | mem_pad_mask == False).float(), (target_pad_mask == False).float() 269 | # Compute loss 270 | batch = (data, src_pad_mask, target, target_pad_mask) 271 | loss = model.get_loss(batch) 272 | loss.backward() 273 | optimizer.step() 274 | running_loss += loss.item() 275 | # print statistics 276 | print(f"\nTrain Epoch: {epoch}, loss: {running_loss / (i + 1):.5f}") 277 | 278 | 279 | def validation_baseline(epoch): 280 | """ Computes loss and accuracy over the validation set, using teacher forcing inputs """ 281 | model.eval() 282 | running_loss = 0.0 283 | correct_preds = 0 284 | # Get Training set in batches 285 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_validation_set_tf() 286 | # Go over each batch 287 | for i, (data, target, target_y) in enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches)): 288 | data = data.transpose(0, 1) 289 | target = target.transpose(0, 1) 290 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target) 291 | src_pad_mask, mem_pad_mask, target_pad_mask = (src_pad_mask == False).float(), ( 292 | mem_pad_mask == False).float(), (target_pad_mask == False).float() 293 | target_y_pad_mask = data_loader.get_padding_mask(target_y) 294 | # Compute loss over output (using baseline code function) 295 | loss = model.get_loss((data, src_pad_mask, target, target_pad_mask)) 296 | running_loss += loss.item() 297 | # Compute output of model 298 | output = model(data, src_pad_mask, target, target_pad_mask).transpose(0, 1) 299 | # Get model predictions 300 | predictions = output.topk(1)[1].squeeze() 301 | target_pad_mask_test = (target_y_pad_mask == False).int() 302 | predictions = predictions * target_pad_mask_test 303 | correct_preds += torch.all(torch.eq(predictions, target_y), dim=-1).sum() 304 | final_loss = running_loss / (i + 1) 305 | accuracy = float(correct_preds) / data_loader.get_validation_set_len() 306 | print(f"Validation. Epoch: {epoch}, loss: {final_loss:.4f}, accuracy: {accuracy:.2f}%") 307 | return accuracy # , final_loss 308 | 309 | 310 | if __name__ == '__main__': 311 | eval_every = args.eval_every 312 | epochs_no_improve = 0 313 | logger.info(f"Starting training from Epoch {start_epoch + 1}") 314 | for epoch in range(start_epoch + 1, epochs + 1): 315 | # Check for early stopping 316 | if epochs_no_improve == args.patience: 317 | logger.info( 318 | f"Applied early stopping and stopped training. 
Val accuracy not improve in {args.patience} epochs") 319 | break 320 | # --------- 321 | train(epoch) 322 | # --------For Transformer model from SIGMORPHON 2020 Baseline---------- 323 | # train_baseline(epoch) 324 | # --------- 325 | is_best = False 326 | curr_valid_accuracy = 0 327 | # Check model on validation set and get loss, every few epochs 328 | if epoch % eval_every == 0 and epoch > min_eval_epochs: 329 | epochs_no_improve += 1 330 | # --------- 331 | curr_valid_accuracy = validation(epoch) 332 | # --------For Transformer model from SIGMORPHON 2020 Baseline---------- 333 | # curr_valid_accuracy = validation_baseline(epoch) 334 | # --------- 335 | # If best accuracy so far, save model as best and the accuracy 336 | if curr_valid_accuracy > best_valid_accuracy: 337 | logger.info("New best accuracy, Model saved") 338 | is_best = True 339 | best_valid_accuracy = curr_valid_accuracy 340 | best_valid_epoch = epoch 341 | epochs_no_improve = 0 342 | utils.save_checkpoint(model, epoch, optimizer, scheduler, curr_valid_accuracy, is_best, args.checkpoints_dir) 343 | utils.clean_checkpoints_dir(args.checkpoints_dir) 344 | logger.info(f"Finished training, best model on validation set: {best_valid_epoch}," 345 | f" accuracy: {best_valid_accuracy:.2f}%\n") 346 | 347 | 348 | # Train model 349 | # seed = 0 350 | # torch.manual_seed(seed=seed) 351 | # if torch.cuda.is_available(): 352 | # torch.cuda.manual_seed_all(seed) 353 | # BEST MODEL FOR MEDIUM RESOURCE 354 | # EMBEDDING_DIM = 64 355 | # FCN_HIDDEN_DIM = 256 356 | # NUM_HEADS = 4 357 | # NUM_LAYERS = 2 358 | # DROPOUT = 0.2 359 | # # BEST MODEL FOR LOW RESOURCE 360 | # EMBEDDING_DIM = 128 361 | # FCN_HIDDEN_DIM = 64 362 | # NUM_HEADS = 4 363 | # NUM_LAYERS = 2 364 | # DROPOUT = 0.2 365 | -------------------------------------------------------------------------------- /transformer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from torch.nn.init import xavier_uniform_ 4 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 5 | from model_utils import _generate_square_subsequent_mask, Embedding, PositionalEncoding 6 | from tokenizer import PAD_ID 7 | 8 | class Transformer(nn.Module): 9 | def __init__(self, src_vocab_size=128, tgt_vocab_size=128, 10 | embedding_dim=128, fcn_hidden_dim=128, 11 | num_heads=4, num_layers=2, dropout=0.2): 12 | super(Transformer, self).__init__() 13 | 14 | self.embedding_dim = embedding_dim 15 | # Source and Encoder layers 16 | self.src_embed = Embedding(src_vocab_size, embedding_dim, padding_idx=PAD_ID) 17 | self.src_pos_encoder = PositionalEncoding(embedding_dim) 18 | encoder_layer = TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, 19 | dim_feedforward=fcn_hidden_dim, dropout=dropout) 20 | encoder_norm = nn.LayerNorm(embedding_dim) 21 | self.encoder = TransformerEncoder(encoder_layer, num_layers, encoder_norm) 22 | 23 | # Target and Decoder layers 24 | self.tgt_embed = Embedding(tgt_vocab_size, embedding_dim, padding_idx=PAD_ID) 25 | self.tgt_pos_encoder = PositionalEncoding(embedding_dim) 26 | decoder_layer = nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=num_heads, 27 | dim_feedforward=fcn_hidden_dim, dropout=dropout) 28 | decoder_norm = nn.LayerNorm(embedding_dim) 29 | self.decoder = nn.TransformerDecoder(decoder_layer, num_layers, decoder_norm) 30 | # Final linear layer 31 | self.final_out = nn.Linear(embedding_dim, tgt_vocab_size) 32 | 33 | # Initialize masks 34 | 
self.src_mask = None 35 | self.tgt_mask = None 36 | self.mem_mask = None 37 | # Initialize weights of model 38 | self._reset_parameters() 39 | 40 | def _reset_parameters(self): 41 | """ Initiate parameters in the transformer model. """ 42 | for p in self.parameters(): 43 | if p.dim() > 1: 44 | xavier_uniform_(p) 45 | 46 | 47 | def encode(self, src, src_key_padding_mask=None): 48 | """ 49 | Applies embedding, positional encoding and then runs the transformer encoder on the source 50 | :param src: source tokens batch 51 | :param src_key_padding_mask: source padding mask 52 | :return: memory- the encoder hidden states 53 | """ 54 | # Source embedding and positional encoding, changes dimension (N, S) -> (N, S, E) -> (S, N, E) 55 | src_embed = self.src_embed(src).transpose(0, 1) 56 | src_embed = self.src_pos_encoder(src_embed) 57 | # Pass the source to the encoder 58 | memory = self.encoder(src_embed, mask=self.src_mask, src_key_padding_mask=src_key_padding_mask) 59 | return memory 60 | 61 | def decode(self, memory, tgt, tgt_key_padding_mask=None, memory_key_padding_mask=None, has_mask=True): 62 | """ 63 | Applies embedding, positional encoding on target and then runs the transformer encoder on the memory and target. 64 | Also creates square subsequent mask for teacher learning. 65 | :param memory: The encoder hidden states 66 | :param tgt: Target tokens batch 67 | :param tgt_key_padding_mask: target padding mask 68 | :param memory_key_padding_mask: memory padding mask 69 | :param has_mask: Whether to use square subsequent mask for teacher learning 70 | :return: decoder output 71 | """ 72 | # Create target mask for transformer if no appropriate one was created yet, created of size (T, T) 73 | if has_mask: 74 | if self.tgt_mask is None or self.tgt_mask.size(0) != tgt.size(1): 75 | self.tgt_mask = _generate_square_subsequent_mask(tgt.size(1)).to(tgt.device) 76 | else: 77 | self.tgt_mask = None 78 | # Target embedding and positional encoding, changes dimension (N, T) -> (N, T, E) -> (T, N, E) 79 | tgt_embed = self.tgt_embed(tgt).transpose(0, 1) 80 | tgt_embed = self.tgt_pos_encoder(tgt_embed) 81 | # Get output of decoder. Dimensions stay the same 82 | decoder_output = self.decoder(tgt_embed, memory, tgt_mask=self.tgt_mask, memory_mask=self.mem_mask, 83 | tgt_key_padding_mask=tgt_key_padding_mask, 84 | memory_key_padding_mask=memory_key_padding_mask) 85 | # Add linear layer & log softmax, (T, N, E) -> (T, N, tgt_vocab_size) 86 | output = F.log_softmax(self.final_out(decoder_output), dim=-1) 87 | # Change back batch and sequence dimensions, from (T, N, tgt_vocab_size) -> (N, T, tgt_vocab_size) 88 | return output.transpose(0, 1) 89 | 90 | 91 | def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None, has_mask=True): 92 | """Take in and process masked source/target sequences. 93 | 94 | Args: 95 | src: the sequence to the encoder (required). 96 | tgt: the sequence to the decoder (required). 97 | src_mask: the additive mask for the src sequence (optional). 98 | tgt_mask: the additive mask for the tgt sequence (optional). 99 | memory_mask: the additive mask for the encoder output (optional). 100 | src_key_padding_mask: the ByteTensor mask for src keys per batch (optional). 101 | tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional). 102 | memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional). 103 | 104 | Shape: 105 | - src: :math:`(S, N, E)`. 
Starts as (N, S) and changed after embedding 106 | - tgt: :math:`(T, N, E)`. Starts as (N, T) and changed after embedding 107 | - src_mask: :math:`(S, S)`. 108 | - tgt_mask: :math:`(T, T)`. 109 | - memory_mask: :math:`(T, S)`. 110 | - src_key_padding_mask: :math:`(N, S)`. 111 | - tgt_key_padding_mask: :math:`(N, T)`. 112 | - memory_key_padding_mask: :math:`(N, S)`. 113 | 114 | Note: [src/tgt/memory]_mask should be filled with 115 | float('-inf') for the masked positions and float(0.0) else. These masks 116 | ensure that predictions for position i depend only on the unmasked positions 117 | j and are applied identically for each sequence in a batch. 118 | [src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions 119 | that should be masked with float('-inf') and False values will be unchanged. 120 | This mask ensures that no information will be taken from position i if 121 | it is masked, and has a separate mask for each sequence in a batch. 122 | 123 | - output: :math:`(T, N, E)`. 124 | 125 | Note: Due to the multi-head attention architecture in the transformer model, 126 | the output sequence length of a transformer is same as the input sequence 127 | (i.e. target) length of the decode. 128 | 129 | where S is the source sequence length, T is the target sequence length, N is the 130 | batch size, E is the feature number 131 | 132 | Examples: 133 | output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask) 134 | """ 135 | # Applies embedding, positional encoding and the transformer encoder on the source 136 | memory = self.encode(src, src_key_padding_mask) 137 | # Applies embedding, positional encoding on target and then runs the transformer encoder on the memory and target. 138 | output = self.decode(memory, tgt, tgt_key_padding_mask, memory_key_padding_mask, has_mask) 139 | return output 140 | -------------------------------------------------------------------------------- /transformer_baseline-original.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | from collections import namedtuple 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.distributions import Distribution 10 | 11 | 12 | PAD_IDX, BOS_IDX, EOS_IDX= 0, 1, 2 #CHANGED LINE 13 | 14 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | 16 | 17 | class SinusoidalPositionalEmbedding(nn.Module): 18 | """This module produces sinusoidal positional embeddings of any length. 19 | Padding symbols are ignored. 20 | """ 21 | def __init__(self, embedding_dim, padding_idx, init_size=1024): 22 | super().__init__() 23 | self.embedding_dim = embedding_dim 24 | self.padding_idx = padding_idx 25 | self.weights = SinusoidalPositionalEmbedding.get_embedding( 26 | init_size, 27 | embedding_dim, 28 | padding_idx, 29 | ) 30 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 31 | 32 | @staticmethod 33 | def get_embedding(num_embeddings, embedding_dim, padding_idx=None): 34 | """Build sinusoidal embeddings. 35 | This matches the implementation in tensor2tensor, but differs slightly 36 | from the description in Section 3.5 of "Attention Is All You Need". 
37 | """ 38 | half_dim = embedding_dim // 2 39 | emb = math.log(10000) / (half_dim - 1) 40 | emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) 41 | emb = torch.arange(num_embeddings, 42 | dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) 43 | emb = torch.cat([torch.sin(emb), torch.cos(emb)], 44 | dim=1).view(num_embeddings, -1) 45 | if embedding_dim % 2 == 1: 46 | # zero pad 47 | emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) 48 | if padding_idx is not None: 49 | emb[padding_idx, :] = 0 50 | return emb 51 | 52 | def forward(self, input): 53 | """Input is expected to be of size [bsz x seqlen].""" 54 | bsz, seq_len = input.shape 55 | max_pos = self.padding_idx + 1 + seq_len 56 | if self.weights is None or max_pos > self.weights.size(0): 57 | # recompute/expand embeddings if needed 58 | self.weights = SinusoidalPositionalEmbedding.get_embedding( 59 | max_pos, 60 | self.embedding_dim, 61 | self.padding_idx, 62 | ) 63 | self.weights = self.weights.to(self._float_tensor) 64 | 65 | mask = input.ne(self.padding_idx).long() 66 | positions = torch.cumsum(mask, dim=0) * mask + self.padding_idx 67 | return self.weights.index_select(0, positions.view(-1)).view( 68 | bsz, seq_len, -1).detach() 69 | 70 | 71 | class TransformerEncoderLayer(nn.Module): 72 | def __init__(self, 73 | d_model, 74 | nhead, 75 | dim_feedforward=2048, 76 | dropout=0.1, 77 | attention_dropout=0.1, 78 | activation_dropout=0.1, 79 | activation='relu', 80 | normalize_before=True): 81 | super(TransformerEncoderLayer, self).__init__() 82 | self.normalize_before = normalize_before 83 | self.self_attn = nn.MultiheadAttention(d_model, 84 | nhead, 85 | dropout=attention_dropout) 86 | # Implementation of Feedforward model 87 | self.linear1 = Linear(d_model, dim_feedforward) 88 | self.dropout = nn.Dropout(dropout) 89 | self.linear2 = Linear(dim_feedforward, d_model) 90 | 91 | self.norm1 = nn.LayerNorm(d_model) 92 | self.norm2 = nn.LayerNorm(d_model) 93 | self.activation_dropout = nn.Dropout(activation_dropout) 94 | 95 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation] 96 | 97 | def forward(self, src, src_mask=None, src_key_padding_mask=None): 98 | r"""Pass the input through the endocder layer. 99 | 100 | Args: 101 | src: the sequnce to the encoder layer (required). 102 | src_mask: the mask for the src sequence (optional). 103 | src_key_padding_mask: the mask for the src keys per batch (optional). 
104 | """ 105 | # Self attention block 106 | residual = src 107 | if self.normalize_before: 108 | src = self.norm1(src) 109 | src = self.self_attn(src, 110 | src, 111 | src, 112 | attn_mask=src_mask, 113 | key_padding_mask=src_key_padding_mask)[0] 114 | src = residual + self.dropout(src) 115 | if not self.normalize_before: 116 | src = self.norm1(src) 117 | # Feed forward block 118 | residual = src 119 | if self.normalize_before: 120 | src = self.norm2(src) 121 | src = self.activation(self.linear1(src)) 122 | src = self.activation_dropout(src) 123 | src = self.linear2(src) 124 | src = residual + self.dropout(src) 125 | if not self.normalize_before: 126 | src = self.norm2(src) 127 | return src 128 | 129 | 130 | class TransformerDecoderLayer(nn.Module): 131 | def __init__(self, 132 | d_model, 133 | nhead, 134 | dim_feedforward=2048, 135 | dropout=0.1, 136 | attention_dropout=0.1, 137 | activation_dropout=0.1, 138 | activation='relu', 139 | normalize_before=True): 140 | super(TransformerDecoderLayer, self).__init__() 141 | self.normalize_before = normalize_before 142 | self.self_attn = nn.MultiheadAttention(d_model, 143 | nhead, 144 | dropout=attention_dropout) 145 | self.multihead_attn = nn.MultiheadAttention(d_model, 146 | nhead, 147 | dropout=dropout) 148 | # Implementation of Feedforward model 149 | self.linear1 = Linear(d_model, dim_feedforward) 150 | self.dropout = nn.Dropout(dropout) 151 | self.linear2 = Linear(dim_feedforward, d_model) 152 | 153 | self.norm1 = nn.LayerNorm(d_model) 154 | self.norm2 = nn.LayerNorm(d_model) 155 | self.norm3 = nn.LayerNorm(d_model) 156 | self.activation_dropout = nn.Dropout(activation_dropout) 157 | 158 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation] 159 | 160 | def forward(self, 161 | tgt, 162 | memory, 163 | tgt_mask=None, 164 | memory_mask=None, 165 | tgt_key_padding_mask=None, 166 | memory_key_padding_mask=None): 167 | r"""Pass the inputs (and mask) through the decoder layer. 168 | 169 | Args: 170 | tgt: the sequence to the decoder layer (required). 171 | memory: the sequnce from the last layer of the encoder (required). 172 | tgt_mask: the mask for the tgt sequence (optional). 173 | memory_mask: the mask for the memory sequence (optional). 174 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional). 175 | memory_key_padding_mask: the mask for the memory keys per batch (optional). 
176 | """ 177 | # self attention block 178 | residual = tgt 179 | if self.normalize_before: 180 | tgt = self.norm1(tgt) 181 | tgt = self.self_attn(tgt, 182 | tgt, 183 | tgt, 184 | attn_mask=tgt_mask, 185 | key_padding_mask=tgt_key_padding_mask)[0] 186 | tgt = residual + self.dropout(tgt) 187 | if not self.normalize_before: 188 | tgt = self.norm1(tgt) 189 | # cross attention block 190 | residual = tgt 191 | if self.normalize_before: 192 | tgt = self.norm2(tgt) 193 | tgt = self.multihead_attn(tgt, 194 | memory, 195 | memory, 196 | attn_mask=memory_mask, 197 | key_padding_mask=memory_key_padding_mask)[0] 198 | tgt = residual + self.dropout(tgt) 199 | if not self.normalize_before: 200 | tgt = self.norm2(tgt) 201 | # feed forward block 202 | residual = tgt 203 | if self.normalize_before: 204 | tgt = self.norm3(tgt) 205 | tgt = self.activation(self.linear1(tgt)) 206 | tgt = self.activation_dropout(tgt) 207 | tgt = self.linear2(tgt) 208 | tgt = residual + self.dropout(tgt) 209 | if not self.normalize_before: 210 | tgt = self.norm3(tgt) 211 | return tgt 212 | 213 | 214 | class Transformer(nn.Module): 215 | def __init__(self, *, src_vocab_size, trg_vocab_size, embed_dim, nb_heads, 216 | src_hid_size, src_nb_layers, trg_hid_size, trg_nb_layers, 217 | dropout_p, tie_trg_embed, src_c2i, trg_c2i, attr_c2i, 218 | label_smooth, **kwargs): 219 | ''' 220 | init 221 | ''' 222 | super().__init__() 223 | self.src_vocab_size = src_vocab_size 224 | self.trg_vocab_size = trg_vocab_size 225 | self.embed_dim = embed_dim 226 | self.embed_scale = math.sqrt(embed_dim) 227 | self.nb_heads = nb_heads 228 | self.src_hid_size = src_hid_size 229 | self.src_nb_layers = src_nb_layers 230 | self.trg_hid_size = trg_hid_size 231 | self.trg_nb_layers = trg_nb_layers 232 | self.dropout_p = dropout_p 233 | self.tie_trg_embed = tie_trg_embed 234 | self.label_smooth = label_smooth 235 | self.src_c2i, self.trg_c2i, self.attr_c2i = src_c2i, trg_c2i, attr_c2i 236 | self.src_embed = Embedding(src_vocab_size, 237 | embed_dim, 238 | padding_idx=PAD_IDX) 239 | self.trg_embed = Embedding(trg_vocab_size, 240 | embed_dim, 241 | padding_idx=PAD_IDX) 242 | self.position_embed = SinusoidalPositionalEmbedding(embed_dim, PAD_IDX) 243 | encoder_layer = TransformerEncoderLayer(d_model=embed_dim, 244 | nhead=nb_heads, 245 | dim_feedforward=src_hid_size, 246 | dropout=dropout_p, 247 | attention_dropout=dropout_p, 248 | activation_dropout=dropout_p, 249 | normalize_before=True) 250 | self.encoder = nn.TransformerEncoder(encoder_layer, 251 | num_layers=src_nb_layers, 252 | norm=nn.LayerNorm(embed_dim)) 253 | decoder_layer = TransformerDecoderLayer(d_model=embed_dim, 254 | nhead=nb_heads, 255 | dim_feedforward=trg_hid_size, 256 | dropout=dropout_p, 257 | attention_dropout=dropout_p, 258 | activation_dropout=dropout_p, 259 | normalize_before=True) 260 | self.decoder = nn.TransformerDecoder(decoder_layer, 261 | num_layers=trg_nb_layers, 262 | norm=nn.LayerNorm(embed_dim)) 263 | self.final_out = Linear(embed_dim, trg_vocab_size) 264 | if tie_trg_embed: 265 | self.final_out.weight = self.trg_embed.weight 266 | self.dropout = nn.Dropout(dropout_p) 267 | # self._reset_parameters() 268 | 269 | def embed(self, src_batch, src_mask): 270 | # ------------ 271 | # word_embed = self.embed_scale * self.src_embed(src_batch) 272 | word_embed = self.src_embed(src_batch) 273 | # ------------ 274 | pos_embed = self.position_embed(src_batch) 275 | embed = self.dropout(word_embed + pos_embed) 276 | return embed 277 | 278 | def encode(self, src_batch, src_mask): 279 | embed 
= self.embed(src_batch, src_mask) 280 | return self.encoder(embed, src_key_padding_mask=src_mask) 281 | 282 | def decode(self, enc_hs, src_mask, trg_batch, trg_mask): 283 | # -------------- 284 | # word_embed = self.embed_scale * self.trg_embed(trg_batch) 285 | word_embed = self.trg_embed(trg_batch) 286 | # -------------- 287 | pos_embed = self.position_embed(trg_batch) 288 | embed = self.dropout(word_embed + pos_embed) 289 | 290 | trg_seq_len = trg_batch.size(0) 291 | causal_mask = self.generate_square_subsequent_mask(trg_seq_len) 292 | dec_hs = self.decoder(embed, 293 | enc_hs, 294 | tgt_mask=causal_mask, 295 | tgt_key_padding_mask=trg_mask, 296 | memory_key_padding_mask=src_mask) 297 | return F.log_softmax(self.final_out(dec_hs), dim=-1) 298 | 299 | def forward(self, src_batch, src_mask, trg_batch, trg_mask): 300 | ''' 301 | only for training 302 | ''' 303 | src_mask = (src_mask == 0).transpose(0, 1) 304 | trg_mask = (trg_mask == 0).transpose(0, 1) 305 | # trg_seq_len, batch_size = trg_batch.size() 306 | enc_hs = self.encode(src_batch, src_mask) 307 | # output: [trg_seq_len, batch_size, vocab_siz] 308 | output = self.decode(enc_hs, src_mask, trg_batch, trg_mask) 309 | return output 310 | 311 | def count_nb_params(self): 312 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 313 | params = sum([np.prod(p.size()) for p in model_parameters]) 314 | return params 315 | 316 | def loss(self, predict, target): 317 | ''' 318 | compute loss 319 | ''' 320 | predict = predict.view(-1, self.trg_vocab_size) 321 | # nll_loss = F.nll_loss(predict, target.view(-1), ignore_index=PAD_IDX) 322 | target = target.contiguous().view(-1, 1) 323 | non_pad_mask = target.ne(PAD_IDX) 324 | nll_loss = -predict.gather(dim=-1, index=target)[non_pad_mask].mean() 325 | smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad_mask].mean() 326 | smooth_loss = smooth_loss / self.trg_vocab_size 327 | loss = (1. - 328 | self.label_smooth) * nll_loss + self.label_smooth * smooth_loss 329 | return loss 330 | 331 | def get_loss(self, data): 332 | src, src_mask, trg, trg_mask = data 333 | out = self.forward(src, src_mask, trg, trg_mask) 334 | loss = self.loss(out[:-1], trg[1:]) 335 | return loss 336 | 337 | def generate_square_subsequent_mask(self, sz): 338 | r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf'). 339 | Unmasked positions are filled with float(0.0). 
340 | """ 341 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 342 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill( 343 | mask == 1, float(0.0)) 344 | return mask.to(DEVICE) 345 | 346 | 347 | class TagTransformer(Transformer): 348 | def __init__(self, *, nb_attr, **kwargs): 349 | super().__init__(**kwargs) 350 | self.nb_attr = nb_attr 351 | # 0 -> special token & tags, 1 -> character 352 | self.special_embeddings = Embedding(2, self.embed_dim) 353 | 354 | def embed(self, src_batch, src_mask): 355 | word_embed = self.embed_scale * self.src_embed(src_batch) 356 | char_mask = (src_batch < (self.src_vocab_size - self.nb_attr)).long() 357 | special_embed = self.embed_scale * self.special_embeddings(char_mask) 358 | pos_embed = self.position_embed(src_batch * char_mask) 359 | embed = self.dropout(word_embed + pos_embed + special_embed) 360 | return embed 361 | 362 | 363 | class UniversalTransformerEncoder(nn.Module): 364 | def __init__(self, encoder_layer, num_layers, norm=None): 365 | super(UniversalTransformerEncoder, self).__init__() 366 | self.encoder_layer = encoder_layer 367 | self.num_layers = num_layers 368 | self.norm = norm 369 | 370 | def forward(self, src, mask=None, src_key_padding_mask=None): 371 | output = src 372 | 373 | for i in range(self.num_layers): 374 | output = self.encoder_layer( 375 | output, 376 | src_mask=mask, 377 | src_key_padding_mask=src_key_padding_mask) 378 | 379 | if self.norm: 380 | output = self.norm(output) 381 | 382 | return output 383 | 384 | 385 | class UniversalTransformerDecoder(nn.Module): 386 | def __init__(self, decoder_layer, num_layers, norm=None): 387 | super(UniversalTransformerDecoder, self).__init__() 388 | self.decoder_layer = decoder_layer 389 | self.num_layers = num_layers 390 | self.norm = norm 391 | 392 | def forward(self, 393 | tgt, 394 | memory, 395 | tgt_mask=None, 396 | memory_mask=None, 397 | tgt_key_padding_mask=None, 398 | memory_key_padding_mask=None): 399 | output = tgt 400 | 401 | for i in range(self.num_layers): 402 | output = self.decoder_layer( 403 | output, 404 | memory, 405 | tgt_mask=tgt_mask, 406 | memory_mask=memory_mask, 407 | tgt_key_padding_mask=tgt_key_padding_mask, 408 | memory_key_padding_mask=memory_key_padding_mask) 409 | 410 | if self.norm: 411 | output = self.norm(output) 412 | 413 | return output 414 | 415 | 416 | class UniversalTransformer(Transformer): 417 | def __init__(self, **kwargs): 418 | super().__init__(**kwargs) 419 | encoder_layer = TransformerEncoderLayer( 420 | d_model=self.embed_dim, 421 | nhead=self.nb_heads, 422 | dim_feedforward=self.src_hid_size, 423 | dropout=self.dropout_p, 424 | attention_dropout=self.dropout_p, 425 | activation_dropout=self.dropout_p, 426 | normalize_before=True) 427 | self.encoder = UniversalTransformerEncoder( 428 | encoder_layer, 429 | num_layers=self.src_nb_layers, 430 | norm=nn.LayerNorm(self.embed_dim)) 431 | decoder_layer = TransformerDecoderLayer( 432 | d_model=self.embed_dim, 433 | nhead=self.nb_heads, 434 | dim_feedforward=self.trg_hid_size, 435 | dropout=self.dropout_p, 436 | attention_dropout=self.dropout_p, 437 | activation_dropout=self.dropout_p, 438 | normalize_before=True) 439 | self.decoder = UniversalTransformerDecoder( 440 | decoder_layer, 441 | num_layers=self.trg_nb_layers, 442 | norm=nn.LayerNorm(self.embed_dim)) 443 | 444 | 445 | class TagUniversalTransformer(TagTransformer, UniversalTransformer): 446 | pass 447 | 448 | 449 | def Embedding(num_embeddings, embedding_dim, padding_idx=None): 450 | m = 
nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 451 | nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) 452 | if padding_idx is not None: 453 | nn.init.constant_(m.weight[padding_idx], 0) 454 | return m 455 | 456 | 457 | def Linear(in_features, out_features, bias=True): 458 | m = nn.Linear(in_features, out_features, bias) 459 | nn.init.xavier_uniform_(m.weight) 460 | if bias: 461 | nn.init.constant_(m.bias, 0.) 462 | return m 463 | -------------------------------------------------------------------------------- /transformer_baseline.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | from collections import namedtuple 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.distributions import Distribution 10 | 11 | 12 | PAD_IDX, BOS_IDX, EOS_IDX= 0, 1, 2 #CHANGED LINE 13 | 14 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | 16 | 17 | # class SinusoidalPositionalEmbedding(nn.Module): 18 | # """This module produces sinusoidal positional embeddings of any length. 19 | # Padding symbols are ignored. 20 | # """ 21 | # def __init__(self, embedding_dim, padding_idx, init_size=1024): 22 | # super().__init__() 23 | # self.embedding_dim = embedding_dim 24 | # self.padding_idx = padding_idx 25 | # self.weights = SinusoidalPositionalEmbedding.get_embedding( 26 | # init_size, 27 | # embedding_dim, 28 | # padding_idx, 29 | # ) 30 | # self.register_buffer('_float_tensor', torch.FloatTensor(1)) 31 | # 32 | # @staticmethod 33 | # def get_embedding(num_embeddings, embedding_dim, padding_idx=None): 34 | # """Build sinusoidal embeddings. 35 | # This matches the implementation in tensor2tensor, but differs slightly 36 | # from the description in Section 3.5 of "Attention Is All You Need". 
37 | # """ 38 | # half_dim = embedding_dim // 2 39 | # emb = math.log(10000) / (half_dim - 1) 40 | # emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) 41 | # emb = torch.arange(num_embeddings, 42 | # dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) 43 | # emb = torch.cat([torch.sin(emb), torch.cos(emb)], 44 | # dim=1).view(num_embeddings, -1) 45 | # if embedding_dim % 2 == 1: 46 | # # zero pad 47 | # emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) 48 | # if padding_idx is not None: 49 | # emb[padding_idx, :] = 0 50 | # return emb 51 | # 52 | # def forward(self, input): 53 | # """Input is expected to be of size [bsz x seqlen].""" 54 | # bsz, seq_len = input.shape 55 | # max_pos = self.padding_idx + 1 + seq_len 56 | # if self.weights is None or max_pos > self.weights.size(0): 57 | # # recompute/expand embeddings if needed 58 | # self.weights = SinusoidalPositionalEmbedding.get_embedding( 59 | # max_pos, 60 | # self.embedding_dim, 61 | # self.padding_idx, 62 | # ) 63 | # self.weights = self.weights.to(self._float_tensor) 64 | # 65 | # mask = input.ne(self.padding_idx).long() 66 | # positions = torch.cumsum(mask, dim=0) * mask + self.padding_idx 67 | # return self.weights.index_select(0, positions.view(-1)).view( 68 | # bsz, seq_len, -1).detach() 69 | 70 | class SinusoidalPositionalEmbedding(nn.Module): 71 | """ Adds Sinusoidal positional encoding to sequences """ 72 | def __init__(self, embedding_dim, dropout=0.1, max_seq_len=100): 73 | """ Initializes a seq_len x 1 x embedding_dim positional encoding matrix""" 74 | super(SinusoidalPositionalEmbedding, self).__init__() 75 | self.dropout = nn.Dropout(p=dropout) 76 | 77 | pe = torch.zeros(max_seq_len, embedding_dim) 78 | position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1) 79 | div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim)) 80 | pe[:, 0::2] = torch.sin(position * div_term) 81 | pe[:, 1::2] = torch.cos(position * div_term) 82 | pe = pe.unsqueeze(0).transpose(0, 1) 83 | self.register_buffer('pe', pe) 84 | 85 | def forward(self, x): 86 | """ Adds positional encoding to the input. 87 | Input of dimensions (seq_len x batch_sz x embedding_dim). 88 | Adds positional encoding matrix (seq_len x 1 x embedding_dim) to every individual example in batch """ 89 | x = x + self.pe[:x.size(0), :] 90 | return self.dropout(x) 91 | 92 | 93 | class TransformerEncoderLayer(nn.Module): 94 | def __init__(self, 95 | d_model, 96 | nhead, 97 | dim_feedforward=2048, 98 | dropout=0.1, 99 | attention_dropout=0.1, 100 | activation_dropout=0.1, 101 | activation='relu', 102 | normalize_before=True): 103 | super(TransformerEncoderLayer, self).__init__() 104 | self.normalize_before = normalize_before 105 | self.self_attn = nn.MultiheadAttention(d_model, 106 | nhead, 107 | dropout=attention_dropout) 108 | # Implementation of Feedforward model 109 | self.linear1 = Linear(d_model, dim_feedforward) 110 | self.dropout = nn.Dropout(dropout) 111 | self.linear2 = Linear(dim_feedforward, d_model) 112 | 113 | self.norm1 = nn.LayerNorm(d_model) 114 | self.norm2 = nn.LayerNorm(d_model) 115 | self.activation_dropout = nn.Dropout(activation_dropout) 116 | 117 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation] 118 | 119 | def forward(self, src, src_mask=None, src_key_padding_mask=None): 120 | r"""Pass the input through the endocder layer. 121 | 122 | Args: 123 | src: the sequnce to the encoder layer (required). 124 | src_mask: the mask for the src sequence (optional). 
125 | src_key_padding_mask: the mask for the src keys per batch (optional). 126 | """ 127 | # Self attention block 128 | residual = src 129 | if self.normalize_before: 130 | src = self.norm1(src) 131 | src = self.self_attn(src, 132 | src, 133 | src, 134 | attn_mask=src_mask, 135 | key_padding_mask=src_key_padding_mask)[0] 136 | src = residual + self.dropout(src) 137 | if not self.normalize_before: 138 | src = self.norm1(src) 139 | # Feed forward block 140 | residual = src 141 | if self.normalize_before: 142 | src = self.norm2(src) 143 | src = self.activation(self.linear1(src)) 144 | src = self.activation_dropout(src) 145 | src = self.linear2(src) 146 | src = residual + self.dropout(src) 147 | if not self.normalize_before: 148 | src = self.norm2(src) 149 | return src 150 | 151 | 152 | class TransformerDecoderLayer(nn.Module): 153 | def __init__(self, 154 | d_model, 155 | nhead, 156 | dim_feedforward=2048, 157 | dropout=0.1, 158 | attention_dropout=0.1, 159 | activation_dropout=0.1, 160 | activation='relu', 161 | normalize_before=True): 162 | super(TransformerDecoderLayer, self).__init__() 163 | self.normalize_before = normalize_before 164 | self.self_attn = nn.MultiheadAttention(d_model, 165 | nhead, 166 | dropout=attention_dropout) 167 | self.multihead_attn = nn.MultiheadAttention(d_model, 168 | nhead, 169 | dropout=dropout) 170 | # Implementation of Feedforward model 171 | self.linear1 = Linear(d_model, dim_feedforward) 172 | self.dropout = nn.Dropout(dropout) 173 | self.linear2 = Linear(dim_feedforward, d_model) 174 | 175 | self.norm1 = nn.LayerNorm(d_model) 176 | self.norm2 = nn.LayerNorm(d_model) 177 | self.norm3 = nn.LayerNorm(d_model) 178 | self.activation_dropout = nn.Dropout(activation_dropout) 179 | 180 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation] 181 | 182 | def forward(self, 183 | tgt, 184 | memory, 185 | tgt_mask=None, 186 | memory_mask=None, 187 | tgt_key_padding_mask=None, 188 | memory_key_padding_mask=None): 189 | r"""Pass the inputs (and mask) through the decoder layer. 190 | 191 | Args: 192 | tgt: the sequence to the decoder layer (required). 193 | memory: the sequnce from the last layer of the encoder (required). 194 | tgt_mask: the mask for the tgt sequence (optional). 195 | memory_mask: the mask for the memory sequence (optional). 196 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional). 197 | memory_key_padding_mask: the mask for the memory keys per batch (optional). 
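
        Note:
            With normalize_before=True (pre-norm), each sub-block normalizes its
            input before attention / feed-forward and adds the result to the
            un-normalized residual; with normalize_before=False (post-norm),
            layer norm is applied after the residual connection instead.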
198 | """ 199 | # self attention block 200 | residual = tgt 201 | if self.normalize_before: 202 | tgt = self.norm1(tgt) 203 | tgt = self.self_attn(tgt, 204 | tgt, 205 | tgt, 206 | attn_mask=tgt_mask, 207 | key_padding_mask=tgt_key_padding_mask)[0] 208 | tgt = residual + self.dropout(tgt) 209 | if not self.normalize_before: 210 | tgt = self.norm1(tgt) 211 | # cross attention block 212 | residual = tgt 213 | if self.normalize_before: 214 | tgt = self.norm2(tgt) 215 | tgt = self.multihead_attn(tgt, 216 | memory, 217 | memory, 218 | attn_mask=memory_mask, 219 | key_padding_mask=memory_key_padding_mask)[0] 220 | tgt = residual + self.dropout(tgt) 221 | if not self.normalize_before: 222 | tgt = self.norm2(tgt) 223 | # feed forward block 224 | residual = tgt 225 | if self.normalize_before: 226 | tgt = self.norm3(tgt) 227 | tgt = self.activation(self.linear1(tgt)) 228 | tgt = self.activation_dropout(tgt) 229 | tgt = self.linear2(tgt) 230 | tgt = residual + self.dropout(tgt) 231 | if not self.normalize_before: 232 | tgt = self.norm3(tgt) 233 | return tgt 234 | 235 | 236 | class Transformer(nn.Module): 237 | def __init__(self, *, src_vocab_size, trg_vocab_size, embed_dim, nb_heads, 238 | src_hid_size, src_nb_layers, trg_hid_size, trg_nb_layers, 239 | dropout_p, tie_trg_embed, src_c2i, trg_c2i, attr_c2i, 240 | label_smooth, **kwargs): 241 | ''' 242 | init 243 | ''' 244 | super().__init__() 245 | self.src_vocab_size = src_vocab_size 246 | self.trg_vocab_size = trg_vocab_size 247 | self.embed_dim = embed_dim 248 | self.embed_scale = math.sqrt(embed_dim) 249 | self.nb_heads = nb_heads 250 | self.src_hid_size = src_hid_size 251 | self.src_nb_layers = src_nb_layers 252 | self.trg_hid_size = trg_hid_size 253 | self.trg_nb_layers = trg_nb_layers 254 | self.dropout_p = dropout_p 255 | self.tie_trg_embed = tie_trg_embed 256 | self.label_smooth = label_smooth 257 | self.src_c2i, self.trg_c2i, self.attr_c2i = src_c2i, trg_c2i, attr_c2i 258 | self.src_embed = Embedding(src_vocab_size, 259 | embed_dim, 260 | padding_idx=PAD_IDX) 261 | self.trg_embed = Embedding(trg_vocab_size, 262 | embed_dim, 263 | padding_idx=PAD_IDX) 264 | self.position_embed = SinusoidalPositionalEmbedding(embed_dim, PAD_IDX) 265 | encoder_layer = TransformerEncoderLayer(d_model=embed_dim, 266 | nhead=nb_heads, 267 | dim_feedforward=src_hid_size, 268 | dropout=dropout_p, 269 | attention_dropout=dropout_p, 270 | activation_dropout=dropout_p, 271 | normalize_before=True) 272 | self.encoder = nn.TransformerEncoder(encoder_layer, 273 | num_layers=src_nb_layers, 274 | norm=nn.LayerNorm(embed_dim)) 275 | decoder_layer = TransformerDecoderLayer(d_model=embed_dim, 276 | nhead=nb_heads, 277 | dim_feedforward=trg_hid_size, 278 | dropout=dropout_p, 279 | attention_dropout=dropout_p, 280 | activation_dropout=dropout_p, 281 | normalize_before=True) 282 | self.decoder = nn.TransformerDecoder(decoder_layer, 283 | num_layers=trg_nb_layers, 284 | norm=nn.LayerNorm(embed_dim)) 285 | self.final_out = Linear(embed_dim, trg_vocab_size) 286 | if tie_trg_embed: 287 | self.final_out.weight = self.trg_embed.weight 288 | self.dropout = nn.Dropout(dropout_p) 289 | # self._reset_parameters() 290 | 291 | def embed(self, src_batch, src_mask): 292 | # ------------ 293 | # word_embed = self.embed_scale * self.src_embed(src_batch) 294 | # pos_embed = self.position_embed(src_batch) 295 | # embed = self.dropout(word_embed + pos_embed) 296 | word_embed = self.src_embed(src_batch) 297 | embed = self.position_embed(word_embed) 298 | # ------------ 299 | return embed 300 | 301 | 
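    # Note: the SinusoidalPositionalEmbedding module used here adds the positional
    # encodings (and its own dropout) inside its forward pass, which is why embed()
    # and decode() no longer add pos_embed or dropout explicitly. It is constructed
    # as SinusoidalPositionalEmbedding(embed_dim, PAD_IDX), so PAD_IDX (= 0) is
    # passed as the dropout probability and that dropout is effectively disabled.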
def encode(self, src_batch, src_mask): 302 | embed = self.embed(src_batch, src_mask) 303 | return self.encoder(embed, src_key_padding_mask=src_mask) 304 | 305 | def decode(self, enc_hs, src_mask, trg_batch, trg_mask): 306 | # -------------- 307 | # word_embed = self.embed_scale * self.trg_embed(trg_batch) 308 | # pos_embed = self.position_embed(trg_batch) 309 | # embed = self.dropout(word_embed + pos_embed) 310 | word_embed = self.trg_embed(trg_batch) 311 | embed = self.position_embed(word_embed) 312 | # -------------- 313 | 314 | trg_seq_len = trg_batch.size(0) 315 | causal_mask = self.generate_square_subsequent_mask(trg_seq_len) 316 | dec_hs = self.decoder(embed, 317 | enc_hs, 318 | tgt_mask=causal_mask, 319 | tgt_key_padding_mask=trg_mask, 320 | memory_key_padding_mask=src_mask) 321 | return F.log_softmax(self.final_out(dec_hs), dim=-1) 322 | 323 | def forward(self, src_batch, src_mask, trg_batch, trg_mask): 324 | ''' 325 | only for training 326 | ''' 327 | src_mask = (src_mask == 0).transpose(0, 1) 328 | trg_mask = (trg_mask == 0).transpose(0, 1) 329 | # trg_seq_len, batch_size = trg_batch.size() 330 | enc_hs = self.encode(src_batch, src_mask) 331 | # output: [trg_seq_len, batch_size, vocab_siz] 332 | output = self.decode(enc_hs, src_mask, trg_batch, trg_mask) 333 | return output 334 | 335 | def count_nb_params(self): 336 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 337 | params = sum([np.prod(p.size()) for p in model_parameters]) 338 | return params 339 | 340 | def loss(self, predict, target): 341 | ''' 342 | compute loss 343 | ''' 344 | predict = predict.view(-1, self.trg_vocab_size) 345 | # nll_loss = F.nll_loss(predict, target.view(-1), ignore_index=PAD_IDX) 346 | target = target.contiguous().view(-1, 1) 347 | non_pad_mask = target.ne(PAD_IDX) 348 | nll_loss = -predict.gather(dim=-1, index=target)[non_pad_mask].mean() 349 | smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad_mask].mean() 350 | smooth_loss = smooth_loss / self.trg_vocab_size 351 | loss = (1. - 352 | self.label_smooth) * nll_loss + self.label_smooth * smooth_loss 353 | return loss 354 | 355 | def get_loss(self, data): 356 | src, src_mask, trg, trg_mask = data 357 | out = self.forward(src, src_mask, trg, trg_mask) 358 | loss = self.loss(out[:-1], trg[1:]) 359 | return loss 360 | 361 | def generate_square_subsequent_mask(self, sz): 362 | r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf'). 363 | Unmasked positions are filled with float(0.0). 
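        The mask is passed to nn.MultiheadAttention as a float (additive) attn_mask:
        it is added to the attention scores before the softmax, so the float('-inf')
        entries block attention to future positions.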
364 | """ 365 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 366 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill( 367 | mask == 1, float(0.0)) 368 | return mask.to(DEVICE) 369 | 370 | 371 | class TagTransformer(Transformer): 372 | def __init__(self, *, nb_attr, **kwargs): 373 | super().__init__(**kwargs) 374 | self.nb_attr = nb_attr 375 | # 0 -> special token & tags, 1 -> character 376 | self.special_embeddings = Embedding(2, self.embed_dim) 377 | 378 | def embed(self, src_batch, src_mask): 379 | word_embed = self.embed_scale * self.src_embed(src_batch) 380 | char_mask = (src_batch < (self.src_vocab_size - self.nb_attr)).long() 381 | special_embed = self.embed_scale * self.special_embeddings(char_mask) 382 | pos_embed = self.position_embed(src_batch * char_mask) 383 | embed = self.dropout(word_embed + pos_embed + special_embed) 384 | return embed 385 | 386 | 387 | class UniversalTransformerEncoder(nn.Module): 388 | def __init__(self, encoder_layer, num_layers, norm=None): 389 | super(UniversalTransformerEncoder, self).__init__() 390 | self.encoder_layer = encoder_layer 391 | self.num_layers = num_layers 392 | self.norm = norm 393 | 394 | def forward(self, src, mask=None, src_key_padding_mask=None): 395 | output = src 396 | 397 | for i in range(self.num_layers): 398 | output = self.encoder_layer( 399 | output, 400 | src_mask=mask, 401 | src_key_padding_mask=src_key_padding_mask) 402 | 403 | if self.norm: 404 | output = self.norm(output) 405 | 406 | return output 407 | 408 | 409 | class UniversalTransformerDecoder(nn.Module): 410 | def __init__(self, decoder_layer, num_layers, norm=None): 411 | super(UniversalTransformerDecoder, self).__init__() 412 | self.decoder_layer = decoder_layer 413 | self.num_layers = num_layers 414 | self.norm = norm 415 | 416 | def forward(self, 417 | tgt, 418 | memory, 419 | tgt_mask=None, 420 | memory_mask=None, 421 | tgt_key_padding_mask=None, 422 | memory_key_padding_mask=None): 423 | output = tgt 424 | 425 | for i in range(self.num_layers): 426 | output = self.decoder_layer( 427 | output, 428 | memory, 429 | tgt_mask=tgt_mask, 430 | memory_mask=memory_mask, 431 | tgt_key_padding_mask=tgt_key_padding_mask, 432 | memory_key_padding_mask=memory_key_padding_mask) 433 | 434 | if self.norm: 435 | output = self.norm(output) 436 | 437 | return output 438 | 439 | 440 | class UniversalTransformer(Transformer): 441 | def __init__(self, **kwargs): 442 | super().__init__(**kwargs) 443 | encoder_layer = TransformerEncoderLayer( 444 | d_model=self.embed_dim, 445 | nhead=self.nb_heads, 446 | dim_feedforward=self.src_hid_size, 447 | dropout=self.dropout_p, 448 | attention_dropout=self.dropout_p, 449 | activation_dropout=self.dropout_p, 450 | normalize_before=True) 451 | self.encoder = UniversalTransformerEncoder( 452 | encoder_layer, 453 | num_layers=self.src_nb_layers, 454 | norm=nn.LayerNorm(self.embed_dim)) 455 | decoder_layer = TransformerDecoderLayer( 456 | d_model=self.embed_dim, 457 | nhead=self.nb_heads, 458 | dim_feedforward=self.trg_hid_size, 459 | dropout=self.dropout_p, 460 | attention_dropout=self.dropout_p, 461 | activation_dropout=self.dropout_p, 462 | normalize_before=True) 463 | self.decoder = UniversalTransformerDecoder( 464 | decoder_layer, 465 | num_layers=self.trg_nb_layers, 466 | norm=nn.LayerNorm(self.embed_dim)) 467 | 468 | 469 | class TagUniversalTransformer(TagTransformer, UniversalTransformer): 470 | pass 471 | 472 | 473 | def Embedding(num_embeddings, embedding_dim, padding_idx=None): 474 | m = 
nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 475 | nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) 476 | if padding_idx is not None: 477 | nn.init.constant_(m.weight[padding_idx], 0) 478 | return m 479 | 480 | 481 | def Linear(in_features, out_features, bias=True): 482 | m = nn.Linear(in_features, out_features, bias) 483 | nn.init.xavier_uniform_(m.weight) 484 | if bias: 485 | nn.init.constant_(m.bias, 0.) 486 | return m 487 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import sys 5 | import time 6 | from datetime import timedelta 7 | 8 | import torch 9 | from torch.optim.lr_scheduler import LambdaLR 10 | 11 | import pointer_generator 12 | import transformer 13 | 14 | 15 | class WarmupInverseSquareRootSchedule(LambdaLR): 16 | """ Linear warmup and then inverse square root decay. 17 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 18 | Inverse square root decreases learning rate from 1. to 0. over remaining steps. 19 | """ 20 | def __init__(self, optimizer, warmup_steps, last_epoch=-1): 21 | self.warmup_steps = warmup_steps 22 | self.decay_factor = warmup_steps**0.5 23 | super(WarmupInverseSquareRootSchedule, 24 | self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 25 | 26 | def lr_lambda(self, step): 27 | if step < self.warmup_steps: 28 | return float(step) / float(max(1, self.warmup_steps)) 29 | return self.decay_factor * step**-0.5 30 | 31 | 32 | class LogFormatter(): 33 | def __init__(self): 34 | self.start_time = time.time() 35 | 36 | def format(self, record): 37 | elapsed_seconds = round(record.created - self.start_time) 38 | 39 | prefix = "%s - %s - %s" % (record.levelname, time.strftime('%x %X'), 40 | timedelta(seconds=elapsed_seconds)) 41 | message = record.getMessage() 42 | message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3)) 43 | return "%s - %s" % (prefix, message) if message else '' 44 | 45 | 46 | def get_logger(): 47 | ''' 48 | create logger and output to file and stdout 49 | ''' 50 | log_formatter = LogFormatter() 51 | logger = logging.getLogger() 52 | logger.setLevel(logging.INFO) 53 | 54 | stream = logging.StreamHandler(sys.stdout) 55 | stream.setFormatter(log_formatter) 56 | logger.addHandler(stream) 57 | return logger 58 | 59 | def maybe_mkdir(filename): 60 | ''' 61 | maybe mkdir 62 | ''' 63 | path = os.path.dirname(filename) 64 | if not os.path.isdir(path): 65 | try: 66 | os.makedirs(path) 67 | except FileExistsError: 68 | pass 69 | 70 | def save_checkpoint(model, epoch, optimizer, scheduler, val_accuracy, is_best, checkpoints_dir): 71 | """ 72 | Save checkpoint of model at current epoch, if new best model, saves checkpoint as best. 73 | Saves state of model, epoch, optimizer and scheduler. 
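    The checkpoint is written to "{checkpoints_dir}/model_{epoch}.pth"; when is_best
    is True it is additionally copied to "{checkpoints_dir}/model_best.pth".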
74 |     """
75 |     # Create the checkpoints directory if it does not exist
76 |     if not os.path.exists(checkpoints_dir):
77 |         os.makedirs(checkpoints_dir)
78 |     checkpoint = {
79 |         'epoch': epoch,
80 |         'val_accuracy': val_accuracy,
81 |         'state_dict': model.state_dict(),
82 |         'optimizer': optimizer.state_dict(),
83 |         'scheduler': scheduler.state_dict()
84 |     }
85 |     model_path = f"{checkpoints_dir}/model_{epoch}.pth"
86 |     best_model_path = f'{checkpoints_dir}/model_best.pth'
87 |     torch.save(checkpoint, model_path)
88 |     if is_best:
89 |         shutil.copyfile(model_path, best_model_path)
90 | 
91 | def load_checkpoint(model, optimizer, scheduler, checkpoint_path, logger):
92 |     """
93 |     Load checkpoint of model from checkpoint path. Used for training.
94 |     Loads state of model, epoch, optimizer and scheduler.
95 |     """
96 |     if not os.path.exists(checkpoint_path):
97 |         logger.info(f"Trying to resume training but file {checkpoint_path} does not exist,\n"
98 |                     f"starting training from scratch")
99 |         return model, optimizer, scheduler, 0, -1.0
100 |     logger.info(f"Resuming training, loading checkpoint from {checkpoint_path}")
101 |     checkpoint = torch.load(checkpoint_path)
102 |     try:
103 |         model.load_state_dict(checkpoint['state_dict'])
104 |     except Exception:
105 |         logger.debug("Model hyperparameters do not match loaded checkpoint")
106 |     optimizer.load_state_dict(checkpoint['optimizer'])
107 |     try:
108 |         scheduler.load_state_dict(checkpoint['scheduler'])
109 |     except Exception:
110 |         logger.debug("Scheduler does not match loaded checkpoint")
111 |     start_epoch = checkpoint['epoch']
112 |     val_accuracy = checkpoint['val_accuracy']
113 |     return model, optimizer, scheduler, start_epoch, val_accuracy
114 | 
115 | def load_model(model, checkpoint_path, logger):
116 |     """
117 |     Load checkpoint of model from checkpoint path. Used for generating prediction files.
118 |     Only loads the model state.
119 |     """
120 |     if not os.path.exists(checkpoint_path):
121 |         logger.info(f"Trying to reload checkpoint from pretraining but file {checkpoint_path} does not exist,\n"
122 |                     f"starting training from scratch")
123 |     else:
124 |         checkpoint = torch.load(checkpoint_path)
125 |         state_dict = model.state_dict()
126 |         state_dict.update(checkpoint['state_dict'])
127 |         model.load_state_dict(state_dict)
128 |     return model
129 | 
130 | def build_model(arch, src_vocab_size, tgt_vocab_size, embedding_dim, fcn_hidden_dim,
131 |                 num_heads, num_layers, dropout, src_to_tgt_vocab_conversion_matrix):
132 |     """
133 |     Builds the model for the requested architecture.
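    Returns a transformer.Transformer when arch == "transformer", otherwise a
    pointer_generator.PointerGeneratorTransformer.

    Example (illustrative values only):
        model = build_model("transformer", src_vocab_size=120, tgt_vocab_size=80,
                            embedding_dim=64, fcn_hidden_dim=256, num_heads=4,
                            num_layers=2, dropout=0.2,
                            src_to_tgt_vocab_conversion_matrix=None)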
134 |     """
135 |     model = transformer.Transformer(src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size,
136 |                                     embedding_dim=embedding_dim,
137 |                                     fcn_hidden_dim=fcn_hidden_dim, num_heads=num_heads, num_layers=num_layers,
138 |                                     dropout=dropout) \
139 |         if (arch == "transformer") \
140 |         else \
141 |         pointer_generator.PointerGeneratorTransformer(src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size,
142 |                                                       src_to_tgt_vocab_conversion_matrix=src_to_tgt_vocab_conversion_matrix,
143 |                                                       embedding_dim=embedding_dim,
144 |                                                       fcn_hidden_dim=fcn_hidden_dim, num_heads=num_heads,
145 |                                                       num_layers=num_layers,
146 |                                                       dropout=dropout)
147 |     return model
148 | 
149 | 
150 | def clean_checkpoints_dir(checkpoints_dir):
151 |     """
152 |     Remove unnecessary model checkpoints (disk quota limit)
153 |     """
154 |     for filename in sorted(os.listdir(checkpoints_dir)):
155 |         if os.path.isfile(os.path.join(checkpoints_dir, filename)) and ("best" not in filename):
156 |             os.remove(os.path.join(checkpoints_dir, filename))
157 | 
158 | 
159 | 
160 | 
161 | 
162 | 
--------------------------------------------------------------------------------
/vocabulary.py:
--------------------------------------------------------------------------------
1 | import os
2 | import itertools
3 | import collections
4 | import argparse
5 | 
6 | import data
7 | import tokenizer
8 | import utils
9 | 
10 | """Reads a conll file using the functions in data.py (only train files are used to create a vocabulary). Using the
11 | generic tokenizer functions (rather than a Tokenizer object, which itself requires a vocabulary), creates an
12 | input-token vocabulary and a target-token vocabulary in separate files. """
13 | 
14 | 
15 | # Arguments
16 | parser = argparse.ArgumentParser(
17 |     description='Reads a conll file (the dataset) and creates vocabulary files for its inputs and outputs')
18 | parser.add_argument('--src', type=str, default='train',
19 |                     help="Source file of the dataset used to create the vocabulary (must include folder path)")
20 | parser.add_argument('--vocab', type=str, default='vocab',
21 |                     help="Target path of the vocabulary (must include folder path)")
22 | args = parser.parse_args()
23 | 
24 | """ CONSTANTS """
25 | WORD_FLAG = "WORD"
26 | FEATURE_FLAG = "FEATURE"
27 | 
28 | 
29 | def get_tokens_from_list(words_list, flag):
30 |     """ Gets a list of either words or concatenated features, and returns one flat list of all tokens"""
31 |     tokens_list = []
32 |     if flag == WORD_FLAG:
33 |         # Split words into lists of characters
34 |         tokens_list = tokenizer.tokenize_words(words_list)
35 |     else:
36 |         # Split features by separator sign ";"
37 |         tokens_list = tokenizer.tokenize_features(words_list)
38 |     # Flatten the lists of tokens into one list of all tokens
39 |     return list(itertools.chain.from_iterable(tokens_list))
40 | 
41 | 
42 | def write_vocab_to_file(tokens_list, vocab_file_path):
43 |     """
44 |     Counts all tokens in the list and writes them to a file. Makes the directory if it does not exist.
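    Each output line has the form "<token>\t<count>", ordered by descending frequency.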
45 |     """
46 |     utils.maybe_mkdir(vocab_file_path)
47 |     vocab_file = open(vocab_file_path, "w", encoding='utf-8')  # (alternative: encoding="ISO-8859-1")
48 |     # Get counter object to hold counts of tokens
49 |     vocab_counter = collections.Counter(tokens_list)
50 |     # Write vocabulary (counter object) to file in order of frequency
51 |     for vocab, count in vocab_counter.most_common():
52 |         vocab_file.write(f"{vocab}\t{count}\n")
53 | 
54 |     vocab_file.close()
55 | 
56 | 
57 | def create_vocab_files(src_file_path, vocab_file_path):
58 |     """ Reads a morph (conll) file, creates the input-token and target-token vocabularies, and writes them to
59 |     separate files """
60 |     lemmas, targets, features = data.read_train_file(src_file_path)
61 |     # Get token lists for lemmas, targets and features
62 |     lemmas_tokens = get_tokens_from_list(lemmas, WORD_FLAG)
63 |     targets_tokens = get_tokens_from_list(targets, WORD_FLAG)
64 |     features_tokens = get_tokens_from_list(features, FEATURE_FLAG)
65 |     # input tokens = lemma tokens + target tokens + feature tokens
66 |     input_tokens = lemmas_tokens + targets_tokens + features_tokens
67 |     output_tokens = lemmas_tokens + targets_tokens
68 |     # Write vocabularies of inputs and outputs to files
69 |     write_vocab_to_file(input_tokens, vocab_file_path + "-input")
70 |     write_vocab_to_file(output_tokens, vocab_file_path + "-output")
71 | 
72 | 
73 | if __name__ == '__main__':
74 |     # Create vocab files
75 |     create_vocab_files(args.src, args.vocab)
76 | 
--------------------------------------------------------------------------------