├── checkpoints
│   ├── english-low
│   │   └── model_best.pth
│   ├── Transformer
│   │   ├── Low
│   │   │   └── model_best.pth
│   │   └── Medium
│   │       └── model_best.pth
│   └── Pointer Generator
│       ├── Low
│       │   └── model_best.pth
│       └── Medium
│           └── model_best.pth
├── data
│   ├── english-train-mini
│   ├── middle-high-german-covered-test
│   ├── scottish-gaelic-covered-test
│   ├── middle-high-german-test
│   ├── middle-high-german-dev
│   ├── scottish-gaelic-dev
│   ├── scottish-gaelic-test
│   ├── hebrew-train-low
│   ├── english-train-low
│   ├── middle-high-german-train-low
│   ├── scottish-gaelic-train-low
│   ├── irish-train-low
│   ├── french-train-low
│   ├── spanish-train-low
│   ├── italian-train-low
│   ├── middle-french-train-low
│   ├── old-french-train-low
│   ├── arabic-train-low
│   └── middle-high-german-train-medium
├── cover.py
├── evaluate.py
├── model_utils.py
├── README.md
├── run_models.py
├── vocabulary.py
├── data.py
├── hyperparameter_search.py
├── tokenizer.py
├── decoder.py
├── utils.py
├── transformer.py
├── dataset.py
├── generate.py
├── pointer_generator.py
├── train.py
├── transformer_baseline-original.py
└── transformer_baseline.py
/checkpoints/english-low/model_best.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/HEAD/checkpoints/english-low/model_best.pth
--------------------------------------------------------------------------------
/checkpoints/Transformer/Low/model_best.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/HEAD/checkpoints/Transformer/Low/model_best.pth
--------------------------------------------------------------------------------
/data/english-train-mini:
--------------------------------------------------------------------------------
1 | dreep dreep V;NFIN
2 | charcoal charcoal V;NFIN
3 | stodge stodges V;3;SG;PRS
4 | biotransform biotransform V;NFIN
5 | disallow disallowing V;V.PTCP;PRS
--------------------------------------------------------------------------------
/checkpoints/Transformer/Medium/model_best.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/HEAD/checkpoints/Transformer/Medium/model_best.pth
--------------------------------------------------------------------------------
/checkpoints/Pointer Generator/Low/model_best.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/HEAD/checkpoints/Pointer Generator/Low/model_best.pth
--------------------------------------------------------------------------------
/checkpoints/Pointer Generator/Medium/model_best.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AssafSinger94/pointer-generator-transformer-inflection/HEAD/checkpoints/Pointer Generator/Medium/model_best.pth
--------------------------------------------------------------------------------
/cover.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import data
5 |
6 | parser = argparse.ArgumentParser(description='Reads conll dev file, and covers the target (test file format)')
7 | parser.add_argument('--data-dir', type=str, default='train',
8 | help="Folder of the dataset file")
9 | parser.add_argument('--lang', type=str, default='train',
10 | help="language of the dataset file")
11 | args = parser.parse_args()
12 |
13 |
14 | def cover_file(data_dir, language):
15 | dev_file_path = os.path.join(data_dir, f"{language}.dev")
16 | covered_dev_file_path = os.path.join(data_dir, f"{language}.covered_dev")
17 | covered_dev_file = open(covered_dev_file_path, "w", encoding='utf-8') # "ISO-8859-1")
18 | dev_morph_list = data.read_morph_file(dev_file_path)
19 | for lemma, target, feature in dev_morph_list:
20 | covered_dev_file.write(f"{lemma}\t{feature}\n")
21 | covered_dev_file.close()
22 |
23 |
24 | if __name__ == '__main__':
25 | # Cover the dev file (strip the target column)
26 | cover_file(args.data_dir, args.lang)
--------------------------------------------------------------------------------
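A minimal usage sketch for cover.py (assuming a dev file named data/english.dev in the same lemma-tab-target-tab-features format as the data files below; the file name is illustrative):

    python cover.py --data-dir data --lang english

This writes data/english.covered_dev containing only the lemma and feature columns, i.e. the covered (test-style) format.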
/data/middle-high-german-covered-test:
--------------------------------------------------------------------------------
1 | trinken V;IND;PST;1;SG
2 | vinden V;IND;PRS;3;SG
3 | ziehen V;SBJV;PRS;1;PL
4 | enpfinden V.PTCP;PRS
5 | werden V;SBJV;PRS;3;SG
6 | wahsen V;SBJV;PRS;2;PL
7 | grave N;NOM;SG
8 | triegen V;SBJV;PST;1;SG
9 | binden V;IND;PST;1;PL
10 | werfen V;IND;PST;2;PL
11 | helfen V;SBJV;PRS;2;SG
12 | beginnen V;IND;PRS;1;PL
13 | slahen V;SBJV;PRS;2;SG
14 | engel N;GEN;PL
15 | swimmen V;IMP;2;PL
16 | triegen V;IMP;2;SG
17 | ziehen V;IND;PST;3;SG
18 | burcgrave N;ACC;SG
19 | biegen V;IND;PRS;1;SG
20 | werden V;SBJV;PST;2;SG
21 | binden V;IND;PST;2;SG
22 | swimmen V;IND;PST;2;SG
23 | göugrave N;GEN;PL
24 | kiesen V;IND;PST;1;SG
25 | singen V;SBJV;PST;1;SG
26 | vinden V;IMP;2;PL
27 | vinden V;NFIN
28 | werfen V;IND;PST;3;PL
29 | werden V;IND;PRS;2;SG
30 | swimmen V;SBJV;PST;2;PL
31 | vinden V.PTCP;PST
32 | trinken V;SBJV;PRS;1;SG
33 | enpfinden V;IMP;2;SG
34 | werden V;IMP;1;PL
35 | biegen V;SBJV;PST;3;SG
36 | kiesen V;SBJV;PRS;3;SG
37 | ziehen V;SBJV;PST;3;PL
38 | kiesen V;SBJV;PST;1;PL
39 | betriegen V;SBJV;PRS;2;SG
40 | werfen V;SBJV;PST;3;SG
41 | triegen V;SBJV;PST;3;SG
42 | rinnen V;NFIN
43 | wahsen V;SBJV;PRS;3;PL
44 | vinden V;SBJV;PRS;2;PL
45 | kiesen V;SBJV;PRS;2;SG
46 | trinken V;SBJV;PST;3;PL
47 | binden V;IND;PRS;3;SG
48 | trinken V;SBJV;PRS;2;PL
49 | binden V;IND;PST;3;SG
50 | kiesen V;IND;PRS;3;SG
51 |
--------------------------------------------------------------------------------
/data/scottish-gaelic-covered-test:
--------------------------------------------------------------------------------
1 | nòsach ADJ;PL;DAT
2 | drùis-mhiannach ADJ;SG;MASC;VOC
3 | drùidhteach ADJ;SG;FEM;NOM
4 | Albannach ADJ;PL;NOM
5 | pròiseil ADJ;SG;MASC;GEN
6 | bòidheach ADJ;PL;DAT
7 | nòsail ADJ;SG;MASC;DAT
8 | adharcach ADJ;PL;NOM
9 | mòrail ADJ;SG;MASC;NOM
10 | nòsail ADJ;SG;FEM;DAT
11 | gruamach ADJ;PL;VOC
12 | rèidh ADJ;SG;MASC;VOC
13 | sliochdmhor ADJ;PL;VOC
14 | mòrail ADJ;SG;FEM;VOC
15 | pronn ADJ;SG;MASC;GEN
16 | fàsaichte ADJ;PL;VOC
17 | leatromach ADJ;PL;GEN
18 | feòlmhor ADJ;SG;FEM;DAT
19 | dualchasach ADJ;SG;MASC;VOC
20 | gobach ADJ;SG;MASC;VOC
21 | dòth V;COND;ACT
22 | cam ADJ;PL;VOC
23 | brèagha ADJ;SG;MASC;DAT
24 | gruamach ADJ;SG;MASC;NOM
25 | sgòideil ADJ;PL;VOC
26 | Manainneach ADJ;SG;MASC;GEN
27 | dealanach ADJ;PL;GEN
28 | sliochdmhor ADJ;PL;DAT
29 | abair V;PST;PASS
30 | cuagach ADJ;SG;MASC;NOM
31 | ioma-ghlac V;COND;ACT
32 | dualchasach ADJ;PL;GEN
33 | cuagach ADJ;SG;FEM;DAT
34 | eagalach ADJ;SG;FEM;VOC
35 | sliochdmhor ADJ;SG;FEM;NOM
36 | drùis-mhiannach ADJ;PL;VOC
37 | nòsach ADJ;SG;FEM;NOM
38 | sliochdmhor ADJ;SG;FEM;VOC
39 | Spàinneach ADJ;SG;FEM;NOM
40 | seann-nòsach ADJ;SG;FEM;DAT
41 | beir V;FUT;ACT
42 | crùb V;PST;ACT
43 | uaibhreach ADJ;SG;FEM;DAT
44 | glac V;PST;PASS
45 | brònach ADJ;SG;FEM;DAT
46 | grinn ADJ;SG;MASC;VOC
47 | nòsail ADJ;PL;NOM
48 | rìomhach ADJ;SG;FEM;GEN
49 | sgèimheach ADJ;PL;NOM
50 | sìolmhor ADJ;SG;MASC;NOM
51 |
--------------------------------------------------------------------------------
/data/middle-high-german-test:
--------------------------------------------------------------------------------
1 | trinken trank V;IND;PST;1;SG
2 | vinden vindet V;IND;PRS;3;SG
3 | ziehen ziehen V;SBJV;PRS;1;PL
4 | enpfinden enpfindende V.PTCP;PRS
5 | werden wërdet V;SBJV;PRS;3;SG
6 | wahsen wahset V;SBJV;PRS;2;PL
7 | grave grâve N;NOM;SG
8 | triegen trüge V;SBJV;PST;1;SG
9 | binden bunden V;IND;PST;1;PL
10 | werfen wurfet V;IND;PST;2;PL
11 | helfen hëlfest V;SBJV;PRS;2;SG
12 | beginnen beginnen V;IND;PRS;1;PL
13 | slahen slagest V;SBJV;PRS;2;SG
14 | engel engele N;GEN;PL
15 | swimmen swimmet V;IMP;2;PL
16 | triegen triuc V;IMP;2;SG
17 | ziehen zôch V;IND;PST;3;SG
18 | burcgrave burcgrâven N;ACC;SG
19 | biegen biuge V;IND;PRS;1;SG
20 | werden würdest V;SBJV;PST;2;SG
21 | binden bünde V;IND;PST;2;SG
22 | swimmen swümme V;IND;PST;2;SG
23 | göugrave göugrâven N;GEN;PL
24 | kiesen kôs V;IND;PST;1;SG
25 | singen sünge V;SBJV;PST;1;SG
26 | vinden vindet V;IMP;2;PL
27 | vinden vinden V;NFIN
28 | werfen wurfen V;IND;PST;3;PL
29 | werden wirdest V;IND;PRS;2;SG
30 | swimmen swümmet V;SBJV;PST;2;PL
31 | vinden gevunden V.PTCP;PST
32 | trinken trinke V;SBJV;PRS;1;SG
33 | enpfinden enpfind V;IMP;2;SG
34 | werden wërden V;IMP;1;PL
35 | biegen büge V;SBJV;PST;3;SG
36 | kiesen kieset V;SBJV;PRS;3;SG
37 | ziehen zühen V;SBJV;PST;3;PL
38 | kiesen küsen V;SBJV;PST;1;PL
39 | betriegen betriegest V;SBJV;PRS;2;SG
40 | werfen würfe V;SBJV;PST;3;SG
41 | triegen trüge V;SBJV;PST;3;SG
42 | rinnen rinnen V;NFIN
43 | wahsen wahsen V;SBJV;PRS;3;PL
44 | vinden vindet V;SBJV;PRS;2;PL
45 | kiesen kiesest V;SBJV;PRS;2;SG
46 | trinken trünken V;SBJV;PST;3;PL
47 | binden bindet V;IND;PRS;3;SG
48 | trinken trinket V;SBJV;PRS;2;PL
49 | binden band V;IND;PST;3;SG
50 | kiesen kiuset V;IND;PRS;3;SG
51 |
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import data
3 |
4 | # Arguments
5 | import utils
6 |
7 | parser = argparse.ArgumentParser(description='Computes accuracy of model predictions compared to the target file')
8 | parser.add_argument('--pred', type=str, default='data/pred',
9 | help="File with model predictions (must include folder path)")
10 | parser.add_argument('--target', type=str, default='target',
11 | help="File with gold targets (must include folder path)")
12 | args = parser.parse_args()
13 |
14 | # Log all relevant files
15 | logger = utils.get_logger()
16 | logger.info(f"Target file: {args.target}")
17 | logger.info(f"Prediction file: {args.pred}")
18 |
19 |
20 | """ FUNCTIONS """
21 | def accuracy(predictions, targets):
22 | """Return the percentage of exact matches between aligned prediction/target pairs."""
23 | correct_count = 0
24 | for prediction, target in zip(predictions, targets):
25 | if prediction == target:
26 | correct_count += 1
27 | return float(100 * correct_count) / len(predictions)
28 |
29 | def evaluate_predictions(pred_file, target_file):
30 | """Compute accuracy of the prediction file against the target file. Words are NOT cleaned."""
31 | pred_lines = data.read_morph_file(pred_file)
32 | target_lines = data.read_morph_file(target_file)
33 | predictions = [line[1] for line in pred_lines]
34 | truth = [line[1] for line in target_lines]
35 | total_accuracy = accuracy(predictions, truth)
36 | logger.info(f"Test set accuracy: {total_accuracy:.2f}\n")
37 | return total_accuracy
38 |
39 |
40 | if __name__ == '__main__':
41 | # Compute accuracy of predictions compared to the gold targets
42 | evaluate_predictions(args.pred, args.target)
43 |
44 |
--------------------------------------------------------------------------------
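The accuracy helper is a plain exact-match percentage; a self-contained sketch mirroring the function above (toy lists, not repo data):

    preds = ["walked", "runned", "sung"]
    golds = ["walked", "ran", "sung"]
    correct = sum(p == g for p, g in zip(preds, golds))
    print(100.0 * correct / len(preds))  # 66.666..., i.e. two of three exact matches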
/data/middle-high-german-dev:
--------------------------------------------------------------------------------
1 | biegen buget V;IND;PST;2;PL
2 | helfen hülfe V;SBJV;PST;1;SG
3 | verswinden verswindet V;IND;PRS;2;PL
4 | rinnen runnen V;IND;PST;3;PL
5 | werden würde V;SBJV;PST;3;SG
6 | helfen half V;IND;PST;1;SG
7 | binden binden V;IMP;1;PL
8 | slahen sluogen V;IND;PST;3;PL
9 | sieden siedende V.PTCP;PRS
10 | helfen hülfest V;SBJV;PST;2;SG
11 | verswinden verswindet V;SBJV;PRS;3;SG
12 | triegen truget V;IND;PST;2;PL
13 | binden binde V;SBJV;PRS;1;SG
14 | binden bind V;IMP;2;SG
15 | slahen slüege V;SBJV;PST;3;SG
16 | helfen hëlfende V.PTCP;PRS
17 | binden bindet V;IMP;2;PL
18 | trinken trinket V;IND;PRS;3;SG
19 | verswinden verswindest V;IND;PRS;2;SG
20 | singen sing V;IMP;2;SG
21 | vinden vindest V;SBJV;PRS;2;SG
22 | betriegen betriuc V;IMP;2;SG
23 | ziehen ziehent V;IND;PRS;3;PL
24 | werden wurden V;IND;PST;3;PL
25 | enpfinden enpfinden V;NFIN
26 | swimmen geswummen V.PTCP;PST
27 | sieden sôt V;IND;PST;1;SG
28 | slahen slaget V;IND;PRS;2;PL
29 | trinken trunken V;IND;PST;3;PL
30 | bruoder brüeder N;GEN;PL
31 | singen singet V;IND;PRS;2;PL
32 | rinnen rinnen V;IMP;1;PL
33 | betriegen betrüge V;SBJV;PST;3;SG
34 | enpfinden enpfinde V;SBJV;PRS;1;SG
35 | helfen hilfest V;IND;PRS;2;SG
36 | helfen hëlfen V;IMP;1;PL
37 | slahen slüeget V;SBJV;PST;2;PL
38 | trinken trunken V;IND;PST;1;PL
39 | enpfinden enpfinde V;IND;PRS;1;SG
40 | beginnen beginnet V;SBJV;PRS;2;PL
41 | beginnen begunnet V;IND;PST;2;PL
42 | enpfinden enpfinden V;SBJV;PRS;3;PL
43 | sieden sütet V;SBJV;PST;2;PL
44 | werfen wërfest V;SBJV;PRS;2;SG
45 | verswinden verswünden V;SBJV;PST;1;PL
46 | enpfinden enpfunden V;IND;PST;1;PL
47 | trinken trinkest V;SBJV;PRS;2;SG
48 | werden ward V;IND;PST;1;SG
49 | verswinden verswindest V;SBJV;PRS;2;SG
50 | singen singen V;IND;PRS;1;PL
51 |
--------------------------------------------------------------------------------
/data/scottish-gaelic-dev:
--------------------------------------------------------------------------------
1 | Breatannach Breatannach ADJ;PL;NOM
2 | rìomhach rìomhach ADJ;SG;MASC;NOM
3 | stiùir stiùirteadh V;COND;PASS
4 | ruadh ruadha ADJ;PL;GEN
5 | blàth bhlàith ADJ;SG;FEM;VOC
6 | uaibhreach uaibhreach ADJ;SG;FEM;NOM
7 | nuadh nuadh ADJ;SG;MASC;NOM
8 | dealanach dhealanich ADJ;SG;MASC;VOC
9 | maiseach maiseach ADJ;PL;NOM
10 | blàth blàtha ADJ;PL;VOC
11 | rèidh rèidh ADJ;SG;MASC;GEN
12 | fàsaichte fàsaichte ADJ;PL;DAT
13 | fàsaichte fhàsaichte ADJ;SG;MASC;VOC
14 | nuadh nuaidh ADJ;SG;MASC;GEN
15 | leatromach leatromaich ADJ;SG;FEM;VOC
16 | stiùir stiùiridh V;FUT;ACT
17 | mòrail mòraile ADJ;PL;GEN
18 | dòth a' dòthadh V;V.PTCP;PRS
19 | Spàinneach Spàinnich ADJ;SG;FEM;GEN
20 | làidir làidir ADJ;SG;FEM;VOC
21 | rèitich rèitich V;PST;ACT
22 | ruig ruigteadh V;COND;PASS
23 | geal geal ADJ;SG;MASC;NOM
24 | eagalach eagalach ADJ;PL;GEN
25 | stiùir stiùir V;PST;ACT
26 | pronn phruinn ADJ;SG;MASC;VOC
27 | cuagach cuagach ADJ;PL;NOM
28 | seann-nòsach sheann-nòsaich ADJ;SG;MASC;GEN
29 | dòth dhòth V;PST;ACT
30 | dealanach dhealanach ADJ;SG;FEM;NOM
31 | maiseach maiseach ADJ;SG;MASC;NOM
32 | sìolmhor shìolmhor ADJ;SG;MASC;DAT
33 | drùidhteach dhrùidhtich ADJ;SG;MASC;VOC
34 | adharcach adharcaich ADJ;SG;FEM;DAT
35 | crùb crùbaidh V;FUT;ACT
36 | crùbach chrùbaich ADJ;SG;MASC;GEN
37 | Albannach Albannach ADJ;SG;MASC;NOM
38 | seann-nòsach seann-nòsach ADJ;PL;VOC
39 | faigh a' faighinn V;V.PTCP;PRS
40 | caidil chaidileadh V;PST;PASS
41 | blàth bhlàith ADJ;SG;MASC;VOC
42 | Cuimreach Chuimrich ADJ;SG;MASC;VOC
43 | Spàinneach Spàinneach ADJ;SG;MASC;NOM
44 | nòsail nòsail ADJ;SG;MASC;NOM
45 | nòsach nòsach ADJ;PL;GEN
46 | fàsaichte fhàsaichte ADJ;SG;FEM;GEN
47 | feòlmhor feòlmhor ADJ;PL;DAT
48 | Breatannach Bhreatannaich ADJ;SG;FEM;GEN
49 | tòisich thòisich V;PST;ACT
50 | Cuimreach Chuimreach ADJ;SG;FEM;NOM
51 |
--------------------------------------------------------------------------------
/data/scottish-gaelic-test:
--------------------------------------------------------------------------------
1 | nòsach nòsach ADJ;PL;DAT
2 | drùis-mhiannach dhrùis-mhiannaich ADJ;SG;MASC;VOC
3 | drùidhteach dhrùidhteach ADJ;SG;FEM;NOM
4 | Albannach Albannach ADJ;PL;NOM
5 | pròiseil phròiseil ADJ;SG;MASC;GEN
6 | bòidheach bòidheach ADJ;PL;DAT
7 | nòsail nòsail ADJ;SG;MASC;DAT
8 | adharcach adharcach ADJ;PL;NOM
9 | mòrail mòrail ADJ;SG;MASC;NOM
10 | nòsail nòsail ADJ;SG;FEM;DAT
11 | gruamach gruamach ADJ;PL;VOC
12 | rèidh rèidh ADJ;SG;MASC;VOC
13 | sliochdmhor sliochdmhor ADJ;PL;VOC
14 | mòrail mhòrail ADJ;SG;FEM;VOC
15 | pronn phruinn ADJ;SG;MASC;GEN
16 | fàsaichte fàsaichte ADJ;PL;VOC
17 | leatromach learomach ADJ;PL;GEN
18 | feòlmhor fheòlmhoir ADJ;SG;FEM;DAT
19 | dualchasach dhualchasaich ADJ;SG;MASC;VOC
20 | gobach ghobaich ADJ;SG;MASC;VOC
21 | dòth dhòthadh V;COND;ACT
22 | cam cama ADJ;PL;VOC
23 | brèagha bhrèagha ADJ;SG;MASC;DAT
24 | gruamach gruamach ADJ;SG;MASC;NOM
25 | sgòideil sgòideil ADJ;PL;VOC
26 | Manainneach Mhanainnich ADJ;SG;MASC;GEN
27 | dealanach dealanach ADJ;PL;GEN
28 | sliochdmhor sliochdmhor ADJ;PL;DAT
29 | abair thuirteadh V;PST;PASS
30 | cuagach cuagach ADJ;SG;MASC;NOM
31 | ioma-ghlac dh'ioma-ghlacadh V;COND;ACT
32 | dualchasach dualchasach ADJ;PL;GEN
33 | cuagach chuagaich ADJ;SG;FEM;DAT
34 | eagalach eagalaich ADJ;SG;FEM;VOC
35 | sliochdmhor shliochdmhor ADJ;SG;FEM;NOM
36 | drùis-mhiannach drùis-mhiannach ADJ;PL;VOC
37 | nòsach nòsach ADJ;SG;FEM;NOM
38 | sliochdmhor shliochdmhoire ADJ;SG;FEM;VOC
39 | Spàinneach Spàinneach ADJ;SG;FEM;NOM
40 | seann-nòsach sheann-nòsaich ADJ;SG;FEM;DAT
41 | beir beiridh V;FUT;ACT
42 | crùb chrùb V;PST;ACT
43 | uaibhreach uaibhrich ADJ;SG;FEM;DAT
44 | glac ghlacadh V;PST;PASS
45 | brònach bhrònaich ADJ;SG;FEM;DAT
46 | grinn ghrinn ADJ;SG;MASC;VOC
47 | nòsail nòsaile ADJ;PL;NOM
48 | rìomhach rìomhaich ADJ;SG;FEM;GEN
49 | sgèimheach sgèimheach ADJ;PL;NOM
50 | sìolmhor sìolmhor ADJ;SG;MASC;NOM
51 |
--------------------------------------------------------------------------------
/model_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 |
5 | """ MASKS UTILS """
6 | def _generate_subsequent_mask(src_sz, tgt_sz):
7 | mask = (torch.triu(torch.ones(src_sz, tgt_sz)) == 1).transpose(0, 1)
8 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
9 | return mask
10 |
11 |
12 | def _generate_square_subsequent_mask(sz):
13 | return _generate_subsequent_mask(sz, sz)
14 |
15 |
16 | """ EMBEDDING UTILS """
17 | def Embedding(num_embeddings, embedding_dim, padding_idx):
18 | """ Generates embeddings for tokens in vocabulary
19 | Weights initialized with mean=0 and std=embedding_dim ** -0.5, i.e. 1/sqrt(embedding_dim)"""
20 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
21 | nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
22 | nn.init.constant_(m.weight[padding_idx], 0)
23 | return m
24 |
25 |
26 | """ POSITIONAL ENCODING UTILS """
27 | class PositionalEncoding(nn.Module):
28 | """ Adds positional encoding to sequences """
29 | def __init__(self, embedding_dim, dropout=0.1, max_seq_len=100):
30 | """ Initializes a seq_len x 1 x embedding_dim positional encoding matrix"""
31 | super(PositionalEncoding, self).__init__()
32 | self.dropout = nn.Dropout(p=dropout)
33 |
34 | pe = torch.zeros(max_seq_len, embedding_dim)
35 | position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
36 | div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
37 | pe[:, 0::2] = torch.sin(position * div_term)
38 | pe[:, 1::2] = torch.cos(position * div_term)
39 | pe = pe.unsqueeze(0).transpose(0, 1)
40 | self.register_buffer('pe', pe)
41 |
42 | def forward(self, x):
43 | """ Adds positional encoding to the input.
44 | Input of dimensions (seq_len x batch_sz x embedding_dim).
45 | Adds positional encoding matrix (seq_len x 1 x embedding_dim) to every individual example in batch """
46 | x = x + self.pe[:x.size(0), :]
47 | return self.dropout(x)
--------------------------------------------------------------------------------
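A minimal sketch of how these utilities behave (assumes the repository root is on PYTHONPATH; shapes and values follow the code above):

    import torch
    from model_utils import _generate_square_subsequent_mask, PositionalEncoding

    # Causal mask for a length-3 sequence: 0.0 on/below the diagonal, -inf above it,
    # so position i may only attend to positions <= i.
    print(_generate_square_subsequent_mask(3))
    # tensor([[0., -inf, -inf],
    #         [0., 0., -inf],
    #         [0., 0., 0.]])

    # PositionalEncoding expects (seq_len, batch, embedding_dim) and preserves the shape.
    pos_enc = PositionalEncoding(embedding_dim=64, dropout=0.0)
    x = torch.zeros(10, 2, 64)   # seq_len=10, batch=2
    print(pos_enc(x).shape)      # torch.Size([10, 2, 64])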
/README.md:
--------------------------------------------------------------------------------
1 | # Pointer-Generator-Transformer-Inflection-2019
2 | Transformer model and pointer-generator transformer for the morphological inflection task
3 |
4 | **MEDIUM RESOURCE TRAINING FILE - ENGLISH EXAMPLE**
5 |
6 | *Data augmentation for dataset* - python augment.py --src "data/english-train-medium" --out "data/english-train-medium-aug"
7 |
8 | *Create vocabulary for dataset* - python vocabulary.py --src "data/english-train-medium" --vocab "data/english-train-medium-vocab"
9 |
10 | *Train model* - python train.py \
11 | --train "data/english-train-medium" --dev "data/english-dev" --vocab "data/english-train-medium-vocab" --checkpoints-dir "checkpoints" \
12 | --batch-size 128 --epochs 200 --eval-every 1 --resume True \
13 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
14 | --lr 0.001 --beta2 0.98 \
15 | --scheduler warmupinvsqr --patience 10 --min-lr 1e-5 --warmup-steps 4000
16 |
17 | *Generate Predictions with model* - python generate.py \
18 | --model-checkpoint "checkpoints/model_best.pth" \
19 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
20 | --test "data/english-covered-test" \
21 | --vocab "data/english-train-medium-vocab" \
22 | --pred "data/english-test-pred-medium"
23 |
24 | *Compute accuracy of test set predictions* - python evaluate.py \
25 | --pred "data/english-test-pred-medium" --target "data/english-test"
26 |
27 |
28 | **LOW RESOURCE TRAINING FILE - ENGLISH EXAMPLE**
29 |
30 | *Data augmentation for dataset* - python augment.py --src "data/english-train-low" --out "data/english-train-low-aug"
31 |
32 | *Create vocabulary for dataset* - python vocabulary.py --src "data/english-train-low" --vocab "data/english-train-low-vocab"
33 |
34 | *Train model* - python train.py \
35 | --train "data/english-train-low" --dev "data/english-dev" --vocab "data/english-train-low-vocab" --checkpoints-dir "checkpoints" \
36 | --batch-size 128 --epochs 200 --eval-every 1 --resume True \
37 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
38 | --lr 0.001 --beta2 0.98 \
39 | --scheduler warmupinvsqr --patience 10 --min-lr 1e-5 --warmup-steps 4000
40 |
41 | *Generate Predictions with model* - python generate.py \
42 | --model-checkpoint "checkpoints/model_best.pth" \
43 | --arch transformer --embed-dim 64 --fcn-dim 256 --num-heads 4 --num-layers 2 --dropout 0.2 \
44 | --test "data/english-covered-test" \
45 | --vocab "data/english-train-low-vocab" \
46 | --pred "data/english-test-pred-low"
47 |
48 | *Compute accuracy of test set predictions* - python evaluate.py \
49 | --pred "data/english-test-pred-low" --target "data/english-test"
50 |
--------------------------------------------------------------------------------
/run_models.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | TRAIN = "train"
4 | DEV = "dev"
5 | # TEST = "test"
6 | TEST = "dev"  # For now, use the dev set as the test set
7 | LANGUAGES = ["english", "french", "irish", "italian", "spanish"]
8 | RESOURCES = ["low"]  # "medium"]
9 | MODEL_TYPE = ["transformer"]  # , "pointer_generator"]
10 | EPOCHS_PER_RESOURCE = {"low": 1100, "medium": 400}
11 | BATCH_SIZE_PER_RESOURCE = {"low": 64, "medium": 128}
12 | EVAL_EVERY = 25
13 | for model in MODEL_TYPE:
14 | for resource in RESOURCES:
15 | for language in LANGUAGES:
16 | print(f"{resource} - {language}")
17 | # Get epoch and batch size
18 | epochs = EPOCHS_PER_RESOURCE[resource]
19 | batch_size = BATCH_SIZE_PER_RESOURCE[resource]
20 | # Set names of relevant files and directories
21 | train_file = f"{language}-{TRAIN}-{resource}"
22 | valid_file = f"{language}-{DEV}"
23 | test_file = f"{language}-{TEST}"
24 | covered_test_file = f"{language}-covered-{TEST}"
25 | pred_file = f"{language}-{resource}-{TEST}-pred"
26 | vocab_file = f"{train_file}-vocab"
27 |
28 | data_folder = "data"
29 | vocab_folder = f"vocab/{language}/{resource}"
30 | checkpoints_folder = f"model-checkpoints-test/{model}/{language}/{resource}"
31 | pred_folder = f"predictions-test/{model}"
32 | logs_folder = "logs-test"
33 |
34 | # create necessary folders, if they do not exist already
35 | if not os.path.exists(vocab_folder):
36 | os.makedirs(vocab_folder)
37 | if not os.path.exists(checkpoints_folder):
38 | os.makedirs(checkpoints_folder)
39 | if not os.path.exists(pred_folder):
40 | os.makedirs(pred_folder)
41 | if not os.path.exists(logs_folder):
42 | os.makedirs(logs_folder)
43 |
44 | # Create vocabulary
45 | # print(f"python vocabulary.py --src {data_folder}/{train_file} --vocab {data_folder}/{vocab_file}")
46 | # Train model
47 | os.system(f"python train.py --arch {model} --epochs {epochs} --batch-size {batch_size} --eval-every {EVAL_EVERY} " +
48 | f"--train {data_folder}/{train_file} --dev {data_folder}/{valid_file} " +
49 | f"--vocab {vocab_folder}/{vocab_file} --checkpoints-folder {checkpoints_folder}" +
50 | f" >> {logs_folder}/train-log-{model}-{resource}-{language}.out")
51 | # Generate predictions for test set
52 | os.system(f"python generate.py --model-checkpoint {checkpoints_folder}/model_best.pth " +
53 | f"--test {data_folder}/{covered_test_file} --vocab {vocab_folder}/{vocab_file} " +
54 | f"--pred {pred_folder}/{pred_file}")
55 | # Evaluate accuracy of prediction file compared to true test set
56 | os.system(f"python evaluate.py --pred {pred_folder}/{pred_file} " +
57 | f"--target {data_folder}/{test_file}")
--------------------------------------------------------------------------------
/data/hebrew-train-low:
--------------------------------------------------------------------------------
1 | התנשק תתנשׁקו V;2;PL;FUT:MASC
2 | הגיב הגבנו V;1;PL;PST
3 | דרש דרושנה V;2;PL;IMP;FEM
4 | חינך תחנך V;3;SG;FUT;FEM
5 | תיאר תיארתן V;2;PL;PST;FEM
6 | גליל גלילי N;SG;N;SGDEF;PSS1S
7 | התנקש התנקשׁתן V;2;PL;PST;FEM
8 | כיסא כיסאו N;SG;PSSD;PSS2S;MASC
9 | הוסף הוספתם V;2;PL;PST;MASC
10 | התקבל מתקבל V;SG;PRS;MASC
11 | התנפל התנפלתן V;2;PL;PST;FEM
12 | נשמה נשׁמותיו N;PL;PSSD;PSS2S;MASC
13 | טריבונה טריבונה N;SG;NDEF
14 | טופל תטופל V;2;SG;FUT;MASC
15 | תיעב תיעבת V;2;SG;PST;FEM
16 | מולדת מולדתך N;SG;DEF;PSS2S;FEM
17 | שרק שׁרקת V;2;SG;PST;FEM
18 | עולם עולמיהן N;PL;PSSD;PSS2P;FEM
19 | נאהב יאהבו V;3;PL;FUT;MASC
20 | סגר סגרנו V;1;PL;PST
21 | הצלחה הצלחותיכן N;PL;DEF;PSS2P;FEM
22 | שחזור שחזוריי N;PL;N;PLDEF;PSS1S
23 | מרד ימרדו V;3;PL;FUT;MASC
24 | קם קמתן V;2;PL;PST;FEM
25 | פגע פגעת V;2;SG;PST;MASC
26 | סר לסור V;NFIN
27 | היה אהיה V;1;SG;FUT
28 | יכול יוכל V;3;SG;FUT;MASC
29 | פרובינציה פרובינצית N;SG;PSSD
30 | רומם תרומם V;3;SG;FUT;FEM
31 | נאכף נאכפת V;2;SG;PST;MASC
32 | הביא תביאנה V;3;PL;FUT;FEM
33 | מגבת מגבת N;SG;NDEF
34 | הגיע יגיע V;3;SG;FUT;MASC
35 | חנוכייה חנוכיותיה N;PL;PSSD;PSS2S;FEM
36 | סרט סרטכן N;SG;DEF;PSS2P;FEM
37 | חזק חזקה V;3;SG;PST;FEM
38 | ליקק ליקקה V;3;SG;PST;FEM
39 | התפוצץ התפוצצתם V;2;PL;PST;MASC
40 | סדרה סדרתכן N;SG;DEF;PSS2P;FEM
41 | הושיב הושׁבתן V;2;PL;PST;FEM
42 | חזק תחזקו V;2;PL;FUT:MASC
43 | הכשיר הכשׁרנה V;2;PL;IMP;FEM
44 | אלוה אלוהו N;SG;PSSD;PSS2S;MASC
45 | מצב מצביו N;PL;PSSD;PSS2S;MASC
46 | אץ אצה V;3;SG;PST;FEM
47 | קטון תקטנו V;2;PL;FUT:MASC
48 | חדר חדרת V;2;SG;PST;FEM
49 | הוסיף להוסיף V;NFIN
50 | התנדב להתנדב V;NFIN
51 | גילגל יגלגלו V;3;PL;FUT;MASC
52 | עורר תעוררנה V;3;PL;FUT;FEM
53 | גילגל מגלגלת V;SG;PRS;FEM
54 | הרכיב תרכבנה V;3;PL;FUT;FEM
55 | נמס נימס V;1;PL;FUT
56 | האמין האמנו V;1;PL;PST
57 | מסירה מסירותינו N;PL;N;PLDEF;PSS1P
58 | ברח בורחים V;PL;PRS;MASC
59 | כונס כונסת V;2;SG;PST;FEM
60 | לימד ללמד V;NFIN
61 | משיכה משׁיכתן N;SG;PSSD;PSS2P;FEM
62 | התחבא התחבאו V;2;PL;IMP;MASC
63 | תיאר תיארתם V;2;PL;PST;MASC
64 | הוריד הורדנה V;2;PL;IMP;FEM
65 | לקה לקו V;2;PL;IMP;MASC
66 | תיבל תבל V;2;SG;IMP;MASC
67 | מצץ מצצת V;2;SG;PST;FEM
68 | יונה יוניכן N;PL;DEF;PSS2P;FEM
69 | התעורר תתעוררנה V;2;PL;FUT;FEM
70 | כובה אכובה V;1;SG;FUT
71 | הקיף הקיפו V;2;PL;IMP;MASC
72 | נשם נשׁמתן V;2;PL;PST;FEM
73 | זקף זקף V;3;SG;PST;MASC
74 | שריפה שׂרפותיהן N;PL;PSSD;PSS2P;FEM
75 | קטף קטופנה V;2;PL;IMP;FEM
76 | ביקר ביקרתם V;2;PL;PST;MASC
77 | התקנא התקנאנו V;1;PL;PST
78 | הזדמנות הזדמנויותיהם N;PL;PSSD;PSS2P;MASC
79 | השמיד השׁמדת V;2;SG;PST;MASC
80 | זרח זרח V;3;SG;PST;MASC
81 | תרם תרמתי V;1;SG;PST
82 | רשם תרשום V;2;SG;FUT;MASC
83 | גלה גלית V;2;SG;PST;MASC
84 | ורד ורדיך N;PL;DEF;PSS2S;MASC
85 | ירה תירינה V;2;PL;FUT;FEM
86 | פתח תפתחו V;2;PL;FUT:MASC
87 | עישן עשׁנו V;2;PL;IMP;MASC
88 | כתובית כתוביותיהם N;PL;PSSD;PSS2P;MASC
89 | שחיטה שׁחיטתה N;SG;PSSD;PSS2S;FEM
90 | רבה תרבה V;3;SG;FUT;FEM
91 | בטן בטניכן N;PL;DEF;PSS2P;FEM
92 | מכר מוכרים V;PL;PRS;MASC
93 | שמלה שׂמלתי N;SG;N;SGDEF;PSS1S
94 | מלוכה מלוכתנו N;SG;N;SGDEF;PSS1P
95 | שחיטה השׁחיטות N;PL;DEF
96 | שכן שׁכנו V;2;PL;IMP;MASC
97 | שיקר שיקרתי V;1;SG;PST
98 | נאהד נאהדנו V;1;PL;PST
99 | הגיע הגעת V;2;SG;PST;FEM
100 | מת תמותו V;2;PL;FUT:MASC
101 |
--------------------------------------------------------------------------------
/vocabulary.py:
--------------------------------------------------------------------------------
1 | import os
2 | import itertools
3 | import collections
4 | import argparse
5 |
6 | import data
7 | import tokenizer
8 | import utils
9 |
10 | """Reads a conll file using the functions in data (only train files are used to create a vocabulary). Using the
11 | generic tokenizer functions (rather than a Tokenizer object, which requires a vocabulary), creates the input-token
12 | vocabulary and the target-token vocabulary in separate files. """
13 |
14 |
15 | # Arguments
16 | parser = argparse.ArgumentParser(
17 | description='Reads conll file (which is the dataset), and creates vocabulary files for input and output')
18 | parser.add_argument('--src', type=str, default='train',
19 | help="Source file of the dataset used to create the vocabulary (must include folder path)")
20 | parser.add_argument('--vocab', type=str, default='vocab',
21 | help="Target path of the vocabulary (must include folder path)")
22 | args = parser.parse_args()
23 |
24 | """ CONSTANTS """
25 | WORD_FLAG = "WORD"
26 | FEATURE_FLAG = "FEATURE"
27 |
28 |
29 | def get_tokens_from_list(words_list, flag):
30 | """ Takes a list of either words or concatenated feature strings, and returns one flat list of all tokens"""
31 | tokens_list = []
32 | if flag == WORD_FLAG:
33 | # Split words to lists of characters
34 | tokens_list = tokenizer.tokenize_words(words_list)
35 | else:
36 | # Split features by separator sign ";"
37 | tokens_list = tokenizer.tokenize_features(words_list)
38 | # Flat lists of tokens into one list of all tokens
39 | return list(itertools.chain.from_iterable(tokens_list))
40 |
41 |
42 | def write_vocab_to_file(tokens_list, vocab_file_path):
43 | """
44 | Counts all tokens in list and writes them to file. Make dir if not exists.
45 | """
46 | utils.maybe_mkdir(vocab_file_path)
47 | vocab_file = open(vocab_file_path, "w", encoding='utf-8') # "ISO-8859-1")
48 | # Get counter object to hold counts of characters
49 | vocab_counter = collections.Counter(tokens_list)
50 | # Write vocabulary (counter object) to file in order of frequency
51 | for vocab, count in vocab_counter.most_common():
52 | vocab_file.write(f"{vocab}\t{count}\n")
53 |
54 | vocab_file.close()
55 |
56 |
57 | def create_vocab_files(src_file_path, vocab_file_path):
58 | """ Reads morph file and creates input tokens vocabulary and target tokens vocabulary, and writes them in
59 | different files """
60 | lemmas, targets, features = data.read_train_file(src_file_path)
61 | # Get tokens lists for source, target lemmas and features
62 | lemmas_tokens = get_tokens_from_list(lemmas, WORD_FLAG)
63 | targets_tokens = get_tokens_from_list(targets, WORD_FLAG)
64 | features_tokens = get_tokens_from_list(features, FEATURE_FLAG)
65 | # input tokens = lemma tokens + target tokens + feature tokens
66 | input_tokens = lemmas_tokens + targets_tokens + features_tokens
67 | output_tokens = lemmas_tokens + targets_tokens
68 | # write vocabularies of inputs and outputs to files
69 | write_vocab_to_file(input_tokens, vocab_file_path + "-input")
70 | write_vocab_to_file(output_tokens, vocab_file_path + "-output")
71 |
72 |
73 | if __name__ == '__main__':
74 | # Create vocab files
75 | create_vocab_files(args.src, args.vocab)
76 |
--------------------------------------------------------------------------------
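A usage sketch against the mini English file above (the vocab output path is illustrative):

    python vocabulary.py --src data/english-train-mini --vocab vocab/english-train-mini-vocab

This writes two files, vocab/english-train-mini-vocab-input (lemma + target + feature tokens) and vocab/english-train-mini-vocab-output (lemma + target tokens), with one token<TAB>count line per entry, ordered by frequency.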
/data/english-train-low:
--------------------------------------------------------------------------------
1 | dreep dreep V;NFIN
2 | charcoal charcoal V;NFIN
3 | stodge stodges V;3;SG;PRS
4 | biotransform biotransform V;NFIN
5 | disallow disallowing V;V.PTCP;PRS
6 | precut precut V;V.PTCP;PST
7 | outmanœuvre outmanœuvred V;PST
8 | unsnib unsnibbing V;V.PTCP;PRS
9 | Afghanize Afghanized V;PST
10 | redescribe redescribes V;3;SG;PRS
11 | overspeculate overspeculates V;3;SG;PRS
12 | reënter reënters V;3;SG;PRS
13 | waller wallering V;V.PTCP;PRS
14 | carboxylate carboxylating V;V.PTCP;PRS
15 | imprison imprisoned V;PST
16 | helicopt helicopted V;PST
17 | tut tutted V;V.PTCP;PST
18 | misdoom misdooms V;3;SG;PRS
19 | mush mush V;NFIN
20 | billhook billhook V;NFIN
21 | ingrave ingraved V;PST
22 | estheticize estheticize V;NFIN
23 | off-split off-split V;PST
24 | excecate excecating V;V.PTCP;PRS
25 | hegemonise hegemonised V;V.PTCP;PST
26 | overregularize overregularized V;PST
27 | innoculate innoculates V;3;SG;PRS
28 | mopy mopying V;V.PTCP;PRS
29 | unhyphenate unhyphenated V;PST
30 | acetize acetizes V;3;SG;PRS
31 | outjuggle outjuggled V;V.PTCP;PST
32 | reexplain reexplain V;NFIN
33 | homestead homestead V;NFIN
34 | pregame pregamed V;V.PTCP;PST
35 | carabine carabined V;PST
36 | deauthorise deauthorising V;V.PTCP;PRS
37 | sqush squshed V;V.PTCP;PST
38 | gang-rape gang-rapes V;3;SG;PRS
39 | transfect transfect V;NFIN
40 | bemar bemarred V;V.PTCP;PST
41 | intrunk intrunk V;NFIN
42 | atray atrayed V;PST
43 | conculcate conculcated V;PST
44 | jettison jettisoning V;V.PTCP;PRS
45 | equidistribute equidistributes V;3;SG;PRS
46 | postdict postdicted V;V.PTCP;PST
47 | fine fined V;PST
48 | autohide autohides V;3;SG;PRS
49 | nourish nourished V;V.PTCP;PST
50 | jack jacks V;3;SG;PRS
51 | superrefine superrefined V;V.PTCP;PST
52 | begrasp begrasping V;V.PTCP;PRS
53 | autostart autostarted V;V.PTCP;PST
54 | rearrive rearrived V;PST
55 | motorcycle motorcycled V;V.PTCP;PST
56 | countersink countersank V;PST
57 | confessionalise confessionalise V;NFIN
58 | underfang underfang V;NFIN
59 | yabby yabbying V;V.PTCP;PRS
60 | betrail betrails V;3;SG;PRS
61 | whish whish V;NFIN
62 | deconceptualise deconceptualising V;V.PTCP;PRS
63 | bedrink bedrank V;PST
64 | effervesce effervesces V;3;SG;PRS
65 | failcascade failcascading V;V.PTCP;PRS
66 | wok woks V;3;SG;PRS
67 | quahog quahog V;NFIN
68 | jarp jarped V;V.PTCP;PST
69 | plum plumming V;V.PTCP;PRS
70 | reclude recluding V;V.PTCP;PRS
71 | handplant handplanted V;V.PTCP;PST
72 | overvolt overvolting V;V.PTCP;PRS
73 | quicksave quicksaving V;V.PTCP;PRS
74 | sandbathe sandbathes V;3;SG;PRS
75 | roister roistering V;V.PTCP;PRS
76 | mull mulls V;3;SG;PRS
77 | upclock upclocking V;V.PTCP;PRS
78 | subgrant subgrant V;NFIN
79 | insense insenses V;3;SG;PRS
80 | embrown embrowned V;PST
81 | sour soured V;V.PTCP;PST
82 | mesmerize mesmerize V;NFIN
83 | spill spills V;3;SG;PRS
84 | misevaluate misevaluated V;V.PTCP;PST
85 | sepose seposed V;V.PTCP;PST
86 | deem deemed V;V.PTCP;PST
87 | complex complex V;NFIN
88 | rehumanize rehumanizing V;V.PTCP;PRS
89 | unbrutalize unbrutalizes V;3;SG;PRS
90 | inform informs V;3;SG;PRS
91 | drool drooling V;V.PTCP;PRS
92 | radicate radicates V;3;SG;PRS
93 | gap gap V;NFIN
94 | addend addending V;V.PTCP;PRS
95 | derivate derivate V;NFIN
96 | misdevelop misdevelop V;NFIN
97 | drivel drivel V;NFIN
98 | prolapse prolapsed V;V.PTCP;PST
99 | instaure instaures V;3;SG;PRS
100 | fute futing V;V.PTCP;PRS
101 |
--------------------------------------------------------------------------------
/data/middle-high-german-train-low:
--------------------------------------------------------------------------------
1 | werden wurdet V;IND;PST;2;PL
2 | triegen trieget V;SBJV;PRS;2;PL
3 | binden binde V;IND;PRS;1;SG
4 | singen singet V;IND;PRS;3;SG
5 | verswinden verswindende V.PTCP;PRS
6 | burcgrave burcgrâven N;DAT;SG
7 | bruoder bruoder N;NOM;SG
8 | betriegen betriugest V;IND;PRS;2;SG
9 | slahen slage V;SBJV;PRS;1;SG
10 | swern sweret V;IND;PRS;3;SG
11 | grave grâven N;NOM;PL
12 | slahen slaget V;SBJV;PRS;2;PL
13 | trinken trinken V;IND;PRS;1;PL
14 | enpfinden enpfunden V;IND;PST;3;PL
15 | werfen wërfet V;SBJV;PRS;2;PL
16 | wahsen wahset V;SBJV;PRS;3;SG
17 | tac tac N;ACC;SG
18 | swern swuoren V;IND;PST;3;PL
19 | tac tage N;NOM;PL
20 | slahen slegest V;IND;PRS;2;SG
21 | betriegen betriegen V;NFIN
22 | helfen hëlfen V;NFIN
23 | werfen würfen V;SBJV;PST;1;PL
24 | vinden vünde V;IND;PST;2;SG
25 | trinken trinket V;IND;PRS;2;PL
26 | werfen geworfen V.PTCP;PST
27 | beginnen beginnet V;IMP;2;PL
28 | slahen slagen V;NFIN
29 | burcgrave burcgrâven N;NOM;PL
30 | ziehen ziehe V;SBJV;PRS;1;SG
31 | ziehen ziehet V;IMP;2;PL
32 | burcgrave burcgrâven N;DAT;PL
33 | swern swuoren V;IND;PST;1;PL
34 | beginnen begünnest V;SBJV;PST;2;SG
35 | swimmen swümme V;SBJV;PST;3;SG
36 | binden bündet V;SBJV;PST;2;PL
37 | betriegen betrugen V;IND;PST;3;PL
38 | engel engele N;DAT;SG
39 | singen singent V;IND;PRS;3;PL
40 | werfen wërfen V;IND;PRS;1;PL
41 | werden wirde V;IND;PRS;1;SG
42 | rinnen runnen V;IND;PST;1;PL
43 | swimmen swimment V;IND;PRS;3;PL
44 | enpfinden enpfand V;IND;PST;1;SG
45 | binden binden V;NFIN
46 | slahen slagen V;SBJV;PRS;3;PL
47 | biegen biegent V;IND;PRS;3;PL
48 | binden bindest V;SBJV;PRS;2;SG
49 | swern sweren V;SBJV;PRS;1;PL
50 | betriegen betrügen V;SBJV;PST;1;PL
51 | rinnen rinnet V;SBJV;PRS;3;SG
52 | swern sweren V;IND;PRS;1;PL
53 | swimmen swimmet V;SBJV;PRS;2;PL
54 | swern swüren V;SBJV;PST;1;PL
55 | beginnen begünne V;IND;PST;2;SG
56 | biegen biegen V;IND;PRS;1;PL
57 | helfen hëlfet V;IMP;2;PL
58 | vinden vünde V;SBJV;PST;1;SG
59 | sieden gesoten V.PTCP;PST
60 | biegen bugen V;IND;PST;3;PL
61 | vinden vind V;IMP;2;SG
62 | helfen hëlfet V;SBJV;PRS;3;SG
63 | swern swernde V.PTCP;PRS
64 | kiesen kiesen V;NFIN
65 | trinken trinkest V;IND;PRS;2;SG
66 | ziehen ziehende V.PTCP;PRS
67 | werfen wërfen V;SBJV;PRS;1;PL
68 | swimmen swimmende V.PTCP;PRS
69 | enpfinden enpfünde V;SBJV;PST;1;SG
70 | sieden sutet V;IND;PST;2;PL
71 | überganc überganc N;ACC;SG
72 | sieden süte V;IND;PST;2;SG
73 | sieden sieden V;IND;PRS;1;PL
74 | werden wërden V;SBJV;PRS;3;PL
75 | trinken trinken V;NFIN
76 | werden würden V;SBJV;PST;1;PL
77 | rinnen rinnent V;IND;PRS;3;PL
78 | swern swere V;SBJV;PRS;1;SG
79 | binden bindende V.PTCP;PRS
80 | swern swüre V;SBJV;PST;3;SG
81 | wahsen wahsen V;IND;PRS;1;PL
82 | werfen würfe V;IND;PST;2;SG
83 | betriegen betriegen V;IND;PRS;1;PL
84 | ziehen zühest V;SBJV;PST;2;SG
85 | triegen trüge V;IND;PST;2;SG
86 | helfen hülfen V;SBJV;PST;1;PL
87 | swimmen swamm V;IND;PST;3;SG
88 | überganc übergange N;DAT;SG
89 | triegen trüget V;SBJV;PST;2;PL
90 | swimmen swimme V;IND;PRS;1;SG
91 | enpfinden enpfünde V;SBJV;PST;3;SG
92 | swimmen swimme V;SBJV;PRS;1;SG
93 | swern swüret V;SBJV;PST;2;PL
94 | beginnen beginnest V;IND;PRS;2;SG
95 | binden bindet V;SBJV;PRS;3;SG
96 | beginnen begünnen V;SBJV;PST;3;PL
97 | singen singen V;SBJV;PRS;3;PL
98 | singen singe V;IND;PRS;1;SG
99 | werfen wërfen V;SBJV;PRS;3;PL
100 | singen singest V;IND;PRS;2;SG
101 |
--------------------------------------------------------------------------------
/data/scottish-gaelic-train-low:
--------------------------------------------------------------------------------
1 | Spàinneach Spàinneach ADJ;PL;VOC
2 | toilich a' toileachadh V;V.PTCP;PRS
3 | rèitich a' rèiteachadh V;V.PTCP;PRS
4 | eagalach eagalaich ADJ;SG;MASC;VOC
5 | uaibhreach uaibhreach ADJ;SG;MASC;NOM
6 | nòsach nòsach ADJ;SG;FEM;VOC
7 | sìolmhor shìolmhor ADJ;SG;FEM;NOM
8 | rèitich rèiticheadh V;PST;PASS
9 | eagalach eagalaich ADJ;SG;FEM;DAT
10 | beulach bheulach ADJ;SG;FEM;NOM
11 | sùgh shùgh V;PST;ACT
12 | gruamach ghruamach ADJ;SG;FEM;NOM
13 | Albannach Albannaich ADJ;SG;MASC;GEN
14 | bacach bacach ADJ;PL;DAT
15 | crùbach chrùbaich ADJ;SG;FEM;DAT
16 | brònach bhrònaich ADJ;SG;MASC;VOC
17 | Breatannach Breatannach ADJ;PL;VOC
18 | geal geala ADJ;PL;GEN
19 | rèidh rèidh ADJ;SG;MASC;NOM
20 | nòsach nòsaich ADJ;SG;MASC;GEN
21 | Albannach Albannach ADJ;PL;VOC
22 | gobach ghobach ADJ;PL;VOC
23 | stiùir stiùireadh V;PST;PASS
24 | seòl sheòl V;PST;ACT
25 | Cuimreach Cuimreach ADJ;PL;VOC
26 | ruadh ruadha ADJ;PL;VOC
27 | sìolmhor sìolmhor ADJ;PL;DAT
28 | gobach ghobaich ADJ;SG;FEM;DAT
29 | nuadh nuaidh ADJ;SG;FEM;DAT
30 | drùis-mhiannach dhrùis-mhiannaich ADJ;SG;FEM;VOC
31 | fàsaichte fhàsaichte ADJ;SG;FEM;VOC
32 | Spàinneach Spàinnich ADJ;SG;MASC;VOC
33 | nuadh nuadha ADJ;PL;NOM
34 | seòl sheòladh V;COND;ACT
35 | gobach ghobach ADJ;SG;FEM;NOM
36 | ioma-ghlac ioma-ghlacaidh V;FUT;ACT
37 | gruamach ghruamach ADJ;SG;FEM;VOC
38 | Manainneach Manainneach ADJ;PL;DAT
39 | crùbach crùbach ADJ;PL;GEN
40 | uaibhreach uaibhreach ADJ;PL;DAT
41 | pronn pronn ADJ;PL;VOC
42 | brònach bhrònach ADJ;SG;MASC;DAT
43 | maiseach mhaisich ADJ;SG;FEM;VOC
44 | ruadh ruadha ADJ;PL;NOM
45 | maiseach mhaisich ADJ;SG;MASC;VOC
46 | seann-nòsach sheann-nòsach ADJ;SG;MASC;DAT
47 | Spàinneach Spàinnich ADJ;SG;MASC;GEN
48 | brònach bhrònach ADJ;SG;FEM;VOC
49 | rèitich rèiticheadh V;COND;ACT
50 | dealanach dhealanaich ADJ;SG;FEM;VOC
51 | fionn fionna ADJ;PL;NOM
52 | grinn ghrinn ADJ;SG;FEM;VOC
53 | abair their V;FUT;ACT
54 | sùgh shùghadh V;PST;PASS
55 | bacach bacach ADJ;SG;MASC;NOM
56 | rìomhach rìomhach ADJ;SG;FEM;NOM
57 | beulach bheulach ADJ;PL;GEN
58 | Manainneach Mhanainnich ADJ;SG;MASC;VOC
59 | beir rug V;PST;ACT
60 | brèagha brèagha ADJ;PL;VOC
61 | beulach bheulach ADJ;PL;VOC
62 | sliochdmhor sliochdmhor ADJ;SG;MASC;NOM
63 | caidil chaidileadhnote V;COND;ACT
64 | grinn grinne ADJ;PL;NOM
65 | searg sheargadh V;PST;PASS
66 | grinn grinn ADJ;SG;MASC;NOM
67 | nòsail nòsail ADJ;SG;FEM;VOC
68 | ligh lighidh V;FUT;ACT
69 | brònach brònach ADJ;PL;VOC
70 | bòidheach bhòidhich ADJ;SG;MASC;GEN
71 | sgèimheach sgèimhich ADJ;SG;MASC;GEN
72 | seasg sheisge ADJ;SG;FEM;DAT
73 | miannach miannach ADJ;PL;NOM
74 | mòrail mòrail ADJ;PL;DAT
75 | sùgh sùghaidh V;FUT;ACT
76 | sgòideil sgòideile ADJ;SG;FEM;GEN
77 | Breatannach Bhreatannaich ADJ;SG;MASC;GEN
78 | geal gheal ADJ;SG;FEM;NOM
79 | crùbach chrùbach ADJ;SG;FEM;NOM
80 | dealanach dealanach ADJ;PL;VOC
81 | pròiseil phròiseil ADJ;SG;FEM;GEN
82 | fionn fionn ADJ;SG;MASC;NOM
83 | làn làn ADJ;SG;FEM;VOC
84 | Manainneach Manainneach ADJ;PL;NOM
85 | gruamach ghruamaich ADJ;SG;FEM;DAT
86 | stad stadtadh V;COND;PASS
87 | miannach miannaich ADJ;SG;FEM;GEN
88 | dòth dhòthadh V;PST;PASS
89 | Albannach Albannach ADJ;SG;MASC;DAT
90 | pròiseil phròiseil ADJ;SG;MASC;VOC
91 | dealanach dhealanach ADJ;SG;MASC;DAT
92 | nuadh nuadh ADJ;SG;FEM;VOC
93 | sìolmhor sìolmhor ADJ;PL;VOC
94 | faigh gheibhear V;FUT;PASS
95 | inbheach inbheach ADJ;SG;FEM;NOM
96 | cam chaim ADJ;SG;FEM;VOC
97 | pròiseil pròiseil ADJ;SG;MASC;NOM
98 | tòisich tòisichear V;FUT;PASS
99 | mòrail mhòrail ADJ;SG;MASC;DAT
100 | ruig ruigidh V;FUT;ACT
101 |
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | import re
2 | import tokenizer
3 |
4 | """Helpers for reading conll files. Reads a conll file and 1) splits it into the components of each example,
5 | and 2) separates it into input tokens and target tokens. conll format: lemma-tab-target-tab-features """
6 |
7 |
8 | """ READING FILES """
9 | def read_morph_file(morph_file_path):
10 | """ Reads conll file, split to line, and splits each line by tabs. Returns list of lists"""
11 | # Get all lines in file
12 | morph_file = open(morph_file_path, 'r', encoding='utf-8')
13 | lines = morph_file.readlines()
14 | outputs = []
15 | # Separate lines to proper format
16 | for line in lines:
17 | if line != "\n":
18 | # Strip '\n' and split
19 | outputs.append(line.replace("\n", "").split("\t"))
20 | morph_file.close()
21 | return outputs
22 |
23 |
24 | def clean_word(word):
25 | """ Strips word of unnecessary symbols """
26 | word = re.sub("[!@#$']", '', word)
27 | return word.lower()
28 |
29 |
30 | def read_train_file(train_file_path):
31 | """ Reads conll train file, and splits to lists of input lemmas, input features and target lemmas"""
32 | lemmas = []
33 | targets = []
34 | features = []
35 | train_morph_list = read_morph_file(train_file_path)
36 |
37 | for lemma, target, feature in train_morph_list:
38 | # Add results to relevant lists
39 | lemmas.append(lemma)
40 | features.append(feature)
41 | targets.append(target)
42 |
43 | return lemmas, targets, features
44 |
45 |
46 | def read_test_file(test_input_file):
47 | """ Reads conll test file, and splits to lists of input lemmas, input features and target lemmas"""
48 | lemmas = []
49 | features = []
50 | test_morph_list = read_morph_file(test_input_file)
51 |
52 | for lemma, feature in test_morph_list:
53 | # Add results to relevant lists
54 | lemmas.append(lemma)
55 | features.append(feature)
56 |
57 | return lemmas, features
58 |
59 |
60 | def read_train_file_tokens(train_file_path):
61 | """ Reads conll train file, and splits to input tokens and target tokens.
62 | Each input and target is a list of tokens"""
63 | lemmas, targets, features = read_train_file(train_file_path)
64 | # tokenize all three lists, get as list of tokens lists
65 | lemmas_tokens = tokenizer.tokenize_words(lemmas)
66 | targets_tokens = tokenizer.tokenize_words(targets)
67 | features_tokens = tokenizer.tokenize_features(features)
68 | # concatenate feature tokens to lemma tokens
69 | input_tokens = [lemma_tokens + feature_tokens for lemma_tokens, feature_tokens in
70 | zip(lemmas_tokens, features_tokens)]
71 | return input_tokens, targets_tokens
72 |
73 |
74 | def read_test_file_tokens(test_file_path):
75 | """ Reads conll test file. Each input is a list of tokens"""
76 | lemmas, features = read_test_file(test_file_path)
77 | # tokenize all two lists, get as list of tokens lists
78 | lemmas_tokens = tokenizer.tokenize_words(lemmas)
79 | features_tokens = tokenizer.tokenize_features(features)
80 | # concatenate feature tokens to lemma tokens
81 | input_tokens = [lemma_tokens + feature_tokens for lemma_tokens, feature_tokens in
82 | zip(lemmas_tokens, features_tokens)]
83 | return input_tokens
84 |
85 |
86 | """ WRITING FILES """
87 | def write_morph_file(lemmas, targets, features, out_file_path):
88 | """ Writes tokens as conll file """
89 | # Get all lines in file
90 | out_file = open(out_file_path, 'w', encoding='utf-8')
91 | for lemma, target, feature in zip(lemmas, targets, features):
92 | out_file.write("%s\t%s\t%s\n" % (lemma, target, feature))
93 | out_file.close()
94 |
--------------------------------------------------------------------------------
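A minimal sketch of the reading helpers, using two rows from data/english-train-mini (the toy file name is illustrative):

    import data

    # suppose toy.train contains two tab-separated lines:
    #   "stodge\tstodges\tV;3;SG;PRS"
    #   "disallow\tdisallowing\tV;V.PTCP;PRS"
    lemmas, targets, features = data.read_train_file("toy.train")
    print(lemmas)    # ['stodge', 'disallow']
    print(targets)   # ['stodges', 'disallowing']
    print(features)  # ['V;3;SG;PRS', 'V;V.PTCP;PRS']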
/data/irish-train-low:
--------------------------------------------------------------------------------
1 | cruinnigh cruinnídís V;3;PL;IMP
2 | tíogar an tíogair N;GEN;SG;DEF
3 | pailéa-luibheolaíocht pailéa-luibheolaíocht N;DAT;SG
4 | mórghnáth mórghnáth N;NOM;SG
5 | agaigh agaídís V;3;PL;IMP
6 | agairg na hagairge N;GEN;SG;DEF
7 | scoil saothair scoileanna saothair N;DAT;PL
8 | tuathánach tuathánaigh N;GEN;SG
9 | amhrán na hamhráin N;NOM;PL;DEF
10 | ruainne ruainne N;NOM;SG
11 | sácráilteacht leis an tsácráilteacht N;DAT;SG;DEF
12 | néar-raideolaí an néar-raideolaí N;GEN;SG;DEF
13 | donacht donacht N;DAT;SG
14 | gloinigh gloineoidh mé V;1;SG;FUT;IND
15 | dear dhear tú V;2;SG;PST;IND
16 | féile féile N;GEN;SG
17 | fo-éadach fo-éadaí N;NOM;PL
18 | fiafheoil a fhiafheoil N;VOC;SG
19 | lucht luchta N;GEN;SG
20 | tiontaigh dá dtiontaídís V;3;PL;SBJV;PST
21 | fótaidhé-óid na fótaidhé-óidí N;NOM;PL;DEF
22 | réimsetheoiric leis na réimsetheoiricí N;DAT;PL;DEF
23 | baintreach leis na baintreacha N;DAT;PL;DEF
24 | rian a rian N;VOC;SG
25 | éadaitheoireacht an éadaitheoireacht N;NOM;SG;DEF
26 | stoirm na stoirme N;GEN;SG;DEF
27 | Ard-Aighne an Ard-Aighne N;GEN;SG;DEF
28 | paidir paidir N;NOM;SG
29 | leagh leaghfaimid V;1;PL;FUT;IND
30 | fiúntas an fiúntas N;NOM;SG;DEF
31 | ceardaíocht an cheardaíocht N;NOM;SG;DEF
32 | bádóir leis na bádóirí N;DAT;PL;DEF
33 | cnap leis an gcnap N;DAT;SG;DEF
34 | pocadán pocadáin N;DAT;PL
35 | fótagrafaíocht na fótagrafaíochta N;GEN;SG;DEF
36 | Guineach an Ghuinigh N;GEN;SG;DEF
37 | eagar an eagair N;GEN;SG;DEF
38 | tar thiocfaí V;AUTO;COND
39 | mórbhonn mórbhoinn N;DAT;PL
40 | fótónaic fótónaic N;NOM;SG
41 | ráthaigh ráthaígí V;2;PL;IMP
42 | liopard fiaigh leis an liopard fiaigh N;DAT;SG;DEF
43 | bréagiontaise bréagiontaise N;GEN;SG
44 | cóisir bhrídeoige an chóisir bhrídeoige N;NOM;SG;DEF
45 | imaistrigh imaistríonn sé V;3;SG;PRS;IND
46 | clois cloiseann sibh V;2;PL;PRS;IND
47 | stiallfhótagrafaíocht a stiallfhótagrafaíocht N;VOC;SG
48 | lárthosaí leis an lárthosaí N;DAT;SG;DEF
49 | crúca crúcaí N;GEN;PL
50 | foghlaim d'fhoghlaim sibh V;2;PL;PST;IND
51 | luachálaí a luachálaithe N;VOC;PL
52 | folig foligeadh sé V;3;SG;IMP
53 | míolsiorc an mhíolsiorca N;GEN;SG;DEF
54 | dulcaiméir dulcaiméir N;DAT;SG
55 | fiaile fiailí N;DAT;PL
56 | culaith culaith N;DAT;SG
57 | blast leis na blastaí N;DAT;PL;DEF
58 | grúdaigh dá ngrúdaínn V;1;SG;SBJV;PST
59 | grúdaigh grúdaígí V;2;PL;IMP
60 | snas a shnas N;VOC;SG
61 | comhghuaillíocht comhghuaillíocht N;NOM;SG
62 | teorannaigh go dteorannaítear V;AUTO;SBJV;PRS
63 | Fionlainnis na Fionlainnise N;GEN;SG;DEF
64 | siollach an tsiollaigh N;GEN;SG;DEF
65 | greannán greannán N;GEN;PL
66 | treisigh treiseoidh sibh V;2;PL;FUT;IND
67 | gasúr leis an ngasúr N;DAT;SG;DEF
68 | oileánra oileánraí N;NOM;PL
69 | teicnealathach teicnealathach ADJ;DAT;SG;MASC
70 | rí-ulchabhán na rí-ulchabháin N;NOM;PL;DEF
71 | abhrán abhrán N;GEN;PL
72 | croscheann leis an gcroscheann N;DAT;SG;DEF
73 | tilleadh tilleadh N;NOM;SG
74 | troigh na troithe N;NOM;PL;DEF
75 | naomhainmnigh a naomhainmnigh V;REL;PST;IND
76 | mí-oiriúint na mí-oiriúna N;GEN;SG;DEF
77 | léitheoir an léitheora N;GEN;SG;DEF
78 | feirm feirmeacha N;GEN;PL
79 | breathnaigh breathnaíonn tú V;2;SG;PRS;IND
80 | figh fíonn tú V;2;SG;PRS;IND
81 | éalaitheach éalaitheach N;NOM;SG
82 | coigistigh go gcoigistítear V;AUTO;SBJV;PRS
83 | sceir sceir N;DAT;SG
84 | léirmheastóir léirmheastóirí N;DAT;PL
85 | rostram a rostraim N;VOC;SG
86 | ráille tuáillí ráillí tuáillí N;DAT;PL
87 | coinnigh coinníonn sé V;3;SG;PRS;IND
88 | aíochtlann leis an aíochtlann N;DAT;SG;DEF
89 | fidil na fidleacha N;NOM;PL;DEF
90 | iompaigh dá n-iompaíteá V;2:SG;SBJV;PST
91 | cairdinéal cairdinéil N;DAT;PL
92 | sreangshúil sreangshúile N;GEN;SG
93 | ainéifeacht na hainéifeachta N;GEN;SG;DEF
94 | oibrigh d'oibreoimis V;1;PL;COND
95 | lobh lobhtar V;AUTO;PRS;IND
96 | beach beacha N;DAT;PL
97 | pá pá N;NOM;SG
98 | socheolaíocht a shocheolaíocht N;VOC;SG
99 | amplóir a amplóra N;VOC;SG
100 | aelus aelusanna N;DAT;PL
101 |
--------------------------------------------------------------------------------
/data/french-train-low:
--------------------------------------------------------------------------------
1 | enchérir enchérisse V;SBJV;PRS;3;SG
2 | retendre retendissions V;SBJV;PST;1;PL
3 | écharper écharperait V;COND;3;SG
4 | échiner échines V;IND;PRS;2;SG
5 | racornir racornira V;IND;FUT;3;SG
6 | recommencer en recommençant V.CVB;PRS
7 | dispatcher dispatchais V;IND;PST;1;SG;IPFV
8 | saler en salant V.CVB;PRS
9 | surexciter surexcitiez V;SBJV;PRS;2;PL
10 | picoter picoteront V;IND;FUT;3;PL
11 | souder en soudant V.CVB;PRS
12 | pèleriner pèlerinant V.PTCP;PRS
13 | recommencer recommences V;SBJV;PRS;2;SG
14 | chlorer chlorer V;NFIN
15 | fuir fuissent V;SBJV;PST;3;PL
16 | dépayser dépaysais V;IND;PST;1;SG;IPFV
17 | pénaliser pénalisais V;IND;PST;2;SG;IPFV
18 | retordre retordisse V;SBJV;PST;1;SG
19 | adopter adopteras V;IND;FUT;2;SG
20 | braver bravez V;IND;PRS;2;PL
21 | livrer livrer V;NFIN
22 | occidentaliser occidentalisassent V;SBJV;PST;3;PL
23 | revitaliser revitalisasse V;SBJV;PST;1;SG
24 | praeparer praeparassiez V;SBJV;PST;2;PL
25 | étalonner étalonneras V;IND;FUT;2;SG
26 | randomiser randomisant V.PTCP;PRS
27 | tergiverser tergiversâtes V;IND;PST;2;PL;PFV
28 | gober gobes V;IND;PRS;2;SG
29 | estudier estudia V;IND;PST;1;SG;PFV
30 | travailler travaillassent V;SBJV;PST;3;PL
31 | enrégimenter enrégimentassent V;SBJV;PST;3;PL
32 | résider résidez V;IND;PRS;2;PL
33 | déliter délitai V;IND;PST;1;SG;PFV
34 | reformer reformer V;NFIN
35 | remixer remixais V;IND;PST;1;SG;IPFV
36 | ouster oustassent V;SBJV;PST;3;PL
37 | aniser anise V;POS;IMP;2;SG
38 | cabasser cabasses V;SBJV;PRS;2;SG
39 | accuser accusions V;IND;PST;1;PL;IPFV
40 | marcher marchiez V;SBJV;PRS;2;PL
41 | engraisser engraisse V;POS;IMP;2;SG
42 | échauder échaudais V;IND;PST;2;SG;IPFV
43 | écurer écure V;POS;IMP;2;SG
44 | tituber titubais V;IND;PST;1;SG;IPFV
45 | larder larde V;IND;PRS;1;SG
46 | couronner couronnerais V;COND;2;SG
47 | mentionner mentionné V.PTCP;PST
48 | disloquer disloquassiez V;SBJV;PST;2;PL
49 | atterrir atterrîtes V;IND;PST;2;PL;PFV
50 | espruver espruve V;POS;IMP;2;SG
51 | importer importions V;IND;PST;1;PL;IPFV
52 | ambler amblons V;IND;PRS;1;PL
53 | eschever escheverai V;IND;FUT;1;SG
54 | receler recela V;IND;PST;3;SG;PFV
55 | biguiner biguinassent V;SBJV;PST;3;PL
56 | avantager avantageai V;IND;PST;1;SG;PFV
57 | advenir advins V;IND;PST;2;SG;PFV
58 | rouvrir rouvre V;IND;PRS;1;SG
59 | espionner espionneraient V;COND;3;PL
60 | lisser lissasses V;SBJV;PST;2;SG
61 | chaperonner chaperonnes V;IND;PRS;2;SG
62 | mugir mugir V;NFIN
63 | décroître décrû V.PTCP;PST
64 | enditer enditera V;IND;FUT;3;SG
65 | accompagner accompagnent V;SBJV;PRS;3;PL
66 | polycopier polycopiai V;IND;PST;1;SG;PFV
67 | coisier coisiez V;IND;PRS;2;PL
68 | tondre tondiez V;IND;PST;2;PL;IPFV
69 | comparer comparent V;SBJV;PRS;3;PL
70 | bêcher bêchais V;IND;PST;2;SG;IPFV
71 | abaisser abaisse V;SBJV;PRS;1;SG
72 | alterner alternassiez V;SBJV;PST;2;PL
73 | catalyser catalyses V;IND;PRS;2;SG
74 | biaiser biaisait V;IND;PST;3;SG;IPFV
75 | mercyer mercyons V;POS;IMP;1;PL
76 | renchérir renchérissions V;SBJV;PST;1;PL
77 | refléter reflèteras V;IND;FUT;2;SG
78 | défriser défrisent V;IND;PRS;3;PL
79 | régurgiter régurgite V;IND;PRS;3;SG
80 | enditer enditast V;SBJV;PST;3;SG
81 | adiouster adioustast V;SBJV;PST;3;SG
82 | gratouiller gratouille V;SBJV;PRS;3;SG
83 | kayaker kayakasses V;SBJV;PST;2;SG
84 | franciser francise V;POS;IMP;2;SG
85 | duveter duveté V.PTCP;PST
86 | tackler tackle V;IND;PRS;1;SG
87 | demorer demora V;IND;PST;3;SG;PFV
88 | dénigrer dénigrerais V;COND;2;SG
89 | présidentialiser présidentialisèrent V;IND;PST;3;PL;PFV
90 | chavirer chavirassiez V;SBJV;PST;2;PL
91 | poireauter poireautaient V;IND;PST;3;PL;IPFV
92 | débâillonner débâillonnez V;POS;IMP;2;PL
93 | comprehendre comprehendre V;NFIN
94 | kifer kiferont V;IND;FUT;3;PL
95 | sojorner sojornes V;IND;PRS;2;SG
96 | desfier en desfiant V.CVB;PRS
97 | membrer membrai V;IND;PST;1;SG;PFV
98 | enregistrer enregistraient V;IND;PST;3;PL;IPFV
99 | presupposer presupposois V;IND;PST;1;SG;IPFV
100 | sechier sechiez V;SBJV;PRS;2;PL
101 |
--------------------------------------------------------------------------------
/data/spanish-train-low:
--------------------------------------------------------------------------------
1 | reiterar no reiteren V;NEG;IMP;3;PL
2 | machucar machucaste V;IND;PST;2;SG;PFV
3 | preguntarse pregúntese V;POS;IMP;3;SG
4 | marcir no marzáis V;NEG;IMP;2;PL
5 | jaquear jaquearíamos V;COND;1;PL
6 | decir dices V;IND;PRS;2;SG
7 | atrasar atrasasen V;SBJV;PST;3;PL
8 | escasear escasearen V;SBJV;FUT;3;PL
9 | comparar comparaba V;IND;PST;1;SG;IPFV
10 | alechugar alechugare V;SBJV;FUT;1;SG
11 | redescubrir redescubriríamos V;COND;1;PL
12 | desempeñar desempeñaremos V;IND;FUT;1;PL
13 | suscitar suscitamos V;IND;PRS;1;PL
14 | expatriar expatriara V;SBJV;PST;1;SG;LGSPEC1
15 | ejercitar ejercitas V;IND;PRS;2;SG
16 | agredir agrediendo V.CVB;PRS
17 | retraducir retradujere V;SBJV;FUT;1;SG
18 | bordar bordas V;IND;PRS;2;SG
19 | cliquear cliquearas V;SBJV;PST;2;SG;LGSPEC1
20 | alocar alocaran V;SBJV;PST;3;PL;LGSPEC1
21 | extrapolar extrapolan V;IND;PRS;3;PL
22 | gratinar gratinaréis V;IND;FUT;2;PL
23 | avanzar no avancéis V;NEG;IMP;2;PL
24 | ofrendar ofrendásemos V;SBJV;PST;1;PL
25 | aojar aojares V;SBJV;FUT;2;SG
26 | editar editáramos V;SBJV;PST;1;PL;LGSPEC1
27 | entablar entablan V;IND;PRS;3;PL
28 | alivianar alivianará V;IND;FUT;3;SG
29 | perdurar perduró V;IND;PST;3;SG;PFV
30 | destorcer destorciese V;SBJV;PST;1;SG
31 | mutar muten V;POS;IMP;3;PL
32 | rebozar no reboce V;NEG;IMP;3;SG
33 | jabonar jabonarás V;IND;FUT;2;SG
34 | expedientar expedientes V;SBJV;PRS;2;SG
35 | endeudar endeudados V.PTCP;PST;MASC;PL
36 | esposar esposare V;SBJV;FUT;1;SG
37 | arrinconar arrinconamos V;IND;PRS;1;PL
38 | esquinar no esquinen V;NEG;IMP;3;PL
39 | reimpulsar reimpulsan V;IND;PRS;3;PL
40 | pisotear pisoteemos V;POS;IMP;1;PL
41 | anticipar no anticipen V;NEG;IMP;3;PL
42 | deshelar desheladas V.PTCP;PST;FEM;PL
43 | estriar estriarían V;COND;3;PL
44 | maniatar maniaten V;POS;IMP;3;PL
45 | indizar indicéis V;SBJV;PRS;2;PL
46 | ablandar ablandaban V;IND;PST;3;PL;IPFV
47 | obligar obligaren V;SBJV;FUT;3;PL
48 | sobar sobo V;IND;PRS;1;SG
49 | despeinarse os despeinaseis V;SBJV;PST;2;PL
50 | despenalizar despenalice V;SBJV;PRS;1;SG
51 | abonar abonaran V;SBJV;PST;3;PL;LGSPEC1
52 | resorber resorbías V;IND;PST;2;SG;IPFV
53 | fertilizar fertilice V;SBJV;PRS;3;SG
54 | destensar destensarías V;COND;2;SG
55 | desaprovechar desaprovechamos V;IND;PST;1;PL;PFV
56 | ficar ficasteis V;IND;PST;2;PL;PFV
57 | clavetear no clavetees V;NEG;IMP;2;SG
58 | fornicar fornico V;IND;PRS;1;SG
59 | valuar valuaren V;SBJV;FUT;3;PL
60 | situar situasen V;SBJV;PST;3;PL
61 | abonarse abónese V;POS;IMP;3;SG
62 | vandalizar no vandalicéis V;NEG;IMP;2;PL
63 | apasionar apasionases V;SBJV;PST;2;SG
64 | inteligir no intelijan V;NEG;IMP;3;PL
65 | sesionar sesionáis V;IND;PRS;2;PL
66 | transitar transitasteis V;IND;PST;2;PL;PFV
67 | pasterizar pasterizáramos V;SBJV;PST;1;PL;LGSPEC1
68 | realizar realizará V;IND;FUT;3;SG
69 | adelgazar adelgazaban V;IND;PST;3;PL;IPFV
70 | defenestrar defenestrar V;NFIN
71 | descocer descocido V.PTCP;PST;MASC;SG
72 | bifurcar bifurca V;POS;IMP;2;SG
73 | valer valga V;SBJV;PRS;1;SG
74 | intrigar intrigaban V;IND;PST;3;PL;IPFV
75 | externalizar externalizara V;SBJV;PST;3;SG;LGSPEC1
76 | efectivizar efectivizado V.PTCP;PST;MASC;SG
77 | noquear noqueare V;SBJV;FUT;3;SG
78 | vandalizar vandaliza V;IND;PRS;3;SG
79 | embarazar embarazaba V;IND;PST;3;SG;IPFV
80 | aturar aturarían V;COND;3;PL
81 | voltear volteaseis V;SBJV;PST;2;PL
82 | desunir desunimos V;IND;PST;1;PL;PFV
83 | pochar pochase V;SBJV;PST;3;SG
84 | fechar fecharía V;COND;1;SG
85 | lastimar no lastimen V;NEG;IMP;3;PL
86 | recalibrar no recalibréis V;NEG;IMP;2;PL
87 | franelear no franelees V;NEG;IMP;2;SG
88 | apostar apostada V.PTCP;PST;FEM;SG
89 | formatear formateen V;SBJV;PRS;3;PL
90 | ulcerar ulceremos V;SBJV;PRS;1;PL
91 | terraplenar terraplenó V;IND;PST;3;SG;PFV
92 | deslucir desluzcamos V;SBJV;PRS;1;PL
93 | robar robó V;IND;PST;3;SG;PFV
94 | pasteurizar pasteurizare V;SBJV;FUT;1;SG
95 | equivocarse nos equivocamos V;IND;PRS;1;PL
96 | documentar documentaran V;SBJV;PST;3;PL;LGSPEC1
97 | obstaculizar obstaculizarías V;COND;2;SG
98 | defecar defecara V;SBJV;PST;1;SG;LGSPEC1
99 | realimentar realimentaba V;IND;PST;3;SG;IPFV
100 | ripiar ripiados V.PTCP;PST;MASC;PL
101 |
--------------------------------------------------------------------------------
/data/italian-train-low:
--------------------------------------------------------------------------------
1 | rinvasare rinvasi V;SBJV;PRS;1;SG
2 | infilzare infilzeremo V;IND;FUT;1;PL
3 | rivedersi mi rivedevo V;IND;PST;1;SG;IPFV
4 | insanguinare insanguinante V.PTCP;PRS
5 | tramare tramate V;POS;IMP;2;PL
6 | scartabellare scartabelliamo V;POS;IMP;1;PL
7 | calzare calzerete V;IND;FUT;2;PL
8 | sfidarsi si sfidi V;POS;IMP;3;SG
9 | recarvisi vi si recheranno V;IND;FUT;3;PL
10 | intrufolare intrufolassi V;SBJV;PST;2;SG
11 | rinchiudere rinchiuderei V;COND;1;SG
12 | impiombare impiombi V;IND;PRS;2;SG
13 | scombinare scombini V;IND;PRS;2;SG
14 | attendersi si attenda V;POS;IMP;3;SG
15 | cazzeggiare cazzeggerei V;COND;1;SG
16 | inzeppare inzeppassi V;SBJV;PST;1;SG
17 | strabiliare strabilerò V;IND;FUT;1;SG
18 | disvolgere disvolga V;SBJV;PRS;1;SG
19 | argomentare argomentasti V;IND;PST;2;SG;PFV
20 | sciogliersi si sciolga V;POS;IMP;3;SG
21 | corcare corcavo V;IND;PST;1;SG;IPFV
22 | umiliare umilierà V;IND;FUT;3;SG
23 | estrarre estraessimo V;SBJV;PST;1;PL
24 | compiere compivi V;IND;PST;2;SG;IPFV
25 | ricoverarsi ti ricoveravi V;IND;PST;2;SG;IPFV
26 | rinsanguare rinsanguavate V;IND;PST;2;PL;IPFV
27 | assistere assisterei V;COND;1;SG
28 | sanare sani V;SBJV;PRS;2;SG
29 | azzerare azzerando V.CVB;PRS
30 | gloglottare gloglottereste V;COND;2;PL
31 | tangere tangi V;POS;IMP;2;SG
32 | invigorire invigorisca V;SBJV;PRS;3;SG
33 | innamorare innamoravi V;IND;PST;2;SG;IPFV
34 | ospedalizzare ospedalizzò V;IND;PST;3;SG;PFV
35 | piegare piegheremmo V;COND;1;PL
36 | formare formino V;POS;IMP;3;PL
37 | rassumere rassumesti V;IND;PST;2;SG;PFV
38 | sterminare sterminavate V;IND;PST;2;PL;IPFV
39 | incuriosirsi si incuriosiscano V;SBJV;PRS;3;PL
40 | sfarinarsi si sfarinano V;IND;PRS;3;PL
41 | metaforizzare metaforizzavamo V;IND;PST;1;PL;IPFV
42 | pavimentare pavimentassero V;SBJV;PST;3;PL
43 | doppiare doppierai V;IND;FUT;2;SG
44 | scempiare scempiasse V;SBJV;PST;3;SG
45 | potabilizzare potabilizzaste V;SBJV;PST;2;PL
46 | disintegrarsi ci disintegreremo V;IND;FUT;1;PL
47 | rabbonire rabbonendo V.CVB;PRS
48 | sagginare sagginasse V;SBJV;PST;3;SG
49 | costudire costudirebbero V;COND;3;PL
50 | scarnire scarnisse V;SBJV;PST;3;SG
51 | simulare simulassero V;SBJV;PST;3;PL
52 | asciolvere asciolverete V;IND;FUT;2;PL
53 | vellutare velluta V;POS;IMP;2;SG
54 | spogliarsi ti spoglierai V;IND;FUT;2;SG
55 | girondolare girondolavi V;IND;PST;2;SG;IPFV
56 | attingersi si attingerà V;IND;FUT;3;SG
57 | ondulare ondulassimo V;SBJV;PST;1;PL
58 | frenarsi ci freneremmo V;COND;1;PL
59 | vendicare vendicaste V;SBJV;PST;2;PL
60 | smetterla la smettano V;POS;IMP;3;PL
61 | disimpegnarsi vi disimpegnerete V;IND;FUT;2;PL
62 | sgonfiare sgonfiaste V;SBJV;PST;2;PL
63 | bastire bastissi V;SBJV;PST;2;SG
64 | obliare obliante V.PTCP;PRS
65 | ripudiare ripudia V;POS;IMP;2;SG
66 | trasognare trasogneremo V;IND;FUT;1;PL
67 | concedere concedi V;POS;IMP;2;SG
68 | reimpiantare reimpianterete V;IND;FUT;2;PL
69 | scriversi ti scrivevi V;IND;PST;2;SG;IPFV
70 | ornare ornasse V;SBJV;PST;3;SG
71 | nuclearizzare nuclearizzi V;IND;PRS;2;SG
72 | ibridare ibriderebbe V;COND;3;SG
73 | mansuefarsi vi mansuefacciate V;SBJV;PRS;2;PL
74 | testificare testificassimo V;SBJV;PST;1;PL
75 | addebbiare addebbierete V;IND;FUT;2;PL
76 | smaliziare smalizierà V;IND;FUT;3;SG
77 | crucciarsi mi crucciavo V;IND;PST;1;SG;IPFV
78 | trivellare trivellerai V;IND;FUT;2;SG
79 | smembrare smembra V;POS;IMP;2;SG
80 | carezzarsi vi carezziate V;SBJV;PRS;2;PL
81 | illanguidire illanguidì V;IND;PST;3;SG;PFV
82 | storcere storcente V.PTCP;PRS
83 | depolverizzare depolverizzammo V;IND;PST;1;PL;PFV
84 | refrangere refranga V;POS;IMP;3;SG
85 | interpretare interpreterai V;IND;FUT;2;SG
86 | corteare corteavi V;IND;PST;2;SG;IPFV
87 | addimesticare addimesticavano V;IND;PST;3;PL;IPFV
88 | attendere attendeva V;IND;PST;3;SG;IPFV
89 | ritorcere ritorcemmo V;IND;PST;1;PL;PFV
90 | impuzzolentire impuzzolentiscano V;POS;IMP;3;PL
91 | rombare rombasse V;SBJV;PST;3;SG
92 | intervistare intervistiate V;SBJV;PRS;2;PL
93 | caracollare caracolli V;POS;IMP;3;SG
94 | uccidere uccidereste V;COND;2;PL
95 | apporre apponga V;SBJV;PRS;2;SG
96 | transitare transitassimo V;SBJV;PST;1;PL
97 | risospendere risospesero V;IND;PST;3;PL;PFV
98 | sostantivare sostantiviamo V;POS;IMP;1;PL
99 | subirsi subisciti V;POS;IMP;2;SG
100 | appendersi mi appendo V;IND;PRS;1;SG
101 |
--------------------------------------------------------------------------------
/data/middle-french-train-low:
--------------------------------------------------------------------------------
1 | inquieter inquieta V;IND;PST;3;SG;PFV
2 | descharger deschargeons V;IND;PRS;1;PL
3 | racompter racompterent V;IND;PST;3;PL;PFV
4 | fascher faschastes V;IND;PST;2;PL;PFV
5 | accompaigner accompaigneras V;2;SG;IND;FUT
6 | baptizer baptizyez V;IND;PST;2;PL;IPFV;LGSPEC2
7 | expuyser expuyserez V;2;PL;IND;FUT
8 | destabler destablasmes V;IND;PST;1;PL;PFV
9 | brusler bruslé V.PTCP;PST
10 | quester questoyt V;IND;PST;3;SG;IPFV;LGSPEC2
11 | houssepillier houssepillieroient V;3;PL;COND;LGSPEC1
12 | destourner destournas V;IND;PST;2;SG;PFV
13 | superseder supersedastes V;IND;PST;2;PL;PFV
14 | marier mariassions V;1;PL;SBJV;PST;IPFV
15 | explorer explorerions V;1;PL;COND;LGSPEC1
16 | practicquer practicquant V.PTCP;PRS
17 | frapper frappes V;IND;PRS;2;SG
18 | eslancer eslanças V;IND;PST;2;SG;PFV
19 | aracher araché V.PTCP;PST
20 | porter portons V;1;PL;SBJV;PRS
21 | eslancer eslançoyt V;IND;PST;3;SG;IPFV;LGSPEC2
22 | aguyser aguyserai V;1;SG;IND;FUT;LGSPEC1
23 | finir finissions V;1;PL;SBJV;PST;IPFV
24 | desbarater desbaratois V;IND;PST;1;SG;IPFV;LGSPEC1
25 | dampner dampneras V;2;SG;IND;FUT
26 | soucyer soucyoys V;IND;PST;2;SG;IPFV;LGSPEC2
27 | trancher tranchois V;IND;PST;2;SG;IPFV;LGSPEC1
28 | desieuner desieunasmes V;IND;PST;1;PL;PFV
29 | garder gardassent V;3;PL;SBJV;PST;IPFV
30 | engendrer engendrerons V;1;PL;IND;FUT
31 | importuner importuneront V;3;PL;IND;FUT
32 | toucher touchez V;2;PL;SBJV;PRS
33 | cauterizer cauterizoyt V;IND;PST;3;SG;IPFV;LGSPEC2
34 | lever leva V;IND;PST;3;SG;PFV
35 | seiourner seiournons V;IND;PRS;1;PL
36 | preferer preferois V;IND;PST;1;SG;IPFV;LGSPEC1
37 | pourchasser pourchassent V;3;PL;SBJV;PRS
38 | delaisser delaissoys V;IND;PST;1;SG;IPFV;LGSPEC2
39 | resider residassions V;1;PL;SBJV;PST;IPFV
40 | herrier herrierez V;2;PL;IND;FUT
41 | cercher cercheroit V;3;SG;COND;LGSPEC1
42 | honnourer honnoure V;IND;PRS;3;SG
43 | delaisser delaisseroyent V;3;PL;COND;LGSPEC2
44 | alonger alongerois V;2;SG;COND;LGSPEC1
45 | chanter chanteryons V;1;PL;COND;LGSPEC2
46 | baptizer baptizons V;1;PL;IMP
47 | mulcter mulctons V;1;PL;SBJV;PRS
48 | estiquier estiquois V;IND;PST;2;SG;IPFV;LGSPEC1
49 | menasser menasseroyt V;3;SG;COND;LGSPEC2
50 | accompaigner accompaigne V;1;SG;SBJV;PRS
51 | desbaucher desbaucheryez V;2;PL;COND;LGSPEC2
52 | mucer muçoient V;IND;PST;3;PL;IPFV;LGSPEC1
53 | laisser laissassiez V;2;PL;SBJV;PST;IPFV
54 | ayder en aydant V.CVB;PRS
55 | esveigler esveigloient V;IND;PST;3;PL;IPFV;LGSPEC1
56 | baisler baisle V;1;SG;SBJV;PRS
57 | tyrer tyrois V;IND;PST;1;SG;IPFV;LGSPEC1
58 | couster cousteroient V;3;PL;COND;LGSPEC1
59 | desadvouer desadvouastes V;IND;PST;2;PL;PFV
60 | confesser confessoys V;IND;PST;2;SG;IPFV;LGSPEC2
61 | excuser excuserois V;1;SG;COND;LGSPEC1
62 | provocquer provocquez V;IND;PRS;2;PL
63 | saluer salues V;IND;PRS;2;SG
64 | arroier arroié V.PTCP;PST
65 | asseurer asseuroit V;IND;PST;3;SG;IPFV;LGSPEC1
66 | affier affieryez V;2;PL;COND;LGSPEC2
67 | obliger obligeasse V;1;SG;SBJV;PST;IPFV
68 | regretter regretterent V;IND;PST;3;PL;PFV
69 | desnouer desnouons V;1;PL;SBJV;PRS
70 | protester protestent V;IND;PRS;3;PL
71 | noter notois V;IND;PST;1;SG;IPFV;LGSPEC1
72 | manouvrer manouvrent V;IND;PRS;3;PL
73 | engendrer engendreriez V;2;PL;COND;LGSPEC1
74 | desplacer desplaciez V;IND;PST;2;PL;IPFV;LGSPEC1
75 | croniquier croniquiez V;2;PL;SBJV;PRS
76 | reietter reiettiez V;IND;PST;2;PL;IPFV;LGSPEC1
77 | communicquer communicqueront V;3;PL;IND;FUT
78 | resister resistoyent V;IND;PST;3;PL;IPFV;LGSPEC2
79 | estudier estudie V;2;SG;IMP
80 | dancer dancerez V;2;PL;IND;FUT
81 | croniquier croniquions V;1;PL;IMP
82 | gaigner gaignent V;IND;PRS;3;PL
83 | despuceller despucelloys V;IND;PST;2;SG;IPFV;LGSPEC2
84 | fraper frapa V;IND;PST;3;SG;PFV
85 | effectuer effectuasses V;2;SG;SBJV;PST;IPFV
86 | brusler brusleryons V;1;PL;COND;LGSPEC2
87 | encliner enclinoys V;IND;PST;1;SG;IPFV;LGSPEC2
88 | marcher marchiez V;IND;PST;2;PL;IPFV;LGSPEC1
89 | visiter visiterent V;IND;PST;3;PL;PFV
90 | soupper souppera V;3;SG;IND;FUT
91 | reguarder reguardoys V;IND;PST;2;SG;IPFV;LGSPEC2
92 | entituler entitulerai V;1;SG;IND;FUT;LGSPEC1
93 | eschapper eschappes V;IND;PRS;2;SG
94 | pourforcer pourforcez V;2;PL;SBJV;PRS
95 | plaider plaidez V;2;PL;IMP
96 | espoventer espoventast V;3;SG;SBJV;PST;IPFV
97 | praesider praesides V;2;SG;SBJV;PRS
98 | pourpenser pourpenses V;IND;PRS;2;SG
99 | noter notons V;1;PL;SBJV;PRS
100 | reiecter reiecterons V;1;PL;IND;FUT
101 |
--------------------------------------------------------------------------------
/data/old-french-train-low:
--------------------------------------------------------------------------------
1 | chevalchier chevalchast V;3;SG;SBJV;PST;IPFV
2 | chever cheve V;2;SG;IMP
3 | esbaneier esbaneiissiez V;2;PL;SBJV;PST;IPFV;LGSPEC3
4 | coarder coardeve V;IND;PST;1;SG;IPFV;LGSPEC4
5 | travailler travaillez V;IND;PRS;2;PL
6 | atirier atiroies V;IND;PST;2;SG;IPFV;LGSPEC1
7 | larmoiier larmoieves V;IND;PST;2;SG;IPFV;LGSPEC5
8 | nettoiier nettis V;2;SG;SBJV;PRS;LGSPEC1
9 | sivre sevroie V;1;SG;COND;LGSPEC1
10 | desfigurer desfigureriiez V;2;PL;COND;LGSPEC1
11 | desturber desturber V;NFIN
12 | descupler descupleras V;2;SG;IND;FUT
13 | trichier trichastes V;IND;PST;2;PL;PFV
14 | tirer tireriens V;1;PL;COND;LGSPEC2
15 | laier lai V;1;SG;SBJV;PRS
16 | barguigner barguignoe V;IND;PST;1;SG;IPFV;LGSPEC3
17 | desmembrer desmembras V;IND;PST;2;SG;PFV
18 | laiier laions V;1;PL;IMP
19 | estriver estrivereient V;3;PL;COND;LGSPEC2
20 | aduster adustes V;IND;PRS;2;SG
21 | preier prei V;1;SG;SBJV;PRS;LGSPEC2
22 | descloer descloeras V;2;SG;IND;FUT
23 | plungier plunjoie V;IND;PST;1;SG;IPFV;LGSPEC1
24 | doctriner doctrinassent V;3;PL;SBJV;PST;IPFV
25 | duner dunront V;3;PL;IND;FUT;LGSPEC1
26 | umilier umilïereit V;3;SG;COND;LGSPEC2
27 | plongier plongeriiez V;2;PL;COND;LGSPEC1
28 | chulchier chulcheriens V;1;PL;COND;LGSPEC2
29 | tumer tumeroie V;1;SG;COND;LGSPEC1
30 | governer governeie V;IND;PST;1;SG;IPFV;LGSPEC2
31 | alaitier alaiteriiez V;2;PL;COND;LGSPEC1
32 | abrier abriot V;IND;PST;3;SG;IPFV;LGSPEC3
33 | enfrener enfreneroiz V;2;PL;IND;FUT;LGSPEC1
34 | ganchir ganchiroient V;3;PL;COND;LGSPEC1
35 | purgier purgievent V;IND;PST;3;PL;IPFV;LGSPEC4
36 | manecier maneçoies V;IND;PST;2;SG;IPFV;LGSPEC1
37 | amesurer amesuroient V;IND;PST;3;PL;IPFV;LGSPEC1
38 | plaider plaidasse V;1;SG;SBJV;PST;IPFV
39 | lever levai V;IND;PST;1;SG;PFV
40 | alegier alejast V;3;SG;SBJV;PST;IPFV
41 | glotir glotirent V;IND;PST;3;PL;PFV
42 | pourparler pourparlereit V;3;SG;COND;LGSPEC2
43 | estandre estanz V;IND;PRS;2;SG
44 | luisir luiseit V;IND;PST;3;SG;IPFV;LGSPEC2
45 | rompre rompierent V;IND;PST;3;PL;PFV
46 | prepenser prepensent V;3;PL;SBJV;PRS
47 | croire creïssoiz V;2;PL;SBJV;PST;IPFV;LGSPEC4
48 | awaitier awaitasses V;2;SG;SBJV;PST;IPFV
49 | apaier apais V;2;SG;SBJV;PRS
50 | laver les V;2;SG;SBJV;PRS
51 | desafubler desafublereit V;3;SG;COND;LGSPEC2
52 | encreistre encreüsses V;2;SG;SBJV;PST;IPFV
53 | compleindre compleindroit V;3;SG;COND;LGSPEC1
54 | conseller consellerent V;IND;PST;3;PL;PFV
55 | contremander contremandiez V;IND;PST;2;PL;IPFV;LGSPEC2
56 | larmoiier larmïe V;IND;PRS;3;SG;LGSPEC1
57 | manier manïereie V;1;SG;COND;LGSPEC2
58 | despire despire V;NFIN
59 | engraignier engraigneront V;3;PL;IND;FUT
60 | prouver prouveriiez V;2;PL;COND;LGSPEC1
61 | encountrer encountrissiez V;2;PL;SBJV;PST;IPFV;LGSPEC3
62 | taster tastent V;3;PL;SBJV;PRS
63 | atraire atraiiiez V;IND;PST;2;PL;IPFV;LGSPEC1
64 | adober adobeies V;IND;PST;2;SG;IPFV;LGSPEC2
65 | reclamer reclameient V;IND;PST;3;PL;IPFV;LGSPEC2
66 | sunger sungereient V;3;PL;COND;LGSPEC2
67 | creindre creinsissent V;3;PL;SBJV;PST;IPFV;LGSPEC1
68 | deraisnier deraisnissons V;1;PL;SBJV;PST;IPFV;LGSPEC1
69 | cuntenir en cuntenant V.CVB;PRS
70 | ganchir ganchis V;IND;PST;2;SG;PFV
71 | refaire refaimes V;IND;PRS;1;PL;LGSPEC1
72 | harigoter harigotissoiz V;2;PL;SBJV;PST;IPFV;LGSPEC1
73 | ariver ariviez V;IND;PST;2;PL;IPFV;LGSPEC2
74 | asemler asemleves V;IND;PST;2;SG;IPFV;LGSPEC4
75 | detranprer detranprons V;IND;PRS;1;PL
76 | chastier chastïeroient V;3;PL;COND;LGSPEC1
77 | replorer repleure V;IND;PRS;3;SG
78 | drescer dresceriens V;1;PL;COND;LGSPEC2
79 | amender amendent V;3;PL;SBJV;PRS
80 | antrespargnier antrespargneriez V;2;PL;COND;LGSPEC2
81 | correspondre correspondrai V;1;SG;IND;FUT
82 | comprehender comprehendoie V;IND;PST;1;SG;IPFV;LGSPEC1
83 | desploiier desploiier V;NFIN
84 | garbeler garbele V;2;SG;IMP
85 | aparoir aparu V;IND;PST;3;SG;PFV
86 | destraindre destrainsissez V;2;PL;SBJV;PST;IPFV;LGSPEC2
87 | aparoler aparoleient V;IND;PST;3;PL;IPFV;LGSPEC2
88 | parbolir parbolissez V;2;PL;SBJV;PST;IPFV;LGSPEC2
89 | redoter redoteras V;2;SG;IND;FUT
90 | parbouillir parboulissez V;2;PL;SBJV;PST;IPFV;LGSPEC2
91 | paser pasastes V;IND;PST;2;PL;PFV
92 | ouster ousterons V;1;PL;IND;FUT
93 | removoir removroiz V;2;PL;IND;FUT;LGSPEC1
94 | estouper estoupent V;3;PL;SBJV;PRS
95 | enbuscier enbusçoies V;IND;PST;2;SG;IPFV;LGSPEC1
96 | manier manïevent V;IND;PST;3;PL;IPFV;LGSPEC4
97 | soner sonerez V;2;PL;IND;FUT;LGSPEC3
98 | refaire referiez V;2;PL;COND;LGSPEC2
99 | taxer taxe V;IND;PRS;3;SG
100 | espier espia V;IND;PST;3;SG;PFV
101 |
--------------------------------------------------------------------------------
/data/arabic-train-low:
--------------------------------------------------------------------------------
1 | مَجَلَّةٌ الْمَجَلَّتَيْنِ N;DU;DEF;GEN
2 | مُسْتَشْفًى مُسْتَشْفَيَاتِ N;PL;PSSD;GEN
3 | قَالَ يُقَلْنَ V;3;PL;FEM;LGSPEC1;PASS
4 | جِيلٌ جِيلَيْنِ N;DU;NDEF;ACC
5 | أَفْلَحَ أَفْلَحُوا V;3;PL;MASC;PST;PRF;IND;ACT
6 | كَالَمَ أُكَالَمُ V;1;SG;IPFV;IND;PASS
7 | الْأَجْوَد أَجْوَدُ ADJ;SG;MASC;NDEF;NOM
8 | اِنْتَفَضَ يَنْتَفِضُو V;3;PL;MASC;SBJV;ACT
9 | قَمْحَةٌ الْقَمْحَتَانِ N;DU;DEF;NOM
10 | كَتِفٌ كَتِف N;SG;NDEF;INFM
11 | اِرْتَجَى تُرْتَجَيَانِ V;2;DU;IPFV;IND;PASS
12 | مَطْعَمٌ مَطْعَمَ N;SG;PSSD;ACC
13 | هُلُوكٌ هُلُوكٍ N;SG;NDEF;GEN
14 | عَاكَسَ يُعَاكَسْنَ V;3;PL;FEM;IPFV;IND;PASS
15 | اِسْتَنْبَأَ تَسْتَنْبِئَا V;3;DU;FEM;LGSPEC1;ACT
16 | أَنْبَأَ يُنْبَأُ V;3;SG;MASC;IPFV;IND;PASS
17 | اِشْتَرَكَ تَشْتَرِكُونَ V;2;PL;MASC;IPFV;IND;ACT
18 | اِحْمَرَّ يَحْمَرِرْنَ V;3;PL;FEM;SBJV;ACT
19 | نَاوَى تُنَاوِيَ V;3;SG;FEM;SBJV;ACT
20 | الثَّانَوِي الثَّانَوِيَّيْنِ ADJ;DU;MASC;DEF;ACC
21 | اِنْتَهَى اِنْتَهَيْتُنَّ V;2;PL;FEM;PST;PRF;IND;ACT
22 | النَّظِيف النَّظِيف ADJ;SG;MASC;DEF;INFM
23 | نُقْصَانٌ النُّقْصَانِ N;SG;DEF;GEN
24 | ضَلَّ تُضَلُّ V;2;SG;MASC;IPFV;IND;PASS
25 | تَرْسِيمٌ تَرْسِيمَ N;SG;PSSD;ACC
26 | قُنْبُلَةٌ الْقُنْبُلَةُ N;SG;DEF;NOM
27 | كَنِيسٌ الْكَنِيسُ N;SG;DEF;NOM
28 | غَرَّ غُرَّ V;2;SG;MASC;IMP;ACT
29 | وَزِيرٌ وَزِيرَيْن N;DU;NDEF;INFM
30 | أَمْكَنَ أَمْكَنَا V;3;DU;MASC;PST;PRF;IND;ACT
31 | اِسْتِضَاءَةٌ اِسْتِضَاءَةً N;SG;NDEF;ACC
32 | نِظَامُ تَشْغِيلٍ نِظَامِ تَشْغِيلِ N;SG;PSSD;GEN
33 | اِسْتَذْنَبَ يَسْتَذْنِبَا V;3;DU;MASC;LGSPEC1;ACT
34 | اِفْتَعَلَ تَفْتَعِلُونَ V;2;PL;MASC;IPFV;IND;ACT
35 | اِبْتَسَمَ يُبْتَسَمَ V;3;SG;MASC;SBJV;PASS
36 | خُلُودٌ خُلُودُ N;SG;PSSD;NOM
37 | اِسْتَنْفَرَ نَسْتَنْفِرُ V;1;PL;IPFV;IND;ACT
38 | تَمْتَمَةٌ تَمْتَمَةِ N;SG;PSSD;GEN
39 | السَّادِس السَّادِسَيْن ADJ;DU;MASC;DEF;INFM
40 | شَلَّالٌ شَلَّالَاتٍ N;PL;NDEF;ACC
41 | كَارِثَةٌ الْكَارِثَتَيْنِ N;DU;DEF;ACC
42 | اِنْفِصَالٌ اِنْفِصَالِ N;SG;PSSD;GEN
43 | اِحْتَفَلَ اِحْتَفَلْتَ V;2;SG;MASC;PST;PRF;IND;ACT
44 | اِحْقَوْقَفَ اِحْقَوْقَفْتُمْ V;2;PL;MASC;PST;PRF;IND;ACT
45 | عَذَّبَ يُعَذِّبَانِ V;3;DU;MASC;IPFV;IND;ACT
46 | الطَّنَّان طَنَّانَاتٍ ADJ;PL;FEM;NDEF;GEN
47 | أُمُّ قِرَاءَةٍ أُمَّهَاتِ قِرَاءَةِ N;PL;PSSD;ACC
48 | الصَّدِئ الصَّدِئَتَانِ ADJ;DU;FEM;DEF;NOM
49 | قُونَةٌ الْقُونَتَيْن N;DU;DEF;INFM
50 | نِمْسَاوِيَّةٌ نِمْسَاوِيَّتَيْن N;DU;NDEF;INFM
51 | حِرْفَةٌ الْحِرْفَةُ N;SG;DEF;NOM
52 | إِبْصَارٌ إِبْصَار N;SG;PSSD;INFM
53 | الْأَخِير أَخِير ADJ;SG;MASC;NDEF;INFM
54 | صَوَّبَ صُوِّبْتَ V;2;SG;MASC;PST;PRF;IND;PASS
55 | طَهَّرَ طُهِّرْنَ V;3;PL;FEM;PST;PRF;IND;PASS
56 | مِرْصَادٌ مَرَاصِيد N;PL;NDEF;INFM
57 | رَغِبَ نَرْغَبَ V;1;PL;SBJV;ACT
58 | سَاخِرَةٌ السَّاخِرَةِ N;SG;DEF;GEN
59 | كَاتِدْرَائِيَّةٌ كَاتِدْرَائِيَّتَا N;DU;PSSD;NOM
60 | اِمْتَازَ يَمْتَازَا V;3;DU;MASC;SBJV;ACT
61 | اِسْتَعْمَلَ يَسْتَعْمِلَ V;3;SG;MASC;SBJV;ACT
62 | اِعْتِيَادٌ اِعْتِيَادَاتِ N;PL;PSSD;GEN
63 | دُفُوقٌ دُفُوقٌ N;SG;NDEF;NOM
64 | أَضَاعَ تُضِيعَ V;2;SG;MASC;SBJV;ACT
65 | حَرَّرَ أُحَرِّرُ V;1;SG;IPFV;IND;ACT
66 | الْقُرَيْشِي الْقُرَيْشِيَّات ADJ;PL;FEM;DEF;INFM
67 | سَمَّى يُسَمَّيْنَ V;3;PL;FEM;LGSPEC1;PASS
68 | قُونَةٌ قُونَةٌ N;SG;NDEF;NOM
69 | الْعُمَانِي الْعُمَانِيَّتَيْنِ ADJ;DU;FEM;DEF;GEN
70 | أُسْتُرَالِيَّةٌ أُسْتُرَالِيَّةٌ N;SG;NDEF;NOM
71 | تَكْرِيهٌ تَكْرِيهًا N;SG;NDEF;ACC
72 | شَيْطَانٌ شَيْطَانٍ N;SG;NDEF;GEN
73 | الْمُمْكِن الْمُمْكِنَ ADJ;SG;MASC;DEF;ACC
74 | اِخْرَنْمَسَ نَخْرَنْمِسَ V;1;PL;SBJV;ACT
75 | صَحَّى تُصَحَّيْنَ V;2;PL;FEM;SBJV;PASS
76 | اِنْضَمَّ تَنْضَمُّو V;2;PL;MASC;LGSPEC1;ACT
77 | الْمُنْفَرِد الْمُنْفَرِدَاتُ ADJ;PL;FEM;DEF;NOM
78 | زُوَارَةٌ زُوَارَة N;SG;NDEF;INFM
79 | الزَّاهِر زَاهِرَيْنِ ADJ;DU;MASC;NDEF;ACC
80 | قَمْحٌ الْقَمْحِ N;SG;DEF;GEN
81 | الْمَلَائِكِي مَلَائِكِيِّينَ ADJ;PL;MASC;NDEF;ACC
82 | الْعِمْلَاقِي الْعِمْلَاقِيَّةِ ADJ;SG;FEM;DEF;GEN
83 | مُحِبٌّ مُحِبَّيْ N;DU;PSSD;INFM
84 | جِلْدٌ جِلْدَيْن N;DU;NDEF;INFM
85 | مُخٌّ الْمُخَّيْنِ N;DU;DEF;ACC
86 | أَنْصَفَ تُنْصِفْنَ V;2;PL;FEM;LGSPEC1;ACT
87 | مُثَابَرَةٌ مُثَابَرَةٍ N;SG;NDEF;GEN
88 | اِبْلَاجَّ تَبْلَاجُّونَ V;2;PL;MASC;IPFV;IND;ACT
89 | عَاقَبَ تُعَاقِبَا V;3;DU;FEM;LGSPEC1;ACT
90 | بُلْبُلٌ الْبُلْبُلَ N;SG;DEF;ACC
91 | اِبْتِعَادٌ الِابْتِعَاد N;SG;DEF;INFM
92 | نَمَا تَنْمُوَ V;2;SG;MASC;SBJV;ACT
93 | شِمْبَانْزِي الشِّمْبَانْزِي N;SG;DEF;ACC
94 | اِسْتَعْرَفَ تَسْتَعْرِفُ V;3;SG;FEM;IPFV;IND;ACT
95 | اِحْمَارَّ نَحْمَارَّ V;1;PL;LGSPEC1;ACT
96 | اِسْتِئْذَانٌ الِاسْتِئْذَانُ N;SG;DEF;NOM
97 | خُصْيَةٌ خُصًى N;PL;NDEF;GEN
98 | مَطَارٌ مَطَارَات N;PL;PSSD;INFM
99 | اِسْتَوْلَى يَسْتَوْلِي V;3;SG;MASC;IPFV;IND;ACT
100 | اِسْتَبْدَلَ اِسْتَبْدَلَتْ V;3;SG;FEM;PST;PRF;IND;ACT
101 |
--------------------------------------------------------------------------------
/hyperparameter_search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import statistics
3 |
4 | TRAIN = "train"
5 | DEV = "dev"
6 | # TEST = "test"
7 | TEST = "dev"# For now, use dev set as test
8 | LANGUAGES = ["english", "french", "irish", "italian", "spanish"]
9 | RESOURCES = ["low"]#"medium"]
10 | MODEL_TYPE = ["transformer"]#, "pointer_generator"]
11 | EPOCHS_PER_RESOURCE = {"low": 800, "medium": 400}
12 | BATCH_SIZE_PER_RESOURCE = {"low": 64, "medium": 128}
13 | EVAL_EVERY = 25
14 |
15 | EMBEDDING_DIMS = [64, 128, 256]
16 | FCN_HIDDEN_DIMS = [64, 128, 256]
17 | NUM_HEADS = [4, 8]
18 | NUM_LAYERS = [2, 3]
19 | # DROPOUT = [0.2, 0.3]
20 | for model in MODEL_TYPE:
21 | for embed_dim in EMBEDDING_DIMS:
22 | for fcn_dim in FCN_HIDDEN_DIMS:
23 | for num_heads in NUM_HEADS:
24 | for num_layers in NUM_LAYERS:
25 | # skip these
26 | if (embed_dim, fcn_dim) == (64, 64):
27 | continue
28 | print(f"embed_dim: {embed_dim}, fcn_dim: {fcn_dim}, num-heads: {num_heads}, num-layers: {num_layers}")
29 | accuracies = []
30 | hyper_folder = f"embed_dim-{embed_dim}-fcn_dim-{fcn_dim}-heads-{num_heads}-layers-{num_layers}"
31 | for resource in RESOURCES:
32 | for language in LANGUAGES:
33 | print(f"{resource} - {language}")
34 | # Get epoch and batch size
35 | epochs = EPOCHS_PER_RESOURCE[resource]
36 | batch_size = BATCH_SIZE_PER_RESOURCE[resource]
37 | # Set names of relevant files and directories
38 | train_file = f"{language}-{TRAIN}-{resource}"
39 | valid_file = f"{language}-{DEV}"
40 | test_file = f"{language}-{TEST}"
41 | covered_test_file = f"{language}-covered-{TEST}"
42 | pred_file = f"{language}-{resource}-{TEST}-pred"
43 | vocab_file = f"{train_file}-vocab"
44 |
45 | data_folder = "data"
46 | vocab_folder = f"vocab/{language}/{resource}"
47 | checkpoints_folder = f"model-checkpoints-test/{model}/{hyper_folder}/{language}/{resource}"
48 | pred_folder = f"predictions-test/{model}/{hyper_folder}"
49 | logs_folder = f"logs-test/{hyper_folder}"
50 |
51 | # create necessary folders, if they do not exist already
52 | if not os.path.exists(vocab_folder):
53 | os.makedirs(vocab_folder)
54 | if not os.path.exists(checkpoints_folder):
55 | os.makedirs(checkpoints_folder)
56 | if not os.path.exists(pred_folder):
57 | os.makedirs(pred_folder)
58 | if not os.path.exists(logs_folder):
59 | os.makedirs(logs_folder)
60 |
61 | # Create vocabulary
62 | # print(f"python vocabulary.py --src {data_folder}/{train_file} --vocab {data_folder}/{vocab_file}")
63 | # Train model
64 | os.system(f"python train.py --arch {model} --epochs {epochs} --batch-size {batch_size} --eval-every {EVAL_EVERY} " +
65 | f"--embed-dim {embed_dim} --fcn-dim {fcn_dim} --num-heads {num_heads} --num-layers {num_layers} " +
66 | f"--train {data_folder}/{train_file} --dev {data_folder}/{valid_file} " +
67 | f"--vocab {vocab_folder}/{vocab_file} --checkpoints-folder {checkpoints_folder}" +
68 | f" > {logs_folder}/train-log-{model}-{resource}-{language}.out")
69 | # Generate predictions for test set
70 | os.system(f"python generate.py --model-checkpoint {checkpoints_folder}/model_best.pth " +
71 | f"--test {data_folder}/{covered_test_file} --vocab {vocab_folder}/{vocab_file} " +
72 | f"--pred {pred_folder}/{pred_file}")
73 | # Evaluate accuracy of prediction file compared to true test set
74 | accuracies.append(os.system(f"python evaluate.py --pred {pred_folder}/{pred_file} " +
75 | f"--target {data_folder}/{test_file}"))
76 | print(f"average accuracy: {statistics.mean(accuracies):.4f}")
--------------------------------------------------------------------------------
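Note for the grid-search loop above: os.system returns the command's exit status, so the accuracies list collects return codes rather than the values evaluate.py reports. A minimal sketch of an alternative that captures stdout instead; the parsing is an assumption, since evaluate.py's output format is not shown in this listing:

import subprocess

def eval_accuracy(pred_path, target_path):
    # Assumption: evaluate.py prints the accuracy as the last whitespace-separated
    # token of its output; adjust the parsing to match its real format.
    result = subprocess.run(
        ["python", "evaluate.py", "--pred", pred_path, "--target", target_path],
        capture_output=True, text=True, check=True)
    return float(result.stdout.split()[-1])

# e.g. accuracies.append(eval_accuracy(f"{pred_folder}/{pred_file}", f"{data_folder}/{test_file}"))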
/tokenizer.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | import torch
4 | PAD_ID = 0
5 | SOS_ID = 1
6 | EOS_ID = 2
7 | UNK_ID = 3
8 |
9 |
10 | def load_vocab(vocab_file_path, sos, eos, pad, unk):
11 | """ Loads vocabulary from vocabulary file (create by vocabulary.py)"""
12 | vocab_file = open(vocab_file_path, "r+", encoding='utf-8')
13 | lines = vocab_file.readlines()
14 |
15 | vocab = collections.OrderedDict()
16 | # First, add special signs for sos, eos and pad to vocabulary
17 | vocab[pad] = PAD_ID
18 | vocab[sos] = SOS_ID
19 | vocab[eos] = EOS_ID
20 | vocab[unk] = UNK_ID
21 | # For each valid line, get the token and the line index
22 | for index, line in enumerate(lines):
23 | if line != "\n":
24 | token, count = line.replace("\n", "").split("\t")
25 | vocab[token] = index + 4  # the first four ids are reserved for the special symbols
26 |
27 | return vocab
28 |
29 |
30 | def convert_by_vocab(vocab, items, unk_val):
31 | """Converts a sequence of tokens or ids using the given vocabulary.
32 | If token is not in vocabulary, unk_val is returned as default. """
33 | output = []
34 | for item in items:
35 | output.append(vocab.get(item, unk_val))
36 | return output
37 |
38 |
39 | def tokenize_words(word_list):
40 | """Split words to lists of characters"""
41 | return [list(words) for words in word_list]
42 |
43 |
44 | def tokenize_features(features_list):
45 | """Splits features by the separator sign ";" """
46 | return [connected_features.split(";") for connected_features in features_list]
47 |
48 |
49 | class Tokenizer(object):
50 | """ Tokenizer object. Handles tokenizing sentences, converting tokens to ids and vice versa"""
51 |
52 | def __init__(self, src_vocab_file_path, tgt_vocab_file_path, device):
53 | self.sos = ""
54 | self.eos = ""
55 | self.pad = ""
56 | self.unk = ""
57 | self.pad_id = PAD_ID
58 | self.sos_id = SOS_ID
59 | self.eos_id = EOS_ID
60 | self.unk_id = UNK_ID
61 |
62 | self.device = device
63 |
64 | self.src_vocab = load_vocab(src_vocab_file_path, self.sos,
65 | self.eos, self.pad, self.unk) # vocabulary of all token->id in the input
66 | self.inv_src_vocab = {v: k for k, v in self.src_vocab.items()} # reverse vocabulary of input, id->token
67 | self.src_vocab_size = len(self.src_vocab)
68 | self.tgt_vocab = load_vocab(tgt_vocab_file_path, self.sos,
69 | self.eos, self.pad, self.unk) # vocabulary of all token->id in the output
70 | self.inv_tgt_vocab = {v: k for k, v in self.tgt_vocab.items()} # reverse vocabulary of output, id->token
71 | self.tgt_vocab_size = len(self.tgt_vocab)
72 |
73 | self.src_to_tgt_vocab_conversion_matrix = self.get_src_to_tgt_vocab_conversion_matrix()
74 |
75 | def add_sequence_symbols(self, tokens_list):
76 | """ Adds eos and sos symbols to each sequence of tokens"""
77 | return [[self.sos] + tokens + [self.eos] for tokens in tokens_list]
78 |
79 | def convert_src_tokens_to_ids(self, tokens):
80 | """ Converts all given tokens to ids using the input vocabulary"""
81 | return convert_by_vocab(self.src_vocab, tokens, self.unk_id)
82 |
83 | def convert_src_ids_to_tokens(self, ids):
84 | """ Converts all given ids to tokens using the input vocabulary"""
85 | return convert_by_vocab(self.inv_src_vocab, ids, self.unk)
86 |
87 | def convert_tgt_tokens_to_ids(self, tokens):
88 | """ Converts all given tokens to the ids using the output vocabulary"""
89 | return convert_by_vocab(self.tgt_vocab, tokens, self.unk_id)
90 |
91 | def convert_tgt_ids_to_tokens(self, ids):
92 | """ Converts all given tokens to the ids using the output vocabulary"""
93 | return convert_by_vocab(self.inv_tgt_vocab, ids, self.unk)
94 |
95 | def get_id_tensors(self, tokens_list, vocab_type):
96 | """ Gets list of token sequences, and converts each token sequence to tensor of ids, using the tokenizer
97 | device to determine tensor device type, and vocab type is either "INPUT" or "OUTPUT" """
98 | if vocab_type == "INPUT":
99 | return [torch.tensor(self.convert_src_tokens_to_ids(tokens), dtype=torch.long, device=self.device)
100 | for tokens in tokens_list]
101 | else:
102 | return [torch.tensor(self.convert_tgt_tokens_to_ids(tokens), dtype=torch.long, device=self.device)
103 | for tokens in tokens_list]
104 |
105 | def pad_tokens_sequence(self, tokens, max_seq_len):
106 | """ Pads the token sequence with pad symbols until it reaches the max sequence length.
107 | If the sequence is already at max length, nothing is added. """
108 | padding_len = max_seq_len - len(tokens)
109 | padding = [self.pad] * padding_len
110 | return tokens + padding
111 |
112 | def get_src_to_tgt_vocab_conversion_matrix(self):
113 | # Initialize conversion matrix
114 | src_to_tgt_conversion_matrix = torch.zeros(self.src_vocab_size, self.tgt_vocab_size, device=self.device)
115 | src_vocab_items = self.src_vocab.items()
116 | # Go over all (token, id) items in src vocab
117 | for src_token, src_id in src_vocab_items:
118 | tgt_id = self.tgt_vocab.get(src_token, self.unk_id)
119 | src_to_tgt_conversion_matrix[src_id][tgt_id] = 1
120 | return src_to_tgt_conversion_matrix
121 |
122 |
--------------------------------------------------------------------------------
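A minimal sketch of load_vocab and convert_by_vocab from tokenizer.py above, run in isolation. The vocabulary file and tokens below are made up; the tab-separated "token<TAB>count" format follows the parsing code, and the angle-bracket special symbols are placeholders (the strings the Tokenizer class actually uses are not shown here):

import tempfile

from tokenizer import load_vocab, convert_by_vocab, UNK_ID

# Write a tiny tab-separated "token<TAB>count" vocabulary file (hypothetical tokens).
with tempfile.NamedTemporaryFile("w", suffix="-vocab", delete=False, encoding="utf-8") as f:
    for token, count in [("a", 12), ("b", 7), ("V;NFIN", 3)]:
        f.write(f"{token}\t{count}\n")
    vocab_path = f.name

# Ids 0-3 are reserved for pad/sos/eos/unk; file tokens start at id 4.
vocab = load_vocab(vocab_path, sos="<s>", eos="</s>", pad="<pad>", unk="<unk>")
print(convert_by_vocab(vocab, ["a", "V;NFIN", "zzz"], UNK_ID))  # -> [4, 6, 3]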
/decoder.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.nn.modules import TransformerDecoderLayer
6 | from torch.nn.modules.activation import MultiheadAttention
7 | from torch.nn import Linear, Dropout
8 | from torch.nn import LayerNorm
9 | from torch.nn.modules.transformer import _get_activation_fn, _get_clones
10 |
11 |
12 | class TransformerDecoder(nn.Module):
13 | r"""TransformerDecoder is a stack of N decoder layers
14 |
15 | Args:
16 | decoder_layer: an instance of the TransformerDecoderLayer() class (required).
17 | decoder_final_layer: an instance of the TransformerDecoderFinalLayer() class (required).
18 | num_layers: the number of sub-decoder-layers in the decoder (required).
19 | norm: the layer normalization component (optional).
20 |
21 | Examples::
22 | decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
23 | transformer_decoder = TransformerDecoder(decoder_layer, TransformerDecoderFinalLayer(d_model=512, nhead=8), num_layers=6)
24 | memory = torch.rand(10, 32, 512)
25 | tgt = torch.rand(20, 32, 512)
26 | out, attention_weights = transformer_decoder(tgt, memory)
27 | """
28 |
29 | def __init__(self, decoder_layer, decoder_final_layer, num_layers, norm=None):
30 | super(TransformerDecoder, self).__init__()
31 | self.layers = _get_clones(decoder_layer, num_layers - 1)
32 | self.final_layer = decoder_final_layer
33 | self.num_layers = num_layers
34 | self.norm = norm
35 |
36 | def forward(self, tgt, memory, tgt_mask=None,
37 | memory_mask=None, tgt_key_padding_mask=None,
38 | memory_key_padding_mask=None):
39 | r"""Pass the inputs (and mask) through the decoder layer in turn.
40 |
41 | Args:
42 | tgt: the sequence to the decoder (required).
43 | memory: the sequence from the last layer of the encoder (required).
44 | tgt_mask: the mask for the tgt sequence (optional).
45 | memory_mask: the mask for the memory sequence (optional).
46 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
47 | memory_key_padding_mask: the mask for the memory keys per batch (optional).
48 |
49 | Shape:
50 | see the docs in Transformer class.
51 | """
52 | output = tgt
53 | # Run through "normal" decoder layer
54 | for i in range(self.num_layers - 1):
55 | output = self.layers[i](output, memory, tgt_mask=tgt_mask,
56 | memory_mask=memory_mask,
57 | tgt_key_padding_mask=tgt_key_padding_mask,
58 | memory_key_padding_mask=memory_key_padding_mask)
59 | # Run through final decoder layer, which outputs the attention weights as well
60 | output, attention_weights = self.final_layer(output, memory, tgt_mask=tgt_mask,
61 | memory_mask=memory_mask,
62 | tgt_key_padding_mask=tgt_key_padding_mask,
63 | memory_key_padding_mask=memory_key_padding_mask)
64 |
65 | if self.norm:
66 | output = self.norm(output)
67 |
68 | return output, attention_weights
69 |
70 | class TransformerDecoderFinalLayer(nn.Module):
71 | r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
72 | Layer also output attention weights from the multi-head-attn, used for pointer-generator model.
73 | Args:
74 | d_model: the number of expected features in the input (required).
75 | nhead: the number of heads in the multiheadattention models (required).
76 | dim_feedforward: the dimension of the feedforward network model (default=2048).
77 | dropout: the dropout value (default=0.1).
78 | activation: the activation function of intermediate layer, relu or gelu (default=relu).
79 |
80 | Examples::
81 | decoder_layer = TransformerDecoderFinalLayer(d_model=512, nhead=8)
82 | memory = torch.rand(10, 32, 512)
83 | tgt = torch.rand(20, 32, 512)
84 | out, attention = decoder_layer(tgt, memory)
85 | """
86 |
87 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
88 | super(TransformerDecoderFinalLayer, self).__init__()
89 | self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
90 | self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
91 | # Implementation of Feedforward model
92 | self.linear1 = Linear(d_model, dim_feedforward)
93 | self.dropout = Dropout(dropout)
94 | self.linear2 = Linear(dim_feedforward, d_model)
95 |
96 | self.norm1 = LayerNorm(d_model)
97 | self.norm2 = LayerNorm(d_model)
98 | self.norm3 = LayerNorm(d_model)
99 | self.dropout1 = Dropout(dropout)
100 | self.dropout2 = Dropout(dropout)
101 | self.dropout3 = Dropout(dropout)
102 |
103 | self.activation = _get_activation_fn(activation)
104 |
105 | def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
106 | tgt_key_padding_mask=None, memory_key_padding_mask=None):
107 | r"""Pass the inputs (and mask) through the decoder layer.
108 |
109 | Args:
110 | tgt: the sequence to the decoder layer (required).
111 | memory: the sequence from the last layer of the encoder (required).
112 | tgt_mask: the mask for the tgt sequence (optional).
113 | memory_mask: the mask for the memory sequence (optional).
114 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
115 | memory_key_padding_mask: the mask for the memory keys per batch (optional).
116 |
117 | Shape:
118 | see the docs in Transformer class.
119 | """
120 | tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
121 | key_padding_mask=tgt_key_padding_mask)[0]
122 | tgt = tgt + self.dropout1(tgt2)
123 | tgt = self.norm1(tgt)
124 | # Model saves attention weights from multi-head-attn
125 | tgt2, attention_weights = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
126 | key_padding_mask=memory_key_padding_mask)
127 | tgt = tgt + self.dropout2(tgt2)
128 | tgt = self.norm2(tgt)
129 | # Feedforward block (uses F.relu directly, rather than self.activation, for backward compatibility)
130 | tgt2 = self.linear2(self.dropout(F.relu(self.linear1(tgt))))
131 | tgt = tgt + self.dropout3(tgt2)
132 | tgt = self.norm3(tgt)
133 | return tgt, attention_weights
--------------------------------------------------------------------------------
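A small usage sketch for the decoder stack above, mirroring the shapes in the docstring examples ((S, N, E) memory, (T, N, E) target); the dimensions are arbitrary:

import torch
from torch.nn.modules import TransformerDecoderLayer

from decoder import TransformerDecoder, TransformerDecoderFinalLayer

d_model, nhead = 64, 4
layer = TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=128)
final_layer = TransformerDecoderFinalLayer(d_model=d_model, nhead=nhead, dim_feedforward=128)
decoder = TransformerDecoder(layer, final_layer, num_layers=2)

memory = torch.rand(10, 32, d_model)  # (S, N, E): encoder hidden states
tgt = torch.rand(20, 32, d_model)     # (T, N, E): embedded target sequence
out, attn = decoder(tgt, memory)
print(out.shape, attn.shape)  # torch.Size([20, 32, 64]) torch.Size([32, 20, 10])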
/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import shutil
4 | import sys
5 | import time
6 | from datetime import timedelta
7 |
8 | import torch
9 | from torch.optim.lr_scheduler import LambdaLR
10 |
11 | import pointer_generator
12 | import transformer
13 |
14 |
15 | class WarmupInverseSquareRootSchedule(LambdaLR):
16 | """ Linear warmup and then inverse square root decay.
17 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
18 | Inverse square root decreases learning rate from 1. to 0. over remaining steps.
19 | """
20 | def __init__(self, optimizer, warmup_steps, last_epoch=-1):
21 | self.warmup_steps = warmup_steps
22 | self.decay_factor = warmup_steps**0.5
23 | super(WarmupInverseSquareRootSchedule,
24 | self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
25 |
26 | def lr_lambda(self, step):
27 | if step < self.warmup_steps:
28 | return float(step) / float(max(1, self.warmup_steps))
29 | return self.decay_factor * step**-0.5
30 |
31 |
32 | class LogFormatter():
33 | def __init__(self):
34 | self.start_time = time.time()
35 |
36 | def format(self, record):
37 | elapsed_seconds = round(record.created - self.start_time)
38 |
39 | prefix = "%s - %s - %s" % (record.levelname, time.strftime('%x %X'),
40 | timedelta(seconds=elapsed_seconds))
41 | message = record.getMessage()
42 | message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3))
43 | return "%s - %s" % (prefix, message) if message else ''
44 |
45 |
46 | def get_logger():
47 | '''
48 | create logger and output to file and stdout
49 | '''
50 | log_formatter = LogFormatter()
51 | logger = logging.getLogger()
52 | logger.setLevel(logging.INFO)
53 |
54 | stream = logging.StreamHandler(sys.stdout)
55 | stream.setFormatter(log_formatter)
56 | logger.addHandler(stream)
57 | return logger
58 |
59 | def maybe_mkdir(filename):
60 | '''
61 | maybe mkdir
62 | '''
63 | path = os.path.dirname(filename)
64 | if not os.path.isdir(path):
65 | try:
66 | os.makedirs(path)
67 | except FileExistsError:
68 | pass
69 |
70 | def save_checkpoint(model, epoch, optimizer, scheduler, val_accuracy, is_best, checkpoints_dir):
71 | """
72 | Save checkpoint of model at current epoch, if new best model, saves checkpoint as best.
73 | Saves state of model, epoch, optimizer and scheduler.
74 | """
75 | # Create the checkpoint folder if it does not exist
76 | if not os.path.exists(checkpoints_dir):
77 | os.makedirs(checkpoints_dir)
78 | checkpoint = {
79 | 'epoch': epoch,
80 | 'val_accuracy': val_accuracy,
81 | 'state_dict': model.state_dict(),
82 | 'optimizer': optimizer.state_dict(),
83 | 'scheduler': scheduler.state_dict()
84 | }
85 | model_path = f"{checkpoints_dir}/model_{epoch}.pth"
86 | best_model_path = f'{checkpoints_dir}/model_best.pth'
87 | torch.save(checkpoint, model_path)
88 | if is_best:
89 | shutil.copyfile(model_path, best_model_path)
90 |
91 | def load_checkpoint(model, optimizer, scheduler, checkpoint_path, logger):
92 | """
93 | Load checkpoint of model from checkpoint path. Used for training.
94 | Loads state of model, epoch, optimizer and scheduler.
95 | """
96 | if not os.path.exists(checkpoint_path):
97 | logger.info(f" Trying to resume training bot file {checkpoint_path} not exists,\n"
98 | f" starting training from scratch")
99 | return model, optimizer, scheduler, 0, -1.0
100 | logger.info(f"resume training, loading checkpoint from {checkpoint_path}")
101 | checkpoint = torch.load(checkpoint_path)
102 | try:
103 | model.load_state_dict(checkpoint['state_dict'])
104 | except:
105 | logger.debug("Model hyperparameters do not match loaded checkpoint")
106 | optimizer.load_state_dict(checkpoint['optimizer'])
107 | try:
108 | scheduler.load_state_dict(checkpoint['scheduler'])
109 | except:
110 | logger.debug("Scheduler does not match loaded checkpoint")
111 | start_epoch = checkpoint['epoch']
112 | val_accuracy = checkpoint['val_accuracy']
113 | return model, optimizer, scheduler, start_epoch, val_accuracy
114 |
115 | def load_model(model, checkpoint_path, logger):
116 | """
117 | Load checkpoint of model from checkpoint path. Used for generating prediction files.
118 | Only Loads state of model.
119 | """
120 | if not os.path.exists(checkpoint_path):
121 | logger.info(f" Trying to reload checkpoint from pretraining but file {checkpoint_path} not exists,\n"
122 | f" starting training from scratch")
123 | else:
124 | checkpoint = torch.load(checkpoint_path)
125 | state_dict = model.state_dict()
126 | state_dict.update(checkpoint['state_dict'])
127 | model.load_state_dict(state_dict)
128 | return model
129 |
130 | def build_model(arch, src_vocab_size, tgt_vocab_size, embedding_dim, fcn_hidden_dim,
131 | num_heads, num_layers, dropout, src_to_tgt_vocab_conversion_matrix):
132 | """
133 | Builds model.
134 | """
135 | model = transformer.Transformer(src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size,
136 | embedding_dim=embedding_dim,
137 | fcn_hidden_dim=fcn_hidden_dim, num_heads=num_heads, num_layers=num_layers,
138 | dropout=dropout) \
139 | if (arch == "transformer") \
140 | else \
141 | pointer_generator.PointerGeneratorTransformer(src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size,
142 | src_to_tgt_vocab_conversion_matrix=src_to_tgt_vocab_conversion_matrix,
143 | embedding_dim=embedding_dim,
144 | fcn_hidden_dim=fcn_hidden_dim, num_heads=num_heads,
145 | num_layers=num_layers,
146 | dropout=dropout)
147 | return model
148 |
149 |
150 | def clean_checkpoints_dir(checkpoints_dir):
151 | """
152 | Remove unnecessary model checkpoints (disk quota limit)
153 | """
154 | for filename in sorted(os.listdir(checkpoints_dir)):
155 | if os.path.isfile(os.path.join(checkpoints_dir, filename)) and ("best" not in filename):
156 | os.remove(os.path.join(checkpoints_dir, filename))
157 |
158 |
159 |
160 |
161 |
162 |
--------------------------------------------------------------------------------
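A short sketch of the WarmupInverseSquareRootSchedule defined above: the learning rate grows linearly to the base rate over warmup_steps and then decays as warmup_steps**0.5 * step**-0.5. The toy model and numbers are arbitrary, and the script assumes it is run from the repository root so the sibling modules import:

import torch

from utils import WarmupInverseSquareRootSchedule

dummy = torch.nn.Linear(4, 4)  # dummy parameters, stands in for the real model
optimizer = torch.optim.Adam(dummy.parameters(), lr=1e-3)
scheduler = WarmupInverseSquareRootSchedule(optimizer, warmup_steps=100)

for step in range(1, 401):
    optimizer.step()     # normally preceded by a forward/backward pass
    scheduler.step()
    if step in (50, 100, 400):
        print(step, scheduler.get_last_lr()[0])  # ~5e-4 at 50, 1e-3 at 100, 5e-4 at 400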
/transformer.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | from torch.nn.init import xavier_uniform_
4 | from torch.nn import TransformerEncoder, TransformerEncoderLayer
5 | from model_utils import _generate_square_subsequent_mask, Embedding, PositionalEncoding
6 | from tokenizer import PAD_ID
7 |
8 | class Transformer(nn.Module):
9 | def __init__(self, src_vocab_size=128, tgt_vocab_size=128,
10 | embedding_dim=128, fcn_hidden_dim=128,
11 | num_heads=4, num_layers=2, dropout=0.2):
12 | super(Transformer, self).__init__()
13 |
14 | self.embedding_dim = embedding_dim
15 | # Source and Encoder layers
16 | self.src_embed = Embedding(src_vocab_size, embedding_dim, padding_idx=PAD_ID)
17 | self.src_pos_encoder = PositionalEncoding(embedding_dim)
18 | encoder_layer = TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
19 | dim_feedforward=fcn_hidden_dim, dropout=dropout)
20 | encoder_norm = nn.LayerNorm(embedding_dim)
21 | self.encoder = TransformerEncoder(encoder_layer, num_layers, encoder_norm)
22 |
23 | # Target and Decoder layers
24 | self.tgt_embed = Embedding(tgt_vocab_size, embedding_dim, padding_idx=PAD_ID)
25 | self.tgt_pos_encoder = PositionalEncoding(embedding_dim)
26 | decoder_layer = nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=num_heads,
27 | dim_feedforward=fcn_hidden_dim, dropout=dropout)
28 | decoder_norm = nn.LayerNorm(embedding_dim)
29 | self.decoder = nn.TransformerDecoder(decoder_layer, num_layers, decoder_norm)
30 | # Final linear layer
31 | self.final_out = nn.Linear(embedding_dim, tgt_vocab_size)
32 |
33 | # Initialize masks
34 | self.src_mask = None
35 | self.tgt_mask = None
36 | self.mem_mask = None
37 | # Initialize weights of model
38 | self._reset_parameters()
39 |
40 | def _reset_parameters(self):
41 | """ Initiate parameters in the transformer model. """
42 | for p in self.parameters():
43 | if p.dim() > 1:
44 | xavier_uniform_(p)
45 |
46 |
47 | def encode(self, src, src_key_padding_mask=None):
48 | """
49 | Applies embedding, positional encoding and then runs the transformer encoder on the source
50 | :param src: source tokens batch
51 | :param src_key_padding_mask: source padding mask
52 | :return: memory- the encoder hidden states
53 | """
54 | # Source embedding and positional encoding, changes dimension (N, S) -> (N, S, E) -> (S, N, E)
55 | src_embed = self.src_embed(src).transpose(0, 1)
56 | src_embed = self.src_pos_encoder(src_embed)
57 | # Pass the source to the encoder
58 | memory = self.encoder(src_embed, mask=self.src_mask, src_key_padding_mask=src_key_padding_mask)
59 | return memory
60 |
61 | def decode(self, memory, tgt, tgt_key_padding_mask=None, memory_key_padding_mask=None, has_mask=True):
62 | """
63 | Applies embedding and positional encoding to the target, then runs the transformer decoder on the memory and target.
64 | Also creates the square subsequent mask used for teacher forcing.
65 | :param memory: The encoder hidden states
66 | :param tgt: Target tokens batch
67 | :param tgt_key_padding_mask: target padding mask
68 | :param memory_key_padding_mask: memory padding mask
69 | :param has_mask: Whether to use the square subsequent mask for teacher forcing
70 | :return: decoder output
71 | """
72 | # Create target mask for transformer if no appropriate one was created yet, created of size (T, T)
73 | if has_mask:
74 | if self.tgt_mask is None or self.tgt_mask.size(0) != tgt.size(1):
75 | self.tgt_mask = _generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
76 | else:
77 | self.tgt_mask = None
78 | # Target embedding and positional encoding, changes dimension (N, T) -> (N, T, E) -> (T, N, E)
79 | tgt_embed = self.tgt_embed(tgt).transpose(0, 1)
80 | tgt_embed = self.tgt_pos_encoder(tgt_embed)
81 | # Get output of decoder. Dimensions stay the same
82 | decoder_output = self.decoder(tgt_embed, memory, tgt_mask=self.tgt_mask, memory_mask=self.mem_mask,
83 | tgt_key_padding_mask=tgt_key_padding_mask,
84 | memory_key_padding_mask=memory_key_padding_mask)
85 | # Add linear layer & log softmax, (T, N, E) -> (T, N, tgt_vocab_size)
86 | output = F.log_softmax(self.final_out(decoder_output), dim=-1)
87 | # Change back batch and sequence dimensions, from (T, N, tgt_vocab_size) -> (N, T, tgt_vocab_size)
88 | return output.transpose(0, 1)
89 |
90 |
91 | def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None, has_mask=True):
92 | """Take in and process masked source/target sequences.
93 |
94 | Args:
95 | src: the sequence to the encoder (required).
96 | tgt: the sequence to the decoder (required).
97 | src_mask: the additive mask for the src sequence (optional).
98 | tgt_mask: the additive mask for the tgt sequence (optional).
99 | memory_mask: the additive mask for the encoder output (optional).
100 | src_key_padding_mask: the ByteTensor mask for src keys per batch (optional).
101 | tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional).
102 | memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional).
103 |
104 | Shape:
105 | - src: :math:`(S, N, E)`. Starts as (N, S) and changed after embedding
106 | - tgt: :math:`(T, N, E)`. Starts as (N, T) and changed after embedding
107 | - src_mask: :math:`(S, S)`.
108 | - tgt_mask: :math:`(T, T)`.
109 | - memory_mask: :math:`(T, S)`.
110 | - src_key_padding_mask: :math:`(N, S)`.
111 | - tgt_key_padding_mask: :math:`(N, T)`.
112 | - memory_key_padding_mask: :math:`(N, S)`.
113 |
114 | Note: [src/tgt/memory]_mask should be filled with
115 | float('-inf') for the masked positions and float(0.0) else. These masks
116 | ensure that predictions for position i depend only on the unmasked positions
117 | j and are applied identically for each sequence in a batch.
118 | [src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions
119 | that should be masked with float('-inf') and False values will be unchanged.
120 | This mask ensures that no information will be taken from position i if
121 | it is masked, and has a separate mask for each sequence in a batch.
122 |
123 | - output: :math:`(T, N, E)`.
124 |
125 | Note: Due to the multi-head attention architecture in the transformer model,
126 | the output sequence length of a transformer is same as the input sequence
127 | (i.e. target) length of the decoder.
128 |
129 | where S is the source sequence length, T is the target sequence length, N is the
130 | batch size, E is the feature number
131 |
132 | Examples:
133 | output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
134 | """
135 | # Applies embedding, positional encoding and the transformer encoder on the source
136 | memory = self.encode(src, src_key_padding_mask)
137 | # Applies embedding and positional encoding to the target, then runs the transformer decoder on the memory and target.
138 | output = self.decode(memory, tgt, tgt_key_padding_mask, memory_key_padding_mask, has_mask)
139 | return output
140 |
--------------------------------------------------------------------------------
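A minimal forward-pass sketch for the Transformer above with made-up token ids; padding masks are built the same way DataLoader.get_padding_mask in dataset.py does (True wherever a position equals PAD_ID). It assumes the repository's model_utils and tokenizer modules are importable:

import torch

from tokenizer import PAD_ID
from transformer import Transformer

model = Transformer(src_vocab_size=40, tgt_vocab_size=30,
                    embedding_dim=64, fcn_hidden_dim=128,
                    num_heads=4, num_layers=2, dropout=0.2)
model.eval()

src = torch.randint(4, 40, (8, 12))   # (N, S) source token ids
tgt = torch.randint(4, 30, (8, 9))    # (N, T) target token ids (shifted right)
src[:, -2:] = PAD_ID                  # pretend the last two source positions are padding

with torch.no_grad():
    out = model(src, tgt,
                src_key_padding_mask=(src == PAD_ID),
                tgt_key_padding_mask=(tgt == PAD_ID),
                memory_key_padding_mask=(src == PAD_ID))
print(out.shape)  # torch.Size([8, 9, 30]): log-probabilities over the target vocabulary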
/dataset.py:
--------------------------------------------------------------------------------
1 | from random import shuffle
2 | import torch
3 |
4 | import data
5 | # import tokenizer
6 |
7 |
8 | def get_train_dataset(train_file_path, tokenizer, max_src_seq_len=30, max_tgt_seq_len=25):
9 | """
10 | Reads input and output tokens from train set file, and converts tokens to tensors of ids using tokenizer.
11 | """
12 | # Read input and output tokens from dataset
13 | inputs_tokens, outputs_tokens = data.read_train_file_tokens(train_file_path)
14 | # Pad with sos and eos
15 | inputs_tokens = tokenizer.add_sequence_symbols(inputs_tokens)
16 | outputs_tokens = tokenizer.add_sequence_symbols(outputs_tokens)
17 | # Split target into two targets, for teacher forcing
18 | # -------------
19 | targets_tokens = [target_tokens[:-1] for target_tokens in outputs_tokens]
20 | # --------For Transformer model from SIGMORPHON 2020 Baseline----------
21 | # targets_tokens = outputs_tokens
22 | # -------------
23 | targets_y_tokens = [target_tokens[1:] for target_tokens in outputs_tokens]
24 |
25 | # Get lists of all input ids, target ids and target_y ids, where each sequence padded up to max length
26 | inputs_ids = [
27 | tokenizer.convert_src_tokens_to_ids(tokenizer.pad_tokens_sequence(input_tokens, max_src_seq_len))
28 | for input_tokens in inputs_tokens]
29 | targets_ids = [
30 | tokenizer.convert_tgt_tokens_to_ids(tokenizer.pad_tokens_sequence(target_tokens, max_tgt_seq_len))
31 | for target_tokens in targets_tokens]
32 | targets_y_ids = [
33 | tokenizer.convert_tgt_tokens_to_ids(tokenizer.pad_tokens_sequence(target_y_tokens, max_tgt_seq_len))
34 | for target_y_tokens in targets_y_tokens]
35 |
36 | return inputs_ids, targets_ids, targets_y_ids
37 |
38 |
39 | def get_valid_dataset(valid_file_path, tokenizer):
40 | """
41 | Reads input and output tokens from valid set file, and converts tokens to tensors of ids using tokenizer.
42 | """
43 | # Read input and output tokens from dataset
44 | inputs_tokens, outputs_tokens = data.read_train_file_tokens(valid_file_path)
45 | # Pad with sos and eos
46 | inputs_tokens = tokenizer.add_sequence_symbols(inputs_tokens)
47 | outputs_tokens = tokenizer.add_sequence_symbols(outputs_tokens)
48 | # Get tensors of all input ids and output ids
49 | inputs_ids = tokenizer.get_id_tensors(inputs_tokens, "INPUT")
50 | outputs_ids = tokenizer.get_id_tensors(outputs_tokens, "OUTPUT")
51 | return inputs_ids, outputs_ids
52 |
53 |
54 | def get_test_dataset(test_file_path, tokenizer):
55 | """
56 | Reads tokens from test set file, and converts tokens to tensors of ids using tokenizer.
57 | Returns the tokens as well, used for prediction.
58 | """
59 | # Read input tokens from dataset
60 | inputs_tokens = data.read_test_file_tokens(test_file_path)
61 | # Pad with sos and eos
62 | inputs_tokens = tokenizer.add_sequence_symbols(inputs_tokens)
63 | # Get tensors of input ids
64 | inputs_ids = tokenizer.get_id_tensors(inputs_tokens, "INPUT")
65 | return inputs_ids, inputs_tokens
66 |
67 | def shuffle_together(list1, list2, list3):
68 | """Shuffles two lists together"""
69 | zip_list = list(zip(list1, list2, list3))
70 | shuffle(zip_list)
71 | list1, list2, list3 = zip(*zip_list)
72 | return list1, list2, list3
73 |
74 | def split_to_batches(ids_list, device, batch_size=128):
75 | """ splits list of id sequence into batchs.
76 | Gets list of sequences (list of size seq_len)
77 | returns list of batchs, each batch is a tensor of size N x S (batch_size x seq_len)"""
78 | return [torch.tensor(ids_list[x:x + batch_size], dtype=torch.long, device=device) for x in
79 | range(0, len(ids_list), batch_size)]
80 |
81 |
82 | def get_batches(input_ids, target_ids, target_y_ids, device, batch_size=128):
83 | """ Gets entire dataset, shuffles the data, and splits it to batches.
84 | Each batch is a tensor of size N x S (batch_size x seq_len)."""
85 | # Shuffle together
86 | shuffled_input_ids, shuffled_target_ids, shuffled_target_y_ids = shuffle_together(input_ids, target_ids, target_y_ids)
87 | # split to batches
88 | input_ids_batches = split_to_batches(shuffled_input_ids, device, batch_size)
89 | target_ids_batches = split_to_batches(shuffled_target_ids, device, batch_size)
90 | target_y_ids_batches = split_to_batches(shuffled_target_y_ids, device, batch_size)
91 | return input_ids_batches, target_ids_batches, target_y_ids_batches
92 |
93 |
94 | class DataLoader(object):
95 | """ Contains all utilities for reading train/valid/test sets """
96 | def __init__(self, tokenizer, train_file_path=None, valid_file_path=None, test_file_path=None,
97 | device="cpu", batch_size=128, max_src_seq_len=30, max_tgt_seq_len=25):
98 | self.tokenizer = tokenizer
99 | self.device = device
100 | self.batch_size = batch_size
101 | self.max_src_seq_len = max_src_seq_len
102 | self.max_tgt_seq_len = max_tgt_seq_len
103 | # Read train file and get train set
104 | if train_file_path is not None:
105 | train_input_ids, train_target_ids, train_target_y_ids = get_train_dataset(train_file_path, tokenizer,
106 | self.max_src_seq_len, self.max_tgt_seq_len)
107 | self.train_input_ids = train_input_ids
108 | self.train_target_ids = train_target_ids
109 | self.train_target_y_ids = train_target_y_ids
110 | self.train_set_size = len(self.train_input_ids)
111 | else:
112 | self.train_input_ids = None
113 | self.train_target_ids = None
114 | self.train_target_y_ids = None
115 | self.train_set_size = 0
116 |
117 | if valid_file_path is not None:
118 | # Read validation file and get validation set, for checking loss using teacher forcing
119 | valid_input_ids_tf, valid_target_ids_tf, valid_target_y_ids_tf = get_train_dataset(valid_file_path, tokenizer,
120 | self.max_src_seq_len, self.max_tgt_seq_len)
121 | self.valid_input_ids_tf = valid_input_ids_tf
122 | self.valid_target_ids_tf = valid_target_ids_tf
123 | self.valid_target_y_ids_tf = valid_target_y_ids_tf
124 | # Read validation file and get validation set, for evaluation
125 | valid_input_ids, valid_target_ids = get_valid_dataset(valid_file_path, tokenizer)
126 | self.valid_input_ids = valid_input_ids
127 | self.valid_target_ids = valid_target_ids
128 | else:
129 | self.valid_input_ids_tf = None
130 | self.valid_target_ids_tf = None
131 | self.valid_target_y_ids_tf = None
132 | self.valid_input_ids = None
133 | self.valid_target_ids = None
134 |
135 | if test_file_path is not None:
136 | # Read test file and get test set
137 | test_input_ids = get_test_dataset(test_file_path, tokenizer)
138 | self.test_input_ids = test_input_ids
139 | else:
140 | self.test_input_ids = None
141 |
142 | def get_train_set(self):
143 | return get_batches(self.train_input_ids, self.train_target_ids, self.train_target_y_ids,
144 | self.device, batch_size=self.batch_size)
145 |
146 | def get_validation_set_tf(self):
147 | return get_batches(self.valid_input_ids_tf, self.valid_target_ids_tf, self.valid_target_y_ids_tf,
148 | self.device, batch_size=self.batch_size)
149 |
150 | def get_validation_set(self):
151 | return self.valid_input_ids, self.valid_target_ids
152 |
153 | def get_validation_set_len(self):
154 | return len(self.valid_input_ids)
155 |
156 | def get_test_set(self):
157 | return self.test_input_ids
158 |
159 | def get_test_set_len(self):
160 | return len(self.test_input_ids)
161 |
162 | def get_padding_mask(self, batch_tensor):
163 | """" Returns padding masks for given batch
164 | Padding masks are ByteTensor where True values are positions that are masked and False values are not.
165 | inputs are of size N x S (batch_size x seq_len)
166 | Returns masks of same size - N x S (batch_size x seq_len) """
167 | return batch_tensor == self.tokenizer.pad_id
168 |
169 | def get_padding_masks(self, source_batch, target_batch):
170 | """" Returns padding masks for source batch, memory batch, and target batch. """
171 | src_padding_mask = self.get_padding_mask(source_batch)
172 | mem_padding_mask = src_padding_mask
173 | target_padding_mask = self.get_padding_mask(target_batch)
174 | return src_padding_mask, mem_padding_mask, target_padding_mask
175 |
--------------------------------------------------------------------------------
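A toy sketch of the batching helpers above: three parallel lists of padded id sequences are shuffled together and split into N x S tensors. The ids and sizes are arbitrary, chosen only to show the shapes:

from dataset import get_batches

# Five aligned "sequences" per stream, already padded to a common length of 4.
input_ids    = [[1, 5, 6, 2], [1, 7, 2, 0], [1, 8, 9, 2], [1, 5, 5, 2], [1, 6, 2, 0]]
target_ids   = [[1, 5, 6, 0], [1, 7, 0, 0], [1, 8, 9, 0], [1, 5, 5, 0], [1, 6, 0, 0]]
target_y_ids = [[5, 6, 2, 0], [7, 2, 0, 0], [8, 9, 2, 0], [5, 5, 2, 0], [6, 2, 0, 0]]

in_b, tgt_b, tgt_y_b = get_batches(input_ids, target_ids, target_y_ids,
                                   device="cpu", batch_size=2)
print(len(in_b), in_b[0].shape)  # 3 torch.Size([2, 4]): batches of 2, 2 and 1 sequences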
/generate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from tqdm import tqdm
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | import dataset
8 | import data
9 | import tokenizer
10 |
11 | # Arguments
12 | import transformer
13 | import utils
14 |
15 | parser = argparse.ArgumentParser(description='Evaluating the transformer over test and validation sets')
16 | parser.add_argument('--model-checkpoint', type=str, default='checkpoints/model_best.pth',
17 | help="the model file to be evaluated. Usually is of the form model_X.pth (must include folder path)")
18 | parser.add_argument('--arch', type=str, default='transformer',
19 | help="Architecture type for model: transformer, pointer_generator")
20 | parser.add_argument('--embed-dim', type=int, default=128,
21 | help='Embedding dimension (default: 128)')
22 | parser.add_argument('--fcn-dim', type=int, default=256,
23 | help='Fully-connected network hidden dimension (default: 256)')
24 | parser.add_argument('--num-heads', type=int, default=4,
25 | help='number of attention heads (default: 4)')
26 | parser.add_argument('--num-layers', type=int, default=2,
27 | help='number of layers in encoder and decoder (default: 2)')
28 | parser.add_argument('--dropout', type=float, default=0.2,
29 | help='Dropout probability (default: 0.2)')
30 | parser.add_argument('--test', type=str, default='data',
31 | help="Test file of the dataset (must include folder path)")
32 | parser.add_argument('--vocab', type=str, default='data',
33 | help="Base name of vocabulary files (must include folder path)")
34 | parser.add_argument('--pred', type=str, default='pred',
35 | help="Name of output file containing predictions of the test set (must include folder path)")
36 | args = parser.parse_args()
37 |
38 | """ FILES AND TOKENIZER """
39 | # Get test and out file path
40 | test_file = args.test
41 | out_file = args.pred
42 | # Get vocabulary paths
43 | src_vocab_file = args.vocab + "-input"
44 | tgt_vocab_file = args.vocab + "-output"
45 |
46 | # Log all relevant files
47 | logger = utils.get_logger()
48 | logger.info(f"Model checkpoint: {args.model_checkpoint}")
49 | logger.info(f"Test file: {test_file}")
50 | logger.info(f"Input vocabulary file: {src_vocab_file}")
51 | logger.info(f"Output vocabulary file: {tgt_vocab_file}")
52 | logger.info(f"Prediction file: {out_file}")
53 |
54 | """ CONSTANTS """
55 | MAX_SRC_SEQ_LEN = 45
56 | MAX_TGT_SEQ_LEN = 45
57 |
58 | """ MODEL AND DATA LOADER """
59 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
60 | # Initialize Tokenizer object with input and output vocabulary files
61 | myTokenizer = tokenizer.Tokenizer(src_vocab_file, tgt_vocab_file, device=device)
62 | # Load model from checkpoint in evaluation mode
63 | model = utils.build_model(args.arch, myTokenizer.src_vocab_size, myTokenizer.tgt_vocab_size, args.embed_dim, args.fcn_dim,
64 | args.num_heads, args.num_layers, args.dropout, myTokenizer.src_to_tgt_vocab_conversion_matrix)
65 | model = utils.load_model(model, args.model_checkpoint, logger)
66 | model.to(device)
67 | model.eval()
68 | # Initialize DataLoader object
69 | data_loader = dataset.DataLoader(myTokenizer, train_file_path=None, valid_file_path=None,
70 | test_file_path=test_file, device=device)
71 |
72 | """ FUNCTIONS """
73 |
74 |
75 | def predict_word(src, max_seq_len):
76 | # Add batch dimension
77 | src = src.unsqueeze(dim=0)
78 | src_key_padding_mask = data_loader.get_padding_mask(src)
79 | memory = model.encode(src, src_key_padding_mask)
80 | outputs = torch.zeros(1, max_seq_len, dtype=torch.long, device=device)
81 | outputs[0] = myTokenizer.sos_id
82 | for j in range(1, max_seq_len):
83 | # Compute output of model
84 | tgt_key_padding_mask = data_loader.get_padding_mask(outputs[:, :j])
85 | out = model.decode(memory, outputs[:, :j], tgt_key_padding_mask, src_key_padding_mask).squeeze() if \
86 | (model.__class__.__name__ == "Transformer") else \
87 | model.decode(memory, outputs[:, :j], src, tgt_key_padding_mask, src_key_padding_mask).squeeze()
88 | val, ix = out.topk(1)
89 | outputs[0, j] = ix[-1]
90 | if ix[-1] == myTokenizer.eos_id:
91 | break
92 | # Strip off sos and eos tokens
93 | return outputs[0, 1:j]
94 |
95 |
96 | # --------For Transformer model from SIGMORPHON 2020 Baseline----------
97 | def dummy_mask(seq):
98 | '''
99 | create dummy mask (all 1)
100 | '''
101 | if isinstance(seq, tuple):
102 | seq = seq[0]
103 | assert len(seq.size()) == 1 or (len(seq.size()) == 2 and seq.size(1) == 1)
104 | return torch.ones_like(seq, dtype=torch.float)
105 |
106 |
107 | def decode_greedy_transformer(src_sentence, max_len=40, trg_bos=myTokenizer.sos_id, trg_eos=myTokenizer.eos_id):
108 | '''
109 | src_sentence: [seq_len, 1]
110 | '''
111 | model.eval()
112 | src_mask = dummy_mask(src_sentence)
113 | src_mask = (src_mask == 0).transpose(0, 1)
114 | enc_hs = model.encode(src_sentence, src_mask)
115 |
116 | output, attns = [trg_bos], []
117 |
118 | for _ in range(max_len):
119 | output_tensor = torch.tensor(output, device=device).view(len(output), 1)
120 | trg_mask = dummy_mask(output_tensor)
121 | trg_mask = (trg_mask == 0).transpose(0, 1)
122 |
123 | word_logprob = model.decode(enc_hs, src_mask, output_tensor, trg_mask)
124 | word_logprob = word_logprob[-1]
125 |
126 | word = torch.max(word_logprob, dim=1)[1]
127 | if word == trg_eos:
128 | break
129 | output.append(word.item())
130 | return output[1:] # , attns
131 | # ------------------------------------
132 |
133 |
134 | def write_predictions_to_file(predictions, test_file_path, out_file_path):
135 | utils.maybe_mkdir(out_file_path)
136 | # Get original input from test file
137 | lemmas, features = data.read_test_file(test_file_path)
138 | # Write all data with predictions to the out file
139 | data.write_morph_file(lemmas, predictions, features, out_file_path)
140 |
141 |
142 | def generate_prediction_file(max_seq_len=MAX_TGT_SEQ_LEN):
143 | """ Generates predictions over the test set and prints output to prediction file."""
144 | input_ids, input_tokens = data_loader.get_test_set()
145 | predictions = []
146 | # Go over each example
147 | for i, (data, data_tokens) in tqdm(enumerate(zip(input_ids, input_tokens))):
148 | unknown_tokens = [token for token in data_tokens if token not in myTokenizer.src_vocab]
149 | # Get prediction from model
150 | # ------------------
151 | pred = predict_word(data, max_seq_len)
152 | # Convert from predicted ids to the predicted word
153 | pred_tokens = myTokenizer.convert_tgt_ids_to_tokens(pred.tolist())
154 | # pred = decode_greedy_transformer(data.unsqueeze(dim=0).transpose(0, 1), max_seq_len)
155 | # pred_tokens = myTokenizer.convert_tgt_ids_to_tokens(list(pred))
156 | # ------------------
157 |
158 | # Where a token is the unknown token, copy from the source at the same token location
159 | unknown_idx = 0
160 | for j in range(len(pred_tokens)):
161 | if pred_tokens[j] == myTokenizer.unk and (j < len(data_tokens) - 1):
162 | pred_tokens[j] = data_tokens[j + 1] # offset by one, since the data tokens carry an extra leading token
163 | # if pred_tokens[j] == myTokenizer.unk:
164 | # pred_tokens[j] = unknown_tokens[unknown_idx]
165 | # # Increment index, until it reaches the end, then stay
166 | # unknown_idx = min(unknown_idx + 1, len(unknown_tokens) - 1)
167 |
168 | pred_word = ''.join(pred_tokens)
169 | predictions.append(pred_word)
170 | write_predictions_to_file(predictions, test_file, out_file)
171 |
172 |
173 | if __name__ == '__main__':
174 | # Generate predictions for test set
175 | generate_prediction_file()
176 | logger.info(f"Created prediction file: {out_file}\n")
177 |
178 |
179 | # def predict_word(src, max_seq_len):
180 | # """
181 | # Predicts target word given source (lemma + features). Predictions generated in greedy manner.
182 | # """
183 | # # Add batch dimension
184 | # src = src.unsqueeze(dim=0)
185 | # src = src.transpose(0, 1)
186 | # outputs = torch.zeros(1, max_seq_len, dtype=torch.long, device=device)
187 | # outputs[0] = myTokenizer.sos_id
188 | # for j in range(1, max_seq_len):
189 | # trg = outputs[:, :j]
190 | # trg = trg.transpose(0, 1)
191 | # src_mask = (src > 0).float()
192 | # trg_mask = (trg > 0).float()
193 | # # Compute output of model
194 | # out = model(src, src_mask, trg, trg_mask).transpose(0, 1).squeeze()
195 | # val, ix = out.topk(1)
196 | # outputs[0, j] = ix[-1]
197 | # if ix[-1] == myTokenizer.eos_id:
198 | # break
199 | # return outputs[0, :j + 1]
200 | #
201 |
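202 | 
203 | 
204 | # --- Editorial sketch (not part of the original script) ---
205 | # Illustrates the unknown-token copy heuristic used inside generate_prediction_file():
206 | # any unknown token in the prediction is replaced by the source token at the same
207 | # position, shifted by one because the source sequence carries an extra leading token.
208 | # The token strings in the example below are made up.
209 | def copy_unknown_tokens(pred_tokens, data_tokens, unk_token):
210 |     for j, token in enumerate(pred_tokens):
211 |         if token == unk_token and (j < len(data_tokens) - 1):
212 |             pred_tokens[j] = data_tokens[j + 1]
213 |     return pred_tokens
214 | 
215 | # copy_unknown_tokens(["f", "<unk>", "o"], ["<s>", "f", "l", "o"], "<unk>")
216 | # -> ["f", "l", "o"]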
--------------------------------------------------------------------------------
/pointer_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn.init import xavier_uniform_
4 |
5 | from model_utils import PositionalEncoding, _generate_square_subsequent_mask, Embedding
6 | from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer
7 | from decoder import TransformerDecoder, TransformerDecoderFinalLayer
8 |
9 |
10 | class PointerGeneratorTransformer(nn.Module):
11 | def __init__(self, src_vocab_size=128, tgt_vocab_size=128,
12 | embedding_dim=128, fcn_hidden_dim=128,
13 | num_heads=4, num_layers=2, dropout=0.2,
14 | src_to_tgt_vocab_conversion_matrix=None):
15 | super(PointerGeneratorTransformer, self).__init__()
16 |
17 | self.src_vocab_size = src_vocab_size
18 | self.tgt_vocab_size = tgt_vocab_size
19 | self.embedding_dim = embedding_dim
20 | self.src_to_tgt_vocab_conversion_matrix = src_to_tgt_vocab_conversion_matrix
21 | self.pos_encoder = PositionalEncoding(embedding_dim)
22 | # Source and target embeddings
23 | self.src_embed = Embedding(self.src_vocab_size, embedding_dim, padding_idx=2)
24 | self.tgt_embed = Embedding(self.tgt_vocab_size, embedding_dim, padding_idx=2)
25 |
26 | # Encoder layers
27 | self.encoder_layer = TransformerEncoderLayer(embedding_dim, num_heads, fcn_hidden_dim, dropout)
28 | self.encoder = TransformerEncoder(self.encoder_layer, num_layers)
29 |
30 | # Decoder layers
31 | self.decoder_layer = TransformerDecoderLayer(embedding_dim, num_heads, fcn_hidden_dim, dropout)
32 | self.decoder_final_layer = TransformerDecoderFinalLayer(embedding_dim, num_heads, fcn_hidden_dim, dropout)
33 | self.decoder = TransformerDecoder(self.decoder_layer, self.decoder_final_layer, num_layers)
34 |
35 | # Final linear layer + softmax, for probability over the target vocabulary
36 | self.p_vocab = nn.Sequential(
37 | nn.Linear(self.embedding_dim, self.tgt_vocab_size),
38 | nn.Softmax(dim=-1))
39 |
40 | # P_gen, probability of generating output
41 | self.p_gen = nn.Sequential(
42 | nn.Linear(self.embedding_dim * 3, 1),
43 | nn.Sigmoid())
44 | # Context vector
45 | self.c_t = None
46 |
47 | # Initialize masks
48 | self.src_mask = None
49 | self.tgt_mask = None
50 | self.mem_mask = None
51 | # Initialize weights of model
52 | self._reset_parameters()
53 |
54 | def _reset_parameters(self):
55 | """ Initiate parameters in the transformer model. """
56 | for p in self.parameters():
57 | if p.dim() > 1:
58 | xavier_uniform_(p)
59 |
60 |
61 | def encode(self, src, src_key_padding_mask=None):
62 | """
63 | Applies embedding, positional encoding and then runs the transformer encoder on the source
64 | :param src: source tokens batch
65 | :param src_key_padding_mask: source padding mask
66 | :return: memory- the encoder hidden states
67 | """
68 | # Source embedding and positional encoding, changes dimension (N, S) -> (N, S, E) -> (S, N, E)
69 | src_embed = self.src_embed(src).transpose(0, 1)
70 | src_embed = self.pos_encoder(src_embed)
71 | # Pass the source to the encoder
72 | memory = self.encoder(src_embed, mask=self.src_mask, src_key_padding_mask=src_key_padding_mask)
73 | return memory
74 |
75 | def decode(self, memory, tgt, src, tgt_key_padding_mask=None, memory_key_padding_mask=None, has_mask=True):
76 | """
77 | Applies embedding and positional encoding to the target, then runs the transformer decoder on the memory and target.
78 | Also creates a square subsequent mask for teacher forcing.
79 | :param memory: The encoder hidden states
80 | :param tgt: Target tokens batch
81 | :param tgt_key_padding_mask: target padding mask
82 | :param memory_key_padding_mask: memory padding mask
83 | :param has_mask: Whether to use the square subsequent mask for teacher forcing
84 | :return: decoder output
85 | """
86 | # Create a target mask of size (T, T) for the transformer if no appropriate one exists yet
87 | if has_mask:
88 | if self.tgt_mask is None or self.tgt_mask.size(0) != tgt.size(1):
89 | self.tgt_mask = _generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
90 | else:
91 | self.tgt_mask = None
92 | # Target embedding and positional encoding, changes dimension (N, T) -> (N, T, E) -> (T, N, E)
93 | tgt_embed = self.tgt_embed(tgt).transpose(0, 1)
94 | tgt_embed_pos = self.pos_encoder(tgt_embed)
95 | # Get output of decoder and attention weights. Decoder output dimensions stay the same
96 | decoder_output, attention = self.decoder(tgt_embed_pos, memory, tgt_mask=self.tgt_mask,
97 | memory_mask=self.mem_mask,
98 | tgt_key_padding_mask=tgt_key_padding_mask,
99 | memory_key_padding_mask=memory_key_padding_mask)
100 | # Get probability over target vocabulary, (T, N, E) -> (T, N, tgt_vocab_size)
101 | p_vocab = self.p_vocab(decoder_output)
102 |
103 | # ---Compute Pointer Generator probability---
104 | # Get hidden states of source (easier/more understandable computation). (S, N, E) -> (N, S, E)
105 | hidden_states = memory.transpose(0, 1)
106 | # compute context vectors. (N, T, S) x (N, S, E) -> (N, T, E)
107 | context_vectors = torch.matmul(attention, hidden_states).transpose(0, 1)
108 | total_states = torch.cat((context_vectors, decoder_output, tgt_embed), dim=-1)
109 | # Get probability of generating output. (T, N, 3*E) -> (T, N, 1)
110 | p_gen = self.p_gen(total_states)
111 | # Get probability of copying from input. (T, N, 1)
112 | p_copy = 1 - p_gen
113 |
114 | # Get representation of src tokens as one hot encoding
115 | one_hot = torch.zeros(src.size(0), src.size(1), self.src_vocab_size, device=src.device)
116 | one_hot = one_hot.scatter_(dim=-1, index=src.unsqueeze(-1), value=1)
117 | # p_copy from source is sum over all attention weights for each token in source
118 | p_copy_src_vocab = torch.matmul(attention, one_hot)
119 | # convert representation of token from src vocab to tgt vocab
120 | p_copy_tgt_vocab = torch.matmul(p_copy_src_vocab, self.src_to_tgt_vocab_conversion_matrix).transpose(0,
121 | 1)
122 | # Compute final probability
123 | p = torch.add(p_vocab * p_gen, p_copy_tgt_vocab * p_copy)
124 |
125 | # Change back batch and sequence dimensions, from (T, N, tgt_vocab_size) -> (N, T, tgt_vocab_size)
126 | return torch.log(p.transpose(0, 1))
127 |
128 | def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
129 | memory_key_padding_mask=None, has_mask=True):
130 | """Take in and process masked source/target sequences.
131 |
132 | Args:
133 | src: the sequence to the encoder (required).
134 | tgt: the sequence to the decoder (required).
135 | src_mask: the additive mask for the src sequence (optional).
136 | tgt_mask: the additive mask for the tgt sequence (optional).
137 | memory_mask: the additive mask for the encoder output (optional).
138 | src_key_padding_mask: the ByteTensor mask for src keys per batch (optional).
139 | tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional).
140 | memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional).
141 |
142 | Shape:
143 | - src: :math:`(S, N, E)`. Starts as (N, S) and changed after embedding
144 | - tgt: :math:`(T, N, E)`. Starts as (N, T) and changed after embedding
145 | - src_mask: :math:`(S, S)`.
146 | - tgt_mask: :math:`(T, T)`.
147 | - memory_mask: :math:`(T, S)`.
148 | - src_key_padding_mask: :math:`(N, S)`.
149 | - tgt_key_padding_mask: :math:`(N, T)`.
150 | - memory_key_padding_mask: :math:`(N, S)`.
151 |
152 | Note: [src/tgt/memory]_mask should be filled with
153 | float('-inf') for the masked positions and float(0.0) else. These masks
154 | ensure that predictions for position i depend only on the unmasked positions
155 | j and are applied identically for each sequence in a batch.
156 | [src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions
157 | that should be masked with float('-inf') and False values will be unchanged.
158 | This mask ensures that no information will be taken from position i if
159 | it is masked, and has a separate mask for each sequence in a batch.
160 |
161 | - output: :math:`(T, N, E)`.
162 |
163 | Note: Due to the multi-head attention architecture in the transformer model,
164 | the output sequence length of a transformer is the same as the input sequence
165 | (i.e. target) length of the decoder.
166 |
167 | where S is the source sequence length, T is the target sequence length, N is the
168 | batch size, E is the feature number
169 |
170 | Examples:
171 | output = model(src, tgt, src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask)
172 | """
173 |
174 | # Applies embedding, positional encoding and the transformer encoder on the source
175 | memory = self.encode(src, src_key_padding_mask)
176 | # Applies embedding and positional encoding to the target, then runs the transformer decoder on the memory and target.
177 | output = self.decode(memory, tgt, src, tgt_key_padding_mask, memory_key_padding_mask, has_mask)
178 | return output
179 |
180 |
181 |
182 |
183 | # def get_context_vectors_1(self, hidden_states, attention, N, T):
184 | # """ compute context vectors using hidden states and attention over the source """
185 | # # Replace source and embedding dimension
186 | # hidden_states = hidden_states.transpose(1, 2)
187 | # context_vectors = torch.zeros(N, T, self.embedding_dim).type(torch.float32)
188 | # # Get context vector
189 | # for i in range(N): # go over each data sample i in batch
190 | # h_t_i = hidden_states[i] # all hidden_states - E x S
191 | # for j in range(T): # go over each target token j
192 | # attn_i_j = attention[i][j] # attention over source for target token j - S
193 | # context_vectors[i][j] = torch.mv(h_t_i, attn_i_j)
194 | # context_vectors = context_vectors.transpose(0, 1)
195 | # return context_vectors.type(torch.float32)
196 |
197 | # one_hot_cat = torch.cat(T * [one_hot]).view(N, T, S, self.src_vocab_size)
198 | # one_hot_cat = one_hot.unsqueeze(1).repeat(1, T, 1, 1)
199 |
200 | # def one_hot_encoding(src, src_vocab_size):
201 | # one_hot = torch.zeros(src.size(0), src.size(1), src_vocab_size)
202 | # src_ext = src.unsqueeze_(-1)
203 | # one_hot = one_hot.scatter_(dim=-1, index=src_ext, value=1)
204 | # return one_hot
205 |
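206 | 
207 | # --- Editorial note (worked equation, not part of the original module) ---
208 | # The mixture computed in decode() above is, for a decoding step t and output symbol w:
209 | #
210 | #     P(w) = p_gen * P_vocab(w) + (1 - p_gen) * sum_i a_{t,i} * [src_i == w]
211 | #
212 | # where a_{t,i} are the decoder's attention weights over the source, P_vocab is the
213 | # softmax over the target vocabulary, p_gen is predicted from the concatenated context
214 | # vector, decoder state and target embedding (the 3*E input to self.p_gen), and source
215 | # indices are mapped into the target vocabulary via src_to_tgt_vocab_conversion_matrix.
216 | # Tiny numeric sketch (made-up values): with p_gen = 0.8, P_vocab("a") = 0.10 and total
217 | # attention mass 0.5 on source positions holding "a", P("a") = 0.8*0.10 + 0.2*0.5 = 0.18.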
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import math
3 | from tqdm import tqdm
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.optim.lr_scheduler import ReduceLROnPlateau
9 |
10 | import utils
11 | import dataset
12 | import tokenizer
13 |
14 | import transformer_baseline
15 |
16 | # Training parameters
17 | parser = argparse.ArgumentParser(description='Training Transformer and Pointer-Generator for morphological inflection')
18 | parser.add_argument('--train', type=str, default='data',
19 | help="Train file of the dataset (File is located in DATA_FOLDER)")
20 | parser.add_argument('--dev', type=str, default='data',
21 | help="Validation file of the dataset (File is located in DATA_FOLDER)")
22 | parser.add_argument('--vocab', type=str, default='data',
23 | help="Base name of vocabulary files (must include dir path)")
24 | parser.add_argument('--checkpoints-dir', type=str, default='model-checkpoints',
25 | help='Folder to keep checkpoints of model')
26 | parser.add_argument('--resume', default=False, action='store_true',
27 | help="Whether to resume training from a certain checkpoint")
28 | parser.add_argument('--reload', default=False, action='store_true',
29 | help="Whether to reload pretrained model from certain checkpoint")
30 | parser.add_argument('--epochs', type=int, default=100,
31 | help='number of epochs to train (default: 100)')
32 | parser.add_argument('--steps', type=int, default=20000,
33 | help='number of batch steps to train (default: 20,000)')
34 | parser.add_argument('--batch-size', type=int, default=128,
35 | help='input batch size for training (default: 128)')
36 | parser.add_argument('--eval-every', type=int, default=1,
37 | help='Evaluate model over validation set every how many epochs (default: 1)')
38 | parser.add_argument('--arch', type=str, default='transformer',
39 | help="Architecture type for model: transformer, pointer_generator")
40 | parser.add_argument('--embed-dim', type=int, default=128,
41 | help='Embedding dimension (default: 128)')
42 | parser.add_argument('--fcn-dim', type=int, default=256,
43 | help='Fully-connected network hidden dimension (default: 256)')
44 | parser.add_argument('--num-heads', type=int, default=4,
45 | help='number of attention heads (default: 4)')
46 | parser.add_argument('--num-layers', type=int, default=2,
47 | help='number of layers in encoder and decoder (default: 2)')
48 | parser.add_argument('--dropout', type=float, default=0.2,
49 | help='Dropout probability (default: 0.2)')
50 | parser.add_argument('--lr', type=float, default=5e-4,
51 | help='learning rate (default: 5e-4)')
52 | parser.add_argument('--beta', type=float, default=0.9,
53 | help='beta for Adam optimizer (default: 0.9)')
54 | parser.add_argument('--beta2', type=float, default=0.999,
55 | help='beta 2 for Adam optimizer (default: 0.999)')
56 | parser.add_argument('--label-smooth', default=0.1, type=float,
57 | help='label smoothing coefficient (default: 0.1)')
58 | parser.add_argument('--scheduler', type=str, default="ReduceLROnPlateau",
59 | help='Learning rate Scheduler (default: ReduceLROnPlateau)')
60 | parser.add_argument('--patience', default=5, type=int,
61 | help='patience for early stopping (default: 5)')
62 | parser.add_argument('--min-lr', type=float, default=1e-5,
63 | help='Minimum learning rate (default: 1e-5)')
64 | parser.add_argument('--discount-factor', default=0.5, type=float,
65 | help='discount factor of `ReduceLROnPlateau` (default: 0.5)')
66 | parser.add_argument('--patience_reduce', default=0, type=int,
67 | help='patience of `ReduceLROnPlateau` (default: 0)')
68 | parser.add_argument('--warmup-steps', default=4000, type=int,
69 | help='number of warm up steps for scheduler (default: 4000)')
70 | args = parser.parse_args()
71 |
72 | # Get train and validation file paths
73 | train_file = args.train
74 | valid_file = args.dev
75 | # Get vocabulary paths
76 | src_vocab_file = args.vocab + "-input"
77 | tgt_vocab_file = args.vocab + "-output"
78 | # Initialize Tokenizer object with input and output vocabulary files
79 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
80 | myTokenizer = tokenizer.Tokenizer(src_vocab_file, tgt_vocab_file, device)
81 |
82 | """ CONSTANTS """
83 | MAX_SRC_SEQ_LEN = 45
84 | MAX_TGT_SEQ_LEN = 45
85 | SRC_VOCAB_SIZE = myTokenizer.src_vocab_size
86 | TGT_VOCAB_SIZE = myTokenizer.tgt_vocab_size
87 | # Model Hyperparameters
88 | EMBEDDING_DIM = args.embed_dim
89 | FCN_HIDDEN_DIM = args.fcn_dim
90 | NUM_HEADS = args.num_heads
91 | NUM_LAYERS = args.num_layers
92 | DROPOUT = args.dropout
93 |
94 | """ MODEL AND DATA LOADER """
95 | model = utils.build_model(args.arch, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, EMBEDDING_DIM, FCN_HIDDEN_DIM,
96 | NUM_HEADS, NUM_LAYERS, DROPOUT, myTokenizer.src_to_tgt_vocab_conversion_matrix)
97 | # --------Transformer model from SIGMORPHON 2020 Baseline----------
98 | # model = transformer_baseline.Transformer(src_vocab_size=SRC_VOCAB_SIZE, trg_vocab_size=TGT_VOCAB_SIZE,
99 | # embed_dim=EMBEDDING_DIM, nb_heads=NUM_HEADS,
100 | # src_hid_size=FCN_HIDDEN_DIM, src_nb_layers=NUM_LAYERS,
101 | # trg_hid_size=FCN_HIDDEN_DIM, trg_nb_layers=NUM_LAYERS,
102 | # dropout_p=DROPOUT,
103 | # tie_trg_embed=False, src_c2i=None, trg_c2i=None, attr_c2i=None, label_smooth=0.1)
104 | model.to(device)
105 | criterion = nn.NLLLoss(reduction='mean', ignore_index=myTokenizer.pad_id)
106 | optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(args.beta, args.beta2))
107 | scheduler = ReduceLROnPlateau(optimizer, 'min', min_lr=args.min_lr, factor=args.discount_factor,
108 | patience=args.patience_reduce) \
109 | if (args.scheduler == "ReduceLROnPlateau") \
110 | else utils.WarmupInverseSquareRootSchedule(optimizer, args.warmup_steps)
111 | # Initialize DataLoader object
112 | data_loader = dataset.DataLoader(myTokenizer, train_file_path=train_file, valid_file_path=valid_file,
113 | test_file_path=None, device=device, batch_size=args.batch_size,
114 | max_src_seq_len=MAX_SRC_SEQ_LEN, max_tgt_seq_len=MAX_TGT_SEQ_LEN)
115 |
116 |
117 | """ HELPER FUNCTIONS"""
118 | def get_lr():
119 | if isinstance(scheduler, ReduceLROnPlateau):
120 | return optimizer.param_groups[0]['lr']
121 | try:
122 | return scheduler.get_last_lr()[0]
123 | except AttributeError:
124 | return scheduler.get_lr()[0]
125 |
126 |
127 | def get_loss(predict, target):
128 | """
129 | Compute loss
130 | :param predict: SxNxTGT_VOCAB
131 | :param target: SxN
132 | :return: loss
133 | """
134 | predict = predict.contiguous().view(-1, TGT_VOCAB_SIZE)
135 | # nll_loss = F.nll_loss(predict, target.view(-1), ignore_index=PAD_IDX)
136 | target = target.contiguous().view(-1, 1)
137 | non_pad_mask = target.ne(myTokenizer.pad_id)
138 | nll_loss = -predict.gather(dim=-1, index=target)[non_pad_mask].mean()
139 | smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad_mask].mean()
140 | smooth_loss = smooth_loss / TGT_VOCAB_SIZE
141 | loss = (1. -
142 | args.label_smooth) * nll_loss + args.label_smooth * smooth_loss
143 | return loss
144 |
145 |
146 | """ LOGGING SETTINGS AND LOGGING """
147 | # Set number of total epochs and minimum epochs before evaluation starts
148 | MIN_EVAL_STEPS = 4000
149 | steps_per_epoch = int(math.ceil(data_loader.train_set_size / args.batch_size))
150 | epochs = int(math.ceil(args.steps / steps_per_epoch))
151 | min_eval_epochs = int(MIN_EVAL_STEPS / steps_per_epoch)
152 |
153 | # Log all model settings
154 | logger = utils.get_logger()
155 | logger.info(f"Training model")
156 | logger.info(f"Arch: {args.arch}, embed_dim: {EMBEDDING_DIM}, fcn_hid_dim: {FCN_HIDDEN_DIM},"
157 | f" num-heads: {NUM_HEADS}, num-layers: {NUM_LAYERS}, dropout: {DROPOUT}, device: {device}")
158 | logger.info(f"Optimizer: Adam, lr: {args.lr}, beta: {args.beta}, beta2: {args.beta2}")
159 | logger.info(
160 | f"Scheduler: {args.scheduler}, patience: {args.patience}, min_lr: {args.min_lr}, warmup steps: {args.warmup_steps},"
161 | f" discount factor: {args.discount_factor}, patience_reduce: {args.patience_reduce}")
162 | logger.info(f"Source vocabulary: Size = {myTokenizer.src_vocab_size}, {myTokenizer.src_vocab}")
163 | logger.info(f"Target vocabulary: Size = {myTokenizer.tgt_vocab_size}, {myTokenizer.tgt_vocab}")
164 | logger.info(f"Training file: {train_file}")
165 | logger.info(f"Validation file: {valid_file}")
166 | logger.info(f"Input vocabulary file: {src_vocab_file}")
167 | logger.info(f"Output vocabulary file: {tgt_vocab_file}")
168 | logger.info(f"Checkpoints dir: {args.checkpoints_dir}")
169 | logger.info(f"Model: {model}")
170 | logger.info(f"Resume training: {args.resume}, reload from pretraining: {args.reload}")
171 | logger.info(f"Steps: {args.steps}, batch size:{args.batch_size},\n"
172 | f"Train set size: {data_loader.train_set_size}, Steps per epoch {steps_per_epoch},\n"
173 | f"Epochs: {epochs}, Eval every :{args.eval_every}")
174 |
175 | # Reload model/ resume training if applicable
176 | if args.resume:
177 | # Resume training from checkpoint
178 | model, optimizer, scheduler, start_epoch, best_valid_accuracy = \
179 | utils.load_checkpoint(model, optimizer, scheduler, f"{args.checkpoints_dir}/model_best.pth", logger)
180 | best_valid_epoch = start_epoch
181 | else:
182 | # Reload pretrained model from checkpoint
183 | if args.reload:
184 | model = utils.load_model(model, f"{args.checkpoints_dir}/model_best.pth", logger)
185 | start_epoch = 0
186 | # Initialize best validation accuracy placeholders
187 | best_valid_accuracy = -1.0
188 | best_valid_epoch = 0
189 |
190 | """ FUNCTIONS """
191 | def train(epoch):
192 | """ Runs full training epoch over the training set, uses teacher forcing in training"""
193 | model.train()
194 | running_loss = 0.0
195 | # Get Training set in batches
196 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_train_set()
197 | # Go over each batch
198 | for i, (data, target, target_y) in tqdm(enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches))):
199 | optimizer.zero_grad()
200 | # Get padding masks
201 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target)
202 | # Compute output of model
203 | output = model(data, target, src_pad_mask, target_pad_mask, mem_pad_mask)
204 | # ---------------
205 | # Compute loss
206 | # loss = criterion(output.contiguous().view(-1, TGT_VOCAB_SIZE), target_y.contiguous().view(-1))
207 | loss = get_loss(output.transpose(0, 1), target_y.transpose(0, 1))
208 | # -------------
209 | # Propagate loss and update model parameters
210 | loss.backward()
211 | optimizer.step()
212 | if not isinstance(scheduler, ReduceLROnPlateau):
213 | scheduler.step()
214 | running_loss += loss.item()
215 | # print statistics
216 | logger.info(f"Train Epoch: {epoch}, avg loss: {running_loss / (i + 1):.4f}, lr {get_lr():.6f}")
217 |
218 |
219 | def validation(epoch):
220 | """ Computes loss and accuracy over the validation set, using teacher forcing inputs """
221 | model.eval()
222 | running_loss = 0
223 | correct_preds = 0
224 | # Get validation set batches
225 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_validation_set_tf()
226 | # Go over each batch
227 | for i, (data, target, target_y) in enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches)):
228 | # Get padding masks
229 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target)
230 | # Compute output of model
231 | output = model(data, target, src_pad_mask, target_pad_mask, mem_pad_mask)
232 | # Get model predictions
233 | predictions = output.topk(1)[1].squeeze()
234 | # Compute accuracy
235 | target_pad_mask = (target_pad_mask == False).int()
236 | predictions = predictions * target_pad_mask
237 | correct_preds += torch.all(torch.eq(predictions, target_y), dim=-1).sum()
238 | # ---------------
239 | # Compute loss
240 | # loss = criterion(output.contiguous().view(-1, TGT_VOCAB_SIZE), target_y.contiguous().view(-1))
241 | loss = get_loss(output.transpose(0, 1), target_y.transpose(0, 1))
242 | # -------------
243 | running_loss += loss.item()
244 | # print statistics
245 | final_loss = running_loss / (i + 1)
246 | if isinstance(scheduler, ReduceLROnPlateau):
247 | scheduler.step(final_loss)
248 | accuracy = float(100 * correct_preds) / data_loader.get_validation_set_len()
249 | logger.info(f"Validation. Epoch: {epoch}, avg dev loss: {final_loss:.4f}, accuracy: {accuracy:.2f}%")
250 | return accuracy # final_loss
251 |
252 |
253 | # --------For Transformer model from SIGMORPHON 2020 Baseline----------
254 | def train_baseline(epoch):
255 | """ Runs full training epoch over the training set, uses teacher forcing in training"""
256 | model.train()
257 | running_loss = 0.0
258 | # Get Training set in batches
259 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_train_set()
260 | # Go over each batch
261 | for i, (data, target, target_y) in enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches)):
262 | optimizer.zero_grad()
263 | # Get padding masks
264 | data = data.transpose(0, 1)
265 | target = target.transpose(0, 1)
266 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target)
267 | src_pad_mask, mem_pad_mask, target_pad_mask = (src_pad_mask == False).float(), (
268 | mem_pad_mask == False).float(), (target_pad_mask == False).float()
269 | # Compute loss
270 | batch = (data, src_pad_mask, target, target_pad_mask)
271 | loss = model.get_loss(batch)
272 | loss.backward()
273 | optimizer.step()
274 | running_loss += loss.item()
275 | # print statistics
276 | print(f"\nTrain Epoch: {epoch}, loss: {running_loss / (i + 1):.5f}")
277 |
278 |
279 | def validation_baseline(epoch):
280 | """ Computes loss and accuracy over the validation set, using teacher forcing inputs """
281 | model.eval()
282 | running_loss = 0.0
283 | correct_preds = 0
284 | # Get validation set in batches
285 | input_ids_batches, target_ids_batches, target_y_ids_batches = data_loader.get_validation_set_tf()
286 | # Go over each batch
287 | for i, (data, target, target_y) in enumerate(zip(input_ids_batches, target_ids_batches, target_y_ids_batches)):
288 | data = data.transpose(0, 1)
289 | target = target.transpose(0, 1)
290 | src_pad_mask, mem_pad_mask, target_pad_mask = data_loader.get_padding_masks(data, target)
291 | src_pad_mask, mem_pad_mask, target_pad_mask = (src_pad_mask == False).float(), (
292 | mem_pad_mask == False).float(), (target_pad_mask == False).float()
293 | target_y_pad_mask = data_loader.get_padding_mask(target_y)
294 | # Compute loss over output (using baseline code function)
295 | loss = model.get_loss((data, src_pad_mask, target, target_pad_mask))
296 | running_loss += loss.item()
297 | # Compute output of model
298 | output = model(data, src_pad_mask, target, target_pad_mask).transpose(0, 1)
299 | # Get model predictions
300 | predictions = output.topk(1)[1].squeeze()
301 | target_pad_mask_test = (target_y_pad_mask == False).int()
302 | predictions = predictions * target_pad_mask_test
303 | correct_preds += torch.all(torch.eq(predictions, target_y), dim=-1).sum()
304 | final_loss = running_loss / (i + 1)
305 | accuracy = float(100 * correct_preds) / data_loader.get_validation_set_len()
306 | print(f"Validation. Epoch: {epoch}, loss: {final_loss:.4f}, accuracy: {accuracy:.2f}%")
307 | return accuracy # , final_loss
308 |
309 |
310 | if __name__ == '__main__':
311 | eval_every = args.eval_every
312 | epochs_no_improve = 0
313 | logger.info(f"Starting training from Epoch {start_epoch + 1}")
314 | for epoch in range(start_epoch + 1, epochs + 1):
315 | # Check for early stopping
316 | if epochs_no_improve == args.patience:
317 | logger.info(
318 | f"Applied early stopping and stopped training. Val accuracy not improve in {args.patience} epochs")
319 | break
320 | # ---------
321 | train(epoch)
322 | # --------For Transformer model from SIGMORPHON 2020 Baseline----------
323 | # train_baseline(epoch)
324 | # ---------
325 | is_best = False
326 | curr_valid_accuracy = 0
327 | # Check model on validation set and get loss, every few epochs
328 | if epoch % eval_every == 0 and epoch > min_eval_epochs:
329 | epochs_no_improve += 1
330 | # ---------
331 | curr_valid_accuracy = validation(epoch)
332 | # --------For Transformer model from SIGMORPHON 2020 Baseline----------
333 | # curr_valid_accuracy = validation_baseline(epoch)
334 | # ---------
335 | # If best accuracy so far, save model as best and the accuracy
336 | if curr_valid_accuracy > best_valid_accuracy:
337 | logger.info("New best accuracy, Model saved")
338 | is_best = True
339 | best_valid_accuracy = curr_valid_accuracy
340 | best_valid_epoch = epoch
341 | epochs_no_improve = 0
342 | utils.save_checkpoint(model, epoch, optimizer, scheduler, curr_valid_accuracy, is_best, args.checkpoints_dir)
343 | utils.clean_checkpoints_dir(args.checkpoints_dir)
344 | logger.info(f"Finished training, best model on validation set: {best_valid_epoch},"
345 | f" accuracy: {best_valid_accuracy:.2f}%\n")
346 |
347 |
348 | # Train model
349 | # seed = 0
350 | # torch.manual_seed(seed=seed)
351 | # if torch.cuda.is_available():
352 | # torch.cuda.manual_seed_all(seed)
353 | # BEST MODEL FOR MEDIUM RESOURCE
354 | # EMBEDDING_DIM = 64
355 | # FCN_HIDDEN_DIM = 256
356 | # NUM_HEADS = 4
357 | # NUM_LAYERS = 2
358 | # DROPOUT = 0.2
359 | # # BEST MODEL FOR LOW RESOURCE
360 | # EMBEDDING_DIM = 128
361 | # FCN_HIDDEN_DIM = 64
362 | # NUM_HEADS = 4
363 | # NUM_LAYERS = 2
364 | # DROPOUT = 0.2
365 |
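366 | 
367 | # --- Editorial notes (illustrative, made-up numbers; not part of the original script) ---
368 | # get_loss() applies label smoothing to the log-probabilities:
369 | #     loss = (1 - eps) * NLL + eps * smooth,    eps = args.label_smooth
370 | # NLL is the mean negative log-probability of the gold symbol over non-pad positions;
371 | # smooth is the mean of -sum_w log p(w) over non-pad positions, divided by the target
372 | # vocabulary size (a uniform-target cross-entropy term). Example: with eps = 0.1,
373 | # NLL = 2.0 and smooth = 5.0, loss = 0.9 * 2.0 + 0.1 * 5.0 = 2.3.
374 | #
375 | # The epoch budget is derived from --steps. For instance, with 800 training examples and
376 | # --batch-size 128, steps_per_epoch = ceil(800 / 128) = 7, so --steps 20000 gives
377 | # epochs = ceil(20000 / 7) = 2858, and validation starts after
378 | # min_eval_epochs = int(4000 / 7) = 571 epochs.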
--------------------------------------------------------------------------------
/data/middle-high-german-train-medium:
--------------------------------------------------------------------------------
1 | werden wurdet V;IND;PST;2;PL
2 | triegen trieget V;SBJV;PRS;2;PL
3 | binden binde V;IND;PRS;1;SG
4 | singen singet V;IND;PRS;3;SG
5 | verswinden verswindende V.PTCP;PRS
6 | burcgrave burcgrâven N;DAT;SG
7 | bruoder bruoder N;NOM;SG
8 | betriegen betriugest V;IND;PRS;2;SG
9 | slahen slage V;SBJV;PRS;1;SG
10 | swern sweret V;IND;PRS;3;SG
11 | grave grâven N;NOM;PL
12 | slahen slaget V;SBJV;PRS;2;PL
13 | trinken trinken V;IND;PRS;1;PL
14 | enpfinden enpfunden V;IND;PST;3;PL
15 | werfen wërfet V;SBJV;PRS;2;PL
16 | wahsen wahset V;SBJV;PRS;3;SG
17 | tac tac N;ACC;SG
18 | swern swuoren V;IND;PST;3;PL
19 | tac tage N;NOM;PL
20 | slahen slegest V;IND;PRS;2;SG
21 | betriegen betriegen V;NFIN
22 | helfen hëlfen V;NFIN
23 | werfen würfen V;SBJV;PST;1;PL
24 | vinden vünde V;IND;PST;2;SG
25 | trinken trinket V;IND;PRS;2;PL
26 | werfen geworfen V.PTCP;PST
27 | beginnen beginnet V;IMP;2;PL
28 | slahen slagen V;NFIN
29 | burcgrave burcgrâven N;NOM;PL
30 | ziehen ziehe V;SBJV;PRS;1;SG
31 | ziehen ziehet V;IMP;2;PL
32 | burcgrave burcgrâven N;DAT;PL
33 | swern swuoren V;IND;PST;1;PL
34 | beginnen begünnest V;SBJV;PST;2;SG
35 | swimmen swümme V;SBJV;PST;3;SG
36 | binden bündet V;SBJV;PST;2;PL
37 | betriegen betrugen V;IND;PST;3;PL
38 | engel engele N;DAT;SG
39 | singen singent V;IND;PRS;3;PL
40 | werfen wërfen V;IND;PRS;1;PL
41 | werden wirde V;IND;PRS;1;SG
42 | rinnen runnen V;IND;PST;1;PL
43 | swimmen swimment V;IND;PRS;3;PL
44 | enpfinden enpfand V;IND;PST;1;SG
45 | binden binden V;NFIN
46 | slahen slagen V;SBJV;PRS;3;PL
47 | biegen biegent V;IND;PRS;3;PL
48 | binden bindest V;SBJV;PRS;2;SG
49 | swern sweren V;SBJV;PRS;1;PL
50 | betriegen betrügen V;SBJV;PST;1;PL
51 | rinnen rinnet V;SBJV;PRS;3;SG
52 | swern sweren V;IND;PRS;1;PL
53 | swimmen swimmet V;SBJV;PRS;2;PL
54 | swern swüren V;SBJV;PST;1;PL
55 | beginnen begünne V;IND;PST;2;SG
56 | biegen biegen V;IND;PRS;1;PL
57 | helfen hëlfet V;IMP;2;PL
58 | vinden vünde V;SBJV;PST;1;SG
59 | sieden gesoten V.PTCP;PST
60 | biegen bugen V;IND;PST;3;PL
61 | vinden vind V;IMP;2;SG
62 | helfen hëlfet V;SBJV;PRS;3;SG
63 | swern swernde V.PTCP;PRS
64 | kiesen kiesen V;NFIN
65 | trinken trinkest V;IND;PRS;2;SG
66 | ziehen ziehende V.PTCP;PRS
67 | werfen wërfen V;SBJV;PRS;1;PL
68 | swimmen swimmende V.PTCP;PRS
69 | enpfinden enpfünde V;SBJV;PST;1;SG
70 | sieden sutet V;IND;PST;2;PL
71 | überganc überganc N;ACC;SG
72 | sieden süte V;IND;PST;2;SG
73 | sieden sieden V;IND;PRS;1;PL
74 | werden wërden V;SBJV;PRS;3;PL
75 | trinken trinken V;NFIN
76 | werden würden V;SBJV;PST;1;PL
77 | rinnen rinnent V;IND;PRS;3;PL
78 | swern swere V;SBJV;PRS;1;SG
79 | binden bindende V.PTCP;PRS
80 | swern swüre V;SBJV;PST;3;SG
81 | wahsen wahsen V;IND;PRS;1;PL
82 | werfen würfe V;IND;PST;2;SG
83 | betriegen betriegen V;IND;PRS;1;PL
84 | ziehen zühest V;SBJV;PST;2;SG
85 | triegen trüge V;IND;PST;2;SG
86 | helfen hülfen V;SBJV;PST;1;PL
87 | swimmen swamm V;IND;PST;3;SG
88 | überganc übergange N;DAT;SG
89 | triegen trüget V;SBJV;PST;2;PL
90 | swimmen swimme V;IND;PRS;1;SG
91 | enpfinden enpfünde V;SBJV;PST;3;SG
92 | swimmen swimme V;SBJV;PRS;1;SG
93 | swern swüret V;SBJV;PST;2;PL
94 | beginnen beginnest V;IND;PRS;2;SG
95 | binden bindet V;SBJV;PRS;3;SG
96 | beginnen begünnen V;SBJV;PST;3;PL
97 | singen singen V;SBJV;PRS;3;PL
98 | singen singe V;IND;PRS;1;SG
99 | werfen wërfen V;SBJV;PRS;3;PL
100 | singen singest V;IND;PRS;2;SG
101 | göu göu N;NOM;SG
102 | tac tagen N;DAT;PL
103 | werden wërde V;SBJV;PRS;1;SG
104 | werden wërdet V;SBJV;PRS;2;PL
105 | werden geworden V.PTCP;PST
106 | swern swer V;IMP;2;SG
107 | betriegen betrieget V;SBJV;PRS;2;PL
108 | werden wërdende V.PTCP;PRS
109 | slahen geslagen V.PTCP;PST
110 | swern sweren V;SBJV;PRS;3;PL
111 | singen singen V;IMP;1;PL
112 | biegen bügest V;SBJV;PST;2;SG
113 | beginnen beginnet V;SBJV;PRS;3;SG
114 | helfen hëlfent V;IND;PRS;3;PL
115 | sieden sieden V;IMP;1;PL
116 | vinden vunden V;IND;PST;3;PL
117 | göu göuwe N;GEN;PL
118 | trinken trinket V;IMP;2;PL
119 | wahsen wuohs V;IND;PST;3;SG
120 | biegen biegen V;IMP;1;PL
121 | göugrave göugrâven N;NOM;PL
122 | ziehen ziehet V;IND;PRS;2;PL
123 | ziehen gezogen V.PTCP;PST
124 | betriegen betrieget V;IMP;2;PL
125 | trinken trünke V;SBJV;PST;3;SG
126 | göugrave göugrâven N;ACC;PL
127 | göugrave göugrâve N;NOM;SG
128 | swern swürest V;SBJV;PST;2;SG
129 | beginnen beginnen V;NFIN
130 | enpfinden enpfindet V;IND;PRS;2;PL
131 | werden wërden V;NFIN
132 | slahen slüegen V;SBJV;PST;1;PL
133 | ziehen zuhet V;IND;PST;2;PL
134 | enpfinden enpfinden V;SBJV;PRS;1;PL
135 | slahen sluogen V;IND;PST;1;PL
136 | swimmen swummen V;IND;PST;1;PL
137 | göu göuwes N;GEN;SG
138 | beginnen begunnen V.PTCP;PST
139 | sieden süten V;SBJV;PST;1;PL
140 | rinnen rinnen V;IND;PRS;1;PL
141 | betriegen betriuge V;IND;PRS;1;SG
142 | slahen slaget V;SBJV;PRS;3;SG
143 | biegen biegen V;SBJV;PRS;3;PL
144 | kiesen kurn V;IND;PST;1;PL
145 | tac tage N;GEN;PL
146 | triegen trugen V;IND;PST;1;PL
147 | enpfinden enpfinden V;IMP;1;PL
148 | kiesen kiesen V;SBJV;PRS;1;PL
149 | slahen slüege V;IND;PST;2;SG
150 | enpfinden enpfindet V;IMP;2;PL
151 | göu göuwe N;DAT;SG
152 | wahsen wüehse V;SBJV;PST;3;SG
153 | rinnen runnet V;IND;PST;2;PL
154 | werden ward V;IND;PST;3;SG
155 | betriegen betrogen V.PTCP;PST
156 | singen süngen V;SBJV;PST;1;PL
157 | werden würde V;IND;PST;2;SG
158 | helfen hëlfen V;IND;PRS;1;PL
159 | triegen triegen V;SBJV;PRS;3;PL
160 | betriegen betriuget V;IND;PRS;3;SG
161 | enpfinden enpfündest V;SBJV;PST;2;SG
162 | swern swere V;IND;PRS;1;SG
163 | wahsen gewahsen V.PTCP;PST
164 | göu göuwen N;DAT;PL
165 | engel engele N;ACC;PL
166 | kiesen kiesen V;IMP;1;PL
167 | enpfinden enpfindet V;SBJV;PRS;2;PL
168 | verswinden verswindent V;IND;PRS;3;PL
169 | trinken trinken V;SBJV;PRS;3;PL
170 | werfen wirf V;IMP;2;SG
171 | ziehen zühen V;SBJV;PST;1;PL
172 | sieden süte V;SBJV;PST;1;SG
173 | binden bindet V;IND;PRS;2;PL
174 | verswinden verswünde V;IND;PST;2;SG
175 | swimmen swimmen V;NFIN
176 | rinnen rinne V;SBJV;PRS;1;SG
177 | überganc überganc N;NOM;SG
178 | verswinden verswinden V;NFIN
179 | tac tages N;GEN;SG
180 | kiesen küsen V;SBJV;PST;3;PL
181 | ziehen ziehest V;SBJV;PRS;2;SG
182 | singen singe V;SBJV;PRS;1;SG
183 | wahsen wüehse V;IND;PST;2;SG
184 | burcgrave burcgrâven N;ACC;PL
185 | betriegen betriege V;SBJV;PRS;1;SG
186 | burcgrave burcgrâven N;GEN;PL
187 | ziehen ziehen V;SBJV;PRS;3;PL
188 | singen sungen V;IND;PST;1;PL
189 | singen sünge V;IND;PST;2;SG
190 | rinnen rinnest V;IND;PRS;2;SG
191 | biegen bugen V;IND;PST;1;PL
192 | vinden vinden V;SBJV;PRS;1;PL
193 | singen sunget V;IND;PST;2;PL
194 | wahsen wahs V;IMP;2;SG
195 | singen sünge V;SBJV;PST;3;SG
196 | swimmen swimmet V;IND;PRS;3;SG
197 | werden wurden V;IND;PST;1;PL
198 | rinnen rinnen V;SBJV;PRS;3;PL
199 | grave grâven N;ACC;SG
200 | triegen trouc V;IND;PST;3;SG
201 | biegen bouc V;IND;PST;3;SG
202 | enpfinden enpfindent V;IND;PRS;3;PL
203 | triegen triege V;SBJV;PRS;1;SG
204 | vinden vünden V;SBJV;PST;3;PL
205 | triegen triegen V;SBJV;PRS;1;PL
206 | enpfinden enpfindet V;SBJV;PRS;3;SG
207 | triegen trugen V;IND;PST;3;PL
208 | helfen hëlfet V;IND;PRS;2;PL
209 | vinden vindent V;IND;PRS;3;PL
210 | triegen triegent V;IND;PRS;3;PL
211 | werfen wërfende V.PTCP;PRS
212 | werfen würfen V;SBJV;PST;3;PL
213 | wahsen wuohs V;IND;PST;1;SG
214 | ziehen ziehen V;IMP;1;PL
215 | swimmen swimmen V;IMP;1;PL
216 | enpfinden enpfindet V;IND;PRS;3;SG
217 | vinden vunden V;IND;PST;1;PL
218 | sieden sieden V;NFIN
219 | biegen biuc V;IMP;2;SG
220 | slahen slagen V;SBJV;PRS;1;PL
221 | kiesen kiesent V;IND;PRS;3;PL
222 | beginnen begunnen V;IND;PST;1;PL
223 | sieden suten V;IND;PST;3;PL
224 | helfen hëlfe V;SBJV;PRS;1;SG
225 | biegen biegest V;SBJV;PRS;2;SG
226 | vinden vindet V;IND;PRS;2;PL
227 | triegen triegen V;NFIN
228 | wahsen wahset V;IND;PRS;2;PL
229 | kiesen kieset V;SBJV;PRS;2;PL
230 | swimmen swimm V;IMP;2;SG
231 | swimmen swümmen V;SBJV;PST;1;PL
232 | binden binden V;IND;PRS;1;PL
233 | swern sweren V;IMP;1;PL
234 | biegen bieget V;SBJV;PRS;2;PL
235 | ziehen zühet V;SBJV;PST;2;PL
236 | enpfinden enpfünden V;SBJV;PST;3;PL
237 | göugrave göugrâven N;DAT;PL
238 | sieden süten V;SBJV;PST;3;PL
239 | verswinden verswinde V;IND;PRS;1;SG
240 | kiesen küset V;SBJV;PST;2;PL
241 | vinden vinde V;SBJV;PRS;1;SG
242 | betriegen betrieget V;IND;PRS;2;PL
243 | enpfinden enpfunden V.PTCP;PST
244 | grave grâven N;DAT;PL
245 | trinken trünke V;IND;PST;2;SG
246 | singen singest V;SBJV;PRS;2;SG
247 | werfen wërfent V;IND;PRS;3;PL
248 | überganc übergange N;NOM;PL
249 | swern swern V;NFIN
250 | kiesen kieset V;IND;PRS;2;PL
251 | beginnen beginnende V.PTCP;PRS
252 | verswinden verswindet V;SBJV;PRS;2;PL
253 | triegen triuge V;IND;PRS;1;SG
254 | singen singet V;IMP;2;PL
255 | rinnen rinne V;IND;PRS;1;SG
256 | werfen wirfet V;IND;PRS;3;SG
257 | rinnen rünne V;SBJV;PST;3;SG
258 | beginnen beginnen V;IMP;1;PL
259 | trinken trunket V;IND;PST;2;PL
260 | werden wërden V;IND;PRS;1;PL
261 | swimmen swimmen V;SBJV;PRS;1;PL
262 | rinnen rünnen V;SBJV;PST;1;PL
263 | swimmen swamm V;IND;PST;1;SG
264 | rinnen rann V;IND;PST;1;SG
265 | grave grâven N;DAT;SG
266 | swern swerest V;IND;PRS;2;SG
267 | verswinden verswinde V;SBJV;PRS;1;SG
268 | tac tage N;DAT;SG
269 | werden würde V;SBJV;PST;1;SG
270 | swimmen swimmet V;IND;PRS;2;PL
271 | sieden sütest V;SBJV;PST;2;SG
272 | singen sang V;IND;PST;3;SG
273 | engel engel N;NOM;SG
274 | trinken trünken V;SBJV;PST;1;PL
275 | swimmen swümmen V;SBJV;PST;3;PL
276 | burcgrave burcgrâven N;GEN;SG
277 | göugrave göugrâven N;ACC;SG
278 | triegen triugest V;IND;PRS;2;SG
279 | swern swuoret V;IND;PST;2;PL
280 | überganc übergange N;ACC;PL
281 | engel engel N;ACC;SG
282 | singen sungen V;IND;PST;3;PL
283 | swimmen swümme V;SBJV;PST;1;SG
284 | engel engeles N;GEN;SG
285 | swern sweret V;IND;PRS;2;PL
286 | singen gesungen V.PTCP;PST
287 | swimmen swimmet V;SBJV;PRS;3;SG
288 | rinnen gerunnen V.PTCP;PST
289 | werfen wirfe V;IND;PRS;1;SG
290 | rinnen rünnest V;SBJV;PST;2;SG
291 | trinken trinke V;IND;PRS;1;SG
292 | helfen hülfen V;SBJV;PST;3;PL
293 | singen süngen V;SBJV;PST;3;PL
294 | rinnen rinnest V;SBJV;PRS;2;SG
295 | rinnen rünnet V;SBJV;PST;2;PL
296 | verswinden verswand V;IND;PST;3;SG
297 | beginnen begünne V;SBJV;PST;1;SG
298 | kiesen küsest V;SBJV;PST;2;SG
299 | burcgrave burcgrâve N;NOM;SG
300 | kiesen kius V;IMP;2;SG
301 | biegen büget V;SBJV;PST;2;PL
302 | verswinden verswünden V;SBJV;PST;3;PL
303 | ziehen zôch V;IND;PST;1;SG
304 | werden wërden V;SBJV;PRS;1;PL
305 | triegen trügen V;SBJV;PST;1;PL
306 | kiesen küse V;SBJV;PST;1;SG
307 | triegen triegest V;SBJV;PRS;2;SG
308 | enpfinden enpfinden V;IND;PRS;1;PL
309 | singen singet V;SBJV;PRS;2;PL
310 | biegen biegende V.PTCP;PRS
311 | verswinden verswunden V;IND;PST;1;PL
312 | kiesen kieset V;IMP;2;PL
313 | grave grâven N;GEN;SG
314 | rinnen rünne V;SBJV;PST;1;SG
315 | swimmen swimmen V;IND;PRS;1;PL
316 | ziehen ziehet V;SBJV;PRS;2;PL
317 | ziehen zuhen V;IND;PST;1;PL
318 | swern swuor V;IND;PST;3;SG
319 | werden wird V;IMP;2;SG
320 | wahsen wuohset V;IND;PST;2;PL
321 | wahsen wahsent V;IND;PRS;3;PL
322 | bruoder bruodere N;DAT;SG
323 | slahen slagende V.PTCP;PRS
324 | werden würden V;SBJV;PST;3;PL
325 | wahsen wahsen V;IMP;1;PL
326 | göugrave göugrâven N;GEN;SG
327 | betriegen betrouc V;IND;PST;3;SG
328 | betriegen betrüge V;SBJV;PST;1;SG
329 | werfen würfet V;SBJV;PST;2;PL
330 | vinden vinden V;IND;PRS;1;PL
331 | helfen hëlfen V;SBJV;PRS;3;PL
332 | slahen slüegest V;SBJV;PST;2;SG
333 | sieden siudet V;IND;PRS;3;SG
334 | wahsen wahsen V;SBJV;PRS;1;PL
335 | göu göu N;ACC;SG
336 | rinnen rinnet V;IMP;2;PL
337 | kiesen küse V;IND;PST;2;SG
338 | beginnen beginnen V;SBJV;PRS;3;PL
339 | bruoder bruoder N;ACC;SG
340 | helfen hülfet V;SBJV;PST;2;PL
341 | werfen wërfen V;IMP;1;PL
342 | triegen trügest V;SBJV;PST;2;SG
343 | trinken trinken V;SBJV;PRS;1;PL
344 | sieden siude V;IND;PRS;1;SG
345 | tac tage N;ACC;PL
346 | kiesen kurn V;IND;PST;3;PL
347 | verswinden verswinden V;IMP;1;PL
348 | kiesen kiesende V.PTCP;PRS
349 | binden bunden V;IND;PST;3;PL
350 | rinnen rinnende V.PTCP;PRS
351 | trinken trünkest V;SBJV;PST;2;SG
352 | trinken trinket V;SBJV;PRS;3;SG
353 | helfen hülfe V;SBJV;PST;3;SG
354 | ziehen zuhen V;IND;PST;3;PL
355 | biegen biege V;SBJV;PRS;1;SG
356 | binden bünden V;SBJV;PST;3;PL
357 | vinden vand V;IND;PST;1;SG
358 | ziehen ziehet V;SBJV;PRS;3;SG
359 | singen singen V;NFIN
360 | sieden siedet V;SBJV;PRS;3;SG
361 | slahen slage V;IND;PRS;1;SG
362 | bruoder brüedern N;DAT;PL
363 | verswinden verswinden V;SBJV;PRS;3;PL
364 | ziehen ziuch V;IMP;2;SG
365 | slahen sluoget V;IND;PST;2;PL
366 | binden bindet V;SBJV;PRS;2;PL
367 | helfen hilfe V;IND;PRS;1;SG
368 | betriegen betrouc V;IND;PST;1;SG
369 | helfen hülfe V;IND;PST;2;SG
370 | enpfinden enpfünde V;IND;PST;2;SG
371 | slahen sluoc V;IND;PST;1;SG
372 | rinnen rinnet V;SBJV;PRS;2;PL
373 | slahen sluoc V;IND;PST;3;SG
374 | helfen geholfen V.PTCP;PST
375 | sieden sieden V;SBJV;PRS;1;PL
376 | beginnen begann V;IND;PST;1;SG
377 | verswinden verswunden V;IND;PST;3;PL
378 | betriegen betrüge V;IND;PST;2;SG
379 | triegen trügen V;SBJV;PST;3;PL
380 | vinden vinde V;IND;PRS;1;SG
381 | singen singet V;SBJV;PRS;3;SG
382 | helfen hilfet V;IND;PRS;3;SG
383 | verswinden verswündet V;SBJV;PST;2;PL
384 | rinnen rinn V;IMP;2;SG
385 | vinden vindest V;IND;PRS;2;SG
386 | vinden vündet V;SBJV;PST;2;PL
387 | überganc überganges N;GEN;SG
388 | wahsen wüehsest V;SBJV;PST;2;SG
389 | betriegen betrieget V;SBJV;PRS;3;SG
390 | sieden suten V;IND;PST;1;PL
391 | binden binden V;SBJV;PRS;3;PL
392 | betriegen betriegen V;SBJV;PRS;1;PL
393 | trinken trank V;IND;PST;3;SG
394 | singen süngest V;SBJV;PST;2;SG
395 | singen singende V.PTCP;PRS
396 | beginnen beginn V;IMP;2;SG
397 | helfen hulfen V;IND;PST;3;PL
398 | singen sang V;IND;PST;1;SG
399 | werfen wërfet V;IND;PRS;2;PL
400 | beginnen begünnen V;SBJV;PST;1;PL
401 | rinnen rinnen V;SBJV;PRS;1;PL
402 | trinken trünket V;SBJV;PST;2;PL
403 | biegen bügen V;SBJV;PST;3;PL
404 | sieden siedest V;SBJV;PRS;2;SG
405 | ziehen ziehen V;IND;PRS;1;PL
406 | verswinden verswand V;IND;PST;1;SG
407 | beginnen beginne V;SBJV;PRS;1;SG
408 | werfen warf V;IND;PST;1;SG
409 | wahsen wahsest V;SBJV;PRS;2;SG
410 | triegen trieget V;IND;PRS;2;PL
411 | bruoder brüeder N;ACC;PL
412 | wahsen wahse V;SBJV;PRS;1;SG
413 | swern swüre V;IND;PST;2;SG
414 | biegen bügen V;SBJV;PST;1;PL
415 | werfen würfest V;SBJV;PST;2;SG
416 | triegen trouc V;IND;PST;1;SG
417 | verswinden verswindet V;IND;PRS;3;SG
418 | beginnen begünne V;SBJV;PST;3;SG
419 | werfen wërfe V;SBJV;PRS;1;SG
420 | bruoder brüeder N;NOM;PL
421 | wahsen wüehsen V;SBJV;PST;3;PL
422 | betriegen betriegen V;IMP;1;PL
423 | vinden vindende V.PTCP;PRS
424 | kiesen kiesen V;IND;PRS;1;PL
425 | trinken trinkende V.PTCP;PRS
426 | beginnen beginnest V;SBJV;PRS;2;SG
427 | ziehen ziuhest V;IND;PRS;2;SG
428 | wahsen wahsende V.PTCP;PRS
429 | binden bindent V;IND;PRS;3;PL
430 | swern swerent V;IND;PRS;3;PL
431 | triegen triegen V;IMP;1;PL
432 | verswinden verswunden V.PTCP;PST
433 | werden wërdent V;IND;PRS;3;PL
434 | binden bindest V;IND;PRS;2;SG
435 | slahen sleget V;IND;PRS;3;SG
436 | werden wërdet V;IMP;2;PL
437 | kiesen kuret V;IND;PST;2;PL
438 | helfen hilf V;IMP;2;SG
439 | rinnen rünne V;IND;PST;2;SG
440 | singen singen V;SBJV;PRS;1;PL
441 | werfen wërfet V;IMP;2;PL
442 | werfen wirfest V;IND;PRS;2;SG
443 | betriegen betriegent V;IND;PRS;3;PL
444 | swimmen swümmest V;SBJV;PST;2;SG
445 | wahsen wahse V;IND;PRS;1;SG
446 | wahsen wahsen V;NFIN
447 | helfen half V;IND;PST;3;SG
448 | swimmen swimmen V;SBJV;PRS;3;PL
449 | verswinden verswinden V;SBJV;PRS;1;PL
450 | werden wërdest V;SBJV;PRS;2;SG
451 | betriegen betriegen V;SBJV;PRS;3;PL
452 | ziehen zühe V;IND;PST;2;SG
453 | swimmen swimmest V;SBJV;PRS;2;SG
454 | trinken trinken V;IMP;1;PL
455 | triegen triegen V;IND;PRS;1;PL
456 | beginnen beginnen V;SBJV;PRS;1;PL
457 | überganc übergangen N;DAT;PL
458 | sieden siudest V;IND;PRS;2;SG
459 | enpfinden enpfand V;IND;PST;3;SG
460 | ziehen ziehen V;NFIN
461 | slahen slac V;IMP;2;SG
462 | vinden vinden V;SBJV;PRS;3;PL
463 | werfen wurfen V;IND;PST;1;PL
464 | engel engelen N;DAT;PL
465 | vinden vündest V;SBJV;PST;2;SG
466 | ziehen zühe V;SBJV;PST;3;SG
467 | betriegen betrugen V;IND;PST;1;PL
468 | enpfinden enpfünden V;SBJV;PST;1;PL
469 | wahsen wüehse V;SBJV;PST;1;SG
470 | trinken trink V;IMP;2;SG
471 | göugrave göugrâven N;DAT;SG
472 | verswinden verswind V;IMP;2;SG
473 | binden gebunden V.PTCP;PST
474 | helfen hulfen V;IND;PST;1;PL
475 | vinden vand V;IND;PST;3;SG
476 | trinken trinkent V;IND;PRS;3;PL
477 | ziehen zühe V;SBJV;PST;1;SG
478 | binden binden V;SBJV;PRS;1;PL
479 | slahen slagen V;IMP;1;PL
480 | swern sweret V;SBJV;PRS;3;SG
481 | göu göuwe N;NOM;PL
482 | werfen warf V;IND;PST;3;SG
483 | enpfinden enpfindest V;SBJV;PRS;2;SG
484 | triegen trieget V;IMP;2;PL
485 | verswinden verswünde V;SBJV;PST;3;SG
486 | betriegen betrüget V;SBJV;PST;2;PL
487 | sieden siedent V;IND;PRS;3;PL
488 | enpfinden enpfündet V;SBJV;PST;2;PL
489 | werden wërdet V;IND;PRS;2;PL
490 | slahen slagen V;IND;PRS;1;PL
491 | betriegen betrügest V;SBJV;PST;2;SG
492 | swern swüre V;SBJV;PST;1;SG
493 | beginnen beginnent V;IND;PRS;3;PL
494 | triegen getrogen V.PTCP;PST
495 | binden bünde V;SBJV;PST;1;SG
496 | kiesen kiusest V;IND;PRS;2;SG
497 | helfen hëlfet V;SBJV;PRS;2;PL
498 | vinden vindet V;SBJV;PRS;3;SG
499 | biegen biugest V;IND;PRS;2;SG
500 | triegen triegende V.PTCP;PRS
501 | werden wirdet V;IND;PRS;3;SG
502 | rinnen rünnen V;SBJV;PST;3;PL
503 | biegen gebogen V.PTCP;PST
504 | kiesen kiese V;SBJV;PRS;1;SG
505 | beginnen begünnet V;SBJV;PST;2;PL
506 | verswinden verswündest V;SBJV;PST;2;SG
507 | swern sweret V;IMP;2;PL
508 | grave grâven N;ACC;PL
509 | kiesen kôs V;IND;PST;3;SG
510 | slahen slagent V;IND;PRS;3;PL
511 | biegen bouc V;IND;PST;1;SG
512 | enpfinden enpfundet V;IND;PST;2;PL
513 | kiesen gekorn V.PTCP;PST
514 | sieden siedet V;IMP;2;PL
515 | göu göuwe N;ACC;PL
516 | biegen biegen V;SBJV;PRS;1;PL
517 | verswinden verswundet V;IND;PST;2;PL
518 | biegen biegen V;NFIN
519 | helfen hulfet V;IND;PST;2;PL
520 | beginnen beginnet V;IND;PRS;2;PL
521 | verswinden verswinden V;IND;PRS;1;PL
522 | wahsen wüehsen V;SBJV;PST;1;PL
523 | biegen büge V;SBJV;PST;1;SG
524 | enpfinden enpfindest V;IND;PRS;2;SG
525 | wahsen wüehset V;SBJV;PST;2;PL
526 | engel engele N;NOM;PL
527 | beginnen beginne V;IND;PRS;1;SG
528 | rinnen rinnet V;IND;PRS;3;SG
529 | biegen bieget V;SBJV;PRS;3;SG
530 | verswinden verswindet V;IMP;2;PL
531 | werden würdet V;SBJV;PST;2;PL
532 | kiesen kiesen V;SBJV;PRS;3;PL
533 | triegen trieget V;SBJV;PRS;3;SG
534 | betriegen betrügen V;SBJV;PST;3;PL
535 | swern swüren V;SBJV;PST;3;PL
536 | betriegen betriegende V.PTCP;PRS
537 | kiesen kiuse V;IND;PRS;1;SG
538 | beginnen beginnet V;IND;PRS;3;SG
539 | biegen bieget V;IND;PRS;2;PL
540 | swimmen swummet V;IND;PST;2;PL
541 | binden bünden V;SBJV;PST;1;PL
542 | überganc übergange N;GEN;PL
543 | binden bundet V;IND;PST;2;PL
544 | slahen slüegen V;SBJV;PST;3;PL
545 | swern geswarn V.PTCP;PST
546 | vinden vundet V;IND;PST;2;PL
547 | singen sünget V;SBJV;PST;2;PL
548 | werfen wërfen V;NFIN
549 | biegen büge V;IND;PST;2;SG
550 | tac tac N;NOM;SG
551 | binden band V;IND;PST;1;SG
552 | sieden sieden V;SBJV;PRS;3;PL
553 | wahsen wahset V;IMP;2;PL
554 | sieden siedet V;IND;PRS;2;PL
555 | swern swuor V;IND;PST;1;SG
556 | biegen biuget V;IND;PRS;3;SG
557 | sieden süde V;SBJV;PST;3;SG
558 | sieden sôt V;IND;PST;3;SG
559 | verswinden verswünde V;SBJV;PST;1;SG
560 | ziehen ziuhe V;IND;PRS;1;SG
561 | betriegen betruget V;IND;PST;2;PL
562 | vinden vinden V;IMP;1;PL
563 | slahen slaget V;IMP;2;PL
564 | rinnen rann V;IND;PST;3;SG
565 | sieden siut V;IMP;2;SG
566 | bruoder bruoders N;GEN;SG
567 | swern swerest V;SBJV;PRS;2;SG
568 | binden bündest V;SBJV;PST;2;SG
569 | wahsen wuohsen V;IND;PST;1;PL
570 | wahsen wehset V;IND;PRS;3;SG
571 | slahen slüege V;SBJV;PST;1;SG
572 | grave grâven N;GEN;PL
573 | triegen triuget V;IND;PRS;3;SG
574 | vinden vünden V;SBJV;PST;1;PL
575 | trinken getrunken V.PTCP;PST
576 | vinden vünde V;SBJV;PST;3;SG
577 | wahsen wuohsen V;IND;PST;3;PL
578 | werfen würfe V;SBJV;PST;1;SG
579 | kiesen küse V;SBJV;PST;3;SG
580 | binden bünde V;SBJV;PST;3;SG
581 | trinken trünke V;SBJV;PST;1;SG
582 | wahsen wehsest V;IND;PRS;2;SG
583 | werfen wërfet V;SBJV;PRS;3;SG
584 | swern sweret V;SBJV;PRS;2;PL
585 | ziehen ziuhet V;IND;PRS;3;SG
586 | swimmen swimmest V;IND;PRS;2;SG
587 | biegen bieget V;IMP;2;PL
588 | swimmen swummen V;IND;PST;3;PL
589 | beginnen begunnen V;IND;PST;3;PL
590 | sieden siede V;SBJV;PRS;1;SG
591 | rinnen rinnet V;IND;PRS;2;PL
592 | beginnen begann V;IND;PST;3;SG
593 | helfen hëlfen V;SBJV;PRS;1;PL
594 | sieden siedet V;SBJV;PRS;2;PL
595 |
--------------------------------------------------------------------------------
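The training files above follow the SIGMORPHON shared-task inflection format: one example per line, giving a lemma, the inflected target form, and a UniMorph-style tag bundle such as V;SBJV;PST;3;SG. A minimal reading sketch is shown below (assuming tab-separated columns, as in the shared-task releases; data.py and dataset.py in this repo remain the authoritative loaders):

# Minimal sketch for reading an inflection file like the one above.
# ASSUMPTION: columns are tab-separated (SIGMORPHON convention).
def read_inflection_file(path):
    """Yield (lemma, target_form, tags) triples, e.g.
    ('verswinden', 'verswünde', ['V', 'SBJV', 'PST', '3', 'SG'])."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            lemma, target, tag_str = line.split("\t")
            yield lemma, target, tag_str.split(";")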
/transformer_baseline-original.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | from collections import namedtuple
4 |
5 | import numpy as np
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from torch.distributions import Distribution
10 |
11 |
12 | PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2  # CHANGED LINE
13 |
14 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15 |
16 |
17 | class SinusoidalPositionalEmbedding(nn.Module):
18 | """This module produces sinusoidal positional embeddings of any length.
19 | Padding symbols are ignored.
20 | """
21 | def __init__(self, embedding_dim, padding_idx, init_size=1024):
22 | super().__init__()
23 | self.embedding_dim = embedding_dim
24 | self.padding_idx = padding_idx
25 | self.weights = SinusoidalPositionalEmbedding.get_embedding(
26 | init_size,
27 | embedding_dim,
28 | padding_idx,
29 | )
30 | self.register_buffer('_float_tensor', torch.FloatTensor(1))
31 |
32 | @staticmethod
33 | def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
34 | """Build sinusoidal embeddings.
35 | This matches the implementation in tensor2tensor, but differs slightly
36 | from the description in Section 3.5 of "Attention Is All You Need".
37 | """
38 | half_dim = embedding_dim // 2
39 | emb = math.log(10000) / (half_dim - 1)
40 | emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
41 | emb = torch.arange(num_embeddings,
42 | dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
43 | emb = torch.cat([torch.sin(emb), torch.cos(emb)],
44 | dim=1).view(num_embeddings, -1)
45 | if embedding_dim % 2 == 1:
46 | # zero pad
47 | emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
48 | if padding_idx is not None:
49 | emb[padding_idx, :] = 0
50 | return emb
51 |
52 | def forward(self, input):
53 | """Input is expected to be of size [bsz x seqlen]."""
54 | bsz, seq_len = input.shape
55 | max_pos = self.padding_idx + 1 + seq_len
56 | if self.weights is None or max_pos > self.weights.size(0):
57 | # recompute/expand embeddings if needed
58 | self.weights = SinusoidalPositionalEmbedding.get_embedding(
59 | max_pos,
60 | self.embedding_dim,
61 | self.padding_idx,
62 | )
63 | self.weights = self.weights.to(self._float_tensor)
64 |
65 | mask = input.ne(self.padding_idx).long()
66 | positions = torch.cumsum(mask, dim=0) * mask + self.padding_idx
67 | return self.weights.index_select(0, positions.view(-1)).view(
68 | bsz, seq_len, -1).detach()
69 |
70 |
71 | class TransformerEncoderLayer(nn.Module):
72 | def __init__(self,
73 | d_model,
74 | nhead,
75 | dim_feedforward=2048,
76 | dropout=0.1,
77 | attention_dropout=0.1,
78 | activation_dropout=0.1,
79 | activation='relu',
80 | normalize_before=True):
81 | super(TransformerEncoderLayer, self).__init__()
82 | self.normalize_before = normalize_before
83 | self.self_attn = nn.MultiheadAttention(d_model,
84 | nhead,
85 | dropout=attention_dropout)
86 | # Implementation of Feedforward model
87 | self.linear1 = Linear(d_model, dim_feedforward)
88 | self.dropout = nn.Dropout(dropout)
89 | self.linear2 = Linear(dim_feedforward, d_model)
90 |
91 | self.norm1 = nn.LayerNorm(d_model)
92 | self.norm2 = nn.LayerNorm(d_model)
93 | self.activation_dropout = nn.Dropout(activation_dropout)
94 |
95 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation]
96 |
97 | def forward(self, src, src_mask=None, src_key_padding_mask=None):
98 |         r"""Pass the input through the encoder layer.
99 |
100 | Args:
101 |             src: the sequence to the encoder layer (required).
102 | src_mask: the mask for the src sequence (optional).
103 | src_key_padding_mask: the mask for the src keys per batch (optional).
104 | """
105 | # Self attention block
106 | residual = src
107 | if self.normalize_before:
108 | src = self.norm1(src)
109 | src = self.self_attn(src,
110 | src,
111 | src,
112 | attn_mask=src_mask,
113 | key_padding_mask=src_key_padding_mask)[0]
114 | src = residual + self.dropout(src)
115 | if not self.normalize_before:
116 | src = self.norm1(src)
117 | # Feed forward block
118 | residual = src
119 | if self.normalize_before:
120 | src = self.norm2(src)
121 | src = self.activation(self.linear1(src))
122 | src = self.activation_dropout(src)
123 | src = self.linear2(src)
124 | src = residual + self.dropout(src)
125 | if not self.normalize_before:
126 | src = self.norm2(src)
127 | return src
128 |
129 |
130 | class TransformerDecoderLayer(nn.Module):
131 | def __init__(self,
132 | d_model,
133 | nhead,
134 | dim_feedforward=2048,
135 | dropout=0.1,
136 | attention_dropout=0.1,
137 | activation_dropout=0.1,
138 | activation='relu',
139 | normalize_before=True):
140 | super(TransformerDecoderLayer, self).__init__()
141 | self.normalize_before = normalize_before
142 | self.self_attn = nn.MultiheadAttention(d_model,
143 | nhead,
144 | dropout=attention_dropout)
145 | self.multihead_attn = nn.MultiheadAttention(d_model,
146 | nhead,
147 | dropout=dropout)
148 | # Implementation of Feedforward model
149 | self.linear1 = Linear(d_model, dim_feedforward)
150 | self.dropout = nn.Dropout(dropout)
151 | self.linear2 = Linear(dim_feedforward, d_model)
152 |
153 | self.norm1 = nn.LayerNorm(d_model)
154 | self.norm2 = nn.LayerNorm(d_model)
155 | self.norm3 = nn.LayerNorm(d_model)
156 | self.activation_dropout = nn.Dropout(activation_dropout)
157 |
158 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation]
159 |
160 | def forward(self,
161 | tgt,
162 | memory,
163 | tgt_mask=None,
164 | memory_mask=None,
165 | tgt_key_padding_mask=None,
166 | memory_key_padding_mask=None):
167 | r"""Pass the inputs (and mask) through the decoder layer.
168 |
169 | Args:
170 | tgt: the sequence to the decoder layer (required).
171 |             memory: the sequence from the last layer of the encoder (required).
172 | tgt_mask: the mask for the tgt sequence (optional).
173 | memory_mask: the mask for the memory sequence (optional).
174 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
175 | memory_key_padding_mask: the mask for the memory keys per batch (optional).
176 | """
177 | # self attention block
178 | residual = tgt
179 | if self.normalize_before:
180 | tgt = self.norm1(tgt)
181 | tgt = self.self_attn(tgt,
182 | tgt,
183 | tgt,
184 | attn_mask=tgt_mask,
185 | key_padding_mask=tgt_key_padding_mask)[0]
186 | tgt = residual + self.dropout(tgt)
187 | if not self.normalize_before:
188 | tgt = self.norm1(tgt)
189 | # cross attention block
190 | residual = tgt
191 | if self.normalize_before:
192 | tgt = self.norm2(tgt)
193 | tgt = self.multihead_attn(tgt,
194 | memory,
195 | memory,
196 | attn_mask=memory_mask,
197 | key_padding_mask=memory_key_padding_mask)[0]
198 | tgt = residual + self.dropout(tgt)
199 | if not self.normalize_before:
200 | tgt = self.norm2(tgt)
201 | # feed forward block
202 | residual = tgt
203 | if self.normalize_before:
204 | tgt = self.norm3(tgt)
205 | tgt = self.activation(self.linear1(tgt))
206 | tgt = self.activation_dropout(tgt)
207 | tgt = self.linear2(tgt)
208 | tgt = residual + self.dropout(tgt)
209 | if not self.normalize_before:
210 | tgt = self.norm3(tgt)
211 | return tgt
212 |
213 |
214 | class Transformer(nn.Module):
215 | def __init__(self, *, src_vocab_size, trg_vocab_size, embed_dim, nb_heads,
216 | src_hid_size, src_nb_layers, trg_hid_size, trg_nb_layers,
217 | dropout_p, tie_trg_embed, src_c2i, trg_c2i, attr_c2i,
218 | label_smooth, **kwargs):
219 | '''
220 | init
221 | '''
222 | super().__init__()
223 | self.src_vocab_size = src_vocab_size
224 | self.trg_vocab_size = trg_vocab_size
225 | self.embed_dim = embed_dim
226 | self.embed_scale = math.sqrt(embed_dim)
227 | self.nb_heads = nb_heads
228 | self.src_hid_size = src_hid_size
229 | self.src_nb_layers = src_nb_layers
230 | self.trg_hid_size = trg_hid_size
231 | self.trg_nb_layers = trg_nb_layers
232 | self.dropout_p = dropout_p
233 | self.tie_trg_embed = tie_trg_embed
234 | self.label_smooth = label_smooth
235 | self.src_c2i, self.trg_c2i, self.attr_c2i = src_c2i, trg_c2i, attr_c2i
236 | self.src_embed = Embedding(src_vocab_size,
237 | embed_dim,
238 | padding_idx=PAD_IDX)
239 | self.trg_embed = Embedding(trg_vocab_size,
240 | embed_dim,
241 | padding_idx=PAD_IDX)
242 | self.position_embed = SinusoidalPositionalEmbedding(embed_dim, PAD_IDX)
243 | encoder_layer = TransformerEncoderLayer(d_model=embed_dim,
244 | nhead=nb_heads,
245 | dim_feedforward=src_hid_size,
246 | dropout=dropout_p,
247 | attention_dropout=dropout_p,
248 | activation_dropout=dropout_p,
249 | normalize_before=True)
250 | self.encoder = nn.TransformerEncoder(encoder_layer,
251 | num_layers=src_nb_layers,
252 | norm=nn.LayerNorm(embed_dim))
253 | decoder_layer = TransformerDecoderLayer(d_model=embed_dim,
254 | nhead=nb_heads,
255 | dim_feedforward=trg_hid_size,
256 | dropout=dropout_p,
257 | attention_dropout=dropout_p,
258 | activation_dropout=dropout_p,
259 | normalize_before=True)
260 | self.decoder = nn.TransformerDecoder(decoder_layer,
261 | num_layers=trg_nb_layers,
262 | norm=nn.LayerNorm(embed_dim))
263 | self.final_out = Linear(embed_dim, trg_vocab_size)
264 | if tie_trg_embed:
265 | self.final_out.weight = self.trg_embed.weight
266 | self.dropout = nn.Dropout(dropout_p)
267 | # self._reset_parameters()
268 |
269 | def embed(self, src_batch, src_mask):
270 | # ------------
271 | # word_embed = self.embed_scale * self.src_embed(src_batch)
272 | word_embed = self.src_embed(src_batch)
273 | # ------------
274 | pos_embed = self.position_embed(src_batch)
275 | embed = self.dropout(word_embed + pos_embed)
276 | return embed
277 |
278 | def encode(self, src_batch, src_mask):
279 | embed = self.embed(src_batch, src_mask)
280 | return self.encoder(embed, src_key_padding_mask=src_mask)
281 |
282 | def decode(self, enc_hs, src_mask, trg_batch, trg_mask):
283 | # --------------
284 | # word_embed = self.embed_scale * self.trg_embed(trg_batch)
285 | word_embed = self.trg_embed(trg_batch)
286 | # --------------
287 | pos_embed = self.position_embed(trg_batch)
288 | embed = self.dropout(word_embed + pos_embed)
289 |
290 | trg_seq_len = trg_batch.size(0)
291 | causal_mask = self.generate_square_subsequent_mask(trg_seq_len)
292 | dec_hs = self.decoder(embed,
293 | enc_hs,
294 | tgt_mask=causal_mask,
295 | tgt_key_padding_mask=trg_mask,
296 | memory_key_padding_mask=src_mask)
297 | return F.log_softmax(self.final_out(dec_hs), dim=-1)
298 |
299 | def forward(self, src_batch, src_mask, trg_batch, trg_mask):
300 | '''
301 | only for training
302 | '''
303 | src_mask = (src_mask == 0).transpose(0, 1)
304 | trg_mask = (trg_mask == 0).transpose(0, 1)
305 | # trg_seq_len, batch_size = trg_batch.size()
306 | enc_hs = self.encode(src_batch, src_mask)
307 |         # output: [trg_seq_len, batch_size, vocab_size]
308 | output = self.decode(enc_hs, src_mask, trg_batch, trg_mask)
309 | return output
310 |
311 | def count_nb_params(self):
312 | model_parameters = filter(lambda p: p.requires_grad, self.parameters())
313 | params = sum([np.prod(p.size()) for p in model_parameters])
314 | return params
315 |
316 | def loss(self, predict, target):
317 | '''
318 | compute loss
319 | '''
320 | predict = predict.view(-1, self.trg_vocab_size)
321 | # nll_loss = F.nll_loss(predict, target.view(-1), ignore_index=PAD_IDX)
322 | target = target.contiguous().view(-1, 1)
323 | non_pad_mask = target.ne(PAD_IDX)
324 | nll_loss = -predict.gather(dim=-1, index=target)[non_pad_mask].mean()
325 | smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad_mask].mean()
326 | smooth_loss = smooth_loss / self.trg_vocab_size
327 | loss = (1. -
328 | self.label_smooth) * nll_loss + self.label_smooth * smooth_loss
329 | return loss
330 |
331 | def get_loss(self, data):
332 | src, src_mask, trg, trg_mask = data
333 | out = self.forward(src, src_mask, trg, trg_mask)
334 | loss = self.loss(out[:-1], trg[1:])
335 | return loss
336 |
337 | def generate_square_subsequent_mask(self, sz):
338 | r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
339 | Unmasked positions are filled with float(0.0).
340 | """
341 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
342 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(
343 | mask == 1, float(0.0))
344 | return mask.to(DEVICE)
345 |
346 |
347 | class TagTransformer(Transformer):
348 | def __init__(self, *, nb_attr, **kwargs):
349 | super().__init__(**kwargs)
350 | self.nb_attr = nb_attr
351 | # 0 -> special token & tags, 1 -> character
352 | self.special_embeddings = Embedding(2, self.embed_dim)
353 |
354 | def embed(self, src_batch, src_mask):
355 | word_embed = self.embed_scale * self.src_embed(src_batch)
356 | char_mask = (src_batch < (self.src_vocab_size - self.nb_attr)).long()
357 | special_embed = self.embed_scale * self.special_embeddings(char_mask)
358 | pos_embed = self.position_embed(src_batch * char_mask)
359 | embed = self.dropout(word_embed + pos_embed + special_embed)
360 | return embed
361 |
362 |
363 | class UniversalTransformerEncoder(nn.Module):
364 | def __init__(self, encoder_layer, num_layers, norm=None):
365 | super(UniversalTransformerEncoder, self).__init__()
366 | self.encoder_layer = encoder_layer
367 | self.num_layers = num_layers
368 | self.norm = norm
369 |
370 | def forward(self, src, mask=None, src_key_padding_mask=None):
371 | output = src
372 |
373 | for i in range(self.num_layers):
374 | output = self.encoder_layer(
375 | output,
376 | src_mask=mask,
377 | src_key_padding_mask=src_key_padding_mask)
378 |
379 | if self.norm:
380 | output = self.norm(output)
381 |
382 | return output
383 |
384 |
385 | class UniversalTransformerDecoder(nn.Module):
386 | def __init__(self, decoder_layer, num_layers, norm=None):
387 | super(UniversalTransformerDecoder, self).__init__()
388 | self.decoder_layer = decoder_layer
389 | self.num_layers = num_layers
390 | self.norm = norm
391 |
392 | def forward(self,
393 | tgt,
394 | memory,
395 | tgt_mask=None,
396 | memory_mask=None,
397 | tgt_key_padding_mask=None,
398 | memory_key_padding_mask=None):
399 | output = tgt
400 |
401 | for i in range(self.num_layers):
402 | output = self.decoder_layer(
403 | output,
404 | memory,
405 | tgt_mask=tgt_mask,
406 | memory_mask=memory_mask,
407 | tgt_key_padding_mask=tgt_key_padding_mask,
408 | memory_key_padding_mask=memory_key_padding_mask)
409 |
410 | if self.norm:
411 | output = self.norm(output)
412 |
413 | return output
414 |
415 |
416 | class UniversalTransformer(Transformer):
417 | def __init__(self, **kwargs):
418 | super().__init__(**kwargs)
419 | encoder_layer = TransformerEncoderLayer(
420 | d_model=self.embed_dim,
421 | nhead=self.nb_heads,
422 | dim_feedforward=self.src_hid_size,
423 | dropout=self.dropout_p,
424 | attention_dropout=self.dropout_p,
425 | activation_dropout=self.dropout_p,
426 | normalize_before=True)
427 | self.encoder = UniversalTransformerEncoder(
428 | encoder_layer,
429 | num_layers=self.src_nb_layers,
430 | norm=nn.LayerNorm(self.embed_dim))
431 | decoder_layer = TransformerDecoderLayer(
432 | d_model=self.embed_dim,
433 | nhead=self.nb_heads,
434 | dim_feedforward=self.trg_hid_size,
435 | dropout=self.dropout_p,
436 | attention_dropout=self.dropout_p,
437 | activation_dropout=self.dropout_p,
438 | normalize_before=True)
439 | self.decoder = UniversalTransformerDecoder(
440 | decoder_layer,
441 | num_layers=self.trg_nb_layers,
442 | norm=nn.LayerNorm(self.embed_dim))
443 |
444 |
445 | class TagUniversalTransformer(TagTransformer, UniversalTransformer):
446 | pass
447 |
448 |
449 | def Embedding(num_embeddings, embedding_dim, padding_idx=None):
450 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
451 | nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
452 | if padding_idx is not None:
453 | nn.init.constant_(m.weight[padding_idx], 0)
454 | return m
455 |
456 |
457 | def Linear(in_features, out_features, bias=True):
458 | m = nn.Linear(in_features, out_features, bias)
459 | nn.init.xavier_uniform_(m.weight)
460 | if bias:
461 | nn.init.constant_(m.bias, 0.)
462 | return m
463 |
--------------------------------------------------------------------------------
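Transformer.loss above mixes token-level negative log-likelihood with a uniform smoothing term over the vocabulary, skipping PAD positions. A self-contained toy sketch of the same computation (illustrative shapes and values only, not the repo's training entry point):

import torch
import torch.nn.functional as F

PAD_IDX = 0
label_smooth = 0.1
vocab_size = 5

logits = torch.randn(4, vocab_size)                 # 4 target positions
predict = F.log_softmax(logits, dim=-1)             # log-probabilities, as decode() returns
target = torch.tensor([[3], [1], [4], [PAD_IDX]])   # last position is padding

non_pad = target.ne(PAD_IDX)
nll_loss = -predict.gather(dim=-1, index=target)[non_pad].mean()
smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad].mean() / vocab_size
loss = (1.0 - label_smooth) * nll_loss + label_smooth * smooth_loss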
/transformer_baseline.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | from collections import namedtuple
4 |
5 | import numpy as np
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from torch.distributions import Distribution
10 |
11 |
12 | PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2  # CHANGED LINE
13 |
14 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15 |
16 |
17 | # class SinusoidalPositionalEmbedding(nn.Module):
18 | # """This module produces sinusoidal positional embeddings of any length.
19 | # Padding symbols are ignored.
20 | # """
21 | # def __init__(self, embedding_dim, padding_idx, init_size=1024):
22 | # super().__init__()
23 | # self.embedding_dim = embedding_dim
24 | # self.padding_idx = padding_idx
25 | # self.weights = SinusoidalPositionalEmbedding.get_embedding(
26 | # init_size,
27 | # embedding_dim,
28 | # padding_idx,
29 | # )
30 | # self.register_buffer('_float_tensor', torch.FloatTensor(1))
31 | #
32 | # @staticmethod
33 | # def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
34 | # """Build sinusoidal embeddings.
35 | # This matches the implementation in tensor2tensor, but differs slightly
36 | # from the description in Section 3.5 of "Attention Is All You Need".
37 | # """
38 | # half_dim = embedding_dim // 2
39 | # emb = math.log(10000) / (half_dim - 1)
40 | # emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
41 | # emb = torch.arange(num_embeddings,
42 | # dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
43 | # emb = torch.cat([torch.sin(emb), torch.cos(emb)],
44 | # dim=1).view(num_embeddings, -1)
45 | # if embedding_dim % 2 == 1:
46 | # # zero pad
47 | # emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
48 | # if padding_idx is not None:
49 | # emb[padding_idx, :] = 0
50 | # return emb
51 | #
52 | # def forward(self, input):
53 | # """Input is expected to be of size [bsz x seqlen]."""
54 | # bsz, seq_len = input.shape
55 | # max_pos = self.padding_idx + 1 + seq_len
56 | # if self.weights is None or max_pos > self.weights.size(0):
57 | # # recompute/expand embeddings if needed
58 | # self.weights = SinusoidalPositionalEmbedding.get_embedding(
59 | # max_pos,
60 | # self.embedding_dim,
61 | # self.padding_idx,
62 | # )
63 | # self.weights = self.weights.to(self._float_tensor)
64 | #
65 | # mask = input.ne(self.padding_idx).long()
66 | # positions = torch.cumsum(mask, dim=0) * mask + self.padding_idx
67 | # return self.weights.index_select(0, positions.view(-1)).view(
68 | # bsz, seq_len, -1).detach()
69 |
70 | class SinusoidalPositionalEmbedding(nn.Module):
71 | """ Adds Sinusoidal positional encoding to sequences """
72 | def __init__(self, embedding_dim, dropout=0.1, max_seq_len=100):
73 | """ Initializes a seq_len x 1 x embedding_dim positional encoding matrix"""
74 | super(SinusoidalPositionalEmbedding, self).__init__()
75 | self.dropout = nn.Dropout(p=dropout)
76 |
77 | pe = torch.zeros(max_seq_len, embedding_dim)
78 | position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
79 | div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
80 | pe[:, 0::2] = torch.sin(position * div_term)
81 | pe[:, 1::2] = torch.cos(position * div_term)
82 | pe = pe.unsqueeze(0).transpose(0, 1)
83 | self.register_buffer('pe', pe)
84 |
85 | def forward(self, x):
86 | """ Adds positional encoding to the input.
87 | Input of dimensions (seq_len x batch_sz x embedding_dim).
88 | Adds positional encoding matrix (seq_len x 1 x embedding_dim) to every individual example in batch """
89 | x = x + self.pe[:x.size(0), :]
90 | return self.dropout(x)
91 |
92 |
93 | class TransformerEncoderLayer(nn.Module):
94 | def __init__(self,
95 | d_model,
96 | nhead,
97 | dim_feedforward=2048,
98 | dropout=0.1,
99 | attention_dropout=0.1,
100 | activation_dropout=0.1,
101 | activation='relu',
102 | normalize_before=True):
103 | super(TransformerEncoderLayer, self).__init__()
104 | self.normalize_before = normalize_before
105 | self.self_attn = nn.MultiheadAttention(d_model,
106 | nhead,
107 | dropout=attention_dropout)
108 | # Implementation of Feedforward model
109 | self.linear1 = Linear(d_model, dim_feedforward)
110 | self.dropout = nn.Dropout(dropout)
111 | self.linear2 = Linear(dim_feedforward, d_model)
112 |
113 | self.norm1 = nn.LayerNorm(d_model)
114 | self.norm2 = nn.LayerNorm(d_model)
115 | self.activation_dropout = nn.Dropout(activation_dropout)
116 |
117 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation]
118 |
119 | def forward(self, src, src_mask=None, src_key_padding_mask=None):
120 |         r"""Pass the input through the encoder layer.
121 |
122 | Args:
123 |             src: the sequence to the encoder layer (required).
124 | src_mask: the mask for the src sequence (optional).
125 | src_key_padding_mask: the mask for the src keys per batch (optional).
126 | """
127 | # Self attention block
128 | residual = src
129 | if self.normalize_before:
130 | src = self.norm1(src)
131 | src = self.self_attn(src,
132 | src,
133 | src,
134 | attn_mask=src_mask,
135 | key_padding_mask=src_key_padding_mask)[0]
136 | src = residual + self.dropout(src)
137 | if not self.normalize_before:
138 | src = self.norm1(src)
139 | # Feed forward block
140 | residual = src
141 | if self.normalize_before:
142 | src = self.norm2(src)
143 | src = self.activation(self.linear1(src))
144 | src = self.activation_dropout(src)
145 | src = self.linear2(src)
146 | src = residual + self.dropout(src)
147 | if not self.normalize_before:
148 | src = self.norm2(src)
149 | return src
150 |
151 |
152 | class TransformerDecoderLayer(nn.Module):
153 | def __init__(self,
154 | d_model,
155 | nhead,
156 | dim_feedforward=2048,
157 | dropout=0.1,
158 | attention_dropout=0.1,
159 | activation_dropout=0.1,
160 | activation='relu',
161 | normalize_before=True):
162 | super(TransformerDecoderLayer, self).__init__()
163 | self.normalize_before = normalize_before
164 | self.self_attn = nn.MultiheadAttention(d_model,
165 | nhead,
166 | dropout=attention_dropout)
167 | self.multihead_attn = nn.MultiheadAttention(d_model,
168 | nhead,
169 | dropout=dropout)
170 | # Implementation of Feedforward model
171 | self.linear1 = Linear(d_model, dim_feedforward)
172 | self.dropout = nn.Dropout(dropout)
173 | self.linear2 = Linear(dim_feedforward, d_model)
174 |
175 | self.norm1 = nn.LayerNorm(d_model)
176 | self.norm2 = nn.LayerNorm(d_model)
177 | self.norm3 = nn.LayerNorm(d_model)
178 | self.activation_dropout = nn.Dropout(activation_dropout)
179 |
180 | self.activation = {'relu': F.relu, 'gelu': F.gelu}[activation]
181 |
182 | def forward(self,
183 | tgt,
184 | memory,
185 | tgt_mask=None,
186 | memory_mask=None,
187 | tgt_key_padding_mask=None,
188 | memory_key_padding_mask=None):
189 | r"""Pass the inputs (and mask) through the decoder layer.
190 |
191 | Args:
192 | tgt: the sequence to the decoder layer (required).
193 |             memory: the sequence from the last layer of the encoder (required).
194 | tgt_mask: the mask for the tgt sequence (optional).
195 | memory_mask: the mask for the memory sequence (optional).
196 | tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
197 | memory_key_padding_mask: the mask for the memory keys per batch (optional).
198 | """
199 | # self attention block
200 | residual = tgt
201 | if self.normalize_before:
202 | tgt = self.norm1(tgt)
203 | tgt = self.self_attn(tgt,
204 | tgt,
205 | tgt,
206 | attn_mask=tgt_mask,
207 | key_padding_mask=tgt_key_padding_mask)[0]
208 | tgt = residual + self.dropout(tgt)
209 | if not self.normalize_before:
210 | tgt = self.norm1(tgt)
211 | # cross attention block
212 | residual = tgt
213 | if self.normalize_before:
214 | tgt = self.norm2(tgt)
215 | tgt = self.multihead_attn(tgt,
216 | memory,
217 | memory,
218 | attn_mask=memory_mask,
219 | key_padding_mask=memory_key_padding_mask)[0]
220 | tgt = residual + self.dropout(tgt)
221 | if not self.normalize_before:
222 | tgt = self.norm2(tgt)
223 | # feed forward block
224 | residual = tgt
225 | if self.normalize_before:
226 | tgt = self.norm3(tgt)
227 | tgt = self.activation(self.linear1(tgt))
228 | tgt = self.activation_dropout(tgt)
229 | tgt = self.linear2(tgt)
230 | tgt = residual + self.dropout(tgt)
231 | if not self.normalize_before:
232 | tgt = self.norm3(tgt)
233 | return tgt
234 |
235 |
236 | class Transformer(nn.Module):
237 | def __init__(self, *, src_vocab_size, trg_vocab_size, embed_dim, nb_heads,
238 | src_hid_size, src_nb_layers, trg_hid_size, trg_nb_layers,
239 | dropout_p, tie_trg_embed, src_c2i, trg_c2i, attr_c2i,
240 | label_smooth, **kwargs):
241 | '''
242 | init
243 | '''
244 | super().__init__()
245 | self.src_vocab_size = src_vocab_size
246 | self.trg_vocab_size = trg_vocab_size
247 | self.embed_dim = embed_dim
248 | self.embed_scale = math.sqrt(embed_dim)
249 | self.nb_heads = nb_heads
250 | self.src_hid_size = src_hid_size
251 | self.src_nb_layers = src_nb_layers
252 | self.trg_hid_size = trg_hid_size
253 | self.trg_nb_layers = trg_nb_layers
254 | self.dropout_p = dropout_p
255 | self.tie_trg_embed = tie_trg_embed
256 | self.label_smooth = label_smooth
257 | self.src_c2i, self.trg_c2i, self.attr_c2i = src_c2i, trg_c2i, attr_c2i
258 | self.src_embed = Embedding(src_vocab_size,
259 | embed_dim,
260 | padding_idx=PAD_IDX)
261 | self.trg_embed = Embedding(trg_vocab_size,
262 | embed_dim,
263 | padding_idx=PAD_IDX)
264 |         self.position_embed = SinusoidalPositionalEmbedding(embed_dim, dropout=dropout_p)  # second arg of the rewritten class is dropout, not padding_idx
265 | encoder_layer = TransformerEncoderLayer(d_model=embed_dim,
266 | nhead=nb_heads,
267 | dim_feedforward=src_hid_size,
268 | dropout=dropout_p,
269 | attention_dropout=dropout_p,
270 | activation_dropout=dropout_p,
271 | normalize_before=True)
272 | self.encoder = nn.TransformerEncoder(encoder_layer,
273 | num_layers=src_nb_layers,
274 | norm=nn.LayerNorm(embed_dim))
275 | decoder_layer = TransformerDecoderLayer(d_model=embed_dim,
276 | nhead=nb_heads,
277 | dim_feedforward=trg_hid_size,
278 | dropout=dropout_p,
279 | attention_dropout=dropout_p,
280 | activation_dropout=dropout_p,
281 | normalize_before=True)
282 | self.decoder = nn.TransformerDecoder(decoder_layer,
283 | num_layers=trg_nb_layers,
284 | norm=nn.LayerNorm(embed_dim))
285 | self.final_out = Linear(embed_dim, trg_vocab_size)
286 | if tie_trg_embed:
287 | self.final_out.weight = self.trg_embed.weight
288 | self.dropout = nn.Dropout(dropout_p)
289 | # self._reset_parameters()
290 |
291 | def embed(self, src_batch, src_mask):
292 | # ------------
293 | # word_embed = self.embed_scale * self.src_embed(src_batch)
294 | # pos_embed = self.position_embed(src_batch)
295 | # embed = self.dropout(word_embed + pos_embed)
296 | word_embed = self.src_embed(src_batch)
297 | embed = self.position_embed(word_embed)
298 | # ------------
299 | return embed
300 |
301 | def encode(self, src_batch, src_mask):
302 | embed = self.embed(src_batch, src_mask)
303 | return self.encoder(embed, src_key_padding_mask=src_mask)
304 |
305 | def decode(self, enc_hs, src_mask, trg_batch, trg_mask):
306 | # --------------
307 | # word_embed = self.embed_scale * self.trg_embed(trg_batch)
308 | # pos_embed = self.position_embed(trg_batch)
309 | # embed = self.dropout(word_embed + pos_embed)
310 | word_embed = self.trg_embed(trg_batch)
311 | embed = self.position_embed(word_embed)
312 | # --------------
313 |
314 | trg_seq_len = trg_batch.size(0)
315 | causal_mask = self.generate_square_subsequent_mask(trg_seq_len)
316 | dec_hs = self.decoder(embed,
317 | enc_hs,
318 | tgt_mask=causal_mask,
319 | tgt_key_padding_mask=trg_mask,
320 | memory_key_padding_mask=src_mask)
321 | return F.log_softmax(self.final_out(dec_hs), dim=-1)
322 |
323 | def forward(self, src_batch, src_mask, trg_batch, trg_mask):
324 | '''
325 | only for training
326 | '''
327 | src_mask = (src_mask == 0).transpose(0, 1)
328 | trg_mask = (trg_mask == 0).transpose(0, 1)
329 | # trg_seq_len, batch_size = trg_batch.size()
330 | enc_hs = self.encode(src_batch, src_mask)
331 |         # output: [trg_seq_len, batch_size, vocab_size]
332 | output = self.decode(enc_hs, src_mask, trg_batch, trg_mask)
333 | return output
334 |
335 | def count_nb_params(self):
336 | model_parameters = filter(lambda p: p.requires_grad, self.parameters())
337 | params = sum([np.prod(p.size()) for p in model_parameters])
338 | return params
339 |
340 | def loss(self, predict, target):
341 | '''
342 | compute loss
343 | '''
344 | predict = predict.view(-1, self.trg_vocab_size)
345 | # nll_loss = F.nll_loss(predict, target.view(-1), ignore_index=PAD_IDX)
346 | target = target.contiguous().view(-1, 1)
347 | non_pad_mask = target.ne(PAD_IDX)
348 | nll_loss = -predict.gather(dim=-1, index=target)[non_pad_mask].mean()
349 | smooth_loss = -predict.sum(dim=-1, keepdim=True)[non_pad_mask].mean()
350 | smooth_loss = smooth_loss / self.trg_vocab_size
351 | loss = (1. -
352 | self.label_smooth) * nll_loss + self.label_smooth * smooth_loss
353 | return loss
354 |
355 | def get_loss(self, data):
356 | src, src_mask, trg, trg_mask = data
357 | out = self.forward(src, src_mask, trg, trg_mask)
358 | loss = self.loss(out[:-1], trg[1:])
359 | return loss
360 |
361 | def generate_square_subsequent_mask(self, sz):
362 | r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
363 | Unmasked positions are filled with float(0.0).
364 | """
365 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
366 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(
367 | mask == 1, float(0.0))
368 | return mask.to(DEVICE)
369 |
370 |
371 | class TagTransformer(Transformer):
372 | def __init__(self, *, nb_attr, **kwargs):
373 | super().__init__(**kwargs)
374 | self.nb_attr = nb_attr
375 | # 0 -> special token & tags, 1 -> character
376 | self.special_embeddings = Embedding(2, self.embed_dim)
377 |
378 | def embed(self, src_batch, src_mask):
379 | word_embed = self.embed_scale * self.src_embed(src_batch)
380 | char_mask = (src_batch < (self.src_vocab_size - self.nb_attr)).long()
381 | special_embed = self.embed_scale * self.special_embeddings(char_mask)
382 | pos_embed = self.position_embed(src_batch * char_mask)
383 | embed = self.dropout(word_embed + pos_embed + special_embed)
384 | return embed
385 |
386 |
387 | class UniversalTransformerEncoder(nn.Module):
388 | def __init__(self, encoder_layer, num_layers, norm=None):
389 | super(UniversalTransformerEncoder, self).__init__()
390 | self.encoder_layer = encoder_layer
391 | self.num_layers = num_layers
392 | self.norm = norm
393 |
394 | def forward(self, src, mask=None, src_key_padding_mask=None):
395 | output = src
396 |
397 | for i in range(self.num_layers):
398 | output = self.encoder_layer(
399 | output,
400 | src_mask=mask,
401 | src_key_padding_mask=src_key_padding_mask)
402 |
403 | if self.norm:
404 | output = self.norm(output)
405 |
406 | return output
407 |
408 |
409 | class UniversalTransformerDecoder(nn.Module):
410 | def __init__(self, decoder_layer, num_layers, norm=None):
411 | super(UniversalTransformerDecoder, self).__init__()
412 | self.decoder_layer = decoder_layer
413 | self.num_layers = num_layers
414 | self.norm = norm
415 |
416 | def forward(self,
417 | tgt,
418 | memory,
419 | tgt_mask=None,
420 | memory_mask=None,
421 | tgt_key_padding_mask=None,
422 | memory_key_padding_mask=None):
423 | output = tgt
424 |
425 | for i in range(self.num_layers):
426 | output = self.decoder_layer(
427 | output,
428 | memory,
429 | tgt_mask=tgt_mask,
430 | memory_mask=memory_mask,
431 | tgt_key_padding_mask=tgt_key_padding_mask,
432 | memory_key_padding_mask=memory_key_padding_mask)
433 |
434 | if self.norm:
435 | output = self.norm(output)
436 |
437 | return output
438 |
439 |
440 | class UniversalTransformer(Transformer):
441 | def __init__(self, **kwargs):
442 | super().__init__(**kwargs)
443 | encoder_layer = TransformerEncoderLayer(
444 | d_model=self.embed_dim,
445 | nhead=self.nb_heads,
446 | dim_feedforward=self.src_hid_size,
447 | dropout=self.dropout_p,
448 | attention_dropout=self.dropout_p,
449 | activation_dropout=self.dropout_p,
450 | normalize_before=True)
451 | self.encoder = UniversalTransformerEncoder(
452 | encoder_layer,
453 | num_layers=self.src_nb_layers,
454 | norm=nn.LayerNorm(self.embed_dim))
455 | decoder_layer = TransformerDecoderLayer(
456 | d_model=self.embed_dim,
457 | nhead=self.nb_heads,
458 | dim_feedforward=self.trg_hid_size,
459 | dropout=self.dropout_p,
460 | attention_dropout=self.dropout_p,
461 | activation_dropout=self.dropout_p,
462 | normalize_before=True)
463 | self.decoder = UniversalTransformerDecoder(
464 | decoder_layer,
465 | num_layers=self.trg_nb_layers,
466 | norm=nn.LayerNorm(self.embed_dim))
467 |
468 |
469 | class TagUniversalTransformer(TagTransformer, UniversalTransformer):
470 | pass
471 |
472 |
473 | def Embedding(num_embeddings, embedding_dim, padding_idx=None):
474 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
475 | nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
476 | if padding_idx is not None:
477 | nn.init.constant_(m.weight[padding_idx], 0)
478 | return m
479 |
480 |
481 | def Linear(in_features, out_features, bias=True):
482 | m = nn.Linear(in_features, out_features, bias)
483 | nn.init.xavier_uniform_(m.weight)
484 | if bias:
485 | nn.init.constant_(m.bias, 0.)
486 | return m
487 |
--------------------------------------------------------------------------------
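For reference, generate_square_subsequent_mask builds the additive causal mask used in decode(): 0.0 on and below the diagonal and -inf above it, so each decoding position can only attend to itself and earlier positions. A small illustration for a target length of 4, independent of the model classes above (the method itself additionally moves the mask to DEVICE):

import torch

sz = 4
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, 0.0)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])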