├── examples ├── __init__.py ├── ud │ ├── __init__.py │ ├── bert │ │ ├── __init__.py │ │ ├── model.py │ │ ├── dataset.py │ │ └── system_wrapper.py │ ├── rnn │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── system_wrapper.py │ └── __main__.py ├── ner │ ├── __init__.py │ ├── rnn │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── system_wrapper.py │ ├── bert │ │ ├── __init__.py │ │ ├── model.py │ │ ├── dataset.py │ │ └── system_wrapper.py │ ├── utils.py │ └── __main__.py ├── utils │ ├── __init__.py │ ├── text.py │ ├── sequences.py │ ├── loss_wrappers.py │ ├── fasttext_downloader.py │ └── evaluators.py ├── xnli │ ├── __init__.py │ ├── bert │ │ ├── __init__.py │ │ ├── model.py │ │ ├── dataset.py │ │ └── system_wrapper.py │ ├── dam │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── system_wrapper.py │ └── __main__.py └── __main__.py ├── greek-bert-logo.png ├── examples-requirements.txt ├── LICENSE.md ├── create_new_vocabulary.py ├── .gitignore ├── normalize_data.py ├── train_tf_bert.py └── README.md /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/ud/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/ner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/ner/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/ud/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/ud/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/xnli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/ner/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/xnli/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/xnli/dam/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /greek-bert-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpaueb/greek-bert/HEAD/greek-bert-logo.png -------------------------------------------------------------------------------- /examples-requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch==1.4.0 2 | transformers 3 | fasttext 4 | pytorch-wrapper 5 | click 6 | conllu 7 | spacy 8 | pytorch-crf 9 | -------------------------------------------------------------------------------- /examples/utils/text.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | 3 | 4 | def strip_accents_and_lowercase(s): 5 | return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn').lower() 6 | -------------------------------------------------------------------------------- /examples/utils/sequences.py: -------------------------------------------------------------------------------- 1 | def pad_to_max(lst, max_len=None, pad_value=0): 2 | pad = len(max(lst, key=len)) 3 | if max_len is not None: 4 | pad = min(max_len, pad) 5 | 6 | return [i + [pad_value for _ in range(pad - len(i))] if len(i) <= pad else i[:pad] for i in lst] -------------------------------------------------------------------------------- /examples/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from .xnli.__main__ import xnli 4 | from .ud.__main__ import ud 5 | from .ner.__main__ import ner 6 | 7 | 8 | @click.group() 9 | def cli(): 10 | pass 11 | 12 | 13 | cli.add_command(xnli) 14 | cli.add_command(ud) 15 | cli.add_command(ner) 16 | 17 | if __name__ == '__main__': 18 | cli() 19 | -------------------------------------------------------------------------------- /examples/ner/utils.py: -------------------------------------------------------------------------------- 1 | def parse_ner_dataset_file(f): 2 | tokens = [] 3 | for i, l in enumerate(f): 4 | l_split = l.split() 5 | if len(l_split) == 0: 6 | yield tokens 7 | tokens.clear() 8 | continue 9 | if len(l_split) < 2: 10 | continue # todo: fix this 11 | else: 12 | tokens.append({'text': l_split[0], 'label': l_split[-1]}) 13 | if tokens: 14 | yield tokens 15 | -------------------------------------------------------------------------------- /examples/ud/bert/model.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper.functional as pwF 2 | 3 | from torch import nn 4 | 5 | 6 | class UDBERTModel(nn.Module): 7 | 8 | def __init__(self, bert_model, dp): 9 | super(UDBERTModel, self).__init__() 10 | self._bert_model = bert_model 11 | self._dp = nn.Dropout(dp) 12 | self._output_linear = nn.Linear(768, 17) 13 | 14 | def forward(self, text, text_len): 15 | attention_mask = pwF.create_mask_from_length(text_len, text.shape[1]) 16 | return self._output_linear(self._dp(self._bert_model(text, attention_mask=attention_mask)[0])) 17 | -------------------------------------------------------------------------------- /examples/ner/bert/model.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper.functional as pwF 2 | 3 | from torch import nn 4 | 5 | 6 | class NERBERTModel(nn.Module): 7 | 8 | def __init__(self, bert_model, dp): 9 | super(NERBERTModel, self).__init__() 10 | self._bert_model = bert_model 11 | self._dp = nn.Dropout(dp) 12 | self._output_linear = nn.Linear(768, 7) 13 | 14 | def forward(self, text, text_len): 15 | attention_mask = pwF.create_mask_from_length(text_len, text.shape[1]) 16 | return self._output_linear(self._dp(self._bert_model(text, attention_mask=attention_mask)[0])) 17 | 
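The `UDBERTModel` and `NERBERTModel` classes above follow the same pattern: dropout plus a single linear layer over BERT's per-token hidden states (17 UPOS classes for UD, 7 entity labels for NER), with padding handled through an attention mask derived from the true sequence lengths. The snippet below is a minimal, hypothetical usage sketch rather than part of the example scripts; it assumes the published `nlpaueb/bert-base-greek-uncased-v1` checkpoint, the packages pinned in `examples-requirements.txt`, and that it is run from the repository root so that `examples` is importable.

```python
# Hypothetical sketch: driving NERBERTModel directly, outside the example pipeline.
import torch
from transformers import AutoTokenizer, AutoModel

from examples.ner.bert.model import NERBERTModel

tokenizer = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
bert = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
model = NERBERTModel(bert, dp=0.2).eval()

# Two toy sentences (already lower-cased and de-accented), padded to a common length.
ids_a = tokenizer.encode('η αθηνα ειναι πολη', add_special_tokens=True)
ids_b = tokenizer.encode('καλημερα κοσμε', add_special_tokens=True)
max_len = max(len(ids_a), len(ids_b))
padded = [seq + [tokenizer.pad_token_id] * (max_len - len(seq)) for seq in (ids_a, ids_b)]

text = torch.tensor(padded, dtype=torch.long)
text_len = torch.tensor([len(ids_a), len(ids_b)], dtype=torch.int)

with torch.no_grad():
    logits = model(text, text_len)  # shape: (batch, seq_len, 7), one score per NER label

print(logits.shape)
```

In the examples themselves, tokenization, padding and batching are handled by the `dataset.py` and `system_wrapper.py` modules of each task.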
-------------------------------------------------------------------------------- /examples/xnli/bert/model.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper.functional as pwF 2 | 3 | from torch import nn 4 | 5 | 6 | class XNLIBERTModel(nn.Module): 7 | 8 | def __init__(self, bert_model, dp): 9 | super(XNLIBERTModel, self).__init__() 10 | self._bert_model = bert_model 11 | self._dp = nn.Dropout(dp) 12 | self._output_linear = nn.Linear(768, 3) 13 | 14 | def forward(self, text, text_len): 15 | attention_mask = pwF.create_mask_from_length(text_len, text.shape[1]) 16 | return self._output_linear(self._dp(self._bert_model(text, attention_mask=attention_mask)[0][:, 0, :])) 17 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 NLP AUEB Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /create_new_vocabulary.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import random 3 | 4 | import sentencepiece as spm 5 | 6 | MODEL_PREFIX = "sentencepiece_el" 7 | VOC_SIZE = 35000 8 | SENTENCES_SIZE = 3000000 9 | 10 | filenames = glob.glob('/home/chalkidis/greek_corpora/*/*') 11 | 12 | SPM_COMMAND = ('--input={} --model_prefix={} ' 13 | '--input_sentence_size={} ' 14 | '--vocab_size={} ' 15 | '--shuffle_input_sentence=true').format(','.join(filenames), MODEL_PREFIX, SENTENCES_SIZE, VOC_SIZE) 16 | 17 | spm.SentencePieceTrainer.Train(SPM_COMMAND) 18 | 19 | 20 | def read_sentencepiece_vocab(filepath): 21 | voc = [] 22 | with open(filepath, encoding='utf-8') as fi: 23 | for line in fi: 24 | voc.append(line.split("\t")[0]) 25 | return voc 26 | 27 | 28 | snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX)) 29 | print("Learnt vocab size: {}".format(len(snt_vocab))) 30 | print("Sample tokens: {}".format(random.sample(snt_vocab, 10))) 31 | 32 | 33 | def parse_sentencepiece_token(token): 34 | if token.startswith("▁"): 35 | return token[1:] 36 | else: 37 | return "##" + token 38 | 39 | 40 | bert_vocab = list(map(parse_sentencepiece_token, snt_vocab)) 41 | # ctrl_symbols = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] 42 | # bert_vocab = ctrl_symbols + bert_vocab 43 | VOC_FNAME = "vocab_el.txt" 44 | 45 | with open(VOC_FNAME, "w") as fo: 46 | for token in bert_vocab: 47 | fo.write(token + "\n") 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | 113 | #editors 114 | .idea 115 | 116 | #mac 117 | .DS_STORE 118 | 119 | tmp/ 120 | dummy_scripts/ -------------------------------------------------------------------------------- /examples/xnli/bert/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | 4 | from tqdm.auto import tqdm 5 | from torch.utils.data import Dataset 6 | 7 | from ...utils.sequences import pad_to_max 8 | 9 | 10 | class XNLIBERTDataset(Dataset): 11 | L2I = { 12 | 'neutral': 0, 13 | 'contradiction': 1, 14 | 'contradictory': 1, 15 | 'entailment': 2 16 | } 17 | 18 | def __init__(self, file, tokenizer, preprocessing_function): 19 | self.ids = [] 20 | self.texts = [] 21 | self.texts_len = [] 22 | self.targets = [] 23 | 24 | for i, l in enumerate(tqdm(file)): 25 | ex = json.loads(l) 26 | cur_text, cur_len = self.process_example( 27 | ex, 28 | tokenizer, 29 | preprocessing_function 30 | ) 31 | self.texts.append(cur_text) 32 | self.texts_len.append(cur_len) 33 | self.targets.append(self.L2I[ex['label']]) 34 | self.ids.append(i) 35 | 36 | def __getitem__(self, index): 37 | return ( 38 | self.ids[index], 39 | (self.texts[index], self.texts_len[index]), 40 | self.targets[index] 41 | ) 42 | 43 | def __len__(self): 44 | return len(self.ids) 45 | 46 | @staticmethod 47 | def collate_fn(batch, pad_value): 48 | batch_zipped = list(zip(*batch)) 49 | input_zipped = list(zip(*batch_zipped[1])) 50 | 51 | ids = batch_zipped[0] 52 | texts = torch.tensor(pad_to_max(input_zipped[0], pad_value=pad_value), dtype=torch.long) 53 | texts_len = torch.tensor(input_zipped[1], dtype=torch.int) 54 | 55 | target = torch.tensor(batch_zipped[2], dtype=torch.long) 56 | 57 | batch = { 58 | 'id': ids, 59 | 'input': [texts, texts_len], 60 | 'target': target 61 | } 62 | 63 | return batch 64 | 65 | @staticmethod 66 | def process_example(ex, tokenizer, preprocessing_function): 67 | tokens = tokenizer.encode( 68 | preprocessing_function(ex['prem']) if preprocessing_function else ex['prem'], 69 | text_pair=preprocessing_function(ex['hypo']) if preprocessing_function else ex['hypo'], 70 | add_special_tokens=True, 71 | max_length=512 72 | ) 73 | 74 | return tokens, len(tokens) 75 | -------------------------------------------------------------------------------- 
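`XNLIBERTDataset.collate_fn` takes a `pad_value` argument in addition to the batch itself, so it has to be partially applied before it can be handed to a `DataLoader`. The following is a hypothetical wiring sketch, not code taken from the repository: the JSON-lines path and batch size are placeholders, and it assumes the same pinned dependencies as the examples.

```python
# Hypothetical sketch: binding pad_value before passing collate_fn to a DataLoader.
import functools

from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from examples.utils.text import strip_accents_and_lowercase
from examples.xnli.bert.dataset import XNLIBERTDataset

tokenizer = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')

# Placeholder path: one JSON object per line with 'prem', 'hypo' and 'label' fields,
# which is the format XNLIBERTDataset expects.
with open('xnli_el_train.jsonl', encoding='utf-8') as f:
    dataset = XNLIBERTDataset(f, tokenizer, strip_accents_and_lowercase)

loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    # collate_fn has the signature (batch, pad_value); bind the tokenizer's pad id here.
    collate_fn=functools.partial(XNLIBERTDataset.collate_fn, pad_value=tokenizer.pad_token_id)
)

for batch in loader:
    texts, texts_len = batch['input']  # padded token ids and their true lengths
    targets = batch['target']          # 0 = neutral, 1 = contradiction, 2 = entailment
    break
```

The examples' system wrappers presumably perform the equivalent wiring before training with `pytorch-wrapper`.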
/normalize_data.py: -------------------------------------------------------------------------------- 1 | import glob 2 | from multiprocessing import Pool 3 | import unicodedata 4 | 5 | filenames = glob.glob('/home/chalkidis/greek_corpora/common_crawl_shards/*') 6 | filenames += glob.glob('/home/chalkidis/greek_corpora/europarl_shards/*') 7 | filenames += glob.glob('/home/chalkidis/greek_corpora/wikipedia_shards/*') 8 | 9 | def _is_punctuation(char): 10 | """Checks whether `chars` is a punctuation character.""" 11 | cp = ord(char) 12 | # We treat all non-letter/number ASCII as punctuation. 13 | # Characters such as "^", "$", and "`" are not in the Unicode 14 | # Punctuation class but we treat them as punctuation anyways, for 15 | # consistency. 16 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 17 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 18 | return True 19 | cat = unicodedata.category(char) 20 | if cat.startswith("P"): 21 | return True 22 | return False 23 | 24 | 25 | def _run_split_on_punc(text): 26 | """Splits punctuation on a piece of text.""" 27 | chars = list(text) 28 | i = 0 29 | start_new_word = True 30 | output = [] 31 | while i < len(chars): 32 | char = chars[i] 33 | if _is_punctuation(char): 34 | output.append([char]) 35 | start_new_word = True 36 | else: 37 | if start_new_word: 38 | output.append([]) 39 | start_new_word = False 40 | output[-1].append(char) 41 | i += 1 42 | 43 | return ["".join(x) for x in output] 44 | 45 | def strip_accents_and_lowercase(s): 46 | return ''.join(c for c in unicodedata.normalize('NFD', s) 47 | if unicodedata.category(c) != 'Mn').lower() 48 | 49 | def normalize(filename): 50 | output_file = open(filename.replace('greek_corpora', 'greek_corpora_norm'), 'w', encoding='utf8') 51 | with open(filename, encoding='utf8') as file: 52 | for line in file.readlines(): 53 | tokens = line.lower().split() 54 | splited_tokens = [] 55 | for token in tokens: 56 | splited_tokens.extend(_run_split_on_punc(token)) 57 | line = ' '.join(splited_tokens) 58 | line = strip_accents_and_lowercase(line) 59 | if line.endswith('\n'): 60 | output_file.write(line) 61 | else: 62 | output_file.write(line+'\n') 63 | output_file.close() 64 | 65 | 66 | with Pool(processes=10) as pool: 67 | pool.map(normalize, filenames) 68 | -------------------------------------------------------------------------------- /examples/xnli/dam/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import spacy 4 | 5 | from tqdm.auto import tqdm 6 | from torch.utils.data import Dataset 7 | 8 | from ...utils.sequences import pad_to_max 9 | 10 | nlp = spacy.load('el', disable=['parser', 'ner']) 11 | 12 | 13 | class XNLIDAMDataset(Dataset): 14 | L2I = { 15 | 'neutral': 0, 16 | 'contradiction': 1, 17 | 'contradictory': 1, 18 | 'entailment': 2 19 | } 20 | 21 | def __init__(self, file, w2i): 22 | self.ids = [] 23 | self.prems = [] 24 | self.prem_lens = [] 25 | self.hypos = [] 26 | self.hypo_lens = [] 27 | self.targets = [] 28 | 29 | for i, l in enumerate(tqdm(file)): 30 | ex = json.loads(l) 31 | prem, prem_len, hypo, hypo_len = self.process_example(ex, w2i) 32 | self.prems.append(prem) 33 | self.prem_lens.append(prem_len) 34 | self.hypos.append(hypo) 35 | self.hypo_lens.append(hypo_len) 36 | self.targets.append(self.L2I[ex['label']]) 37 | self.ids.append(i) 38 | 39 | def __getitem__(self, index): 40 | return ( 41 | self.ids[index], 42 | (self.prems[index], self.prem_lens[index], self.hypos[index], 
self.hypo_lens[index]), 43 | self.targets[index] 44 | ) 45 | 46 | def __len__(self): 47 | return len(self.ids) 48 | 49 | @staticmethod 50 | def collate_fn(batch): 51 | batch_zipped = list(zip(*batch)) 52 | input_zipped = list(zip(*batch_zipped[1])) 53 | 54 | ids = batch_zipped[0] 55 | prems = torch.tensor(pad_to_max(input_zipped[0]), dtype=torch.long) 56 | prem_lens = torch.tensor(input_zipped[1], dtype=torch.int) 57 | hypos = torch.tensor(pad_to_max(input_zipped[2]), dtype=torch.long) 58 | hypo_lens = torch.tensor(input_zipped[3], dtype=torch.int) 59 | 60 | target = torch.tensor(batch_zipped[2], dtype=torch.long) 61 | 62 | batch = { 63 | 'id': ids, 64 | 'input': [prems, prem_lens, hypos, hypo_lens], 65 | 'target': target 66 | } 67 | 68 | return batch 69 | 70 | @staticmethod 71 | def process_example(ex, w2i): 72 | premise = [w2i[t] for t in XNLIDAMDataset.process_text(ex['prem'])] 73 | hypothesis = [w2i[t] for t in XNLIDAMDataset.process_text(ex['hypo'])] 74 | 75 | return premise, len(premise), hypothesis, len(hypothesis) 76 | 77 | @staticmethod 78 | def process_text(text): 79 | return [t.text for t in nlp(text)] 80 | -------------------------------------------------------------------------------- /examples/utils/loss_wrappers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from pytorch_wrapper.loss_wrappers import AbstractLossWrapper 4 | 5 | 6 | class MaskedTokenLabelingGenericPointWiseLossWrapper(AbstractLossWrapper): 7 | 8 | def __init__(self, loss, batch_input_key='input', model_output_key=None, 9 | batch_target_key='target', batch_mask_key='mask', perform_last_activation=False): 10 | 11 | super(MaskedTokenLabelingGenericPointWiseLossWrapper, self).__init__() 12 | self._loss = loss 13 | self._batch_input_key = batch_input_key 14 | self._batch_mask_key = batch_mask_key 15 | self._model_output_key = model_output_key 16 | self._batch_target_key = batch_target_key 17 | self._perform_last_activation = perform_last_activation 18 | 19 | def calculate_loss(self, output, batch, training_context, last_activation=None): 20 | 21 | if self._model_output_key is not None: 22 | output = output[self._model_output_key] 23 | 24 | if last_activation is not None and self._perform_last_activation: 25 | output = last_activation(output) 26 | 27 | mask = batch[self._batch_mask_key].to(output.device) 28 | 29 | output_extra_dims = output.dim() - mask.dim() 30 | output_mask_new_shape = list(mask.shape) + [1] * output_extra_dims 31 | output_extra_dims_shape = list(output.shape)[mask.dim():] 32 | output = torch.masked_select(output, mask.view(*output_mask_new_shape)) 33 | output = output.view(-1, *output_extra_dims_shape) 34 | 35 | target = batch[self._batch_target_key].to(output.device) 36 | target_extra_dims = target.dim() - mask.dim() 37 | target_mask_new_shape = list(mask.shape) + [1] * target_extra_dims 38 | target_extra_dims_shape = list(target.shape)[mask.dim():] 39 | target = torch.masked_select(target, mask.view(*target_mask_new_shape)) 40 | target = target.view(-1, *target_extra_dims_shape) 41 | 42 | return self._loss(output, target) 43 | 44 | 45 | class PassThroughLossWrapper(AbstractLossWrapper): 46 | """ 47 | Dummy adapter that returns the loss as returned by the model. Useful when the loss is calculated inside the model's 48 | forward method. 49 | """ 50 | 51 | def __init__(self, model_loss_key=None): 52 | """ 53 | :param model_loss_key: Key where the dict returned by the model contains the calculated loss. 
Leave None if the 54 | model returns only the loss. 55 | """ 56 | super(PassThroughLossWrapper, self).__init__() 57 | self._model_loss_key = model_loss_key 58 | 59 | def calculate_loss(self, output, batch, training_context, last_activation=None): 60 | """ 61 | Calculates the loss for a single batch. 62 | :param output: Output of the model. 63 | :param batch: Dict that contains all information needed by the loss wrapper. 64 | :param training_context: Dict containing information regarding the training process. 65 | :param last_activation: Last activation provided to the System. 66 | :return: Output of the loss function/module. 67 | """ 68 | if self._model_loss_key is None: 69 | return output 70 | else: 71 | return output[self._model_loss_key] 72 | 73 | def to(self, device): 74 | pass 75 | -------------------------------------------------------------------------------- /examples/ner/bert/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data import Dataset 4 | 5 | from ...utils.sequences import pad_to_max 6 | from ..utils import parse_ner_dataset_file 7 | 8 | 9 | class NERBERTDataset(Dataset): 10 | I2L = [ 11 | 'B-LOC', 12 | 'B-ORG', 13 | 'B-PER', 14 | 'I-LOC', 15 | 'I-ORG', 16 | 'I-PER', 17 | 'O' 18 | ] 19 | L2I = {k: i for i, k in enumerate(I2L)} 20 | 21 | def __init__(self, dataset_file, tokenizer, bert_like_special_tokens, preprocessing_function): 22 | 23 | self.ids = [] 24 | self.texts = [] 25 | self.text_lens = [] 26 | self.pred_masks = [] 27 | self.targets = [] 28 | 29 | for i, tokenlist in enumerate(parse_ner_dataset_file(dataset_file)): 30 | cur_texts, cur_text_lens, pred_mask, labels = self.process_example( 31 | tokenlist, 32 | tokenizer, 33 | bert_like_special_tokens, 34 | preprocessing_function 35 | ) 36 | self.texts.append(cur_texts) 37 | self.text_lens.append(cur_text_lens) 38 | self.pred_masks.append(pred_mask) 39 | self.targets.append([self.L2I.get(cur_l, -1) for cur_l in labels]) 40 | self.ids.append(i) 41 | 42 | def __getitem__(self, index): 43 | return ( 44 | self.ids[index], 45 | (self.texts[index], self.text_lens[index]), 46 | self.targets[index], 47 | self.pred_masks[index] 48 | ) 49 | 50 | def __len__(self): 51 | return len(self.ids) 52 | 53 | @staticmethod 54 | def collate_fn(batch, pad_value): 55 | batch_zipped = list(zip(*batch)) 56 | input_zipped = list(zip(*batch_zipped[1])) 57 | 58 | ids = batch_zipped[0] 59 | texts = torch.tensor(pad_to_max(input_zipped[0], pad_value=pad_value), dtype=torch.long) 60 | text_lens = torch.tensor(input_zipped[1], dtype=torch.int) 61 | target = torch.tensor(pad_to_max(batch_zipped[2], pad_value=-1), dtype=torch.long) 62 | pred_mask = torch.tensor(pad_to_max(batch_zipped[3]), dtype=torch.bool) 63 | 64 | batch = { 65 | 'id': ids, 66 | 'input': [texts, text_lens], 67 | 'target': target, 68 | 'mask': pred_mask 69 | } 70 | 71 | return batch 72 | 73 | @staticmethod 74 | def process_example(tokens, tokenizer, bert_like_special_tokens, preprocessing_function): 75 | transformer_tokens = [tokenizer.cls_token_id] if bert_like_special_tokens else [tokenizer.bos_token_id] 76 | pred_mask = [0] 77 | labels = ['PAD'] 78 | for token in tokens: 79 | processed_token = preprocessing_function(token['text']) if preprocessing_function else token['text'] 80 | current_tokens = tokenizer.encode(processed_token, add_special_tokens=False) 81 | if len(current_tokens) == 0: 82 | current_tokens = [tokenizer.unk_token_id] 83 | transformer_tokens.extend(current_tokens) 84 | 
labels.extend([token['label']] + ['PAD'] * (len(current_tokens) - 1)) 85 | pred_mask.extend([1] + [0] * (len(current_tokens) - 1)) 86 | transformer_tokens.append(tokenizer.sep_token_id if bert_like_special_tokens else tokenizer.eos_token_id) 87 | pred_mask.append(0) 88 | labels.append('PAD') 89 | 90 | return transformer_tokens, len(transformer_tokens), pred_mask, labels 91 | -------------------------------------------------------------------------------- /examples/ud/bert/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data import Dataset 4 | from conllu import parse_incr 5 | 6 | from ...utils.sequences import pad_to_max 7 | 8 | 9 | class UDBERTDataset(Dataset): 10 | I2L = [ 11 | 'ADJ', 12 | 'ADP', 13 | 'ADV', 14 | 'AUX', 15 | 'CCONJ', 16 | 'DET', 17 | 'NOUN', 18 | 'NUM', 19 | 'PART', 20 | 'PRON', 21 | 'PROPN', 22 | 'PUNCT', 23 | 'SCONJ', 24 | 'SYM', 25 | 'VERB', 26 | 'X', 27 | '_' 28 | ] 29 | L2I = {k: i for i, k in enumerate(I2L)} 30 | 31 | def __init__(self, dataset_file, tokenizer, bert_like_special_tokens, preprocessing_function): 32 | 33 | self.ids = [] 34 | self.texts = [] 35 | self.text_lens = [] 36 | self.pred_masks = [] 37 | self.targets = [] 38 | 39 | for i, tokenlist in enumerate(parse_incr(dataset_file)): 40 | cur_texts, cur_text_lens, pred_mask, labels = self.process_example( 41 | tokenlist, 42 | tokenizer, 43 | bert_like_special_tokens, 44 | preprocessing_function 45 | ) 46 | self.texts.append(cur_texts) 47 | self.text_lens.append(cur_text_lens) 48 | self.pred_masks.append(pred_mask) 49 | self.targets.append([self.L2I.get(cur_l, -1) for cur_l in labels]) 50 | self.ids.append(i) 51 | 52 | def __getitem__(self, index): 53 | return ( 54 | self.ids[index], 55 | (self.texts[index], self.text_lens[index]), 56 | self.targets[index], 57 | self.pred_masks[index] 58 | ) 59 | 60 | def __len__(self): 61 | return len(self.ids) 62 | 63 | @staticmethod 64 | def collate_fn(batch, pad_value): 65 | batch_zipped = list(zip(*batch)) 66 | input_zipped = list(zip(*batch_zipped[1])) 67 | 68 | ids = batch_zipped[0] 69 | texts = torch.tensor(pad_to_max(input_zipped[0], pad_value=pad_value), dtype=torch.long) 70 | text_lens = torch.tensor(input_zipped[1], dtype=torch.int) 71 | target = torch.tensor(pad_to_max(batch_zipped[2], pad_value=-1), dtype=torch.long) 72 | pred_mask = torch.tensor(pad_to_max(batch_zipped[3]), dtype=torch.bool) 73 | 74 | batch = { 75 | 'id': ids, 76 | 'input': [texts, text_lens], 77 | 'target': target, 78 | 'mask': pred_mask 79 | } 80 | 81 | return batch 82 | 83 | @staticmethod 84 | def process_example(tokens, tokenizer, bert_like_special_tokens, preprocessing_function): 85 | transformer_tokens = [tokenizer.cls_token_id] if bert_like_special_tokens else [tokenizer.bos_token_id] 86 | pred_mask = [0] 87 | labels = ['PAD'] 88 | for token in tokens: 89 | processed_token = preprocessing_function(token['form']) if preprocessing_function else token['form'] 90 | current_tokens = tokenizer.encode(processed_token, add_special_tokens=False) 91 | if len(current_tokens) == 0: 92 | current_tokens = [tokenizer.unk_token_id] 93 | transformer_tokens.extend(current_tokens) 94 | labels.extend([token['upostag']] + ['PAD'] * (len(current_tokens) - 1)) 95 | pred_mask.extend([1] + [0] * (len(current_tokens) - 1)) 96 | transformer_tokens.append(tokenizer.sep_token_id if bert_like_special_tokens else tokenizer.eos_token_id) 97 | pred_mask.append(0) 98 | labels.append('PAD') 99 | 100 | return transformer_tokens, 
len(transformer_tokens), pred_mask, labels 101 | -------------------------------------------------------------------------------- /train_tf_bert.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import time 5 | import tensorflow as tf 6 | sys.path.append("bert") 7 | from tensorflow.contrib.cluster_resolver import TPUClusterResolver 8 | from bert import modeling 9 | from bert.run_pretraining import input_fn_builder, model_fn_builder 10 | 11 | # configure logging 12 | log = logging.getLogger('tensorflow') 13 | log.setLevel(logging.INFO) 14 | 15 | # create formatter and add it to the handlers 16 | formatter = logging.Formatter('%(asctime)s : %(message)s') 17 | sh = logging.StreamHandler() 18 | sh.setLevel(logging.INFO) 19 | sh.setFormatter(formatter) 20 | log.handlers = [sh] 21 | log.info("Using TPU runtime") 22 | USE_TPU = True 23 | tpu_cluster_resolver = TPUClusterResolver(tpu='greek-bert', zone='us-central1-a') 24 | 25 | # SETUP FOLDERS 26 | with tf.Session(tpu_cluster_resolver.get_master()) as session: 27 | print(tpu_cluster_resolver.get_master()) 28 | HOME_PATH = "gs://greek_bert" # @param {type:"string"} 29 | MODEL_DIR = "greek_bert" # @param {type:"string"} 30 | PRETRAINING_DIR = "greek_tfrecords" # @param {type:"string"} 31 | VOC_FNAME = "vocab.txt" # @param {type:"string"} 32 | 33 | # Input data pipeline config 34 | TRAIN_BATCH_SIZE = 256 # @param {type:"integer"} 35 | MAX_PREDICTIONS =75 # @param {type:"integer"} 36 | MAX_SEQ_LENGTH = 512 # @param {type:"integer"} 37 | MASKED_LM_PROB = 0.15 # @param 38 | 39 | # Training procedure config 40 | EVAL_BATCH_SIZE = 256 41 | LEARNING_RATE = 1e-4 42 | TRAIN_STEPS = 1000000 # @param {type:"integer"} 43 | EVAL_STEPS = 10000 44 | SAVE_CHECKPOINTS_STEPS = 50000 # @param {type:"integer"} 45 | NUM_TPU_CORES = 8 46 | BERT_GCS_DIR = "{}/{}".format(HOME_PATH, MODEL_DIR) 47 | DATA_GCS_DIR = "{}/{}".format(HOME_PATH, PRETRAINING_DIR) 48 | VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME) 49 | CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json") 50 | INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR) 51 | bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE) 52 | input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR, '*')) 53 | log.info("Using checkpoint: {}".format(INIT_CHECKPOINT)) 54 | log.info("Using {} data shards".format(len(input_files))) 55 | time.sleep(10) 56 | 57 | # BUILD TPU ESTIMATOR 58 | model_fn = model_fn_builder( 59 | bert_config=bert_config, 60 | init_checkpoint=INIT_CHECKPOINT, 61 | learning_rate=LEARNING_RATE, 62 | num_train_steps=TRAIN_STEPS, 63 | num_warmup_steps=10000, 64 | use_tpu=USE_TPU, 65 | use_one_hot_embeddings=True) 66 | run_config = tf.contrib.tpu.RunConfig( 67 | cluster=tpu_cluster_resolver, 68 | model_dir=BERT_GCS_DIR, 69 | save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS, 70 | tpu_config=tf.contrib.tpu.TPUConfig( 71 | iterations_per_loop=SAVE_CHECKPOINTS_STEPS, 72 | num_shards=NUM_TPU_CORES, 73 | per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)) 74 | estimator = tf.contrib.tpu.TPUEstimator( 75 | use_tpu=USE_TPU, 76 | model_fn=model_fn, 77 | config=run_config, 78 | train_batch_size=TRAIN_BATCH_SIZE, 79 | eval_batch_size=EVAL_BATCH_SIZE) 80 | train_input_fn = input_fn_builder( 81 | input_files=input_files, 82 | max_seq_length=MAX_SEQ_LENGTH, 83 | max_predictions_per_seq=MAX_PREDICTIONS, 84 | is_training=True) 85 | estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS) 86 | 
tf.logging.info("***** Running evaluation *****") 87 | tf.logging.info(" Batch size = %d", EVAL_BATCH_SIZE) 88 | eval_input_fn = input_fn_builder( 89 | input_files=input_files, 90 | max_seq_length=MAX_SEQ_LENGTH, 91 | max_predictions_per_seq=MAX_PREDICTIONS, 92 | is_training=False) 93 | result = estimator.evaluate( 94 | input_fn=eval_input_fn, steps=EVAL_STEPS) 95 | tf.logging.info("***** Eval results *****") 96 | for key in sorted(result.keys()): 97 | tf.logging.info(" %s = %s", key, str(result[key])) 98 | -------------------------------------------------------------------------------- /examples/ud/rnn/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import itertools 3 | 4 | from torch.utils.data import Dataset 5 | from conllu import parse_incr 6 | 7 | from ...utils.sequences import pad_to_max 8 | 9 | 10 | class UDRNNDataset(Dataset): 11 | I2L = [ 12 | 'ADJ', 13 | 'ADP', 14 | 'ADV', 15 | 'AUX', 16 | 'CCONJ', 17 | 'DET', 18 | 'NOUN', 19 | 'NUM', 20 | 'PART', 21 | 'PRON', 22 | 'PROPN', 23 | 'PUNCT', 24 | 'SCONJ', 25 | 'SYM', 26 | 'VERB', 27 | 'X', 28 | '_' 29 | ] 30 | L2I = {k: i for i, k in enumerate(I2L)} 31 | 32 | def __init__(self, dataset_file, w2i, c2i): 33 | self.ids = [] 34 | self.processed_tokens = [] 35 | self.processed_tokens_len = [] 36 | self.char_words = [] 37 | self.char_word_lens = [] 38 | self.targets = [] 39 | 40 | for i, tokenlist in enumerate(parse_incr(dataset_file)): 41 | cur_words, cur_words_len, cur_char_words, cur_char_word_lens, cur_targets = \ 42 | self.process_example(tokenlist, w2i, c2i) 43 | 44 | self.processed_tokens.append(cur_words) 45 | self.processed_tokens_len.append(cur_words_len) 46 | self.char_words.append(cur_char_words) 47 | self.char_word_lens.append(cur_char_word_lens) 48 | self.targets.append([self.L2I[t] for t in cur_targets]) 49 | self.ids.append(i) 50 | 51 | def __getitem__(self, index): 52 | return ( 53 | self.ids[index], 54 | ( 55 | self.char_words[index], 56 | self.char_word_lens[index], 57 | self.processed_tokens[index], 58 | self.processed_tokens_len[index], 59 | ), 60 | self.targets[index] 61 | ) 62 | 63 | def __len__(self): 64 | return len(self.ids) 65 | 66 | @staticmethod 67 | def collate_fn(batch): 68 | batch_zipped = list(zip(*batch)) 69 | input_zipped = list(zip(*batch_zipped[1])) 70 | 71 | ids = batch_zipped[0] 72 | 73 | batched_char_words = torch.tensor( 74 | pad_to_max(list(itertools.chain.from_iterable(input_zipped[0]))), 75 | dtype=torch.long 76 | ) 77 | batched_char_words_len = torch.tensor(list(itertools.chain.from_iterable(input_zipped[1])), dtype=torch.int) 78 | 79 | nbs_accumulated = list(itertools.accumulate([1] + list(input_zipped[3]))) 80 | indices = [list(range(nbs_accumulated[i], nbs_accumulated[i + 1])) for i in range(len(nbs_accumulated) - 1)] 81 | batched_char_word_index = torch.tensor(pad_to_max(indices), dtype=torch.long) 82 | 83 | batched_tokens = torch.tensor(pad_to_max(input_zipped[2]), dtype=torch.long) 84 | batched_tokens_len = torch.tensor(input_zipped[3], dtype=torch.int) 85 | 86 | target = torch.tensor(pad_to_max(batch_zipped[2], pad_value=-1), dtype=torch.long) 87 | 88 | return { 89 | 'id': ids, 90 | 'input': [ 91 | batched_char_words, 92 | batched_char_words_len, 93 | batched_char_word_index, 94 | batched_tokens, 95 | batched_tokens_len, 96 | target 97 | ], 98 | 'target': target 99 | } 100 | 101 | @staticmethod 102 | def process_example(tokens, w2i, c2i): 103 | 104 | processed_tokens = [] 105 | char_words = [] 106 | char_word_lens = [] 107 
| targets = [] 108 | 109 | for token in tokens: 110 | processed_tokens.append(w2i[token['form'].lower()]) 111 | char_words.append( 112 | [c2i['']] + 113 | [c2i.get(c, 1) for c in list(token['form'])] + 114 | [c2i['']] 115 | ) 116 | char_word_lens.append(len(char_words[-1])) 117 | targets.append(token['upostag']) 118 | 119 | return processed_tokens, len(processed_tokens), char_words, char_word_lens, targets 120 | -------------------------------------------------------------------------------- /examples/ner/rnn/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import itertools 3 | import pytorch_wrapper.functional as pwF 4 | 5 | from torch.utils.data import Dataset 6 | 7 | from ...utils.sequences import pad_to_max 8 | from ..utils import parse_ner_dataset_file 9 | 10 | 11 | class NERRNNDataset(Dataset): 12 | I2L = [ 13 | 'B-LOC', 14 | 'B-ORG', 15 | 'B-PER', 16 | 'I-LOC', 17 | 'I-ORG', 18 | 'I-PER', 19 | 'O' 20 | ] 21 | L2I = {k: i for i, k in enumerate(I2L)} 22 | 23 | def __init__(self, dataset_file, w2i, c2i): 24 | self.ids = [] 25 | self.processed_tokens = [] 26 | self.processed_tokens_len = [] 27 | self.char_words = [] 28 | self.char_word_lens = [] 29 | self.targets = [] 30 | 31 | for i, tokenlist in enumerate(parse_ner_dataset_file(dataset_file)): 32 | cur_words, cur_words_len, cur_char_words, cur_char_word_lens, cur_targets = \ 33 | self.process_example(tokenlist, w2i, c2i) 34 | 35 | self.processed_tokens.append(cur_words) 36 | self.processed_tokens_len.append(cur_words_len) 37 | self.char_words.append(cur_char_words) 38 | self.char_word_lens.append(cur_char_word_lens) 39 | self.targets.append([self.L2I[t] for t in cur_targets]) 40 | self.ids.append(i) 41 | 42 | def __getitem__(self, index): 43 | return ( 44 | self.ids[index], 45 | ( 46 | self.char_words[index], 47 | self.char_word_lens[index], 48 | self.processed_tokens[index], 49 | self.processed_tokens_len[index], 50 | ), 51 | self.targets[index] 52 | ) 53 | 54 | def __len__(self): 55 | return len(self.ids) 56 | 57 | @staticmethod 58 | def collate_fn(batch): 59 | batch_zipped = list(zip(*batch)) 60 | input_zipped = list(zip(*batch_zipped[1])) 61 | 62 | ids = batch_zipped[0] 63 | 64 | batched_char_words = torch.tensor( 65 | pad_to_max(list(itertools.chain.from_iterable(input_zipped[0]))), 66 | dtype=torch.long 67 | ) 68 | batched_char_words_len = torch.tensor(list(itertools.chain.from_iterable(input_zipped[1])), dtype=torch.int) 69 | 70 | nbs_accumulated = list(itertools.accumulate([1] + list(input_zipped[3]))) 71 | indices = [list(range(nbs_accumulated[i], nbs_accumulated[i + 1])) for i in range(len(nbs_accumulated) - 1)] 72 | batched_char_word_index = torch.tensor(pad_to_max(indices), dtype=torch.long) 73 | 74 | batched_tokens = torch.tensor(pad_to_max(input_zipped[2]), dtype=torch.long) 75 | batched_tokens_len = torch.tensor(input_zipped[3], dtype=torch.int) 76 | 77 | with torch.no_grad(): 78 | pred_mask = pwF.create_mask_from_length(batched_tokens_len, torch.max(batched_tokens_len).item()) 79 | 80 | target = torch.tensor(pad_to_max(batch_zipped[2], pad_value=-1), dtype=torch.long) 81 | 82 | return { 83 | 'id': ids, 84 | 'input': [ 85 | batched_char_words, 86 | batched_char_words_len, 87 | batched_char_word_index, 88 | batched_tokens, 89 | batched_tokens_len, 90 | target 91 | ], 92 | 'target': target, 93 | 'mask': pred_mask 94 | } 95 | 96 | @staticmethod 97 | def process_example(tokens, w2i, c2i): 98 | 99 | processed_tokens = [] 100 | char_words = [] 101 | 
char_word_lens = [] 102 | targets = [] 103 | 104 | for token in tokens: 105 | processed_tokens.append(w2i[token['text'].lower()]) 106 | char_words.append( 107 | [c2i['']] + 108 | [c2i.get(c, 1) for c in list(token['text'])] + 109 | [c2i['']] 110 | ) 111 | char_word_lens.append(len(char_words[-1])) 112 | targets.append(token['label']) 113 | 114 | return processed_tokens, len(processed_tokens), char_words, char_word_lens, targets 115 | -------------------------------------------------------------------------------- /examples/xnli/dam/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import pytorch_wrapper as pw 4 | import pytorch_wrapper.functional as pwF 5 | 6 | from torch import nn 7 | 8 | 9 | class XNLIDAMModel(nn.Module): 10 | 11 | def __init__(self, 12 | embeddings, 13 | mlp_num_layers=1, 14 | mlp_hidden_size=200, 15 | mlp_activation=nn.ReLU, 16 | mlp_dp=0.2): 17 | super(XNLIDAMModel, self).__init__() 18 | self._embedding_layer = pw.modules.EmbeddingLayer(embeddings.shape[0], embeddings.shape[1], False, 0) 19 | self._embedding_layer.load_embeddings(embeddings) 20 | 21 | self._linear_projection = nn.Linear(embeddings.shape[1], mlp_hidden_size, bias=False) 22 | 23 | self._att_mlp = pw.modules.MLP( 24 | input_size=mlp_hidden_size, 25 | num_hidden_layers=mlp_num_layers, 26 | hidden_layer_size=mlp_hidden_size, 27 | hidden_activation=mlp_activation, 28 | hidden_dp=mlp_dp, 29 | output_size=mlp_hidden_size, 30 | output_activation=mlp_activation, 31 | hidden_layer_init=lambda x, y: torch.nn.init.xavier_uniform_(x), 32 | output_layer_init=lambda x, y: torch.nn.init.xavier_uniform_(x) 33 | ) 34 | 35 | self._comp_mlp = pw.modules.MLP( 36 | input_size=2 * mlp_hidden_size, 37 | num_hidden_layers=mlp_num_layers, 38 | hidden_layer_size=mlp_hidden_size, 39 | hidden_activation=mlp_activation, 40 | hidden_dp=mlp_dp, 41 | output_size=mlp_hidden_size, 42 | output_activation=mlp_activation, 43 | hidden_layer_init=lambda x, y: torch.nn.init.xavier_uniform_(x), 44 | output_layer_init=lambda x, y: torch.nn.init.xavier_uniform_(x) 45 | ) 46 | 47 | self._out_mlp = pw.modules.MLP( 48 | input_size=2 * mlp_hidden_size, 49 | num_hidden_layers=mlp_num_layers, 50 | hidden_layer_size=mlp_hidden_size, 51 | hidden_activation=mlp_activation, 52 | hidden_dp=mlp_dp, 53 | output_size=3, 54 | output_activation=None, 55 | hidden_layer_init=lambda x, y: torch.nn.init.xavier_uniform_(x), 56 | output_layer_init=lambda x, y: torch.nn.init.xavier_uniform_(x) 57 | ) 58 | 59 | def forward(self, prems_indexes, prem_lens, hypos_indexes, hypo_lens): 60 | prems = self._embedding_layer(prems_indexes) 61 | prems = prems / (prems.norm(dim=-1, keepdim=True) + 1e-6) 62 | prems = self._linear_projection(prems) 63 | prems_mask = pwF.create_mask_from_length(prem_lens, prems.shape[1]) 64 | prems_att_vectors = self._att_mlp(prems) 65 | 66 | hypos = self._embedding_layer(hypos_indexes) 67 | hypos = hypos / (hypos.norm(dim=-1, keepdim=True) + 1e-6) 68 | hypos = self._linear_projection(hypos) 69 | hypos_mask = pwF.create_mask_from_length(hypo_lens, hypos.shape[1]) 70 | hypos_att_vectors = self._att_mlp(hypos) 71 | 72 | scores = torch.matmul(prems_att_vectors, hypos_att_vectors.transpose(1, 2)) 73 | scores = scores.masked_fill(prems_mask.unsqueeze(2) == 0, -1e9) 74 | scores = scores.masked_fill(hypos_mask.unsqueeze(1) == 0, -1e9) 75 | 76 | horizontal_softmaxed = F.softmax(scores, dim=2) 77 | vertical_softmaxed = F.softmax(scores, dim=1) 78 | 79 | 
hypos_attended = torch.matmul(horizontal_softmaxed, hypos) 80 | prems_hypos_attended = torch.cat([prems, hypos_attended], dim=-1) 81 | prems_hypos_attended_compared = self._comp_mlp(prems_hypos_attended) 82 | prems_hypos_attended_compared = prems_hypos_attended_compared.masked_fill(prems_mask.unsqueeze(2) == 0, 0) 83 | prems_hypos_attended_aggregated = torch.sum(prems_hypos_attended_compared, dim=1) 84 | 85 | prems_attended = torch.matmul(vertical_softmaxed.transpose(1, 2), prems) 86 | hypos_prems_attended = torch.cat([hypos, prems_attended], dim=-1) 87 | hypos_prems_attended_compared = self._comp_mlp(hypos_prems_attended) 88 | hypos_prems_attended_compared = hypos_prems_attended_compared.masked_fill(hypos_mask.unsqueeze(2) == 0, 0) 89 | hypos_prems_attended_aggregated = torch.sum(hypos_prems_attended_compared, dim=1) 90 | 91 | encodings = torch.cat([prems_hypos_attended_aggregated, hypos_prems_attended_aggregated], dim=-1) 92 | 93 | return self._out_mlp(encodings) 94 | -------------------------------------------------------------------------------- /examples/ud/rnn/model.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import pytorch_wrapper.functional as pwF 3 | import torch 4 | 5 | from torch import nn 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | from torchcrf import CRF 8 | 9 | from ...utils.sequences import pad_to_max 10 | 11 | 12 | class UDRNNModel(nn.Module): 13 | 14 | def __init__(self, 15 | char_embeddings_shape, 16 | embeddings, 17 | char_cnn_kernel_heights=(3,), 18 | char_cnn_out_channels=30, 19 | rnn_class=nn.GRU, 20 | rnn_hidden_size=128, 21 | rnn_num_layers=2, 22 | rnn_dp=0.2, 23 | rnn_bidirectional=True, 24 | mlp_num_layers=1, 25 | mlp_hidden_size=128, 26 | mlp_activation=nn.ReLU, 27 | mlp_dp=0.2): 28 | super(UDRNNModel, self).__init__() 29 | 30 | self._char_embedding_layer = pw.modules.EmbeddingLayer( 31 | char_embeddings_shape[0], 32 | char_embeddings_shape[1], 33 | True, 34 | 0 35 | ) 36 | 37 | self._char_token_encoder = pw.modules.sequence_basic_cnn_encoder.SequenceBasicCNNEncoder( 38 | char_embeddings_shape[1], 39 | kernel_heights=char_cnn_kernel_heights, 40 | out_channels=char_cnn_out_channels 41 | ) 42 | 43 | self._embedding_layer = pw.modules.EmbeddingLayer(embeddings.shape[0], embeddings.shape[1], False, 0) 44 | self._embedding_layer.load_embeddings(embeddings) 45 | 46 | self._embedding_dp = nn.Dropout(rnn_dp) 47 | 48 | self._rnn = rnn_class( 49 | input_size=embeddings.shape[1] + char_cnn_out_channels * len(char_cnn_kernel_heights), 50 | hidden_size=rnn_hidden_size, 51 | num_layers=rnn_num_layers, 52 | dropout=rnn_dp, 53 | bidirectional=rnn_bidirectional, 54 | batch_first=True 55 | ) 56 | 57 | self._rnn_top_layer_dp = nn.Dropout(rnn_dp) 58 | 59 | self._out_mlp = pw.modules.MLP( 60 | input_size=rnn_hidden_size * (2 if rnn_bidirectional else 1), 61 | num_hidden_layers=mlp_num_layers, 62 | hidden_layer_size=mlp_hidden_size, 63 | hidden_activation=mlp_activation, 64 | hidden_dp=mlp_dp, 65 | output_size=17, 66 | output_activation=None 67 | ) 68 | 69 | self._crf = CRF(17, batch_first=True) 70 | 71 | def forward(self, 72 | batched_char_words, 73 | batched_char_words_len, 74 | batched_char_word_index, 75 | batched_tokens, 76 | batched_tokens_len, 77 | target=None 78 | ): 79 | 80 | char_tokens = self._char_embedding_layer(batched_char_words) 81 | token_encodings = self._char_token_encoder(char_tokens) 82 | 83 | token_encodings_z = torch.zeros((1, 
token_encodings.shape[1]), device=token_encodings.device) 84 | token_encodings = torch.cat([token_encodings_z, token_encodings], dim=0) 85 | 86 | token_encodings_indexed = torch.index_select(token_encodings, dim=0, index=batched_char_word_index.view(-1)) 87 | token_encodings_indexed = token_encodings_indexed.view( 88 | batched_char_word_index.shape[0], 89 | batched_char_word_index.shape[1], 90 | -1 91 | ) 92 | 93 | texts = self._embedding_layer(batched_tokens) 94 | texts = torch.cat([texts, token_encodings_indexed], -1) 95 | texts = self._embedding_dp(texts) 96 | 97 | texts = pack_padded_sequence(texts, batched_tokens_len, batch_first=True, enforce_sorted=False) 98 | texts = self._rnn(texts)[0] 99 | texts = pad_packed_sequence(texts, batch_first=True)[0] 100 | texts = self._rnn_top_layer_dp(texts) 101 | mlp_out = self._out_mlp(texts) 102 | 103 | mask = pwF.create_mask_from_length(batched_tokens_len, mlp_out.shape[1]) 104 | 105 | if self.training: 106 | return -self._crf(mlp_out, target, mask=mask, reduction='token_mean') 107 | else: 108 | predictions = self._crf.decode(mlp_out, mask) 109 | predictions = torch.tensor(pad_to_max(predictions), dtype=torch.long).to(mlp_out.device) 110 | one_hot_pred = torch.eye(17).to(mlp_out.device)[[predictions]] 111 | return one_hot_pred 112 | -------------------------------------------------------------------------------- /examples/ner/rnn/model.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import pytorch_wrapper.functional as pwF 3 | import torch 4 | 5 | from torch import nn 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | from torchcrf import CRF 8 | 9 | from ...utils.sequences import pad_to_max 10 | 11 | 12 | class NERRNNModel(nn.Module): 13 | 14 | def __init__(self, 15 | char_embeddings_shape, 16 | embeddings, 17 | char_cnn_kernel_heights=(3,), 18 | char_cnn_out_channels=30, 19 | rnn_class=nn.GRU, 20 | rnn_hidden_size=128, 21 | rnn_num_layers=2, 22 | rnn_dp=0.2, 23 | rnn_bidirectional=True, 24 | mlp_num_layers=1, 25 | mlp_hidden_size=128, 26 | mlp_activation=nn.ReLU, 27 | mlp_dp=0.2): 28 | super(NERRNNModel, self).__init__() 29 | 30 | self._char_embedding_layer = pw.modules.EmbeddingLayer( 31 | char_embeddings_shape[0], 32 | char_embeddings_shape[1], 33 | True, 34 | 0 35 | ) 36 | 37 | self._char_token_encoder = pw.modules.sequence_basic_cnn_encoder.SequenceBasicCNNEncoder( 38 | char_embeddings_shape[1], 39 | kernel_heights=char_cnn_kernel_heights, 40 | out_channels=char_cnn_out_channels 41 | ) 42 | 43 | self._embedding_layer = pw.modules.EmbeddingLayer(embeddings.shape[0], embeddings.shape[1], False, 0) 44 | self._embedding_layer.load_embeddings(embeddings) 45 | 46 | self._embedding_dp = nn.Dropout(rnn_dp) 47 | 48 | self._rnn = rnn_class( 49 | input_size=embeddings.shape[1] + char_cnn_out_channels * len(char_cnn_kernel_heights), 50 | hidden_size=rnn_hidden_size, 51 | num_layers=rnn_num_layers, 52 | dropout=rnn_dp, 53 | bidirectional=rnn_bidirectional, 54 | batch_first=True 55 | ) 56 | 57 | self._rnn_top_layer_dp = nn.Dropout(rnn_dp) 58 | 59 | self._out_mlp = pw.modules.MLP( 60 | input_size=rnn_hidden_size * (2 if rnn_bidirectional else 1), 61 | num_hidden_layers=mlp_num_layers, 62 | hidden_layer_size=mlp_hidden_size, 63 | hidden_activation=mlp_activation, 64 | hidden_dp=mlp_dp, 65 | output_size=7,  # 7 NER labels (see NERRNNDataset.I2L), not the 17 UPOS tags of the UD model 66 | output_activation=None 67 | ) 68 | 69 | self._crf = CRF(7, batch_first=True) 70 | 71 | def forward(self, 72 | batched_char_words,
batched_char_words_len, 74 | batched_char_word_index, 75 | batched_tokens, 76 | batched_tokens_len, 77 | target=None 78 | ): 79 | 80 | char_tokens = self._char_embedding_layer(batched_char_words) 81 | token_encodings = self._char_token_encoder(char_tokens) 82 | 83 | token_encodings_z = torch.zeros((1, token_encodings.shape[1]), device=token_encodings.device) 84 | token_encodings = torch.cat([token_encodings_z, token_encodings], dim=0) 85 | 86 | token_encodings_indexed = torch.index_select(token_encodings, dim=0, index=batched_char_word_index.view(-1)) 87 | token_encodings_indexed = token_encodings_indexed.view( 88 | batched_char_word_index.shape[0], 89 | batched_char_word_index.shape[1], 90 | -1 91 | ) 92 | 93 | texts = self._embedding_layer(batched_tokens) 94 | texts = torch.cat([texts, token_encodings_indexed], -1) 95 | texts = self._embedding_dp(texts) 96 | 97 | texts = pack_padded_sequence(texts, batched_tokens_len, batch_first=True, enforce_sorted=False) 98 | texts = self._rnn(texts)[0] 99 | texts = pad_packed_sequence(texts, batch_first=True)[0] 100 | texts = self._rnn_top_layer_dp(texts) 101 | mlp_out = self._out_mlp(texts) 102 | 103 | mask = pwF.create_mask_from_length(batched_tokens_len, mlp_out.shape[1]) 104 | 105 | if self.training: 106 | return -self._crf(mlp_out, target, mask=mask, reduction='token_mean') 107 | else: 108 | predictions = self._crf.decode(mlp_out, mask) 109 | predictions = torch.tensor(pad_to_max(predictions), dtype=torch.long).to(mlp_out.device) 110 | one_hot_pred = torch.eye(7).to(mlp_out.device)[[predictions]] 111 | return one_hot_pred 112 | -------------------------------------------------------------------------------- /examples/utils/fasttext_downloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the MIT license found in https://github.com/facebookresearch/fastText/LICENSE 5 | 6 | import sys 7 | import shutil 8 | import os 9 | import gzip 10 | 11 | from urllib.request import urlopen 12 | 13 | valid_lang_ids = {"af", "sq", "als", "am", "ar", "an", "hy", "as", "ast", 14 | "az", "ba", "eu", "bar", "be", "bn", "bh", "bpy", "bs", 15 | "br", "bg", "my", "ca", "ceb", "bcl", "ce", "zh", "cv", 16 | "co", "hr", "cs", "da", "dv", "nl", "pa", "arz", "eml", 17 | "en", "myv", "eo", "et", "hif", "fi", "fr", "gl", "ka", 18 | "de", "gom", "el", "gu", "ht", "he", "mrj", "hi", "hu", 19 | "is", "io", "ilo", "id", "ia", "ga", "it", "ja", "jv", 20 | "kn", "pam", "kk", "km", "ky", "ko", "ku", "ckb", "la", 21 | "lv", "li", "lt", "lmo", "nds", "lb", "mk", "mai", "mg", 22 | "ms", "ml", "mt", "gv", "mr", "mzn", "mhr", "min", "xmf", 23 | "mwl", "mn", "nah", "nap", "ne", "new", "frr", "nso", 24 | "no", "nn", "oc", "or", "os", "pfl", "ps", "fa", "pms", 25 | "pl", "pt", "qu", "ro", "rm", "ru", "sah", "sa", "sc", 26 | "sco", "gd", "sr", "sh", "scn", "sd", "si", "sk", "sl", 27 | "so", "azb", "es", "su", "sw", "sv", "tl", "tg", "ta", 28 | "tt", "te", "th", "bo", "tr", "tk", "uk", "hsb", "ur", 29 | "ug", "uz", "vec", "vi", "vo", "wa", "war", "cy", "vls", 30 | "fy", "pnb", "yi", "yo", "diq", "zea"} 31 | 32 | 33 | def _print_progress(downloaded_bytes, total_size): 34 | percent = float(downloaded_bytes) / total_size 35 | bar_size = 50 36 | bar = int(percent * bar_size) 37 | percent = round(percent * 100, 2) 38 | sys.stdout.write(" (%0.2f%%) [" % percent) 39 | sys.stdout.write("=" * bar) 40 | sys.stdout.write(">") 41 | sys.stdout.write(" " * (bar_size - bar)) 42 | sys.stdout.write("]\r") 43 | sys.stdout.flush() 44 | 45 | if downloaded_bytes >= total_size: 46 | sys.stdout.write('\n') 47 | 48 | 49 | def _download_file(url, out_folder, write_file_name, chunk_size=2 ** 13): 50 | print("Downloading %s" % url) 51 | 52 | response = urlopen(url) 53 | 54 | if hasattr(response, 'getheader'): 55 | file_size = int(response.getheader('Content-Length').strip()) 56 | else: 57 | file_size = int(response.info().getheader('Content-Length').strip()) 58 | downloaded = 0 59 | download_file_name = write_file_name + ".part" 60 | with open(f'{out_folder}/{download_file_name}', 'wb') as f: 61 | while True: 62 | chunk = response.read(chunk_size) 63 | downloaded += len(chunk) 64 | if not chunk: 65 | break 66 | f.write(chunk) 67 | _print_progress(downloaded, file_size) 68 | 69 | os.rename(f'{out_folder}/{download_file_name}', f'{out_folder}/{write_file_name}') 70 | 71 | 72 | def _download_gz_model(out_folder, gz_file_name, if_exists): 73 | if os.path.isfile(f'{out_folder}/{gz_file_name}'): 74 | if if_exists == 'ignore': 75 | return True 76 | elif if_exists == 'strict': 77 | print("gzip File exists. Use --overwrite to download anyway.") 78 | return False 79 | elif if_exists == 'overwrite': 80 | pass 81 | 82 | url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/%s" % gz_file_name 83 | _download_file(url, out_folder, gz_file_name) 84 | 85 | return True 86 | 87 | 88 | def download_model(lang_id, out_folder='.', if_exists='strict'): 89 | """ 90 | Download pre-trained common-crawl vectors from fastText's website 91 | https://fasttext.cc/docs/en/crawl-vectors.html 92 | """ 93 | if lang_id not in valid_lang_ids: 94 | raise Exception("Invalid lang id. 
Please select among %s" % 95 | repr(valid_lang_ids)) 96 | 97 | os.makedirs(out_folder, exist_ok=True) 98 | 99 | file_name = f"cc.{lang_id}.300.bin" 100 | gz_file_name = f"{file_name}.gz" 101 | 102 | if os.path.isfile(f'{out_folder}/{file_name}'): 103 | if if_exists == 'ignore': 104 | return file_name 105 | elif if_exists == 'strict': 106 | print("File exists. Use --overwrite to download anyway.") 107 | return 108 | elif if_exists == 'overwrite': 109 | pass 110 | 111 | if _download_gz_model(out_folder, gz_file_name, if_exists): 112 | with gzip.open(f'{out_folder}/{gz_file_name}', 'rb') as f: 113 | with open(f'{out_folder}/{file_name}', 'wb') as f_out: 114 | shutil.copyfileobj(f, f_out) 115 | 116 | return file_name 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GreekBERT 2 | 3 | A Greek edition of Google's BERT pre-trained language model. 4 | 5 | 6 | 7 | 8 | ## Pre-training corpora 9 | 10 | The pre-training corpora of `bert-base-greek-uncased-v1` include: 11 | 12 | * The Greek part of [Wikipedia](https://el.wikipedia.org/wiki/Βικιπαίδεια:Αντίγραφα_της_βάσης_δεδομένων), 13 | * The Greek part of [European Parliament Proceedings Parallel Corpus](https://www.statmt.org/europarl/), and 14 | * The Greek part of [OSCAR](https://traces1.inria.fr/oscar/), a cleansed version of [Common Crawl](https://commoncrawl.org). 15 | 16 | Future releases will also include: 17 | 18 | * The entire corpus of Greek legislation, as published by the [National Publication Office](http://www.et.gr), 19 | * The entire corpus of EU legislation (Greek translation), as published in [Eur-Lex](https://eur-lex.europa.eu/homepage.html?locale=en). 20 | 21 | ## Pre-training details 22 | 23 | * We trained BERT using the official code provided in Google BERT's GitHub repository (https://github.com/google-research/bert). 24 | * We released a model similar to the English `bert-base-uncased` model (12-layer, 768-hidden, 12-heads, 110M parameters). 25 | * We chose to follow the same training set-up: 1 million training steps with batches of 256 sequences of length 512 and an initial learning rate of 1e-4. 26 | * We were able to use a single Google Cloud TPU v3-8, provided for free by the [TensorFlow Research Cloud (TFRC)](https://www.tensorflow.org/tfrc) program, while also utilizing [GCP research credits](https://edu.google.com/programs/credits/research). Huge thanks to both Google programs for supporting us! 27 | 28 | 29 | ## Requirements 30 | 31 | We published `bert-base-greek-uncased-v1` as part of [Hugging Face](https://huggingface.co)'s [Transformers](https://github.com/huggingface/transformers) repository, so you need to install the Transformers library through pip, along with PyTorch or TensorFlow 2. The `unicodedata` module used for pre-processing below is part of the Python standard library and needs no separate installation. 32 | 33 | ``` 34 | pip install transformers 35 | pip install (torch|tensorflow) 36 | 37 | ``` 38 | 39 | ## Pre-process text (Deaccent - Lower) 40 | 41 | In order to use `bert-base-greek-uncased-v1`, you have to pre-process texts by lowercasing them and removing all Greek diacritics. 42 | 43 | ```python 44 | 45 | import unicodedata 46 | 47 | def strip_accents_and_lowercase(s): 48 | return ''.join(c for c in unicodedata.normalize('NFD', s) 49 | if unicodedata.category(c) != 'Mn').lower() 50 | 51 | accented_string = "Αυτή είναι η Ελληνική έκδοση του BERT."
52 | unaccented_string = strip_accents_and_lowercase(accented_string) 53 | 54 | print(unaccented_string) # αυτη ειναι η ελληνικη εκδοση του bert. 55 | 56 | ``` 57 | 58 | ## Load Pretrained Model 59 | 60 | ```python 61 | from transformers import AutoTokenizer, AutoModel 62 | 63 | tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1") 64 | model = AutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1") 65 | ``` 66 | 67 | ## Use Pretrained Model as a Language Model 68 | 69 | ```python 70 | import torch 71 | from transformers import AutoTokenizer, AutoModelWithLMHead 72 | 73 | # Load model and tokenizer 74 | tokenizer_greek = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1') 75 | lm_model_greek = AutoModelWithLMHead.from_pretrained('nlpaueb/bert-base-greek-uncased-v1') 76 | 77 | # ================ EXAMPLE 1 ================ 78 | text_1 = 'O ποιητής έγραψε ένα [MASK] .' 79 | # EN: 'The poet wrote a [MASK].' 80 | input_ids = tokenizer_greek.encode(text_1) 81 | print(tokenizer_greek.convert_ids_to_tokens(input_ids)) 82 | # ['[CLS]', 'o', 'ποιητης', 'εγραψε', 'ενα', '[MASK]', '.', '[SEP]'] 83 | outputs = lm_model_greek(torch.tensor([input_ids]))[0] 84 | print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 5].max(0)[1].item())) 85 | # the most plausible prediction for [MASK] is "song" 86 | 87 | # ================ EXAMPLE 2 ================ 88 | text_2 = 'Είναι ένας [MASK] άνθρωπος.' 89 | # EN: 'He is a [MASK] person.' 90 | input_ids = tokenizer_greek.encode(text_2) 91 | print(tokenizer_greek.convert_ids_to_tokens(input_ids)) 92 | # ['[CLS]', 'ειναι', 'ενας', '[MASK]', 'ανθρωπος', '.', '[SEP]'] 93 | outputs = lm_model_greek(torch.tensor([input_ids]))[0] 94 | print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 3].max(0)[1].item())) 95 | # the most plausible prediction for [MASK] is "good" 96 | 97 | # ================ EXAMPLE 3 ================ 98 | text_3 = 'Είναι ένας [MASK] άνθρωπος και κάνει συχνά [MASK].' 99 | # EN: 'He is a [MASK] person and he frequently does [MASK].' 100 | input_ids = tokenizer_greek.encode(text_3) 101 | print(tokenizer_greek.convert_ids_to_tokens(input_ids)) 102 | # ['[CLS]', 'ειναι', 'ενας', '[MASK]', 'ανθρωπος', 'και', 'κανει', 'συχνα', '[MASK]', '.', '[SEP]'] 103 | outputs = lm_model_greek(torch.tensor([input_ids]))[0] 104 | print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 8].max(0)[1].item())) 105 | # the most plausible prediction for the second [MASK] is "trips" 106 | ``` 107 | 108 | ## Evaluation on downstream tasks 109 | 110 | TBA 111 | 112 | ## Author 113 | 114 | Ilias Chalkidis on behalf of [AUEB's Natural Language Processing Group](http://nlp.cs.aueb.gr) 115 | 116 | | GitHub: [@ilias.chalkidis](https://github.com/seolhokim) | Twitter: [@KiddoThe2B](https://twitter.com/KiddoThe2B) | 117 | 118 | ## About Us 119 | 120 | [AUEB's Natural Language Processing Group](http://nlp.cs.aueb.gr) develops algorithms, models, and systems that allow computers to process and generate natural language texts.
121 | 122 | The group's current research interests include: 123 | * question answering systems for databases, ontologies, document collections, and the Web, especially biomedical question answering, 124 | * natural language generation from databases and ontologies, especially Semantic Web ontologies, 125 | * text classification, including filtering spam and abusive content, 126 | * information extraction and opinion mining, including legal text analytics and sentiment analysis, 127 | * natural language processing tools for Greek, for example parsers and named-entity recognizers, 128 | * machine learning in natural language processing, especially deep learning. 129 | 130 | The group is part of the Information Processing Laboratory of the Department of Informatics of the Athens University of Economics and Business. 131 | -------------------------------------------------------------------------------- /examples/xnli/dam/system_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import torch 3 | import os 4 | import uuid 5 | 6 | from torch import nn 7 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 8 | from torch.optim import AdamW 9 | from itertools import product 10 | 11 | from .model import XNLIDAMModel 12 | from .dataset import XNLIDAMDataset 13 | 14 | 15 | class XNLIDAMSystemWrapper: 16 | 17 | def __init__(self, embeddings, w2i, model_params): 18 | 19 | self._w2i = w2i 20 | model = XNLIDAMModel(embeddings, **model_params) 21 | if torch.cuda.is_available(): 22 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cuda')) 23 | else: 24 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cpu')) 25 | 26 | def train(self, 27 | train_dataset_file, 28 | val_dataset_file, 29 | lr, 30 | batch_size, 31 | grad_accumulation_steps, 32 | run_on_multi_gpus, 33 | verbose=True, 34 | seed=0): 35 | torch.manual_seed(seed) 36 | train_dataset = XNLIDAMDataset(train_dataset_file, self._w2i) 37 | val_dataset = XNLIDAMDataset(val_dataset_file, self._w2i) 38 | self._train_impl( 39 | train_dataset, 40 | val_dataset, 41 | lr, 42 | batch_size, 43 | grad_accumulation_steps, 44 | run_on_multi_gpus, 45 | verbose 46 | ) 47 | 48 | def _train_impl(self, 49 | train_dataset, 50 | val_dataset, 51 | lr, 52 | batch_size, 53 | grad_accumulation_steps, 54 | run_on_multi_gpus, 55 | verbose=True): 56 | 57 | train_dataloader = DataLoader( 58 | train_dataset, 59 | sampler=RandomSampler(train_dataset), 60 | batch_size=batch_size, 61 | collate_fn=XNLIDAMDataset.collate_fn 62 | ) 63 | 64 | val_dataloader = DataLoader( 65 | val_dataset, 66 | sampler=SequentialSampler(val_dataset), 67 | batch_size=batch_size, 68 | collate_fn=XNLIDAMDataset.collate_fn 69 | ) 70 | 71 | loss_wrapper = pw.loss_wrappers.GenericPointWiseLossWrapper(nn.CrossEntropyLoss()) 72 | optimizer = AdamW(self._system.model.parameters(), lr=lr) 73 | 74 | base_es_path = f'/tmp/{uuid.uuid4().hex[:30]}/' 75 | os.makedirs(base_es_path, exist_ok=True) 76 | 77 | train_method = self._system.train_on_multi_gpus if run_on_multi_gpus else self._system.train 78 | 79 | _ = train_method( 80 | loss_wrapper, 81 | optimizer, 82 | train_data_loader=train_dataloader, 83 | evaluation_data_loaders={'val': val_dataloader}, 84 | evaluators={'macro-f1': pw.evaluators.MultiClassF1Evaluator(average='macro')}, 85 | gradient_accumulation_steps=grad_accumulation_steps, 86 | callbacks=[ 87 |
pw.training_callbacks.EarlyStoppingCriterionCallback( 88 | patience=3, 89 | evaluation_data_loader_key='val', 90 | evaluator_key='macro-f1', 91 | tmp_best_state_filepath=f'{base_es_path}/temp.es.weights' 92 | ) 93 | ], 94 | verbose=verbose 95 | ) 96 | 97 | def evaluate(self, eval_dataset_file, batch_size, run_on_multi_gpus, verbose=True): 98 | eval_dataset = XNLIDAMDataset(eval_dataset_file, self._w2i) 99 | return self._evaluate_impl(eval_dataset, batch_size, run_on_multi_gpus, verbose) 100 | 101 | def save_model_state(self, path): 102 | self._system.save_model_state(path) 103 | 104 | def _evaluate_impl(self, eval_dataset, batch_size, run_on_multi_gpus, verbose=True): 105 | 106 | eval_dataloader = DataLoader( 107 | eval_dataset, 108 | sampler=SequentialSampler(eval_dataset), 109 | batch_size=batch_size, 110 | collate_fn=XNLIDAMDataset.collate_fn 111 | ) 112 | 113 | evaluators = { 114 | 115 | 'acc': pw.evaluators.MultiClassAccuracyEvaluator(), 116 | 'macro-prec': pw.evaluators.MultiClassPrecisionEvaluator(average='macro'), 117 | 'macro-rec': pw.evaluators.MultiClassRecallEvaluator(average='macro'), 118 | 'macro-f1': pw.evaluators.MultiClassF1Evaluator(average='macro'), 119 | 'micro-prec': pw.evaluators.MultiClassPrecisionEvaluator(average='micro'), 120 | 'micro-rec': pw.evaluators.MultiClassRecallEvaluator(average='micro'), 121 | 'micro-f1': pw.evaluators.MultiClassF1Evaluator(average='micro') 122 | } 123 | 124 | if run_on_multi_gpus: 125 | return self._system.evaluate_on_multi_gpus(eval_dataloader, evaluators, verbose=verbose) 126 | else: 127 | return self._system.evaluate(eval_dataloader, evaluators, verbose=verbose) 128 | 129 | @staticmethod 130 | def tune(embeddings, w2i, train_dataset_file, val_dataset_file, run_on_multi_gpus): 131 | lrs = [0.01, 0.001, 0.0001] 132 | batch_size = [16, 32, 64] 133 | mlp_dp = [0, 0.1, 0.2, 0.3] 134 | params = list(product(lrs, batch_size, mlp_dp)) 135 | grad_accumulation_steps = 1 136 | 137 | train_dataset = XNLIDAMDataset(train_dataset_file, w2i) 138 | val_dataset = XNLIDAMDataset(val_dataset_file, w2i) 139 | 140 | results = [] 141 | for i, (lr, bs, dp) in enumerate(params): 142 | print(f'{i + 1}/{len(params)}') 143 | torch.manual_seed(0) 144 | current_system_wrapper = XNLIDAMSystemWrapper( 145 | embeddings, 146 | w2i, 147 | { 148 | 'mlp_dp': dp 149 | } 150 | ) 151 | current_system_wrapper._train_impl( 152 | train_dataset, 153 | val_dataset, 154 | lr, 155 | bs, 156 | grad_accumulation_steps, 157 | run_on_multi_gpus 158 | ) 159 | 160 | current_results = current_system_wrapper._evaluate_impl(val_dataset, bs, run_on_multi_gpus) 161 | results.append([current_results['macro-f1'].score, (lr, bs, dp)]) 162 | 163 | return results 164 | -------------------------------------------------------------------------------- /examples/ner/rnn/system_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import torch 3 | import os 4 | import uuid 5 | 6 | from torch import nn 7 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 8 | from torch.optim import AdamW 9 | from itertools import product 10 | 11 | from ...utils.loss_wrappers import PassThroughLossWrapper 12 | from ...utils import evaluators 13 | from .model import NERRNNModel 14 | from .dataset import NERRNNDataset 15 | 16 | 17 | class NERRNNSystemWrapper: 18 | 19 | def __init__(self, embeddings, w2i, c2i, model_params): 20 | 21 | self._w2i = w2i 22 | self._c2i = c2i 23 | model = NERRNNModel(embeddings=embeddings, 
**model_params) 24 | if torch.cuda.is_available(): 25 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cuda')) 26 | else: 27 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cpu')) 28 | 29 | def train(self, 30 | train_dataset_file, 31 | val_dataset_file, 32 | lr, 33 | batch_size, 34 | grad_accumulation_steps, 35 | run_on_multi_gpus, 36 | verbose=True, 37 | seed=0): 38 | torch.manual_seed(seed) 39 | train_dataset = NERRNNDataset(train_dataset_file, self._w2i, self._c2i) 40 | val_dataset = NERRNNDataset(val_dataset_file, self._w2i, self._c2i) 41 | self._train_impl( 42 | train_dataset, 43 | val_dataset, 44 | lr, 45 | batch_size, 46 | grad_accumulation_steps, 47 | run_on_multi_gpus, 48 | verbose 49 | ) 50 | 51 | def _train_impl(self, 52 | train_dataset, 53 | val_dataset, 54 | lr, 55 | batch_size, 56 | grad_accumulation_steps, 57 | run_on_multi_gpus, 58 | verbose=True): 59 | 60 | train_dataloader = DataLoader( 61 | train_dataset, 62 | sampler=RandomSampler(train_dataset), 63 | batch_size=batch_size, 64 | collate_fn=NERRNNDataset.collate_fn 65 | ) 66 | 67 | val_dataloader = DataLoader( 68 | val_dataset, 69 | sampler=SequentialSampler(val_dataset), 70 | batch_size=batch_size, 71 | collate_fn=NERRNNDataset.collate_fn 72 | ) 73 | 74 | loss_wrapper = PassThroughLossWrapper() 75 | optimizer = AdamW(self._system.model.parameters(), lr=lr) 76 | 77 | base_es_path = f'/tmp/{uuid.uuid4().hex[:30]}/' 78 | os.makedirs(base_es_path, exist_ok=True) 79 | 80 | train_method = self._system.train_on_multi_gpus if run_on_multi_gpus else self._system.train 81 | 82 | _ = train_method( 83 | loss_wrapper, 84 | optimizer, 85 | train_data_loader=train_dataloader, 86 | evaluation_data_loaders={'val': val_dataloader}, 87 | evaluators={ 88 | 'macro-f1': evaluators.MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator(train_dataset.I2L) 89 | }, 90 | gradient_accumulation_steps=grad_accumulation_steps, 91 | callbacks=[ 92 | pw.training_callbacks.EarlyStoppingCriterionCallback( 93 | patience=3, 94 | evaluation_data_loader_key='val', 95 | evaluator_key='macro-f1', 96 | tmp_best_state_filepath=f'{base_es_path}/temp.es.weights' 97 | ) 98 | ], 99 | verbose=verbose 100 | ) 101 | 102 | def evaluate(self, eval_dataset_file, batch_size, run_on_multi_gpus, verbose=True): 103 | eval_dataset = NERRNNDataset(eval_dataset_file, self._w2i, self._c2i) 104 | return self._evaluate_impl(eval_dataset, batch_size, run_on_multi_gpus, verbose) 105 | 106 | def _evaluate_impl(self, eval_dataset, batch_size, run_on_multi_gpus, verbose=True): 107 | 108 | eval_dataloader = DataLoader( 109 | eval_dataset, 110 | sampler=SequentialSampler(eval_dataset), 111 | batch_size=batch_size, 112 | collate_fn=NERRNNDataset.collate_fn 113 | ) 114 | 115 | evals = { 116 | 'macro-prec': evaluators.MultiClassPrecisionEvaluatorMaskedTokenEntityLabelingEvaluator(eval_dataset.I2L), 117 | 'macro-rec': evaluators.MultiClassRecallEvaluatorMaskedTokenEntityLabelingEvaluator(eval_dataset.I2L), 118 | 'macro-f1': evaluators.MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator(eval_dataset.I2L), 119 | 'micro-prec': evaluators.MultiClassPrecisionEvaluatorMaskedTokenEntityLabelingEvaluator( 120 | eval_dataset.I2L, 121 | average='micro' 122 | ), 123 | 'micro-rec': evaluators.MultiClassRecallEvaluatorMaskedTokenEntityLabelingEvaluator( 124 | eval_dataset.I2L, 125 | average='micro' 126 | ), 127 | 'micro-f1': evaluators.MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator( 128 | eval_dataset.I2L, 
129 | average='micro' 130 | ) 131 | } 132 | 133 | if run_on_multi_gpus: 134 | return self._system.evaluate_on_multi_gpus(eval_dataloader, evals, verbose=verbose) 135 | else: 136 | return self._system.evaluate(eval_dataloader, evals, verbose=verbose) 137 | 138 | @staticmethod 139 | def tune(embeddings, w2i, c2i, train_dataset_file, val_dataset_file, run_on_multi_gpus): 140 | lrs = [0.01, 0.001] 141 | batch_size = [16, 32, 64] 142 | dp = [0, 0.1, 0.2, 0.3] 143 | hs = [100, 200, 300] 144 | params = list(product(lrs, dp, batch_size, hs)) 145 | grad_accumulation_steps = 1 146 | char_embedding_size = 30 147 | 148 | train_dataset = NERRNNDataset(train_dataset_file, w2i, c2i) 149 | val_dataset = NERRNNDataset(val_dataset_file, w2i, c2i) 150 | 151 | results = [] 152 | for i, (lr, dp, batch_size, hs) in enumerate(params): 153 | print(f'{i + 1}/{len(params)}') 154 | torch.manual_seed(0) 155 | current_system_wrapper = NERRNNSystemWrapper( 156 | embeddings, 157 | w2i, 158 | c2i, 159 | { 160 | 'rnn_dp': dp, 161 | 'mlp_dp': dp, 162 | 'rnn_hidden_size': hs, 163 | 'char_embeddings_shape': (len(c2i), char_embedding_size) 164 | } 165 | ) 166 | current_system_wrapper._train_impl( 167 | train_dataset, 168 | val_dataset, 169 | lr, 170 | batch_size, 171 | grad_accumulation_steps, 172 | run_on_multi_gpus 173 | ) 174 | 175 | current_results = current_system_wrapper._evaluate_impl(val_dataset, batch_size, run_on_multi_gpus) 176 | results.append([current_results['macro-f1'].score, (lr, dp, batch_size, hs)]) 177 | 178 | return results 179 | -------------------------------------------------------------------------------- /examples/xnli/bert/system_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import torch 3 | import os 4 | import uuid 5 | 6 | from torch import nn 7 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 8 | from itertools import product 9 | from transformers import AutoTokenizer, AutoModel, AdamW 10 | from functools import partial 11 | 12 | from .model import XNLIBERTModel 13 | from .dataset import XNLIBERTDataset 14 | 15 | 16 | class XNLIBERTSystemWrapper: 17 | 18 | def __init__(self, pretrained_bert_name, model_params): 19 | 20 | self._pretrained_bert_name = pretrained_bert_name 21 | bert_model = AutoModel.from_pretrained(pretrained_bert_name) 22 | model = XNLIBERTModel(bert_model, **model_params) 23 | 24 | if torch.cuda.is_available(): 25 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cuda')) 26 | else: 27 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cpu')) 28 | 29 | def train(self, 30 | train_dataset_file, 31 | val_dataset_file, 32 | lr, 33 | batch_size, 34 | grad_accumulation_steps, 35 | run_on_multi_gpus, 36 | preprocessing_function, 37 | verbose=True, 38 | seed=0): 39 | torch.manual_seed(seed) 40 | tokenizer = AutoTokenizer.from_pretrained(self._pretrained_bert_name) 41 | train_dataset = XNLIBERTDataset(train_dataset_file, tokenizer, preprocessing_function) 42 | val_dataset = XNLIBERTDataset(val_dataset_file, tokenizer, preprocessing_function) 43 | self._train_impl( 44 | train_dataset, 45 | val_dataset, 46 | lr, 47 | batch_size, 48 | grad_accumulation_steps, 49 | run_on_multi_gpus, 50 | tokenizer.pad_token_id, 51 | verbose 52 | ) 53 | 54 | def _train_impl(self, 55 | train_dataset, 56 | val_dataset, 57 | lr, 58 | batch_size, 59 | grad_accumulation_steps, 60 | run_on_multi_gpus, 61 | pad_value, 62 | 
verbose=True): 63 | 64 | train_dataloader = DataLoader( 65 | train_dataset, 66 | sampler=RandomSampler(train_dataset), 67 | batch_size=batch_size, 68 | collate_fn=partial(XNLIBERTDataset.collate_fn, pad_value=pad_value) 69 | ) 70 | 71 | val_dataloader = DataLoader( 72 | val_dataset, 73 | sampler=SequentialSampler(val_dataset), 74 | batch_size=batch_size, 75 | collate_fn=partial(XNLIBERTDataset.collate_fn, pad_value=pad_value) 76 | ) 77 | 78 | loss_wrapper = pw.loss_wrappers.GenericPointWiseLossWrapper(nn.CrossEntropyLoss()) 79 | optimizer = AdamW(self._system.model.parameters(), lr=lr) 80 | 81 | base_es_path = f'/tmp/{uuid.uuid4().hex[:30]}/' 82 | os.makedirs(base_es_path, exist_ok=True) 83 | 84 | train_method = self._system.train_on_multi_gpus if run_on_multi_gpus else self._system.train 85 | 86 | _ = train_method( 87 | loss_wrapper, 88 | optimizer, 89 | train_data_loader=train_dataloader, 90 | evaluation_data_loaders={'val': val_dataloader}, 91 | evaluators={'macro-f1': pw.evaluators.MultiClassF1Evaluator(average='macro')}, 92 | gradient_accumulation_steps=grad_accumulation_steps, 93 | callbacks=[ 94 | pw.training_callbacks.EarlyStoppingCriterionCallback( 95 | patience=3, 96 | evaluation_data_loader_key='val', 97 | evaluator_key='macro-f1', 98 | tmp_best_state_filepath=f'{base_es_path}/temp.es.weights' 99 | ) 100 | ], 101 | verbose=verbose 102 | ) 103 | 104 | def evaluate(self, eval_dataset_file, batch_size, run_on_multi_gpus, preprocessing_function, verbose=True): 105 | tokenizer = AutoTokenizer.from_pretrained(self._pretrained_bert_name) 106 | eval_dataset = XNLIBERTDataset(eval_dataset_file, tokenizer, preprocessing_function) 107 | return self._evaluate_impl(eval_dataset, batch_size, run_on_multi_gpus, tokenizer.pad_token_id, verbose) 108 | 109 | def _evaluate_impl(self, eval_dataset, batch_size, run_on_multi_gpus, pad_value, verbose=True): 110 | 111 | eval_dataloader = DataLoader( 112 | eval_dataset, 113 | sampler=SequentialSampler(eval_dataset), 114 | batch_size=batch_size, 115 | collate_fn=partial(XNLIBERTDataset.collate_fn, pad_value=pad_value) 116 | ) 117 | 118 | evaluators = { 119 | 120 | 'acc': pw.evaluators.MultiClassAccuracyEvaluator(), 121 | 'macro-prec': pw.evaluators.MultiClassPrecisionEvaluator(average='macro'), 122 | 'macro-rec': pw.evaluators.MultiClassRecallEvaluator(average='macro'), 123 | 'macro-f1': pw.evaluators.MultiClassF1Evaluator(average='macro'), 124 | 'micro-prec': pw.evaluators.MultiClassPrecisionEvaluator(average='micro'), 125 | 'micro-rec': pw.evaluators.MultiClassRecallEvaluator(average='micro'), 126 | 'micro-f1': pw.evaluators.MultiClassF1Evaluator(average='micro') 127 | } 128 | 129 | if run_on_multi_gpus: 130 | return self._system.evaluate_on_multi_gpus(eval_dataloader, evaluators, verbose=verbose) 131 | else: 132 | return self._system.evaluate(eval_dataloader, evaluators, verbose=verbose) 133 | 134 | def save_model_state(self, path): 135 | self._system.save_model_state(path) 136 | 137 | @staticmethod 138 | def tune(pretrained_bert_name, train_dataset_file, val_dataset_file, run_on_multi_gpus, preprocessing_function): 139 | lrs = [5e-5, 3e-5, 2e-5] 140 | dp = [0, 0.1, 0.2] 141 | grad_accumulation_steps = [4, 8] 142 | batch_size = 4 143 | params = list(product(lrs, dp, grad_accumulation_steps)) 144 | 145 | tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_name) 146 | 147 | train_dataset = XNLIBERTDataset(train_dataset_file, tokenizer, preprocessing_function) 148 | val_dataset = XNLIBERTDataset(val_dataset_file, tokenizer, preprocessing_function) 
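# Simple grid search: every (lr, dropout, gradient-accumulation) combination below trains a
# fresh system wrapper and is scored on the validation set with macro-F1; the CLI `tune`
# commands then report the best-scoring configuration.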
149 | 150 | results = [] 151 | for i, (lr, dp, grad_accumulation_steps) in enumerate(params): 152 | print(f'{i + 1}/{len(params)}') 153 | torch.manual_seed(0) 154 | current_system_wrapper = XNLIBERTSystemWrapper(pretrained_bert_name, {'dp': dp}) 155 | current_system_wrapper._train_impl( 156 | train_dataset, 157 | val_dataset, 158 | lr, 159 | batch_size, 160 | grad_accumulation_steps, 161 | run_on_multi_gpus 162 | ) 163 | 164 | current_results = current_system_wrapper._evaluate_impl(val_dataset, batch_size, run_on_multi_gpus) 165 | results.append([current_results['macro-f1'].score, (lr, dp, grad_accumulation_steps)]) 166 | 167 | return results 168 | -------------------------------------------------------------------------------- /examples/ud/rnn/system_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import torch 3 | import os 4 | import uuid 5 | 6 | from torch import nn 7 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 8 | from torch.optim import AdamW 9 | from itertools import product 10 | 11 | from ...utils.loss_wrappers import PassThroughLossWrapper 12 | from .model import UDRNNModel 13 | from .dataset import UDRNNDataset 14 | 15 | 16 | class UDRNNSystemWrapper: 17 | 18 | def __init__(self, embeddings, w2i, c2i, model_params): 19 | 20 | self._w2i = w2i 21 | self._c2i = c2i 22 | model = UDRNNModel(embeddings=embeddings, **model_params) 23 | if torch.cuda.is_available(): 24 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cuda')) 25 | else: 26 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cpu')) 27 | 28 | def train(self, 29 | train_dataset_file, 30 | val_dataset_file, 31 | lr, 32 | batch_size, 33 | grad_accumulation_steps, 34 | run_on_multi_gpus, 35 | verbose=True, 36 | seed=0): 37 | torch.manual_seed(seed) 38 | train_dataset = UDRNNDataset(train_dataset_file, self._w2i, self._c2i) 39 | val_dataset = UDRNNDataset(val_dataset_file, self._w2i, self._c2i) 40 | self._train_impl( 41 | train_dataset, 42 | val_dataset, 43 | lr, 44 | batch_size, 45 | grad_accumulation_steps, 46 | run_on_multi_gpus, 47 | verbose 48 | ) 49 | 50 | def _train_impl(self, 51 | train_dataset, 52 | val_dataset, 53 | lr, 54 | batch_size, 55 | grad_accumulation_steps, 56 | run_on_multi_gpus, 57 | verbose=True): 58 | 59 | train_dataloader = DataLoader( 60 | train_dataset, 61 | sampler=RandomSampler(train_dataset), 62 | batch_size=batch_size, 63 | collate_fn=UDRNNDataset.collate_fn 64 | ) 65 | 66 | val_dataloader = DataLoader( 67 | val_dataset, 68 | sampler=SequentialSampler(val_dataset), 69 | batch_size=batch_size, 70 | collate_fn=UDRNNDataset.collate_fn 71 | ) 72 | 73 | loss_wrapper = PassThroughLossWrapper() 74 | optimizer = AdamW(self._system.model.parameters(), lr=lr) 75 | 76 | base_es_path = f'/tmp/{uuid.uuid4().hex[:30]}/' 77 | os.makedirs(base_es_path, exist_ok=True) 78 | 79 | train_method = self._system.train_on_multi_gpus if run_on_multi_gpus else self._system.train 80 | 81 | _ = train_method( 82 | loss_wrapper, 83 | optimizer, 84 | train_data_loader=train_dataloader, 85 | evaluation_data_loaders={'val': val_dataloader}, 86 | evaluators={ 87 | 'macro-f1': pw.evaluators.TokenLabelingEvaluatorWrapper( 88 | pw.evaluators.MultiClassF1Evaluator(average='macro'), 89 | 4 90 | ) 91 | }, 92 | gradient_accumulation_steps=grad_accumulation_steps, 93 | callbacks=[ 94 | pw.training_callbacks.EarlyStoppingCriterionCallback( 95 | 
patience=3, 96 | evaluation_data_loader_key='val', 97 | evaluator_key='macro-f1', 98 | tmp_best_state_filepath=f'{base_es_path}/temp.es.weights' 99 | ) 100 | ], 101 | verbose=verbose 102 | ) 103 | 104 | def evaluate(self, eval_dataset_file, batch_size, run_on_multi_gpus, verbose=True): 105 | eval_dataset = UDRNNDataset(eval_dataset_file, self._w2i, self._c2i) 106 | return self._evaluate_impl(eval_dataset, batch_size, run_on_multi_gpus, verbose) 107 | 108 | def _evaluate_impl(self, eval_dataset, batch_size, run_on_multi_gpus, verbose=True): 109 | 110 | eval_dataloader = DataLoader( 111 | eval_dataset, 112 | sampler=SequentialSampler(eval_dataset), 113 | batch_size=batch_size, 114 | collate_fn=UDRNNDataset.collate_fn 115 | ) 116 | 117 | evaluators = { 118 | 119 | 'acc': pw.evaluators.TokenLabelingEvaluatorWrapper( 120 | pw.evaluators.MultiClassAccuracyEvaluator(), 121 | 4 122 | ), 123 | 'macro-prec': pw.evaluators.TokenLabelingEvaluatorWrapper( 124 | pw.evaluators.MultiClassPrecisionEvaluator(average='macro'), 125 | 4 126 | ), 127 | 'macro-rec': pw.evaluators.TokenLabelingEvaluatorWrapper( 128 | pw.evaluators.MultiClassRecallEvaluator(average='macro'), 129 | 4 130 | ), 131 | 'macro-f1': pw.evaluators.TokenLabelingEvaluatorWrapper( 132 | pw.evaluators.MultiClassF1Evaluator(average='macro'), 133 | 4 134 | ), 135 | 'micro-prec': pw.evaluators.TokenLabelingEvaluatorWrapper( 136 | pw.evaluators.MultiClassPrecisionEvaluator(average='micro'), 137 | 4 138 | ), 139 | 'micro-rec': pw.evaluators.TokenLabelingEvaluatorWrapper( 140 | pw.evaluators.MultiClassRecallEvaluator(average='micro'), 141 | 4 142 | ), 143 | 'micro-f1': pw.evaluators.TokenLabelingEvaluatorWrapper( 144 | pw.evaluators.MultiClassF1Evaluator(average='micro'), 145 | 4 146 | ) 147 | } 148 | 149 | if run_on_multi_gpus: 150 | return self._system.evaluate_on_multi_gpus(eval_dataloader, evaluators, verbose=verbose) 151 | else: 152 | return self._system.evaluate(eval_dataloader, evaluators, verbose=verbose) 153 | 154 | @staticmethod 155 | def tune(embeddings, w2i, c2i, train_dataset_file, val_dataset_file, run_on_multi_gpus): 156 | lrs = [0.01, 0.001] 157 | batch_size = [16, 32, 64] 158 | dp = [0, 0.1, 0.2, 0.3] 159 | hs = [100, 200, 300] 160 | params = list(product(lrs, dp, batch_size, hs)) 161 | grad_accumulation_steps = 1 162 | char_embedding_size = 30 163 | 164 | train_dataset = UDRNNDataset(train_dataset_file, w2i, c2i) 165 | val_dataset = UDRNNDataset(val_dataset_file, w2i, c2i) 166 | 167 | results = [] 168 | for i, (lr, dp, batch_size, hs) in enumerate(params): 169 | print(f'{i + 1}/{len(params)}') 170 | torch.manual_seed(0) 171 | current_system_wrapper = UDRNNSystemWrapper( 172 | embeddings, 173 | w2i, 174 | c2i, 175 | { 176 | 'rnn_dp': dp, 177 | 'mlp_dp': dp, 178 | 'rnn_hidden_size': hs, 179 | 'char_embeddings_shape': (len(c2i), char_embedding_size) 180 | } 181 | ) 182 | current_system_wrapper._train_impl( 183 | train_dataset, 184 | val_dataset, 185 | lr, 186 | batch_size, 187 | grad_accumulation_steps, 188 | run_on_multi_gpus 189 | ) 190 | 191 | current_results = current_system_wrapper._evaluate_impl(val_dataset, batch_size, run_on_multi_gpus) 192 | results.append([current_results['macro-f1'].score, (lr, dp, batch_size, hs)]) 193 | 194 | return results 195 | -------------------------------------------------------------------------------- /examples/ner/bert/system_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import torch 3 | import os 4 | import uuid 
5 | 6 | from torch import nn 7 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 8 | from itertools import product 9 | from transformers import AutoTokenizer, AutoModel, AdamW 10 | from functools import partial 11 | 12 | from ...utils import loss_wrappers, evaluators 13 | from .model import NERBERTModel 14 | from .dataset import NERBERTDataset 15 | 16 | 17 | class NERBERTSystemWrapper: 18 | 19 | def __init__(self, pretrained_bert_name, preprocessing_function, bert_like_special_tokens, model_params): 20 | 21 | self._pretrained_bert_name = pretrained_bert_name 22 | bert_model = AutoModel.from_pretrained(pretrained_bert_name) 23 | model = NERBERTModel(bert_model, **model_params) 24 | self._preprocessing_function = preprocessing_function 25 | self._bert_like_special_tokens = bert_like_special_tokens 26 | 27 | if torch.cuda.is_available(): 28 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cuda')) 29 | else: 30 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cpu')) 31 | 32 | def train(self, 33 | train_dataset_file, 34 | val_dataset_file, 35 | lr, 36 | batch_size, 37 | grad_accumulation_steps, 38 | run_on_multi_gpus, 39 | verbose=True, 40 | seed=0): 41 | torch.manual_seed(seed) 42 | tokenizer = AutoTokenizer.from_pretrained(self._pretrained_bert_name) 43 | 44 | train_dataset = NERBERTDataset( 45 | train_dataset_file, 46 | tokenizer, 47 | self._bert_like_special_tokens, 48 | self._preprocessing_function 49 | ) 50 | 51 | val_dataset = NERBERTDataset( 52 | val_dataset_file, 53 | tokenizer, 54 | self._bert_like_special_tokens, 55 | self._preprocessing_function 56 | ) 57 | 58 | self._train_impl( 59 | train_dataset, 60 | val_dataset, 61 | lr, 62 | batch_size, 63 | grad_accumulation_steps, 64 | run_on_multi_gpus, 65 | tokenizer.pad_token_id, 66 | verbose 67 | ) 68 | 69 | def _train_impl(self, 70 | train_dataset, 71 | val_dataset, 72 | lr, 73 | batch_size, 74 | grad_accumulation_steps, 75 | run_on_multi_gpus, 76 | pad_value, 77 | verbose=True): 78 | 79 | train_dataloader = DataLoader( 80 | train_dataset, 81 | sampler=RandomSampler(train_dataset), 82 | batch_size=batch_size, 83 | collate_fn=partial(NERBERTDataset.collate_fn, pad_value=pad_value) 84 | ) 85 | 86 | val_dataloader = DataLoader( 87 | val_dataset, 88 | sampler=SequentialSampler(val_dataset), 89 | batch_size=batch_size, 90 | collate_fn=partial(NERBERTDataset.collate_fn, pad_value=pad_value) 91 | ) 92 | 93 | loss_wrapper = loss_wrappers.MaskedTokenLabelingGenericPointWiseLossWrapper(nn.CrossEntropyLoss()) 94 | optimizer = AdamW(self._system.model.parameters(), lr=lr) 95 | 96 | base_es_path = f'/tmp/{uuid.uuid4().hex[:30]}/' 97 | os.makedirs(base_es_path, exist_ok=True) 98 | 99 | train_method = self._system.train_on_multi_gpus if run_on_multi_gpus else self._system.train 100 | 101 | _ = train_method( 102 | loss_wrapper, 103 | optimizer, 104 | train_data_loader=train_dataloader, 105 | evaluation_data_loaders={'val': val_dataloader}, 106 | evaluators={ 107 | 'macro-f1': evaluators.MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator(train_dataset.I2L) 108 | }, 109 | gradient_accumulation_steps=grad_accumulation_steps, 110 | callbacks=[ 111 | pw.training_callbacks.EarlyStoppingCriterionCallback( 112 | patience=3, 113 | evaluation_data_loader_key='val', 114 | evaluator_key='macro-f1', 115 | tmp_best_state_filepath=f'{base_es_path}/temp.es.weights' 116 | ) 117 | ], 118 | verbose=verbose 119 | ) 120 | 121 | def evaluate(self, 
eval_dataset_file, batch_size, run_on_multi_gpus, verbose=True): 122 | tokenizer = AutoTokenizer.from_pretrained(self._pretrained_bert_name) 123 | eval_dataset = NERBERTDataset( 124 | eval_dataset_file, 125 | tokenizer, 126 | self._bert_like_special_tokens, 127 | self._preprocessing_function 128 | ) 129 | return self._evaluate_impl(eval_dataset, batch_size, run_on_multi_gpus, tokenizer.pad_token_id, verbose) 130 | 131 | def _evaluate_impl(self, eval_dataset, batch_size, run_on_multi_gpus, pad_value, verbose=True): 132 | 133 | eval_dataloader = DataLoader( 134 | eval_dataset, 135 | sampler=SequentialSampler(eval_dataset), 136 | batch_size=batch_size, 137 | collate_fn=partial(NERBERTDataset.collate_fn, pad_value=pad_value) 138 | ) 139 | 140 | evals = { 141 | 'macro-prec': evaluators.MultiClassPrecisionEvaluatorMaskedTokenEntityLabelingEvaluator(eval_dataset.I2L), 142 | 'macro-rec': evaluators.MultiClassRecallEvaluatorMaskedTokenEntityLabelingEvaluator(eval_dataset.I2L), 143 | 'macro-f1': evaluators.MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator(eval_dataset.I2L), 144 | 'micro-prec': evaluators.MultiClassPrecisionEvaluatorMaskedTokenEntityLabelingEvaluator( 145 | eval_dataset.I2L, 146 | average='micro' 147 | ), 148 | 'micro-rec': evaluators.MultiClassRecallEvaluatorMaskedTokenEntityLabelingEvaluator( 149 | eval_dataset.I2L, 150 | average='micro' 151 | ), 152 | 'micro-f1': evaluators.MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator( 153 | eval_dataset.I2L, 154 | average='micro' 155 | ), 156 | } 157 | 158 | if run_on_multi_gpus: 159 | return self._system.evaluate_on_multi_gpus(eval_dataloader, evals, verbose=verbose) 160 | else: 161 | return self._system.evaluate(eval_dataloader, evals, verbose=verbose) 162 | 163 | def save_model_state(self, path): 164 | self._system.save_model_state(path) 165 | 166 | @staticmethod 167 | def tune(pretrained_bert_name, 168 | preprocessing_function, 169 | bert_like_special_tokens, 170 | train_dataset_file, 171 | val_dataset_file, 172 | run_on_multi_gpus): 173 | lrs = [5e-5, 3e-5, 2e-5] 174 | dp = [0, 0.1, 0.2] 175 | grad_accumulation_steps = [4, 8] 176 | batch_size = 4 177 | params = list(product(lrs, dp, grad_accumulation_steps)) 178 | 179 | tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_name) 180 | 181 | train_dataset = NERBERTDataset( 182 | train_dataset_file, 183 | tokenizer, 184 | bert_like_special_tokens, 185 | preprocessing_function 186 | ) 187 | 188 | val_dataset = NERBERTDataset( 189 | val_dataset_file, 190 | tokenizer, 191 | bert_like_special_tokens, 192 | preprocessing_function 193 | ) 194 | 195 | results = [] 196 | for i, (lr, dp, grad_accumulation_steps) in enumerate(params): 197 | print(f'{i + 1}/{len(params)}') 198 | torch.manual_seed(0) 199 | current_system_wrapper = NERBERTSystemWrapper( 200 | pretrained_bert_name, 201 | preprocessing_function, 202 | bert_like_special_tokens, 203 | {'dp': dp} 204 | ) 205 | 206 | current_system_wrapper._train_impl( 207 | train_dataset, 208 | val_dataset, 209 | lr, 210 | batch_size, 211 | grad_accumulation_steps, 212 | tokenizer.pad_token_id, 213 | run_on_multi_gpus 214 | ) 215 | 216 | current_results = current_system_wrapper._evaluate_impl( 217 | val_dataset, 218 | batch_size, 219 | run_on_multi_gpus, 220 | tokenizer.pad_token_id 221 | ) 222 | results.append([current_results['macro-f1'].score, (lr, dp, grad_accumulation_steps)]) 223 | 224 | return results 225 | -------------------------------------------------------------------------------- /examples/ud/bert/system_wrapper.py: 
-------------------------------------------------------------------------------- 1 | import pytorch_wrapper as pw 2 | import torch 3 | import os 4 | import uuid 5 | 6 | from torch import nn 7 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 8 | from itertools import product 9 | from transformers import AutoTokenizer, AutoModel, AdamW 10 | from functools import partial 11 | 12 | from ...utils import loss_wrappers, evaluators 13 | from .model import UDBERTModel 14 | from .dataset import UDBERTDataset 15 | 16 | 17 | class UDBERTSystemWrapper: 18 | 19 | def __init__(self, pretrained_bert_name, preprocessing_function, bert_like_special_tokens, model_params): 20 | 21 | self._pretrained_bert_name = pretrained_bert_name 22 | bert_model = AutoModel.from_pretrained(pretrained_bert_name) 23 | model = UDBERTModel(bert_model, **model_params) 24 | self._preprocessing_function = preprocessing_function 25 | self._bert_like_special_tokens = bert_like_special_tokens 26 | 27 | if torch.cuda.is_available(): 28 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cuda')) 29 | else: 30 | self._system = pw.System(model, last_activation=nn.Softmax(dim=-1), device=torch.device('cpu')) 31 | 32 | def train(self, 33 | train_dataset_file, 34 | val_dataset_file, 35 | lr, 36 | batch_size, 37 | grad_accumulation_steps, 38 | run_on_multi_gpus, 39 | verbose=True, 40 | seed=0): 41 | torch.manual_seed(seed) 42 | tokenizer = AutoTokenizer.from_pretrained(self._pretrained_bert_name) 43 | 44 | train_dataset = UDBERTDataset( 45 | train_dataset_file, 46 | tokenizer, 47 | self._bert_like_special_tokens, 48 | self._preprocessing_function 49 | ) 50 | 51 | val_dataset = UDBERTDataset( 52 | val_dataset_file, 53 | tokenizer, 54 | self._bert_like_special_tokens, 55 | self._preprocessing_function 56 | ) 57 | 58 | self._train_impl( 59 | train_dataset, 60 | val_dataset, 61 | lr, 62 | batch_size, 63 | grad_accumulation_steps, 64 | run_on_multi_gpus, 65 | tokenizer.pad_token_id, 66 | verbose 67 | ) 68 | 69 | def _train_impl(self, 70 | train_dataset, 71 | val_dataset, 72 | lr, 73 | batch_size, 74 | grad_accumulation_steps, 75 | run_on_multi_gpus, 76 | pad_value, 77 | verbose=True): 78 | 79 | train_dataloader = DataLoader( 80 | train_dataset, 81 | sampler=RandomSampler(train_dataset), 82 | batch_size=batch_size, 83 | collate_fn=partial(UDBERTDataset.collate_fn, pad_value=pad_value) 84 | ) 85 | 86 | val_dataloader = DataLoader( 87 | val_dataset, 88 | sampler=SequentialSampler(val_dataset), 89 | batch_size=batch_size, 90 | collate_fn=partial(UDBERTDataset.collate_fn, pad_value=pad_value) 91 | ) 92 | 93 | loss_wrapper = loss_wrappers.MaskedTokenLabelingGenericPointWiseLossWrapper(nn.CrossEntropyLoss()) 94 | optimizer = AdamW(self._system.model.parameters(), lr=lr) 95 | 96 | base_es_path = f'/tmp/{uuid.uuid4().hex[:30]}/' 97 | os.makedirs(base_es_path, exist_ok=True) 98 | 99 | train_method = self._system.train_on_multi_gpus if run_on_multi_gpus else self._system.train 100 | 101 | _ = train_method( 102 | loss_wrapper, 103 | optimizer, 104 | train_data_loader=train_dataloader, 105 | evaluation_data_loaders={'val': val_dataloader}, 106 | evaluators={ 107 | 'macro-f1': evaluators.MaskedTokenLabelingEvaluatorWrapper( 108 | pw.evaluators.MultiClassF1Evaluator(average='macro') 109 | ) 110 | }, 111 | gradient_accumulation_steps=grad_accumulation_steps, 112 | callbacks=[ 113 | pw.training_callbacks.EarlyStoppingCriterionCallback( 114 | patience=3, 115 | evaluation_data_loader_key='val', 116 | 
evaluator_key='macro-f1', 117 | tmp_best_state_filepath=f'{base_es_path}/temp.es.weights' 118 | ) 119 | ], 120 | verbose=verbose 121 | ) 122 | 123 | def evaluate(self, eval_dataset_file, batch_size, run_on_multi_gpus, verbose=True): 124 | tokenizer = AutoTokenizer.from_pretrained(self._pretrained_bert_name) 125 | eval_dataset = UDBERTDataset( 126 | eval_dataset_file, 127 | tokenizer, 128 | self._bert_like_special_tokens, 129 | self._preprocessing_function 130 | ) 131 | return self._evaluate_impl(eval_dataset, batch_size, run_on_multi_gpus, tokenizer.pad_token_id, verbose) 132 | 133 | def _evaluate_impl(self, eval_dataset, batch_size, run_on_multi_gpus, pad_value, verbose=True): 134 | 135 | eval_dataloader = DataLoader( 136 | eval_dataset, 137 | sampler=SequentialSampler(eval_dataset), 138 | batch_size=batch_size, 139 | collate_fn=partial(UDBERTDataset.collate_fn, pad_value=pad_value) 140 | ) 141 | 142 | evals = { 143 | 'acc': evaluators.MaskedTokenLabelingEvaluatorWrapper(pw.evaluators.MultiClassAccuracyEvaluator()), 144 | 'macro-prec': evaluators.MaskedTokenLabelingEvaluatorWrapper( 145 | pw.evaluators.MultiClassPrecisionEvaluator(average='macro') 146 | ), 147 | 'macro-rec': evaluators.MaskedTokenLabelingEvaluatorWrapper( 148 | pw.evaluators.MultiClassRecallEvaluator(average='macro') 149 | ), 150 | 'macro-f1': evaluators.MaskedTokenLabelingEvaluatorWrapper( 151 | pw.evaluators.MultiClassF1Evaluator(average='macro') 152 | ), 153 | 'micro-prec': evaluators.MaskedTokenLabelingEvaluatorWrapper( 154 | pw.evaluators.MultiClassPrecisionEvaluator(average='micro') 155 | ), 156 | 'micro-rec': evaluators.MaskedTokenLabelingEvaluatorWrapper( 157 | pw.evaluators.MultiClassRecallEvaluator(average='micro') 158 | ), 159 | 'micro-f1': evaluators.MaskedTokenLabelingEvaluatorWrapper( 160 | pw.evaluators.MultiClassF1Evaluator(average='micro') 161 | ) 162 | } 163 | 164 | if run_on_multi_gpus: 165 | return self._system.evaluate_on_multi_gpus(eval_dataloader, evals, verbose=verbose) 166 | else: 167 | return self._system.evaluate(eval_dataloader, evals, verbose=verbose) 168 | 169 | def save_model_state(self, path): 170 | self._system.save_model_state(path) 171 | 172 | @staticmethod 173 | def tune(pretrained_bert_name, 174 | preprocessing_function, 175 | bert_like_special_tokens, 176 | train_dataset_file, 177 | val_dataset_file, 178 | run_on_multi_gpus): 179 | lrs = [5e-5, 3e-5, 2e-5] 180 | dp = [0, 0.1, 0.2] 181 | grad_accumulation_steps = [4, 8] 182 | batch_size = 4 183 | params = list(product(lrs, dp, grad_accumulation_steps)) 184 | 185 | tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_name) 186 | 187 | train_dataset = UDBERTDataset( 188 | train_dataset_file, 189 | tokenizer, 190 | bert_like_special_tokens, 191 | preprocessing_function 192 | ) 193 | 194 | val_dataset = UDBERTDataset( 195 | val_dataset_file, 196 | tokenizer, 197 | bert_like_special_tokens, 198 | preprocessing_function 199 | ) 200 | 201 | results = [] 202 | for i, (lr, dp, grad_accumulation_steps) in enumerate(params): 203 | print(f'{i + 1}/{len(params)}') 204 | torch.manual_seed(0) 205 | current_system_wrapper = UDBERTSystemWrapper( 206 | pretrained_bert_name, 207 | preprocessing_function, 208 | bert_like_special_tokens, 209 | {'dp': dp} 210 | ) 211 | current_system_wrapper._train_impl( 212 | train_dataset, 213 | val_dataset, 214 | lr, 215 | batch_size, 216 | grad_accumulation_steps, 217 | tokenizer.pad_token_id, 218 | run_on_multi_gpus 219 | ) 220 | 221 | current_results = current_system_wrapper._evaluate_impl( 222 | val_dataset, 223 
| batch_size, 224 | run_on_multi_gpus, 225 | tokenizer.pad_token_id 226 | ) 227 | results.append([current_results['macro-f1'].score, (lr, dp, grad_accumulation_steps)]) 228 | 229 | return results 230 | -------------------------------------------------------------------------------- /examples/utils/evaluators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from abc import abstractmethod 4 | from pytorch_wrapper.evaluators import AbstractEvaluator, GenericEvaluatorResults 5 | 6 | 7 | def convert_tags_to_entities(seq): 8 | # for nested list 9 | if any(isinstance(s, list) for s in seq): 10 | seq = [item for sublist in seq for item in sublist + ['O']] 11 | 12 | prev_tag = 'O' 13 | prev_tag_type = '' 14 | begin_offset = 0 15 | chunks = [] 16 | for i, chunk in enumerate(seq + ['O']): 17 | tag = chunk[0] 18 | tag_type = '-'.join(chunk.split('-')[1:]) 19 | if _is_end_of_chunk(prev_tag, tag, prev_tag_type, tag_type): 20 | chunks.append((prev_tag_type, begin_offset, i - 1)) 21 | if _is_start_of_chunk(prev_tag, tag, prev_tag_type, tag_type): 22 | begin_offset = i 23 | prev_tag = tag 24 | prev_tag_type = tag_type 25 | 26 | return chunks 27 | 28 | 29 | def _is_end_of_chunk(prev_tag, tag, prev_tag_type, tag_type): 30 | return ( 31 | prev_tag == 'E' or 32 | prev_tag == 'S' or 33 | prev_tag == 'B' and tag == 'B' or 34 | prev_tag == 'B' and tag == 'S' or 35 | prev_tag == 'B' and tag == 'O' or 36 | prev_tag == 'I' and tag == 'B' or 37 | prev_tag == 'I' and tag == 'S' or 38 | prev_tag == 'I' and tag == 'O' or 39 | prev_tag != 'O' and prev_tag_type != tag_type 40 | ) 41 | 42 | 43 | def _is_start_of_chunk(prev_tag, tag, prev_tag_type, tag_type): 44 | return ( 45 | tag == 'B' or 46 | tag == 'S' or 47 | prev_tag == 'E' and tag == 'E' or 48 | prev_tag == 'E' and tag == 'I' or 49 | prev_tag == 'S' and tag == 'E' or 50 | prev_tag == 'S' and tag == 'I' or 51 | prev_tag == 'O' and tag == 'E' or 52 | prev_tag == 'O' and tag == 'I' or 53 | tag != 'O' and prev_tag_type != tag_type 54 | ) 55 | 56 | 57 | class MaskedTokenLabelingEvaluatorWrapper(AbstractEvaluator): 58 | 59 | def __init__(self, evaluator, batch_input_key='input', model_output_key=None, 60 | batch_target_key='target', batch_mask_key='mask'): 61 | self._evaluator = evaluator 62 | super(MaskedTokenLabelingEvaluatorWrapper, self).__init__() 63 | self._batch_input_key = batch_input_key 64 | self._model_output_key = model_output_key 65 | self._batch_target_key = batch_target_key 66 | self._batch_mask_key = batch_mask_key 67 | self.reset() 68 | 69 | def reset(self): 70 | self._evaluator.reset() 71 | 72 | def step(self, output, batch, last_activation=None): 73 | if self._model_output_key is not None: 74 | output = output[self._model_output_key] 75 | 76 | mask = batch[self._batch_mask_key].to(output.device) 77 | 78 | output_extra_dims = output.dim() - mask.dim() 79 | output_mask_new_shape = list(mask.shape) + [1] * output_extra_dims 80 | output_extra_dims_shape = list(output.shape)[mask.dim():] 81 | output = torch.masked_select(output, mask.view(*output_mask_new_shape)) 82 | output = output.view(-1, *output_extra_dims_shape) 83 | 84 | target = batch[self._batch_target_key].to(output.device) 85 | target_extra_dims = target.dim() - mask.dim() 86 | target_mask_new_shape = list(mask.shape) + [1] * target_extra_dims 87 | target_extra_dims_shape = list(target.shape)[mask.dim():] 88 | target = torch.masked_select(target, mask.view(*target_mask_new_shape)) 89 | target = target.view(-1, *target_extra_dims_shape) 
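# Both `output` and `target` are now flattened to contain only the positions selected by the
# mask, so the wrapped evaluator can treat them as an ordinary (unmasked) classification batch.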
90 | 91 | new_batch = {k: batch[k] for k in batch if k != self._batch_target_key} 92 | new_batch[self._batch_target_key] = target 93 | 94 | self._evaluator.step(output, new_batch, last_activation) 95 | 96 | def calculate(self): 97 | return self._evaluator.calculate() 98 | 99 | 100 | class AbstractMaskedTokenEntityLabelingEvaluator(AbstractEvaluator): 101 | 102 | def __init__(self, i2l, batch_input_key='input', model_output_key=None, batch_target_key='target', 103 | batch_mask_key='mask'): 104 | self._batch_input_key = batch_input_key 105 | self._model_output_key = model_output_key 106 | self._batch_target_key = batch_target_key 107 | self._batch_mask_key = batch_mask_key 108 | self._i2l = i2l 109 | self._labels = [l[2:] for l in i2l if l[:2] == 'B-'] 110 | super(AbstractMaskedTokenEntityLabelingEvaluator, self).__init__() 111 | 112 | def reset(self): 113 | self._tp = {} 114 | self._fp = {} 115 | self._fn = {} 116 | 117 | for l in self._labels: 118 | self._tp[l] = 0 119 | self._fp[l] = 0 120 | self._fn[l] = 0 121 | 122 | def step(self, output, batch, last_activation=None): 123 | if self._model_output_key is not None: 124 | output = output[self._model_output_key] 125 | 126 | output = output.argmax(dim=-1) 127 | mask = batch[self._batch_mask_key].to(output.device) 128 | targets = batch[self._batch_target_key].to(output.device) 129 | 130 | for i in range(output.shape[0]): 131 | cur_out = torch.masked_select(output[i], mask[i]).tolist() 132 | cur_targets = torch.masked_select(targets[i], mask[i]).tolist() 133 | 134 | converted_out = set(convert_tags_to_entities([self._i2l[o] for o in cur_out])) 135 | converted_targets = set(convert_tags_to_entities([self._i2l[t] for t in cur_targets])) 136 | 137 | cur_tp = converted_out.intersection(converted_targets) 138 | cur_fp = converted_out.difference(converted_targets) 139 | cur_fn = converted_targets.difference(converted_out) 140 | 141 | for l in self._labels: 142 | self._tp[l] += len([c for c in cur_tp if c[0] == l]) 143 | self._fp[l] += len([c for c in cur_fp if c[0] == l]) 144 | self._fn[l] += len([c for c in cur_fn if c[0] == l]) 145 | 146 | @abstractmethod 147 | def calculate(self): 148 | pass 149 | 150 | 151 | class MultiClassPrecisionEvaluatorMaskedTokenEntityLabelingEvaluator \ 152 | (AbstractMaskedTokenEntityLabelingEvaluator): 153 | 154 | def __init__(self, i2l, average='macro', batch_input_key='input', model_output_key=None, batch_target_key='target', 155 | batch_mask_key='mask'): 156 | super( 157 | MultiClassPrecisionEvaluatorMaskedTokenEntityLabelingEvaluator, 158 | self 159 | ).__init__( 160 | i2l, 161 | batch_input_key, 162 | model_output_key, 163 | batch_target_key, 164 | batch_mask_key, 165 | ) 166 | 167 | self._average = average 168 | 169 | def calculate(self): 170 | if self._average == 'macro': 171 | per_class_score = {} 172 | for l in self._labels: 173 | denominator = self._tp[l] + self._fp[l] 174 | per_class_score[l] = (self._tp[l] / denominator) if denominator != 0 else 0 175 | score = sum(per_class_score[l] for l in per_class_score) / len(per_class_score) 176 | else: 177 | global_tp = sum(self._tp[l] for l in self._tp) 178 | global_fp = sum(self._fp[l] for l in self._fp) 179 | denominator = global_tp + global_fp 180 | score = (global_tp / denominator) if denominator > 0 else 0 181 | 182 | return GenericEvaluatorResults(score, self._average + '-precision', '%5.4f', is_max_better=True) 183 | 184 | 185 | class MultiClassRecallEvaluatorMaskedTokenEntityLabelingEvaluator \ 186 | (AbstractMaskedTokenEntityLabelingEvaluator): 187 | 188 | 
def __init__(self, i2l, average='macro', batch_input_key='input', model_output_key=None, batch_target_key='target', 189 | batch_mask_key='mask'): 190 | super( 191 | MultiClassRecallEvaluatorMaskedTokenEntityLabelingEvaluator, 192 | self 193 | ).__init__( 194 | i2l, 195 | batch_input_key, 196 | model_output_key, 197 | batch_target_key, 198 | batch_mask_key, 199 | ) 200 | 201 | self._average = average 202 | 203 | def calculate(self): 204 | if self._average == 'macro': 205 | per_class_score = {} 206 | for l in self._labels: 207 | denominator = self._tp[l] + self._fn[l] 208 | per_class_score[l] = (self._tp[l] / denominator) if denominator != 0 else 0 209 | score = sum(per_class_score[l] for l in per_class_score) / len(per_class_score) 210 | else: 211 | global_tp = sum(self._tp[l] for l in self._tp) 212 | global_fn = sum(self._fn[l] for l in self._fn) 213 | denominator = global_tp + global_fn 214 | score = (global_tp / denominator) if denominator > 0 else 0 215 | 216 | return GenericEvaluatorResults(score, self._average + '-recall', '%5.4f', is_max_better=True) 217 | 218 | 219 | class MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator \ 220 | (AbstractMaskedTokenEntityLabelingEvaluator): 221 | 222 | def __init__(self, i2l, average='macro', batch_input_key='input', model_output_key=None, batch_target_key='target', 223 | batch_mask_key='mask'): 224 | super( 225 | MultiClassF1EvaluatorMaskedTokenEntityLabelingEvaluator, 226 | self 227 | ).__init__( 228 | i2l, 229 | batch_input_key, 230 | model_output_key, 231 | batch_target_key, 232 | batch_mask_key, 233 | ) 234 | 235 | self._average = average 236 | 237 | def calculate(self): 238 | if self._average == 'macro': 239 | per_class_score = {} 240 | for l in self._labels: 241 | pr_denominator = self._tp[l] + self._fp[l] 242 | pr_score = (self._tp[l] / pr_denominator) if pr_denominator > 0 else 0 243 | rec_denominator = self._tp[l] + self._fn[l] 244 | rec_score = (self._tp[l] / rec_denominator) if rec_denominator > 0 else 0 245 | denominator = pr_score + rec_score 246 | per_class_score[l] = (2 * pr_score * rec_score) / denominator if denominator > 0 else 0 247 | score = sum(per_class_score[l] for l in per_class_score) / len(per_class_score) 248 | else: 249 | global_tp = sum(self._tp[l] for l in self._tp) 250 | global_fn = sum(self._fn[l] for l in self._fn) 251 | global_fp = sum(self._fp[l] for l in self._fp) 252 | pr_denominator = global_tp + global_fp 253 | pr_score = (global_tp / pr_denominator) if pr_denominator > 0 else 0 254 | rec_denominator = global_tp + global_fn 255 | rec_score = (global_tp / rec_denominator) if rec_denominator > 0 else 0 256 | denominator = pr_score + rec_score 257 | score = (2 * pr_score * rec_score) / denominator if denominator > 0 else 0 258 | 259 | return GenericEvaluatorResults(score, self._average + '-f1', '%5.4f', is_max_better=True) 260 | -------------------------------------------------------------------------------- /examples/ner/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import fasttext 3 | import numpy as np 4 | import pickle 5 | 6 | from ..utils.fasttext_downloader import download_model 7 | from ..utils.text import strip_accents_and_lowercase 8 | from .utils import parse_ner_dataset_file 9 | from .bert.system_wrapper import NERBERTSystemWrapper 10 | from .rnn.system_wrapper import NERRNNSystemWrapper 11 | 12 | 13 | @click.group() 14 | def ner(): 15 | pass 16 | 17 | 18 | @ner.group() 19 | def multi_bert(): 20 | pass 21 | 22 | 23 | 
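# Each model-family group (multi_bert, greek_bert, cased_multi_bert, xlm_r, rnn) exposes a
# `tune` command for hyper-parameter search and a `run` command that trains on the train/dev
# files and reports test-set results; the `rnn` group additionally provides commands for
# preparing fastText embeddings and a character vocabulary.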
@multi_bert.command() 24 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 25 | @click.argument('val_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 26 | @click.option('--multi-gpu', is_flag=True) 27 | def tune(train_dataset_file, val_dataset_file, multi_gpu): 28 | results = NERBERTSystemWrapper.tune( 29 | 'bert-base-multilingual-uncased', 30 | strip_accents_and_lowercase, 31 | True, 32 | train_dataset_file, 33 | val_dataset_file, 34 | multi_gpu 35 | ) 36 | 37 | print(max(results, key=lambda x: x[0])) 38 | 39 | 40 | @multi_bert.command() 41 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 42 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 43 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ner/test.txt') 44 | @click.option('--batch-size', type=int, default=8) 45 | @click.option('--lr', type=float, default=3e-05) 46 | @click.option('--dp', type=float, default=0) 47 | @click.option('--grad-accumulation-steps', type=int, default=2) 48 | @click.option('--multi-gpu', is_flag=True) 49 | @click.option('--silent', is_flag=True) 50 | @click.option('--seed', type=int, default=0) 51 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, batch_size, lr, dp, grad_accumulation_steps, 52 | multi_gpu, silent, seed): 53 | sw = NERBERTSystemWrapper( 54 | 'bert-base-multilingual-uncased', 55 | strip_accents_and_lowercase, 56 | True, 57 | {'dp': dp} 58 | ) 59 | 60 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 61 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 62 | 63 | print(results) 64 | 65 | 66 | @ner.group() 67 | def greek_bert(): 68 | pass 69 | 70 | 71 | @greek_bert.command() 72 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 73 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 74 | @click.option('--multi-gpu', is_flag=True) 75 | def tune(train_dataset_file, dev_dataset_file, multi_gpu): 76 | results = NERBERTSystemWrapper.tune( 77 | 'nlpaueb/bert-base-greek-uncased-v1', 78 | strip_accents_and_lowercase, 79 | True, 80 | train_dataset_file, 81 | dev_dataset_file, 82 | multi_gpu 83 | ) 84 | 85 | print(max(results, key=lambda x: x[0])) 86 | 87 | 88 | @greek_bert.command() 89 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 90 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 91 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ner/test.txt') 92 | @click.option('--model-weights-save-path', type=str, default=None) 93 | @click.option('--batch-size', type=int, default=8) 94 | @click.option('--lr', type=float, default=5e-05) 95 | @click.option('--dp', type=float, default=0.2) 96 | @click.option('--grad-accumulation-steps', type=int, default=2) 97 | @click.option('--multi-gpu', is_flag=True) 98 | @click.option('--silent', is_flag=True) 99 | @click.option('--seed', type=int, default=0) 100 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 101 | grad_accumulation_steps, multi_gpu, silent, seed): 102 | sw = NERBERTSystemWrapper( 103 | 'nlpaueb/bert-base-greek-uncased-v1', 104 | strip_accents_and_lowercase, 105 | True, 106 | {'dp': dp} 107 | ) 108 | 109 | sw.train(train_dataset_file, 
dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 110 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 111 | 112 | print(results) 113 | if model_weights_save_path: 114 | sw.save_model_state(model_weights_save_path) 115 | 116 | 117 | @ner.group() 118 | def cased_multi_bert(): 119 | pass 120 | 121 | 122 | @cased_multi_bert.command() 123 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 124 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 125 | @click.option('--multi-gpu', is_flag=True) 126 | def tune(train_dataset_file, dev_dataset_file, multi_gpu): 127 | results = NERBERTSystemWrapper.tune( 128 | 'bert-base-multilingual-cased', 129 | None, 130 | True, 131 | train_dataset_file, 132 | dev_dataset_file, 133 | multi_gpu 134 | ) 135 | 136 | print(max(results, key=lambda x: x[0])) 137 | 138 | 139 | @cased_multi_bert.command() 140 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 141 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 142 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ner/test.txt') 143 | @click.option('--batch-size', type=int, default=4) 144 | @click.option('--lr', type=float, default=2e-05) 145 | @click.option('--dp', type=float, default=0) 146 | @click.option('--grad-accumulation-steps', type=int, default=8) 147 | @click.option('--multi-gpu', is_flag=True) 148 | @click.option('--silent', is_flag=True) 149 | @click.option('--seed', type=int, default=0) 150 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, batch_size, lr, dp, grad_accumulation_steps, 151 | multi_gpu, silent, seed): 152 | sw = NERBERTSystemWrapper( 153 | 'bert-base-multilingual-cased', 154 | None, 155 | True, 156 | {'dp': dp} 157 | ) 158 | 159 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 160 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 161 | 162 | print(results) 163 | 164 | 165 | @ner.group() 166 | def xlm_r(): 167 | pass 168 | 169 | 170 | @xlm_r.command() 171 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 172 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 173 | @click.option('--multi-gpu', is_flag=True) 174 | def tune(train_dataset_file, dev_dataset_file, multi_gpu): 175 | results = NERBERTSystemWrapper.tune( 176 | 'xlm-roberta-base', 177 | None, 178 | False, 179 | train_dataset_file, 180 | dev_dataset_file, 181 | multi_gpu 182 | ) 183 | 184 | print(max(results, key=lambda x: x[0])) 185 | 186 | 187 | @xlm_r.command() 188 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 189 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 190 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ner/test.txt') 191 | @click.option('--model-weights-save-path', type=str, default=None) 192 | @click.option('--batch-size', type=int, default=8) 193 | @click.option('--lr', type=float, default=2e-05) 194 | @click.option('--dp', type=float, default=0) 195 | @click.option('--grad-accumulation-steps', type=int, default=2) 196 | @click.option('--multi-gpu', is_flag=True) 197 | @click.option('--silent', is_flag=True) 198 | @click.option('--seed', type=int, default=0) 199 | def 
run(train_dataset_file, dev_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 200 | grad_accumulation_steps, multi_gpu, silent, seed): 201 | sw = NERBERTSystemWrapper( 202 | 'xlm-roberta-base', 203 | None, 204 | False, 205 | {'dp': dp} 206 | ) 207 | 208 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 209 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 210 | 211 | print(results) 212 | if model_weights_save_path: 213 | sw.save_model_state(model_weights_save_path) 214 | 215 | 216 | @ner.group() 217 | def rnn(): 218 | pass 219 | 220 | 221 | @rnn.command() 222 | @click.argument('tmp_download_path', type=str, default='data') 223 | @click.argument('embeddings_save_path', type=str, default='data/ner/ner_ft.pkl') 224 | @click.argument('dataset_file_paths', type=str, nargs=-1) 225 | def download_embeddings(tmp_download_path, embeddings_save_path, dataset_file_paths): 226 | download_model('el', tmp_download_path, if_exists='ignore') 227 | ft = fasttext.load_model(f'{tmp_download_path}/cc.el.300.bin') 228 | 229 | if not dataset_file_paths: 230 | dataset_file_paths = [f'data/ner/{ds}.txt' for ds in ('silver_train', 'dev', 'test')] 231 | 232 | vocab = set() 233 | for p in dataset_file_paths: 234 | with open(p) as fr: 235 | for e in parse_ner_dataset_file(fr): 236 | for t in e: 237 | vocab.add(t['text'].lower()) 238 | 239 | word_vectors = [] 240 | i2w = list(vocab) 241 | for word in i2w: 242 | word_vectors.append(ft.get_word_vector(word)) 243 | word_vectors = [[0] * len(word_vectors[0])] + word_vectors 244 | i2w = [''] + i2w 245 | w2i = {w: i for i, w in enumerate(i2w)} 246 | 247 | with open(embeddings_save_path, 'wb') as fw: 248 | pickle.dump((np.array(word_vectors), w2i, i2w), fw) 249 | 250 | 251 | @rnn.command() 252 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/silver_train.txt') 253 | @click.argument('char_vocab_save_path', type=str, default='data/ner/char_voc.pkl') 254 | def create_char_vocab(train_dataset_file, char_vocab_save_path): 255 | vocab = set() 256 | for e in parse_ner_dataset_file(train_dataset_file): 257 | for t in e: 258 | vocab.update(list(t['text'])) 259 | 260 | c2i = {c: i + 4 for i, c in enumerate(vocab)}  # indices 0-3 reserved for special tokens 261 | c2i['<pad>'] = 0  # NOTE: placeholder special-token names (assumed pad / start-of-word / end-of-word / unknown); the original angle-bracketed names were lost 262 | c2i['<sow>'] = 1 263 | c2i['<eow>'] = 2 264 | c2i['<unk>'] = 3 265 | 266 | with open(char_vocab_save_path, 'wb') as fw: 267 | pickle.dump(c2i, fw) 268 | 269 | 270 | @rnn.command() 271 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 272 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ner/dev.txt') 273 | @click.argument('embeddings_file', type=click.File('rb'), default='data/ner/ner_ft.pkl') 274 | @click.argument('char_vocab_file', type=click.File('rb'), default='data/ner/char_voc.pkl') 275 | @click.option('--multi-gpu', is_flag=True) 276 | def tune(train_dataset_file, dev_dataset_file, embeddings_file, char_vocab_file, multi_gpu): 277 | embeddings, w2i, _ = pickle.load(embeddings_file) 278 | c2i = pickle.load(char_vocab_file) 279 | 280 | results = NERRNNSystemWrapper.tune( 281 | embeddings, 282 | w2i, 283 | c2i, 284 | train_dataset_file, 285 | dev_dataset_file, 286 | multi_gpu 287 | ) 288 | 289 | print(max(results, key=lambda x: x[0])) 290 | 291 | 292 | @rnn.command() 293 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ner/train.txt') 294 | @click.argument('dev_dataset_file', type=click.File('r'),
default='data/ner/dev.txt') 295 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ner/test.txt') 296 | @click.argument('embeddings_file', type=click.File('rb'), default='data/ner/ner_ft.pkl') 297 | @click.argument('char_vocab_file', type=click.File('rb'), default='data/ner/char_voc.pkl') 298 | @click.option('--batch-size', type=int, default=16) 299 | @click.option('--lr', type=float, default=1e-03) 300 | @click.option('--dp', type=float, default=0.3) 301 | @click.option('--rnn-hs', type=int, default=300) 302 | @click.option('--char-emb-size', type=int, default=30) 303 | @click.option('--grad-accumulation-steps', type=int, default=1) 304 | @click.option('--multi-gpu', is_flag=True) 305 | @click.option('--silent', is_flag=True) 306 | @click.option('--seed', type=int, default=0) 307 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, embeddings_file, char_vocab_file, batch_size, lr, dp, 308 | rnn_hs, char_emb_size, grad_accumulation_steps, multi_gpu, silent, seed): 309 | embeddings, w2i, _ = pickle.load(embeddings_file) 310 | c2i = pickle.load(char_vocab_file) 311 | 312 | sw = NERRNNSystemWrapper( 313 | embeddings, 314 | w2i, 315 | c2i, 316 | { 317 | 'rnn_dp': dp, 318 | 'mlp_dp': dp, 319 | 'rnn_hidden_size': rnn_hs, 320 | 'char_embeddings_shape': (len(c2i), char_emb_size) 321 | } 322 | ) 323 | 324 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 325 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 326 | 327 | print(results) 328 | 329 | 330 | if __name__ == '__main__': 331 | ner() 332 | -------------------------------------------------------------------------------- /examples/xnli/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import json 4 | import fasttext 5 | import numpy as np 6 | import pickle 7 | 8 | from zipfile import ZipFile 9 | 10 | from ..utils.text import strip_accents_and_lowercase 11 | 12 | 13 | @click.group() 14 | def xnli(): 15 | pass 16 | 17 | 18 | @xnli.command() 19 | @click.argument('data_folder_path', type=str, default='data') 20 | def download_data(data_folder_path): 21 | os.makedirs(data_folder_path, exist_ok=True) 22 | os.system(f'wget http://nlp.cs.aueb.gr/software_and_datasets/xnli_el.zip -P {data_folder_path}') 23 | 24 | with ZipFile(f'{data_folder_path}/xnli_el.zip', 'r') as z: 25 | z.extractall(data_folder_path) 26 | 27 | 28 | @xnli.group() 29 | def multi_bert(): 30 | pass 31 | 32 | 33 | @multi_bert.command() 34 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train40K.jsonl') 35 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 36 | @click.option('--multi-gpu', is_flag=True) 37 | def tune(train_dataset_file, val_dataset_file, multi_gpu): 38 | from .bert.system_wrapper import XNLIBERTSystemWrapper 39 | 40 | results = XNLIBERTSystemWrapper.tune( 41 | 'bert-base-multilingual-uncased', 42 | train_dataset_file, 43 | val_dataset_file, 44 | strip_accents_and_lowercase, 45 | multi_gpu 46 | ) 47 | 48 | print(max(results, key=lambda x: x[0])) 49 | 50 | 51 | @multi_bert.command() 52 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train.jsonl') 53 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 54 | @click.argument('test_dataset_file', type=click.File('r'), 
default='data/xnli_el/xnli.el.test.jsonl') 55 | @click.option('--model-weights-save-path', type=str, default=None) 56 | @click.option('--batch-size', type=int, default=8) 57 | @click.option('--lr', type=float, default=3e-05) 58 | @click.option('--dp', type=float, default=0.1) 59 | @click.option('--grad-accumulation-steps', type=int, default=4) 60 | @click.option('--multi-gpu', is_flag=True) 61 | @click.option('--silent', is_flag=True) 62 | @click.option('--seed', type=int, default=0) 63 | def run(train_dataset_file, val_dataset_file, test_dataset_file, model_weights_save_path, 64 | batch_size, lr, dp, grad_accumulation_steps, multi_gpu, silent, seed): 65 | from .bert.system_wrapper import XNLIBERTSystemWrapper 66 | 67 | sw = XNLIBERTSystemWrapper('bert-base-multilingual-uncased', {'dp': dp}) 68 | 69 | sw.train(train_dataset_file, val_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, 70 | strip_accents_and_lowercase, not silent, seed) 71 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, 72 | strip_accents_and_lowercase, not silent) 73 | 74 | print(results) 75 | if model_weights_save_path: 76 | sw.save_model_state(model_weights_save_path) 77 | 78 | 79 | @xnli.group() 80 | def cased_multi_bert(): 81 | pass 82 | 83 | 84 | @cased_multi_bert.command() 85 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train40K.jsonl') 86 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 87 | @click.option('--multi-gpu', is_flag=True) 88 | def tune(train_dataset_file, val_dataset_file, multi_gpu): 89 | from .bert.system_wrapper import XNLIBERTSystemWrapper 90 | 91 | results = XNLIBERTSystemWrapper.tune( 92 | 'bert-base-multilingual-cased', 93 | train_dataset_file, 94 | val_dataset_file, 95 | None, 96 | multi_gpu 97 | ) 98 | 99 | print(max(results, key=lambda x: x[0])) 100 | 101 | 102 | @cased_multi_bert.command() 103 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train.jsonl') 104 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 105 | @click.argument('test_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.test.jsonl') 106 | @click.option('--model-weights-save-path', type=str, default=None) 107 | @click.option('--batch-size', type=int, default=8) 108 | @click.option('--lr', type=float, default=2e-05) 109 | @click.option('--dp', type=float, default=0.1) 110 | @click.option('--grad-accumulation-steps', type=int, default=2) 111 | @click.option('--multi-gpu', is_flag=True) 112 | @click.option('--silent', is_flag=True) 113 | @click.option('--seed', type=int, default=0) 114 | def run(train_dataset_file, val_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 115 | grad_accumulation_steps, multi_gpu, silent, seed): 116 | from .bert.system_wrapper import XNLIBERTSystemWrapper 117 | 118 | sw = XNLIBERTSystemWrapper('bert-base-multilingual-cased', {'dp': dp}) 119 | 120 | sw.train(train_dataset_file, val_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, None, not silent, 121 | seed) 122 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, None, not silent) 123 | 124 | print(results) 125 | if model_weights_save_path: 126 | sw.save_model_state(model_weights_save_path) 127 | 128 | 129 | @xnli.group() 130 | def greek_bert(): 131 | pass 132 | 133 | 134 | @greek_bert.command() 135 | @click.argument('train_dataset_file', 
type=click.File('r'), default='data/xnli_el/xnli.el.train40K.jsonl') 136 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 137 | @click.option('--multi-gpu', is_flag=True) 138 | def tune(train_dataset_file, val_dataset_file, multi_gpu): 139 | from .bert.system_wrapper import XNLIBERTSystemWrapper 140 | 141 | results = XNLIBERTSystemWrapper.tune( 142 | 'nlpaueb/bert-base-greek-uncased-v1', 143 | train_dataset_file, 144 | val_dataset_file, 145 | multi_gpu, 146 | strip_accents_and_lowercase 147 | ) 148 | 149 | print(max(results, key=lambda x: x[0])) 150 | 151 | 152 | @greek_bert.command() 153 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train.jsonl') 154 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 155 | @click.argument('test_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.test.jsonl') 156 | @click.option('--model-weights-save-path', type=str, default=None) 157 | @click.option('--batch-size', type=int, default=8) 158 | @click.option('--lr', type=float, default=2e-05) 159 | @click.option('--dp', type=float, default=0) 160 | @click.option('--grad-accumulation-steps', type=int, default=2) 161 | @click.option('--multi-gpu', is_flag=True) 162 | @click.option('--silent', is_flag=True) 163 | @click.option('--seed', type=int, default=0) 164 | def run(train_dataset_file, val_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 165 | grad_accumulation_steps, multi_gpu, silent, seed): 166 | from .bert.system_wrapper import XNLIBERTSystemWrapper 167 | 168 | sw = XNLIBERTSystemWrapper('nlpaueb/bert-base-greek-uncased-v1', {'dp': dp}) 169 | 170 | sw.train(train_dataset_file, val_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, 171 | strip_accents_and_lowercase, not silent, seed) 172 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, 173 | strip_accents_and_lowercase, not silent) 174 | 175 | print(results) 176 | if model_weights_save_path: 177 | sw.save_model_state(model_weights_save_path) 178 | 179 | 180 | @xnli.group() 181 | def xlm_r(): 182 | pass 183 | 184 | 185 | @xlm_r.command() 186 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train40K.jsonl') 187 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 188 | @click.option('--multi-gpu', is_flag=True) 189 | def tune(train_dataset_file, val_dataset_file, multi_gpu): 190 | from .bert.system_wrapper import XNLIBERTSystemWrapper 191 | 192 | results = XNLIBERTSystemWrapper.tune( 193 | 'xlm-roberta-base', 194 | train_dataset_file, 195 | val_dataset_file, 196 | multi_gpu, 197 | strip_accents_and_lowercase 198 | ) 199 | 200 | print(max(results, key=lambda x: x[0])) 201 | 202 | 203 | @xlm_r.command() 204 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train.jsonl') 205 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 206 | @click.argument('test_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.test.jsonl') 207 | @click.option('--model-weights-save-path', type=str, default=None) 208 | @click.option('--batch-size', type=int, default=4) 209 | @click.option('--lr', type=float, default=1e-05) 210 | @click.option('--dp', type=float, default=0.1) 211 | @click.option('--grad-accumulation-steps', type=int, default=4) 212 | 
@click.option('--multi-gpu', is_flag=True) 213 | @click.option('--silent', is_flag=True) 214 | @click.option('--seed', type=int, default=0) 215 | def run(train_dataset_file, val_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 216 | grad_accumulation_steps, multi_gpu, silent, seed): 217 | from .bert.system_wrapper import XNLIBERTSystemWrapper 218 | 219 | sw = XNLIBERTSystemWrapper('xlm-roberta-base', {'dp': dp}) 220 | 221 | sw.train(train_dataset_file, val_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, 222 | strip_accents_and_lowercase, not silent, seed) 223 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, 224 | strip_accents_and_lowercase, not silent) 225 | 226 | print(results) 227 | if model_weights_save_path: 228 | sw.save_model_state(model_weights_save_path) 229 | 230 | 231 | @xnli.group() 232 | def dam(): 233 | pass 234 | 235 | 236 | @dam.command() 237 | @click.argument('tmp_download_path', type=str, default='data') 238 | @click.argument('embeddings_save_path', type=str, default='data/xnli_el/xnli_ft.pkl') 239 | @click.argument('dataset_file_paths', type=str, nargs=-1) 240 | def download_embeddings(tmp_download_path, embeddings_save_path, dataset_file_paths): 241 | from ..utils.fasttext_downloader import download_model 242 | from .dam.dataset import XNLIDAMDataset 243 | 244 | if not dataset_file_paths: 245 | dataset_file_paths = [f'data/xnli_el/xnli.el.{ds}.jsonl' for ds in ('train', 'dev', 'test')] 246 | 247 | download_model('el', tmp_download_path, if_exists='ignore') 248 | ft = fasttext.load_model(f'{tmp_download_path}/cc.el.300.bin') 249 | 250 | vocab = set() 251 | for ds in dataset_file_paths: 252 | with open(ds) as fr: 253 | for line in fr: 254 | ex = json.loads(line) 255 | vocab.update(XNLIDAMDataset.process_text(ex['prem'])) 256 | vocab.update(XNLIDAMDataset.process_text(ex['hypo'])) 257 | 258 | word_vectors = [] 259 | i2w = list(vocab) 260 | for word in i2w: 261 | word_vectors.append(ft.get_word_vector(word)) 262 | word_vectors = [[0] * len(word_vectors[0])] + word_vectors 263 | i2w = [''] + i2w 264 | w2i = {w: i for i, w in enumerate(i2w)} 265 | 266 | with open(embeddings_save_path, 'wb') as fw: 267 | pickle.dump((np.array(word_vectors), w2i, i2w), fw) 268 | 269 | 270 | @dam.command() 271 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train40K.jsonl') 272 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 273 | @click.argument('embeddings_file', type=click.File('rb'), default='data/xnli_el/xnli_ft.pkl') 274 | @click.option('--multi-gpu', is_flag=True) 275 | def tune(train_dataset_file, val_dataset_file, embeddings_file, multi_gpu): 276 | from .dam.system_wrapper import XNLIDAMSystemWrapper 277 | 278 | embeddings, w2i, _ = pickle.load(embeddings_file) 279 | 280 | results = XNLIDAMSystemWrapper.tune( 281 | embeddings, 282 | w2i, 283 | train_dataset_file, 284 | val_dataset_file, 285 | multi_gpu 286 | ) 287 | 288 | print(max(results, key=lambda x: x[0])) 289 | 290 | 291 | @dam.command() 292 | @click.argument('train_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.train.jsonl') 293 | @click.argument('val_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.dev.jsonl') 294 | @click.argument('test_dataset_file', type=click.File('r'), default='data/xnli_el/xnli.el.test.jsonl') 295 | @click.argument('embeddings_file', type=click.File('rb'), default='data/xnli_el/xnli_ft.pkl') 296 | 
@click.option('--model-weights-save-path', type=str, default=None) 297 | @click.option('--batch-size', type=int, default=64) 298 | @click.option('--lr', type=float, default=0.001) 299 | @click.option('--dp', type=float, default=0.2) 300 | @click.option('--grad-accumulation-steps', type=int, default=1) 301 | @click.option('--multi-gpu', is_flag=True) 302 | @click.option('--silent', is_flag=True) 303 | @click.option('--seed', type=int, default=0) 304 | def run(train_dataset_file, val_dataset_file, test_dataset_file, embeddings_file, model_weights_save_path, batch_size, 305 | lr, dp, grad_accumulation_steps, multi_gpu, silent, seed): 306 | from .dam.system_wrapper import XNLIDAMSystemWrapper 307 | 308 | embeddings, w2i, _ = pickle.load(embeddings_file) 309 | 310 | sw = XNLIDAMSystemWrapper( 311 | embeddings, 312 | w2i, { 313 | 'mlp_dp': dp 314 | }) 315 | 316 | sw.train(train_dataset_file, val_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 317 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 318 | 319 | print(results) 320 | if model_weights_save_path: 321 | sw.save_model_state(model_weights_save_path) 322 | 323 | 324 | if __name__ == '__main__': 325 | xnli() 326 | -------------------------------------------------------------------------------- /examples/ud/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import fasttext 4 | import numpy as np 5 | import pickle 6 | import urllib.request 7 | 8 | from conllu import parse_incr 9 | 10 | from ..utils.fasttext_downloader import download_model 11 | from ..utils.text import strip_accents_and_lowercase 12 | from .bert.system_wrapper import UDBERTSystemWrapper 13 | from .rnn.system_wrapper import UDRNNSystemWrapper 14 | 15 | 16 | @click.group() 17 | def ud(): 18 | pass 19 | 20 | 21 | @ud.command() 22 | @click.argument('data_folder_path', type=str, default='data') 23 | def download_data(data_folder_path): 24 | os.makedirs(f'{data_folder_path}/ud/', exist_ok=True) 25 | for name, url in [ 26 | ('train', 'https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master/el_gdt-ud-train.conllu'), 27 | ('dev', 'https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master/el_gdt-ud-dev.conllu'), 28 | ('test', 'https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master/el_gdt-ud-test.conllu') 29 | ]: 30 | urllib.request.urlretrieve(url, f'{data_folder_path}/ud/{name}.conllu') 31 | 32 | 33 | @ud.group() 34 | def multi_bert(): 35 | pass 36 | 37 | 38 | @multi_bert.command() 39 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 40 | @click.argument('val_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 41 | @click.option('--multi-gpu', is_flag=True) 42 | def tune(train_dataset_file, val_dataset_file, multi_gpu): 43 | results = UDBERTSystemWrapper.tune( 44 | 'bert-base-multilingual-uncased', 45 | strip_accents_and_lowercase, 46 | True, 47 | train_dataset_file, 48 | val_dataset_file, 49 | multi_gpu 50 | ) 51 | 52 | print(max(results, key=lambda x: x[0])) 53 | 54 | 55 | @multi_bert.command() 56 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 57 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 58 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ud/test.conllu') 59 | @click.option('--batch-size', type=int, default=8) 
60 | @click.option('--lr', type=float, default=2e-05) 61 | @click.option('--dp', type=float, default=0.2) 62 | @click.option('--grad-accumulation-steps', type=int, default=2) 63 | @click.option('--multi-gpu', is_flag=True) 64 | @click.option('--silent', is_flag=True) 65 | @click.option('--seed', type=int, default=0) 66 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, batch_size, lr, dp, grad_accumulation_steps, 67 | multi_gpu, silent, seed): 68 | sw = UDBERTSystemWrapper( 69 | 'bert-base-multilingual-uncased', 70 | strip_accents_and_lowercase, 71 | True, 72 | {'dp': dp} 73 | ) 74 | 75 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 76 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 77 | 78 | print(results) 79 | 80 | 81 | @ud.group() 82 | def greek_bert(): 83 | pass 84 | 85 | 86 | @greek_bert.command() 87 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 88 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 89 | @click.option('--multi-gpu', is_flag=True) 90 | def tune(train_dataset_file, dev_dataset_file, multi_gpu): 91 | results = UDBERTSystemWrapper.tune( 92 | 'nlpaueb/bert-base-greek-uncased-v1', 93 | strip_accents_and_lowercase, 94 | True, 95 | train_dataset_file, 96 | dev_dataset_file, 97 | multi_gpu 98 | ) 99 | 100 | print(max(results, key=lambda x: x[0])) 101 | 102 | 103 | @greek_bert.command() 104 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 105 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 106 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ud/test.conllu') 107 | @click.option('--model-weights-save-path', type=str, default=None) 108 | @click.option('--batch-size', type=int, default=16) 109 | @click.option('--lr', type=float, default=5e-05) 110 | @click.option('--dp', type=float, default=0.1) 111 | @click.option('--grad-accumulation-steps', type=int, default=1) 112 | @click.option('--multi-gpu', is_flag=True) 113 | @click.option('--silent', is_flag=True) 114 | @click.option('--seed', type=int, default=0) 115 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 116 | grad_accumulation_steps, multi_gpu, silent, seed): 117 | sw = UDBERTSystemWrapper( 118 | 'nlpaueb/bert-base-greek-uncased-v1', 119 | strip_accents_and_lowercase, 120 | True, 121 | {'dp': dp} 122 | ) 123 | 124 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 125 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 126 | 127 | print(results) 128 | if model_weights_save_path: 129 | sw.save_model_state(model_weights_save_path) 130 | 131 | 132 | @ud.group() 133 | def cased_multi_bert(): 134 | pass 135 | 136 | 137 | @cased_multi_bert.command() 138 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 139 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 140 | @click.option('--multi-gpu', is_flag=True) 141 | def tune(train_dataset_file, dev_dataset_file, multi_gpu): 142 | results = UDBERTSystemWrapper.tune( 143 | 'bert-base-multilingual-cased', 144 | None, 145 | True, 146 | train_dataset_file, 147 | dev_dataset_file, 148 | multi_gpu 149 | ) 150 | 151 | print(max(results, key=lambda 
x: x[0])) 152 | 153 | 154 | @cased_multi_bert.command() 155 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 156 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 157 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ud/test.conllu') 158 | @click.option('--batch-size', type=int, default=4) 159 | @click.option('--lr', type=float, default=3e-05) 160 | @click.option('--dp', type=float, default=0) 161 | @click.option('--grad-accumulation-steps', type=int, default=8) 162 | @click.option('--multi-gpu', is_flag=True) 163 | @click.option('--silent', is_flag=True) 164 | @click.option('--seed', type=int, default=0) 165 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, batch_size, lr, dp, grad_accumulation_steps, 166 | multi_gpu, silent, seed): 167 | sw = UDBERTSystemWrapper( 168 | 'bert-base-multilingual-cased', 169 | None, 170 | True, 171 | {'dp': dp} 172 | ) 173 | 174 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 175 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 176 | 177 | print(results) 178 | 179 | 180 | @ud.group() 181 | def xlm_r(): 182 | pass 183 | 184 | 185 | @xlm_r.command() 186 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 187 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 188 | @click.option('--multi-gpu', is_flag=True) 189 | def tune(train_dataset_file, dev_dataset_file, multi_gpu): 190 | results = UDBERTSystemWrapper.tune( 191 | 'xlm-roberta-base', 192 | None, 193 | False, 194 | train_dataset_file, 195 | dev_dataset_file, 196 | multi_gpu 197 | ) 198 | 199 | print(max(results, key=lambda x: x[0])) 200 | 201 | 202 | @xlm_r.command() 203 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 204 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 205 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ud/test.conllu') 206 | @click.option('--model-weights-save-path', type=str, default=None) 207 | @click.option('--batch-size', type=int, default=16) 208 | @click.option('--lr', type=float, default=5e-05) 209 | @click.option('--dp', type=float, default=0.2) 210 | @click.option('--grad-accumulation-steps', type=int, default=1) 211 | @click.option('--multi-gpu', is_flag=True) 212 | @click.option('--silent', is_flag=True) 213 | @click.option('--seed', type=int, default=0) 214 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, model_weights_save_path, batch_size, lr, dp, 215 | grad_accumulation_steps, multi_gpu, silent, seed): 216 | sw = UDBERTSystemWrapper( 217 | 'xlm-roberta-base', 218 | None, 219 | False, 220 | {'dp': dp} 221 | ) 222 | 223 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 224 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 225 | 226 | print(results) 227 | if model_weights_save_path: 228 | sw.save_model_state(model_weights_save_path) 229 | 230 | 231 | @ud.group() 232 | def rnn(): 233 | pass 234 | 235 | 236 | @rnn.command() 237 | @click.argument('tmp_download_path', type=str, default='data') 238 | @click.argument('embeddings_save_path', type=str, default='data/ud/ud_ft.pkl') 239 | @click.argument('dataset_file_paths', type=str, nargs=-1) 240 | def 
download_embeddings(tmp_download_path, embeddings_save_path, dataset_file_paths): 241 | download_model('el', tmp_download_path, if_exists='ignore') 242 | ft = fasttext.load_model(f'{tmp_download_path}/cc.el.300.bin') 243 | 244 | if not dataset_file_paths: 245 | dataset_file_paths = [f'data/ud/{ds}.conllu' for ds in ('train', 'dev', 'test')] 246 | 247 | vocab = set() 248 | for p in dataset_file_paths: 249 | with open(p) as fr: 250 | for e in parse_incr(fr): 251 | for t in e: 252 | vocab.add(t['form'].lower()) 253 | 254 | word_vectors = [] 255 | i2w = list(vocab) 256 | for word in i2w: 257 | word_vectors.append(ft.get_word_vector(word)) 258 | word_vectors = [[0] * len(word_vectors[0])] + word_vectors 259 | i2w = [''] + i2w 260 | w2i = {w: i for i, w in enumerate(i2w)} 261 | 262 | with open(embeddings_save_path, 'wb') as fw: 263 | pickle.dump((np.array(word_vectors), w2i, i2w), fw) 264 | 265 | 266 | @rnn.command() 267 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 268 | @click.argument('char_vocab_save_path', type=str, default='data/ud/char_voc.pkl') 269 | def create_char_vocab(train_dataset_file, char_vocab_save_path): 270 | vocab = set() 271 | for e in parse_incr(train_dataset_file): 272 | for t in e: 273 | vocab.update(list(t['form'])) 274 | 275 | c2i = {c: i + 4 for i, c in enumerate(vocab)}  # indices 0-3 reserved for special tokens 276 | c2i['<pad>'] = 0  # NOTE: placeholder special-token names (assumed pad / start-of-word / end-of-word / unknown); the original angle-bracketed names were lost 277 | c2i['<sow>'] = 1 278 | c2i['<eow>'] = 2 279 | c2i['<unk>'] = 3 280 | 281 | with open(char_vocab_save_path, 'wb') as fw: 282 | pickle.dump(c2i, fw) 283 | 284 | 285 | @rnn.command() 286 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 287 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 288 | @click.argument('embeddings_file', type=click.File('rb'), default='data/ud/ud_ft.pkl') 289 | @click.argument('char_vocab_file', type=click.File('rb'), default='data/ud/char_voc.pkl') 290 | @click.option('--multi-gpu', is_flag=True) 291 | def tune(train_dataset_file, dev_dataset_file, embeddings_file, char_vocab_file, multi_gpu): 292 | embeddings, w2i, _ = pickle.load(embeddings_file) 293 | c2i = pickle.load(char_vocab_file) 294 | 295 | results = UDRNNSystemWrapper.tune( 296 | embeddings, 297 | w2i, 298 | c2i, 299 | train_dataset_file, 300 | dev_dataset_file, 301 | multi_gpu 302 | ) 303 | 304 | print(max(results, key=lambda x: x[0])) 305 | 306 | 307 | @rnn.command() 308 | @click.argument('train_dataset_file', type=click.File('r'), default='data/ud/train.conllu') 309 | @click.argument('dev_dataset_file', type=click.File('r'), default='data/ud/dev.conllu') 310 | @click.argument('test_dataset_file', type=click.File('r'), default='data/ud/test.conllu') 311 | @click.argument('embeddings_file', type=click.File('rb'), default='data/ud/ud_ft.pkl') 312 | @click.argument('char_vocab_file', type=click.File('rb'), default='data/ud/char_voc.pkl') 313 | @click.option('--batch-size', type=int, default=64) 314 | @click.option('--lr', type=float, default=1e-02) 315 | @click.option('--dp', type=float, default=0.0) 316 | @click.option('--rnn-hs', type=int, default=100) 317 | @click.option('--char-emb-size', type=int, default=30) 318 | @click.option('--grad-accumulation-steps', type=int, default=1) 319 | @click.option('--multi-gpu', is_flag=True) 320 | @click.option('--silent', is_flag=True) 321 | @click.option('--seed', type=int, default=0) 322 | def run(train_dataset_file, dev_dataset_file, test_dataset_file, embeddings_file, char_vocab_file, batch_size, lr, dp, 323 | rnn_hs, char_emb_size,
grad_accumulation_steps, multi_gpu, silent, seed): 324 | embeddings, w2i, _ = pickle.load(embeddings_file) 325 | c2i = pickle.load(char_vocab_file) 326 | 327 | sw = UDRNNSystemWrapper( 328 | embeddings, 329 | w2i, 330 | c2i, 331 | { 332 | 'rnn_dp': dp, 333 | 'mlp_dp': dp, 334 | 'rnn_hidden_size': rnn_hs, 335 | 'char_embeddings_shape': (len(c2i), char_emb_size) 336 | } 337 | ) 338 | 339 | sw.train(train_dataset_file, dev_dataset_file, lr, batch_size, grad_accumulation_steps, multi_gpu, not silent, seed) 340 | results = sw.evaluate(test_dataset_file, batch_size, multi_gpu, not silent) 341 | 342 | print(results) 343 | 344 | 345 | if __name__ == '__main__': 346 | ud() 347 | --------------------------------------------------------------------------------