├── .DS_Store ├── LICENSE ├── README.md ├── setup.py └── src ├── bert_sequence_tagger ├── __init__.py ├── bert_for_token_classification_custom.py ├── bert_utils.py ├── metrics.py ├── model_trainer_bert.py └── sequence_tagger_bert.py └── example.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IINemo/bert_sequence_tagger/89307d26153eea64f516128b7e68265cf93e097f/.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Artem Shelmanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | A BERT sequence tagger that accepts a list of tokens as input (produced not by a BPE tokenizer but by any "general" tokenizer such as NLTK or Stanford) and produces tagged results in IOB format. 4 | 5 | Basically, you can do: 6 | ```python 7 | from bert_sequence_tagger import SequenceTaggerBert, ModelTrainerBert 8 | 9 | seq_tagger = SequenceTaggerBert(...) # initialize the model for training or load a trained one. 10 | # ... train model with ModelTrainerBert 11 | 12 | seq_tagger.predict([['We', 'are', 'living', 'in', 'New', 'York', 'city', '.'], 13 | ['Satya', 'Narayana', 'Nadella', 'is', 'an', 'engineer', 'and', 'business', 'executive', '.']]) 14 | ``` 15 | Result: 16 | ``` 17 | ([['O', 'O', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'O'], 18 | ['I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']], 19 | [10.09477, 10.004749]) 20 | ``` 21 | 22 | Training a BERT model has many caveats, which include but are not limited to: 23 | - Proper masking of the input. 24 | - Proper padding of the input. 25 | - Loss masking (masking the loss of padded tokens and of BPE suffixes; see the sketch below). 26 | - Adding proper special tokens like [CLS] and [SEP] to the beginning and end of a sequence. 27 | - Annealing of the learning rate, as well as properly keeping track of the best model. 28 | - Proper calculation of the validation / training loss (taking masked tokens and masked loss elements into account). 
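To make the loss-masking caveat concrete, here is a minimal, library-independent sketch of how a pre-tokenized sentence expands into BPE word pieces and how a loss mask can be built so that only the first piece of each word keeps its tag (the exact word-piece splits depend on the vocabulary; the sentence and variable names below are illustrative only):

```python
from pytorch_transformers import BertTokenizer

bpe_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

words = ['Satya', 'Narayana', 'Nadella', 'is', 'an', 'engineer', '.']
bpe_tokens, loss_mask = [], []
for word in words:
    pieces = bpe_tokenizer.tokenize(word)       # a single word may expand into several word pieces
    bpe_tokens += pieces
    loss_mask += [1] + [0] * (len(pieces) - 1)  # 1 = first piece keeps the word's tag, 0 = BPE suffix to mask out

print(bpe_tokens)  # word pieces that still need [CLS]/[SEP] and padding before going into BERT
print(loss_mask)   # positions whose loss should be kept during training
```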
29 | 30 | [Pytorch_transformers](https://github.com/huggingface/transformers) provides a good PyTorch implementation of BertForTokenClassification; however, it lacks code for properly training sequence tagging models. Noticeable effort is required to convert tokenized text into an input suitable for BERT and to reach SOTA performance with it. 31 | 32 | This library does this work for you: it takes tokenized input and performs BPE tokenization, padding, and all the other preparation needed to feed the input to BERT. It also provides a trainer that can achieve the best performance for BERT models. See the example below for the CoNLL-2003 dataset; a more detailed example is available as a Jupyter notebook [here](https://github.com/IINemo/bert_sequence_tagger/blob/master/src/example.ipynb). 33 | 34 | # Example 35 | 36 | ```python 37 | 38 | from bert_sequence_tagger import SequenceTaggerBert, BertForTokenClassificationCustom 39 | from pytorch_transformers import BertTokenizer 40 | 41 | from bert_sequence_tagger.bert_utils import get_model_parameters, prepare_flair_corpus 42 | from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus 43 | from bert_sequence_tagger.model_trainer_bert import ModelTrainerBert 44 | from bert_sequence_tagger.metrics import f1_entity_level, f1_token_level 45 | 46 | from pytorch_transformers import AdamW, WarmupLinearSchedule 47 | 48 | 49 | import logging 50 | import sys 51 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 52 | logger = logging.getLogger('sequence_tagger_bert') 53 | 54 | 55 | # Loading corpus ############################ 56 | 57 | from flair.datasets import ColumnCorpus 58 | 59 | data_folder = './conll2003' 60 | corpus = ColumnCorpus(data_folder, 61 | {0 : 'text', 3 : 'ner'}, 62 | train_file='eng.train', 63 | test_file='eng.testb', 64 | dev_file='eng.testa') 65 | 66 | 67 | # Creating model ############################ 68 | 69 | batch_size = 16 70 | n_epochs = 4 71 | model_type = 'bert-base-cased' 72 | bpe_tokenizer = BertTokenizer.from_pretrained(model_type, do_lower_case=False) 73 | 74 | idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus) 75 | 76 | model = BertForTokenClassificationCustom.from_pretrained(model_type, 77 | num_labels=len(tag2idx)).cuda() 78 | 79 | seq_tagger = SequenceTaggerBert(bert_model=model, bpe_tokenizer=bpe_tokenizer, 80 | idx2tag=idx2tag, tag2idx=tag2idx, max_len=128, 81 | pred_batch_size=batch_size) 82 | 83 | 84 | # Training ############################ 85 | 86 | train_dataset = prepare_flair_corpus(corpus.train) 87 | val_dataset = prepare_flair_corpus(corpus.dev) 88 | 89 | optimizer = AdamW(get_model_parameters(model), lr=5e-5, betas=(0.9, 0.999), 90 | eps=1e-6, weight_decay=0.01, correct_bias=True) 91 | 92 | n_iterations_per_epoch = len(corpus.train) / batch_size 93 | n_steps = n_iterations_per_epoch * n_epochs 94 | lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1, t_total=n_steps) 95 | 96 | trainer = ModelTrainerBert(model=seq_tagger, 97 | optimizer=optimizer, 98 | lr_scheduler=lr_scheduler, 99 | train_dataset=train_dataset, 100 | val_dataset=val_dataset, 101 | validation_metrics=[f1_entity_level], 102 | batch_size=batch_size) 103 | 104 | trainer.train(epochs=n_epochs) 105 | 106 | # Testing ############################ 107 | 108 | test_dataset = prepare_flair_corpus(corpus.test) 109 | _, __, test_metrics = seq_tagger.predict(test_dataset, evaluate=True, 110 | metrics=[f1_entity_level, f1_token_level]) 111 | print(f'Entity-level f1: {test_metrics[1]}') 112 | print(f'Token-level f1: {test_metrics[2]}') 
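# Saving / loading ############################
# Optionally persist the trained tagger and restore it later; save_serialize() and
# load_serialized() are the helpers defined in SequenceTaggerBert, and the directory
# path below is only an example.
seq_tagger.save_serialize('./conll2003_bert_tagger')
seq_tagger = SequenceTaggerBert.load_serialized('./conll2003_bert_tagger',
                                                BertForTokenClassificationCustom)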
113 | 114 | # Predicting ############################ 115 | seq_tagger.predict([['We', 'are', 'living', 'in', 'New', 'York', 'city', '.']]) 116 | 117 | ``` 118 | 119 | # Installation 120 | 121 | pip install git+https://github.com/IINemo/bert_sequence_tagger.git 122 | 123 | # Requirements 124 | 125 | - torch 126 | - tensorflow 127 | - pytorch_transformers 128 | - flair (optional for reading conll formatted files) 129 | - seqeval (optional for evaluation) 130 | - sklearn (optional for evaluation) 131 | 132 | # Cite 133 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 134 | @inproceedings{shelmanov2019bibm, 135 | title={Active Learning with Deep Pre-trained Models for Sequence Tagging of Clinical and Biomedical Texts}, 136 | author={Artem Shelmanov and Vadim Liventsev and Danil Kireev and Nikita Khromov and Alexander Panchenko and Irina Fedulova and Dmitry V. Dylov}, 137 | booktitle={Proceedings of International Conference on Bioinformatics & Biomedicine (BIBM)}, 138 | year={2019} 139 | } 140 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 141 | 142 | # TODO 143 | - Remove dependency from tensorflow 144 | - Make ModelTrainer more generalizable 145 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import io 4 | import re 5 | from glob import glob 6 | from os.path import basename 7 | from os.path import dirname 8 | from os.path import join 9 | from os.path import splitext 10 | 11 | from setuptools import find_packages 12 | from setuptools import setup 13 | 14 | 15 | def read(*names, **kwargs): 16 | return io.open( 17 | join(dirname(__file__), *names), 18 | encoding=kwargs.get('encoding', 'utf8') 19 | ).read() 20 | 21 | 22 | setup( 23 | name='bert_sequence_tagger', 24 | version='0.1.0', 25 | description='A wrapper for hugging face transformers library', 26 | author='IINemo', 27 | author_email='', 28 | packages=['bert_sequence_tagger'], 29 | include_package_data=True, 30 | zip_safe=False, 31 | package_dir={'' : 'src'}, 32 | install_requires=[ 33 | 'pytorch_transformers' 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /src/bert_sequence_tagger/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_tagger_bert import SequenceTaggerBert 2 | from .bert_for_token_classification_custom import BertForTokenClassificationCustom 3 | from .model_trainer_bert import ModelTrainerBert 4 | -------------------------------------------------------------------------------- /src/bert_sequence_tagger/bert_for_token_classification_custom.py: -------------------------------------------------------------------------------- 1 | from pytorch_transformers import BertForTokenClassification 2 | 3 | from torch.nn import CrossEntropyLoss 4 | 5 | 6 | class BertForTokenClassificationCustom(BertForTokenClassification): 7 | def __init__(self, config): 8 | super().__init__(config) 9 | 10 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, 11 | position_ids=None, head_mask=None, loss_mask=None): 12 | outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, 13 | attention_mask=attention_mask, head_mask=head_mask) 14 | sequence_output = outputs[0] 15 | 16 | sequence_output = self.dropout(sequence_output) 17 | logits = 
self.classifier(sequence_output) 18 | 19 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 20 | if labels is not None: 21 | loss_fct = CrossEntropyLoss() 22 | # Only keep active parts of the loss 23 | if attention_mask is not None: 24 | active_loss = (attention_mask.view(-1) == 1) 25 | if loss_mask is not None: 26 | active_loss &= loss_mask.view(-1) 27 | 28 | active_logits = logits.view(-1, self.num_labels)[active_loss] 29 | active_labels = labels.view(-1)[active_loss] 30 | loss = loss_fct(active_logits, active_labels) 31 | else: 32 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 33 | outputs = (loss,) + outputs 34 | 35 | return outputs # (loss), scores, (hidden_states), (attentions) 36 | -------------------------------------------------------------------------------- /src/bert_sequence_tagger/bert_utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from torch.utils.data import RandomSampler, SequentialSampler 3 | 4 | 5 | def make_bert_tag_dict_from_flair_corpus(corpus): 6 | tags_vals = corpus.make_tag_dictionary('ner').get_items() 7 | tags_vals.remove('') 8 | tags_vals.remove('') 9 | tags_vals.remove('') 10 | tags_vals = ['[PAD]'] + tags_vals # + ['X']#, '[CLS]', '[SEP]'] 11 | tag2idx = {t : i for i, t in enumerate(tags_vals)} 12 | return tags_vals, tag2idx 13 | 14 | 15 | def prepare_flair_corpus(corpus, name='ner', filter_tokens={'-DOCSTART-'}): 16 | result = [] 17 | for sent in corpus: 18 | if sent[0].text in filter_tokens: 19 | continue 20 | else: 21 | result.append(([token.text for token in sent.tokens], 22 | [token.get_tag(name).value for token in sent.tokens])) 23 | 24 | return result 25 | 26 | 27 | def get_parameters_without_decay(model, no_decay={'bias', 'gamma', 'beta'}): 28 | params_no_decay = [] 29 | params_decay = [] 30 | for n, p in model.named_parameters(): 31 | if any((e in n) for e in no_decay): 32 | params_no_decay.append(p) 33 | else: 34 | params_decay.append(p) 35 | 36 | return [{'params' : params_no_decay, 'weight_decay' : 0.}, 37 | {'params' : params_decay}] 38 | 39 | 40 | def get_model_parameters(model, no_decay={'bias', 'gamma', 'beta'}, 41 | full_finetuning=True, lr_head=None): 42 | grouped_parameters = get_parameters_without_decay(model.classifier, no_decay) 43 | if lr_head is not None: 44 | for param in grouped_parameters: 45 | param['lr'] = lr_head 46 | 47 | if full_finetuning: 48 | grouped_parameters = (get_parameters_without_decay(model.bert, no_decay) 49 | + grouped_parameters) 50 | 51 | return grouped_parameters 52 | 53 | 54 | def create_loader_from_flair_corpus(corpus, sampler_ctor=None, batch_size=100, shuffle=True): 55 | collate_fn = lambda inpt: tuple(zip(*inpt)) 56 | 57 | if sampler_ctor is None: 58 | sampler_ctor = RandomSampler if shuffle else SequentialSampler 59 | 60 | dataset = prepare_flair_corpus(corpus) 61 | sampler = sampler_ctor(dataset) 62 | dataloader = DataLoader(dataset, 63 | sampler=sampler, 64 | batch_size=batch_size, 65 | collate_fn=collate_fn) 66 | return dataloader 67 | -------------------------------------------------------------------------------- /src/bert_sequence_tagger/metrics.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from sklearn.metrics import f1_score as f1_score_sklearn 3 | from seqeval.metrics import f1_score 4 | 5 | 6 | def f1_entity_level(*args, **kwargs): 7 | return f1_score(*args, **kwargs) 8 | 9 | 10 | def 
f1_token_level(true_labels, predictions): 11 | true_labels = list(itertools.chain(*true_labels)) 12 | predictions = list(itertools.chain(*predictions)) 13 | 14 | labels = list(set(true_labels) - {'[PAD]', 'O'}) 15 | 16 | return f1_score_sklearn(true_labels, 17 | predictions, 18 | average='micro', 19 | labels=labels) 20 | -------------------------------------------------------------------------------- /src/bert_sequence_tagger/model_trainer_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch_transformers import AdamW 3 | from torch.utils.data import DataLoader 4 | 5 | import copy 6 | from tqdm import trange 7 | 8 | import logging 9 | logger = logging.getLogger('sequence_tagger_bert') 10 | 11 | 12 | 13 | class ModelTrainerBert: 14 | def __init__(self, 15 | model, 16 | optimizer, 17 | lr_scheduler, 18 | train_dataset, 19 | val_dataset, 20 | update_scheduler='es', # ee(every_epoch) or every_step(es) 21 | keep_best_model=False, 22 | restore_bm_on_lr_change=False, 23 | max_grad_norm=1.0, 24 | smallest_lr=0., 25 | validation_metrics=None, 26 | decision_metric=None, 27 | loader_args={'num_workers' : 1}, 28 | batch_size=32): 29 | self._model = model 30 | self._optimizer = optimizer 31 | self._lr_scheduler = lr_scheduler 32 | 33 | self._train_dataset = train_dataset 34 | self._val_dataset = val_dataset 35 | 36 | self._update_scheduler = update_scheduler 37 | self._keep_best_model = keep_best_model 38 | self._restore_bm_on_lr_change = restore_bm_on_lr_change 39 | self._max_grad_norm = max_grad_norm 40 | self._smallest_lr = smallest_lr 41 | self._validation_metrics = validation_metrics 42 | self._decision_metric = decision_metric 43 | if self._decision_metric is None: 44 | self._decision_metric = lambda metrics: metrics[0] 45 | 46 | self._loader_args = loader_args 47 | self._batch_size = batch_size 48 | 49 | def _make_tensors(self, dataset_row): 50 | tokens, labels = tuple(zip(*dataset_row)) 51 | return self._model.generate_tensors_for_training(tokens, labels) 52 | 53 | def train(self, epochs): 54 | best_model = {} 55 | best_dec_metric = float('inf') 56 | 57 | get_lr = lambda: self._optimizer.param_groups[0]['lr'] 58 | 59 | train_dataloader = DataLoader(self._train_dataset, 60 | batch_size=self._batch_size, 61 | shuffle=True, 62 | collate_fn=self._make_tensors) 63 | iterator = trange(epochs, desc='Epoch') 64 | for epoch in iterator: 65 | self._model._bert_model.train() 66 | 67 | cum_loss = 0. 68 | for nb, tensors in enumerate(train_dataloader): 69 | loss = self._model.batch_loss_tensors(*tensors) 70 | cum_loss += loss.item() 71 | 72 | self._model._bert_model.zero_grad() 73 | loss.backward() 74 | if self._max_grad_norm > 0.: 75 | torch.nn.utils.clip_grad_norm_(parameters=self._model._bert_model.parameters(), 76 | max_norm=self._max_grad_norm) 77 | 78 | self._optimizer.step() 79 | 80 | if self._update_scheduler == 'es': 81 | self._lr_scheduler.step() 82 | 83 | prev_lr = get_lr() 84 | logger.info(f'Current learning rate: {prev_lr}') 85 | 86 | cum_loss /= (nb + 1) 87 | logger.info(f'Train loss: {cum_loss}') 88 | 89 | dec_metric = 0. 
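# End of epoch: when a validation set is provided, the block below evaluates the model,
# keeps the best weights according to the decision metric (lower is better; the validation
# loss by default), steps 'ee' (every-epoch) schedulers, and, if restore_bm_on_lr_change is
# set, restores the best weights on each learning-rate reduction and stops training once the
# rate drops below smallest_lr.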
90 | if self._val_dataset is not None: 91 | _, __, val_metrics = self._model.predict(self._val_dataset, evaluate=True, 92 | metrics=self._validation_metrics) 93 | val_loss = val_metrics[0] 94 | logger.info(f'Validation loss: {val_loss}') 95 | logger.info(f'Validation metrics: {val_metrics[1:]}') 96 | 97 | dec_metric = self._decision_metric(val_metrics) 98 | 99 | if self._keep_best_model and (dec_metric < best_dec_metric): 100 | best_model = copy.deepcopy(self._model._bert_model.state_dict()) 101 | best_dec_metric = dec_metric 102 | 103 | if self._update_scheduler == 'ee': 104 | self._lr_scheduler.step(dec_metric) 105 | 106 | if self._restore_bm_on_lr_change and get_lr() < prev_lr: 107 | if get_lr() < self._smallest_lr: 108 | iterator.close() 109 | break 110 | 111 | prev_lr = get_lr() 112 | logger.info(f'Reduced learning rate to: {prev_lr}') 113 | 114 | logger.info('Restoring best model...') 115 | self._model._bert_model.load_state_dict(best_model) 116 | 117 | if best_model: 118 | self._model._bert_model.load_state_dict(best_model) 119 | 120 | torch.cuda.empty_cache() 121 | -------------------------------------------------------------------------------- /src/bert_sequence_tagger/sequence_tagger_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | 4 | from tensorflow.keras.preprocessing.sequence import pad_sequences 5 | 6 | from .bert_for_token_classification_custom import BertForTokenClassificationCustom 7 | 8 | import itertools 9 | from tqdm import trange 10 | import numpy as np 11 | import pickle 12 | import json 13 | import os 14 | 15 | 16 | import logging 17 | logger = logging.getLogger('sequence_tagger_bert') 18 | 19 | 20 | class SequenceTaggerBert: 21 | def __init__(self, bert_model, bpe_tokenizer, idx2tag, tag2idx, 22 | max_len=100, pred_loader_args={'num_workers' : 1}, 23 | pred_batch_size=100): 24 | super().__init__() 25 | 26 | self._bert_model = bert_model 27 | self._bpe_tokenizer = bpe_tokenizer 28 | self._idx2tag = idx2tag 29 | self._tag2idx = tag2idx 30 | self._max_len = max_len 31 | self._pred_loader_args = pred_loader_args 32 | self._pred_batch_size = pred_batch_size 33 | 34 | def _bpe_tokenize(self, words): 35 | new_words = [] 36 | bpe_masks = [] 37 | for word in words: 38 | bpe_tokens = self._bpe_tokenizer.tokenize(word) 39 | new_words += bpe_tokens 40 | bpe_masks += [1] + [0] * (len(bpe_tokens) - 1) 41 | 42 | return new_words, bpe_masks 43 | 44 | def _make_tokens_tensors(self, tokens, max_len): 45 | bpe_tokens, bpe_masks = tuple(zip(*[self._bpe_tokenize(sent) for sent in tokens])) 46 | bpe_tokens = prepare_bpe_tokens_for_bert(bpe_tokens, max_len=max_len) 47 | bpe_masks = [[1] + masks[:max_len-2] + [1] for masks in bpe_masks] 48 | max_len = max(len(sent) for sent in bpe_tokens) 49 | token_ids = torch.tensor(create_tensors_for_tokens(self._bpe_tokenizer, bpe_tokens, max_len=max_len)) 50 | token_masks = generate_masks(token_ids) 51 | return bpe_tokens, max_len, token_ids, token_masks, bpe_masks 52 | 53 | def _add_x_labels(self, labels, bpe_masks): 54 | result_labels = [] 55 | for l_sent, m_sent in zip(labels, bpe_masks): 56 | m_sent = m_sent[1:-1] 57 | sent_res = [] 58 | i = 0 59 | for l in l_sent: 60 | sent_res.append(l) 61 | 62 | i += 1 63 | while i < len(m_sent) and (m_sent[i] == 0): 64 | i += 1 65 | sent_res.append('[PAD]') 66 | 67 | result_labels.append(sent_res) 68 | 69 | return result_labels 70 | 71 | def _make_label_tensors(self, labels, bpe_masks, max_len): 72 | 
bpe_labels = self._add_x_labels(labels, bpe_masks) 73 | bpe_labels = prepare_bpe_labels_for_bert(bpe_labels, max_len=max_len) 74 | label_ids = torch.tensor(create_tensors_for_labels(self._tag2idx, bpe_labels, max_len=max_len)) 75 | loss_masks = label_ids != self._tag2idx['[PAD]'] 76 | return label_ids, loss_masks 77 | 78 | def _logits_to_preds(self, logits, bpe_masks, tokens): 79 | preds = logits.argmax(dim=2).numpy() 80 | probs = logits.numpy().max(axis=2) 81 | prob = [np.mean([p for p, m in zip(prob[:len(masks)], masks[:len(prob)]) if m][1:-1]) 82 | for prob, masks in zip(probs, bpe_masks)] 83 | preds = [[self._idx2tag[p] for p, m in zip(pred[:len(masks)], masks[:len(pred)]) if m][1:-1] 84 | for pred, masks in zip(preds, bpe_masks)] 85 | preds = [pred + ['O']*(max(0, len(toks) - len(pred))) for pred, toks in zip(preds, tokens)] 86 | return preds, prob 87 | 88 | def generate_tensors_for_prediction(self, evaluate, dataset_row): 89 | dataset_row = dataset_row 90 | labels = None 91 | if evaluate: 92 | tokens, labels = tuple(zip(*dataset_row)) 93 | else: 94 | tokens = dataset_row 95 | 96 | _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len) 97 | label_ids = None 98 | loss_masks = None 99 | 100 | if evaluate: 101 | label_ids, loss_masks = self._make_label_tensors(labels, bpe_masks, max_len) 102 | 103 | return token_ids, token_masks, bpe_masks, label_ids, loss_masks, tokens, labels 104 | 105 | def predict(self, dataset, evaluate=False, metrics=None): 106 | if metrics is None: 107 | metrics = [] 108 | 109 | self._bert_model.eval() 110 | 111 | dataloader = DataLoader(dataset, 112 | collate_fn=lambda dataset_row: self.generate_tensors_for_prediction(evaluate, dataset_row), 113 | **self._pred_loader_args, 114 | batch_size=self._pred_batch_size) 115 | 116 | predictions = [] 117 | probas = [] 118 | 119 | if evaluate: 120 | cum_loss = 0. 
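# Prediction loop: each batch is moved to the GPU, loss_mask is passed only when the model
# is a BertForTokenClassificationCustom, the masked validation loss is accumulated when
# evaluate=True, and logits are mapped back to per-token tags using the BPE masks.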
121 | true_labels = [] 122 | 123 | for nb, tensors in enumerate(dataloader): 124 | token_ids, token_masks, bpe_masks, label_ids, loss_masks, tokens, labels = tensors 125 | 126 | if evaluate: 127 | true_labels.extend(labels) 128 | 129 | with torch.no_grad(): 130 | token_ids = token_ids.cuda() 131 | token_masks = token_masks.cuda() 132 | 133 | if evaluate: 134 | label_ids = label_ids.cuda() 135 | loss_masks = loss_masks.cuda() 136 | 137 | if type(self._bert_model) is BertForTokenClassificationCustom: 138 | logits = self._bert_model(token_ids, 139 | token_type_ids=None, 140 | attention_mask=token_masks, 141 | labels=label_ids, 142 | loss_mask=loss_masks) 143 | else: 144 | logits = self._bert_model(token_ids, 145 | token_type_ids=None, 146 | attention_mask=token_masks, 147 | labels=label_ids,) 148 | 149 | if evaluate: 150 | loss, logits = logits 151 | cum_loss += loss.mean().item() 152 | else: 153 | logits = logits[0] 154 | 155 | b_preds, b_prob = self._logits_to_preds(logits.cpu(), bpe_masks, tokens) 156 | 157 | predictions.extend(b_preds) 158 | probas.extend(b_prob) 159 | 160 | if evaluate: 161 | cum_loss /= (nb + 1) 162 | 163 | result_metrics = [] 164 | for metric in metrics: 165 | result_metrics.append(metric(true_labels, predictions)) 166 | 167 | return predictions, probas, tuple([cum_loss] + result_metrics) 168 | else: 169 | return predictions, probas 170 | 171 | def generate_tensors_for_training(self, tokens, labels): 172 | _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len) 173 | label_ids, loss_masks = self._make_label_tensors(labels, bpe_masks, max_len) 174 | return token_ids, token_masks, label_ids, loss_masks 175 | 176 | def generate_feature_tensors_for_prediction(self, tokens): 177 | _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len) 178 | return token_ids, token_masks, bpe_masks 179 | 180 | def batch_loss_tensors(self, *tensors): 181 | token_ids, token_masks, label_ids, loss_masks = tensors 182 | token_ids = token_ids.cuda() 183 | token_masks = token_masks.cuda() 184 | label_ids = label_ids.cuda() 185 | loss_masks = loss_masks.cuda() 186 | 187 | if type(self._bert_model) is BertForTokenClassificationCustom: 188 | output = self._bert_model(token_ids, 189 | token_type_ids=None, 190 | attention_mask=token_masks, 191 | labels=label_ids, 192 | loss_mask=loss_masks) 193 | else: 194 | output = self._bert_model(token_ids, 195 | token_type_ids=None, 196 | attention_mask=token_masks, 197 | labels=label_ids) 198 | 199 | loss = output[0] 200 | return loss.mean() 201 | 202 | def batch_loss(self, tokens, labels): 203 | token_ids, token_masks, label_ids, loss_masks = self.generate_tensors_for_training(tokens, labels) 204 | return self.batch_loss_tensors(token_ids, None, token_masks, label_ids, loss_masks) 205 | 206 | def batch_logits(self, tokens): 207 | _, max_len, token_ids, token_masks, __ = self._make_tokens_tensors(tokens, self._max_len) 208 | token_ids = token_ids.cuda() 209 | token_masks = token_masks.cuda() 210 | 211 | logits = self._bert_model(token_ids, 212 | token_type_ids=None, 213 | attention_mask=token_masks, 214 | labels=None, 215 | loss_mask=None)[0] 216 | 217 | return logits 218 | 219 | def save_serialize(self, save_dir_path): 220 | if not os.path.exists(save_dir_path): 221 | os.makedirs(save_dir_path) 222 | 223 | torch.save(self._bert_model.state_dict(), os.path.join(save_dir_path, 'pytorch_model.bin')) 224 | with open(os.path.join(save_dir_path, 'bpe_tokenizer.pckl'), 'wb') as f: 225 | 
pickle.dump(self._bpe_tokenizer, f) 226 | 227 | self._bert_model.config.save_pretrained(os.path.join(save_dir_path)) 228 | 229 | parameters_dict = { 230 | 'idx2tag' : self._idx2tag, 231 | 'tag2idx' : self._tag2idx, 232 | 'max_len' : self._max_len, 233 | 'pred_loader_args' : self._pred_loader_args, 234 | 'pred_batch_size' : self._pred_batch_size 235 | } 236 | with open(os.path.join(save_dir_path, 'sec_parameters.json'), 'w') as f: 237 | json.dump(parameters_dict, f) 238 | 239 | @classmethod 240 | def load_serialized(cls, load_dir_path, bert_model_type): 241 | with open(os.path.join(load_dir_path, 'sec_parameters.json'), 'r') as f: 242 | parameters_dict = json.load(f) 243 | 244 | bert_model = bert_model_type.from_pretrained(load_dir_path).cuda() 245 | 246 | with open(os.path.join(load_dir_path, 'bpe_tokenizer.pckl'), 'rb') as f: 247 | bpe_tokenizer = pickle.load(f) 248 | 249 | return SequenceTaggerBert(bert_model, bpe_tokenizer, 250 | idx2tag=parameters_dict['idx2tag'], 251 | tag2idx=parameters_dict['tag2idx'], 252 | max_len=parameters_dict['max_len'], 253 | pred_loader_args=parameters_dict['pred_loader_args'], 254 | pred_batch_size=parameters_dict['pred_batch_size']) 255 | 256 | # TODO: raw batch 257 | 258 | 259 | def prepare_bpe_tokens_for_bert(tokens, max_len): 260 | return [['[CLS]'] + list(toks[:max_len - 2]) + ['[SEP]'] for toks in tokens] 261 | 262 | 263 | def prepare_bpe_labels_for_bert(labels, max_len): 264 | return [['[PAD]'] + list(ls[:max_len - 2]) + ['[PAD]'] for ls in labels] 265 | 266 | 267 | def generate_masks(input_ids): 268 | res = input_ids > 0 269 | return res.astype('float') if type(input_ids) is np.ndarray else res 270 | 271 | 272 | def create_tensors_for_tokens(bpe_tokenizer, sents, max_len): 273 | return pad_sequences([bpe_tokenizer.convert_tokens_to_ids(sent) for sent in sents], 274 | maxlen=max_len, dtype='long', 275 | truncating='post', padding='post') 276 | 277 | 278 | def create_tensors_for_labels(tag2idx, labels, max_len): 279 | return pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], 280 | maxlen=max_len, value=tag2idx['[PAD]'], padding='post', 281 | dtype='long', truncating='post') 282 | -------------------------------------------------------------------------------- /src/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Readme" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "- This example illustrates processing CoNLL-2003 corpus with BERT for sequence tagging from https://github.com/huggingface/transformers\n", 15 | "- Using this script it is possible to achieve 91.4 F1 entity-level score and 92.8 F1 token-level score with bert-base\n", 16 | "- With 100 epochs and lr=1e-5, batch_size=8 it is possible to achieve the same score as in https://gluon-nlp.mxnet.io/model_zoo/bert/index.html " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Install dependencies" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "!pip install pytorch_transformers flair seqeval" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "# Download CoNLL-2003" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "!mkdir -p 
conll2003\n", 49 | "!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa -O ./conll2003/eng.testa\n", 50 | "!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb -O ./conll2003/eng.testb\n", 51 | "!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train -O ./conll2003/eng.train" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# Initialization" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Choosing cuda devices if there are multiple\n", 68 | "\n", 69 | "import os\n", 70 | "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import logging\n", 80 | "import sys\n", 81 | "\n", 82 | "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", 83 | "logger = logging.getLogger('sequence_tagger_bert')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "Tesla V100-DGXS-16GB\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "import torch\n", 101 | "\n", 102 | "device = torch.device('cuda')\n", 103 | "n_gpu = torch.cuda.device_count()\n", 104 | "\n", 105 | "for i in range(n_gpu):\n", 106 | " print(torch.cuda.get_device_name(i))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "CACHE_DIR = 'cache'\n", 116 | "BATCH_SIZE = 16\n", 117 | "PRED_BATCH_SIZE = 100\n", 118 | "MAX_LEN = 128\n", 119 | "MAX_N_EPOCHS = 4\n", 120 | "WEIGHT_DECAY = 0.01\n", 121 | "LEARNING_RATE = 5e-5" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "import torch\n", 142 | "torch.manual_seed(117)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Load corpus" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "INFO:transformers.file_utils:TensorFlow version 2.0.0 available.\n", 162 | "INFO:transformers.file_utils:PyTorch version 1.3.1 available.\n", 163 | "INFO:transformers.modeling_xlnet:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n", 164 | "2019-12-13 00:35:35,712 Reading data from conll2003\n", 165 | "2019-12-13 00:35:35,713 Train: conll2003/eng.train\n", 166 | "2019-12-13 00:35:35,713 Dev: conll2003/eng.testa\n", 167 | "2019-12-13 00:35:35,714 Test: conll2003/eng.testb\n", 168 | "{\n", 169 | " \"TRAIN\": {\n", 170 | " \"dataset\": \"TRAIN\",\n", 171 | " \"total_number_of_documents\": 14987,\n", 172 | " \"number_of_documents_per_class\": {},\n", 173 | " \"number_of_tokens_per_tag\": {},\n", 174 | " \"number_of_tokens\": {\n", 175 | " \"total\": 204567,\n", 176 | " \"min\": 1,\n", 177 | " \"max\": 113,\n", 178 | " \"avg\": 13.649629679055181\n", 179 | " }\n", 180 | " },\n", 181 | " \"TEST\": {\n", 182 | " 
\"dataset\": \"TEST\",\n", 183 | " \"total_number_of_documents\": 3684,\n", 184 | " \"number_of_documents_per_class\": {},\n", 185 | " \"number_of_tokens_per_tag\": {},\n", 186 | " \"number_of_tokens\": {\n", 187 | " \"total\": 46666,\n", 188 | " \"min\": 1,\n", 189 | " \"max\": 124,\n", 190 | " \"avg\": 12.667209554831704\n", 191 | " }\n", 192 | " },\n", 193 | " \"DEV\": {\n", 194 | " \"dataset\": \"DEV\",\n", 195 | " \"total_number_of_documents\": 3466,\n", 196 | " \"number_of_documents_per_class\": {},\n", 197 | " \"number_of_tokens_per_tag\": {},\n", 198 | " \"number_of_tokens\": {\n", 199 | " \"total\": 51578,\n", 200 | " \"min\": 1,\n", 201 | " \"max\": 109,\n", 202 | " \"avg\": 14.881130986728216\n", 203 | " }\n", 204 | " }\n", 205 | "}\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "from flair.datasets import ColumnCorpus\n", 211 | "\n", 212 | "\n", 213 | "data_folder = 'conll2003'\n", 214 | "corpus = ColumnCorpus(data_folder, \n", 215 | " {0 : 'text', 3 : 'ner'},\n", 216 | " train_file='eng.train',\n", 217 | " test_file='eng.testb',\n", 218 | " dev_file='eng.testa')\n", 219 | "\n", 220 | "print(corpus.obtain_statistics())" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "# Create model" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "INFO:pytorch_transformers.modeling_bert:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n", 240 | "INFO:pytorch_transformers.modeling_xlnet:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n", 241 | "INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at cache/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n", 242 | "INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at cache/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.d7a3af18ce3a2ab7c0f48f04dc8daff45ed9a3ed333b9e9a79d012a0dedf87a6\n", 243 | "INFO:pytorch_transformers.modeling_utils:Model config {\n", 244 | " \"attention_probs_dropout_prob\": 0.1,\n", 245 | " \"finetuning_task\": null,\n", 246 | " \"hidden_act\": \"gelu\",\n", 247 | " \"hidden_dropout_prob\": 0.1,\n", 248 | " \"hidden_size\": 768,\n", 249 | " \"initializer_range\": 0.02,\n", 250 | " \"intermediate_size\": 3072,\n", 251 | " \"layer_norm_eps\": 1e-12,\n", 252 | " \"max_position_embeddings\": 512,\n", 253 | " \"num_attention_heads\": 12,\n", 254 | " \"num_hidden_layers\": 12,\n", 255 | " \"num_labels\": 9,\n", 256 | " \"output_attentions\": false,\n", 257 | " \"output_hidden_states\": false,\n", 258 | " \"pruned_heads\": {},\n", 259 | " \"torchscript\": false,\n", 260 | " \"type_vocab_size\": 2,\n", 261 | " \"vocab_size\": 28996\n", 262 | "}\n", 263 | "\n", 264 | "INFO:pytorch_transformers.modeling_utils:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin from cache at cache/35d8b9d36faaf46728a0192d82bf7d00137490cd6074e8500778afed552a67e5.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2\n", 265 | "INFO:pytorch_transformers.modeling_utils:Weights of 
BertForTokenClassificationCustom not initialized from pretrained model: ['classifier.weight', 'classifier.bias']\n", 266 | "INFO:pytorch_transformers.modeling_utils:Weights from pretrained model not used in BertForTokenClassificationCustom: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "from bert_sequence_tagger import SequenceTaggerBert, BertForTokenClassificationCustom\n", 272 | "from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus\n", 273 | "\n", 274 | "from pytorch_transformers import BertTokenizer, BertForTokenClassification\n", 275 | "\n", 276 | "\n", 277 | "bpe_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', \n", 278 | " cache_dir=CACHE_DIR, \n", 279 | " do_lower_case=False)\n", 280 | "\n", 281 | "idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)\n", 282 | "\n", 283 | "model = BertForTokenClassificationCustom.from_pretrained('bert-base-cased', \n", 284 | " cache_dir=CACHE_DIR, \n", 285 | " num_labels=len(tag2idx)).cuda()\n", 286 | "\n", 287 | "seq_tagger = SequenceTaggerBert(bert_model=model, bpe_tokenizer=bpe_tokenizer, \n", 288 | " idx2tag=idx2tag, tag2idx=tag2idx, max_len=MAX_LEN,\n", 289 | " pred_batch_size=PRED_BATCH_SIZE)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "# Train model" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stderr", 306 | "output_type": "stream", 307 | "text": [ 308 | "Epoch: 0%| | 0/4 [00:00