├── .gitignore
├── requirements.txt
├── sample.tsv
├── train_command.txt
├── README.md
├── api.py
├── glue_utils.py
└── run_glue.py

/.gitignore:
--------------------------------------------------------------------------------
.vscode/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
transformers==2.2
torch>=1.2
--------------------------------------------------------------------------------
/sample.tsv:
--------------------------------------------------------------------------------
text	label
this will contain the first sentence	0
this will contain the second sentence	1
--------------------------------------------------------------------------------
/train_command.txt:
--------------------------------------------------------------------------------
python run_glue.py --data_dir data --model_type albert --model_name_or_path albert-base-v2 --output_dir output --do_train --task_name sst-2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ALBERT for Sentiment Analysis

## Dataset preparation
A tab-separated (.tsv) file named train.tsv is required, with a header row followed by one example per line (see sample.tsv for the expected format).
The train dataset needs to be placed in a folder, e.g. data/train.tsv.

## How to fine-tune
#### The following parameters are required
1. --data_dir - Directory where the data is stored.
2. --model_type - The model to use for fine-tuning. Here, we are using albert.
3. --model_name_or_path - The variant of ALBERT which you want to use, e.g. albert-base-v2.
4. --task_name - The task to train on: sst-2 (two labels) or sst-5 (five labels).
5. --output_dir - Path where you want to save the model.
6. --do_train - Because we are training the model.

#### Example
```
$ python run_glue.py --data_dir data --model_type albert --model_name_or_path albert-base-v2 --output_dir output --do_train --task_name sst-2
```

## Different models available for use
|                | Average  | SQuAD1.1 | SQuAD2.0 | MNLI     | SST-2    | RACE     |
|----------------|----------|----------|----------|----------|----------|----------|
|**V2**          |          |          |          |          |          |          |
|ALBERT-base     |82.3      |90.2/83.2 |82.1/79.3 |84.6      |92.9      |66.8      |
|ALBERT-large    |85.7      |91.8/85.2 |84.9/81.8 |86.5      |94.9      |75.2      |
|ALBERT-xlarge   |87.9      |92.9/86.4 |87.9/84.1 |87.9      |95.4      |80.7      |
|ALBERT-xxlarge  |90.9      |94.6/89.1 |89.8/86.9 |90.6      |96.8      |86.8      |
|**V1**          |          |          |          |          |          |          |
|ALBERT-base     |80.1      |89.3/82.3 |80.0/77.1 |81.6      |90.3      |64.0      |
|ALBERT-large    |82.4      |90.6/83.9 |82.3/79.4 |83.5      |91.7      |68.5      |
|ALBERT-xlarge   |85.5      |92.5/86.1 |86.1/83.1 |86.4      |92.4      |74.8      |
|ALBERT-xxlarge  |91.0      |94.8/89.3 |90.2/87.4 |90.8      |96.9      |86.5      |

(table taken from the Google Research ALBERT repository)

## Prediction
A Python API (api.py) is provided for prediction.
1. Set the path of the folder where the fine-tuned model files are stored (api.py defaults to model).
2. Run the api.py file
```
$ python api.py
```
or
```
from api import SentimentAnalyzer
classifier = SentimentAnalyzer()
print(classifier.predict('the movie was nice'))
```
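
The predict method returns a dictionary with the predicted label index, the confidence of that label, and the per-class probabilities (rounded, under the logits key). A hypothetical run against a fine-tuned sst-2 model might look like the sketch below; the exact numbers depend on your trained weights.
```
>>> classifier.predict('the movie was nice')
{'label': 1, 'confidence': 0.99874, 'logits': [0.0013, 0.9987]}
```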

## Acknowledgements
Thanks to HuggingFace for making the implementation simple, and to Google for this awesome pretrained model.
--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
import torch
import torch.nn.functional as F

from transformers import (AlbertForSequenceClassification,
                          AlbertTokenizer,
                          )


class SentimentAnalyzer:
    def __init__(self, path='model', model_type='albert-base-v2'):
        self.path = path
        self.model_type = model_type
        self.tokenizer = AlbertTokenizer.from_pretrained(self.model_type, do_lower_case=True)
        self.model = AlbertForSequenceClassification.from_pretrained(self.path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

    def convert_to_features(self, sentence):
        text_a = sentence
        text_b = None
        max_length = 512  # should match the --max_seq_length used during fine-tuning
        pad_on_left = False
        pad_token = self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0]
        pad_token_segment_id = 0
        mask_padding_with_zero = True

        inputs = self.tokenizer.encode_plus(
            text_a,
            text_b,
            add_special_tokens=True,
            max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        return [input_ids, attention_mask, token_type_ids]

    def convert_to_tensors(self, features):
        # Build single-example batches and move them to the same device as the model.
        input_ids = torch.tensor([features[0]],
                                 dtype=torch.long).to(self.device)

        attention_mask = torch.tensor([features[1]],
                                      dtype=torch.long).to(self.device)

        token_type_ids = torch.tensor([features[2]],
                                      dtype=torch.long).to(self.device)

        # token_type_ids are deliberately not passed to the model: run_glue.py
        # trains ALBERT with token_type_ids=None, so inference matches training.
        inputs = {'input_ids': input_ids,
                  'attention_mask': attention_mask}
        return inputs

    def interpret_result(self, output):
        result = {}
        probabilities = F.softmax(output[0][0], dim=0)  # softmax turns the logits into class probabilities
        label = torch.argmax(probabilities, dim=0)
        label = label.detach().cpu().numpy().tolist()
        score = round(probabilities[label].detach().cpu().numpy().tolist(), 5)
        probabilities = probabilities.detach().cpu().numpy().tolist()
        probabilities = [round(probability, 4) for probability in probabilities]
        result['label'] = label
        result['confidence'] = score
        result['logits'] = probabilities
        return result

    def predict(self, text):
        features = self.convert_to_features(text)
        tensors = self.convert_to_tensors(features)
        with torch.no_grad():  # no gradients are needed for inference
            outputs = self.model(**tensors)
        result = self.interpret_result(outputs)
        return result


if __name__ == '__main__':
    text = 'Movie was very good'
    analyzer = SentimentAnalyzer()
    print(analyzer.predict(text))
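
# predict() scores one sentence per call; a batch of sentences can be scored
# with a simple (if unbatched) loop, e.g.:
#   results = [analyzer.predict(sentence) for sentence in sentences]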
--------------------------------------------------------------------------------
/glue_utils.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLUE processors and helpers """

import logging
import os
import copy
import json

import pandas as pd

logger = logging.getLogger(__name__)


class InputExample(object):
    """
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    """
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class InputFeatures(object):
    """
    A single set of features of data.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        label: Label corresponding to the input
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.label = label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Gets an example from a dict with tensorflow tensors

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def tfds_map(self, example):
        """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
        This method converts examples to the correct format."""
        if len(self.get_labels()) > 1:
            example.label = self.get_labels()[int(example.label)]
        return example

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        # pandas consumes the header row, so only data rows are returned;
        # quotechar is kept for API compatibility but is not used here.
        lines = []
        df = pd.read_csv(input_file, delimiter='\t')
        for item in df.iterrows():
            temp = []
            temp.append(item[1][0])  # first column: text
            temp.append(item[1][1])  # second column: label
            lines.append(temp)
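        # For example, for sample.tsv this returns:
        #   [['this will contain the first sentence', 0],
        #    ['this will contain the second sentence', 1]]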
        return lines


def glue_convert_examples_to_features(examples, tokenizer,
                                      max_length=256,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
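
        # Illustration of the resulting features, with hypothetical word ids,
        # max_length=8 and the sentence "great movie" ([CLS]=2, [SEP]=3, <pad>=0
        # in the ALBERT vocabulary):
        #   input_ids:      [2, 723, 1308, 3, 0, 0, 0, 0]
        #   attention_mask: [1, 1,   1,    1, 0, 0, 0, 0]
        #   token_type_ids: [0, 0,   0,    0, 0, 0, 0, 0]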

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        label = label_map[str(example.label)]

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
    return features


class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            # every line is a data row (_read_tsv already strips the header)
            guid = "%s-%s" % (set_type, i)
            text_a = line[0]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


class Sst5Processor(DataProcessor):
    """Processor for the SST-5 data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["1", "2", "3", "4", "5"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            # every line is a data row (_read_tsv already strips the header)
            guid = "%s-%s" % (set_type, i)
            text_a = line[0]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


glue_tasks_num_labels = {
    "sst-2": 2,
    "sst-5": 5,
}

glue_processors = {
    "sst-2": Sst2Processor,
    "sst-5": Sst5Processor,
}

glue_output_modes = {
    "sst-2": "classification",
    "sst-5": "classification",
}


def simple_accuracy(preds, labels):
    return (preds == labels).mean()


def glue_compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    if task_name == "sst-2":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "sst-5":
        return {"acc": simple_accuracy(preds, labels)}
    else:
        raise KeyError(task_name)
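
# A quick sanity check of the metric helper with hypothetical arrays:
#   >>> import numpy as np
#   >>> glue_compute_metrics("sst-2", np.array([1, 0, 1]), np.array([1, 1, 1]))
#   {'acc': 0.6666666666666666}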
16 | """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import argparse 21 | import glob 22 | import logging 23 | import os 24 | import random 25 | 26 | import numpy as np 27 | import torch 28 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 29 | TensorDataset) 30 | from torch.utils.data.distributed import DistributedSampler 31 | 32 | from tqdm import tqdm, trange 33 | 34 | from transformers import (AlbertConfig, 35 | AlbertForSequenceClassification, 36 | AlbertTokenizer, 37 | ) 38 | 39 | from transformers import AdamW, get_linear_schedule_with_warmup 40 | 41 | from glue_utils import glue_compute_metrics as compute_metrics 42 | from glue_utils import glue_output_modes as output_modes 43 | from glue_utils import glue_processors as processors 44 | from glue_utils import glue_convert_examples_to_features as convert_examples_to_features 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | MODEL_CLASSES = { 49 | 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer) 50 | } 51 | 52 | 53 | def set_seed(args): 54 | random.seed(args.seed) 55 | np.random.seed(args.seed) 56 | torch.manual_seed(args.seed) 57 | if args.n_gpu > 0: 58 | torch.cuda.manual_seed_all(args.seed) 59 | 60 | 61 | def train(args, train_dataset, model, tokenizer): 62 | """ Train the model """ 63 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 64 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 65 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 66 | 67 | if args.max_steps > 0: 68 | t_total = args.max_steps 69 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 70 | else: 71 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 72 | 73 | # Prepare optimizer and schedule (linear warmup and decay) 74 | no_decay = ['bias', 'LayerNorm.weight'] 75 | optimizer_grouped_parameters = [ 76 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 77 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 78 | ] 79 | 80 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 81 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) 82 | if args.fp16: 83 | try: 84 | from apex import amp 85 | except ImportError: 86 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 87 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 88 | 89 | # multi-gpu training (should be after apex fp16 initialization) 90 | if args.n_gpu > 1: 91 | model = torch.nn.DataParallel(model) 92 | 93 | # Distributed training (should be after apex fp16 initialization) 94 | if args.local_rank != -1: 95 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 96 | output_device=args.local_rank, 97 | find_unused_parameters=True) 98 | 99 | # Train! 
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step


def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results


def load_and_cache_examples(args, task, tokenizer, evaluate=False):
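    # Note: despite the name (kept from the original HuggingFace script), this
    # version rebuilds the features on every call; nothing is cached to disk,
    # and the --overwrite_cache flag below is accepted but has no effect.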
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]

    label_list = processor.get_labels()
    examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            label_list=label_list,
                                            max_length=args.max_seq_length,
                                            output_mode=output_mode,
                                            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
                                            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
                                            )

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
Sequences longer " 289 | "than this will be truncated, sequences shorter will be padded.") 290 | parser.add_argument("--do_train", action='store_true', 291 | help="Whether to run training.") 292 | parser.add_argument("--do_eval", action='store_true', 293 | help="Whether to run eval on the dev set.") 294 | parser.add_argument("--evaluate_during_training", action='store_true', 295 | help="Rul evaluation during training at each logging step.") 296 | parser.add_argument("--do_lower_case", action='store_true', default=True, 297 | help="Set this flag if you are using an uncased model.") 298 | 299 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, 300 | help="Batch size per GPU/CPU for training.") 301 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, 302 | help="Batch size per GPU/CPU for evaluation.") 303 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 304 | help="Number of updates steps to accumulate before performing a backward/update pass.") 305 | parser.add_argument("--learning_rate", default=5e-5, type=float, 306 | help="The initial learning rate for Adam.") 307 | parser.add_argument("--weight_decay", default=0.0, type=float, 308 | help="Weight deay if we apply some.") 309 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 310 | help="Epsilon for Adam optimizer.") 311 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 312 | help="Max gradient norm.") 313 | parser.add_argument("--num_train_epochs", default=3.0, type=float, 314 | help="Total number of training epochs to perform.") 315 | parser.add_argument("--max_steps", default=-1, type=int, 316 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 317 | parser.add_argument("--warmup_steps", default=0, type=int, 318 | help="Linear warmup over warmup_steps.") 319 | 320 | parser.add_argument('--logging_steps', type=int, default=1000, 321 | help="Log every X updates steps.") 322 | parser.add_argument('--save_steps', type=int, default=50, 323 | help="Save checkpoint every X updates steps.") 324 | parser.add_argument("--eval_all_checkpoints", action='store_true', 325 | help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") 326 | parser.add_argument("--no_cuda", action='store_true', 327 | help="Avoid using CUDA when available") 328 | parser.add_argument('--overwrite_output_dir', action='store_true', 329 | help="Overwrite the content of the output directory") 330 | parser.add_argument('--overwrite_cache', action='store_true', 331 | help="Overwrite the cached training and evaluation sets") 332 | parser.add_argument('--seed', type=int, default=42, 333 | help="random seed for initialization") 334 | 335 | parser.add_argument('--fp16', action='store_true', 336 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 337 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 338 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 339 | "See details at https://nvidia.github.io/apex/amp.html") 340 | parser.add_argument("--local_rank", type=int, default=-1, 341 | help="For distributed training: local_rank") 342 | args = parser.parse_args() 343 | 344 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: 345 | raise ValueError("Output directory ({}) already exists and is not empty. 

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                          num_labels=num_labels,
                                          finetuning_task=args.task_name,
                                          cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case,
                                                cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        from_tf=bool('.ckpt' in args.model_name_or_path),
                                        config=config,
                                        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
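
        # The directory written above now holds the model weights, config and
        # tokenizer files; it is exactly what api.py's SentimentAnalyzer(path=...)
        # expects to load for prediction.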

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------