├── Data
│   ├── Readme.md
│   ├── Word_Dictionary.xlsx
│   ├── dev.tsv
│   └── test_set.xls
├── Introduction.png
├── Model
│   ├── FGM.py
│   ├── PGD.py
│   ├── Readme.md
│   ├── requirements.txt
│   └── run_glue_pgd.py
├── Questionaire
│   ├── Questionaire.pdf
│   ├── Readme.md
│   └── temp_US.xls
└── README.md

/Data/Readme.md:
--------------------------------------------------------------------------------
1 | Please note that the testing data should be used for research purposes only.
2 | 
--------------------------------------------------------------------------------
/Data/Word_Dictionary.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Data/Word_Dictionary.xlsx
--------------------------------------------------------------------------------
/Data/test_set.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Data/test_set.xls
--------------------------------------------------------------------------------
/Introduction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Introduction.png
--------------------------------------------------------------------------------
/Model/FGM.py:
--------------------------------------------------------------------------------
1 | import torch
2 | class FGM():
3 |     def __init__(self, model):
4 |         self.model = model
5 |         self.backup = {}
6 | 
7 |     def attack(self, epsilon=1., emb_name='word_embeddings'):
8 |         # emb_name should match the name of the embedding parameter in your model
9 |         for name, param in self.model.named_parameters():
10 |             if param.requires_grad and emb_name in name:
11 |                 self.backup[name] = param.data.clone()
12 |                 norm = torch.norm(param.grad)
13 |                 if norm != 0 and not torch.isnan(norm):
14 |                     r_at = epsilon * param.grad / norm
15 |                     param.data.add_(r_at)
16 | 
17 |     def restore(self, emb_name='word_embeddings'):
18 |         # emb_name should match the name of the embedding parameter in your model
19 |         for name, param in self.model.named_parameters():
20 |             if param.requires_grad and emb_name in name:
21 |                 assert name in self.backup
22 |                 param.data = self.backup[name]
23 |         self.backup = {}
--------------------------------------------------------------------------------
/Model/PGD.py:
--------------------------------------------------------------------------------
1 | # Class PGD
2 | import torch
3 | class PGD():
4 |     def __init__(self, model):
5 |         self.model = model
6 |         self.emb_backup = {}
7 |         self.grad_backup = {}
8 | 
9 |     def attack(self, epsilon=1., alpha=0.3, emb_name='word_embeddings', is_first_attack=False):
10 |         # emb_name should match the name of the embedding parameter in your model
11 |         for name, param in self.model.named_parameters():
12 |             #print(name)
13 |             if param.requires_grad and emb_name in name:
14 |                 if is_first_attack:
15 |                     self.emb_backup[name] = param.data.clone()
16 |                 norm = torch.norm(param.grad)
17 |                 if norm != 0 and not torch.isnan(norm):
18 |                     r_at = alpha * param.grad / norm
19 |                     param.data.add_(r_at)
20 |                     param.data = self.project(name, param.data, epsilon)
21 | 
22 |     def restore(self, emb_name='word_embeddings'):
23 |         # emb_name should match the name of the embedding parameter in your model
24 |         for name, param in self.model.named_parameters():
25 |             #print(name)
26 |             if param.requires_grad and emb_name in name:
27 |                 assert name in self.emb_backup
28 |                 param.data = self.emb_backup[name]
29 |         self.emb_backup = {}
30 | 
31 |     def project(self, param_name, param_data, epsilon):
32 |         r = param_data - self.emb_backup[param_name]
33 |         if torch.norm(r) > epsilon:
34 |             r = epsilon * r / torch.norm(r)
35 |         return self.emb_backup[param_name] + r  # project back into the epsilon-ball around the original embedding
36 | 
37 |     def backup_grad(self):
38 |         for name, param in self.model.named_parameters():
39 |             if param.requires_grad:
40 |                 self.grad_backup[name] = param.grad
41 | 
42 |     def restore_grad(self):
43 |         for name, param in self.model.named_parameters():
44 |             if param.requires_grad:
45 |                 param.grad = self.grad_backup[name]
--------------------------------------------------------------------------------
/Model/Readme.md:
--------------------------------------------------------------------------------
1 | 
2 | We provide the code for Transformer-based methods with adversarial training (FGM and PGD) for text classification in this folder. A minimal usage sketch is given below.
--------------------------------------------------------------------------------
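A minimal sketch of how the FGM helper is typically wrapped around a standard PyTorch training step; `model`, `train_dataloader`, and `optimizer` are placeholders rather than objects defined in this repository. PGD follows the same pattern, with several attack steps plus the extra backup_grad()/restore_grad() calls shown in run_glue_pgd.py.

# Sketch only: assumes `model` returns (loss, ...) as in transformers, and that
# `train_dataloader` and `optimizer` have already been constructed.
from FGM import FGM

fgm = FGM(model)
for batch in train_dataloader:
    loss = model(**batch)[0]
    loss.backward()          # gradients w.r.t. the clean inputs
    fgm.attack()             # perturb the word embeddings along the gradient direction
    loss_adv = model(**batch)[0]
    loss_adv.backward()      # accumulate the adversarial gradient on top of the clean one
    fgm.restore()            # put the original embedding weights back
    optimizer.step()
    model.zero_grad()
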
/Model/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | tensorboard
3 | scikit-learn
4 | seqeval
5 | transformers
--------------------------------------------------------------------------------
/Model/run_glue_pgd.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import argparse 21 | import glob 22 | import logging 23 | import os 24 | import random 25 | 26 | import numpy as np 27 | import torch 28 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 29 | TensorDataset) 30 | from torch.utils.data.distributed import DistributedSampler 31 | 32 | try: 33 | from torch.utils.tensorboard import SummaryWriter 34 | except: 35 | from tensorboardX import SummaryWriter 36 | 37 | from tqdm import tqdm, trange 38 | from FGM import FGM 39 | from PGD import PGD 40 | from transformers import (WEIGHTS_NAME, BertConfig, 41 | BertForSequenceClassification, BertTokenizer, 42 | RobertaConfig, 43 | RobertaForSequenceClassification, 44 | RobertaTokenizer, 45 | XLMConfig, XLMForSequenceClassification, 46 | XLMTokenizer, XLNetConfig, 47 | XLNetForSequenceClassification, 48 | XLNetTokenizer, 49 | DistilBertConfig, 50 | DistilBertForSequenceClassification, 51 | DistilBertTokenizer) 52 | 53 | from transformers import AdamW, WarmupLinearSchedule 54 | 55 | from transformers import glue_compute_metrics as compute_metrics 56 | from transformers import glue_output_modes as output_modes 57 | from transformers import glue_processors as processors 58 | from transformers import glue_convert_examples_to_features as convert_examples_to_features 59 | import warnings 60 | #warnings.filterwarnings("ignore", category=DeprecationWarning) 61 | #warnings.filterwarnings("ignore") 62 | 63 | logger = logging.getLogger(__name__) 64 | 65 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 66 | RobertaConfig, DistilBertConfig)), ()) 67 | 68 | MODEL_CLASSES = { 69 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 70 | 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 71 | 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 72 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 73 | 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) 74 | } 75 | 76 | 77 | def set_seed(args): 78 | random.seed(args.seed) 79 | np.random.seed(args.seed) 80 | torch.manual_seed(args.seed) 81 | if args.n_gpu > 0: 82 | torch.cuda.manual_seed_all(args.seed) 83 | 84 | 85 | def train(args, train_dataset, model, tokenizer): 86 | """ Train the model """ 87 | if args.local_rank in [-1, 0]: 88 | tb_writer = SummaryWriter() 89 | 90 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 91 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 92 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 93 | 94 | if args.max_steps > 0: 95 | t_total = args.max_steps 96 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 97 | else: 98 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 99 | 100 | # Prepare optimizer and schedule (linear warmup and decay) 101 | no_decay = ['bias', 'LayerNorm.weight'] 102 | optimizer_grouped_parameters = [ 103 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 104 | {'params': [p for n, p in 
model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
105 |         ]
106 |     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
107 |     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
108 |     if args.fp16:
109 |         try:
110 |             from apex import amp
111 |         except ImportError:
112 |             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
113 |         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
114 | 
115 |     # multi-gpu training (should be after apex fp16 initialization)
116 |     if args.n_gpu > 1:
117 |         model = torch.nn.DataParallel(model)
118 | 
119 |     # Distributed training (should be after apex fp16 initialization)
120 |     if args.local_rank != -1:
121 |         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
122 |                                                           output_device=args.local_rank,
123 |                                                           find_unused_parameters=True)
124 | 
125 |     # Train!
126 |     logger.info("***** Running training *****")
127 |     logger.info("  Num examples = %d", len(train_dataset))
128 |     logger.info("  Num Epochs = %d", args.num_train_epochs)
129 |     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
130 |     logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
131 |                 args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
132 |     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
133 |     logger.info("  Total optimization steps = %d", t_total)
134 | 
135 |     global_step = 0
136 |     tr_loss, logging_loss = 0.0, 0.0
137 |     model.zero_grad()
138 |     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
139 |     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
140 |     #fgm = FGM(model)
141 |     pgd = PGD(model)
142 | 
143 |     for _ in train_iterator:
144 |         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
145 |         for step, batch in enumerate(epoch_iterator):
146 |             model.train()
147 |             batch = tuple(t.to(args.device) for t in batch)
148 |             inputs = {'input_ids':      batch[0],
149 |                       'attention_mask': batch[1],
150 |                       'labels':         batch[3]}
151 |             if args.model_type != 'distilbert':
152 |                 inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
153 |             outputs = model(**inputs)
154 |             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
155 | 
156 |             if args.n_gpu > 1:
157 |                 loss = loss.mean()  # mean() to average on multi-gpu parallel training
158 |             if args.gradient_accumulation_steps > 1:
159 |                 loss = loss / args.gradient_accumulation_steps
160 | 
161 |             #print("Original Loss: ", loss)
162 |             if args.fp16:
163 |                 with amp.scale_loss(loss, optimizer) as scaled_loss:
164 |                     scaled_loss.backward()
165 |                 pgd.backup_grad()
166 |                 #fgm.attack()
167 |             else:
168 |                 loss.backward()
169 |                 pgd.backup_grad()
170 |                 #fgm.attack()
171 |             # loss_adv = model(**inputs)
172 |             #loss_adv = loss_adv[0]
173 |             #print("Loss Adv: ", loss_adv)
174 |             #loss_adv.backward()  # back-propagate, accumulating the adversarial gradient on top of the normal gradient
175 |             #fgm.restore()  # restore the embedding parameters
176 | 
177 |             for t in range(3):
178 |                 pgd.attack(is_first_attack=(t == 0))  # add an adversarial perturbation to the embeddings; back up param.data on the first attack
179 |                 if t != 2:
180 |                     model.zero_grad()
181 |                 else:
182 |                     pgd.restore_grad()
183 |                 loss_adv = model(**inputs)
184 |                 loss_adv = loss_adv[0]
185 |                 loss_adv.backward()  # back-propagate, accumulating the adversarial gradient on top of the normal gradient
186 |             pgd.restore()  # restore the embedding parameters
187 | 
188 |             tr_loss += loss_adv.item()
189 |             if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
190 |                 if args.fp16:
191 |                     torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
192 |                 else:
193 |                     torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
194 | 
195 |                 optimizer.step()
196 |                 scheduler.step()  # Update learning rate schedule
197 |                 model.zero_grad()
198 |                 global_step += 1
199 | 
200 |                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
201 |                     # Log metrics
202 |                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
203 |                         results = evaluate(args, model, tokenizer)
204 |                         for key, value in results.items():
205 |                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
206 |                     tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
207 |                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
208 |                     logging_loss = tr_loss
209 | 
210 |                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
211 |                     # Save model checkpoint
212 |                     output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
213 |                     if not os.path.exists(output_dir):
214 |                         os.makedirs(output_dir)
215 |                     model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
216 |                     model_to_save.save_pretrained(output_dir)
217 |                     torch.save(args, os.path.join(output_dir, 'training_args.bin'))
218 |                     logger.info("Saving model checkpoint to %s", output_dir)
219 | 
220 |             if args.tpu:
221 |                 args.xla_model.optimizer_step(optimizer, barrier=True)
222 |                 model.zero_grad()
223 |                 global_step += 1
224 | 
225 |             if args.max_steps > 0 and global_step > args.max_steps:
226 |                 epoch_iterator.close()
227 |                 break
228 |         if args.max_steps > 0 and global_step > args.max_steps:
229 |             train_iterator.close()
230 |             break
231 | 
232 |     if args.local_rank in [-1, 0]:
233 |         tb_writer.close()
234 | 
235 |     return global_step, tr_loss / global_step
236 | 
237 | 
238 | def evaluate(args, model, tokenizer, prefix=""):
239 |     # Loop to handle MNLI double evaluation (matched, mis-matched)
240 |     eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
241 |     eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
242 | 
243 |     results = {}
244 |     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
245 |         eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
246 | 
247 |         if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
248 |             os.makedirs(eval_output_dir)
249 | 
250 |         args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
251 |         # Note that DistributedSampler samples randomly
252 |         eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
253 |         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
254 | 
255 |         # Eval!
256 | logger.info("***** Running evaluation {} *****".format(prefix)) 257 | logger.info(" Num examples = %d", len(eval_dataset)) 258 | logger.info(" Batch size = %d", args.eval_batch_size) 259 | eval_loss = 0.0 260 | nb_eval_steps = 0 261 | preds = None 262 | out_label_ids = None 263 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 264 | model.eval() 265 | batch = tuple(t.to(args.device) for t in batch) 266 | 267 | with torch.no_grad(): 268 | inputs = {'input_ids': batch[0], 269 | 'attention_mask': batch[1], 270 | 'labels': batch[3]} 271 | if args.model_type != 'distilbert': 272 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 273 | outputs = model(**inputs) 274 | tmp_eval_loss, logits = outputs[:2] 275 | 276 | eval_loss += tmp_eval_loss.mean().item() 277 | nb_eval_steps += 1 278 | if preds is None: 279 | preds = logits.detach().cpu().numpy() 280 | out_label_ids = inputs['labels'].detach().cpu().numpy() 281 | else: 282 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 283 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 284 | 285 | eval_loss = eval_loss / nb_eval_steps 286 | if args.output_mode == "classification": 287 | preds = np.argmax(preds, axis=1) 288 | elif args.output_mode == "regression": 289 | preds = np.squeeze(preds) 290 | result = compute_metrics(eval_task, preds, out_label_ids) 291 | results.update(result) 292 | 293 | output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 294 | with open(output_eval_file, "w") as writer: 295 | logger.info("***** Eval results {} *****".format(prefix)) 296 | for key in sorted(result.keys()): 297 | logger.info(" %s = %s", key, str(result[key])) 298 | writer.write("%s = %s\n" % (key, str(result[key]))) 299 | 300 | return results 301 | 302 | 303 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 304 | if args.local_rank not in [-1, 0] and not evaluate: 305 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 306 | 307 | processor = processors[task]() 308 | output_mode = output_modes[task] 309 | # Load data features from cache or dataset file 310 | cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( 311 | 'dev' if evaluate else 'train', 312 | list(filter(None, args.model_name_or_path.split('/'))).pop(), 313 | str(args.max_seq_length), 314 | str(task))) 315 | if os.path.exists(cached_features_file) and not args.overwrite_cache: 316 | logger.info("Loading features from cached file %s", cached_features_file) 317 | features = torch.load(cached_features_file) 318 | else: 319 | logger.info("Creating features from dataset file at %s", args.data_dir) 320 | label_list = processor.get_labels() 321 | if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: 322 | # HACK(label indices are swapped in RoBERTa pretrained model) 323 | label_list[1], label_list[2] = label_list[2], label_list[1] 324 | examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) 325 | features = convert_examples_to_features(examples, 326 | tokenizer, 327 | label_list=label_list, 328 | max_length=args.max_seq_length, 329 | output_mode=output_mode, 330 | pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet 331 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 332 | 
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, 333 | ) 334 | if args.local_rank in [-1, 0]: 335 | logger.info("Saving features into cached file %s", cached_features_file) 336 | torch.save(features, cached_features_file) 337 | 338 | if args.local_rank == 0 and not evaluate: 339 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 340 | 341 | # Convert to Tensors and build dataset 342 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 343 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) 344 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) 345 | if output_mode == "classification": 346 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long) 347 | elif output_mode == "regression": 348 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float) 349 | 350 | dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) 351 | return dataset 352 | 353 | 354 | def main(): 355 | parser = argparse.ArgumentParser() 356 | 357 | ## Required parameters 358 | parser.add_argument("--data_dir", default=None, type=str, required=True, 359 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 360 | parser.add_argument("--model_type", default=None, type=str, required=True, 361 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 362 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 363 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) 364 | parser.add_argument("--task_name", default=None, type=str, required=True, 365 | help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) 366 | parser.add_argument("--output_dir", default=None, type=str, required=True, 367 | help="The output directory where the model predictions and checkpoints will be written.") 368 | 369 | ## Other parameters 370 | parser.add_argument("--config_name", default="", type=str, 371 | help="Pretrained config name or path if not the same as model_name") 372 | parser.add_argument("--tokenizer_name", default="", type=str, 373 | help="Pretrained tokenizer name or path if not the same as model_name") 374 | parser.add_argument("--cache_dir", default="", type=str, 375 | help="Where do you want to store the pre-trained models downloaded from s3") 376 | parser.add_argument("--max_seq_length", default=128, type=int, 377 | help="The maximum total input sequence length after tokenization. 
Sequences longer "
378 |                              "than this will be truncated, sequences shorter will be padded.")
379 |     parser.add_argument("--do_train", action='store_true',
380 |                         help="Whether to run training.")
381 |     parser.add_argument("--do_eval", action='store_true',
382 |                         help="Whether to run eval on the dev set.")
383 |     parser.add_argument("--evaluate_during_training", action='store_true',
384 |                         help="Run evaluation during training at each logging step.")
385 |     parser.add_argument("--do_lower_case", action='store_true',
386 |                         help="Set this flag if you are using an uncased model.")
387 | 
388 |     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
389 |                         help="Batch size per GPU/CPU for training.")
390 |     parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
391 |                         help="Batch size per GPU/CPU for evaluation.")
392 |     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
393 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
394 |     parser.add_argument("--learning_rate", default=5e-5, type=float,
395 |                         help="The initial learning rate for Adam.")
396 |     parser.add_argument("--weight_decay", default=0.0, type=float,
397 |                         help="Weight decay if we apply some.")
398 |     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
399 |                         help="Epsilon for Adam optimizer.")
400 |     parser.add_argument("--max_grad_norm", default=1.0, type=float,
401 |                         help="Max gradient norm.")
402 |     parser.add_argument("--num_train_epochs", default=3.0, type=float,
403 |                         help="Total number of training epochs to perform.")
404 |     parser.add_argument("--max_steps", default=-1, type=int,
405 |                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
406 |     parser.add_argument("--warmup_steps", default=0, type=int,
407 |                         help="Linear warmup over warmup_steps.")
408 | 
409 |     parser.add_argument('--logging_steps', type=int, default=50,
410 |                         help="Log every X updates steps.")
411 |     parser.add_argument('--save_steps', type=int, default=50,
412 |                         help="Save checkpoint every X updates steps.")
413 |     parser.add_argument("--eval_all_checkpoints", action='store_true',
414 |                         help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
415 |     parser.add_argument("--no_cuda", action='store_true',
416 |                         help="Avoid using CUDA when available")
417 |     parser.add_argument('--overwrite_output_dir', action='store_true',
418 |                         help="Overwrite the content of the output directory")
419 |     parser.add_argument('--overwrite_cache', action='store_true',
420 |                         help="Overwrite the cached training and evaluation sets")
421 |     parser.add_argument('--seed', type=int, default=42,
422 |                         help="random seed for initialization")
423 | 
424 |     parser.add_argument('--tpu', action='store_true',
425 |                         help="Whether to run on the TPU defined in the environment variables")
426 |     parser.add_argument('--tpu_ip_address', type=str, default='',
427 |                         help="TPU IP address if none are set in the environment variables")
428 |     parser.add_argument('--tpu_name', type=str, default='',
429 |                         help="TPU name if none are set in the environment variables")
430 |     parser.add_argument('--xrt_tpu_config', type=str, default='',
431 |                         help="XRT TPU config if none are set in the environment variables")
432 | 
433 |     parser.add_argument('--fp16', action='store_true',
434 |                         help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
435 |     parser.add_argument('--fp16_opt_level', type=str, default='O1',
436 |                         help="For 
fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 437 | "See details at https://nvidia.github.io/apex/amp.html") 438 | parser.add_argument("--local_rank", type=int, default=-1, 439 | help="For distributed training: local_rank") 440 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 441 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 442 | args = parser.parse_args() 443 | 444 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: 445 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 446 | 447 | # Setup distant debugging if needed 448 | if args.server_ip and args.server_port: 449 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 450 | import ptvsd 451 | print("Waiting for debugger attach") 452 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 453 | ptvsd.wait_for_attach() 454 | 455 | # Setup CUDA, GPU & distributed training 456 | if args.local_rank == -1 or args.no_cuda: 457 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 458 | args.n_gpu = torch.cuda.device_count() 459 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 460 | torch.cuda.set_device(args.local_rank) 461 | device = torch.device("cuda", args.local_rank) 462 | torch.distributed.init_process_group(backend='nccl') 463 | args.n_gpu = 1 464 | args.device = device 465 | 466 | if args.tpu: 467 | if args.tpu_ip_address: 468 | os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address 469 | if args.tpu_name: 470 | os.environ["TPU_NAME"] = args.tpu_name 471 | if args.xrt_tpu_config: 472 | os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config 473 | 474 | assert "TPU_IP_ADDRESS" in os.environ 475 | assert "TPU_NAME" in os.environ 476 | assert "XRT_TPU_CONFIG" in os.environ 477 | 478 | import torch_xla 479 | import torch_xla.core.xla_model as xm 480 | args.device = xm.xla_device() 481 | args.xla_model = xm 482 | 483 | # Setup logging 484 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 485 | datefmt = '%m/%d/%Y %H:%M:%S', 486 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 487 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 488 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 489 | 490 | # Set seed 491 | set_seed(args) 492 | 493 | # Prepare GLUE task 494 | args.task_name = args.task_name.lower() 495 | if args.task_name not in processors: 496 | raise ValueError("Task not found: %s" % (args.task_name)) 497 | processor = processors[args.task_name]() 498 | args.output_mode = output_modes[args.task_name] 499 | label_list = processor.get_labels() 500 | num_labels = len(label_list) 501 | 502 | # Load pretrained model and tokenizer 503 | if args.local_rank not in [-1, 0]: 504 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 505 | 506 | args.model_type = args.model_type.lower() 507 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 508 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 509 | num_labels=num_labels, 510 | 
finetuning_task=args.task_name, 511 | cache_dir=args.cache_dir if args.cache_dir else None) 512 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 513 | do_lower_case=args.do_lower_case, 514 | cache_dir=args.cache_dir if args.cache_dir else None) 515 | model = model_class.from_pretrained(args.model_name_or_path, 516 | from_tf=bool('.ckpt' in args.model_name_or_path), 517 | config=config, 518 | cache_dir=args.cache_dir if args.cache_dir else None) 519 | 520 | if args.local_rank == 0: 521 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 522 | 523 | model.to(args.device) 524 | 525 | logger.info("Training/evaluation parameters %s", args) 526 | 527 | 528 | # Training 529 | if args.do_train: 530 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 531 | global_step, tr_loss = train(args, train_dataset, model, tokenizer) 532 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 533 | 534 | 535 | # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() 536 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu: 537 | # Create output directory if needed 538 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 539 | os.makedirs(args.output_dir) 540 | 541 | logger.info("Saving model checkpoint to %s", args.output_dir) 542 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 543 | # They can then be reloaded using `from_pretrained()` 544 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 545 | model_to_save.save_pretrained(args.output_dir) 546 | tokenizer.save_pretrained(args.output_dir) 547 | 548 | # Good practice: save your training arguments together with the trained model 549 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) 550 | 551 | # Load a trained model and vocabulary that you have fine-tuned 552 | model = model_class.from_pretrained(args.output_dir) 553 | tokenizer = tokenizer_class.from_pretrained(args.output_dir) 554 | model.to(args.device) 555 | 556 | 557 | # Evaluation 558 | results = {} 559 | if args.do_eval and args.local_rank in [-1, 0]: 560 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 561 | checkpoints = [args.output_dir] 562 | if args.eval_all_checkpoints: 563 | checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) 564 | logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging 565 | logger.info("Evaluate the following checkpoints: %s", checkpoints) 566 | for checkpoint in checkpoints: 567 | global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" 568 | prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" 569 | 570 | model = model_class.from_pretrained(checkpoint) 571 | model.to(args.device) 572 | result = evaluate(args, model, tokenizer, prefix=prefix) 573 | result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) 574 | results.update(result) 575 | 576 | return results 577 | 578 | 579 | if __name__ == "__main__": 580 | main() 581 | -------------------------------------------------------------------------------- /Questionaire/Questionaire.pdf: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Questionaire/Questionaire.pdf
--------------------------------------------------------------------------------
/Questionaire/Readme.md:
--------------------------------------------------------------------------------
1 | The questionnaire used for the user study is provided in this folder.
2 | 
--------------------------------------------------------------------------------
/Questionaire/temp_US.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Questionaire/temp_US.xls
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Explainable-Financial-Text-Classification
2 | Repository for the COLING 2020 paper "Generating Plausible Counterfactual Explanations for Deep Transformers in Financial Text Classification", applied to mergers and acquisitions (M&A) prediction.
3 | 
4 | 
5 | 
6 | If you find this repository helpful for your research, please consider citing the following paper:
7 | 
8 |     @inproceedings{Yang2020GeneratingPC,
9 |       title={Generating Plausible Counterfactual Explanations for Deep Transformers in Financial Text Classification},
10 |       author={Linyi Yang and Eoin M. Kenny and Tin Lok James Ng and K. Z. Zhang P.K. Kannan Y Yang and Barry Smyth and Ruihai Dong},
11 |       booktitle={COLING},
12 |       year={2020}
13 |     }
14 | 
--------------------------------------------------------------------------------
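Usage sketch for Model/run_glue_pgd.py: the script keeps the standard transformers GLUE fine-tuning interface, so a hypothetical invocation might look like the following. The task name (sst-2 here), data directory, and hyperparameters are placeholders and must match how the TSV data is actually prepared.

# Placeholder paths and hyperparameters; adjust to your setup.
python Model/run_glue_pgd.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --task_name sst-2 \
    --do_train \
    --do_eval \
    --do_lower_case \
    --data_dir ./Data \
    --max_seq_length 128 \
    --per_gpu_train_batch_size 8 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir ./output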