├── Data
│   ├── Readme.md
│   ├── Word_Dictionary.xlsx
│   ├── dev.tsv
│   └── test_set.xls
├── Introduction.png
├── Model
│   ├── FGM.py
│   ├── PGD.py
│   ├── Readme.md
│   ├── requirements.txt
│   └── run_glue_pgd.py
├── Questionaire
│   ├── Questionaire.pdf
│   ├── Readme.md
│   └── temp_US.xls
└── README.md

/Data/Readme.md:
--------------------------------------------------------------------------------
1 | Please note that the testing data should be used for research purposes only.
2 | 
--------------------------------------------------------------------------------
/Data/Word_Dictionary.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Data/Word_Dictionary.xlsx
--------------------------------------------------------------------------------
/Data/test_set.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Data/test_set.xls
--------------------------------------------------------------------------------
/Introduction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Introduction.png
--------------------------------------------------------------------------------
/Model/FGM.py:
--------------------------------------------------------------------------------
1 | import torch
2 | class FGM():
3 |     def __init__(self, model):
4 |         self.model = model
5 |         self.backup = {}
6 | 
7 |     def attack(self, epsilon=1., emb_name='word_embeddings'):
8 |         # emb_name should match the name of the embedding parameter in your model
9 |         for name, param in self.model.named_parameters():
10 |             if param.requires_grad and emb_name in name:
11 |                 self.backup[name] = param.data.clone()
12 |                 norm = torch.norm(param.grad)
13 |                 if norm != 0 and not torch.isnan(norm):
14 |                     r_at = epsilon * param.grad / norm
15 |                     param.data.add_(r_at)
16 | 
17 |     def restore(self, emb_name='word_embeddings'):
18 |         # emb_name should match the name of the embedding parameter in your model
19 |         for name, param in self.model.named_parameters():
20 |             if param.requires_grad and emb_name in name:
21 |                 assert name in self.backup
22 |                 param.data = self.backup[name]
23 |         self.backup = {}
--------------------------------------------------------------------------------
/Model/PGD.py:
--------------------------------------------------------------------------------
1 | # Class PGD
2 | import torch
3 | class PGD():
4 |     def __init__(self, model):
5 |         self.model = model
6 |         self.emb_backup = {}
7 |         self.grad_backup = {}
8 | 
9 |     def attack(self, epsilon=1., alpha=0.3, emb_name='word_embeddings', is_first_attack=False):
10 |         # emb_name should match the name of the embedding parameter in your model
11 |         for name, param in self.model.named_parameters():
12 |             #print(name)
13 |             if param.requires_grad and emb_name in name:
14 |                 if is_first_attack:
15 |                     self.emb_backup[name] = param.data.clone()
16 |                 norm = torch.norm(param.grad)
17 |                 if norm != 0 and not torch.isnan(norm):
18 |                     r_at = alpha * param.grad / norm
19 |                     param.data.add_(r_at)
20 |                     param.data = self.project(name, param.data, epsilon)
21 | 
22 |     def restore(self, emb_name='word_embeddings'):
23 |         # emb_name should match the name of the embedding parameter in your model
24 |         for name, param in self.model.named_parameters():
25 |             #print(name)
26 |             if param.requires_grad and emb_name in name:
27 |                 assert name in self.emb_backup
28 |                 param.data = self.emb_backup[name]
29 |         self.emb_backup = {}
30 | 
31 |     def project(self, param_name, param_data, epsilon):
32 |         r = param_data - self.emb_backup[param_name]
33 |         if torch.norm(r) > epsilon:
34 |             r = epsilon * r / torch.norm(r)
35 |         return self.emb_backup[param_name] + r  # project back into the epsilon-ball around the original embedding
36 | 
37 |     def backup_grad(self):
38 |         for name, param in self.model.named_parameters():
39 |             if param.requires_grad:
40 |                 self.grad_backup[name] = param.grad
41 | 
42 |     def restore_grad(self):
43 |         for name, param in self.model.named_parameters():
44 |             if param.requires_grad:
45 |                 param.grad = self.grad_backup[name]
--------------------------------------------------------------------------------
/Model/Readme.md:
--------------------------------------------------------------------------------
1 | 
2 | We provide the code for Transformer-based methods with adversarial training (FGM and PGD) for text classification in this folder. A minimal usage sketch is given below.
--------------------------------------------------------------------------------
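A minimal sketch of how the FGM helper is typically wrapped around a standard PyTorch training step; `model`, `train_dataloader`, and `optimizer` are placeholders rather than objects defined in this repository. PGD follows the same pattern, with several attack steps plus the extra backup_grad()/restore_grad() calls shown in run_glue_pgd.py.

# Sketch only: assumes `model` returns (loss, ...) as in transformers, and that
# `train_dataloader` and `optimizer` have already been constructed.
from FGM import FGM

fgm = FGM(model)
for batch in train_dataloader:
    loss = model(**batch)[0]
    loss.backward()          # gradients w.r.t. the clean inputs
    fgm.attack()             # perturb the word embeddings along the gradient direction
    loss_adv = model(**batch)[0]
    loss_adv.backward()      # accumulate the adversarial gradient on top of the clean one
    fgm.restore()            # put the original embedding weights back
    optimizer.step()
    model.zero_grad()
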
/Model/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | tensorboard
3 | scikit-learn
4 | seqeval
5 | transformers
--------------------------------------------------------------------------------
/Model/run_glue_pgd.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import argparse 21 | import glob 22 | import logging 23 | import os 24 | import random 25 | 26 | import numpy as np 27 | import torch 28 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 29 | TensorDataset) 30 | from torch.utils.data.distributed import DistributedSampler 31 | 32 | try: 33 | from torch.utils.tensorboard import SummaryWriter 34 | except: 35 | from tensorboardX import SummaryWriter 36 | 37 | from tqdm import tqdm, trange 38 | from FGM import FGM 39 | from PGD import PGD 40 | from transformers import (WEIGHTS_NAME, BertConfig, 41 | BertForSequenceClassification, BertTokenizer, 42 | RobertaConfig, 43 | RobertaForSequenceClassification, 44 | RobertaTokenizer, 45 | XLMConfig, XLMForSequenceClassification, 46 | XLMTokenizer, XLNetConfig, 47 | XLNetForSequenceClassification, 48 | XLNetTokenizer, 49 | DistilBertConfig, 50 | DistilBertForSequenceClassification, 51 | DistilBertTokenizer) 52 | 53 | from transformers import AdamW, WarmupLinearSchedule 54 | 55 | from transformers import glue_compute_metrics as compute_metrics 56 | from transformers import glue_output_modes as output_modes 57 | from transformers import glue_processors as processors 58 | from transformers import glue_convert_examples_to_features as convert_examples_to_features 59 | import warnings 60 | #warnings.filterwarnings("ignore", category=DeprecationWarning) 61 | #warnings.filterwarnings("ignore") 62 | 63 | logger = logging.getLogger(__name__) 64 | 65 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 66 | RobertaConfig, DistilBertConfig)), ()) 67 | 68 | MODEL_CLASSES = { 69 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 70 | 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 71 | 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 72 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 73 | 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) 74 | } 75 | 76 | 77 | def set_seed(args): 78 | random.seed(args.seed) 79 | np.random.seed(args.seed) 80 | torch.manual_seed(args.seed) 81 | if args.n_gpu > 0: 82 | torch.cuda.manual_seed_all(args.seed) 83 | 84 | 85 | def train(args, train_dataset, model, tokenizer): 86 | """ Train the model """ 87 | if args.local_rank in [-1, 0]: 88 | tb_writer = SummaryWriter() 89 | 90 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 91 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 92 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 93 | 94 | if args.max_steps > 0: 95 | t_total = args.max_steps 96 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 97 | else: 98 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 99 | 100 | # Prepare optimizer and schedule (linear warmup and decay) 101 | no_decay = ['bias', 'LayerNorm.weight'] 102 | optimizer_grouped_parameters = [ 103 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 104 | {'params': [p for n, p in 
model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
105 |         ]
106 |     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
107 |     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
108 |     if args.fp16:
109 |         try:
110 |             from apex import amp
111 |         except ImportError:
112 |             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
113 |         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
114 | 
115 |     # multi-gpu training (should be after apex fp16 initialization)
116 |     if args.n_gpu > 1:
117 |         model = torch.nn.DataParallel(model)
118 | 
119 |     # Distributed training (should be after apex fp16 initialization)
120 |     if args.local_rank != -1:
121 |         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
122 |                                                           output_device=args.local_rank,
123 |                                                           find_unused_parameters=True)
124 | 
125 |     # Train!
126 |     logger.info("***** Running training *****")
127 |     logger.info("  Num examples = %d", len(train_dataset))
128 |     logger.info("  Num Epochs = %d", args.num_train_epochs)
129 |     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
130 |     logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
131 |                 args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
132 |     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
133 |     logger.info("  Total optimization steps = %d", t_total)
134 | 
135 |     global_step = 0
136 |     tr_loss, logging_loss = 0.0, 0.0
137 |     model.zero_grad()
138 |     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
139 |     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
140 |     #fgm = FGM(model)
141 |     pgd = PGD(model)
142 | 
143 |     for _ in train_iterator:
144 |         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
145 |         for step, batch in enumerate(epoch_iterator):
146 |             model.train()
147 |             batch = tuple(t.to(args.device) for t in batch)
148 |             inputs = {'input_ids':      batch[0],
149 |                       'attention_mask': batch[1],
150 |                       'labels':         batch[3]}
151 |             if args.model_type != 'distilbert':
152 |                 inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
153 |             outputs = model(**inputs)
154 |             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
155 | 
156 |             if args.n_gpu > 1:
157 |                 loss = loss.mean()  # mean() to average on multi-gpu parallel training
158 |             if args.gradient_accumulation_steps > 1:
159 |                 loss = loss / args.gradient_accumulation_steps
160 | 
161 |             #print("Original Loss: ", loss)
162 |             if args.fp16:
163 |                 with amp.scale_loss(loss, optimizer) as scaled_loss:
164 |                     scaled_loss.backward()
165 |                 pgd.backup_grad()
166 |                 #fgm.attack()
167 |             else:
168 |                 loss.backward()
169 |                 pgd.backup_grad()
170 |                 #fgm.attack()
171 |             # loss_adv = model(**inputs)
172 |             #loss_adv = loss_adv[0]
173 |             #print("Loss Adv: ", loss_adv)
174 |             #loss_adv.backward()  # back-propagate, accumulating the adversarial gradient on top of the normal gradient
175 |             #fgm.restore()  # restore the embedding parameters
176 | 
177 |             for t in range(3):
178 |                 pgd.attack(is_first_attack=(t == 0))  # add an adversarial perturbation to the embeddings; back up param.data on the first attack
179 |                 if t != 2:
180 |                     model.zero_grad()
181 |                 else:
182 |                     pgd.restore_grad()
183 |                 loss_adv = model(**inputs)
184 |                 loss_adv = loss_adv[0]
185 |                 loss_adv.backward()  # back-propagate, accumulating the adversarial gradient on top of the normal gradient
186 |             pgd.restore()  # restore the embedding parameters
187 | 
188 |             tr_loss += loss_adv.item()
189 |             if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
190 |                 if args.fp16:
191 |                     torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
192 |                 else:
193 |                     torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
194 | 
195 |                 optimizer.step()
196 |                 scheduler.step()  # Update learning rate schedule
197 |                 model.zero_grad()
198 |                 global_step += 1
199 | 
200 |                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
201 |                     # Log metrics
202 |                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
203 |                         results = evaluate(args, model, tokenizer)
204 |                         for key, value in results.items():
205 |                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
206 |                     tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
207 |                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
208 |                     logging_loss = tr_loss
209 | 
210 |                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
211 |                     # Save model checkpoint
212 |                     output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
213 |                     if not os.path.exists(output_dir):
214 |                         os.makedirs(output_dir)
215 |                     model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
216 |                     model_to_save.save_pretrained(output_dir)
217 |                     torch.save(args, os.path.join(output_dir, 'training_args.bin'))
218 |                     logger.info("Saving model checkpoint to %s", output_dir)
219 | 
220 |             if args.tpu:
221 |                 args.xla_model.optimizer_step(optimizer, barrier=True)
222 |                 model.zero_grad()
223 |                 global_step += 1
224 | 
225 |             if args.max_steps > 0 and global_step > args.max_steps:
226 |                 epoch_iterator.close()
227 |                 break
228 |         if args.max_steps > 0 and global_step > args.max_steps:
229 |             train_iterator.close()
230 |             break
231 | 
232 |     if args.local_rank in [-1, 0]:
233 |         tb_writer.close()
234 | 
235 |     return global_step, tr_loss / global_step
236 | 
237 | 
238 | def evaluate(args, model, tokenizer, prefix=""):
239 |     # Loop to handle MNLI double evaluation (matched, mis-matched)
240 |     eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
241 |     eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
242 | 
243 |     results = {}
244 |     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
245 |         eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
246 | 
247 |         if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
248 |             os.makedirs(eval_output_dir)
249 | 
250 |         args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
251 |         # Note that DistributedSampler samples randomly
252 |         eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
253 |         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
254 | 
255 |         # Eval!
256 | logger.info("***** Running evaluation {} *****".format(prefix)) 257 | logger.info(" Num examples = %d", len(eval_dataset)) 258 | logger.info(" Batch size = %d", args.eval_batch_size) 259 | eval_loss = 0.0 260 | nb_eval_steps = 0 261 | preds = None 262 | out_label_ids = None 263 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 264 | model.eval() 265 | batch = tuple(t.to(args.device) for t in batch) 266 | 267 | with torch.no_grad(): 268 | inputs = {'input_ids': batch[0], 269 | 'attention_mask': batch[1], 270 | 'labels': batch[3]} 271 | if args.model_type != 'distilbert': 272 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 273 | outputs = model(**inputs) 274 | tmp_eval_loss, logits = outputs[:2] 275 | 276 | eval_loss += tmp_eval_loss.mean().item() 277 | nb_eval_steps += 1 278 | if preds is None: 279 | preds = logits.detach().cpu().numpy() 280 | out_label_ids = inputs['labels'].detach().cpu().numpy() 281 | else: 282 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 283 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 284 | 285 | eval_loss = eval_loss / nb_eval_steps 286 | if args.output_mode == "classification": 287 | preds = np.argmax(preds, axis=1) 288 | elif args.output_mode == "regression": 289 | preds = np.squeeze(preds) 290 | result = compute_metrics(eval_task, preds, out_label_ids) 291 | results.update(result) 292 | 293 | output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 294 | with open(output_eval_file, "w") as writer: 295 | logger.info("***** Eval results {} *****".format(prefix)) 296 | for key in sorted(result.keys()): 297 | logger.info(" %s = %s", key, str(result[key])) 298 | writer.write("%s = %s\n" % (key, str(result[key]))) 299 | 300 | return results 301 | 302 | 303 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 304 | if args.local_rank not in [-1, 0] and not evaluate: 305 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 306 | 307 | processor = processors[task]() 308 | output_mode = output_modes[task] 309 | # Load data features from cache or dataset file 310 | cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( 311 | 'dev' if evaluate else 'train', 312 | list(filter(None, args.model_name_or_path.split('/'))).pop(), 313 | str(args.max_seq_length), 314 | str(task))) 315 | if os.path.exists(cached_features_file) and not args.overwrite_cache: 316 | logger.info("Loading features from cached file %s", cached_features_file) 317 | features = torch.load(cached_features_file) 318 | else: 319 | logger.info("Creating features from dataset file at %s", args.data_dir) 320 | label_list = processor.get_labels() 321 | if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: 322 | # HACK(label indices are swapped in RoBERTa pretrained model) 323 | label_list[1], label_list[2] = label_list[2], label_list[1] 324 | examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) 325 | features = convert_examples_to_features(examples, 326 | tokenizer, 327 | label_list=label_list, 328 | max_length=args.max_seq_length, 329 | output_mode=output_mode, 330 | pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet 331 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 332 | 
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, 333 | ) 334 | if args.local_rank in [-1, 0]: 335 | logger.info("Saving features into cached file %s", cached_features_file) 336 | torch.save(features, cached_features_file) 337 | 338 | if args.local_rank == 0 and not evaluate: 339 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 340 | 341 | # Convert to Tensors and build dataset 342 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 343 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) 344 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) 345 | if output_mode == "classification": 346 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long) 347 | elif output_mode == "regression": 348 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float) 349 | 350 | dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) 351 | return dataset 352 | 353 | 354 | def main(): 355 | parser = argparse.ArgumentParser() 356 | 357 | ## Required parameters 358 | parser.add_argument("--data_dir", default=None, type=str, required=True, 359 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 360 | parser.add_argument("--model_type", default=None, type=str, required=True, 361 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 362 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 363 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) 364 | parser.add_argument("--task_name", default=None, type=str, required=True, 365 | help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) 366 | parser.add_argument("--output_dir", default=None, type=str, required=True, 367 | help="The output directory where the model predictions and checkpoints will be written.") 368 | 369 | ## Other parameters 370 | parser.add_argument("--config_name", default="", type=str, 371 | help="Pretrained config name or path if not the same as model_name") 372 | parser.add_argument("--tokenizer_name", default="", type=str, 373 | help="Pretrained tokenizer name or path if not the same as model_name") 374 | parser.add_argument("--cache_dir", default="", type=str, 375 | help="Where do you want to store the pre-trained models downloaded from s3") 376 | parser.add_argument("--max_seq_length", default=128, type=int, 377 | help="The maximum total input sequence length after tokenization. 
Sequences longer "
378 |                              "than this will be truncated, sequences shorter will be padded.")
379 |     parser.add_argument("--do_train", action='store_true',
380 |                         help="Whether to run training.")
381 |     parser.add_argument("--do_eval", action='store_true',
382 |                         help="Whether to run eval on the dev set.")
383 |     parser.add_argument("--evaluate_during_training", action='store_true',
384 |                         help="Run evaluation during training at each logging step.")
385 |     parser.add_argument("--do_lower_case", action='store_true',
386 |                         help="Set this flag if you are using an uncased model.")
387 | 
388 |     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
389 |                         help="Batch size per GPU/CPU for training.")
390 |     parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
391 |                         help="Batch size per GPU/CPU for evaluation.")
392 |     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
393 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
394 |     parser.add_argument("--learning_rate", default=5e-5, type=float,
395 |                         help="The initial learning rate for Adam.")
396 |     parser.add_argument("--weight_decay", default=0.0, type=float,
397 |                         help="Weight decay if we apply some.")
398 |     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
399 |                         help="Epsilon for Adam optimizer.")
400 |     parser.add_argument("--max_grad_norm", default=1.0, type=float,
401 |                         help="Max gradient norm.")
402 |     parser.add_argument("--num_train_epochs", default=3.0, type=float,
403 |                         help="Total number of training epochs to perform.")
404 |     parser.add_argument("--max_steps", default=-1, type=int,
405 |                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
406 |     parser.add_argument("--warmup_steps", default=0, type=int,
407 |                         help="Linear warmup over warmup_steps.")
408 | 
409 |     parser.add_argument('--logging_steps', type=int, default=50,
410 |                         help="Log every X updates steps.")
411 |     parser.add_argument('--save_steps', type=int, default=50,
412 |                         help="Save checkpoint every X updates steps.")
413 |     parser.add_argument("--eval_all_checkpoints", action='store_true',
414 |                         help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
415 |     parser.add_argument("--no_cuda", action='store_true',
416 |                         help="Avoid using CUDA when available")
417 |     parser.add_argument('--overwrite_output_dir', action='store_true',
418 |                         help="Overwrite the content of the output directory")
419 |     parser.add_argument('--overwrite_cache', action='store_true',
420 |                         help="Overwrite the cached training and evaluation sets")
421 |     parser.add_argument('--seed', type=int, default=42,
422 |                         help="random seed for initialization")
423 | 
424 |     parser.add_argument('--tpu', action='store_true',
425 |                         help="Whether to run on the TPU defined in the environment variables")
426 |     parser.add_argument('--tpu_ip_address', type=str, default='',
427 |                         help="TPU IP address if none are set in the environment variables")
428 |     parser.add_argument('--tpu_name', type=str, default='',
429 |                         help="TPU name if none are set in the environment variables")
430 |     parser.add_argument('--xrt_tpu_config', type=str, default='',
431 |                         help="XRT TPU config if none are set in the environment variables")
432 | 
433 |     parser.add_argument('--fp16', action='store_true',
434 |                         help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
435 |     parser.add_argument('--fp16_opt_level', type=str, default='O1',
436 |                         help="For 
fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 437 | "See details at https://nvidia.github.io/apex/amp.html") 438 | parser.add_argument("--local_rank", type=int, default=-1, 439 | help="For distributed training: local_rank") 440 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 441 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 442 | args = parser.parse_args() 443 | 444 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: 445 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 446 | 447 | # Setup distant debugging if needed 448 | if args.server_ip and args.server_port: 449 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 450 | import ptvsd 451 | print("Waiting for debugger attach") 452 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 453 | ptvsd.wait_for_attach() 454 | 455 | # Setup CUDA, GPU & distributed training 456 | if args.local_rank == -1 or args.no_cuda: 457 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 458 | args.n_gpu = torch.cuda.device_count() 459 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 460 | torch.cuda.set_device(args.local_rank) 461 | device = torch.device("cuda", args.local_rank) 462 | torch.distributed.init_process_group(backend='nccl') 463 | args.n_gpu = 1 464 | args.device = device 465 | 466 | if args.tpu: 467 | if args.tpu_ip_address: 468 | os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address 469 | if args.tpu_name: 470 | os.environ["TPU_NAME"] = args.tpu_name 471 | if args.xrt_tpu_config: 472 | os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config 473 | 474 | assert "TPU_IP_ADDRESS" in os.environ 475 | assert "TPU_NAME" in os.environ 476 | assert "XRT_TPU_CONFIG" in os.environ 477 | 478 | import torch_xla 479 | import torch_xla.core.xla_model as xm 480 | args.device = xm.xla_device() 481 | args.xla_model = xm 482 | 483 | # Setup logging 484 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 485 | datefmt = '%m/%d/%Y %H:%M:%S', 486 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 487 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 488 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 489 | 490 | # Set seed 491 | set_seed(args) 492 | 493 | # Prepare GLUE task 494 | args.task_name = args.task_name.lower() 495 | if args.task_name not in processors: 496 | raise ValueError("Task not found: %s" % (args.task_name)) 497 | processor = processors[args.task_name]() 498 | args.output_mode = output_modes[args.task_name] 499 | label_list = processor.get_labels() 500 | num_labels = len(label_list) 501 | 502 | # Load pretrained model and tokenizer 503 | if args.local_rank not in [-1, 0]: 504 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 505 | 506 | args.model_type = args.model_type.lower() 507 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 508 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 509 | num_labels=num_labels, 510 | 
finetuning_task=args.task_name, 511 | cache_dir=args.cache_dir if args.cache_dir else None) 512 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 513 | do_lower_case=args.do_lower_case, 514 | cache_dir=args.cache_dir if args.cache_dir else None) 515 | model = model_class.from_pretrained(args.model_name_or_path, 516 | from_tf=bool('.ckpt' in args.model_name_or_path), 517 | config=config, 518 | cache_dir=args.cache_dir if args.cache_dir else None) 519 | 520 | if args.local_rank == 0: 521 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 522 | 523 | model.to(args.device) 524 | 525 | logger.info("Training/evaluation parameters %s", args) 526 | 527 | 528 | # Training 529 | if args.do_train: 530 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 531 | global_step, tr_loss = train(args, train_dataset, model, tokenizer) 532 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 533 | 534 | 535 | # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() 536 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu: 537 | # Create output directory if needed 538 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 539 | os.makedirs(args.output_dir) 540 | 541 | logger.info("Saving model checkpoint to %s", args.output_dir) 542 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 543 | # They can then be reloaded using `from_pretrained()` 544 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 545 | model_to_save.save_pretrained(args.output_dir) 546 | tokenizer.save_pretrained(args.output_dir) 547 | 548 | # Good practice: save your training arguments together with the trained model 549 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) 550 | 551 | # Load a trained model and vocabulary that you have fine-tuned 552 | model = model_class.from_pretrained(args.output_dir) 553 | tokenizer = tokenizer_class.from_pretrained(args.output_dir) 554 | model.to(args.device) 555 | 556 | 557 | # Evaluation 558 | results = {} 559 | if args.do_eval and args.local_rank in [-1, 0]: 560 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 561 | checkpoints = [args.output_dir] 562 | if args.eval_all_checkpoints: 563 | checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) 564 | logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging 565 | logger.info("Evaluate the following checkpoints: %s", checkpoints) 566 | for checkpoint in checkpoints: 567 | global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" 568 | prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" 569 | 570 | model = model_class.from_pretrained(checkpoint) 571 | model.to(args.device) 572 | result = evaluate(args, model, tokenizer, prefix=prefix) 573 | result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) 574 | results.update(result) 575 | 576 | return results 577 | 578 | 579 | if __name__ == "__main__": 580 | main() 581 | -------------------------------------------------------------------------------- /Questionaire/Questionaire.pdf: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Questionaire/Questionaire.pdf
--------------------------------------------------------------------------------
/Questionaire/Readme.md:
--------------------------------------------------------------------------------
1 | The questionnaire used for the user study is provided in this folder.
2 | 
--------------------------------------------------------------------------------
/Questionaire/temp_US.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangLinyi/Explainable-Financial-Text-Classification/fe1eaa03443dd36144ae836a9e547fba2b553157/Questionaire/temp_US.xls
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Explainable-Financial-Text-Classification
2 | Repository for the COLING 2020 paper "Generating Plausible Counterfactual Explanations for Deep Transformers in Financial Text Classification", applied to mergers and acquisitions (M&A) prediction.
3 | 
4 | 
5 | 
6 | If you find this repository helpful for your research, please consider citing the following paper:
7 | 
8 |     @inproceedings{Yang2020GeneratingPC,
9 |       title={Generating Plausible Counterfactual Explanations for Deep Transformers in Financial Text Classification},
10 |       author={Linyi Yang and Eoin M. Kenny and Tin Lok James Ng and K. Z. Zhang P.K. Kannan Y Yang and Barry Smyth and Ruihai Dong},
11 |       booktitle={COLING},
12 |       year={2020}
13 |     }
14 | 
--------------------------------------------------------------------------------
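Usage sketch for Model/run_glue_pgd.py: the script keeps the standard transformers GLUE fine-tuning interface, so a hypothetical invocation might look like the following. The task name (sst-2 here), data directory, and hyperparameters are placeholders and must match how the TSV data is actually prepared.

# Placeholder paths and hyperparameters; adjust to your setup.
python Model/run_glue_pgd.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --task_name sst-2 \
    --do_train \
    --do_eval \
    --do_lower_case \
    --data_dir ./Data \
    --max_seq_length 128 \
    --per_gpu_train_batch_size 8 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir ./output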