├── README.md ├── arguments.py ├── attack.sh ├── attack ├── __init__.py ├── attack_recipe.py ├── custom_dataset.py ├── ranmask_wrapper.py ├── sklearn_utils.py └── utils.py ├── data_in.zip ├── main.py ├── run_attack.py ├── train.sh └── utils ├── __init__.py ├── flooding_model.py ├── metric_based.py ├── metric_utils.py ├── ranmask_model.py ├── rdrop.py ├── scrn_model.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Robust-AIGC-Detector 2 | 3 | Code for ACL 2024 long paper: Are AI-Generated Text Detectors Robust to Adversarial Perturbations? 4 | 5 | ### Environments 6 | 7 | ```bash 8 | torch==1.11.0 9 | transformers==4.30.2 10 | textattack==0.3.9 11 | tensorflow==2.9.1 12 | tensorflow_hub==0.15.0 13 | ``` 14 | 15 | 16 | ### Data Preparation 17 | 18 | ```bash 19 | unzip data_in.zip 20 | mkdir data_out 21 | ``` 22 | 23 | ### Training 24 | ```bash 25 | $ bash train.sh 26 | ``` 27 | 28 | ### Checkpoints 29 | The checkpoints of in-domain detector, cross-domain detector, and cross-genre detector can be found in . (These detectors are trained on the same training set and evaluated on different test sets.) 30 | 31 | The checkpoint of mixed-source detector can be found in . 
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    The last four fields configure RanMask: the mask rate used when building
    training inputs, and the mask rate, ensemble size and ensemble method
    that are copied onto the model for ensemble inference.
    """

    model: str = field(
        default="BERT",
        metadata={"help": "Model name (BERT, BART, ALBERT, ... )"}
    )

    model_name_or_path: str = field(
        default=None,
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )

    # Generative LM used by the metric-based detectors (Log-Likelihood, Rank, ...).
    metric_base_model_name_or_path: str = field(
        default='gpt2',
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )

    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )

    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )

    cache_dir: Optional[str] = field(
        default=".cache", metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    train_mask_percentage: float = field(default=0.3, metadata={"help": "RanMask train mask rate."})
    infer_mask_percentage: float = field(default=0.3, metadata={"help": "RanMask inference mask rate."})
    # An ensemble size is a count: it must parse as an int (a float such as
    # 100.0 cannot be used as a loop bound), so the annotation is `int`.
    ensemble_num: int = field(default=100, metadata={"help": "RanMask inference ensemble number."})
    ensemble_method: str = field(default="votes", metadata={"help": "RanMask inference ensemble method."})
@dataclass
class TrainingArguments(OriginalTrainingArguments):
    """Hugging Face ``TrainingArguments`` with this project's defaults.

    Only defaults are overridden; every field can still be set from the
    command line. Compared with the base class, training is enabled by
    default, outputs and logs go to ``data_out``, and evaluation, saving and
    logging all happen once per epoch.
    """

    do_train: bool = field(default=True, metadata={"help": "Whether to run training."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    # main.py treats the last path component of this value as the model
    # abbreviation and appends "_<dataset>" to it before use.
    output_dir: str = field(
        default="data_out",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    logging_dir: Optional[str] = field(default="data_out", metadata={"help": "Tensorboard log dir."})
    # No delay before the first evaluation.
    eval_delay: Optional[float] = 0
    # Evaluate, checkpoint and log on the same per-epoch schedule.
    evaluation_strategy: typing.Union[transformers.trainer_utils.IntervalStrategy, str] = 'epoch'
    save_strategy: typing.Union[transformers.trainer_utils.IntervalStrategy, str] = 'epoch'
    logging_strategy: typing.Union[transformers.trainer_utils.IntervalStrategy, str] = 'epoch'
    lr_scheduler_type: typing.Union[transformers.trainer_utils.SchedulerType, str] = 'linear'
# Robustness evaluation: attack a trained detector with run_attack.py.
#
# OUTPUT_DIR must end in "<model>_<dataset>" (exactly one underscore):
# run_attack.py splits the last path component on "_" to recover both parts.
export OUTPUT_DIR=./data_out/scrn_in-domain # path/model_dataset
export MODEL_TYPE=hf
export BASE_MODEL=roberta-base
export NUM_EXAMPLES=200
export ENSEMBLE_NUM=1
export MASK_PERCENTAGE=0.30
export TRANSFER_DATASET_ABBR=self
export ATTACK_CLASS=ai # [human, ai]
export ATTACK_RECIPE=deep-word-bug # [pwws, deep-word-bug, pruthi]

# Expansions are quoted so a path containing spaces stays a single argument.
# The last argument carries no trailing backslash: a dangling continuation
# would silently swallow whatever line is appended after it.
python3 -u run_attack.py \
    --model_type "${MODEL_TYPE}" \
    --bert_name_or_path "${BASE_MODEL}" \
    --metric_base_model_name_or_path gpt2 \
    --attack_class "${ATTACK_CLASS}" \
    --attack_recipe "${ATTACK_RECIPE}" \
    --transfer_dataset_abbr "${TRANSFER_DATASET_ABBR}" \
    --output_dir "${OUTPUT_DIR}" \
    --num_examples "${NUM_EXAMPLES}" \
    --ensemble_num "${ENSEMBLE_NUM}" \
    --mask_percentage "${MASK_PERCENTAGE}"
Rodriguez, Boyd-Graber. (2018). 17 | 18 | Pathologies of Neural Models Make Interpretations Difficult. 19 | 20 | https://arxiv.org/abs/1804.07781 21 | """ 22 | 23 | @staticmethod 24 | def build(model_wrapper): 25 | # At each step, we remove the word with the lowest importance value until 26 | # the model changes its prediction. 27 | transformation = WordDeletion() 28 | 29 | constraints = [RepeatModification(), StopwordModification()] 30 | # 31 | # Goal is untargeted classification 32 | # 33 | goal_function = InputReduction(model_wrapper, maximizable=True) 34 | # 35 | # "For each word in an input sentence, we measure its importance by the 36 | # change in the confidence of the original prediction when we remove 37 | # that word from the sentence." 38 | # 39 | # "Instead of looking at the words with high importance values—what 40 | # interpretation methods commonly do—we take a complementary approach 41 | # and study how the model behaves when the supposedly unimportant words are 42 | # removed." 
def load_attack_dataset(data_files, attack_class='ai'):
    """Build the TextAttack dataset of test examples belonging to one class.

    Args:
        data_files (str): Directory holding ``train.json`` and ``test.json``.
            Its last path component must be one of the known dataset names
            (``in-domain``, ``cross-domain``, ``cross-genre``, ``mixed-source``).
        attack_class (str): ``'ai'`` keeps examples whose ``labels`` is 1,
            ``'human'`` keeps examples whose ``labels`` is 0.

    Returns:
        textattack.datasets.Dataset: ``(text, label)`` pairs taken from the
        test split, restricted to the requested class.

    Raises:
        ValueError: If the dataset name or the attack class is not recognised.
            Both are checked before anything is read from disk.
    """
    dataset_abbr = data_files.split('/')[-1]
    if dataset_abbr not in ["in-domain", "cross-domain", "cross-genre", "mixed-source"]:
        raise ValueError('Dataset not exist: %s'%data_files)

    if attack_class == 'ai':
        target_label = 1
    elif attack_class == 'human':
        target_label = 0
    else:
        raise ValueError('Attack class not exist: %s'%attack_class)

    # these datasets have been shuffled in train/test split
    data = load_dataset(
        'json',
        data_files={"train": data_files + "/train.json",
                    "test": data_files + "/test.json", },
    )["test"]

    dataset = [(x['text'], x['labels']) for x in data if x['labels'] == target_label]
    return textattack.datasets.Dataset(dataset)
36 | max_length = ( 37 | 512 38 | if self.tokenizer.model_max_length == int(1e30) 39 | else self.tokenizer.model_max_length 40 | ) 41 | # start ensemble 42 | ensemble_mask_text_input_list = self.ensemble_mask_tokens(text_input_list, 43 | mask_percentage=self.mask_percentage, 44 | ensemble_num=self.ensemble_num, 45 | mask_token=self.tokenizer.mask_token) 46 | outputs_list = [] 47 | i = 0 48 | while i < len(ensemble_mask_text_input_list): 49 | batched_text_input_list = ensemble_mask_text_input_list[i : i + self.batch_size] 50 | inputs_dict = self.tokenizer( 51 | batched_text_input_list, 52 | add_special_tokens=True, 53 | padding="max_length", 54 | max_length=max_length, 55 | truncation=True, 56 | return_tensors="pt", 57 | ) 58 | model_device = next(self.model.parameters()).device 59 | inputs_dict.to(model_device) 60 | 61 | with torch.no_grad(): 62 | outputs = self.model(**inputs_dict) 63 | 64 | if isinstance(outputs[0], str): 65 | # HuggingFace sequence-to-sequence models return a list of 66 | # string predictions as output. In this case, return the full 67 | # list of outputs. 68 | outputs_list.append(outputs) 69 | else: 70 | # HuggingFace classification models return a tuple as output 71 | # where the first item in the tuple corresponds to the list of 72 | # scores for each input. 
73 | outputs_list.append(outputs.logits) 74 | i += self.batch_size 75 | # logits ensemble 76 | output_logits = torch.cat(outputs_list, dim=0).cpu().numpy() #[bsz, label_num] 77 | ensemble_logits_for_each_input = np.split(output_logits, indices_or_sections=len(text_input_list), axis=0) 78 | logits_list = [] 79 | for logits in ensemble_logits_for_each_input: 80 | if self.ensemble_method == 'votes': 81 | voted_label = np.argmax(np.bincount(np.argmax(logits, axis=-1), minlength=logits.shape[-1])) 82 | voted_logits_array = logits[np.where(np.argmax(logits, axis=-1)==voted_label)[0]] 83 | voted_logits = np.mean(voted_logits_array, axis=0, keepdims=True) #[1, num_labels] 84 | logits_list.append(torch.from_numpy(voted_logits)) 85 | else: 86 | avg_logits = np.mean(logits, axis=0, keepdims=True) 87 | logits_list.append(torch.from_numpy(avg_logits)) 88 | return torch.cat(logits_list, dim=0).to(model_device) 89 | 90 | def get_grad(self, text_input): 91 | """Get gradient of loss with respect to input tokens. 92 | 93 | Args: 94 | text_input (str): input string 95 | Returns: 96 | Dict of ids, tokens, and gradient as numpy array. 97 | """ 98 | if isinstance(self.model, textattack.models.helpers.T5ForTextToText): 99 | raise NotImplementedError( 100 | "`get_grads` for T5FotTextToText has not been implemented yet." 
101 | ) 102 | 103 | self.model.train() 104 | embedding_layer = self.model.get_input_embeddings() 105 | original_state = embedding_layer.weight.requires_grad 106 | embedding_layer.weight.requires_grad = True 107 | 108 | emb_grads = [] 109 | 110 | def grad_hook(module, grad_in, grad_out): 111 | emb_grads.append(grad_out[0]) 112 | 113 | emb_hook = embedding_layer.register_backward_hook(grad_hook) 114 | 115 | self.model.zero_grad() 116 | model_device = next(self.model.parameters()).device 117 | input_dict = self.tokenizer( 118 | [text_input], 119 | add_special_tokens=True, 120 | return_tensors="pt", 121 | padding="max_length", 122 | truncation=True, 123 | ) 124 | input_dict.to(model_device) 125 | predictions = self.model(**input_dict).logits 126 | 127 | try: 128 | labels = predictions.argmax(dim=1) 129 | loss = self.model(**input_dict, labels=labels)[0] 130 | except TypeError: 131 | raise TypeError( 132 | f"{type(self.model)} class does not take in `labels` to calculate loss. " 133 | "One cause for this might be if you instantiatedyour model using `transformer.AutoModel` " 134 | "(instead of `transformers.AutoModelForSequenceClassification`)." 
def ensemble_mask_tokens(self, strings, mask_percentage=0.3, ensemble_num=3, mask_token=''):
    """Produce ``ensemble_num`` randomly masked copies of every input string.

    Each string is split on whitespace; for every copy,
    ``int(len(words) * mask_percentage)`` distinct word positions are drawn
    with ``np.random.choice`` and replaced by ``mask_token``.

    strings: (list[str]): List of strings
    Returns: (list[str]): List of strings, the copies of each input being
        adjacent and in input order (``len(strings) * ensemble_num`` items).
    """
    results = []
    for text in strings:
        words = np.array(text.split())
        word_count = len(words)
        mask_count = int(word_count * mask_percentage)
        for _ in range(ensemble_num):
            # One draw per copy, without replacement, so positions are distinct.
            chosen = np.random.choice(word_count, mask_count, replace=False)
            is_masked = np.zeros(word_count, dtype=bool)
            is_masked[chosen] = True
            replaced = np.where(is_masked, mask_token, words)
            results.append(' '.join(replaced.tolist()))
    return results
class CustomSklearnModelWrapper(SklearnModelWrapper):
    """
    subclass of SklearnModelWrapper

    Pairs a fitted classifier with a feature extractor: the ``tokenizer``
    only needs a ``transform(list_of_texts)`` method, and the ``model`` only
    needs ``predict_proba(features)``.
    """

    def __init__(self, model, tokenizer):
        self.tokenizer = tokenizer
        self.model = model

    def __call__(self, text_input_list, batch_size=None):
        # ``batch_size`` is accepted but not used: all texts are transformed
        # and scored in a single call.
        features = self.tokenizer.transform(text_input_list)
        probabilities = self.model.predict_proba(features)
        return probabilities

    def get_grad(self, text_input):
        # Gradients are not provided by this wrapper.
        raise NotImplementedError()
def log_summary(results):
    """Print and return a summary table for a list of attack results.

    The previous body referenced ``self.metrics``, ``self.results`` and
    ``self.log_summary_rows``. This is a module-level function with no
    ``self``, so every non-empty call raised ``NameError``. The table is now
    printed directly and also returned to the caller.

    Args:
        results (list): Attack results accepted by the ``calculate`` methods
            of ``AttackSuccessRate``, ``WordsPerturbed`` and ``AttackQueries``.

    Returns:
        list[list] | None: ``[label, value]`` rows in display order, or
        ``None`` when ``results`` is empty (nothing is printed in that case).
    """
    total_attacks = len(results)
    if total_attacks == 0:
        return

    # Default metrics - calculated on every attack
    attack_success_stats = AttackSuccessRate().calculate(results)
    words_perturbed_stats = WordsPerturbed().calculate(results)
    attack_query_stats = AttackQueries().calculate(results)

    summary_table_rows = [
        [
            "Number of successful attacks:",
            attack_success_stats["successful_attacks"],
        ],
        ["Number of failed attacks:", attack_success_stats["failed_attacks"]],
        ["Number of skipped attacks:", attack_success_stats["skipped_attacks"]],
        [
            "Original accuracy:",
            str(attack_success_stats["original_accuracy"]) + "%",
        ],
        [
            "Accuracy under attack:",
            str(attack_success_stats["attack_accuracy_perc"]) + "%",
        ],
        [
            "Attack success rate:",
            str(attack_success_stats["attack_success_rate"]) + "%",
        ],
        [
            "Average perturbed word %:",
            str(words_perturbed_stats["avg_word_perturbed_perc"]) + "%",
        ],
        [
            "Average num. words per input:",
            words_perturbed_stats["avg_word_perturbed"],
        ],
        ["Avg num queries:", attack_query_stats["avg_num_queries"]],
    ]

    # Left-align the labels so the values line up in one column.
    label_width = max(len(label) for label, _ in summary_table_rows)
    print("Attack Results")
    for label, value in summary_table_rows:
        print(f"{label:<{label_width}}  {value}")

    return summary_table_rows
| class CustomDataCollatorForSeqCLS(DataCollatorForSeq2Seq): 30 | def __call__(self, features, return_tensors=None): 31 | if return_tensors is None: 32 | return_tensors = self.return_tensors 33 | 34 | features = self.tokenizer.pad( 35 | features, 36 | padding=self.padding, 37 | max_length=self.max_length, 38 | pad_to_multiple_of=self.pad_to_multiple_of, 39 | return_tensors=return_tensors, 40 | ) 41 | 42 | return features 43 | 44 | 45 | def metrics_fn(outputs): 46 | y_true = outputs.label_ids 47 | y_pred = outputs.predictions.argmax(-1) 48 | y_score = torch.tensor(outputs.predictions).softmax(-1).numpy()[:, 1] 49 | return compute_metrics(y_true, y_pred, y_score) 50 | 51 | def main(): 52 | supervised_model_list = ['bert-base', 'roberta-base', 'deberta-base', 'ChatGPT-Detector', 'flooding', 'rdrop', 'ranmask', 'scrn'] 53 | metric_based_model_list = ["Log-Likelihood", "Rank", "Log-Rank", "Entropy", "GLTR"] 54 | 55 | # Get arguments 56 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 57 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 58 | model_abbr = training_args.output_dir.split('/')[-1] 59 | dataset_abbr = data_args.data_files.split('/')[-1] 60 | training_args.output_dir = training_args.output_dir + '_' + dataset_abbr 61 | 62 | # Path check and set logger 63 | # path_checker(training_args) 64 | try: 65 | os.mkdir(training_args.output_dir) 66 | except: 67 | print('Output directory already exists: %s'%training_args.output_dir) 68 | logger = set_logger(training_args) 69 | 70 | # Set seed 71 | set_seed(training_args.seed) 72 | 73 | # Load dataset 74 | raw_dataset = load_dataset( 75 | 'json', 76 | data_files={"train": data_args.data_files + "/train.json", 77 | "test": data_args.data_files + "/test.json", }, 78 | cache_dir=model_args.cache_dir, 79 | use_auth_token=True if model_args.use_auth_token else None, 80 | ) 81 | if model_abbr in supervised_model_list: 82 | # Load model 83 | config = 
AutoConfig.from_pretrained(model_args.model_name_or_path) 84 | tokenizer = AutoTokenizer.from_pretrained( 85 | model_args.model_name_or_path, 86 | model_max_length=data_args.max_seq_length, 87 | padding_side="right", 88 | use_fast=False, 89 | ) 90 | if model_abbr == 'scrn': 91 | model = SCRNModel(model_args.model_name_or_path, config) 92 | else: 93 | model = AutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config) 94 | 95 | 96 | def preprocess_function_for_ranmask(examples): 97 | examples["text"] = mask_tokens(examples["text"], mask_token=tokenizer.mask_token) 98 | inputs = tokenizer(examples["text"], truncation=True) 99 | model_inputs = inputs 100 | return model_inputs 101 | 102 | def preprocess_function_for_seq_cls(examples): 103 | inputs = tokenizer(examples["text"], truncation=True) 104 | model_inputs = inputs 105 | return model_inputs 106 | 107 | if model_abbr == 'ranmask': 108 | train_data_preprocess_fn = preprocess_function_for_ranmask 109 | infer_data_preprocess_fn = preprocess_function_for_seq_cls 110 | else: 111 | train_data_preprocess_fn = preprocess_function_for_seq_cls 112 | infer_data_preprocess_fn = preprocess_function_for_seq_cls 113 | 114 | 115 | 116 | # Preprocess dataset 117 | train_dataset, test_dataset = raw_dataset["train"], raw_dataset["test"] 118 | 119 | with training_args.main_process_first(desc="train dataset map pre-processing"): 120 | train_dataset = train_dataset.map( 121 | train_data_preprocess_fn, 122 | batched=True, 123 | num_proc=data_args.preprocessing_num_workers, 124 | load_from_cache_file=not data_args.overwrite_cache, 125 | desc="Running tokenizer on train dataset", 126 | ) 127 | test_dataset = test_dataset.map( 128 | infer_data_preprocess_fn, 129 | batched=True, 130 | num_proc=data_args.preprocessing_num_workers, 131 | load_from_cache_file=not data_args.overwrite_cache, 132 | desc="Running tokenizer on test dataset", 133 | ) 134 | 135 | data_collator = 
CustomDataCollatorForSeqCLS(tokenizer, model=model, pad_to_multiple_of=8 if training_args.fp16 else None,) 136 | 137 | 138 | # Set trainer 139 | if model_abbr == 'scrn': 140 | trainer_fn = SCRNTrainer 141 | elif model_abbr == 'flooding': 142 | trainer_fn = FloodingTrainer 143 | elif model_abbr == 'rdrop': 144 | trainer_fn = RDropTrainer 145 | else: 146 | trainer_fn = Trainer 147 | trainer = trainer_fn( 148 | model=model, 149 | args=training_args, 150 | train_dataset=train_dataset, 151 | tokenizer=tokenizer, 152 | data_collator=data_collator, 153 | eval_dataset=test_dataset, 154 | compute_metrics=metrics_fn, 155 | ) 156 | 157 | # Training 158 | if training_args.do_train: 159 | train_result = trainer.train() 160 | # trainer.save_state() 161 | trainer.save_model() 162 | 163 | # Predict 164 | if training_args.do_predict: 165 | if model_abbr == 'ranmask': 166 | config = AutoConfig.from_pretrained(training_args.output_dir) 167 | model = RanMaskModel.from_pretrained(training_args.output_dir) 168 | # set params for ensemble inference 169 | model.tokenizer = tokenizer 170 | model.mask_percentage = model_args.infer_mask_percentage 171 | model.ensemble_num = model_args.ensemble_num 172 | model.ensemble_method = model_args.ensemble_method 173 | elif model_abbr == 'scrn': 174 | config = AutoConfig.from_pretrained(training_args.output_dir) 175 | model = SCRNModel(model_args.model_name_or_path, config=config) 176 | model.load_state_dict(torch.load(os.path.join(training_args.output_dir,'pytorch_model.bin'))) 177 | else: 178 | config = AutoConfig.from_pretrained(training_args.output_dir) 179 | model = AutoModelForSequenceClassification.from_pretrained(training_args.output_dir) 180 | trainer = trainer_fn( 181 | model=model, 182 | args=training_args, 183 | tokenizer=tokenizer, 184 | data_collator=data_collator, 185 | eval_dataset=test_dataset, 186 | compute_metrics=metrics_fn, 187 | ) 188 | predict_results = trainer.evaluate() 189 | trainer.save_metrics("predict", predict_results) 
190 | 191 | elif model_abbr in metric_based_model_list: 192 | DEVICE = 'cuda' 193 | START_DATE = datetime.datetime.now().strftime('%Y-%m-%d') 194 | START_TIME = datetime.datetime.now().strftime('%H-%M-%S-%f') 195 | 196 | # get generative model and set device 197 | # gpt-2 198 | base_model, base_tokenizer = load_base_model_and_tokenizer(model_args.metric_base_model_name_or_path) 199 | base_model.to(DEVICE) 200 | 201 | # build features 202 | 203 | def ll_criterion(text): return get_ll(text, base_model, base_tokenizer, DEVICE) 204 | 205 | def rank_criterion(text): return -get_rank(text, base_model, base_tokenizer, DEVICE, log=False) 206 | 207 | def logrank_criterion(text): return -get_rank(text, base_model, base_tokenizer, DEVICE, log=True) 208 | 209 | def entropy_criterion(text): return get_entropy(text, base_model, base_tokenizer, DEVICE) 210 | 211 | def GLTR_criterion(text): return get_rank_GLTR(text, base_model, base_tokenizer, DEVICE) 212 | 213 | outputs = [] 214 | data = raw_dataset 215 | if model_abbr == "Log-Likelihood": 216 | outputs.append(run_threshold_experiment(data, ll_criterion, "likelihood", logger=logger)) 217 | elif model_abbr == "Rank": 218 | outputs.append(run_threshold_experiment(data, rank_criterion, "rank", logger=logger)) 219 | elif model_abbr == "Log-Rank": 220 | outputs.append(run_threshold_experiment(data, logrank_criterion, "log_rank", logger=logger)) 221 | elif model_abbr == "Entropy": 222 | outputs.append(run_threshold_experiment(data, entropy_criterion, "entropy", logger=logger)) 223 | elif model_abbr == "GLTR": 224 | outputs.append(run_GLTR_experiment(data, GLTR_criterion, "rank_GLTR", logger=logger)) 225 | clf = outputs[0]['clf'] 226 | filename = training_args.output_dir + '/classifier.bin' 227 | pickle.dump(clf, open(filename, 'wb')) 228 | # save metrics 229 | test_metrics = {'eval_%s'%k:v for k, v in outputs[0]['general_test'].items()} 230 | file_name = training_args.output_dir + '/predict_results.json' 231 | json.dump(test_metrics, 
# Command-line entry point: attack a trained detector with a TextAttack recipe.
#
# The detector is read from ``--output_dir``; the last path component of that
# directory must look like ``<model_abbr>_<dataset_abbr>`` because both
# abbreviations are parsed out of it below.
parser = argparse.ArgumentParser()
parser.add_argument('--model_type', type=str, default="hf") # hf/sklearn
parser.add_argument('--ensemble_num', type=int, default=1)
parser.add_argument('--mask_percentage', type=float, default=0.30)
parser.add_argument('--transfer_dataset_abbr', type=str, default="self")
parser.add_argument('--num_examples', type=int, default=10)
parser.add_argument('--attack_class', type=str, default="ai")
parser.add_argument('--attack_recipe', type=str, default="pwws")
parser.add_argument('--data_files', type=str, default="./data_in")
parser.add_argument('--output_dir', type=str, default="./data_out")
parser.add_argument('--bert_name_or_path', type=str, default="bert-base-uncased")
parser.add_argument('--metric_base_model_name_or_path', type=str, default="gpt2")
parser.add_argument('--seed', type=int, default=2020)
parser.add_argument('--log_summary', type=str, default='yes')
args = parser.parse_args()


# Seed every RNG the attack may touch so that runs are repeatable.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)


# Raises ValueError when the directory name does not contain exactly one '_'.
model_abbr, dataset_abbr = args.output_dir.split('/')[-1].split('_')
if args.transfer_dataset_abbr != "self":
    # Transfer setting: attack on a dataset other than the training one.
    dataset_abbr = args.transfer_dataset_abbr
args.data_files = args.data_files + '/' + dataset_abbr
# dataset
dataset = load_attack_dataset(data_files=args.data_files, attack_class=args.attack_class)


if args.model_type == 'hf':
    # load config and tokenizer
    config = AutoConfig.from_pretrained(args.output_dir)
    tokenizer = AutoTokenizer.from_pretrained(
        args.output_dir,
        model_max_length=512,
        padding_side="right",
        use_fast=False,
    )
    # load model
    if model_abbr == 'scrn':
        model = SCRNModel(args.bert_name_or_path, config=config)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, 'pytorch_model.bin')))
    else:
        model = AutoModelForSequenceClassification.from_pretrained(args.output_dir, config=config)
    # select model_wrapper
    if args.ensemble_num > 1:
        model_wrapper = HuggingFaceModelMaskEnsembleWrapper(
            model,
            tokenizer,
            ensemble_num=args.ensemble_num,
            mask_percentage=args.mask_percentage,
        )
    else:
        model_wrapper = HuggingFaceModelWrapper(model, tokenizer)
elif args.model_type == 'sklearn':
    # Metric-based detector: a causal LM produces the feature, a pickled
    # scikit-learn classifier makes the decision.
    DEVICE = 'cuda'
    base_model, base_tokenizer = load_base_model_and_tokenizer(args.metric_base_model_name_or_path)
    base_model.to(DEVICE)
    tokenizer = CustomSklearnTokenizer(base_model, base_tokenizer, DEVICE, feature_fn=model_abbr)
    filename = args.output_dir + '/' + 'classifier.bin'
    # NOTE(review): unpickling executes code stored in the file; only load
    # classifiers that were produced by a trusted training run.
    with open(filename, 'rb') as clf_file:
        model = pickle.load(clf_file)
    model_wrapper = CustomSklearnModelWrapper(model, tokenizer)
else:
    raise ValueError('Unknown model type %s'%args.model_type)

# -1 means "attack every example in the dataset".
num_examples = len(dataset) if args.num_examples == -1 else args.num_examples

# Swap budget for the Pruthi recipe: one swap per 20 (space-separated) words
# of the average attacked text, clamped to the range [1, 10].
max_num_word_swaps = np.mean([len(x[0]['text'].split(' ')) for x in dataset][:num_examples]) // 20
if max_num_word_swaps >= 10:
    max_num_word_swaps = 10
elif max_num_word_swaps <= 1:
    max_num_word_swaps = 1

if args.attack_recipe == 'pwws': # word sub
    attack = PWWSRen2019.build(model_wrapper)
elif args.attack_recipe in ('pwwsTaip', 'pwwsThp'):
    # PWWS with a decision threshold read from the detector's saved metrics.
    # 'pwwsTaip': threshold with AI as the positive class,
    # 'pwwsThp':  threshold with human as the positive class.
    with open(f"{args.output_dir}/predict_results.json", "r") as fin:
        metrics = json.load(fin)
    threshold_prefix = 'eval_aip_threshold' if args.attack_recipe == 'pwwsTaip' else 'eval_hp_threshold'
    if args.attack_class == "ai":
        target_max_score = metrics[threshold_prefix + "_chatgpt"]
    elif args.attack_class == "human":
        target_max_score = metrics[threshold_prefix + "_human"]
    else:
        raise ValueError('Unknown attack class %s'%args.attack_class)
    attack = PWWSRen2019_threshold.build(model_wrapper, target_max_score=target_max_score)
elif args.attack_recipe == 'pruthi': # char sub delete insert etc
    attack = Pruthi2019.build(model_wrapper, max_num_word_swaps=max_num_word_swaps)
elif args.attack_recipe == 'deep-word-bug': # word sub, char sub, word del, word insert etc
    attack = DeepWordBugGao2018.build(model_wrapper)
else:
    raise ValueError('Unknown attack recipe %s'%args.attack_recipe)

attack_args = textattack.AttackArgs(
    num_examples=num_examples,
    log_to_csv='%s/attack_results_%s_%s_%s.csv'%(args.output_dir, dataset_abbr, args.attack_class, args.attack_recipe),
    csv_coloring_style='html',
)
attacker = Attacker(attack, dataset, attack_args)
results = attacker.attack_dataset()
if args.log_summary == 'yes':
    attacker.attack_log_manager.add_output_file(
        filename="%s/attack_summary_%s_%s_%s.log"%(args.output_dir, dataset_abbr, args.attack_class, args.attack_recipe),
        color_method="file",
    )
    attacker.attack_log_manager.log_summary()
class FloodingTrainer(Trainer):
    """Trainer whose loss is "flooded": it is reflected around a fixed level.

    The loss reported by the model is replaced by ``|loss - b| + b`` where
    ``b`` is :attr:`flood_level`, so the optimised value never drops below
    ``b``.
    """

    # Flood level ``b``. Subclass (or assign on the instance) to change it.
    flood_level = 0.15

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run ``model`` on ``inputs`` and return the flooded loss.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``.loss`` and ``.logits``.
            inputs: Keyword arguments forwarded to the model.
            return_outputs: When true, also return a
                ``SequenceClassifierOutput`` carrying the flooded loss and
                the logits.

        Returns:
            The flooded loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        model_outputs = model(**inputs)
        raw_loss, logits = model_outputs.loss, model_outputs.logits

        level = self.flood_level
        loss = (raw_loss - level).abs() + level

        outputs = SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )
        return (loss, outputs) if return_outputs else loss
# get the average rank of each observed token sorted by model likelihood
def get_rank(text, base_model, base_tokenizer, DEVICE, log=False):
    """Return the mean (log-)rank of the observed tokens of ``text``.

    For every position the logits of ``base_model`` are sorted in descending
    order and the 1-indexed position of the token that actually follows is
    taken as its rank.

    Args:
        text: Text to score.
        base_model: Model called with the tokenizer output; must return an
            object with ``.logits``.
        base_tokenizer: Tokenizer providing ``encode``, ``pad_token`` and
            ``__call__``.
        DEVICE: Device the tokenized input is moved to.
        log: Average ``log(rank)`` instead of ``rank``.

    Returns:
        The mean rank (or mean log-rank) as a Python float.
    """
    with torch.no_grad():
        # A text that encodes to a single token gets the pad token appended
        # (presumably so that at least one next-token target exists).
        if len(base_tokenizer.encode(text)) == 1:
            text += ' %s'%(base_tokenizer.pad_token)
        encoded = base_tokenizer(
            text,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(DEVICE)

        # Logits at step t are compared against the token at step t + 1.
        next_token_logits = base_model(**encoded).logits[:, :-1]
        targets = encoded.input_ids[:, 1:]

        # get rank of each label token in the model's likelihood ordering
        ordering = next_token_logits.argsort(-1, descending=True)
        hits = (ordering == targets.unsqueeze(-1)).nonzero()

        assert hits.shape[
            1] == 3, f"Expected 3 dimensions in matches tensor, got {hits.shape}"

        positions = hits[:, -2]
        zero_based_ranks = hits[:, -1]

        # make sure we got exactly one match for each timestep in the sequence
        expected_positions = torch.arange(len(positions)).to(positions.device)
        assert (positions == expected_positions).all(), "Expected one match per timestep"

        ranks = zero_based_ranks.float() + 1  # convert to 1-indexed rank
        if log:
            ranks = torch.log(ranks)

        return ranks.float().mean().item()
# get average entropy of each token in the text
def get_entropy(text, base_model, base_tokenizer, DEVICE):
    """Return the mean entropy of the model's next-token distributions.

    Args:
        text: Text to score.
        base_model: Model called with the tokenizer output; must return an
            object with ``.logits``.
        base_tokenizer: Tokenizer providing ``encode``, ``pad_token`` and
            ``__call__``.
        DEVICE: Device the tokenized input is moved to.

    Returns:
        Entropy (natural log) averaged over all positions except the last,
        as a Python float.
    """
    with torch.no_grad():
        # A text that encodes to a single token gets the pad token appended.
        if len(base_tokenizer.encode(text)) == 1:
            text += ' %s'%(base_tokenizer.pad_token)
        encoded = base_tokenizer(
            text,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(DEVICE)

        # The final position predicts past the end of the text; drop it.
        step_logits = base_model(**encoded).logits[:, :-1]

        probs = F.softmax(step_logits, dim=-1)
        log_probs = F.log_softmax(step_logits, dim=-1)
        per_step_neg_entropy = (probs * log_probs).sum(-1)
        return -per_step_neg_entropy.mean().item()
@timeit
def run_threshold_experiment_multiple_test_length(
        clf, data, criterion_fn, name,
        lengths=(10, 20, 50, 100, 200, 500, -1)):
    """Evaluate a fitted scalar-feature classifier on truncated test texts.

    For every entry of ``lengths`` each test text is cut with
    ``cut_length(text, length)``, scored by ``criterion_fn`` and classified
    by ``clf``. Texts whose score is NaN are dropped for that length.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        data: Mapping with ``data['test']['text']`` and
            ``data['test']['labels']``.
        criterion_fn: Callable mapping a text to a scalar score.
        name: Label used in the printed progress line.
        lengths: Maximum word counts to evaluate (``-1`` keeps the full
            text). An immutable tuple by default so the default can never be
            mutated between calls; any iterable of ints is accepted.

    Returns:
        Dict mapping each length to
        ``(accuracy, precision, recall, f1, auc)``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # The test split does not change between lengths: read it once.
    test_text = data['test']['text']
    test_label = data['test']['labels']

    res = {}
    for length in lengths:
        test_criterion = [
            criterion_fn(cut_length(text, length))
            for text in tqdm(test_text, desc="Test criterion")]
        x_test = np.array(test_criterion)
        y_test = np.array(test_label)

        # remove nan values
        select_test_index = ~np.isnan(x_test)
        x_test = x_test[select_test_index]
        y_test = y_test[select_test_index]
        # The classifier expects a 2-D (n_samples, 1) feature matrix.
        x_test = np.expand_dims(x_test, axis=-1)

        y_test_pred = clf.predict(x_test)
        # Keep only the probability of class index 1.
        y_test_pred_prob = [row[1] for row in clf.predict_proba(x_test)]
        acc_test, precision_test, recall_test, f1_test, auc_test = cal_metrics(
            y_test, y_test_pred, y_test_pred_prob)
        test_res = acc_test, precision_test, recall_test, f1_test, auc_test

        print(f"{name} {length} acc_test: {acc_test}, precision_test: {precision_test}, recall_test: {recall_test}, f1_test: {f1_test}, auc_test: {auc_test}")
        res[length] = test_res

    return res
@timeit
def run_GLTR_experiment_multiple_test_length(
        clf, data, criterion_fn, name,
        lengths=(10, 20, 50, 100, 200, 500, -1)):
    """Evaluate a fitted GLTR-feature classifier on truncated test texts.

    For every entry of ``lengths`` each test text is cut with
    ``cut_length(text, length)``, turned into a feature vector by
    ``criterion_fn`` and classified by ``clf``.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        data: Mapping with ``data['test']['text']`` and
            ``data['test']['labels']``.
        criterion_fn: Callable mapping a text to a feature vector.
        name: Label used in the printed progress line.
        lengths: Maximum word counts to evaluate (``-1`` keeps the full
            text). An immutable tuple by default so the default can never be
            mutated between calls; any iterable of ints is accepted.

    Returns:
        Dict mapping each length to
        ``(accuracy, precision, recall, f1, auc)``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # The test split does not change between lengths: read it once.
    test_text = data['test']['text']
    test_label = data['test']['labels']

    res = {}
    for length in lengths:
        test_criterion = [
            criterion_fn(cut_length(text, length))
            for text in tqdm(test_text, desc="Test criterion")]
        x_test = np.array(test_criterion)
        y_test = np.array(test_label)

        y_test_pred = clf.predict(x_test)
        # Keep only the probability of class index 1.
        y_test_pred_prob = [row[1] for row in clf.predict_proba(x_test)]
        acc_test, precision_test, recall_test, f1_test, auc_test = cal_metrics(
            y_test, y_test_pred, y_test_pred_prob)
        test_res = acc_test, precision_test, recall_test, f1_test, auc_test

        print(f"{name} {length} acc_test: {acc_test}, precision_test: {precision_test}, recall_test: {recall_test}, f1_test: {f1_test}, auc_test: {auc_test}")
        res[length] = test_res

    return res
def select_train_data(data, select_num=-1):
    """Keep only the first ``select_num`` training examples.

    Args:
        data: Mapping with ``data['train']['text']`` and
            ``data['train']['label']``.
        select_num: Number of leading examples to keep; ``-1`` keeps all.

    Returns:
        ``data`` itself. Unless ``select_num`` is ``-1`` its ``'train'``
        entry is replaced in place by a new dict with the truncated columns.
    """
    if select_num == -1:
        return data

    train = data['train']
    data['train'] = {
        'text': train['text'][:select_num],
        'label': train['label'][:select_num],
    }
    return data


def filter_test_data(data, max_length=25):
    """Drop test examples longer than ``max_length`` whitespace-split words.

    Args:
        data: Mapping with ``data['test']['text']`` and
            ``data['test']['label']``.
        max_length: Largest word count (inclusive) an example may have.

    Returns:
        ``data`` itself, with its ``'test'`` entry replaced in place.
    """
    new_test = {
        'text': [],
        'label': [],
    }
    test = data['test']
    for text, label in zip(test['text'], test['label']):
        if len(text.split()) <= max_length:
            new_test['text'].append(text)
            new_test['label'].append(label)
    data['test'] = new_test
    return data


def cut_length(text, max_length=-1):
    """Return the first ``max_length`` whitespace-separated words of ``text``.

    With ``max_length == -1`` the text is returned untouched; otherwise the
    kept words are re-joined with single spaces.
    """
    if max_length == -1:
        return text
    return " ".join(text.split()[:max_length])


def sample_dataset(data, num_train, num_test):
    """Truncate the train/test columns of ``data`` in place.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` entries, each holding
            ``'text'`` and ``'label'`` sequences.
        num_train: Number of leading training examples to keep.
        num_test: Number of leading test examples to keep.

    Returns:
        ``data`` itself.
    """
    for split, limit in (("train", num_train), ("test", num_test)):
        for column in ("text", "label"):
            data[split][column] = data[split][column][:limit]
    return data
def get_clf_results(x_train, y_train, x_test, y_test):
    """Fit a logistic-regression classifier and score it on both splits.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        ``(clf, train_res, test_res)`` where ``clf`` is the fitted
        ``LogisticRegression`` and the two results are whatever
        ``compute_metrics(labels, predictions, class_1_probabilities)``
        returns for the respective split.
    """
    clf = LogisticRegression(random_state=2020).fit(x_train, y_train)

    def _evaluate(features, labels):
        # Column 1 of predict_proba is the probability of class index 1;
        # slicing it avoids a Python-level loop over the rows.
        predictions = clf.predict(features)
        positive_prob = clf.predict_proba(features)[:, 1]
        return compute_metrics(labels, predictions, positive_prob)

    train_res = _evaluate(x_train, y_train)
    test_res = _evaluate(x_test, y_test)

    return clf, train_res, test_res
class RanMaskModel(RobertaPreTrainedModel):
    """RoBERTa classifier that predicts by ensembling randomly masked inputs.

    ``forward`` decodes the incoming ids back to text, asks
    ``ensemble_mask_tokens`` for masked variants of every text, classifies
    all variants and merges the per-variant logits into one row per input.

    ``tokenizer`` must be assigned after construction. The mask ratio is
    read from ``mask_percentage`` when that attribute has been set on the
    instance and falls back to ``infer_mask_percentage`` otherwise.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config)

        # Ensemble settings; callers are expected to overwrite them.
        self.tokenizer = None
        self.infer_mask_percentage = 0.05
        self.ensemble_num = 5
        self.ensemble_method = "votes"

        # Initialize weights and apply final processing
        self.post_init()

    # ensemble forward
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        Classify ``input_ids`` by voting/averaging over masked copies.

        Only ``input_ids``, ``labels`` and ``return_dict`` are used; the
        remaining arguments are accepted for interface compatibility and
        ignored, because the inputs are re-tokenized from the masked text.
        The merged logits are rebuilt from NumPy arrays, so no gradient
        flows through them.

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.tokenizer is None:
            raise ValueError("RanMaskModel.tokenizer must be set before calling forward()")

        # `__init__` only defines `infer_mask_percentage`; honour an
        # externally assigned `mask_percentage` but do not require it.
        mask_percentage = getattr(self, "mask_percentage", self.infer_mask_percentage)

        # ensemble infer
        input_strings = self.tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        ensemble_strings = ensemble_mask_tokens(
            input_strings,
            mask_percentage,
            self.ensemble_num,
            mask_token=self.tokenizer.mask_token,
        )
        model_device = input_ids.device
        batch_size = 32  # fixed mini-batch size for the masked copies
        ensemble_logits_list = []
        for start in range(0, len(ensemble_strings), batch_size):
            batch_ensemble_strings = ensemble_strings[start:start + batch_size]
            batch_inputs = self.tokenizer(batch_ensemble_strings, return_tensors="pt", padding=True, truncation=True)
            batch_inputs = {key: value.to(model_device) for key, value in batch_inputs.items()}

            outputs = self.roberta(**batch_inputs, return_dict=return_dict,)
            sequence_output = outputs[0]
            logits = self.classifier(sequence_output)

            ensemble_logits_list.append(logits)
        # detach(): .numpy() refuses tensors that still require grad.
        ensemble_logits = torch.cat(ensemble_logits_list, dim=0).detach().cpu().numpy()  # [bsz, label_num]
        # get ensembled logits: one equally sized group of rows per input
        ensemble_logits_for_each_input = np.split(ensemble_logits, indices_or_sections=len(input_strings), axis=0)
        logits_list = []

        for group_logits in ensemble_logits_for_each_input:
            if self.ensemble_method == 'votes':
                # Majority label over the copies, then the mean logits of
                # only those copies that voted for it.
                predicted = np.argmax(group_logits, axis=-1)
                voted_label = np.argmax(np.bincount(predicted, minlength=group_logits.shape[-1]))
                voted_logits_array = group_logits[np.where(predicted == voted_label)[0]]
                voted_logits = np.mean(voted_logits_array, axis=0, keepdims=True)  # [1, num_labels]
                logits_list.append(torch.from_numpy(voted_logits))
            else:
                avg_logits = np.mean(group_logits, axis=0, keepdims=True)
                logits_list.append(torch.from_numpy(avg_logits))

        logits = torch.cat(logits_list, dim=0).to(model_device)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            # NOTE(review): `outputs` is the encoder output of the LAST
            # mini-batch of masked copies only.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )
def compute_kl_loss(p, q, pad_mask=None):
    """Return the symmetric KL divergence between two sets of logits.

    Both directions are computed element-wise on the softmax over the last
    dimension, optionally zeroed where ``pad_mask`` is true, reduced with
    ``mean`` over all elements and then averaged.

    Args:
        p: Logits of the first forward pass.
        q: Logits of the second forward pass, same shape as ``p``.
        pad_mask: Optional boolean mask (broadcastable to the logits);
            masked positions contribute zero. Intended for seq-level tasks.

    Returns:
        Scalar tensor ``(mean(KL_p) + mean(KL_q)) / 2``.
    """
    log_p, log_q = F.log_softmax(p, dim=-1), F.log_softmax(q, dim=-1)
    prob_p, prob_q = F.softmax(p, dim=-1), F.softmax(q, dim=-1)

    elementwise = [
        F.kl_div(log_p, prob_q, reduction='none'),
        F.kl_div(log_q, prob_p, reduction='none'),
    ]

    # pad_mask is for seq-level tasks
    if pad_mask is not None:
        elementwise = [term.masked_fill(pad_mask, 0.) for term in elementwise]

    # You can choose whether to use function "sum" and "mean" depending on your task
    p_loss, q_loss = (term.mean() for term in elementwise)
    return (p_loss + q_loss) / 2
class Disentangle_Layer(nn.Module):
    """Encoder that maps a representation to two separate projections.

    The input goes through a Linear + ReLU "squeezer" and is then projected
    twice: to a ``latent_dim``-sized vector and to a single scalar per
    position.
    """

    def __init__(self, input_dim = 768, latent_dim = 64, hidden_dim = 512):
        super(Disentangle_Layer, self).__init__()
        self.input_dim, self.latent_dim, self.hidden_dim = input_dim, latent_dim, hidden_dim

        # Sub-modules are created in a fixed order so that seeded
        # initialisation stays reproducible.
        squeeze_stack = [nn.Linear(self.input_dim, self.hidden_dim), nn.ReLU()]
        self.squeezer = nn.ModuleList(squeeze_stack)
        self.semantic_proj = nn.Linear(self.hidden_dim, self.latent_dim)
        self.perturbation_proj = nn.Linear(self.hidden_dim, 1)

    def forward(self, input):
        """Return ``(semantic_rep, perturbation_log_rep)`` for ``input``.

        ``input`` has ``input_dim`` features in its last dimension; the two
        outputs have ``latent_dim`` and ``1`` features respectively.
        """
        hidden = input
        for module in self.squeezer:
            hidden = module(hidden)  # [B, T, D]

        semantic_rep = self.semantic_proj(hidden)
        perturbation_log_rep = self.perturbation_proj(hidden)
        return semantic_rep, perturbation_log_rep
class Calibrator(nn.Module):
    """KL-divergence consistency loss between two sets of logits.

    Both inputs are turned into distributions with a softmax over the last
    dimension. The loss is reduced with ``batchmean`` (sum over all
    elements divided by the size of the first dimension).

    Args:
        symmetry: If True, return the average of KL(q || p) and KL(p || q).
            If False, return only the first term, KL(q || p), which is the
            same term the symmetric branch starts with.
    """

    def __init__(self, symmetry=True):
        super().__init__()
        self.symmetry = symmetry
        self.kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=False)

    def forward(self, logits_p, logits_q):
        """Compute the calibration loss for two logit tensors of equal shape.

        Args:
            logits_p: Unnormalised scores of the first branch.
            logits_q: Unnormalised scores of the second branch.

        Returns:
            A scalar tensor; zero when both branches yield the same
            distribution.
        """
        # nn.KLDivLoss(input, target) expects `input` as LOG-probabilities
        # and, with log_target=False, `target` as plain probabilities.
        log_dist_p = F.log_softmax(logits_p, dim=-1)
        dist_q = F.softmax(logits_q, dim=-1)
        forward_kl = self.kl_loss(log_dist_p, dist_q)

        if not self.symmetry:
            # The previous code passed probabilities (not log-probabilities)
            # as the input here, which does not compute a KL divergence and
            # is non-zero even for identical logits.
            return forward_kl

        log_dist_q = F.log_softmax(logits_q, dim=-1)
        dist_p = F.softmax(logits_p, dim=-1)
        backward_kl = self.kl_loss(log_dist_q, dist_p)
        return 0.5 * (forward_kl + backward_kl)
def set_logger(training_args):
    """Configure root logging and return this module's logger.

    Log records go both to ``<output_dir>/train.log`` (opened in write
    mode, so an existing file is truncated) and to the console stream.
    Processes whose ``local_rank`` is -1 or 0 log at INFO level; all other
    ranks log at WARN level.

    Args:
        training_args: Object exposing ``local_rank``, ``output_dir``,
            ``device``, ``n_gpu`` and ``fp16``.

    Returns:
        The ``logging.Logger`` named after this module.
    """
    logger = logging.getLogger(__name__)

    on_main_process = training_args.local_rank in [-1, 0]
    log_level = logging.INFO if on_main_process else logging.WARN

    file_handler = logging.FileHandler(
        training_args.output_dir + "/train.log", 'w', encoding='utf-8'
    )
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=log_level,
        handlers=[file_handler, console_handler],
    )

    # Emitted at WARNING so that the summary is visible on every rank,
    # including those restricted to the WARN level above.
    summary_args = (
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        *summary_args,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    return logger
def find_thres(fpr, thresholds, target_fpr=0.01):
    """Pick the operating point at or just below a target false-positive rate.

    Walks the curve from its first point and stops at the last index of the
    leading run whose false-positive rate does not exceed ``target_fpr``.
    ``compute_metrics`` in this file feeds it the ``fpr`` / ``thresholds``
    arrays returned by ``sklearn.metrics.roc_curve``.

    Args:
        fpr: Indexable sequence of false-positive rates, in curve order.
        thresholds: Indexable sequence of thresholds aligned with ``fpr``.
        target_fpr: Largest false-positive rate that is acceptable.

    Returns:
        ``{'fpr': ..., 'threshold': ...}`` for the selected index. The first
        point is returned when the second point already exceeds the target.

    Raises:
        IndexError: If ``fpr`` or ``thresholds`` is empty.
    """
    last = len(fpr) - 1
    idx = 0
    # Stop at the final point instead of indexing past the end when every
    # remaining rate is within the target (e.g. target_fpr >= 1.0 or a
    # single-point curve); the unbounded loop raised IndexError there.
    while idx < last and fpr[idx + 1] <= target_fpr:
        idx += 1
    return {'fpr': fpr[idx], 'threshold': thresholds[idx]}
def ensemble_mask_tokens(strings, mask_percentage=0.3, ensemble_num=3, mask_token=''):
    """Produce several randomly masked copies of every input string.

    Each string is split on whitespace; ``int(num_tokens * mask_percentage)``
    token positions are drawn without replacement via ``np.random.choice``
    and replaced by ``mask_token``; the tokens are re-joined with single
    spaces. This is repeated ``ensemble_num`` times per string with a fresh
    draw each time.

    Args:
        strings (list[str]): Texts to mask.
        mask_percentage (float): Fraction of tokens to replace per copy.
        ensemble_num (int): Number of masked copies per input string.
        mask_token (str): Replacement text for a masked token.

    Returns:
        list[str]: ``len(strings) * ensemble_num`` strings; the copies of
        one input are adjacent and inputs keep their order.
    """
    masked_strings = []
    for string in strings:
        # Tokenisation and the mask count do not change between ensemble
        # members, so compute them once per string.
        tokens = string.split()
        num_tokens = len(tokens)
        num_masked_tokens = int(num_tokens * mask_percentage)

        for _ in range(ensemble_num):
            # Same call, same arguments and same number of calls as before,
            # so results under a fixed np.random seed are unchanged.
            drawn = np.random.choice(num_tokens, num_masked_tokens, replace=False)
            masked_indices = set(drawn.tolist())
            # Plain-Python substitution avoids building NumPy string arrays
            # (and relying on their dtype promotion for empty inputs).
            masked_strings.append(' '.join(
                mask_token if position in masked_indices else token
                for position, token in enumerate(tokens)
            ))
    return masked_strings