├── README.md
├── arguments.py
├── attack.sh
├── attack
    ├── __init__.py
    ├── attack_recipe.py
    ├── custom_dataset.py
    ├── ranmask_wrapper.py
    ├── sklearn_utils.py
    └── utils.py
├── data_in.zip
├── main.py
├── run_attack.py
├── train.sh
└── utils
    ├── __init__.py
    ├── flooding_model.py
    ├── metric_based.py
    ├── metric_utils.py
    ├── ranmask_model.py
    ├── rdrop.py
    ├── scrn_model.py
    └── utils.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Robust-AIGC-Detector
 2 | 
 3 | Code for ACL 2024 long paper: Are AI-Generated Text Detectors Robust to Adversarial Perturbations?
 4 | 
 5 | ### Environments
 6 | 
 7 | ```bash
 8 | torch==1.11.0
 9 | transformers==4.30.2
10 | textattack==0.3.9 
11 | tensorflow==2.9.1 
12 | tensorflow_hub==0.15.0
13 | ```
14 | 
15 | 
16 | ### Data Preparation
17 | 
18 | ```bash
19 | unzip data_in.zip
20 | mkdir data_out
21 | ```
22 | 
23 | ### Training
24 | ```bash
25 | $ bash train.sh
26 | ```
27 | 
28 | ### Checkpoints
29 | The checkpoints of the in-domain, cross-domain, and cross-genre detectors can be found at <https://huggingface.co/CarlanLark/AIGT-detector-in-domain>. (These detectors are trained on the same training set and evaluated on different test sets.)
30 | 
31 | The checkpoint of the mixed-source detector can be found at <https://huggingface.co/CarlanLark/AIGT-detector-mixed-source>.
32 | 
33 | ### Robustness Evaluation
34 | ```bash
35 | $ bash attack.sh
36 | ```
37 | 
38 | ### Citation
39 | If you find our work useful for your research, please cite the paper below:
40 | ```bibtex
41 | @article{huang2024ai,
42 |   title={Are AI-Generated Text Detectors Robust to Adversarial Perturbations?},
43 |   author={Huang, Guanhua and Zhang, Yuchen and Li, Zhe and You, Yongjian and Wang, Mingze and Yang, Zhouwang},
44 |   journal={arXiv preprint arXiv:2406.01179},
45 |   year={2024}
46 | }
47 | ```


--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
 1 | import typing
 2 | import transformers
 3 | from typing import Optional
 4 | from dataclasses import dataclass, field
 5 | from transformers import TrainingArguments as OriginalTrainingArguments
 6 | 
 7 | 
 8 | @dataclass
 9 | class ModelArguments:
10 |     """
11 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
12 |     """
13 | 
14 |     model: str = field(
15 |         default="BERT",
16 |         metadata={"help": "Model name (BERT, BART, ALBERT, ... )"}
17 |     )
18 | 
19 |     model_name_or_path: str = field(
20 |         default=None,
21 |         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
22 |     )
23 | 
24 |     metric_base_model_name_or_path: str = field(
25 |         default='gpt2',
26 |         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
27 |     )
28 | 
29 |     config_name: Optional[str] = field(
30 |         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
31 |     )
32 | 
33 |     tokenizer_name: Optional[str] = field(
34 |         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
35 |     )
36 | 
37 |     cache_dir: Optional[str] = field(
38 |         default=".cache", metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
39 |     )
40 | 
41 |     use_auth_token: bool = field(
42 |         default=False,
43 |         metadata={
44 |             "help": (
45 |                 "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
46 |                 "with private models)."
47 |             )
48 |         },
49 |     )
50 |     train_mask_percentage: float = field(default=0.3, metadata={"help": "RanMask train mask rate."})
51 |     infer_mask_percentage: float = field(default=0.3, metadata={"help": "RanMask inference mask rate."})
52 |     ensemble_num: float = field(default=100, metadata={"help": "RanMask inference ensemble number."})
53 |     ensemble_method: str = field(default="votes", metadata={"help": "RanMask inference ensemble method."})
54 | 
55 | 
@dataclass
class DataTrainingArguments:
    """
    Arguments describing the data that is fed to the model for training and eval.
    """

    task_name: str = field(
        default="classification",
        metadata={"help": "The name of the task"},
    )
    data_files: str = field(
        default="data_in",
        metadata={"help": "Should contain the data files for the task."},
    )
    num_labels: int = field(
        default=2,
        metadata={"help": "The number of labels on dataset"},
    )
    max_seq_length: int = field(
        default=512,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
79 | 
80 | 
@dataclass
class TrainingArguments(OriginalTrainingArguments):
    """Project defaults layered on top of ``transformers.TrainingArguments``.

    Only default values are overridden here; every field keeps the name it has
    on the parent class, so the usual command-line flags still apply.
    """

    # Training runs unless explicitly disabled; prediction is opt-in.
    do_train: bool = field(default=True, metadata={"help": "Whether to run training."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    # Checkpoints and logs both default to the same "data_out" directory.
    output_dir: str = field(
        default="data_out",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    logging_dir: Optional[str] = field(default="data_out", metadata={"help": "Tensorboard log dir."})
    eval_delay: Optional[float] = 0
    # Evaluation, checkpointing and logging all use the 'epoch' interval strategy.
    evaluation_strategy: typing.Union[transformers.trainer_utils.IntervalStrategy, str] = 'epoch'
    save_strategy: typing.Union[transformers.trainer_utils.IntervalStrategy, str] = 'epoch'
    logging_strategy: typing.Union[transformers.trainer_utils.IntervalStrategy, str] = 'epoch'
    lr_scheduler_type: typing.Union[transformers.trainer_utils.SchedulerType, str] = 'linear'


--------------------------------------------------------------------------------
/attack.sh:
--------------------------------------------------------------------------------
 1 | export OUTPUT_DIR=./data_out/scrn_in-domain # path/model_dataset
 2 | export MODEL_TYPE=hf
 3 | export BASE_MODEL=roberta-base
 4 | export NUM_EXAMPLES=200
 5 | export ENSEMBLE_NUM=1
 6 | export MASK_PERCENTAGE=0.30
 7 | export TRANSFER_DATASET_ABBR=self
 8 | export ATTACK_CLASS=ai # [human, ai]
 9 | export ATTACK_RECIPE=deep-word-bug # [pwws, deep-word-bug, pruthi]
10 | 
11 | python3 -u run_attack.py \
12 | --model_type ${MODEL_TYPE} \
13 | --bert_name_or_path ${BASE_MODEL} \
14 | --metric_base_model_name_or_path gpt2 \
15 | --attack_class ${ATTACK_CLASS} \
16 | --attack_recipe ${ATTACK_RECIPE} \
17 | --transfer_dataset_abbr ${TRANSFER_DATASET_ABBR} \
18 | --output_dir ${OUTPUT_DIR} \
19 | --num_examples ${NUM_EXAMPLES} \
20 | --ensemble_num ${ENSEMBLE_NUM} \
21 | --mask_percentage ${MASK_PERCENTAGE} \


--------------------------------------------------------------------------------
/attack/__init__.py:
--------------------------------------------------------------------------------
1 | from .ranmask_wrapper import HuggingFaceModelMaskEnsembleWrapper


--------------------------------------------------------------------------------
/attack/attack_recipe.py:
--------------------------------------------------------------------------------
 1 | from textattack import Attack
 2 | from textattack.constraints.pre_transformation import (
 3 |     RepeatModification,
 4 |     StopwordModification,
 5 | )
 6 | from textattack.constraints.pre_transformation import MaxModificationRate
 7 | from textattack.constraints.overlap import MaxWordsPerturbed
 8 | from textattack.goal_functions import InputReduction, UntargetedClassification
 9 | from textattack.search_methods import GreedyWordSwapWIR
10 | from textattack.transformations import WordDeletion, WordSwapWordNet
11 | 
12 | from textattack.attack_recipes import AttackRecipe
13 | 
14 | 
class InputReductionFeng2018(AttackRecipe):
    """Input-reduction attack recipe.

    Feng, Wallace, Grissom, Iyyer, Rodriguez, Boyd-Graber. (2018).
    Pathologies of Neural Models Make Interpretations Difficult.

    https://arxiv.org/abs/1804.07781
    """

    @staticmethod
    def build(model_wrapper):
        """Assemble the ``Attack`` for ``model_wrapper``.

        The paper measures a word's importance by "the change in the
        confidence of the original prediction when we remove that word from
        the sentence", and then studies "how the model behaves when the
        supposedly unimportant words are removed".
        """
        # Candidate perturbations are produced by deleting words.
        word_removal = WordDeletion()

        # Pre-transformation constraints applied to every candidate.
        pre_constraints = [RepeatModification(), StopwordModification()]

        # Input-reduction goal, built with ``maximizable=True``.
        reduction_goal = InputReduction(model_wrapper, maximizable=True)

        # Greedy search ordered by deletion-based word importance.
        importance_search = GreedyWordSwapWIR(wir_method="delete")

        return Attack(reduction_goal, pre_constraints, word_removal, importance_search)
47 | 
class PWWSRen2019_threshold(AttackRecipe):
    """PWWS-style recipe whose goal function accepts a score threshold."""

    @staticmethod
    def build(model_wrapper, target_max_score=None):
        """Assemble the ``Attack``; ``target_max_score`` is forwarded to the goal function."""
        wordnet_swap = WordSwapWordNet()
        pre_constraints = [RepeatModification(), StopwordModification()]
        untargeted_goal = UntargetedClassification(
            model_wrapper,
            target_max_score=target_max_score,
        )
        # Words are searched by a combination of their saliency score and how
        # efficient the WordSwap transform is.
        saliency_search = GreedyWordSwapWIR(wir_method="weighted-saliency")
        return Attack(untargeted_goal, pre_constraints, wordnet_swap, saliency_search)


--------------------------------------------------------------------------------
/attack/custom_dataset.py:
--------------------------------------------------------------------------------
 1 | import textattack
 2 | import pandas as pd
 3 | import numpy as np
 4 | import json
 5 | import random
 6 | from datasets import load_dataset
 7 | 
 8 | def default_load_json(json_file_path, encoding='utf-8', **kwargs):
 9 |     with open(json_file_path, 'r', encoding=encoding) as fin:
10 |         tmp_json = json.load(fin, **kwargs)
11 |     return tmp_json
12 | 
def dump_jsonline(json_file_path, data, encoding="utf-8"):
    """Write ``data`` to ``json_file_path`` in JSON Lines format.

    Args:
        json_file_path: destination path; an existing file is overwritten.
        data: iterable of JSON-serialisable objects, one per output line.
        encoding: text encoding of the output file.

    Returns:
        0 once every record has been written.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is
    # needed afterwards.
    with open(json_file_path, "wt", encoding=encoding) as fout:
        for ins in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            fout.write(f"{json.dumps(ins, ensure_ascii=False)}\n")
    return 0
19 | 
def load_attack_dataset(data_files, attack_class='ai'):
    """Build the TextAttack dataset of test examples belonging to one class.

    Args:
        data_files: directory holding ``train.json`` and ``test.json``; its
            last path component must be one of the known dataset names.
        attack_class: ``'ai'`` keeps examples labelled 1, ``'human'`` keeps
            examples labelled 0.

    Returns:
        ``textattack.datasets.Dataset`` of ``(text, label)`` pairs taken from
        the test split.

    Raises:
        ValueError: if the dataset name or the attack class is not recognised.
    """
    known_datasets = ("in-domain", "cross-domain", "cross-genre", "mixed-source")
    label_by_class = {'human': 0, 'ai': 1}

    # Tolerate a trailing slash so "data_in/in-domain/" resolves like
    # "data_in/in-domain".
    data_root = data_files.rstrip('/')
    dataset_abbr = data_root.split('/')[-1]

    # Validate both arguments before touching the disk; each error message
    # names the argument that is actually wrong.
    if dataset_abbr not in known_datasets:
        raise ValueError('Dataset not exist: %s' % data_files)
    if attack_class not in label_by_class:
        raise ValueError('Attack class not exist: %s' % attack_class)

    # these datasets have been shuffled in train/test split
    data = load_dataset(
        'json',
        data_files={
            "train": data_root + "/train.json",
            "test": data_root + "/test.json",
        },
    )["test"]

    target_label = label_by_class[attack_class]
    dataset = [(x['text'], x['labels']) for x in data if x['labels'] == target_label]

    return textattack.datasets.Dataset(dataset)


--------------------------------------------------------------------------------
/attack/ranmask_wrapper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | HuggingFace Model Wrapper
  3 | --------------------------
  4 | """
  5 | import os
  6 | import torch
  7 | import transformers
  8 | 
  9 | import textattack
 10 | import numpy as np
 11 | from textattack.models.wrappers import PyTorchModelWrapper
 12 | 
 13 | from typing import List, Tuple
 14 | from scipy.special import softmax
 15 | from sklearn.preprocessing import normalize
 16 | from torch import nn as nn
 17 | from transformers import PreTrainedTokenizer, AutoModelForMaskedLM, RobertaTokenizer
 18 | 
 19 | class HuggingFaceModelMaskEnsembleWrapper(PyTorchModelWrapper):
 20 |     """Loads a HuggingFace ``transformers`` model and tokenizer."""
 21 |     def __init__(self, model, tokenizer, mask_percentage=0.30, ensemble_num=3, ensemble_method="vote", batch_size=32):
 22 |         self.model = model
 23 |         self.tokenizer = tokenizer
 24 |         self.mask_percentage = mask_percentage
 25 |         self.ensemble_num = ensemble_num
 26 |         self.batch_size = batch_size
 27 |         self.ensemble_method = ensemble_method
 28 | 
 29 |     def __call__(self, text_input_list):
 30 |         """Passes inputs to HuggingFace models as keyword arguments.
 31 | 
 32 |         (Regular PyTorch ``nn.Module`` models typically take inputs as
 33 |         positional arguments.)
 34 |         """
 35 |         # Default max length is set to be int(1e30), so we force 512 to enable batching.
 36 |         max_length = (
 37 |             512
 38 |             if self.tokenizer.model_max_length == int(1e30)
 39 |             else self.tokenizer.model_max_length
 40 |         )
 41 |         # start ensemble
 42 |         ensemble_mask_text_input_list = self.ensemble_mask_tokens(text_input_list, 
 43 |                                                                     mask_percentage=self.mask_percentage,
 44 |                                                                     ensemble_num=self.ensemble_num, 
 45 |                                                                     mask_token=self.tokenizer.mask_token)
 46 |         outputs_list = []
 47 |         i = 0
 48 |         while i < len(ensemble_mask_text_input_list):
 49 |             batched_text_input_list = ensemble_mask_text_input_list[i : i + self.batch_size]
 50 |             inputs_dict = self.tokenizer(
 51 |                 batched_text_input_list,
 52 |                 add_special_tokens=True,
 53 |                 padding="max_length",
 54 |                 max_length=max_length,
 55 |                 truncation=True,
 56 |                 return_tensors="pt",
 57 |             )
 58 |             model_device = next(self.model.parameters()).device
 59 |             inputs_dict.to(model_device)
 60 | 
 61 |             with torch.no_grad():
 62 |                 outputs = self.model(**inputs_dict)
 63 | 
 64 |             if isinstance(outputs[0], str):
 65 |                 # HuggingFace sequence-to-sequence models return a list of
 66 |                 # string predictions as output. In this case, return the full
 67 |                 # list of outputs.
 68 |                 outputs_list.append(outputs)
 69 |             else:
 70 |                 # HuggingFace classification models return a tuple as output
 71 |                 # where the first item in the tuple corresponds to the list of
 72 |                 # scores for each input.
 73 |                 outputs_list.append(outputs.logits) 
 74 |             i += self.batch_size
 75 |         # logits ensemble
 76 |         output_logits = torch.cat(outputs_list, dim=0).cpu().numpy() #[bsz, label_num]
 77 |         ensemble_logits_for_each_input = np.split(output_logits, indices_or_sections=len(text_input_list), axis=0)
 78 |         logits_list = []
 79 |         for logits in ensemble_logits_for_each_input:
 80 |             if self.ensemble_method == 'votes':
 81 |                 voted_label = np.argmax(np.bincount(np.argmax(logits, axis=-1), minlength=logits.shape[-1]))
 82 |                 voted_logits_array = logits[np.where(np.argmax(logits, axis=-1)==voted_label)[0]]
 83 |                 voted_logits = np.mean(voted_logits_array, axis=0, keepdims=True) #[1, num_labels]
 84 |                 logits_list.append(torch.from_numpy(voted_logits))
 85 |             else:
 86 |                 avg_logits = np.mean(logits, axis=0, keepdims=True)
 87 |                 logits_list.append(torch.from_numpy(avg_logits))
 88 |         return torch.cat(logits_list, dim=0).to(model_device)
 89 | 
 90 |     def get_grad(self, text_input):
 91 |         """Get gradient of loss with respect to input tokens.
 92 | 
 93 |         Args:
 94 |             text_input (str): input string
 95 |         Returns:
 96 |             Dict of ids, tokens, and gradient as numpy array.
 97 |         """
 98 |         if isinstance(self.model, textattack.models.helpers.T5ForTextToText):
 99 |             raise NotImplementedError(
100 |                 "`get_grads` for T5FotTextToText has not been implemented yet."
101 |             )
102 | 
103 |         self.model.train()
104 |         embedding_layer = self.model.get_input_embeddings()
105 |         original_state = embedding_layer.weight.requires_grad
106 |         embedding_layer.weight.requires_grad = True
107 | 
108 |         emb_grads = []
109 | 
110 |         def grad_hook(module, grad_in, grad_out):
111 |             emb_grads.append(grad_out[0])
112 | 
113 |         emb_hook = embedding_layer.register_backward_hook(grad_hook)
114 | 
115 |         self.model.zero_grad()
116 |         model_device = next(self.model.parameters()).device
117 |         input_dict = self.tokenizer(
118 |             [text_input],
119 |             add_special_tokens=True,
120 |             return_tensors="pt",
121 |             padding="max_length",
122 |             truncation=True,
123 |         )
124 |         input_dict.to(model_device)
125 |         predictions = self.model(**input_dict).logits
126 | 
127 |         try:
128 |             labels = predictions.argmax(dim=1)
129 |             loss = self.model(**input_dict, labels=labels)[0]
130 |         except TypeError:
131 |             raise TypeError(
132 |                 f"{type(self.model)} class does not take in `labels` to calculate loss. "
133 |                 "One cause for this might be if you instantiatedyour model using `transformer.AutoModel` "
134 |                 "(instead of `transformers.AutoModelForSequenceClassification`)."
135 |             )
136 | 
137 |         loss.backward()
138 | 
139 |         # grad w.r.t to word embeddings
140 |         grad = emb_grads[0][0].cpu().numpy()
141 | 
142 |         embedding_layer.weight.requires_grad = original_state
143 |         emb_hook.remove()
144 |         self.model.eval()
145 | 
146 |         output = {"ids": input_dict["input_ids"], "gradient": grad}
147 | 
148 |         return output
149 | 
150 | 
151 |     def _tokenize(self, inputs):
152 |         """Helper method that for `tokenize`
153 |         Args:
154 |             inputs (list[str]): list of input strings
155 |         Returns:
156 |             tokens (list[list[str]]): List of list of tokens as strings
157 |         """
158 |         return [
159 |             self.tokenizer.convert_ids_to_tokens(
160 |                 self.tokenizer([x], truncation=True)["input_ids"][0]
161 |             )
162 |             for x in inputs
163 |         ]
164 | 
165 |     def ensemble_mask_tokens(self, strings, mask_percentage=0.3, ensemble_num=3, mask_token='<mask>'):
166 |         """
167 |         strings: (list[str]): List of strings
168 |         Returns: (list[str]): List of strings
169 |         """
170 |         masked_strings = []
171 |         for string in strings:
172 |             for iter_idx in range(ensemble_num):
173 |                 tokens = np.array(string.split())
174 |                 num_tokens = len(tokens)
175 |                 num_masked_tokens = int(num_tokens * mask_percentage)
176 | 
177 |                 masked_indices = np.random.choice(num_tokens, num_masked_tokens, replace=False)
178 |                 masked_tokens = np.where(np.isin(np.arange(num_tokens), masked_indices), mask_token, tokens).tolist()
179 |                 masked_string = ' '.join(masked_tokens)
180 |                 masked_strings.append(masked_string)
181 |         return masked_strings


--------------------------------------------------------------------------------
/attack/sklearn_utils.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import textattack
 3 | from textattack.models.wrappers import SklearnModelWrapper
 4 | import numpy as np
 5 | from sklearn.linear_model import LogisticRegression
 6 | import sys
 7 | sys.path.append("..")
 8 | from utils.metric_based import get_ll, get_rank, get_entropy, get_rank_GLTR
 9 | from utils.metric_utils import cut_length
10 | from transformers import AutoTokenizer, AutoModelForCausalLM
11 | 
class CustomSklearnModelWrapper(SklearnModelWrapper):
    """
    ``SklearnModelWrapper`` subclass that pairs a classifier with a feature extractor.

    ``tokenizer`` must offer ``transform(list_of_texts)`` and ``model`` must
    offer ``predict_proba(features)``.
    """

    def __init__(self, model, tokenizer):
        self.model, self.tokenizer = model, tokenizer

    def __call__(self, text_input_list, batch_size=None):
        """Return ``model.predict_proba`` of the transformed texts; ``batch_size`` is unused."""
        features = self.tokenizer.transform(text_input_list)
        probabilities = self.model.predict_proba(features)
        return probabilities

    def get_grad(self, text_input):
        """Gradients are not implemented for this wrapper."""
        raise NotImplementedError()
27 | 
class CustomSklearnTokenizer(object):
    """Turns raw texts into metric-based feature arrays.

    ``feature_fn`` selects the metric: ``'Log-Likelihood'``, ``'Rank'``,
    ``'Log-Rank'``, ``'Entropy'`` or ``'GLTR'``. Every text is passed through
    ``cut_length(text, max_length)`` before the metric is computed.
    """

    def __init__(self, base_model, base_tokenizer, device, feature_fn='Log-Likelihood', max_length=512):
        self.base_model, self.base_tokenizer = base_model, base_tokenizer
        self.device = device
        self.feature_fn = feature_fn
        self.max_length = max_length

    def transform(self, text_list):
        """Compute the selected feature for every text in ``text_list``.

        Returns:
            A numpy array of the per-text features. For every metric except
            ``'GLTR'`` a trailing axis of size one is appended.
        Raises:
            ValueError: if ``feature_fn`` is not a supported metric name.
        """
        scoring = (self.base_model, self.base_tokenizer, self.device)
        # The lambdas defer the metric call until a text is actually scored.
        extractors = {
            'Log-Likelihood': lambda clipped: get_ll(clipped, *scoring),
            'Rank': lambda clipped: -get_rank(clipped, *scoring, log=False),
            'Log-Rank': lambda clipped: -get_rank(clipped, *scoring, log=True),
            'Entropy': lambda clipped: get_entropy(clipped, *scoring),
            'GLTR': lambda clipped: get_rank_GLTR(clipped, *scoring),
        }
        if self.feature_fn not in extractors:
            raise ValueError("Invalid feature function")

        extract = extractors[self.feature_fn]
        features = np.array([extract(cut_length(text, self.max_length)) for text in text_list])

        single_value_metrics = ("Log-Likelihood", "Rank", "Log-Rank", "Entropy")
        if self.feature_fn in single_value_metrics:
            features = np.expand_dims(features, axis=-1)
        return features


--------------------------------------------------------------------------------
/attack/utils.py:
--------------------------------------------------------------------------------
 1 | from textattack.metrics.attack_metrics import (
 2 |     AttackQueries,
 3 |     AttackSuccessRate,
 4 |     WordsPerturbed,
 5 | )
 6 | 
 7 | def log_summary(results):
 8 |     total_attacks = len(results)
 9 |     if total_attacks == 0:
10 |         return
11 | 
12 |     # Default metrics - calculated on every attack
13 |     attack_success_stats = AttackSuccessRate().calculate(results)
14 |     words_perturbed_stats = WordsPerturbed().calculate(results)
15 |     attack_query_stats = AttackQueries().calculate(results)
16 | 
17 |     # @TODO generate this table based on user input - each column in specific class
18 |     # Example to demonstrate:
19 |     # summary_table_rows = attack_success_stats.display_row() + words_perturbed_stats.display_row() + ...
20 |     summary_table_rows = [
21 |         [
22 |             "Number of successful attacks:",
23 |             attack_success_stats["successful_attacks"],
24 |         ],
25 |         ["Number of failed attacks:", attack_success_stats["failed_attacks"]],
26 |         ["Number of skipped attacks:", attack_success_stats["skipped_attacks"]],
27 |         [
28 |             "Original accuracy:",
29 |             str(attack_success_stats["original_accuracy"]) + "%",
30 |         ],
31 |         [
32 |             "Accuracy under attack:",
33 |             str(attack_success_stats["attack_accuracy_perc"]) + "%",
34 |         ],
35 |         [
36 |             "Attack success rate:",
37 |             str(attack_success_stats["attack_success_rate"]) + "%",
38 |         ],
39 |         [
40 |             "Average perturbed word %:",
41 |             str(words_perturbed_stats["avg_word_perturbed_perc"]) + "%",
42 |         ],
43 |         [
44 |             "Average num. words per input:",
45 |             words_perturbed_stats["avg_word_perturbed"],
46 |         ],
47 |     ]
48 | 
49 |     summary_table_rows.append(
50 |         ["Avg num queries:", attack_query_stats["avg_num_queries"]]
51 |     )
52 | 
53 |     for metric_name, metric in self.metrics.items():
54 |         summary_table_rows.append([metric_name, metric.calculate(self.results)])
55 | 
56 |     self.log_summary_rows(
57 |         summary_table_rows, "Attack Results", "attack_results_summary"
58 |     )
59 | 


--------------------------------------------------------------------------------
/data_in.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarlanLark/Robust-AIGC-Detector/10989242d09ad46fa49592928c7490460d744ebd/data_in.zip


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | from utils.utils import set_logger, path_checker, metrics_fn, compute_metrics
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | import random
  6 | import pickle
  7 | import datetime
  8 | import json
  9 | 
 10 | from transformers import (AutoConfig, AutoModelForSequenceClassification, Trainer, HfArgumentParser, set_seed, 
 11 | AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForCausalLM)
 12 | 
 13 | from arguments import ModelArguments, DataTrainingArguments, TrainingArguments
 14 | from datasets import load_dataset
 15 | from utils.scrn_model import SCRNModel, SCRNTrainer
 16 | from utils.metric_based import get_ll, get_rank, get_entropy, get_rank_GLTR, run_threshold_experiment, run_GLTR_experiment
 17 | from utils.metric_utils import load_base_model_and_tokenizer
 18 | from utils.flooding_model import FloodingTrainer
 19 | from utils.rdrop import RDropTrainer
 20 | from utils.ranmask_model import RanMaskModel
 21 | from utils.utils import mask_tokens
 22 | 
 23 | import wandb
 24 | import os
 25 | 
 26 | os.environ["WANDB_MODE"] = "offline"
 27 | os.environ["WANDB__SERVICE_WAIT"] = "300"
 28 | 
 29 | class CustomDataCollatorForSeqCLS(DataCollatorForSeq2Seq):    
 30 |     def __call__(self, features, return_tensors=None): 
 31 |         if return_tensors is None:
 32 |             return_tensors = self.return_tensors
 33 | 
 34 |         features = self.tokenizer.pad(
 35 |             features,
 36 |             padding=self.padding,
 37 |             max_length=self.max_length,
 38 |             pad_to_multiple_of=self.pad_to_multiple_of,
 39 |             return_tensors=return_tensors,
 40 |         )
 41 | 
 42 |         return features
 43 | 
 44 | 
 45 | def metrics_fn(outputs):
 46 |     y_true = outputs.label_ids
 47 |     y_pred = outputs.predictions.argmax(-1)
 48 |     y_score = torch.tensor(outputs.predictions).softmax(-1).numpy()[:, 1]
 49 |     return compute_metrics(y_true, y_pred, y_score)    
 50 | 
 51 | def main():
 52 |     supervised_model_list = ['bert-base', 'roberta-base', 'deberta-base', 'ChatGPT-Detector', 'flooding', 'rdrop', 'ranmask', 'scrn']
 53 |     metric_based_model_list = ["Log-Likelihood", "Rank", "Log-Rank", "Entropy", "GLTR"]
 54 | 
 55 |     # Get arguments
 56 |     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
 57 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 58 |     model_abbr = training_args.output_dir.split('/')[-1]
 59 |     dataset_abbr = data_args.data_files.split('/')[-1]
 60 |     training_args.output_dir = training_args.output_dir + '_' + dataset_abbr
 61 |     
 62 |     # Path check and set logger
 63 |     # path_checker(training_args)
 64 |     try:
 65 |         os.mkdir(training_args.output_dir)
 66 |     except:
 67 |         print('Output directory already exists: %s'%training_args.output_dir)
 68 |     logger = set_logger(training_args)
 69 | 
 70 |     # Set seed
 71 |     set_seed(training_args.seed)
 72 | 
 73 |     # Load dataset
 74 |     raw_dataset = load_dataset(
 75 |             'json',
 76 |             data_files={"train": data_args.data_files + "/train.json", 
 77 |                         "test": data_args.data_files + "/test.json", },
 78 |             cache_dir=model_args.cache_dir,
 79 |             use_auth_token=True if model_args.use_auth_token else None,
 80 |         )
 81 |     if model_abbr in supervised_model_list:
 82 |         # Load model
 83 |         config = AutoConfig.from_pretrained(model_args.model_name_or_path)
 84 |         tokenizer = AutoTokenizer.from_pretrained(
 85 |             model_args.model_name_or_path,
 86 |             model_max_length=data_args.max_seq_length,
 87 |             padding_side="right",
 88 |             use_fast=False,
 89 |         )
 90 |         if model_abbr == 'scrn':
 91 |             model = SCRNModel(model_args.model_name_or_path, config)
 92 |         else:
 93 |             model = AutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
 94 |         
 95 |         
 96 |         def preprocess_function_for_ranmask(examples):
 97 |             examples["text"] = mask_tokens(examples["text"], mask_token=tokenizer.mask_token)
 98 |             inputs = tokenizer(examples["text"], truncation=True)
 99 |             model_inputs = inputs
100 |             return model_inputs
101 |         
102 |         def preprocess_function_for_seq_cls(examples):
103 |             inputs = tokenizer(examples["text"], truncation=True)
104 |             model_inputs = inputs
105 |             return model_inputs
106 |         
107 |         if model_abbr == 'ranmask':
108 |             train_data_preprocess_fn = preprocess_function_for_ranmask
109 |             infer_data_preprocess_fn = preprocess_function_for_seq_cls
110 |         else:
111 |             train_data_preprocess_fn = preprocess_function_for_seq_cls
112 |             infer_data_preprocess_fn = preprocess_function_for_seq_cls
113 | 
114 | 
115 |         
116 |         # Preprocess dataset
117 |         train_dataset, test_dataset = raw_dataset["train"], raw_dataset["test"]
118 | 
119 |         with training_args.main_process_first(desc="train dataset map pre-processing"):
120 |             train_dataset = train_dataset.map(
121 |                 train_data_preprocess_fn,
122 |                 batched=True,
123 |                 num_proc=data_args.preprocessing_num_workers,
124 |                 load_from_cache_file=not data_args.overwrite_cache,
125 |                 desc="Running tokenizer on train dataset",
126 |             )
127 |             test_dataset = test_dataset.map(
128 |                 infer_data_preprocess_fn,
129 |                 batched=True,
130 |                 num_proc=data_args.preprocessing_num_workers,
131 |                 load_from_cache_file=not data_args.overwrite_cache,
132 |                 desc="Running tokenizer on test dataset",
133 |             )
134 |         
135 |         data_collator = CustomDataCollatorForSeqCLS(tokenizer, model=model, pad_to_multiple_of=8 if training_args.fp16 else None,)
136 | 
137 | 
138 |         # Set trainer
139 |         if model_abbr == 'scrn':
140 |             trainer_fn = SCRNTrainer
141 |         elif model_abbr == 'flooding':
142 |             trainer_fn = FloodingTrainer
143 |         elif model_abbr == 'rdrop':
144 |             trainer_fn = RDropTrainer
145 |         else:
146 |             trainer_fn = Trainer
147 |         trainer = trainer_fn(
148 |             model=model,
149 |             args=training_args,
150 |             train_dataset=train_dataset,
151 |             tokenizer=tokenizer,
152 |             data_collator=data_collator,
153 |             eval_dataset=test_dataset,
154 |             compute_metrics=metrics_fn,
155 |         )
156 | 
157 |         # Training
158 |         if training_args.do_train:
159 |             train_result = trainer.train()
160 |             # trainer.save_state()
161 |             trainer.save_model()
162 | 
163 |         # Predict
164 |         if training_args.do_predict:
165 |             if model_abbr == 'ranmask':
166 |                 config = AutoConfig.from_pretrained(training_args.output_dir)
167 |                 model = RanMaskModel.from_pretrained(training_args.output_dir)
168 |                 # set params for ensemble inference
169 |                 model.tokenizer = tokenizer
170 |                 model.mask_percentage = model_args.infer_mask_percentage
171 |                 model.ensemble_num = model_args.ensemble_num
172 |                 model.ensemble_method = model_args.ensemble_method
173 |             elif model_abbr == 'scrn':
174 |                 config = AutoConfig.from_pretrained(training_args.output_dir)
175 |                 model = SCRNModel(model_args.model_name_or_path, config=config)
176 |                 model.load_state_dict(torch.load(os.path.join(training_args.output_dir,'pytorch_model.bin')))
177 |             else:
178 |                 config = AutoConfig.from_pretrained(training_args.output_dir)
179 |                 model = AutoModelForSequenceClassification.from_pretrained(training_args.output_dir)
180 |             trainer = trainer_fn(
181 |                 model=model,
182 |                 args=training_args,
183 |                 tokenizer=tokenizer,
184 |                 data_collator=data_collator,
185 |                 eval_dataset=test_dataset,
186 |                 compute_metrics=metrics_fn,
187 |             )
188 |             predict_results = trainer.evaluate()
189 |             trainer.save_metrics("predict", predict_results)
190 | 
191 |     elif model_abbr in metric_based_model_list:
192 |         DEVICE = 'cuda'
193 |         START_DATE = datetime.datetime.now().strftime('%Y-%m-%d')
194 |         START_TIME = datetime.datetime.now().strftime('%H-%M-%S-%f')
195 | 
196 |         # get generative model and set device
197 |         # gpt-2
198 |         base_model, base_tokenizer = load_base_model_and_tokenizer(model_args.metric_base_model_name_or_path)
199 |         base_model.to(DEVICE)
200 | 
201 |         # build features
202 | 
203 |         def ll_criterion(text): return get_ll(text, base_model, base_tokenizer, DEVICE)
204 | 
205 |         def rank_criterion(text): return -get_rank(text, base_model, base_tokenizer, DEVICE, log=False)
206 | 
207 |         def logrank_criterion(text): return -get_rank(text, base_model, base_tokenizer, DEVICE, log=True)
208 | 
209 |         def entropy_criterion(text): return get_entropy(text, base_model, base_tokenizer, DEVICE)
210 | 
211 |         def GLTR_criterion(text): return get_rank_GLTR(text, base_model, base_tokenizer, DEVICE)
212 |     
213 |         outputs = []
214 |         data = raw_dataset
215 |         if model_abbr == "Log-Likelihood":
216 |             outputs.append(run_threshold_experiment(data, ll_criterion, "likelihood", logger=logger))
217 |         elif model_abbr == "Rank":
218 |             outputs.append(run_threshold_experiment(data, rank_criterion, "rank", logger=logger))
219 |         elif model_abbr == "Log-Rank":
220 |             outputs.append(run_threshold_experiment(data, logrank_criterion, "log_rank", logger=logger))
221 |         elif model_abbr == "Entropy":
222 |             outputs.append(run_threshold_experiment(data, entropy_criterion, "entropy", logger=logger))
223 |         elif model_abbr == "GLTR":
224 |             outputs.append(run_GLTR_experiment(data, GLTR_criterion, "rank_GLTR", logger=logger))
225 |         clf = outputs[0]['clf']
226 |         filename = training_args.output_dir + '/classifier.bin'
227 |         pickle.dump(clf, open(filename, 'wb'))
228 |         # save metrics
229 |         test_metrics = {'eval_%s'%k:v for k, v in outputs[0]['general_test'].items()}
230 |         file_name = training_args.output_dir + '/predict_results.json'
231 |         json.dump(test_metrics, open(file_name, 'w'))
232 |     
233 |     
234 |     
235 |     
236 |     else:
237 |         raise ValueError("Invalid model abbreviation")
238 | 
239 | 
240 | if __name__ == "__main__":
241 |     main()
242 | 


--------------------------------------------------------------------------------
/run_attack.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import textattack
  3 | import pickle
  4 | import random
  5 | import torch
  6 | import numpy as np
  7 | import os
  8 | import json
  9 | from attack.sklearn_utils import CustomSklearnModelWrapper, CustomSklearnTokenizer
 10 | from textattack.models.wrappers import HuggingFaceModelWrapper
 11 | from attack import HuggingFaceModelMaskEnsembleWrapper
 12 | from transformers import AutoTokenizer, AutoModelForCausalLM
 13 | from textattack.attack_recipes import PWWSRen2019, Pruthi2019, DeepWordBugGao2018
 14 | from attack.attack_recipe import PWWSRen2019_threshold
 15 | from textattack import Attacker
 16 | from datasets import load_dataset
 17 | from utils.metric_utils import load_base_model_and_tokenizer
 18 | from utils.scrn_model import SCRNModel
 19 | from attack.custom_dataset import load_attack_dataset
 20 | from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
 21 | 
 22 | parser = argparse.ArgumentParser()
 23 | parser.add_argument('--model_type', type=str, default="hf") # hf/sklearn
 24 | parser.add_argument('--ensemble_num', type=int, default=1) 
 25 | parser.add_argument('--mask_percentage', type=float, default=0.30) 
 26 | parser.add_argument('--transfer_dataset_abbr', type=str, default="self")
 27 | parser.add_argument('--num_examples', type=int, default=10)
 28 | parser.add_argument('--attack_class', type=str, default="ai")
 29 | parser.add_argument('--attack_recipe', type=str, default="pwws")
 30 | parser.add_argument('--data_files', type=str, default="./data_in")
 31 | parser.add_argument('--output_dir', type=str, default="./data_out")
 32 | parser.add_argument('--bert_name_or_path', type=str, default="bert-base-uncased")
 33 | parser.add_argument('--metric_base_model_name_or_path', type=str, default="gpt2")
 34 | parser.add_argument('--seed', type=int, default=2020)
 35 | parser.add_argument('--log_summary', type=str, default='yes')
 36 | args = parser.parse_args()
 37 | 
 38 | 
 39 | random.seed(args.seed)
 40 | np.random.seed(args.seed)
 41 | torch.manual_seed(args.seed)
 42 | torch.cuda.manual_seed_all(args.seed)
 43 | 
 44 | 
 45 | model_abbr, dataset_abbr = args.output_dir.split('/')[-1].split('_')
 46 | if args.transfer_dataset_abbr!= "self":
 47 |     dataset_abbr = args.transfer_dataset_abbr
 48 | args.data_files = args.data_files + '/' + dataset_abbr
 49 | # dataset
 50 | dataset = load_attack_dataset(data_files=args.data_files, attack_class=args.attack_class)
 51 | 
 52 | 
 53 | if args.model_type == 'hf':
 54 |     # load config and tokenizer
 55 |     config = AutoConfig.from_pretrained(args.output_dir)
 56 |     tokenizer = AutoTokenizer.from_pretrained(
 57 |         args.output_dir,
 58 |         model_max_length=512,
 59 |         padding_side="right",
 60 |         use_fast=False,
 61 |     )
 62 |     # load model
 63 |     if model_abbr == 'scrn':
 64 |         model = SCRNModel(args.bert_name_or_path, config=config)
 65 |         model.load_state_dict(torch.load(os.path.join(args.output_dir,'pytorch_model.bin')))
 66 |     else:
 67 |         model = AutoModelForSequenceClassification.from_pretrained(args.output_dir, config=config)
 68 |     # select model_wrapper
 69 |     if args.ensemble_num > 1:
 70 |         model_wrapper = HuggingFaceModelMaskEnsembleWrapper(model, tokenizer, ensemble_num=args.ensemble_num, mask_percentage=args.mask_percentage)
 71 |     else:
 72 |         model_wrapper = HuggingFaceModelWrapper(model, tokenizer)
 73 | elif args.model_type =='sklearn':
 74 |     # model
 75 |     DEVICE = 'cuda'
 76 |     base_model, base_tokenizer = load_base_model_and_tokenizer(args.metric_base_model_name_or_path)
 77 |     base_model.to(DEVICE)
 78 |     tokenizer = CustomSklearnTokenizer(base_model, base_tokenizer, DEVICE, feature_fn=model_abbr)
 79 |     filename = args.output_dir + '/' + 'classifier.bin'
 80 |     # load the model from disk
 81 |     model = pickle.load(open(filename, 'rb'))
 82 |     model_wrapper = CustomSklearnModelWrapper(model, tokenizer)
 83 | else:
 84 |     raise ValueError('Unknown model type %s'%args.model_type)
 85 | 
 86 | if args.num_examples == -1:
 87 |     num_examples = len(dataset)
 88 | else:
 89 |     num_examples = args.num_examples
 90 | 
 91 | max_num_word_swaps = np.mean([len(x[0]['text'].split(' ')) for x in dataset][:num_examples]) // 20
 92 | if max_num_word_swaps >= 10:
 93 |     max_num_word_swaps = 10
 94 | elif max_num_word_swaps <= 1:
 95 |     max_num_word_swaps = 1
 96 | else:
 97 |     _ = 0
 98 | 
 99 | if args.attack_recipe == 'pwws': # word sub
100 |     attack = PWWSRen2019.build(model_wrapper)
101 | elif args.attack_recipe == 'pwwsTaip': # add threshold ai as positive
102 |     # get threshold
103 |     with open(f"{args.output_dir}/predict_results.json", "r") as fin:
104 |         metrics = json.load(fin)
105 |     if args.attack_class == "ai":
106 |         target_max_score = metrics["eval_aip_threshold_chatgpt"]
107 |     elif args.attack_class == "human":
108 |         target_max_score = metrics["eval_aip_threshold_human"]
109 |     else:
110 |         raise ValueError('Unknown attack class %s'%args.attack_class)
111 |     attack = PWWSRen2019_threshold.build(model_wrapper, target_max_score=target_max_score)
112 | elif args.attack_recipe == 'pwwsThp': # add threshold human as positive
113 |     with open(f"{args.output_dir}/predict_results.json", "r") as fin:
114 |         metrics = json.load(fin)
115 |     if args.attack_class == "ai":
116 |         target_max_score = metrics["eval_hp_threshold_chatgpt"]
117 |     elif args.attack_class == "human":
118 |         target_max_score = metrics["eval_hp_threshold_human"]
119 |     else:
120 |         raise ValueError('Unknown attack class %s'%args.attack_class)
121 |     attack = PWWSRen2019_threshold.build(model_wrapper, target_max_score=target_max_score)
122 | elif args.attack_recipe == 'pruthi': # char sub delete insert etc
123 |     attack = Pruthi2019.build(model_wrapper, max_num_word_swaps=max_num_word_swaps)
124 | elif args.attack_recipe == 'deep-word-bug': # word sub, char sub, word del, word insert etc
125 |     attack = DeepWordBugGao2018.build(model_wrapper)
126 | else:
127 |     raise ValueError('Unknown attack recipe %s'%args.attack_recipe)
128 | 
129 | attack_args = textattack.AttackArgs(
130 |     num_examples=num_examples,
131 |     log_to_csv='%s/attack_results_%s_%s_%s.csv'%(args.output_dir, dataset_abbr, args.attack_class, args.attack_recipe),
132 |     csv_coloring_style='html', 
133 | )
134 | attacker = Attacker(attack, dataset, attack_args)
135 | results = attacker.attack_dataset()
136 | if args.log_summary == 'yes':
137 |     attacker.attack_log_manager.add_output_file(filename="%s/attack_summary_%s_%s_%s.log"%(args.output_dir, dataset_abbr, args.attack_class, args.attack_recipe), color_method="file")
138 |     attacker.attack_log_manager.log_summary()
139 | 


--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
 1 | # metric-based detectors: ["Log-Likelihood", "Log-Rank", "Entropy", "GLTR"]
 2 | # model-based detectors: ['bert-base', 'roberta-base', 'deberta-base', 'ChatGPT-Detector', 'flooding', 'rdrop', 'ranmask', 'scrn']
 3 | 
 4 | export DATASET_ABBR=mixed-source
 5 | export MODEL_ABBR=scrn
 6 | export BERT_MODEL=roberta-base # just used for huggingface wrapped model
 7 | 
 8 | python3 -u main.py  \
 9 | --do_train True \
10 | --do_predict True \
11 | --cache_dir .cache  \
12 | --seed 2020 \
13 | --save_total_limit 5 \
14 | --learning_rate 1e-4 \
15 | --per_device_train_batch_size 16 \
16 | --per_device_eval_batch_size 16 \
17 | --num_train_epochs 2.0 \
18 | --max_seq_length 512 \
19 | --num_labels 2 \
20 | --logging_steps 50 \
21 | --gradient_accumulation_steps 1 \
22 | --metric_base_model_name_or_path gpt2 \
23 | --model_name_or_path ${BERT_MODEL} \
24 | --data_files ./data_in/${DATASET_ABBR}  \
25 | --output_dir ./data_out/${MODEL_ABBR} \


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarlanLark/Robust-AIGC-Detector/10989242d09ad46fa49592928c7490460d744ebd/utils/__init__.py


--------------------------------------------------------------------------------
/utils/flooding_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from torch import nn
 4 | from transformers import PreTrainedModel, AutoModelForSequenceClassification, Trainer, AutoModel, AutoModelForCausalLM
 5 | from transformers.modeling_outputs import SequenceClassifierOutput
 6 | 
 7 | class FloodingTrainer(Trainer):
 8 |         
 9 |     def compute_loss(self, model, inputs, return_outputs=False):
10 |         outputs = model(**inputs)
11 |         loss, logits = outputs.loss, outputs.logits
12 |         loss = (loss - 0.15).abs() + 0.15
13 |         
14 |         outputs = SequenceClassifierOutput(
15 |             loss=loss,
16 |             logits=logits,
17 |             hidden_states=None,
18 |             attentions=None,
19 |         )
20 |         return (loss, outputs) if return_outputs else loss


--------------------------------------------------------------------------------
/utils/metric_based.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import torch.nn.functional as F
  4 | import time
  5 | from utils.metric_utils import timeit, get_clf_results, cut_length, cal_metrics
  6 | from tqdm import tqdm
  7 | 
  8 | 
  9 | def get_ll(text, base_model, base_tokenizer, DEVICE):
 10 |     with torch.no_grad():
 11 |         if len(base_tokenizer.encode(text)) == 1:
 12 |             text += ' %s'%(base_tokenizer.pad_token)
 13 |         tokenized = base_tokenizer(
 14 |             text,
 15 |             padding=True,
 16 |             truncation=True,
 17 |             max_length=512,
 18 |             return_tensors="pt").to(DEVICE)
 19 |         labels = tokenized.input_ids
 20 |         return -base_model(**tokenized, labels=labels).loss.item()
 21 |         # https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L1317
 22 | 
 23 | 
 24 | def get_lls(texts, base_model, base_tokenizer, DEVICE):
 25 |     return [get_ll(_, base_model, base_tokenizer, DEVICE) for _ in texts]
 26 | 
 27 | 
 28 | # get the average rank of each observed token sorted by model likelihood
 29 | def get_rank(text, base_model, base_tokenizer, DEVICE, log=False):
 30 |     with torch.no_grad():
 31 |         if len(base_tokenizer.encode(text)) == 1:
 32 |             text += ' %s'%(base_tokenizer.pad_token)
 33 |         tokenized = base_tokenizer(
 34 |             text,
 35 |             truncation=True,
 36 |             max_length=512,
 37 |             return_tensors="pt",
 38 |         ).to(DEVICE)
 39 |         logits = base_model(**tokenized).logits[:, :-1]
 40 |         labels = tokenized.input_ids[:, 1:]
 41 | 
 42 |         # get rank of each label token in the model's likelihood ordering
 43 |         matches = (logits.argsort(-1, descending=True)
 44 |                    == labels.unsqueeze(-1)).nonzero()
 45 | 
 46 |         assert matches.shape[
 47 |             1] == 3, f"Expected 3 dimensions in matches tensor, got {matches.shape}"
 48 | 
 49 |         ranks, timesteps = matches[:, -1], matches[:, -2]
 50 | 
 51 |         # make sure we got exactly one match for each timestep in the sequence
 52 |         assert (timesteps == torch.arange(len(timesteps)).to(
 53 |             timesteps.device)).all(), "Expected one match per timestep"
 54 | 
 55 |         ranks = ranks.float() + 1  # convert to 1-indexed rank
 56 |         if log:
 57 |             ranks = torch.log(ranks)
 58 | 
 59 |         return ranks.float().mean().item()
 60 | 
 61 | 
 62 | def get_ranks(texts, base_model, base_tokenizer, DEVICE, log=False):
 63 |     return [get_rank(_, base_model, base_tokenizer, DEVICE, log)
 64 |             for _ in texts]
 65 | 
 66 | 
 67 | def get_rank_GLTR(text, base_model, base_tokenizer, DEVICE, log=False):
 68 |     with torch.no_grad():
 69 |         if len(base_tokenizer.encode(text)) == 1:
 70 |             text += ' %s'%(base_tokenizer.pad_token)
 71 |         tokenized = base_tokenizer(
 72 |             text,
 73 |             truncation=True,
 74 |             max_length=512,
 75 |             return_tensors="pt").to(DEVICE)
 76 |         logits = base_model(**tokenized).logits[:, :-1]
 77 |         labels = tokenized.input_ids[:, 1:]
 78 | 
 79 |         # get rank of each label token in the model's likelihood ordering
 80 |         matches = (logits.argsort(-1, descending=True)
 81 |                    == labels.unsqueeze(-1)).nonzero()
 82 | 
 83 |         assert matches.shape[
 84 |             1] == 3, f"Expected 3 dimensions in matches tensor, got {matches.shape}"
 85 | 
 86 |         ranks, timesteps = matches[:, -1], matches[:, -2]
 87 | 
 88 |         # make sure we got exactly one match for each timestep in the sequence
 89 |         assert (timesteps == torch.arange(len(timesteps)).to(
 90 |             timesteps.device)).all(), "Expected one match per timestep"
 91 |         ranks = ranks.float()
 92 |         res = np.array([0.0, 0.0, 0.0, 0.0])
 93 |         for i in range(len(ranks)):
 94 |             if ranks[i] < 10:
 95 |                 res[0] += 1
 96 |             elif ranks[i] < 100:
 97 |                 res[1] += 1
 98 |             elif ranks[i] < 1000:
 99 |                 res[2] += 1
100 |             else:
101 |                 res[3] += 1
102 |         if res.sum() > 0:
103 |             res = res / res.sum()
104 | 
105 |         return res
106 | 
107 | 
108 | # get average entropy of each token in the text
def get_entropy(text, base_model, base_tokenizer, DEVICE):
    """Return the mean entropy of the model's predictive distributions.

    The entropy of the softmax over the logits is computed at every position
    except the last one and averaged.

    Args:
        text: The string to score.
        base_model: Callable model returning an object with ``logits``.
        base_tokenizer: Tokenizer providing ``encode``, ``pad_token`` and
            ``__call__``.
        DEVICE: Device the tokenized inputs are moved to.

    Returns:
        The mean entropy as a Python float.
    """
    with torch.no_grad():
        is_single_token = len(base_tokenizer.encode(text)) == 1
        if is_single_token:
            # Append a pad token so at least one position remains after the
            # last logit is dropped.
            text = text + ' %s'%(base_tokenizer.pad_token)
        encoded = base_tokenizer(
            text, truncation=True, max_length=512, return_tensors="pt"
        ).to(DEVICE)
        logits = base_model(**encoded).logits[:, :-1]
        probs = F.softmax(logits, dim=-1)
        log_probs = F.log_softmax(logits, dim=-1)
        # sum(p * log p) is the negative entropy at each position.
        negative_entropy = (probs * log_probs).sum(-1)
        return -negative_entropy.mean().item()
121 | 
122 | 
@timeit
def run_threshold_experiment(data, criterion_fn, name, logger=None):
    """Fit and evaluate a classifier on a single scalar criterion.

    ``criterion_fn`` is applied to every train and test text, examples whose
    criterion is NaN are dropped, and the remaining values are passed as a
    one-column feature matrix to ``get_clf_results``.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` splits, each exposing
            ``'text'`` and ``'labels'`` columns.
        criterion_fn: Callable mapping one text to a scalar score.
        name: Criterion name; the result is labelled ``f'{name}_threshold'``.
        logger: Optional logger that also receives the train/test results.

    Returns:
        A dict with keys ``'name'``, ``'predictions'`` (the raw, unfiltered
        criterion values per split), ``'general_train'``, ``'general_test'``
        and ``'clf'``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # get train data
    train_text = data['train']['text']
    train_label = data['train']['labels']
    train_criterion = [
        criterion_fn(text) for text in tqdm(train_text, desc="Train criterion")]
    x_train = np.array(train_criterion)
    y_train = np.array(train_label)

    test_text = data['test']['text']
    test_label = data['test']['labels']
    test_criterion = [
        criterion_fn(text) for text in tqdm(test_text, desc="Test criterion")]
    x_test = np.array(test_criterion)
    y_test = np.array(test_label)

    # remove nan values
    select_train_index = ~np.isnan(x_train)
    select_test_index = ~np.isnan(x_test)
    x_train = x_train[select_train_index]
    y_train = y_train[select_train_index]
    x_test = x_test[select_test_index]
    y_test = y_test[select_test_index]
    # Add a trailing axis so each sample is a one-feature row.
    x_train = np.expand_dims(x_train, axis=-1)
    x_test = np.expand_dims(x_test, axis=-1)

    clf, train_res, test_res = get_clf_results(
        x_train, y_train, x_test, y_test)

    print('-----  train  -----')
    print(train_res)
    print('-----  test  -----')
    print(test_res)
    if logger:
        logger.info('-----  train  -----')
        logger.info(train_res)
        logger.info('-----  test  -----')
        logger.info(test_res)

    return {
        'name': f'{name}_threshold',
        'predictions': {'train': train_criterion, 'test': test_criterion},
        'general_train': train_res,
        'general_test': test_res,
        'clf': clf
    }
185 | 
186 | 
@timeit
def run_threshold_experiment_multiple_test_length(
        clf, data, criterion_fn, name,
        lengths=(10, 20, 50, 100, 200, 500, -1)):
    """Evaluate a fitted scalar-criterion classifier on truncated test texts.

    For every entry of ``lengths`` each test text is cut with ``cut_length``,
    scored with ``criterion_fn`` and classified by ``clf``; examples whose
    criterion is NaN are dropped before prediction.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        data: Mapping whose ``'test'`` split exposes ``'text'`` and
            ``'labels'`` columns.
        criterion_fn: Callable mapping one text to a scalar score.
        name: Label used in the printed report.
        lengths: Word limits to evaluate; ``-1`` leaves the text uncut.

    Returns:
        A dict mapping each length to the tuple
        ``(acc, precision, recall, f1, auc)`` returned by ``cal_metrics``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # The test split does not change between lengths; fetch it once.
    test_text = data['test']['text']
    test_label = data['test']['labels']

    res = {}
    for length in lengths:
        test_criterion = [
            criterion_fn(cut_length(text, length))
            for text in tqdm(test_text, desc="Test criterion")]
        x_test = np.array(test_criterion)
        y_test = np.array(test_label)

        # remove nan values
        select_test_index = ~np.isnan(x_test)
        x_test = x_test[select_test_index]
        y_test = y_test[select_test_index]
        x_test = np.expand_dims(x_test, axis=-1)

        y_test_pred = clf.predict(x_test)
        # Keep only the probability in column 1 of predict_proba.
        y_test_pred_prob = [row[1] for row in clf.predict_proba(x_test)]
        acc_test, precision_test, recall_test, f1_test, auc_test = cal_metrics(
            y_test, y_test_pred, y_test_pred_prob)
        test_res = acc_test, precision_test, recall_test, f1_test, auc_test

        print(f"{name} {length} acc_test: {acc_test}, precision_test: {precision_test}, recall_test: {recall_test}, f1_test: {f1_test}, auc_test: {auc_test}")
        res[length] = test_res

    return res
225 | 
226 | 
@timeit
def run_GLTR_experiment(data, criterion_fn, name, logger=None):
    """Fit and evaluate a classifier on vector-valued criterion features.

    ``criterion_fn`` is applied to every train and test text and the stacked
    results are handed to ``get_clf_results`` together with the labels.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` splits, each exposing
            ``'text'`` and ``'labels'`` columns.
        criterion_fn: Callable mapping one text to a feature vector.
        name: Criterion name; the result is labelled ``f'{name}_threshold'``.
        logger: Optional logger that also receives the train/test results.

    Returns:
        A dict with keys ``'name'``, ``'predictions'``, ``'general_train'``,
        ``'general_test'`` and ``'clf'``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # Featurise the train split first, then the test split.
    criteria, features, targets = {}, {}, {}
    for split in ('train', 'test'):
        texts = data[split]['text']
        targets[split] = data[split]['labels']
        criteria[split] = [criterion_fn(sample) for sample in texts]
        features[split] = np.array(criteria[split])

    clf, train_res, test_res = get_clf_results(
        features['train'], targets['train'], features['test'], targets['test'])

    report = (
        ('-----  train  -----', train_res),
        ('-----  test  -----', test_res),
    )
    for banner, metrics in report:
        print(banner)
        print(metrics)
    if logger:
        for banner, metrics in report:
            logger.info(banner)
            logger.info(metrics)

    return {
        'name': f'{name}_threshold',
        'predictions': {'train': criteria['train'], 'test': criteria['test']},
        'general_train': train_res,
        'general_test': test_res,
        'clf': clf
    }
266 | 
267 | 
@timeit
def run_GLTR_experiment_multiple_test_length(
        clf, data, criterion_fn, name,
        lengths=(10, 20, 50, 100, 200, 500, -1)):
    """Evaluate a fitted vector-feature classifier on truncated test texts.

    For every entry of ``lengths`` each test text is cut with ``cut_length``,
    featurised with ``criterion_fn`` and classified by ``clf``.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        data: Mapping whose ``'test'`` split exposes ``'text'`` and
            ``'labels'`` columns.
        criterion_fn: Callable mapping one text to a feature vector.
        name: Label used in the printed report.
        lengths: Word limits to evaluate; ``-1`` leaves the text uncut.

    Returns:
        A dict mapping each length to the tuple
        ``(acc, precision, recall, f1, auc)`` returned by ``cal_metrics``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # The test split does not change between lengths; fetch it once.
    test_text = data['test']['text']
    test_label = data['test']['labels']

    res = {}
    for length in lengths:
        test_criterion = [
            criterion_fn(cut_length(text, length))
            for text in tqdm(test_text, desc="Test criterion")]
        x_test = np.array(test_criterion)
        y_test = np.array(test_label)

        y_test_pred = clf.predict(x_test)
        # Keep only the probability in column 1 of predict_proba.
        y_test_pred_prob = [row[1] for row in clf.predict_proba(x_test)]
        acc_test, precision_test, recall_test, f1_test, auc_test = cal_metrics(
            y_test, y_test_pred, y_test_pred_prob)
        test_res = acc_test, precision_test, recall_test, f1_test, auc_test

        print(f"{name} {length} acc_test: {acc_test}, precision_test: {precision_test}, recall_test: {recall_test}, f1_test: {f1_test}, auc_test: {auc_test}")
        res[length] = test_res

    return res


--------------------------------------------------------------------------------
/utils/metric_utils.py:
--------------------------------------------------------------------------------
  1 | import transformers
  2 | import re
  3 | import numpy as np
  4 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
  5 | from sklearn.linear_model import LogisticRegression
  6 | import time
  7 | from functools import wraps
  8 | import random
  9 | from utils.utils import compute_metrics
 10 | 
 11 | 
 12 | 
 13 | def timeit(func):
 14 |     @wraps(func)
 15 |     def timeit_wrapper(*args, **kwargs):
 16 |         start_time = time.time()
 17 |         result = func(*args, **kwargs)
 18 |         end_time = time.time()
 19 |         total_time = end_time - start_time
 20 |         print(f'Function {func.__name__} Took {total_time:.4f} seconds\n\n')
 21 |         return result
 22 |     return timeit_wrapper
 23 | 
 24 | 
 25 | # define regex to match all <extra_id_*> tokens, where * is an integer
 26 | pattern = re.compile(r"<extra_id_\d+>")
 27 | 
 28 | 
 29 | def select_train_data(data, select_num=-1):
 30 |     new_train = {
 31 |         'text': [],
 32 |         'label': [],
 33 |     }
 34 | 
 35 |     if select_num == -1:
 36 |         return data
 37 |     else:
 38 |         new_train['text'] = data['train']['text'][:select_num]
 39 |         new_train['label'] = data['train']['label'][:select_num]
 40 |         data['train'] = new_train
 41 | 
 42 |     return data
 43 | 
 44 | 
 45 | def filter_test_data(data, max_length=25):
 46 |     new_test = {
 47 |         'text': [],
 48 |         'label': [],
 49 |     }
 50 |     for i in range(len(data['test']['text'])):
 51 |         text = data['test']['text'][i]
 52 |         label = data['test']['label'][i]
 53 |         if len(text.split()) <= max_length:
 54 |             new_test['text'].append(text)
 55 |             new_test['label'].append(label)
 56 |     data['test'] = new_test
 57 |     return data
 58 | 
 59 | 
 60 | def cut_length(text, max_length=-1):
 61 |     if max_length == -1:
 62 |         return text
 63 |     else:
 64 |         text = text.split()[:max_length]
 65 |         text = " ".join(text)
 66 |         return text
 67 | 
 68 | 
 69 | def sample_dataset(data, num_train, num_test):
 70 |     data["train"]["text"] = data["train"]["text"][:num_train]
 71 |     data["train"]["label"] = data["train"]["label"][:num_train]
 72 |     data["test"]["text"] = data["test"]["text"][:num_test]
 73 |     data["test"]["label"] = data["test"]["label"][:num_test]
 74 |     return data
 75 | 
 76 | 
 77 | def load_base_model_and_tokenizer(model_name_or_path, cache_dir=".cache"):
 78 | 
 79 |     print(f'Loading BASE model {model_name_or_path}...')
 80 |     base_model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, cache_dir=cache_dir)
 81 |     base_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
 82 |     # TODO check gpt for pad_token
 83 |     base_tokenizer.pad_token = base_tokenizer.eos_token
 84 |     base_tokenizer.pad_token_id = base_tokenizer.eos_token_id
 85 | 
 86 |     return base_model, base_tokenizer
 87 | 
 88 | 
 89 | def load_base_model(base_model, DEVICE):
 90 |     print('MOVING BASE MODEL TO GPU...', end='', flush=True)
 91 |     start = time.time()
 92 | 
 93 |     base_model.to(DEVICE)
 94 |     print(f'DONE ({time.time() - start:.2f}s)')
 95 | 
 96 | 
 97 | def cal_metrics(label, pred_label, pred_posteriors):
 98 |     if len(set(label)) < 3:
 99 |         acc = accuracy_score(label, pred_label)
100 |         precision = precision_score(label, pred_label)
101 |         recall = recall_score(label, pred_label)
102 |         f1 = f1_score(label, pred_label)
103 |         auc = roc_auc_score(label, pred_posteriors)
104 |     else:
105 |         acc = accuracy_score(label, pred_label)
106 |         precision = precision_score(label, pred_label, average='weighted')
107 |         recall = recall_score(label, pred_label, average='weighted')
108 |         f1 = f1_score(label, pred_label, average='weighted')
109 |         auc = -1.0
110 |         conf_m = confusion_matrix(label, pred_label)
111 |         print(conf_m)
112 |     return acc, precision, recall, f1, auc
113 | 
114 | 
def get_clf_results(x_train, y_train, x_test, y_test):
    """Fit a logistic-regression classifier and score it on both splits.

    The classifier is fitted on ``(x_train, y_train)`` with a fixed
    ``random_state``. For each split, the hard predictions and the
    probability in column 1 of ``predict_proba`` are passed to
    ``compute_metrics``.

    Returns:
        A tuple ``(clf, train_res, test_res)``.
    """
    clf = LogisticRegression(random_state=2020).fit(x_train, y_train)

    def _score(features, targets):
        # Hard labels plus the column-1 probability for every sample.
        hard_labels = clf.predict(features)
        column_one_prob = np.array([row[1] for row in clf.predict_proba(features)])
        return compute_metrics(targets, hard_labels, column_one_prob)

    train_res = _score(x_train, y_train)
    test_res = _score(x_test, y_test)

    return clf, train_res, test_res


--------------------------------------------------------------------------------
/utils/ranmask_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn.functional as F
  3 | from torch import nn
  4 | from transformers import PreTrainedModel, RobertaPreTrainedModel, Trainer, AutoModel, RobertaModel
  5 | from typing import List, Optional, Tuple, Union
  6 | import pdb
  7 | from transformers.modeling_outputs import SequenceClassifierOutput
  8 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
  9 | from .utils import ensemble_mask_tokens
 10 | import numpy as np
 11 | from sklearn.preprocessing import normalize
 12 | 
 13 | 
 14 | class RobertaClassificationHead(nn.Module):
 15 |     """Head for sentence-level classification tasks."""
 16 | 
 17 |     def __init__(self, config):
 18 |         super().__init__()
 19 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 20 |         classifier_dropout = (
 21 |             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
 22 |         )
 23 |         self.dropout = nn.Dropout(classifier_dropout)
 24 |         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
 25 | 
 26 |     def forward(self, features, **kwargs):
 27 |         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
 28 |         x = self.dropout(x)
 29 |         x = self.dense(x)
 30 |         x = torch.tanh(x)
 31 |         x = self.dropout(x)
 32 |         x = self.out_proj(x)
 33 |         return x
 34 | 
 35 | class RanMaskModel(RobertaPreTrainedModel):
 36 |     def __init__(self, config):
 37 |         super().__init__(config)
 38 |         self.num_labels = config.num_labels
 39 |         self.config = config
 40 | 
 41 |         self.roberta = RobertaModel(config, add_pooling_layer=False)
 42 |         self.classifier = RobertaClassificationHead(config)
 43 | 
 44 |         self.tokenizer = None
 45 |         self.infer_mask_percentage = 0.05
 46 |         self.ensemble_num = 5
 47 |         self.ensemble_method = "votes"
 48 | 
 49 |         # Initialize weights and apply final processing
 50 |         self.post_init()
 51 | 
 52 |     # ensemble forward
 53 |     def forward(
 54 |         self,
 55 |         input_ids: Optional[torch.Tensor] = None,
 56 |         attention_mask: Optional[torch.Tensor] = None,
 57 |         token_type_ids: Optional[torch.Tensor] = None,
 58 |         position_ids: Optional[torch.Tensor] = None,
 59 |         head_mask: Optional[torch.Tensor] = None,
 60 |         inputs_embeds: Optional[torch.Tensor] = None,
 61 |         labels: Optional[torch.Tensor] = None,
 62 |         output_attentions: Optional[bool] = None,
 63 |         output_hidden_states: Optional[bool] = None,
 64 |         return_dict: Optional[bool] = None,
 65 |     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
 66 |         r"""
 67 |         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 68 |             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 69 |             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
 70 |             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 71 |         """
 72 |         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 73 |         # ensemble infer
 74 |         input_strings = self.tokenizer.batch_decode(input_ids, skip_special_tokens=True)
 75 |         ensemble_strings = ensemble_mask_tokens(input_strings, self.mask_percentage, self.ensemble_num, mask_token=self.tokenizer.mask_token)
 76 |         model_device = input_ids.device
 77 |         batch_size = 32#len(input_strings)
 78 |         i = 0
 79 |         ensemble_logits_list = []
 80 |         while i < len(ensemble_strings):
 81 |             batch_ensemble_strings = ensemble_strings[i:i+batch_size]
 82 |             batch_inputs = self.tokenizer(batch_ensemble_strings, return_tensors="pt", padding=True, truncation=True)
 83 |             batch_inputs = {key: value.to(model_device) for key, value in batch_inputs.items()}
 84 | 
 85 |             outputs = self.roberta(**batch_inputs, return_dict=return_dict,)
 86 |             sequence_output = outputs[0]
 87 |             logits = self.classifier(sequence_output)
 88 | 
 89 |             ensemble_logits_list.append(logits)
 90 |             i += batch_size
 91 |         ensemble_logits = torch.cat(ensemble_logits_list, dim=0).cpu().numpy() #[bsz, label_num]
 92 |         # get ensembled logits
 93 |         ensemble_logits_for_each_input = np.split(ensemble_logits, indices_or_sections=len(input_strings), axis=0)
 94 |         logits_list = []
 95 | 
 96 |         for logits in ensemble_logits_for_each_input:
 97 |             if self.ensemble_method == 'votes':
 98 |                 voted_label = np.argmax(np.bincount(np.argmax(logits, axis=-1), minlength=logits.shape[-1]))
 99 |                 voted_logits_array = logits[np.where(np.argmax(logits, axis=-1)==voted_label)[0]]
100 |                 voted_logits = np.mean(voted_logits_array, axis=0, keepdims=True) #[1, num_labels]
101 |                 logits_list.append(torch.from_numpy(voted_logits))
102 |             else:
103 |                 avg_logits = np.mean(logits, axis=0, keepdims=True)
104 |                 logits_list.append(torch.from_numpy(avg_logits))
105 | 
106 |         logits = torch.cat(logits_list, dim=0).to(model_device)
107 | 
108 |         loss = None
109 |         if labels is not None:
110 |             labels = labels.to(logits.device)
111 |             if self.config.problem_type is None:
112 |                 if self.num_labels == 1:
113 |                     self.config.problem_type = "regression"
114 |                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
115 |                     self.config.problem_type = "single_label_classification"
116 |                 else:
117 |                     self.config.problem_type = "multi_label_classification"
118 | 
119 |             if self.config.problem_type == "regression":
120 |                 loss_fct = MSELoss()
121 |                 if self.num_labels == 1:
122 |                     loss = loss_fct(logits.squeeze(), labels.squeeze())
123 |                 else:
124 |                     loss = loss_fct(logits, labels)
125 |             elif self.config.problem_type == "single_label_classification":
126 |                 loss_fct = CrossEntropyLoss()
127 |                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
128 |             elif self.config.problem_type == "multi_label_classification":
129 |                 loss_fct = BCEWithLogitsLoss()
130 |                 loss = loss_fct(logits, labels)
131 |         if not return_dict:
132 |             output = (logits,) + outputs[2:]
133 |             return ((loss,) + output) if loss is not None else output
134 | 
135 |         return SequenceClassifierOutput(
136 |             loss=loss,
137 |             logits=logits,
138 |             hidden_states=None,
139 |             attentions=None,
140 |         )
141 | 
142 |     


--------------------------------------------------------------------------------
/utils/rdrop.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from torch import nn
 4 | from transformers import PreTrainedModel, AutoModelForSequenceClassification, Trainer, AutoModel, AutoModelForCausalLM
 5 | from transformers.modeling_outputs import SequenceClassifierOutput
 6 | 
 7 | 
 8 | def compute_kl_loss(p, q, pad_mask=None):
 9 |     
10 |     p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
11 |     q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')
12 |     
13 |     # pad_mask is for seq-level tasks
14 |     if pad_mask is not None:
15 |         p_loss.masked_fill_(pad_mask, 0.)
16 |         q_loss.masked_fill_(pad_mask, 0.)
17 | 
18 |     # You can choose whether to use function "sum" and "mean" depending on your task
19 |     p_loss = p_loss.mean()
20 |     q_loss = q_loss.mean()
21 | 
22 |     loss = (p_loss + q_loss) / 2
23 |     return loss
24 | 
class RDropTrainer(Trainer):
    """Trainer whose loss combines two forward passes over the same batch.

    The loss is the mean cross-entropy of both passes plus the symmetric KL
    divergence between their logits (``compute_kl_loss``).
    """

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the combined loss, plus the first pass's logits when ``return_outputs`` is set."""
        labels = inputs.get("labels")
        criterion = nn.CrossEntropyLoss()

        # Two independent passes over the identical inputs.
        first_logits = model(**inputs).logits
        second_logits = model(**inputs).logits

        num_labels = self.model.config.num_labels
        flat_labels = labels.view(-1)
        first_ce = criterion(first_logits.view(-1, num_labels), flat_labels)
        second_ce = criterion(second_logits.view(-1, num_labels), flat_labels)

        ce_loss = 0.5 * (first_ce + second_ce)
        kl_loss = compute_kl_loss(first_logits, second_logits)
        loss = ce_loss + kl_loss

        if not return_outputs:
            return loss

        outputs = SequenceClassifierOutput(
            loss=loss,
            logits=first_logits,
            hidden_states=None,
            attentions=None,
        )
        return (loss, outputs)


--------------------------------------------------------------------------------
/utils/scrn_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn.functional as F
  3 | from torch import nn
  4 | from transformers import PreTrainedModel, AutoModelForSequenceClassification, Trainer, AutoModel, AutoModelForCausalLM
  5 | from typing import List, Optional, Tuple, Union
  6 | import pdb
  7 | import json
  8 | from transformers.modeling_outputs import SequenceClassifierOutput
  9 | 
 10 | class Disentangle_Layer(nn.Module):
 11 |     def __init__(self, input_dim = 768, latent_dim = 64, hidden_dim = 512):
 12 |         super(Disentangle_Layer, self).__init__()
 13 |         self.input_dim = input_dim
 14 |         self.latent_dim = latent_dim
 15 |         self.hidden_dim = hidden_dim
 16 | 
 17 |         self.squeezer = nn.ModuleList([nn.Linear(self.input_dim, self.hidden_dim), nn.ReLU()])
 18 |         self.semantic_proj = nn.Linear(self.hidden_dim, self.latent_dim)
 19 |         self.perturbation_proj = nn.Linear(self.hidden_dim, 1)
 20 | 
 21 |     def forward(self, input):
 22 |         latent_rep = input
 23 |         for layer in self.squeezer:
 24 |             latent_rep = layer(latent_rep) # [B, T, D]
 25 |         senmantic_rep = self.semantic_proj(latent_rep)
 26 |         perturbation_log_rep = self.perturbation_proj(latent_rep)
 27 | 
 28 |         return senmantic_rep, perturbation_log_rep
 29 | 
 30 | class Reconstruction_Layer(nn.Module):
 31 |     def __init__(self, output_dim = 768, latent_dim = 64, hidden_dim = 512):
 32 |         super(Reconstruction_Layer, self).__init__()
 33 |         self.output_dim = output_dim
 34 |         self.latent_dim = latent_dim
 35 |         self.hidden_dim = hidden_dim
 36 | 
 37 |         self.recon_layers = nn.ModuleList([nn.Linear(self.latent_dim, self.hidden_dim), nn.ReLU(), nn.Linear(self.hidden_dim, self.output_dim)])
 38 | 
 39 |     def forward(self, latent):
 40 |         recon_rep = latent
 41 |         for layer in self.recon_layers:
 42 |             recon_rep = layer(recon_rep) # [B, T, D]
 43 |         return recon_rep
 44 |     
 45 | class Reconstruction_Network(nn.Module):
 46 |     def __init__(self, input_dim = 768, latent_dim = 64):
 47 |         super(Reconstruction_Network, self).__init__()
 48 |         
 49 |         self.encoder = Disentangle_Layer(input_dim, latent_dim)
 50 |         self.decoder = Reconstruction_Layer(input_dim, latent_dim)
 51 | 
 52 | 
 53 |     def forward(self, input, beta = 0.5):
 54 |         senmantic_rep, perturbation_log_rep = self.encoder(input)
 55 | 
 56 |         noised_rep = self.gaussian_random_perturb(senmantic_rep, torch.exp(0.5 * perturbation_log_rep))
 57 |         output = self.decoder(noised_rep)
 58 |         mse_loss = self.recon_loss(output, input)
 59 |         reg_loss = self.regularization_loss(senmantic_rep, perturbation_log_rep)
 60 |         
 61 |         loss = mse_loss + beta * reg_loss # [B, T]
 62 |         return output, loss.mean()
 63 | 
 64 |     def gaussian_random_perturb(self, semantic_rep, perturbation_log_rep):
 65 |         gaussian_noise = torch.randn_like(perturbation_log_rep)
 66 |         return semantic_rep + gaussian_noise * perturbation_log_rep
 67 |     
 68 |     def recon_loss(self, output, input):
 69 |         return F.mse_loss(output, input, reduction="none").mean(dim = -1)
 70 |     
 71 |     def regularization_loss(self, semantic_rep, perturbation_log_rep, alpha = -1):
 72 |         return torch.mean(semantic_rep.pow(2) + perturbation_log_rep.exp() + alpha * perturbation_log_rep, dim=-1)
 73 | 
 74 | 
 75 | class ClassificationHead(nn.Module):
 76 |     """Head for sentence-level classification tasks."""
 77 | 
 78 |     def __init__(self, config):
 79 |         super().__init__()
 80 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 81 |         classifier_dropout = (
 82 |             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
 83 |         )
 84 |         self.dropout = nn.Dropout(classifier_dropout)
 85 |         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
 86 | 
 87 |     def forward(self, features, **kwargs):
 88 |         x = features.max(dim = 1)[0]  
 89 |         x = self.dropout(x)
 90 |         x = self.dense(x)
 91 |         x = torch.tanh(x)
 92 |         x = self.dropout(x)
 93 |         x = self.out_proj(x)
 94 |         return x
 95 | 
 96 | class Calibrator(nn.Module):
 97 |     def __init__(self, symmetry=True):
 98 |         super(Calibrator, self).__init__()
 99 |         self.symmetry = symmetry
100 |         self.kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=False)
101 |     
102 |     def forward(self, logits_p, logits_q):
103 | 
104 |         log_dist_p, log_dist_q = F.log_softmax(logits_p, dim=-1), F.log_softmax(logits_q, dim=-1)
105 |         dist_p, dist_q = F.softmax(logits_p, dim=-1), F.softmax(logits_q, dim=-1)
106 |         if self.symmetry:
107 |             calib_loss = 0.5 * (self.kl_loss(log_dist_p, dist_q) + self.kl_loss(log_dist_q, dist_p))
108 |         else:
109 |             calib_loss = self.kl_loss(dist_p, dist_q)
110 |         return calib_loss
111 | 
class SCRNModel(PreTrainedModel):
    """Encoder followed by a reconstruction network and a classification head.

    The backbone's last hidden state is passed through ``reconNN``; the
    reconstructed representation is then classified by ``classifier``.
    """

    def __init__(self, model_name, config):
        super().__init__(config)

        self.bert = AutoModel.from_pretrained(model_name, config=config)
        self.classifier = ClassificationHead(config=config)
        self.reconNN = Reconstruction_Network(input_dim = config.hidden_size, latent_dim = 512)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the backbone, reconstruct its last hidden state and classify it.

        ``labels``, ``output_hidden_states`` and ``return_dict`` are accepted
        for interface compatibility but do not change the result: hidden
        states are always requested and a ``SequenceClassifierOutput`` is
        always returned.

        Returns:
            ``SequenceClassifierOutput`` whose ``loss`` is the reconstruction
            loss only (no classification loss is computed here) and whose
            ``logits`` come from the reconstructed representation.
        """
        # The backbone output is read by attribute below, so a ModelOutput is
        # always requested instead of forwarding the caller's `return_dict`
        # (a tuple has no `.last_hidden_state`).
        output = self.bert(input_ids = input_ids,
                            attention_mask = attention_mask,
                            token_type_ids = token_type_ids,
                            position_ids = position_ids,
                            inputs_embeds = inputs_embeds,
                            output_attentions = output_attentions,
                            output_hidden_states = True,
                            return_dict = True)
        last_hidden_state = output.last_hidden_state
        recon_output, recon_loss = self.reconNN(last_hidden_state)
        logits = self.classifier(recon_output)

        return SequenceClassifierOutput(
            loss=recon_loss,
            logits=logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )
153 | 
class SCRNTrainer(Trainer):
    """Trainer that runs two forward passes per batch and calibrates their logits.

    The loss combines the mean cross-entropy of both passes, the
    ``Calibrator`` loss between their logits and the summed loss values
    reported by the model for each pass.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.calibrator = Calibrator()

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the combined loss, plus the first pass's logits when ``return_outputs`` is set."""
        labels = inputs.get("labels")

        def _run_branch():
            # One siamese branch: model loss (averaged), logits and cross-entropy.
            branch_out = model.forward(**inputs)
            branch_logits = branch_out.logits
            return branch_out.loss.mean(), branch_logits, F.cross_entropy(branch_logits, labels)

        # siamese branch 1
        recon_loss_p, logits_p, cls_loss_p = _run_branch()
        # siamese branch 2
        recon_loss_q, logits_q, cls_loss_q = _run_branch()

        # calibration
        calib_loss = self.calibrator(logits_p, logits_q)

        # final loss
        loss = 0.5 * (cls_loss_p + cls_loss_q) + 0.5 * calib_loss + 0.01 * (recon_loss_p + recon_loss_q)
        if not return_outputs:
            return loss

        outputs = SequenceClassifierOutput(
            loss=loss,
            logits=logits_p,
            hidden_states=None,
            attentions=None,
        )
        return (loss, outputs)


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import numpy as np
  4 | from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
  5 | 
  6 | def set_logger(training_args):
  7 | 
  8 |     # Setup logging
  9 |     logger = logging.getLogger(__name__)
 10 |     logging.basicConfig(
 11 |         format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
 12 |         datefmt="%m/%d/%Y %H:%M:%S",
 13 |         level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
 14 |         handlers=[logging.FileHandler(training_args.output_dir + "/train.log", 'w', encoding='utf-8'),
 15 |                   logging.StreamHandler()]
 16 |     )
 17 |     logger.warning(
 18 |         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
 19 |         training_args.local_rank,
 20 |         training_args.device,
 21 |         training_args.n_gpu,
 22 |         bool(training_args.local_rank != -1),
 23 |         training_args.fp16,
 24 |     )
 25 |     logger.info("Training/evaluation parameters %s", training_args)
 26 | 
 27 |     return logger
 28 | 
 29 | 
 30 | def path_checker(training_args):
 31 |     if (
 32 |             os.path.exists(training_args.output_dir)
 33 |             and os.listdir(training_args.output_dir)
 34 |             and training_args.do_train
 35 |             and not training_args.overwrite_output_dir
 36 |     ):
 37 |         raise ValueError(
 38 |             f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
 39 |         )
 40 | 
 41 |     if not os.path.exists(training_args.logging_dir):
 42 |         os.mkdir(training_args.logging_dir)
 43 | 
 44 | 
 45 | def simple_accuracy(preds, labels):
 46 |     return (preds == labels).mean()
 47 | 
 48 | 
 49 | def metrics_fn(p):
 50 |     preds = np.argmax(p.predictions, axis=1)
 51 |     return {"acc": simple_accuracy(preds, p.label_ids)}
 52 | 
 53 | 
 54 | def prediction(logit):
 55 |     return np.argmax(logit, axis=1)
 56 | 
 57 | def find_thres(fpr, thresholds, target_fpr=0.01):
 58 |     idx = 0
 59 |     while fpr[idx+1] <= target_fpr:
 60 |         idx += 1
 61 |     return {'fpr': fpr[idx], 'threshold': thresholds[idx]}
 62 | 
 63 | def compute_metrics(y_true, y_pred, y_score):
 64 |     clf_report = classification_report(y_true, y_pred, output_dict=True)
 65 |     auc = roc_auc_score(y_true, y_score)
 66 |     hp_fpr, hp_tpr, hp_thresholds = roc_curve(y_true, 1-y_score, pos_label=0)# human as positive samples
 67 |     hp_fpr_thres = find_thres(hp_fpr, hp_thresholds, target_fpr=0.01)
 68 |     aip_fpr, aip_tpr, aip_thresholds = roc_curve(y_true, y_score, pos_label=1) # ai as positive samples
 69 |     aip_fpr_thres = find_thres(aip_fpr, aip_thresholds, target_fpr=0.01)
 70 |     # con_mat = confusion_matrix(y_true, preds)
 71 |     return {
 72 |         "AUC": auc,
 73 |         "hp_fpr": hp_fpr_thres['fpr'], 
 74 |         "hp_threshold_chatgpt": 1 - hp_fpr_thres['threshold'],
 75 |         "hp_threshold_human": hp_fpr_thres['threshold'], 
 76 |         "aip_fpr": aip_fpr_thres['fpr'], 
 77 |         "aip_threshold_chatgpt": aip_fpr_thres['threshold'],
 78 |         "aip_threshold_human": 1 - aip_fpr_thres['threshold'],
 79 |         "acc": clf_report['accuracy'],
 80 |         "precision_overall_weighted": clf_report['weighted avg']['precision'],
 81 |         "recall_overall_weighted": clf_report['weighted avg']['recall'],
 82 |         "fscore_overall_weighted": clf_report['weighted avg']['f1-score'],
 83 |         "precision_chatgpt": clf_report['1']['precision'],
 84 |         "recall_chatgpt": clf_report['1']['recall'],
 85 |         "fscore_chatgpt": clf_report['1']['f1-score'],
 86 |         "support_chatgpt": clf_report['1']['support'],
 87 |         "precision_human": clf_report['0']['precision'],
 88 |         "recall_human": clf_report['0']['recall'],
 89 |         "fscore_human": clf_report['0']['f1-score'],
 90 |         "support_human": clf_report['0']['support'],
 91 |         # "confusion_matrix": con_mat.tolist()
 92 |     }
 93 | 
 94 | def mask_tokens(strings, mask_percentage=0.3, mask_token='<mask>'):
 95 |     masked_strings = []
 96 | 
 97 |     for string in strings:
 98 |         tokens = np.array(string.split())
 99 |         num_tokens = len(tokens)
100 |         num_masked_tokens = int(num_tokens * mask_percentage)
101 | 
102 |         masked_indices = np.random.choice(num_tokens, num_masked_tokens, replace=False)
103 |         masked_tokens = np.where(np.isin(np.arange(num_tokens), masked_indices), mask_token, tokens)
104 |         masked_string = ' '.join(masked_tokens)
105 |         masked_strings.append(masked_string)
106 | 
107 |     return masked_strings
108 | 
def ensemble_mask_tokens(strings, mask_percentage=0.3, ensemble_num=3, mask_token='<mask>'):
    """Create ``ensemble_num`` independently masked copies of every string.

    Each copy replaces ``int(num_tokens * mask_percentage)`` randomly chosen
    whitespace-separated tokens with ``mask_token``.

    strings: (list[str]): List of strings
    Returns: (list[str]): List of ``len(strings) * ensemble_num`` strings; the
        copies of one input are consecutive and inputs keep their order.
    """
    masked_strings = []
    for string in strings:
        # Tokenisation does not depend on the copy index, so do it once per string.
        words = string.split()
        total = len(words)
        how_many = int(total * mask_percentage)

        for _ in range(ensemble_num):
            chosen = set(np.random.choice(total, how_many, replace=False).tolist())
            rebuilt = [mask_token if position in chosen else word for position, word in enumerate(words)]
            masked_strings.append(' '.join(rebuilt))
    return masked_strings


--------------------------------------------------------------------------------