├── .gitignore ├── LICENSE ├── README.md ├── code ├── generate_r_clm.py ├── generate_r_clm_from_scratch.py ├── ghostbuster │ ├── README.md │ ├── n_gram.py │ ├── symbolic.py │ └── train_lr.py ├── r_clm │ ├── ai_dataset.py │ ├── ai_loader.py │ └── ai_optimizer.py ├── r_detect │ ├── ai_dataset.py │ ├── ai_loader.py │ ├── ai_model.py │ └── ai_optimizer.py ├── r_embed │ ├── ai_dataset.py │ ├── ai_loader.py │ ├── ai_model.py │ └── ai_optimizer.py ├── r_ranking │ ├── ai_dataset.py │ ├── ai_loader.py │ ├── ai_model.py │ └── ai_optimizer.py ├── train_r_clm.py ├── train_r_clm_from_scratch.py ├── train_r_detect.py ├── train_r_dpo.py ├── train_r_embed.py ├── train_r_ranking.py ├── trainer_ranking_loss.py └── utils │ ├── metric_utils.py │ └── train_utils.py ├── conf ├── r_clm │ ├── conf_r_clm.yaml │ ├── conf_r_clm_bloom.yaml │ ├── conf_r_clm_falcon.yaml │ ├── conf_r_clm_generate.yaml │ ├── conf_r_clm_generate_bloom.yaml │ ├── conf_r_clm_generate_falcon.yaml │ ├── conf_r_clm_generate_gpt2.yaml │ ├── conf_r_clm_generate_lite_llama.yaml │ ├── conf_r_clm_generate_llama13b.yaml │ ├── conf_r_clm_generate_mistral_persuade.yaml │ ├── conf_r_clm_generate_mpt.yaml │ ├── conf_r_clm_generate_opt.yaml │ ├── conf_r_clm_generate_pythia.yaml │ ├── conf_r_clm_generate_tiny_llama.yaml │ ├── conf_r_clm_gpt2.yaml │ ├── conf_r_clm_lite_llama.yaml │ ├── conf_r_clm_llama13b.yaml │ ├── conf_r_clm_mistral_persuade.yaml │ ├── conf_r_clm_mpt.yaml │ ├── conf_r_clm_opt.yaml │ ├── conf_r_clm_pythia.yaml │ ├── conf_r_clm_tiny_llama.yaml │ ├── conf_r_dpo.yaml │ └── conf_r_dpo_generate.yaml ├── r_detect │ ├── conf_r_detect_mix_v16.yaml │ └── conf_r_detect_mix_v26.yaml ├── r_embed │ └── conf_r_embed.yaml └── r_ranking │ └── conf_r_ranking_large.yaml ├── requirements.txt └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Raja Biswas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains our code and configurations for the **LLM - Detect AI Generated Text** competition. The summary of the solution is posted [here](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/470121). Please refer to the following sections for details on training and dependencies. 2 | 3 | ## Section 1: Setup 4 | ### 1.1 Hardware 5 | **Jarvislabs.ai** was our primary source of compute. Specifically, models were trained on the following instance: 6 | 7 | Ubuntu 20.04.5 LTS (128 GB boot disk) 8 | Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz (7 vCPUs) 9 | 4 x NVIDIA A100 40GB GPU OR 4 x NVIDIA A6000 48GB GPU 10 | 11 | ### 1.2 Software 12 | I used PyTorch-2.1 image from Jarvislabs.ai, which comes with: 13 | 14 | * Python 3.10.11 15 | * CUDA 12.3 16 | 17 | ### 1.3 Dependencies 18 | Please clone the repository and install the required packages using the following commands: 19 | 20 | ``` 21 | git clone https://github.com/rbiswasfc/llm-detect-ai.git 22 | cd llm-detect-ai 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ### 1.4 Datasets 27 | 28 | Please make sure Kaggle API is installed. Then run the following script to download the required datasets: 29 | 30 | ``` 31 | chmod +x ./setup.sh 32 | ./setup.sh 33 | ``` 34 | 35 | Please note that the above script will create `datasets` and `models` folder in the directory located one level above the current directory. The external datasets will be downloaded in the `datasets` folder. Instruction-tuned LLMs, which can be used to generate adversarial essays, will be downloaded in the `models` folder. Total size of downloaded data and model files is ~8GB. 36 | 37 | ## Section 2: Training 38 | Training scripts and configurations are located in the `code` and `conf` folders respectively. We leveraged HF `accelerate` library to execute training runs with DDP on multiple GPUs (4x A100). Specifically, we used the following configurations for training: 39 | 40 | ```yaml 41 | compute_environment: LOCAL_MACHINE 42 | debug: false 43 | distributed_type: MULTI_GPU 44 | downcast_bf16: 'no' 45 | gpu_ids: all 46 | machine_rank: 0 47 | main_training_function: main 48 | mixed_precision: 'no' 49 | num_machines: 1 50 | num_processes: 4 51 | rdzv_backend: static 52 | same_network: true 53 | tpu_env: [] 54 | tpu_use_cluster: false 55 | tpu_use_sudo: false 56 | use_cpu: false 57 | ``` 58 | 59 | ### 2.1 LLM Models 60 | For (Q)LoRA fine-tuning of the LLM models, please run the following commands: 61 | 62 | ```bash 63 | accelerate launch ./code/train_r_detect.py \ 64 | --config-name conf_r_detect_mix_v16 \ 65 | use_wandb=false 66 | ``` 67 | 68 | ```bash 69 | accelerate launch ./code/train_r_detect.py \ 70 | --config-name conf_r_detect_mix_v26 \ 71 | use_wandb=false 72 | ``` 73 | 74 | Please note that training takes ~3 hours for `mix_v16` and ~4 hours for `mix_v26`. 75 | 76 | ### 2.2 DeBERTa Ranking Models 77 | 78 | To training the `deberta-v3-large` model with ranking loss, please run the following command: 79 | 80 | ```bash 81 | accelerate launch ./code/train_r_ranking.py \ 82 | --config-name conf_r_ranking_large \ 83 | use_wandb=false 84 | ``` 85 | 86 | ### 2.3 Embedding model 87 | 88 | We trained an embedding model with supervised contrastive loss to find similar essays (KNN neighbors) for a given essay in the test set. 89 | 90 | ```bash 91 | accelerate launch ./code/train_r_embed.py \ 92 | --config-name conf_r_embed \ 93 | use_wandb=false 94 | ``` 95 | 96 | ## Section 3: Text Generation 97 | 98 | We fine-tuned a wide variety of LLMs using the CLM objective on [PERSUADE](https://www.kaggle.com/datasets/nbroad/persaude-corpus-2) corpus to produce student like essays. The fine-tuned checkpoints were uploaded as a Kaggle Dataset `conjuring92/detect-ai-persuade-clm-ckpts`. These checkpoints can be used to generate essays using the following commands: 99 | 100 | ```bash 101 | accelerate launch ./code/generate_r_clm.py \ 102 | --config_path ./conf/r_clm/conf_r_clm_generate.yaml 103 | 104 | accelerate launch ./code/generate_r_clm.py \ 105 | --config_path ./conf/r_clm/conf_r_clm_generate_tiny_llama.yaml 106 | 107 | accelerate launch ./code/generate_r_clm.py \ 108 | --config_path ./conf/r_clm/conf_r_clm_generate_pythia.yaml 109 | 110 | accelerate launch ./code/generate_r_clm.py \ 111 | --config_path ./conf/r_clm/conf_r_clm_generate_bloom.yaml 112 | 113 | accelerate launch ./code/generate_r_clm.py \ 114 | --config_path ./conf/r_clm/conf_r_clm_generate_gpt2.yaml 115 | 116 | accelerate launch ./code/generate_r_clm.py \ 117 | --config_path ./conf/r_clm/conf_r_clm_generate_opt.yaml 118 | 119 | accelerate launch ./code/generate_r_clm.py \ 120 | --config_path ./conf/r_clm/conf_r_clm_generate_falcon.yaml 121 | 122 | accelerate launch ./code/generate_r_clm.py \ 123 | --config_path ./conf/r_clm/conf_r_clm_generate_mpt.yaml 124 | 125 | accelerate launch ./code/generate_r_clm.py \ 126 | --config_path ./conf/r_clm/conf_r_clm_generate_llama13b.yaml 127 | 128 | accelerate launch ./code/generate_r_clm_from_scratch.py \ 129 | --config_path ./conf/r_clm/conf_r_clm_generate_mistral_persuade.yaml 130 | ``` 131 | 132 | Optionally, the fine-tuning of LLMs for text generation can be done using the following commands: 133 | 134 | ```bash 135 | accelerate launch ./code/train_r_clm.py \ 136 | --config-name conf_r_clm_tiny_llama \ 137 | use_wandb=false 138 | 139 | accelerate launch ./code/train_r_clm.py \ 140 | --config-name conf_r_clm_pythia \ 141 | use_wandb=false 142 | 143 | accelerate launch ./code/train_r_clm.py \ 144 | --config-name conf_r_clm_bloom \ 145 | use_wandb=false 146 | 147 | accelerate launch ./code/train_r_clm.py \ 148 | --config-name conf_r_clm_gpt2 \ 149 | use_wandb=false 150 | 151 | accelerate launch ./code/train_r_clm.py \ 152 | --config-name conf_r_clm_opt \ 153 | use_wandb=false 154 | 155 | accelerate launch ./code/train_r_clm.py \ 156 | --config-name conf_r_clm_falcon \ 157 | use_wandb=false 158 | 159 | accelerate launch ./code/train_r_clm.py \ 160 | --config-name conf_r_clm_mpt \ 161 | use_wandb=false 162 | 163 | accelerate launch ./code/train_r_clm.py \ 164 | --config-name conf_r_clm_llama13b \ 165 | use_wandb=false 166 | 167 | accelerate launch ./code/train_r_clm_from_scratch.py \ 168 | --config-name conf_r_clm_mistral_persuade \ 169 | use_wandb=false 170 | ``` -------------------------------------------------------------------------------- /code/generate_r_clm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import random 5 | import string 6 | from itertools import chain 7 | 8 | import pandas as pd 9 | import torch 10 | from accelerate import Accelerator 11 | from omegaconf import OmegaConf 12 | from peft import PeftModel 13 | from tqdm.auto import tqdm 14 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig 15 | 16 | 17 | def generate_random_string(): 18 | chars = string.ascii_lowercase + string.digits 19 | return 'e_' + ''.join(random.choice(chars) for _ in range(8)) 20 | 21 | 22 | def get_instruction(inputs): 23 | ret = f""" 24 | Prompt: {inputs['prompt_name']} 25 | Task: {inputs['task']} 26 | Score: {inputs['holistic_essay_score']} 27 | Student Grade Level: {inputs['grade_level']} 28 | English Language Learner: {inputs['ell_status']} 29 | Disability Status: {inputs['student_disability_status']} 30 | """.strip() 31 | n_chars = random.randint(16, 64) 32 | 33 | start = inputs['text'][:n_chars] 34 | 35 | ret = f"### Instruction:\n{ret}\n\n### Response: {start}" 36 | return ret 37 | 38 | 39 | def get_inputs(prompt, tokenizer, n=1): 40 | return tokenizer([prompt]*n, return_tensors="pt") 41 | 42 | 43 | def process_response(texts): 44 | ret = [] 45 | 46 | for text in texts: 47 | if "" in text: 48 | text = text.split("### Response:")[-1].split("")[0].strip() 49 | else: 50 | text = text.split("### Response:")[-1].split("<|endoftext|>")[0].strip() 51 | text = text.replace("", "") 52 | ret.append(text) 53 | return ret 54 | 55 | 56 | def pre_process_essay(essay_df): 57 | 58 | essay_df = essay_df[~essay_df['text'].isna()].copy() 59 | essay_df = essay_df.reset_index(drop=True) 60 | 61 | essay_df["student_disability_status"] = essay_df["student_disability_status"].fillna("Unknown") 62 | essay_df["ell_status"] = essay_df["ell_status"].fillna("Unknown") 63 | essay_df["grade_level"] = essay_df["grade_level"].fillna(-1) 64 | essay_df["holistic_essay_score"] = essay_df["holistic_essay_score"].fillna(-1) 65 | 66 | essay_df["prompt"] = essay_df.apply(get_instruction, axis=1) 67 | return essay_df 68 | 69 | 70 | def generate(cfg): 71 | accelerator = Accelerator() 72 | 73 | essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"}) 74 | essay_df = pre_process_essay(essay_df) 75 | 76 | prompts = essay_df["prompt"].values.tolist() 77 | 78 | # --------------------------------------------------- 79 | # uncomment for oversampling of certain prompts--- 80 | # prompts = [p for p in prompts if "Task: Text dependent" not in p] 81 | # prompts = [p for p in prompts if "car-free" not in p.lower()] 82 | # prompts = [p for p in prompts if "facial action" not in p.lower()] 83 | # prompts = [p for p in prompts if "electoral" not in p.lower()] 84 | # --------------------------------------------------- 85 | 86 | print(f"Number of prompts: {len(prompts)}") 87 | 88 | # model & tokenizer --- 89 | tokenizer = AutoTokenizer.from_pretrained( 90 | cfg.base_model_path, 91 | use_fast=True, 92 | padding_side="left", 93 | truncation_side="left", 94 | ) 95 | 96 | if tokenizer.pad_token is None: 97 | tokenizer.pad_token = tokenizer.unk_token 98 | tokenizer.pad_token_id = tokenizer.unk_token_id 99 | 100 | # bnb_config = BitsAndBytesConfig( 101 | # load_in_4bit=True, 102 | # bnb_4bit_quant_type="nf4", 103 | # bnb_4bit_use_double_quant=True, 104 | # bnb_4bit_compute_dtype=torch.bfloat16 105 | # ) 106 | 107 | # compute allowable tokens --- 108 | tokenized_corpus = tokenizer(essay_df['text'].values.tolist()) 109 | all_tok_ids = set(chain(*tokenized_corpus['input_ids'])) 110 | accelerator.print(f"Number of unique tokens: {len(all_tok_ids)}") 111 | out_of_corpus_token_ids = list(set(range(tokenizer.vocab_size)).difference(all_tok_ids)) 112 | accelerator.print(f"Number of out of scope tokens: {len(out_of_corpus_token_ids)}") 113 | 114 | # --- 115 | 116 | base_model = AutoModelForCausalLM.from_pretrained( 117 | cfg.base_model_path, 118 | low_cpu_mem_usage=True, 119 | torch_dtype=torch.bfloat16, 120 | # quantization_config=bnb_config, 121 | # attn_implementation="flash_attention_2", 122 | ) 123 | 124 | model = PeftModel.from_pretrained(base_model, cfg.adapter_path) 125 | model = model.merge_and_unload() 126 | model = accelerator.prepare(model) 127 | model.eval() 128 | 129 | n_examples = cfg.n_examples 130 | n_gen_per_prompt = cfg.n_gen_per_prompt 131 | output_dir = cfg.output_dir 132 | 133 | # progress_bar = tqdm(range(n_examples), disable=not accelerator.is_local_main_process) 134 | progress_bar = tqdm(range(n_examples)) 135 | 136 | for i in range(n_examples): 137 | # print(f"---- Example {i+1}/{n_examples} ------") 138 | temperature = 0.5 + 1.5 * random.random() 139 | top_k = random.randint(128, 256) 140 | penalty_alpha = random.random() 141 | guidance_scale = 1.0 + 0.25 * random.random() 142 | eta_cutoff = 1e-4 + 5e-4 * random.random() 143 | repetition_penalty = 1.2 # 1.0 + 0.2 * random.random() 144 | 145 | try: 146 | generation_config = GenerationConfig.from_pretrained( 147 | cfg.base_model_path, 148 | do_sample=True, 149 | temperature=temperature, 150 | top_k=top_k, 151 | penalty_alpha=penalty_alpha, 152 | guidance_scale=guidance_scale, 153 | max_new_tokens=cfg.max_num_tokens, 154 | pad_token_id=tokenizer.pad_token_id, 155 | eta_cutoff=eta_cutoff, 156 | # repetition_penalty=repetition_penalty, 157 | suppress_tokens=out_of_corpus_token_ids, 158 | ) 159 | 160 | except Exception as e: 161 | print(e) 162 | generation_config = GenerationConfig( 163 | # cfg.base_model_path, 164 | do_sample=True, 165 | temperature=temperature, 166 | top_k=top_k, 167 | # penalty_alpha=penalty_alpha, 168 | # guidance_scale=guidance_scale, 169 | max_new_tokens=cfg.max_num_tokens, 170 | pad_token_id=tokenizer.pad_token_id, 171 | eta_cutoff=eta_cutoff, 172 | suppress_tokens=out_of_corpus_token_ids, 173 | ) 174 | 175 | try: 176 | prompt = random.choice(prompts) 177 | this_example = dict() 178 | this_id = generate_random_string() 179 | this_example['id'] = this_id 180 | this_example['prompt'] = prompt 181 | this_example['temperature'] = temperature 182 | this_example['top_k'] = top_k 183 | this_example['guidance_scale'] = guidance_scale 184 | this_example['penalty_alpha'] = penalty_alpha 185 | 186 | inputs = get_inputs(prompt, tokenizer, n=n_gen_per_prompt) 187 | device = accelerator.device 188 | inputs = {k: v.to(device) for k, v in inputs.items()} 189 | 190 | with torch.no_grad(): 191 | output = model.generate(**inputs, generation_config=generation_config) 192 | output = tokenizer.batch_decode(output) 193 | 194 | output = process_response(output) 195 | this_example['responses'] = output 196 | 197 | with open(f"{output_dir}/{this_id}.json", "w") as f: 198 | json.dump(this_example, f) 199 | 200 | except Exception as e: 201 | print(e) 202 | progress_bar.update(1) 203 | progress_bar.close() 204 | 205 | 206 | if __name__ == "__main__": 207 | 208 | ap = argparse.ArgumentParser() 209 | ap.add_argument('--config_path', type=str, required=True) 210 | 211 | args = ap.parse_args() 212 | cfg = OmegaConf.load(args.config_path) 213 | 214 | os.makedirs(cfg.output_dir, exist_ok=True) 215 | 216 | # execution 217 | generate(cfg) 218 | -------------------------------------------------------------------------------- /code/generate_r_clm_from_scratch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import random 5 | import string 6 | 7 | import pandas as pd 8 | import torch 9 | from accelerate import Accelerator 10 | from omegaconf import OmegaConf 11 | from peft import PeftModel 12 | from tqdm.auto import tqdm 13 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 14 | BitsAndBytesConfig, GenerationConfig) 15 | 16 | 17 | def generate_random_string(): 18 | chars = string.ascii_lowercase + string.digits 19 | return 'e_' + ''.join(random.choice(chars) for _ in range(8)) 20 | 21 | 22 | def get_instruction(inputs): 23 | ret = f""" 24 | Prompt: {inputs['prompt_name']} 25 | Task: {inputs['task']} 26 | Score: {inputs['holistic_essay_score']} 27 | Student Grade Level: {inputs['grade_level']} 28 | English Language Learner: {inputs['ell_status']} 29 | Disability Status: {inputs['student_disability_status']} 30 | """.strip() 31 | n_chars = random.randint(64, 128) 32 | 33 | start = inputs['text'][:n_chars] 34 | 35 | ret = f"### Instruction:\n{ret}\n\n### Response: {start}" 36 | return ret 37 | 38 | 39 | def get_inputs(prompt, tokenizer, n=1): 40 | return tokenizer([prompt]*n, return_tensors="pt") 41 | 42 | 43 | def process_response(texts): 44 | ret = [] 45 | 46 | for text in texts: 47 | if "" in text: 48 | text = text.split("### Response:")[-1].split("")[0].strip() 49 | else: 50 | text = text.split("### Response:")[-1].split("<|endoftext|>")[0].strip() 51 | text = text.replace("", "") 52 | ret.append(text) 53 | return ret 54 | 55 | 56 | def pre_process_essay(essay_df): 57 | 58 | essay_df = essay_df[~essay_df['text'].isna()].copy() 59 | essay_df = essay_df.reset_index(drop=True) 60 | 61 | essay_df["student_disability_status"] = essay_df["student_disability_status"].fillna("Unknown") 62 | essay_df["ell_status"] = essay_df["ell_status"].fillna("Unknown") 63 | essay_df["grade_level"] = essay_df["grade_level"].fillna(-1) 64 | essay_df["holistic_essay_score"] = essay_df["holistic_essay_score"].fillna(-1) 65 | 66 | essay_df["prompt"] = essay_df.apply(get_instruction, axis=1) 67 | return essay_df 68 | 69 | 70 | def generate(cfg): 71 | accelerator = Accelerator() 72 | 73 | essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"}) 74 | essay_df = pre_process_essay(essay_df) 75 | 76 | prompts = essay_df["prompt"].values.tolist() 77 | # prompts = [p for p in prompts if "Task: Text dependent" in p] 78 | 79 | # prompts = [p for p in prompts if "car-free" not in p.lower()] 80 | # prompts = [p for p in prompts if "facial action" not in p.lower()] 81 | # prompts = [p for p in prompts if "electoral" not in p.lower()] 82 | 83 | print(f"Number of prompts: {len(prompts)}") 84 | 85 | # model & tokenizer --- 86 | tokenizer = AutoTokenizer.from_pretrained( 87 | cfg.model_path, 88 | use_fast=True, 89 | padding_side="left", 90 | truncation_side="left", 91 | ) 92 | 93 | if tokenizer.pad_token is None: 94 | tokenizer.pad_token = tokenizer.unk_token 95 | tokenizer.pad_token_id = tokenizer.unk_token_id 96 | 97 | # bnb_config = BitsAndBytesConfig( 98 | # load_in_4bit=True, 99 | # bnb_4bit_quant_type="nf4", 100 | # bnb_4bit_use_double_quant=True, 101 | # bnb_4bit_compute_dtype=torch.bfloat16 102 | # ) 103 | 104 | model = AutoModelForCausalLM.from_pretrained( 105 | cfg.model_path, 106 | torch_dtype=torch.bfloat16, 107 | ) 108 | 109 | model = accelerator.prepare(model) 110 | model.eval() 111 | 112 | n_examples = cfg.n_examples 113 | n_gen_per_prompt = cfg.n_gen_per_prompt 114 | output_dir = cfg.output_dir 115 | 116 | progress_bar = tqdm(range(n_examples)) 117 | 118 | for i in range(n_examples): 119 | # print(f"---- Example {i+1}/{n_examples} ------") 120 | temperature = 1.5 # + 0.75 * random.random() 121 | top_k = 512 # random.randint(4, 8) 122 | penalty_alpha = 0.5 # random.random() 123 | guidance_scale = 1.1 # + 0.5 * random.random() 124 | eta_cutoff = 1e-4 + 5e-4 * random.random() 125 | repetition_penalty = 1.2 # 1.0 + 0.2 * random.random() 126 | 127 | try: 128 | generation_config = GenerationConfig.from_pretrained( 129 | cfg.model_path, 130 | do_sample=True, 131 | temperature=temperature, 132 | top_k=top_k, 133 | penalty_alpha=penalty_alpha, 134 | guidance_scale=guidance_scale, 135 | max_new_tokens=cfg.max_num_tokens, 136 | pad_token_id=tokenizer.pad_token_id, 137 | # eta_cutoff=eta_cutoff, 138 | # repetition_penalty=repetition_penalty, 139 | ) 140 | except Exception as e: 141 | print(e) 142 | generation_config = GenerationConfig( 143 | # cfg.base_model_path, 144 | do_sample=True, 145 | temperature=temperature, 146 | top_k=top_k, 147 | penalty_alpha=penalty_alpha, 148 | guidance_scale=guidance_scale, 149 | max_new_tokens=cfg.max_num_tokens, 150 | pad_token_id=tokenizer.pad_token_id, 151 | eta_cutoff=eta_cutoff, 152 | ) 153 | 154 | try: 155 | prompt = random.choice(prompts) 156 | this_example = dict() 157 | this_id = generate_random_string() 158 | this_example['id'] = this_id 159 | this_example['prompt'] = prompt 160 | this_example['temperature'] = temperature 161 | this_example['top_k'] = top_k 162 | this_example['guidance_scale'] = guidance_scale 163 | this_example['penalty_alpha'] = penalty_alpha 164 | # this_example['typical_p'] = typical_p 165 | 166 | inputs = get_inputs(prompt, tokenizer, n=n_gen_per_prompt) 167 | device = accelerator.device 168 | inputs = {k: v.to(device) for k, v in inputs.items()} 169 | 170 | with torch.no_grad(): 171 | output = model.module.generate(**inputs, generation_config=generation_config) 172 | output = tokenizer.batch_decode(output) 173 | 174 | output = process_response(output) 175 | this_example['responses'] = output 176 | 177 | with open(f"{output_dir}/{this_id}.json", "w") as f: 178 | json.dump(this_example, f) 179 | 180 | except Exception as e: 181 | print(e) 182 | progress_bar.update(1) 183 | progress_bar.close() 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | ap = argparse.ArgumentParser() 189 | ap.add_argument('--config_path', type=str, required=True) 190 | 191 | args = ap.parse_args() 192 | cfg = OmegaConf.load(args.config_path) 193 | 194 | os.makedirs(cfg.output_dir, exist_ok=True) 195 | 196 | # execution 197 | generate(cfg) 198 | -------------------------------------------------------------------------------- /code/ghostbuster/README.md: -------------------------------------------------------------------------------- 1 | # Ghostbuster instructions 2 | 3 | 4 | 1. First get the logprobs for all of your texts using two models with the same tokenizers. I used tinyllama and llama7b. These should be saved in a directory where each text has a separate file. The format for the files is `token logprobs\ntoken logprobs` and so on. 5 | 6 | 2. Then run `run.py` to get the features for the texts. 7 | 3. Finally, run `train_lr.py` to train the model on the features. Below is the command used to run the script. 8 | 9 | ```sh 10 | python train_lr.py \ 11 | --feature_path "../custom_7b-tl-ft-ignore-25-cmin4-cmax4-m20" \ 12 | --model_type "vote" \ 13 | --C 100 \ 14 | --train_on_all_data 15 | ``` 16 | 17 | Note, use `--train_on_all_data` only after finding a good value of C -------------------------------------------------------------------------------- /code/ghostbuster/n_gram.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | from collections import defaultdict, Counter 3 | from transformers import PreTrainedTokenizerBase 4 | import numpy as np 5 | from nltk import ngrams 6 | 7 | 8 | # NGramModels from here: https://github.com/vivek3141/ghostbuster/blob/9831b53a8ecbfe401d47616db95b9256b9cbaadd/utils/n_gram.py#L1 9 | class NGramModel: 10 | """ 11 | An n-gram model, where alpha is the laplace smoothing parameter. 12 | """ 13 | 14 | def __init__(self, train_text, n=2, alpha=3e-3, vocab_size=None): 15 | self.n = n 16 | if vocab_size is None: 17 | # Assume GPT tokenizer 18 | self.vocab_size = 50257 19 | else: 20 | self.vocab_size = vocab_size 21 | 22 | self.smoothing = alpha 23 | self.smoothing_f = alpha * self.vocab_size 24 | 25 | self.c = defaultdict(lambda: [0, Counter()]) 26 | for i in tqdm.tqdm(range(len(train_text) - n)): 27 | n_gram = tuple(train_text[i : i + n]) 28 | self.c[n_gram[:-1]][1][n_gram[-1]] += 1 29 | self.c[n_gram[:-1]][0] += 1 30 | self.n_size = len(self.c) 31 | 32 | def n_gram_probability(self, n_gram): 33 | assert len(n_gram) == self.n 34 | it = self.c[tuple(n_gram[:-1])] 35 | prob = (it[1][n_gram[-1]] + self.smoothing) / (it[0] + self.smoothing_f) 36 | return prob 37 | 38 | 39 | class DiscountBackoffModel(NGramModel): 40 | """ 41 | An n-gram model with discounting and backoff. Delta is the discounting parameter. 42 | """ 43 | 44 | def __init__(self, train_text, lower_order_model, n=2, delta=0.9, vocab_size=None): 45 | super().__init__(train_text, n=n, vocab_size=vocab_size) 46 | self.lower_order_model = lower_order_model 47 | self.discount = delta 48 | 49 | def n_gram_probability(self, n_gram): 50 | assert len(n_gram) == self.n 51 | it = self.c[tuple(n_gram[:-1])] 52 | 53 | if it[0] == 0: 54 | return self.lower_order_model.n_gram_probability(n_gram[1:]) 55 | 56 | prob = ( 57 | self.discount 58 | * (len(it[1]) / it[0]) 59 | * self.lower_order_model.n_gram_probability(n_gram[1:]) 60 | ) 61 | if it[1][n_gram[-1]] != 0: 62 | prob += max(it[1][n_gram[-1]] - self.discount, 0) / it[0] 63 | 64 | return prob 65 | 66 | 67 | class KneserNeyBaseModel(NGramModel): 68 | """ 69 | A Kneser-Ney base model, where n=1. 70 | """ 71 | 72 | def __init__(self, train_text, vocab_size=None): 73 | super().__init__(train_text, n=1, vocab_size=vocab_size) 74 | 75 | base_cnt = defaultdict(set) 76 | for i in range(1, len(train_text)): 77 | base_cnt[train_text[i]].add(train_text[i - 1]) 78 | 79 | cnt = 0 80 | for word in base_cnt: 81 | cnt += len(base_cnt[word]) 82 | 83 | self.prob = defaultdict(float) 84 | for word in base_cnt: 85 | self.prob[word] = len(base_cnt[word]) / cnt 86 | 87 | def n_gram_probability(self, n_gram): 88 | assert len(n_gram) == 1 89 | ret_prob = self.prob[n_gram[0]] 90 | 91 | if ret_prob == 0: 92 | return 1 / self.vocab_size 93 | else: 94 | return ret_prob 95 | 96 | 97 | class TrigramBackoff: 98 | """ 99 | A trigram model with discounting and backoff. Uses a Kneser-Ney base model. 100 | """ 101 | 102 | def __init__(self, train_text, delta=0.9, vocab_size=None): 103 | self.base = KneserNeyBaseModel(train_text, vocab_size=vocab_size) 104 | self.bigram = DiscountBackoffModel( 105 | train_text, self.base, n=2, delta=delta, vocab_size=vocab_size 106 | ) 107 | self.trigram = DiscountBackoffModel( 108 | train_text, self.bigram, n=3, delta=delta, vocab_size=vocab_size 109 | ) 110 | 111 | def n_gram_probability(self, n_gram): 112 | assert len(n_gram) == 3 113 | return self.trigram.n_gram_probability(n_gram) 114 | 115 | 116 | def score_ngram(doc, model, tokenizer, n=3, strip_first=False, bos_token_id=50256): 117 | """ 118 | Returns vector of ngram probabilities given document, model and tokenizer 119 | 120 | Slightly modified from here: https://github.com/vivek3141/ghostbuster/blob/9831b53a8ecbfe401d47616db95b9256b9cbaadd/utils/featurize.py#L65-L75 121 | """ 122 | scores = [] 123 | if strip_first: 124 | doc = " ".join(doc.split()[:1000]) 125 | 126 | if isinstance(tokenizer.__self__, PreTrainedTokenizerBase): 127 | tokens = tokenizer(doc.strip(), add_special_tokens=True) 128 | 129 | # tokens[0] is bos token 130 | tokens = (n - 1) * [tokens[0]] + tokens 131 | else: 132 | eos_token_id = 50256 # eos/bos token for davinci model 133 | tokens = (n - 1) * [eos_token_id] + tokenizer(doc.strip()) 134 | 135 | # for k tokens and ngrams of size n, need to add n-1 tokens to the beginning 136 | # to ensure that there are k ngrams 137 | for i in ngrams(tokens, n): 138 | scores.append(model.n_gram_probability(i)) 139 | 140 | return np.array(scores) 141 | -------------------------------------------------------------------------------- /code/ghostbuster/symbolic.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from pathlib import Path 3 | from collections import defaultdict 4 | from functools import partial 5 | 6 | from datasets import Dataset 7 | from tqdm.auto import tqdm 8 | from nltk.corpus import brown 9 | import numpy as np 10 | import torch 11 | 12 | from n_gram import score_ngram, TrigramBackoff 13 | 14 | # Some code copied/modified from here: https://github.com/vivek3141/ghostbuster/blob/9831b53a8ecbfe401d47616db95b9256b9cbaadd/utils/symbolic.py#L16 15 | 16 | 17 | def train_trigram(tokenizer_name, verbose=True, return_tokenizer=False): 18 | """ 19 | Trains and returns a trigram model on the brown corpus 20 | """ 21 | 22 | if tokenizer_name == "davinci": 23 | import tiktoken 24 | 25 | enc = tiktoken.encoding_for_model("davinci") 26 | tokenizer = enc.encode 27 | vocab_size = enc.n_vocab 28 | 29 | else: 30 | from transformers import AutoTokenizer 31 | 32 | enc = AutoTokenizer.from_pretrained(tokenizer_name) 33 | tokenizer = enc.encode 34 | vocab_size = len(enc) 35 | 36 | # We use the brown corpus to train the n-gram model 37 | sentences = brown.sents() 38 | 39 | if verbose: 40 | print("Tokenizing corpus...") 41 | tokenized_corpus = [] 42 | for sentence in tqdm(sentences): 43 | tokens = tokenizer(" ".join(sentence)) 44 | tokenized_corpus += tokens 45 | 46 | if verbose: 47 | print("\nTraining n-gram model...") 48 | 49 | if return_tokenizer: 50 | return TrigramBackoff(tokenized_corpus, vocab_size=vocab_size), tokenizer 51 | else: 52 | return TrigramBackoff(tokenized_corpus, vocab_size=vocab_size) 53 | 54 | 55 | def ds_from_files( 56 | file_dir, 57 | model1_name, 58 | model2_name, 59 | tokenizer, 60 | trigram, 61 | num_tokens=2047, 62 | num_proc=4, 63 | ): 64 | """ 65 | file_dir should be a path to raw text files. 66 | the logprob directory should be in file_dir 67 | 68 | raw_text_files should have filename `{id}.txt` 69 | 70 | logprob files should have filename `{id}-{model_name}.txt` 71 | """ 72 | 73 | file_dir = Path(file_dir) 74 | 75 | ds = Dataset.from_dict( 76 | {"raw_text_filepath": list(map(str, file_dir.glob("*.txt")))} 77 | ) 78 | 79 | # ds = ds.select(range(1000)) 80 | 81 | ds = ds.map( 82 | lambda x: {"text": open(x["raw_text_filepath"]).read()}, 83 | num_proc=num_proc, 84 | desc="Reading raw text files", 85 | ) 86 | ds = ds.map( 87 | lambda x: {"id": Path(x["raw_text_filepath"]).stem}, 88 | num_proc=num_proc, 89 | desc="Adding id", 90 | ) 91 | 92 | def load_probs(example, model_name): 93 | with open(file_dir / "logprobs" / f"{example['id']}-{model_name}.txt") as fp: 94 | data = fp.read().strip().split("\n") 95 | 96 | tokens, logprobs = [], [] 97 | for row in data: 98 | if len(row.split()) != 2: 99 | print([row]) 100 | 101 | if row[0] != " ": 102 | row = "Ġ" * len(row.split(" ")[0]) + row.split(" ")[1] 103 | 104 | row = "Ġ" * len(row.split(" ")) + row[1:] 105 | 106 | tokens.append(row.split()[0]) 107 | logprobs.append(row.split()[1]) 108 | 109 | probs = np.exp(np.array(list(map(float, logprobs))[:num_tokens])) 110 | 111 | return {"tokens": tokens, f"{model_name}-probs": probs} 112 | 113 | for m in [model1_name, model2_name]: 114 | ds = ds.map( 115 | load_probs, 116 | num_proc=num_proc, 117 | fn_kwargs={"model_name": m}, 118 | desc=f"Getting probs for {m}", 119 | ) 120 | 121 | def add_ngrams(example, n): 122 | model = trigram if n == 3 else trigram.base 123 | 124 | prefix = "uni" if n == 1 else "tri" 125 | 126 | ng = score_ngram(example["text"], model, tokenizer, n=n) 127 | other = len(example[f"{model1_name}-probs"]) 128 | if len(ng) > other: 129 | ng = ng[1 : other + 1] 130 | 131 | return {f"{prefix}gram-probs": ng} 132 | 133 | for n in [1, 3]: 134 | ds = ds.map( 135 | add_ngrams, 136 | num_proc=1, 137 | fn_kwargs={"n": n}, 138 | desc=f"Adding {n}-gram probabilities", 139 | ) 140 | 141 | return ds.with_format("numpy") 142 | 143 | 144 | vec_functions = { 145 | "v-add": lambda a, b: a + b, 146 | "v-sub": lambda a, b: a - b, 147 | "v-mul": lambda a, b: a * b, 148 | "v-div": lambda a, b: np.divide( 149 | a, b, out=np.zeros_like(a), where=(b != 0), casting="unsafe" 150 | ), 151 | "v->": lambda a, b: a > b, 152 | "v-<": lambda a, b: a < b, 153 | } 154 | 155 | scalar_functions = { 156 | "s-max": max, 157 | "s-min": min, 158 | "s-avg": lambda x: sum(x) / len(x), 159 | "s-avg-top-25": lambda x: sum(sorted(x, reverse=True)[:25]) 160 | / len(sorted(x, reverse=True)[:25]), 161 | "s-len": len, 162 | "s-var": np.var, 163 | "s-l2": np.linalg.norm, 164 | } 165 | 166 | vectors = ["llm1-probs", "llm2-probs", "trigram-probs", "unigram-probs"] 167 | 168 | # Get vec_combinations 169 | vec_combinations = defaultdict(list) 170 | for vec1 in range(len(vectors)): 171 | for vec2 in range(vec1): 172 | for func in vec_functions: 173 | if func != "v-div": 174 | vec_combinations[vectors[vec1]].append(f"{func} {vectors[vec2]}") 175 | 176 | for vec1 in vectors: 177 | for vec2 in vectors: 178 | if vec1 != vec2: 179 | vec_combinations[vec1].append(f"v-div {vec2}") 180 | 181 | 182 | def get_words(exp): 183 | """ 184 | Splits up expression into words, to be individually processed 185 | """ 186 | return exp.split(" ") 187 | 188 | 189 | def backtrack_functions(prev="", max_depth=2): 190 | """ 191 | Backtrack all possible features. 192 | """ 193 | 194 | def helper(prev, depth): 195 | if depth >= max_depth: 196 | return [] 197 | 198 | all_funcs = [] 199 | prev_word = get_words(prev)[-1] 200 | 201 | for func in scalar_functions: 202 | all_funcs.append(f"{prev} {func}") 203 | 204 | for comb in vec_combinations[prev_word]: 205 | all_funcs += helper(f"{prev} {comb}", depth + 1) 206 | 207 | return all_funcs 208 | 209 | ret = [] 210 | for vec in vectors: 211 | ret += helper(vec, 0) 212 | return ret 213 | 214 | 215 | def generate_symbolic_data( 216 | ds, 217 | max_depth=2, 218 | output_file="symbolic_data", 219 | verbose=True, 220 | model1="llama-7b", 221 | model2="tinyllama", 222 | tokenizer_name="davinci", 223 | num_proc=50, 224 | limit=100, 225 | ): 226 | """ 227 | Brute forces and generates symbolic data from a dataset of text files. 228 | """ 229 | 230 | ds = ds.with_format("numpy") 231 | 232 | def calc_feats(example, exp): 233 | 234 | name_map = { 235 | "llm1-probs": f"{model1}-probs", 236 | "llm2-probs": f"{model2}-probs", 237 | } 238 | 239 | exp_tokens = exp.split(" ") 240 | # exp_tokens will be operations and the vectors to operate on 241 | # e.g. 242 | # unigram-logprobs v-sub davinci-logprobs v-div ada-logprobs s-avg 243 | 244 | model_probs_key = exp_tokens[0] 245 | if model_probs_key.startswith("llm"): 246 | model_probs_key = name_map[model_probs_key] 247 | 248 | curr = example[model_probs_key] 249 | 250 | for i in range(1, len(exp_tokens)): 251 | if exp_tokens[i] in vec_functions: 252 | model_probs_key = exp_tokens[i + 1] 253 | 254 | if model_probs_key.startswith("llm"): 255 | model_probs_key = name_map[model_probs_key] 256 | next_vec = example[model_probs_key] 257 | curr = vec_functions[exp_tokens[i]](curr, next_vec) 258 | elif exp_tokens[i] in scalar_functions: 259 | final_value = scalar_functions[exp_tokens[i]](curr) 260 | 261 | return { 262 | "feat": final_value, 263 | } 264 | 265 | all_funcs = backtrack_functions(max_depth=max_depth) 266 | 267 | if verbose: 268 | print(f"\nTotal # of Features: {len(all_funcs)}.") 269 | print("Sampling 5 features:") 270 | for i in range(5): 271 | print(all_funcs[np.random.randint(0, len(all_funcs))]) 272 | print("\nGenerating datasets...") 273 | 274 | exp_to_data = {} 275 | 276 | import random 277 | 278 | if limit is not None: 279 | to_run = random.sample(all_funcs, k=limit) 280 | else: 281 | to_run = all_funcs 282 | 283 | for exp in tqdm(to_run): 284 | exp_to_data[exp] = np.array( 285 | ds.map( 286 | calc_feats, 287 | fn_kwargs={"exp": exp}, 288 | num_proc=num_proc, 289 | keep_in_memory=True, 290 | )["feat"] 291 | ).reshape(-1, 1) 292 | 293 | pickle.dump((exp_to_data, ds["label"]), open(output_file, "wb")) 294 | 295 | 296 | def generate_custom_data( 297 | ds, 298 | output_file="custom_data", 299 | model1="llama-7b", 300 | model2="tinyllama", 301 | num_proc=50, 302 | clip_min=1e-4, 303 | clip_max=1e5, 304 | ignore_first=25, 305 | ): 306 | """ 307 | Brute forces and generates symbolic data from a dataset of text files. 308 | 309 | For each sequence and model (llm1, llm2, unigram, trigram), get 310 | - min 311 | - max 312 | - mean 313 | - median 314 | - 25% quantile 315 | - 75% quantile 316 | - l2 norm 317 | - variance 318 | 319 | Will also get the ratio of llm1/llm2, llm1/unigram, llm1/trigram, llm2/unigram, llm2/trigram, unigram/trigram 320 | 321 | 322 | Saves features to pickle file as tuple (features, labels, ids). 323 | 324 | Args: 325 | - ds: Dataset object 326 | - output_file: str, path to save the output 327 | - model1: str, name of model1 328 | - model2: str, name of model2 329 | - num_proc: int, number of processes to use 330 | - clip_min: float, minimum value to clip to 331 | - clip_max: float, maximum value to clip to 332 | - ignore_first: int, number of tokens to ignore from the beginning 333 | 334 | Returns: 335 | - None 336 | """ 337 | 338 | ds = ds.with_format("numpy") 339 | 340 | def calc_feats(example): 341 | 342 | feats = [] 343 | 344 | funcs = [ 345 | min, 346 | max, 347 | np.mean, 348 | np.median, 349 | partial(np.percentile, q=0.25), 350 | partial(np.percentile, q=0.75), 351 | partial(np.percentile, q=0.10), 352 | partial(np.percentile, q=0.90), 353 | np.linalg.norm, 354 | np.var, 355 | ] 356 | 357 | models = [ 358 | f"{model1}-probs", 359 | f"{model2}-probs", 360 | "unigram-probs", 361 | "trigram-probs", 362 | ] 363 | 364 | def ff(x, f): 365 | if len(x) <= ignore_first: 366 | return f(x) 367 | return f(x[ignore_first:]) 368 | 369 | for m in models: 370 | feats.extend([ff(example[m], f) for f in funcs]) 371 | 372 | def c(x): 373 | if len(x) <= ignore_first: 374 | return np.clip(x, clip_min, clip_max) 375 | return np.clip(x[ignore_first:], clip_min, clip_max) 376 | 377 | feats.extend( 378 | [ 379 | f(c(example[f"{model1}-probs"]) / c(example[f"{model2}-probs"])) 380 | for f in funcs 381 | ] 382 | ) 383 | feats.extend( 384 | [ 385 | f(c(example[f"{model1}-probs"]) / c(example["unigram-probs"])) 386 | for f in funcs 387 | ] 388 | ) 389 | feats.extend( 390 | [ 391 | f(c(example[f"{model1}-probs"]) / c(example["trigram-probs"])) 392 | for f in funcs 393 | ] 394 | ) 395 | feats.extend( 396 | [ 397 | f(c(example[f"{model2}-probs"]) / c(example["unigram-probs"])) 398 | for f in funcs 399 | ] 400 | ) 401 | feats.extend( 402 | [ 403 | f(c(example[f"{model2}-probs"]) / c(example["trigram-probs"])) 404 | for f in funcs 405 | ] 406 | ) 407 | feats.extend( 408 | [ 409 | f(c(example["unigram-probs"]) / c(example["trigram-probs"])) 410 | for f in funcs 411 | ] 412 | ) 413 | 414 | return { 415 | "feat": feats, 416 | } 417 | 418 | all_features = np.array( 419 | ds.map( 420 | calc_feats, 421 | num_proc=num_proc, 422 | keep_in_memory=True, 423 | )["feat"] 424 | ) 425 | 426 | pickle.dump((all_features, ds["label"], ds["id"]), open(output_file, "wb")) 427 | -------------------------------------------------------------------------------- /code/ghostbuster/train_lr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Using the features created in `run.py` train a model on the data. 3 | 4 | Using cuml makes it much, much faster on gpu. 5 | """ 6 | 7 | import argparse 8 | import math 9 | import numpy as np 10 | import json 11 | 12 | # import tiktoken 13 | import dill as pickle 14 | from functools import partial 15 | 16 | from sklearn.linear_model import LogisticRegression, LinearRegression 17 | from sklearn.metrics import f1_score, accuracy_score, roc_auc_score 18 | from sklearn.calibration import CalibratedClassifierCV 19 | from sklearn.svm import SVC 20 | from sklearn.ensemble import VotingClassifier 21 | from sklearn.linear_model import SGDClassifier 22 | from sklearn.naive_bayes import MultinomialNB 23 | 24 | from transformers import AutoTokenizer 25 | 26 | from tabulate import tabulate 27 | 28 | from featurize import normalize 29 | 30 | from cuml.svm import SVC, SVR 31 | from cuml import LogisticRegression 32 | from cuml.linear_model import ElasticNet 33 | from cuml.solvers import SGD 34 | from cuml.ensemble import RandomForestClassifier 35 | from cuml.neighbors import KNeighborsClassifier 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("--train_on_all_data", action="store_true") 40 | 41 | parser.add_argument("--feature_path", type=str) 42 | 43 | parser.add_argument( 44 | "--model1", 45 | type=str, 46 | help="name of model1 (used for folders)", 47 | default="llama-7b", 48 | ) 49 | parser.add_argument( 50 | "--model2", 51 | type=str, 52 | help="name of model2 (used for folders)", 53 | default="tinyllama", 54 | ) 55 | 56 | parser.add_argument("--log_reg", action="store_true") 57 | 58 | parser.add_argument("--model_type", type=str) 59 | parser.add_argument("--binary_labels", action="store_true") 60 | parser.add_argument("--C", type=int) 61 | 62 | args = parser.parse_args() 63 | 64 | with open(args.feature_path, "rb") as fp: 65 | features, labels, ids = pickle.load(fp) 66 | 67 | if args.binary_labels: 68 | labels = np.array([int(x > 0.5) for x in labels]) 69 | 70 | indices = np.arange(len(labels)) 71 | 72 | train_frac = 0.997 if args.train_on_all_data else 0.8 73 | 74 | np.random.shuffle(indices) 75 | train, test = ( 76 | indices[: math.floor(train_frac * len(indices))], 77 | indices[math.floor(train_frac * len(indices)) :], 78 | ) 79 | print("Train/Test Split", train, test) 80 | print("Train Size:", len(train), "Valid Size:", len(test)) 81 | print(f"Positive Labels: {sum(labels[indices])}, Total Labels: {len(indices)}") 82 | 83 | data, mu, sigma = normalize( 84 | features, 85 | ret_mu_sigma=True, 86 | ) 87 | 88 | if args.model_type == "log_reg": 89 | base = LogisticRegression(C=args.C, max_iter=10000) 90 | 91 | elif args.model_type == "svc": 92 | base = SVC(C=args.C, probability=True) 93 | 94 | elif args.model_type == "svr": 95 | base = SVR(C=args.C) 96 | 97 | elif args.model_type == "elastic": 98 | 99 | base = ElasticNet() 100 | 101 | elif args.model_type == "sgd": 102 | 103 | base = SGD() 104 | 105 | elif args.model_type == "rfc": 106 | 107 | base = RandomForestClassifier(max_depth=32, n_estimators=100, n_bins=100) 108 | 109 | elif args.model_type == "knnc": 110 | 111 | base = KNeighborsClassifier(n_neighbors=args.C) 112 | 113 | elif args.model_type == "vote": 114 | 115 | base = VotingClassifier( 116 | estimators=[ 117 | ("svc", SVC(C=args.C, probability=True)), 118 | ( 119 | "rfc", 120 | RandomForestClassifier(max_depth=64, n_estimators=100, n_bins=200), 121 | ), 122 | ], 123 | voting="soft", 124 | ) 125 | 126 | if args.binary_labels: 127 | model = CalibratedClassifierCV(base, cv=5) 128 | else: 129 | model = base 130 | 131 | if args.train_on_all_data: 132 | model.fit(data, labels) 133 | 134 | pickle.dump(model, open("model/model", "wb")) 135 | pickle.dump(mu, open("model/mu", "wb")) 136 | pickle.dump(sigma, open("model/sigma", "wb")) 137 | 138 | texts = [open(f"../../data/m20/{id}.txt").read() for id in np.array(ids)[test]] 139 | json.dump(texts, open("model/test_texts.json", "w")) 140 | json.dump(labels[test].tolist(), open("model/test_labels.json", "w")) 141 | 142 | pickle.dump((data, train, test), open("model/data.pkl", "wb")) 143 | 144 | print("Saved model to model/") 145 | else: 146 | model.fit(data[train], labels[train]) 147 | 148 | predictions = model.predict(data[test]) 149 | if args.binary_labels: 150 | probs = model.predict_proba(data[test])[:, 1] 151 | else: 152 | probs = predictions 153 | predictions = predictions > 0.5 154 | 155 | labels = np.array([int(x > 0.5) for x in labels]) 156 | 157 | result_table = [["F1", "Accuracy", "AUC"]] 158 | 159 | result_table.append( 160 | [ 161 | round(f1_score(labels[test], predictions), 3), 162 | round(accuracy_score(labels[test], predictions), 3), 163 | round(roc_auc_score(labels[test], probs), 3), 164 | ] 165 | ) 166 | 167 | print(tabulate(result_table, headers="firstrow", tablefmt="grid")) 168 | 169 | json.dump(result_table, open("model/results.json", "w")) 170 | -------------------------------------------------------------------------------- /code/r_clm/ai_dataset.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from datasets import Dataset 4 | from transformers import AutoTokenizer 5 | 6 | IGNORE_INDEX = -100 7 | 8 | 9 | def get_tokenizer(cfg): 10 | 11 | tokenizer = AutoTokenizer.from_pretrained( 12 | cfg.model.backbone_path, 13 | use_fast=cfg.model.tokenizer.use_fast, 14 | padding_side=cfg.model.tokenizer.padding_side, 15 | truncation_side=cfg.model.tokenizer.truncation_side, 16 | ) 17 | 18 | if tokenizer.pad_token is None: 19 | if tokenizer.unk_token is not None: 20 | tokenizer.pad_token = tokenizer.unk_token 21 | else: 22 | tokenizer.pad_token = tokenizer.eos_token 23 | return tokenizer 24 | 25 | # --------------- Dataset ----------------------------------------------# 26 | 27 | 28 | def get_instruction(inputs): 29 | ret = f""" 30 | Prompt: {inputs['prompt_name']} 31 | Task: {inputs['task']} 32 | Score: {inputs['holistic_essay_score']} 33 | Student Grade Level: {inputs['grade_level']} 34 | English Language Learner: {inputs['ell_status']} 35 | Disability Status: {inputs['student_disability_status']} 36 | """.strip() 37 | return ret 38 | 39 | 40 | class AiDataset: 41 | """ 42 | Dataset class for LLM Detect AI Generated Text competition 43 | """ 44 | 45 | def __init__(self, cfg): 46 | self.cfg = cfg 47 | self.tokenizer = get_tokenizer(cfg) 48 | 49 | def format_source(self, instruction): 50 | ret = f"### Instruction:\n{instruction}\n\n### Response: " 51 | return ret 52 | 53 | def format_target(self, response): 54 | return f"{response} {self.tokenizer.eos_token}" 55 | 56 | def tokenize_function(self, examples): 57 | sources = [self.format_source(s) for s in examples["instruction"]] 58 | targets = [self.format_target(t) for t in examples["text"]] 59 | chats = [s + t for s, t in zip(sources, targets)] 60 | 61 | ex_tokenized_inputs = self.tokenizer( 62 | chats, 63 | padding=False, 64 | truncation=True, 65 | max_length=self.cfg.model.max_length, 66 | ) 67 | 68 | src_tokenized_inputs = self.tokenizer( 69 | sources, 70 | padding=False, 71 | truncation=False, 72 | ) 73 | 74 | src_lens = [len(s)-1 for s in src_tokenized_inputs["input_ids"]] 75 | input_ids = ex_tokenized_inputs["input_ids"] 76 | attention_mask = ex_tokenized_inputs["attention_mask"] 77 | labels = deepcopy(input_ids) 78 | 79 | for idx, src_len in enumerate(src_lens): 80 | labels[idx][:src_len] = [IGNORE_INDEX] * src_len 81 | 82 | to_return = { 83 | "input_ids": input_ids, 84 | "attention_mask": attention_mask, 85 | "labels": labels, 86 | } 87 | 88 | return to_return 89 | 90 | def preprocess_function(self, persuade_df): 91 | persuade_df["student_disability_status"] = persuade_df["student_disability_status"].fillna("Unknown") 92 | persuade_df["ell_status"] = persuade_df["ell_status"].fillna("Unknown") 93 | persuade_df["grade_level"] = persuade_df["grade_level"].fillna(-1) 94 | persuade_df["holistic_essay_score"] = persuade_df["holistic_essay_score"].fillna(-1) 95 | persuade_df["instruction"] = persuade_df.apply(get_instruction, axis=1) 96 | return persuade_df 97 | 98 | def get_dataset(self, df): 99 | df = deepcopy(df) 100 | df = self.preprocess_function(df) 101 | task_dataset = Dataset.from_pandas(df) 102 | task_dataset = task_dataset.map( 103 | self.tokenize_function, 104 | batched=True, 105 | remove_columns=task_dataset.column_names 106 | ) 107 | return task_dataset 108 | -------------------------------------------------------------------------------- /code/r_clm/ai_loader.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from transformers import DataCollatorWithPadding 5 | 6 | 7 | @dataclass 8 | class AiCollator(DataCollatorWithPadding): 9 | """ 10 | data collector for LLM Detect AI Generated Text task 11 | """ 12 | 13 | tokenizer = None 14 | padding = True 15 | max_length = None 16 | pad_to_multiple_of = None 17 | return_tensors = "pt" 18 | 19 | def __call__(self, features): 20 | labels = None 21 | if "labels" in features[0].keys(): 22 | labels = [feature["labels"] for feature in features] 23 | 24 | features = [ 25 | { 26 | "input_ids": feature["input_ids"], 27 | "attention_mask": feature["attention_mask"], 28 | } for feature in features 29 | ] 30 | 31 | batch = self.tokenizer.pad( 32 | features, 33 | padding='longest', 34 | max_length=self.max_length, 35 | pad_to_multiple_of=self.pad_to_multiple_of, 36 | return_tensors=None, 37 | ) 38 | 39 | tensor_keys = [ 40 | "input_ids", 41 | "attention_mask", 42 | ] 43 | 44 | for key in tensor_keys: 45 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 46 | 47 | seq_len = batch["input_ids"].size(1) 48 | 49 | if labels is not None: 50 | padded_labels = [] 51 | for label in labels: 52 | padded_label = [-100] * (seq_len - len(label)) + label # left pad 53 | padded_labels.append(padded_label) 54 | batch["labels"] = torch.tensor(padded_labels, dtype=torch.int64) 55 | 56 | return batch 57 | 58 | 59 | # --- 60 | 61 | 62 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print): 63 | bs = batch['input_ids'].size(0) 64 | print_fn(f"batch size: {bs}") 65 | 66 | print_fn(f"shape of input_ids: {batch['input_ids'].shape}") 67 | 68 | n_examples = min(n_examples, bs) 69 | print_fn(f"Showing {n_examples} from a {task} batch...") 70 | 71 | print_fn("\n\n") 72 | for idx in range(n_examples): 73 | print_fn(f"Example {idx+1}") 74 | print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}") 75 | print_fn(f"Input ids:\n\n{batch['input_ids'][idx]}") 76 | if 'labels' in batch: 77 | print_fn(f"Labels:\n\n{batch['labels'][idx]}") 78 | print_fn('~~'*40) 79 | -------------------------------------------------------------------------------- /code/r_clm/ai_optimizer.py: -------------------------------------------------------------------------------- 1 | import bitsandbytes as bnb 2 | from torch import optim 3 | 4 | 5 | def get_optimizer(cfg, model, print_fn=None): 6 | _optimizers = { 7 | "Adam": optim.Adam, 8 | "AdamW": optim.AdamW, 9 | "AdamW8bit": bnb.optim.Adam8bit, 10 | } 11 | assert cfg.optimizer.name in _optimizers, f"Optimizer {cfg.optimizer.name} not supported" 12 | 13 | no_decay = ["bias", "LayerNorm.weight"] 14 | head_layer_name = "lm_head" 15 | 16 | # start with all of the candidate parameters 17 | param_dict = {name: param for name, param in model.named_parameters()} 18 | # filter out those that do not require grad 19 | param_dict = {name: param for name, param in param_dict.items() if param.requires_grad} 20 | 21 | # head & body params 22 | param_dict_head = { 23 | name: param for name, param in param_dict.items() if head_layer_name in name 24 | } 25 | param_dict_body = { 26 | name: param for name, param in param_dict.items() if head_layer_name not in name 27 | } 28 | 29 | # create groups --- 30 | head_params_no_decay = [ 31 | param for name, param in param_dict_head.items() if any(nd in name for nd in no_decay) 32 | ] 33 | head_params_decay = [ 34 | param for name, param in param_dict_head.items() if not any(nd in name for nd in no_decay) 35 | ] 36 | body_params_no_decay = [ 37 | param for name, param in param_dict_body.items() if any(nd in name for nd in no_decay) 38 | ] 39 | body_params_decay = [ 40 | param for name, param in param_dict_body.items() if not any(nd in name for nd in no_decay) 41 | ] 42 | 43 | optim_groups = [ 44 | {'params': head_params_no_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': 0}, 45 | {'params': head_params_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': cfg.optimizer.weight_decay}, 46 | {'params': body_params_no_decay, 'lr': cfg.optimizer.lr, 'weight_decay': 0}, 47 | {'params': body_params_decay, 'lr': cfg.optimizer.lr, 48 | 'weight_decay': cfg.optimizer.weight_decay * 1e-1}, # less weight decay for body 49 | ] 50 | 51 | if print_fn is not None: 52 | n_head_params_no_decay = sum(p.numel() for p in head_params_no_decay) 53 | n_head_params_decay = sum(p.numel() for p in head_params_decay) 54 | n_body_params_no_decay = sum(p.numel() for p in body_params_no_decay) 55 | n_body_params_decay = sum(p.numel() for p in body_params_decay) 56 | 57 | print_fn(f"n_head_params_no_decay: {n_head_params_no_decay}") 58 | print_fn(f"n_head_params_decay: {n_head_params_decay}") 59 | print_fn(f"n_body_params_no_decay: {n_body_params_no_decay}") 60 | print_fn(f"n_body_params_decay: {n_body_params_decay}") 61 | 62 | # Create AdamW optimizer and use the fused version if it is available 63 | optimizer = _optimizers[cfg.optimizer.name]( 64 | optim_groups, 65 | lr=cfg.optimizer.lr, 66 | weight_decay=cfg.optimizer.weight_decay, 67 | ) 68 | return optimizer 69 | -------------------------------------------------------------------------------- /code/r_detect/ai_dataset.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from datasets import Dataset 4 | from transformers import AutoTokenizer 5 | 6 | 7 | def get_tokenizer(cfg): 8 | 9 | tokenizer = AutoTokenizer.from_pretrained( 10 | cfg.model.backbone_path, 11 | use_fast=cfg.model.tokenizer.use_fast, 12 | padding_side=cfg.model.tokenizer.padding_side, 13 | truncation_side=cfg.model.tokenizer.truncation_side, 14 | ) 15 | 16 | # if the eos token is an empty string, we assign it to a token 17 | if tokenizer.eos_token == "": 18 | tokenizer.add_special_tokens({"eos_token": ""}) 19 | tokenizer.eos_token = "" 20 | 21 | if tokenizer.pad_token is None: 22 | if tokenizer.unk_token is not None: 23 | tokenizer.pad_token = tokenizer.unk_token 24 | else: 25 | tokenizer.pad_token = tokenizer.eos_token 26 | return tokenizer 27 | 28 | # --------------- Dataset ----------------------------------------------# 29 | 30 | 31 | class AiDataset: 32 | """ 33 | Dataset class for LLM Detect AI Generated Text competition 34 | """ 35 | 36 | def __init__(self, cfg): 37 | self.cfg = cfg 38 | self.tokenizer = get_tokenizer(cfg) 39 | 40 | def tokenize_function(self, examples): 41 | tz = self.tokenizer( 42 | examples["text"], 43 | padding=False, 44 | truncation=True, 45 | max_length=self.cfg.model.max_length, 46 | add_special_tokens=True, 47 | ) 48 | 49 | return tz 50 | 51 | def compute_input_length(self, examples): 52 | return {"input_length": [len(x) for x in examples["input_ids"]]} 53 | 54 | def preprocess_function(self, df): 55 | df['text'] = df['text'].apply(lambda x: x.strip() + "\n###\nIs the essay generated by AI?") 56 | return df 57 | 58 | def get_dataset(self, df): 59 | """ 60 | Main api for creating the Science Exam dataset 61 | :param df: input dataframe 62 | :type df: pd.DataFrame 63 | :return: the created dataset 64 | :rtype: Dataset 65 | """ 66 | df = deepcopy(df) 67 | df = self.preprocess_function(df) 68 | task_dataset = Dataset.from_pandas(df) 69 | 70 | task_dataset = task_dataset.map(self.tokenize_function, batched=True) 71 | task_dataset = task_dataset.map(self.compute_input_length, batched=True) 72 | 73 | return task_dataset 74 | -------------------------------------------------------------------------------- /code/r_detect/ai_loader.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import deepcopy 3 | from dataclasses import dataclass, field 4 | 5 | import torch 6 | from transformers import DataCollatorWithPadding 7 | 8 | 9 | def apply_mask_augmentation(input_ids, tokenizer, mask_prob=0.1): 10 | input_ids = deepcopy(input_ids) 11 | input_ids = torch.tensor(input_ids, dtype=torch.int64) 12 | indices_mask = torch.bernoulli(torch.full(input_ids.shape, mask_prob)).bool() 13 | 14 | do_not_mask_tokens = list(set(tokenizer.all_special_ids)) 15 | pass_gate = [ 16 | [0 if token_id in do_not_mask_tokens else 1 for token_id in token_id_seq] for token_id_seq in input_ids 17 | ] 18 | pass_gate = torch.tensor(pass_gate, dtype=torch.bool) 19 | 20 | indices_mask = torch.logical_and(indices_mask, pass_gate) 21 | input_ids[indices_mask] = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) 22 | return input_ids 23 | 24 | 25 | @dataclass 26 | class AiCollator(DataCollatorWithPadding): 27 | """ 28 | data collector for LLM Detect AI Generated Text task 29 | """ 30 | 31 | tokenizer = None 32 | padding = True 33 | max_length = None 34 | pad_to_multiple_of = None 35 | return_tensors = "pt" 36 | 37 | def __call__(self, features): 38 | 39 | buffer_dict = dict() 40 | buffer_keys = ["id"] 41 | 42 | for key in buffer_keys: 43 | if key in features[0].keys(): 44 | value = [feature[key] for feature in features] 45 | buffer_dict[key] = value 46 | 47 | labels = None 48 | if "generated" in features[0].keys(): 49 | labels = [feature["generated"] for feature in features] 50 | 51 | features = [ 52 | { 53 | "input_ids": feature["input_ids"], 54 | "attention_mask": feature["attention_mask"], 55 | } for feature in features 56 | ] 57 | 58 | batch = self.tokenizer.pad( 59 | features, 60 | padding='longest', # self.padding, 61 | max_length=self.max_length, 62 | pad_to_multiple_of=self.pad_to_multiple_of, 63 | return_tensors=None, 64 | ) 65 | 66 | # for key, value in buffer_dict.items(): 67 | # batch[key] = value 68 | 69 | if labels is not None: 70 | batch["labels"] = labels 71 | 72 | tensor_keys = [ 73 | "input_ids", 74 | "attention_mask", 75 | ] 76 | 77 | for key in tensor_keys: 78 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 79 | 80 | if labels is not None: 81 | batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32) 82 | 83 | return batch 84 | 85 | 86 | @dataclass 87 | class AiCollatorTrain(DataCollatorWithPadding): 88 | """ 89 | data collector for LLM Detect AI Generated Text task 90 | """ 91 | 92 | tokenizer = None 93 | padding = True 94 | max_length = None 95 | pad_to_multiple_of = None 96 | return_tensors = "pt" 97 | kwargs: field(default_factory=dict) = None 98 | 99 | def __post_init__(self): 100 | [setattr(self, k, v) for k, v in self.kwargs.items()] 101 | 102 | def __call__(self, features): 103 | 104 | buffer_dict = dict() 105 | buffer_keys = ["id"] 106 | 107 | for key in buffer_keys: 108 | if key in features[0].keys(): 109 | value = [feature[key] for feature in features] 110 | buffer_dict[key] = value 111 | 112 | labels = None 113 | if "generated" in features[0].keys(): 114 | labels = [feature["generated"] for feature in features] 115 | 116 | features = [ 117 | { 118 | "input_ids": feature["input_ids"], 119 | "attention_mask": feature["attention_mask"], 120 | } for feature in features 121 | ] 122 | 123 | batch = self.tokenizer.pad( 124 | features, 125 | padding='longest', # self.padding, 126 | max_length=self.max_length, 127 | pad_to_multiple_of=self.pad_to_multiple_of, 128 | return_tensors=None, 129 | ) 130 | 131 | # for key, value in buffer_dict.items(): 132 | # batch[key] = value 133 | 134 | if self.cfg.train_params.use_mask_aug: 135 | batch["input_ids"] = apply_mask_augmentation( 136 | batch["input_ids"], self.tokenizer, self.cfg.train_params.mask_aug_prob 137 | ) 138 | 139 | if labels is not None: 140 | batch["labels"] = labels 141 | 142 | tensor_keys = [ 143 | "input_ids", 144 | "attention_mask", 145 | ] 146 | 147 | for key in tensor_keys: 148 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 149 | 150 | if labels is not None: 151 | batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32) 152 | 153 | return batch 154 | # --- 155 | 156 | 157 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print): 158 | bs = batch['input_ids'].size(0) 159 | print_fn(f"batch size: {bs}") 160 | 161 | print_fn(f"shape of input_ids: {batch['input_ids'].shape}") 162 | 163 | n_examples = min(n_examples, bs) 164 | print_fn(f"Showing {n_examples} from a {task} batch...") 165 | 166 | print_fn("\n\n") 167 | for idx in range(n_examples): 168 | print_fn(f"Example {idx+1}") 169 | print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}") 170 | # print("\n\n") 171 | 172 | if "infer" not in task.lower(): 173 | print_fn("--"*20) 174 | labels = batch['labels'][idx] 175 | print_fn(f"Label: {labels}") 176 | print_fn('~~'*40) 177 | -------------------------------------------------------------------------------- /code/r_detect/ai_model.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModel 6 | from transformers.modeling_outputs import SequenceClassifierOutputWithPast 7 | from transformers.models.llama.modeling_llama import (LlamaModel, 8 | LlamaPreTrainedModel) 9 | from transformers.models.mistral.modeling_mistral import ( 10 | MistralModel, MistralPreTrainedModel) 11 | from transformers.models.phi.modeling_phi import PhiModel, PhiPreTrainedModel 12 | 13 | 14 | class MistralForDetectAI(MistralPreTrainedModel): 15 | def __init__(self, config): 16 | super().__init__(config) 17 | self.num_labels = config.num_labels 18 | self.model = MistralModel(config) 19 | # self.dropout = nn.Dropout(0.3) 20 | 21 | self.classification_head = nn.Linear(config.hidden_size, self.num_labels, bias=False) 22 | 23 | self.loss_fn = nn.BCEWithLogitsLoss() 24 | 25 | # Initialize weights and apply final processing 26 | self.post_init() 27 | 28 | def forward( 29 | self, 30 | input_ids: torch.LongTensor = None, 31 | attention_mask: Optional[torch.Tensor] = None, 32 | position_ids: Optional[torch.LongTensor] = None, 33 | past_key_values: Optional[List[torch.FloatTensor]] = None, 34 | inputs_embeds: Optional[torch.FloatTensor] = None, 35 | labels: Optional[torch.LongTensor] = None, 36 | use_cache: Optional[bool] = None, 37 | output_attentions: Optional[bool] = None, 38 | output_hidden_states: Optional[bool] = None, 39 | return_dict: Optional[bool] = None, 40 | ) -> Union[Tuple, SequenceClassifierOutputWithPast]: 41 | 42 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 43 | 44 | transformer_outputs = self.model( 45 | input_ids, 46 | attention_mask=attention_mask, 47 | position_ids=position_ids, 48 | past_key_values=past_key_values, 49 | inputs_embeds=inputs_embeds, 50 | use_cache=use_cache, 51 | output_attentions=output_attentions, 52 | output_hidden_states=output_hidden_states, 53 | return_dict=return_dict, 54 | ) 55 | hidden_states = transformer_outputs[0] # (bs, seq_len, dim) 56 | # hidden_states = self.dropout(hidden_states) 57 | logits = self.classification_head(hidden_states[:, -1]).reshape(-1) # (bs, num_labels) 58 | 59 | loss = None 60 | if labels is not None: 61 | labels = labels.to(logits.device).reshape(-1) 62 | loss = self.loss_fn(logits, labels) 63 | 64 | if not return_dict: 65 | output = (logits,) + transformer_outputs[1:] 66 | return ((loss,) + output) if loss is not None else output 67 | 68 | return SequenceClassifierOutputWithPast( 69 | loss=loss, 70 | logits=logits, 71 | past_key_values=transformer_outputs.past_key_values, 72 | hidden_states=transformer_outputs.hidden_states, 73 | attentions=transformer_outputs.attentions, 74 | ) 75 | 76 | 77 | class LlamaForDetectAI(LlamaPreTrainedModel): 78 | def __init__(self, config): 79 | super().__init__(config) 80 | self.num_labels = config.num_labels 81 | self.model = LlamaModel(config) 82 | self.classification_head = nn.Linear(config.hidden_size, self.num_labels, bias=False) 83 | self.loss_fn = nn.BCEWithLogitsLoss() 84 | 85 | # Initialize weights and apply final processing 86 | self.post_init() 87 | 88 | def forward( 89 | self, 90 | input_ids: torch.LongTensor = None, 91 | attention_mask: Optional[torch.Tensor] = None, 92 | position_ids: Optional[torch.LongTensor] = None, 93 | past_key_values: Optional[List[torch.FloatTensor]] = None, 94 | inputs_embeds: Optional[torch.FloatTensor] = None, 95 | labels: Optional[torch.LongTensor] = None, 96 | use_cache: Optional[bool] = None, 97 | output_attentions: Optional[bool] = None, 98 | output_hidden_states: Optional[bool] = None, 99 | return_dict: Optional[bool] = None, 100 | ) -> Union[Tuple, SequenceClassifierOutputWithPast]: 101 | 102 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 103 | 104 | transformer_outputs = self.model( 105 | input_ids, 106 | attention_mask=attention_mask, 107 | position_ids=position_ids, 108 | past_key_values=past_key_values, 109 | inputs_embeds=inputs_embeds, 110 | use_cache=use_cache, 111 | output_attentions=output_attentions, 112 | output_hidden_states=output_hidden_states, 113 | return_dict=return_dict, 114 | ) 115 | hidden_states = transformer_outputs[0] # (bs, seq_len, dim) 116 | logits = self.classification_head(hidden_states[:, -1]).reshape(-1) # (bs, num_labels) 117 | 118 | loss = None 119 | if labels is not None: 120 | labels = labels.to(logits.device).reshape(-1) 121 | loss = self.loss_fn(logits, labels) 122 | 123 | if not return_dict: 124 | output = (logits,) + transformer_outputs[1:] 125 | return ((loss,) + output) if loss is not None else output 126 | 127 | return SequenceClassifierOutputWithPast( 128 | loss=loss, 129 | logits=logits, 130 | past_key_values=transformer_outputs.past_key_values, 131 | hidden_states=transformer_outputs.hidden_states, 132 | attentions=transformer_outputs.attentions, 133 | ) 134 | 135 | 136 | class PhiForDetectAI(PhiPreTrainedModel): 137 | def __init__(self, config): 138 | super().__init__(config) 139 | self.num_labels = config.num_labels 140 | self.model = AutoModel(config) 141 | self.classification_head = nn.Linear(config.hidden_size, self.num_labels, bias=False) 142 | self.loss_fn = nn.BCEWithLogitsLoss() 143 | 144 | # Initialize weights and apply final processing 145 | self.post_init() 146 | 147 | def forward( 148 | self, 149 | input_ids: torch.LongTensor = None, 150 | attention_mask: Optional[torch.Tensor] = None, 151 | position_ids: Optional[torch.LongTensor] = None, 152 | past_key_values: Optional[List[torch.FloatTensor]] = None, 153 | inputs_embeds: Optional[torch.FloatTensor] = None, 154 | labels: Optional[torch.LongTensor] = None, 155 | use_cache: Optional[bool] = None, 156 | output_attentions: Optional[bool] = None, 157 | output_hidden_states: Optional[bool] = None, 158 | return_dict: Optional[bool] = None, 159 | ) -> Union[Tuple, SequenceClassifierOutputWithPast]: 160 | 161 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 162 | 163 | transformer_outputs = self.model( 164 | input_ids, 165 | attention_mask=attention_mask, 166 | position_ids=position_ids, 167 | past_key_values=past_key_values, 168 | inputs_embeds=inputs_embeds, 169 | use_cache=use_cache, 170 | output_attentions=output_attentions, 171 | output_hidden_states=output_hidden_states, 172 | return_dict=return_dict, 173 | ) 174 | hidden_states = transformer_outputs[0] # (bs, seq_len, dim) 175 | logits = self.classification_head(hidden_states[:, -1]).reshape(-1) # (bs, num_labels) 176 | 177 | loss = None 178 | if labels is not None: 179 | labels = labels.to(logits.device).reshape(-1) 180 | loss = self.loss_fn(logits, labels) 181 | 182 | if not return_dict: 183 | output = (logits,) + transformer_outputs[1:] 184 | return ((loss,) + output) if loss is not None else output 185 | 186 | return SequenceClassifierOutputWithPast( 187 | loss=loss, 188 | logits=logits, 189 | past_key_values=transformer_outputs.past_key_values, 190 | hidden_states=transformer_outputs.hidden_states, 191 | attentions=transformer_outputs.attentions, 192 | ) 193 | -------------------------------------------------------------------------------- /code/r_detect/ai_optimizer.py: -------------------------------------------------------------------------------- 1 | import bitsandbytes as bnb 2 | from torch import optim 3 | 4 | 5 | def get_optimizer(cfg, model, print_fn=None): 6 | _optimizers = { 7 | "Adam": optim.Adam, 8 | "AdamW": optim.AdamW, 9 | "AdamW8bit": bnb.optim.Adam8bit, 10 | } 11 | assert cfg.optimizer.name in _optimizers, f"Optimizer {cfg.optimizer.name} not supported" 12 | 13 | no_decay = ["bias", "LayerNorm.weight"] 14 | head_layer_name = "classification_head" 15 | 16 | # start with all of the candidate parameters 17 | param_dict = {name: param for name, param in model.named_parameters()} 18 | # filter out those that do not require grad 19 | param_dict = {name: param for name, param in param_dict.items() if param.requires_grad} 20 | 21 | # head & body params 22 | param_dict_head = { 23 | name: param for name, param in param_dict.items() if head_layer_name in name 24 | } 25 | param_dict_body = { 26 | name: param for name, param in param_dict.items() if head_layer_name not in name 27 | } 28 | 29 | # create groups --- 30 | head_params_no_decay = [ 31 | param for name, param in param_dict_head.items() if any(nd in name for nd in no_decay) 32 | ] 33 | head_params_decay = [ 34 | param for name, param in param_dict_head.items() if not any(nd in name for nd in no_decay) 35 | ] 36 | body_params_no_decay = [ 37 | param for name, param in param_dict_body.items() if any(nd in name for nd in no_decay) 38 | ] 39 | body_params_decay = [ 40 | param for name, param in param_dict_body.items() if not any(nd in name for nd in no_decay) 41 | ] 42 | 43 | optim_groups = [ 44 | {'params': head_params_no_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': 0}, 45 | {'params': head_params_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': cfg.optimizer.weight_decay}, 46 | {'params': body_params_no_decay, 'lr': cfg.optimizer.lr, 'weight_decay': 0}, 47 | {'params': body_params_decay, 'lr': cfg.optimizer.lr, 48 | 'weight_decay': cfg.optimizer.weight_decay * 1e-1}, # less weight decay for body 49 | ] 50 | 51 | if print_fn is not None: 52 | n_head_params_no_decay = sum(p.numel() for p in head_params_no_decay) 53 | n_head_params_decay = sum(p.numel() for p in head_params_decay) 54 | n_body_params_no_decay = sum(p.numel() for p in body_params_no_decay) 55 | n_body_params_decay = sum(p.numel() for p in body_params_decay) 56 | 57 | print_fn(f"n_head_params_no_decay: {n_head_params_no_decay}") 58 | print_fn(f"n_head_params_decay: {n_head_params_decay}") 59 | print_fn(f"n_body_params_no_decay: {n_body_params_no_decay}") 60 | print_fn(f"n_body_params_decay: {n_body_params_decay}") 61 | 62 | # Create AdamW optimizer and use the fused version if it is available 63 | optimizer = _optimizers[cfg.optimizer.name]( 64 | optim_groups, 65 | lr=cfg.optimizer.lr, 66 | weight_decay=cfg.optimizer.weight_decay, 67 | ) 68 | return optimizer 69 | -------------------------------------------------------------------------------- /code/r_embed/ai_dataset.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from datasets import Dataset 4 | from transformers import AutoTokenizer 5 | 6 | # --------------- Dataset ----------------------------------------------# 7 | 8 | 9 | class AiDataset: 10 | """ 11 | Dataset class for LLM Detect AI Generated Text competition 12 | """ 13 | 14 | def __init__(self, cfg): 15 | self.cfg = cfg 16 | self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path) 17 | 18 | def tokenize_function(self, examples): 19 | tz = self.tokenizer( 20 | examples["text"], 21 | padding=False, 22 | truncation=True, 23 | max_length=self.cfg.model.max_length, 24 | add_special_tokens=True, 25 | return_token_type_ids=False, 26 | ) 27 | 28 | return tz 29 | 30 | def compute_input_length(self, examples): 31 | return {"input_length": [len(x) for x in examples["input_ids"]]} 32 | 33 | def get_dataset(self, df): 34 | """ 35 | Main api for creating the Science Exam dataset 36 | :param df: input dataframe 37 | :type df: pd.DataFrame 38 | :return: the created dataset 39 | :rtype: Dataset 40 | """ 41 | df = deepcopy(df) 42 | task_dataset = Dataset.from_pandas(df) 43 | 44 | task_dataset = task_dataset.map(self.tokenize_function, batched=True) 45 | task_dataset = task_dataset.map(self.compute_input_length, batched=True) 46 | 47 | try: 48 | task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"]) 49 | except Exception as e: 50 | print(e) 51 | 52 | return task_dataset 53 | -------------------------------------------------------------------------------- /code/r_embed/ai_loader.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import random 4 | import time 5 | from dataclasses import dataclass, field 6 | 7 | import torch 8 | from transformers import DataCollatorWithPadding 9 | 10 | 11 | @dataclass 12 | class AiCollator(DataCollatorWithPadding): 13 | """ 14 | data collector for LLM Detect AI Generated Text task 15 | """ 16 | 17 | tokenizer = None 18 | padding = True 19 | max_length = None 20 | pad_to_multiple_of = None 21 | return_tensors = "pt" 22 | 23 | def __call__(self, features): 24 | labels = None 25 | if "generated" in features[0].keys(): 26 | labels = [feature["generated"] for feature in features] 27 | 28 | features = [ 29 | { 30 | "input_ids": feature["input_ids"], 31 | "attention_mask": feature["attention_mask"], 32 | } for feature in features 33 | ] 34 | 35 | batch = self.tokenizer.pad( 36 | features, 37 | padding=self.padding, 38 | max_length=self.max_length, 39 | pad_to_multiple_of=self.pad_to_multiple_of, 40 | return_tensors=None, 41 | ) 42 | 43 | if labels is not None: 44 | batch["labels"] = labels 45 | 46 | tensor_keys = [ 47 | "input_ids", 48 | "attention_mask", 49 | ] 50 | 51 | for key in tensor_keys: 52 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 53 | 54 | if labels is not None: 55 | batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64) 56 | 57 | return batch 58 | 59 | 60 | @dataclass 61 | class AiCollatorTrain(DataCollatorWithPadding): 62 | """ 63 | data collector for LLM Detect AI Generated Text task 64 | """ 65 | 66 | tokenizer = None 67 | padding = True 68 | max_length = None 69 | pad_to_multiple_of = None 70 | return_tensors = "pt" 71 | kwargs: field(default_factory=dict) = None 72 | 73 | def __post_init__(self): 74 | [setattr(self, k, v) for k, v in self.kwargs.items()] 75 | 76 | # mappings 77 | example2idx = dict() 78 | example_ids = self.train_ds["id"] 79 | 80 | for idx in range(len(example_ids)): 81 | example2idx[example_ids[idx]] = idx 82 | self.example2idx = example2idx 83 | 84 | seed = seed = int(time.time() * 1000) + os.getpid() 85 | self.rng = random.Random(seed) 86 | 87 | print("=="*40) 88 | print(f"setting random seed in data collator as: {seed}") 89 | print("=="*40) 90 | 91 | def process_features(self, example_ids): 92 | updated_features = [] 93 | for eid in example_ids: 94 | example = dict() 95 | 96 | example["id"] = eid 97 | ex_info = self.train_ds[self.example2idx[eid]] 98 | 99 | # use fields 100 | example["input_ids"] = ex_info["input_ids"] 101 | example["attention_mask"] = ex_info["attention_mask"] 102 | example["generated"] = ex_info["generated"] 103 | updated_features.append(example) 104 | 105 | return updated_features 106 | 107 | def __call__(self, features): 108 | bs = len(features) 109 | selected_prompt_id = self.rng.choice(self.prompt_ids) 110 | selected_example_ids_pos = self.rng.sample(self.prompt2ids_pos[selected_prompt_id], k=bs//2) 111 | selected_example_ids_neg = self.rng.sample(self.prompt2ids_neg[selected_prompt_id], k=bs//2) 112 | selected_example_ids = selected_example_ids_pos + selected_example_ids_neg 113 | features = self.process_features(selected_example_ids) 114 | 115 | labels = None 116 | if "generated" in features[0].keys(): 117 | labels = [feature["generated"] for feature in features] 118 | 119 | features = [ 120 | { 121 | "input_ids": feature["input_ids"], 122 | "attention_mask": feature["attention_mask"], 123 | } for feature in features 124 | ] 125 | 126 | batch = self.tokenizer.pad( 127 | features, 128 | padding=self.padding, 129 | max_length=self.max_length, 130 | pad_to_multiple_of=self.pad_to_multiple_of, 131 | return_tensors=None, 132 | ) 133 | 134 | if labels is not None: 135 | batch["labels"] = labels 136 | 137 | tensor_keys = [ 138 | "input_ids", 139 | "attention_mask", 140 | ] 141 | 142 | for key in tensor_keys: 143 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 144 | 145 | if labels is not None: 146 | batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64) 147 | 148 | return batch 149 | 150 | # --- 151 | 152 | 153 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print): 154 | print_fn("##"*40) 155 | bs = batch['input_ids'].size(0) 156 | print_fn(f"batch size: {bs}") 157 | 158 | print_fn(f"shape of input_ids: {batch['input_ids'].shape}") 159 | 160 | n_examples = min(n_examples, bs) 161 | print_fn(f"Showing {n_examples} from a {task} batch...") 162 | 163 | print_fn("\n\n") 164 | for idx in range(n_examples): 165 | print_fn(f"Example {idx+1}") 166 | print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}") 167 | # print("\n\n") 168 | 169 | if "infer" not in task.lower(): 170 | print_fn("--"*20) 171 | labels = batch['labels'][idx] 172 | print_fn(f"Label: {labels}") 173 | print_fn('=='*40) 174 | print_fn("##"*40) 175 | -------------------------------------------------------------------------------- /code/r_embed/ai_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.utils.checkpoint 5 | from transformers import AutoConfig, AutoModel 6 | 7 | 8 | class MeanPooling(nn.Module): 9 | def __init__(self): 10 | super(MeanPooling, self).__init__() 11 | 12 | def forward(self, last_hidden_state, attention_mask): 13 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() 14 | sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1) 15 | sum_mask = input_mask_expanded.sum(1) 16 | sum_mask = torch.clamp(sum_mask, min=1e-9) 17 | mean_embeddings = sum_embeddings / sum_mask 18 | return mean_embeddings 19 | 20 | # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10034742 21 | 22 | 23 | class SupContrastiveLoss(nn.Module): 24 | def __init__(self, temperature, device): 25 | super(SupContrastiveLoss, self).__init__() 26 | self.temperature = temperature 27 | self.device = device 28 | 29 | def forward(self, outputs, labels): 30 | N = outputs.size()[0] 31 | labels = labels.reshape(N, 1) 32 | self_similarity_mask = torch.ones((N, N)).fill_diagonal_(0).to(self.device) 33 | 34 | pos_mask = torch.eq(labels, labels.T).float() 35 | neg_mask = torch.abs(pos_mask - 1) 36 | 37 | H = torch.matmul(outputs, outputs.T) * self_similarity_mask 38 | H_pos = H * pos_mask 39 | H_neg = H * neg_mask 40 | 41 | v_pos = torch.mean(torch.exp(torch.div(H_pos, self.temperature)), dim=1) 42 | v_neg = torch.mean(torch.exp(torch.div(H_neg, self.temperature)), dim=1) 43 | 44 | loss = (-1/N) * torch.sum(torch.log(v_pos/(v_pos + v_neg))) 45 | 46 | return loss 47 | 48 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 49 | # Rank Model 50 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 51 | 52 | 53 | class AiModel(nn.Module): 54 | """ 55 | The LLM Detect AI Generated Text Model 56 | """ 57 | 58 | def __init__(self, cfg, device): 59 | print("initializing the Rank Model...") 60 | 61 | super(AiModel, self).__init__() 62 | self.cfg = cfg 63 | 64 | # ----------------------------- Backbone -----------------------------------------# 65 | backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path) 66 | backbone_config.update( 67 | { 68 | "use_cache": False, 69 | } 70 | ) 71 | 72 | self.backbone = AutoModel.from_pretrained( 73 | self.cfg.model.backbone_path, 74 | config=backbone_config 75 | ) 76 | if self.cfg.model.gradient_checkpointing: 77 | self.backbone.gradient_checkpointing_enable() 78 | 79 | self.dropout = nn.Dropout(self.cfg.model.dropout_rate) 80 | 81 | hidden_size = self.backbone.config.hidden_size 82 | project_dim = self.cfg.model.projection_dim 83 | self.pool = MeanPooling() 84 | 85 | self.projection_head = nn.Sequential( 86 | nn.Dropout(self.cfg.model.dropout_rate), 87 | nn.Linear(hidden_size, project_dim), 88 | nn.ReLU(), 89 | nn.Linear(project_dim, project_dim) 90 | ) 91 | 92 | # loss function 93 | self.loss_fn = SupContrastiveLoss( 94 | temperature=self.cfg.model.temperature, 95 | device=device, 96 | ) 97 | 98 | def encode( 99 | self, 100 | input_ids, 101 | attention_mask, 102 | ): 103 | outputs = self.backbone( 104 | input_ids, 105 | attention_mask=attention_mask, 106 | output_hidden_states=False, 107 | ) 108 | 109 | encoder_layer = outputs.last_hidden_state 110 | embeddings = self.pool(encoder_layer, attention_mask) 111 | embeddings = self.projection_head(embeddings) 112 | embeddings = F.normalize(embeddings, dim=-1) 113 | 114 | return embeddings 115 | 116 | def forward(self, input_ids, attention_mask, labels=None, **kwargs): 117 | # features 118 | embeddings = self.encode( 119 | input_ids=input_ids, 120 | attention_mask=attention_mask, 121 | ) # (bs, num_features) 122 | 123 | # loss 124 | loss = None 125 | if labels is not None: 126 | loss = self.loss_fn(embeddings, labels) 127 | 128 | return loss 129 | -------------------------------------------------------------------------------- /code/r_embed/ai_optimizer.py: -------------------------------------------------------------------------------- 1 | from torch.optim import AdamW 2 | 3 | 4 | def get_optimizer_grouped_parameters_no_llrd(model, cfg): 5 | 6 | no_decay = ['bias', "LayerNorm.bias", "LayerNorm.weight"] 7 | backbone_params = model.backbone.named_parameters() 8 | 9 | optimizer_parameters = [ 10 | { 11 | "params": [p for n, p in model.named_parameters() if "backbone" not in n], 12 | "lr": cfg.optimizer.lr, 13 | "weight_decay": cfg.optimizer.weight_decay, 14 | }, 15 | { 16 | "params": [p for n, p in backbone_params if not any(nd in n for nd in no_decay)], 17 | "lr": cfg.optimizer.lr, 18 | "weight_decay": cfg.optimizer.weight_decay, 19 | }, 20 | { 21 | "params": [p for n, p in backbone_params if any(nd in n for nd in no_decay)], 22 | "lr": cfg.optimizer.lr, 23 | "weight_decay": 0.0, 24 | }, 25 | ] 26 | 27 | return optimizer_parameters 28 | 29 | 30 | def get_optimizer_grouped_parameters_with_llrd(model, cfg): 31 | """layerwise learning rate decay implementation 32 | """ 33 | no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] 34 | 35 | # initialize lr for task specific layer 36 | optimizer_grouped_parameters = [ 37 | { 38 | "params": [p for n, p in model.named_parameters() if "backbone" not in n], 39 | "lr": cfg.optimizer.head_lr, 40 | "weight_decay": cfg.optimizer.weight_decay, 41 | }, 42 | ] 43 | 44 | # initialize lrs for backbone layers 45 | layers = [model.backbone.embeddings] + list(model.backbone.encoder.layer) 46 | layers.reverse() 47 | lr = cfg.optimizer.lr 48 | 49 | for layer in layers: 50 | lr *= cfg.optimizer.llrd 51 | 52 | optimizer_grouped_parameters += [ 53 | { 54 | "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)], 55 | "weight_decay": cfg.optimizer.weight_decay, 56 | "lr": lr, 57 | }, 58 | 59 | { 60 | "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)], 61 | "weight_decay": 0.0, 62 | "lr": lr, 63 | }, 64 | ] 65 | 66 | return optimizer_grouped_parameters 67 | 68 | 69 | def get_optimizer(model, cfg): 70 | """optimizer for model training 71 | """ 72 | 73 | if cfg.optimizer.use_llrd: 74 | optimizer_grouped_parameters = get_optimizer_grouped_parameters_with_llrd(model, cfg) 75 | else: 76 | optimizer_grouped_parameters = get_optimizer_grouped_parameters_no_llrd(model, cfg) 77 | 78 | optimizer_kwargs = { 79 | "betas": (cfg.optimizer.beta1, cfg.optimizer.beta2), 80 | "eps": cfg.optimizer.eps, 81 | "lr": cfg.optimizer.lr 82 | } 83 | 84 | if cfg.optimizer.use_bnb: 85 | import bitsandbytes as bnb 86 | 87 | optimizer = bnb.optim.Adam8bit( 88 | optimizer_grouped_parameters, 89 | **optimizer_kwargs 90 | ) 91 | return optimizer 92 | else: 93 | optimizer = AdamW( 94 | optimizer_grouped_parameters, 95 | **optimizer_kwargs 96 | ) 97 | 98 | return optimizer 99 | -------------------------------------------------------------------------------- /code/r_ranking/ai_dataset.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from datasets import Dataset 4 | from transformers import AutoTokenizer 5 | 6 | # --------------- Dataset ----------------------------------------------# 7 | 8 | 9 | class AiDataset: 10 | """ 11 | Dataset class for LLM Detect AI Generated Text competition 12 | """ 13 | 14 | def __init__(self, cfg): 15 | self.cfg = cfg 16 | self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path) 17 | 18 | def tokenize_function(self, examples): 19 | tz = self.tokenizer( 20 | examples["text"], 21 | padding=False, 22 | truncation=True, 23 | max_length=self.cfg.model.max_length, 24 | add_special_tokens=True, 25 | return_token_type_ids=False, 26 | ) 27 | 28 | return tz 29 | 30 | def compute_input_length(self, examples): 31 | return {"input_length": [len(x) for x in examples["input_ids"]]} 32 | 33 | def get_dataset(self, df): 34 | """ 35 | Main api for creating the Science Exam dataset 36 | :param df: input dataframe 37 | :type df: pd.DataFrame 38 | :return: the created dataset 39 | :rtype: Dataset 40 | """ 41 | df = deepcopy(df) 42 | task_dataset = Dataset.from_pandas(df) 43 | 44 | task_dataset = task_dataset.map(self.tokenize_function, batched=True) 45 | task_dataset = task_dataset.map(self.compute_input_length, batched=True) 46 | 47 | try: 48 | task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"]) 49 | except Exception as e: 50 | print(e) 51 | 52 | return task_dataset 53 | -------------------------------------------------------------------------------- /code/r_ranking/ai_loader.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import random 4 | import time 5 | from dataclasses import dataclass, field 6 | 7 | import torch 8 | from transformers import DataCollatorWithPadding 9 | 10 | 11 | @dataclass 12 | class AiCollator(DataCollatorWithPadding): 13 | """ 14 | data collector for LLM Detect AI Generated Text task 15 | """ 16 | 17 | tokenizer = None 18 | padding = True 19 | max_length = None 20 | pad_to_multiple_of = None 21 | return_tensors = "pt" 22 | 23 | def __call__(self, features): 24 | labels = None 25 | if "generated" in features[0].keys(): 26 | labels = [feature["generated"] for feature in features] 27 | 28 | features = [ 29 | { 30 | "input_ids": feature["input_ids"], 31 | "attention_mask": feature["attention_mask"], 32 | } for feature in features 33 | ] 34 | 35 | batch = self.tokenizer.pad( 36 | features, 37 | padding=self.padding, 38 | max_length=self.max_length, 39 | pad_to_multiple_of=self.pad_to_multiple_of, 40 | return_tensors=None, 41 | ) 42 | 43 | if labels is not None: 44 | batch["labels"] = labels 45 | 46 | tensor_keys = [ 47 | "input_ids", 48 | "attention_mask", 49 | ] 50 | 51 | for key in tensor_keys: 52 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 53 | 54 | if labels is not None: 55 | batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64) 56 | 57 | return batch 58 | 59 | 60 | @dataclass 61 | class AiCollatorTrain(DataCollatorWithPadding): 62 | """ 63 | data collector for LLM Detect AI Generated Text task 64 | """ 65 | 66 | tokenizer = None 67 | padding = True 68 | max_length = None 69 | pad_to_multiple_of = None 70 | return_tensors = "pt" 71 | kwargs: field(default_factory=dict) = None 72 | 73 | def __post_init__(self): 74 | [setattr(self, k, v) for k, v in self.kwargs.items()] 75 | 76 | # mappings 77 | example2idx = dict() 78 | example_ids = self.train_ds["id"] 79 | 80 | for idx in range(len(example_ids)): 81 | example2idx[example_ids[idx]] = idx 82 | self.example2idx = example2idx 83 | 84 | seed = seed = int(time.time() * 1000) + os.getpid() # random.randint(100, 1000) 85 | self.rng = random.Random(seed) 86 | 87 | print("=="*40) 88 | print(f"setting random seed in data collator as: {seed}") 89 | print("=="*40) 90 | 91 | def process_features(self, example_ids): 92 | updated_features = [] 93 | for eid in example_ids: 94 | example = dict() 95 | 96 | example["id"] = eid 97 | ex_info = self.train_ds[self.example2idx[eid]] 98 | 99 | # use fields 100 | example["input_ids"] = ex_info["input_ids"] 101 | example["attention_mask"] = ex_info["attention_mask"] 102 | example["generated"] = ex_info["generated"] 103 | updated_features.append(example) 104 | 105 | return updated_features 106 | 107 | def __call__(self, features): 108 | bs = len(features) 109 | 110 | if self.rng.random() < 0.8: 111 | selected_prompt_id = self.rng.choice(self.prompt_ids) 112 | selected_example_ids = self.rng.sample(self.prompt2ids[selected_prompt_id], k=bs) 113 | features = self.process_features(selected_example_ids) 114 | 115 | labels = None 116 | if "generated" in features[0].keys(): 117 | labels = [feature["generated"] for feature in features] 118 | 119 | features = [ 120 | { 121 | "input_ids": feature["input_ids"], 122 | "attention_mask": feature["attention_mask"], 123 | } for feature in features 124 | ] 125 | 126 | batch = self.tokenizer.pad( 127 | features, 128 | padding=self.padding, 129 | max_length=self.max_length, 130 | pad_to_multiple_of=self.pad_to_multiple_of, 131 | return_tensors=None, 132 | ) 133 | 134 | if labels is not None: 135 | batch["labels"] = labels 136 | 137 | tensor_keys = [ 138 | "input_ids", 139 | "attention_mask", 140 | ] 141 | 142 | for key in tensor_keys: 143 | batch[key] = torch.tensor(batch[key], dtype=torch.int64) 144 | 145 | if labels is not None: 146 | batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64) 147 | 148 | return batch 149 | 150 | # --- 151 | 152 | 153 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print): 154 | print_fn("##"*40) 155 | bs = batch['input_ids'].size(0) 156 | print_fn(f"batch size: {bs}") 157 | 158 | print_fn(f"shape of input_ids: {batch['input_ids'].shape}") 159 | 160 | n_examples = min(n_examples, bs) 161 | print_fn(f"Showing {n_examples} from a {task} batch...") 162 | 163 | print_fn("\n\n") 164 | for idx in range(n_examples): 165 | print_fn(f"Example {idx+1}") 166 | print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}") 167 | # print("\n\n") 168 | 169 | if "infer" not in task.lower(): 170 | print_fn("--"*20) 171 | labels = batch['labels'][idx] 172 | print_fn(f"Label: {labels}") 173 | print_fn('=='*40) 174 | print_fn("##"*40) 175 | -------------------------------------------------------------------------------- /code/r_ranking/ai_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.utils.checkpoint 5 | from transformers import AutoConfig, AutoModel 6 | 7 | 8 | def get_ranking_loss(logits, labels, margin=0.7): 9 | logits = torch.sigmoid(logits) 10 | labels1 = labels.unsqueeze(1) 11 | labels2 = labels.unsqueeze(0) 12 | 13 | logits1 = logits.unsqueeze(1) 14 | logits2 = logits.unsqueeze(0) 15 | 16 | y_ij = torch.sign(labels1 - labels2) 17 | r_ij = logits1 - logits2 18 | 19 | loss = torch.clamp(-r_ij*y_ij + margin, min=0.0).mean() 20 | return loss 21 | 22 | 23 | class MeanPooling(nn.Module): 24 | def __init__(self): 25 | super(MeanPooling, self).__init__() 26 | 27 | def forward(self, last_hidden_state, attention_mask): 28 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() 29 | sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1) 30 | sum_mask = input_mask_expanded.sum(1) 31 | sum_mask = torch.clamp(sum_mask, min=1e-9) 32 | mean_embeddings = sum_embeddings / sum_mask 33 | return mean_embeddings 34 | 35 | 36 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 37 | # Rank Model 38 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 39 | 40 | 41 | class AiModel(nn.Module): 42 | """ 43 | The LLM Detect AI Generated Text Model 44 | """ 45 | 46 | def __init__(self, cfg, device): 47 | print("initializing the Rank Model...") 48 | 49 | super(AiModel, self).__init__() 50 | self.cfg = cfg 51 | 52 | # ----------------------------- Backbone -----------------------------------------# 53 | backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path) 54 | backbone_config.update( 55 | { 56 | "use_cache": False, 57 | } 58 | ) 59 | 60 | self.backbone = AutoModel.from_pretrained( 61 | self.cfg.model.backbone_path, 62 | config=backbone_config 63 | ) 64 | if self.cfg.model.gradient_checkpointing: 65 | self.backbone.gradient_checkpointing_enable() 66 | 67 | self.dropout = nn.Dropout(self.cfg.model.dropout_rate) 68 | 69 | # classifier 70 | num_features = self.backbone.config.hidden_size 71 | self.classifier = nn.Linear(num_features, 1) 72 | 73 | self.pool = MeanPooling() 74 | 75 | def encode( 76 | self, 77 | input_ids, 78 | attention_mask, 79 | ): 80 | outputs = self.backbone( 81 | input_ids, 82 | attention_mask=attention_mask, 83 | output_hidden_states=False, 84 | ) 85 | 86 | encoder_layer = outputs.last_hidden_state 87 | embeddings = self.pool(encoder_layer, attention_mask) 88 | 89 | return embeddings 90 | 91 | def forward(self, input_ids, attention_mask, labels=None, **kwargs): 92 | # features 93 | features = self.encode( 94 | input_ids=input_ids, 95 | attention_mask=attention_mask, 96 | ) 97 | features = self.dropout(features) 98 | logits = self.classifier(features).reshape(-1) 99 | 100 | # loss 101 | loss = None 102 | labels = labels.reshape(-1) 103 | if labels is not None: 104 | loss = get_ranking_loss(logits, labels) 105 | 106 | return logits, loss 107 | -------------------------------------------------------------------------------- /code/r_ranking/ai_optimizer.py: -------------------------------------------------------------------------------- 1 | from torch.optim import AdamW 2 | 3 | 4 | def get_optimizer_grouped_parameters_no_llrd(model, cfg): 5 | 6 | no_decay = ['bias', "LayerNorm.bias", "LayerNorm.weight"] 7 | backbone_params = model.backbone.named_parameters() 8 | 9 | optimizer_parameters = [ 10 | { 11 | "params": [p for n, p in model.named_parameters() if "backbone" not in n], 12 | "lr": cfg.optimizer.lr, 13 | "weight_decay": cfg.optimizer.weight_decay, 14 | }, 15 | { 16 | "params": [p for n, p in backbone_params if not any(nd in n for nd in no_decay)], 17 | "lr": cfg.optimizer.lr, 18 | "weight_decay": cfg.optimizer.weight_decay, 19 | }, 20 | { 21 | "params": [p for n, p in backbone_params if any(nd in n for nd in no_decay)], 22 | "lr": cfg.optimizer.lr, 23 | "weight_decay": 0.0, 24 | }, 25 | ] 26 | 27 | return optimizer_parameters 28 | 29 | 30 | def get_optimizer_grouped_parameters_with_llrd(model, cfg): 31 | """layerwise learning rate decay implementation 32 | """ 33 | no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] 34 | 35 | # initialize lr for task specific layer 36 | optimizer_grouped_parameters = [ 37 | { 38 | "params": [p for n, p in model.named_parameters() if "backbone" not in n], 39 | "lr": cfg.optimizer.head_lr, 40 | "weight_decay": cfg.optimizer.weight_decay, 41 | }, 42 | ] 43 | 44 | # initialize lrs for backbone layers 45 | layers = [model.backbone.embeddings] + list(model.backbone.encoder.layer) 46 | layers.reverse() 47 | lr = cfg.optimizer.lr 48 | 49 | for layer in layers: 50 | lr *= cfg.optimizer.llrd 51 | 52 | optimizer_grouped_parameters += [ 53 | { 54 | "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)], 55 | "weight_decay": cfg.optimizer.weight_decay, 56 | "lr": lr, 57 | }, 58 | 59 | { 60 | "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)], 61 | "weight_decay": 0.0, 62 | "lr": lr, 63 | }, 64 | ] 65 | 66 | return optimizer_grouped_parameters 67 | 68 | 69 | def get_optimizer(model, cfg): 70 | """optimizer for model training 71 | """ 72 | 73 | if cfg.optimizer.use_llrd: 74 | optimizer_grouped_parameters = get_optimizer_grouped_parameters_with_llrd(model, cfg) 75 | else: 76 | optimizer_grouped_parameters = get_optimizer_grouped_parameters_no_llrd(model, cfg) 77 | 78 | optimizer_kwargs = { 79 | "betas": (cfg.optimizer.beta1, cfg.optimizer.beta2), 80 | "eps": cfg.optimizer.eps, 81 | "lr": cfg.optimizer.lr 82 | } 83 | 84 | if cfg.optimizer.use_bnb: 85 | import bitsandbytes as bnb 86 | 87 | optimizer = bnb.optim.Adam8bit( 88 | optimizer_grouped_parameters, 89 | **optimizer_kwargs 90 | ) 91 | return optimizer 92 | else: 93 | optimizer = AdamW( 94 | optimizer_grouped_parameters, 95 | **optimizer_kwargs 96 | ) 97 | 98 | return optimizer 99 | -------------------------------------------------------------------------------- /code/train_r_clm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import time 5 | 6 | import datasets 7 | import hydra 8 | import numpy as np 9 | import pandas as pd 10 | import torch 11 | import transformers 12 | from accelerate import Accelerator 13 | from accelerate.logging import get_logger 14 | from accelerate.utils import set_seed 15 | from omegaconf import OmegaConf 16 | from peft import LoraConfig, TaskType, get_peft_model 17 | from torch.utils.data import DataLoader 18 | from tqdm.auto import tqdm 19 | from transformers import (AutoModelForCausalLM, BitsAndBytesConfig, 20 | get_cosine_schedule_with_warmup) 21 | 22 | try: 23 | from r_clm.ai_dataset import AiDataset 24 | from r_clm.ai_loader import AiCollator, show_batch 25 | from r_clm.ai_optimizer import get_optimizer 26 | from utils.train_utils import AverageMeter, as_minutes, get_lr 27 | 28 | 29 | except Exception as e: 30 | print(e) 31 | raise ImportError 32 | 33 | logger = get_logger(__name__) 34 | 35 | 36 | def run_evaluation(accelerator, model, valid_dl): 37 | model.eval() 38 | 39 | all_losses = [] 40 | 41 | progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process) 42 | 43 | for step, batch in enumerate(valid_dl): 44 | with torch.no_grad(): 45 | outputs = model(**batch) 46 | 47 | loss = outputs.loss 48 | batch_losses = accelerator.gather_for_metrics(loss) 49 | batch_losses = batch_losses.cpu().numpy().tolist() 50 | 51 | all_losses.extend(batch_losses) 52 | progress_bar.update(1) 53 | progress_bar.close() 54 | 55 | # compute metric 56 | eval_dict = dict() # compute_metrics(all_predictions, all_truths) 57 | eval_dict['valid_loss'] = np.mean(all_losses) 58 | 59 | return eval_dict 60 | 61 | 62 | @hydra.main(version_base=None, config_path="../conf/r_clm", config_name="conf_r_clm") 63 | def run_training(cfg): 64 | # ------- Accelerator ---------------------------------------------------------------# 65 | if cfg.use_wandb: 66 | accelerator = Accelerator( 67 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 68 | log_with="wandb", 69 | ) 70 | 71 | accelerator.init_trackers( 72 | cfg.wandb.project, 73 | config=OmegaConf.to_container(cfg, resolve=True), 74 | ) 75 | 76 | else: 77 | accelerator = Accelerator( 78 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 79 | ) 80 | 81 | cfg_dict = OmegaConf.to_container(cfg, resolve=True) 82 | 83 | # Make one log on every process with the configuration for debugging. 84 | logging.basicConfig( 85 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 86 | datefmt="%m/%d/%Y %H:%M:%S", 87 | level=logging.INFO, 88 | ) 89 | logger.info(accelerator.state, main_process_only=False) 90 | 91 | # print_line = partial(print_line, accelerator) 92 | 93 | def print_line(): 94 | prefix, unit, suffix = "#", "~~", "#" 95 | accelerator.print(prefix + unit*50 + suffix) 96 | 97 | if accelerator.is_local_main_process: 98 | datasets.utils.logging.set_verbosity_warning() 99 | transformers.utils.logging.set_verbosity_info() 100 | else: 101 | datasets.utils.logging.set_verbosity_error() 102 | transformers.utils.logging.set_verbosity_error() 103 | 104 | # ------- Runtime Configs -----------------------------------------------------------# 105 | print_line() 106 | accelerator.print(f"setting seed: {cfg.seed}") 107 | set_seed(cfg.seed) 108 | 109 | if accelerator.is_main_process: 110 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 111 | print_line() 112 | 113 | # ------- load data -----------------------------------------------------------------# 114 | print_line() 115 | 116 | # load query dataframe --- 117 | essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"}) 118 | essay_df = essay_df[~essay_df['text'].isna()].copy() 119 | essay_df = essay_df.reset_index(drop=True) 120 | 121 | # ------- Data Split ----------------------------------------------------------------# 122 | # sample validation data 123 | rng = random.Random(cfg.seed) 124 | essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.98 else 'valid') 125 | train_df = essay_df[essay_df['fold'] == 'train'].copy() 126 | valid_df = essay_df[essay_df['fold'] == 'valid'].copy() 127 | 128 | train_df = train_df.reset_index(drop=True) 129 | valid_df = valid_df.reset_index(drop=True) 130 | 131 | accelerator.print(f"shape of train data: {train_df.shape}") 132 | accelerator.print(f"{train_df.head()}") 133 | accelerator.print(f"shape of validation data: {valid_df.shape}") 134 | 135 | with accelerator.main_process_first(): 136 | dataset_creator = AiDataset(cfg) 137 | 138 | train_ds = dataset_creator.get_dataset(train_df) 139 | valid_ds = dataset_creator.get_dataset(valid_df) 140 | 141 | tokenizer = dataset_creator.tokenizer 142 | 143 | train_ds.set_format( 144 | type=None, 145 | columns=[ 146 | 'input_ids', 147 | 'attention_mask', 148 | 'labels' 149 | ] 150 | ) 151 | 152 | # valid_ds = valid_ds.sort("input_length") 153 | 154 | valid_ds.set_format( 155 | type=None, 156 | columns=[ 157 | 'input_ids', 158 | 'attention_mask', 159 | 'labels' 160 | ] 161 | ) 162 | # valid_ids = valid_df["id"] # .tolist() 163 | 164 | data_collator = AiCollator( 165 | tokenizer=tokenizer, 166 | pad_to_multiple_of=64 167 | ) 168 | 169 | train_dl = DataLoader( 170 | train_ds, 171 | batch_size=cfg.train_params.per_device_train_batch_size, 172 | shuffle=True, 173 | collate_fn=data_collator, 174 | ) 175 | 176 | valid_dl = DataLoader( 177 | valid_ds, 178 | batch_size=cfg.train_params.per_device_eval_batch_size, 179 | shuffle=False, 180 | collate_fn=data_collator, 181 | ) 182 | 183 | accelerator.print("data preparation done...") 184 | print_line() 185 | 186 | # --- show batch -------------------------------------------------------------------# 187 | print_line() 188 | 189 | for b in train_dl: 190 | break 191 | show_batch(b, tokenizer, task='training', print_fn=accelerator.print) 192 | 193 | print_line() 194 | 195 | for b in valid_dl: 196 | break 197 | show_batch(b, tokenizer, task='training', print_fn=accelerator.print) 198 | 199 | # --- model -------------------------------------------------------------------------# 200 | print_line() 201 | 202 | # Note: avoid quantization for smaller models (e.g. opt-125m, bloom-560m) for training stability -- 203 | bnb_config = BitsAndBytesConfig( 204 | load_in_4bit=True, 205 | bnb_4bit_quant_type="nf4", 206 | bnb_4bit_use_double_quant=True, 207 | bnb_4bit_compute_dtype=torch.bfloat16, 208 | ) 209 | 210 | base_model = AutoModelForCausalLM.from_pretrained( 211 | cfg.model.backbone_path, 212 | quantization_config=bnb_config, 213 | ) 214 | 215 | base_model.config.pretraining_tp = 1 216 | 217 | # lora --- 218 | peft_config = LoraConfig( 219 | r=cfg.model.lora.r, 220 | lora_alpha=cfg.model.lora.lora_alpha, 221 | lora_dropout=cfg.model.lora.lora_dropout, 222 | bias="none", 223 | task_type=TaskType.CAUSAL_LM, 224 | inference_mode=False, 225 | target_modules=cfg_dict["model"]["lora"]["target_modules"], 226 | ) 227 | 228 | model = get_peft_model(base_model, peft_config) 229 | model.print_trainable_parameters() 230 | model.config.use_cache = False 231 | 232 | # --- optimizer ---------------------------------------------------------------------# 233 | print_line() 234 | optimizer = get_optimizer(cfg, model, print_fn=accelerator.print) 235 | 236 | # ------- Prepare -------------------------------------------------------------------# 237 | 238 | model, optimizer, train_dl, valid_dl = accelerator.prepare( 239 | model, optimizer, train_dl, valid_dl 240 | ) 241 | 242 | # ------- Scheduler -----------------------------------------------------------------# 243 | print_line() 244 | num_epochs = cfg.train_params.num_train_epochs 245 | grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps 246 | warmup_pct = cfg.train_params.warmup_pct 247 | 248 | num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps 249 | num_training_steps = num_epochs * num_update_steps_per_epoch 250 | num_warmup_steps = int(warmup_pct*num_training_steps) 251 | 252 | accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}") 253 | accelerator.print(f"# training steps: {num_training_steps}") 254 | accelerator.print(f"# warmup steps: {num_warmup_steps}") 255 | 256 | scheduler = get_cosine_schedule_with_warmup( 257 | optimizer=optimizer, 258 | num_warmup_steps=num_warmup_steps, 259 | num_training_steps=num_training_steps 260 | ) 261 | 262 | # scheduler = accelerator.prepare(scheduler) 263 | 264 | # ------- training setup --------------------------------------------------------------# 265 | best_lb = 1e6 266 | patience_tracker = 0 267 | current_iteration = 0 268 | 269 | # ------- training --------------------------------------------------------------------# 270 | start_time = time.time() 271 | accelerator.wait_for_everyone() 272 | 273 | for epoch in range(num_epochs): 274 | # close and reset progress bar 275 | if epoch != 0: 276 | progress_bar.close() 277 | 278 | progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process) 279 | loss_meter = AverageMeter() 280 | 281 | # Training ------ 282 | model.train() 283 | for step, batch in enumerate(train_dl): 284 | with accelerator.accumulate(model): # gives sync vs no sync context manager 285 | outputs = model(**batch) 286 | loss = outputs.loss 287 | accelerator.backward(loss) 288 | 289 | if accelerator.sync_gradients: 290 | accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm) 291 | 292 | optimizer.step() 293 | scheduler.step() 294 | optimizer.zero_grad() 295 | 296 | # check if loss.item() is okay for TPU 297 | # happening on all processes - values of loss meter in different processes are different 298 | loss_meter.update(loss.item()) # tracks loss in each batch, no accumulation 299 | 300 | if accelerator.sync_gradients: 301 | progress_bar.set_description( 302 | f"STEP: {current_iteration+1:5}/{num_training_steps:5}. " 303 | f"LR: {get_lr(optimizer):.4f}. " 304 | f"Loss: {loss_meter.avg:.4f}. " 305 | ) 306 | 307 | progress_bar.update(1) 308 | current_iteration += 1 309 | 310 | if cfg.use_wandb: 311 | accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration) # only on main process 312 | accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration) 313 | 314 | # >--------------------------------------------------| 315 | # >-- evaluation ------------------------------------| 316 | # >--------------------------------------------------| 317 | 318 | if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0): 319 | # set model in eval mode 320 | model.eval() 321 | scores_dict = run_evaluation(accelerator, model, valid_dl) 322 | lb = scores_dict["valid_loss"] 323 | 324 | print_line() 325 | et = as_minutes(time.time()-start_time) 326 | accelerator.print( 327 | f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}" 328 | ) 329 | print_line() 330 | accelerator.print(f">>> Current Valid Loss = {round(lb, 4)}") 331 | 332 | print_line() 333 | 334 | is_best = False 335 | if lb <= best_lb: 336 | best_lb = lb 337 | is_best = True 338 | patience_tracker = 0 339 | 340 | # ----- 341 | best_dict = dict() 342 | for k, v in scores_dict.items(): 343 | best_dict[f"{k}_at_best"] = v 344 | else: 345 | patience_tracker += 1 346 | 347 | # saving ----- 348 | accelerator.wait_for_everyone() 349 | unwrapped_model = accelerator.unwrap_model(model) 350 | 351 | unwrapped_model.save_pretrained( 352 | f"{cfg.outputs.model_dir}/last", 353 | state_dict=accelerator.get_state_dict(model), 354 | save_function=accelerator.save, 355 | ) 356 | 357 | if accelerator.is_main_process: 358 | tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/last") 359 | 360 | # logging ---- 361 | if cfg.use_wandb: 362 | accelerator.log({"lb": lb}, step=current_iteration) 363 | accelerator.log({"best_lb": best_lb}, step=current_iteration) 364 | 365 | # -- post eval 366 | model.train() 367 | torch.cuda.empty_cache() 368 | print_line() 369 | 370 | # early stopping ---- 371 | if patience_tracker >= cfg.train_params.patience: 372 | print("stopping early") 373 | model.eval() 374 | accelerator.end_training() 375 | return 376 | 377 | # --- end training 378 | accelerator.end_training() 379 | 380 | 381 | if __name__ == "__main__": 382 | run_training() 383 | -------------------------------------------------------------------------------- /code/train_r_clm_from_scratch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import time 5 | from copy import deepcopy 6 | from dataclasses import asdict, dataclass, field 7 | from functools import partial 8 | 9 | import bitsandbytes as bnb 10 | import datasets 11 | import hydra 12 | import numpy as np 13 | import pandas as pd 14 | import torch 15 | import torch.nn as nn 16 | import transformers 17 | from accelerate import Accelerator 18 | from accelerate.logging import get_logger 19 | from accelerate.utils import set_seed 20 | from omegaconf import OmegaConf 21 | from peft import (LoraConfig, TaskType, get_peft_model, 22 | prepare_model_for_kbit_training) 23 | from torch.utils.data import DataLoader 24 | from tqdm.auto import tqdm 25 | from transformers import (AutoModelForCausalLM, BitsAndBytesConfig, 26 | get_cosine_schedule_with_warmup) 27 | 28 | try: 29 | from r_clm.ai_dataset import AiDataset 30 | from r_clm.ai_loader import AiCollator, show_batch 31 | from r_clm.ai_optimizer import get_optimizer 32 | from utils.train_utils import AverageMeter, as_minutes, get_lr 33 | 34 | 35 | except Exception as e: 36 | print(e) 37 | raise ImportError 38 | 39 | logger = get_logger(__name__) 40 | 41 | 42 | def run_evaluation(accelerator, model, valid_dl): 43 | model.eval() 44 | 45 | all_losses = [] 46 | 47 | progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process) 48 | 49 | for step, batch in enumerate(valid_dl): 50 | with torch.no_grad(): 51 | outputs = model(**batch) 52 | 53 | loss = outputs.loss 54 | batch_losses = accelerator.gather_for_metrics(loss) 55 | batch_losses = batch_losses.cpu().numpy().tolist() 56 | 57 | all_losses.extend(batch_losses) 58 | progress_bar.update(1) 59 | progress_bar.close() 60 | 61 | # compute metric 62 | eval_dict = dict() # compute_metrics(all_predictions, all_truths) 63 | eval_dict['valid_loss'] = np.mean(all_losses) 64 | 65 | return eval_dict 66 | 67 | 68 | @hydra.main(version_base=None, config_path="../conf/r_clm", config_name="conf_r_clm") 69 | def run_training(cfg): 70 | # ------- Accelerator ---------------------------------------------------------------# 71 | if cfg.use_wandb: 72 | accelerator = Accelerator( 73 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 74 | log_with="wandb", 75 | ) 76 | 77 | accelerator.init_trackers( 78 | cfg.wandb.project, 79 | config=OmegaConf.to_container(cfg, resolve=True), 80 | ) 81 | 82 | else: 83 | accelerator = Accelerator( 84 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 85 | ) 86 | 87 | cfg_dict = OmegaConf.to_container(cfg, resolve=True) 88 | 89 | # Make one log on every process with the configuration for debugging. 90 | logging.basicConfig( 91 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 92 | datefmt="%m/%d/%Y %H:%M:%S", 93 | level=logging.INFO, 94 | ) 95 | logger.info(accelerator.state, main_process_only=False) 96 | 97 | # print_line = partial(print_line, accelerator) 98 | 99 | def print_line(): 100 | prefix, unit, suffix = "#", "~~", "#" 101 | accelerator.print(prefix + unit*50 + suffix) 102 | 103 | if accelerator.is_local_main_process: 104 | datasets.utils.logging.set_verbosity_warning() 105 | transformers.utils.logging.set_verbosity_info() 106 | else: 107 | datasets.utils.logging.set_verbosity_error() 108 | transformers.utils.logging.set_verbosity_error() 109 | 110 | # ------- Runtime Configs -----------------------------------------------------------# 111 | print_line() 112 | accelerator.print(f"setting seed: {cfg.seed}") 113 | set_seed(cfg.seed) 114 | 115 | if accelerator.is_main_process: 116 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 117 | print_line() 118 | 119 | # ------- load data -----------------------------------------------------------------# 120 | print_line() 121 | 122 | # load query dataframe --- 123 | essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"}) 124 | essay_df = essay_df[~essay_df['text'].isna()].copy() 125 | essay_df = essay_df.reset_index(drop=True) 126 | 127 | # ------- Data Split ----------------------------------------------------------------# 128 | # sample validation data 129 | rng = random.Random(cfg.seed) 130 | essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.98 else 'valid') 131 | train_df = essay_df[essay_df['fold'] == 'train'].copy() 132 | valid_df = essay_df[essay_df['fold'] == 'valid'].copy() 133 | 134 | train_df = train_df.reset_index(drop=True) 135 | valid_df = valid_df.reset_index(drop=True) 136 | 137 | accelerator.print(f"shape of train data: {train_df.shape}") 138 | accelerator.print(f"{train_df.head()}") 139 | accelerator.print(f"shape of validation data: {valid_df.shape}") 140 | 141 | with accelerator.main_process_first(): 142 | dataset_creator = AiDataset(cfg) 143 | 144 | train_ds = dataset_creator.get_dataset(train_df) 145 | valid_ds = dataset_creator.get_dataset(valid_df) 146 | 147 | tokenizer = dataset_creator.tokenizer 148 | 149 | train_ds.set_format( 150 | type=None, 151 | columns=[ 152 | 'input_ids', 153 | 'attention_mask', 154 | 'labels' 155 | ] 156 | ) 157 | 158 | # valid_ds = valid_ds.sort("input_length") 159 | 160 | valid_ds.set_format( 161 | type=None, 162 | columns=[ 163 | 'input_ids', 164 | 'attention_mask', 165 | 'labels' 166 | ] 167 | ) 168 | # valid_ids = valid_df["id"] # .tolist() 169 | 170 | data_collator = AiCollator( 171 | tokenizer=tokenizer, 172 | pad_to_multiple_of=64 173 | ) 174 | 175 | train_dl = DataLoader( 176 | train_ds, 177 | batch_size=cfg.train_params.per_device_train_batch_size, 178 | shuffle=True, 179 | collate_fn=data_collator, 180 | ) 181 | 182 | valid_dl = DataLoader( 183 | valid_ds, 184 | batch_size=cfg.train_params.per_device_eval_batch_size, 185 | shuffle=False, 186 | collate_fn=data_collator, 187 | ) 188 | 189 | accelerator.print("data preparation done...") 190 | print_line() 191 | 192 | # --- show batch -------------------------------------------------------------------# 193 | print_line() 194 | 195 | for b in train_dl: 196 | break 197 | show_batch(b, tokenizer, task='training', print_fn=accelerator.print) 198 | 199 | print_line() 200 | 201 | for b in valid_dl: 202 | break 203 | show_batch(b, tokenizer, task='training', print_fn=accelerator.print) 204 | 205 | # --- model -------------------------------------------------------------------------# 206 | print_line() 207 | 208 | model = AutoModelForCausalLM.from_pretrained( 209 | cfg.model.backbone_path, 210 | torch_dtype=torch.bfloat16, 211 | ) 212 | 213 | # model.config.pretraining_tp = 1 214 | # model.print_trainable_parameters() 215 | model.config.use_cache = False 216 | 217 | # --- optimizer ---------------------------------------------------------------------# 218 | print_line() 219 | optimizer = get_optimizer(cfg, model, print_fn=accelerator.print) 220 | 221 | # ------- Prepare -------------------------------------------------------------------# 222 | 223 | model, optimizer, train_dl, valid_dl = accelerator.prepare( 224 | model, optimizer, train_dl, valid_dl 225 | ) 226 | 227 | # ------- Scheduler -----------------------------------------------------------------# 228 | print_line() 229 | num_epochs = cfg.train_params.num_train_epochs 230 | grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps 231 | warmup_pct = cfg.train_params.warmup_pct 232 | 233 | num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps 234 | num_training_steps = num_epochs * num_update_steps_per_epoch 235 | num_warmup_steps = int(warmup_pct*num_training_steps) 236 | 237 | accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}") 238 | accelerator.print(f"# training steps: {num_training_steps}") 239 | accelerator.print(f"# warmup steps: {num_warmup_steps}") 240 | 241 | scheduler = get_cosine_schedule_with_warmup( 242 | optimizer=optimizer, 243 | num_warmup_steps=num_warmup_steps, 244 | num_training_steps=num_training_steps 245 | ) 246 | 247 | # ------- training setup --------------------------------------------------------------# 248 | best_lb = 1e6 249 | patience_tracker = 0 250 | current_iteration = 0 251 | 252 | # ------- training --------------------------------------------------------------------# 253 | start_time = time.time() 254 | accelerator.wait_for_everyone() 255 | 256 | for epoch in range(num_epochs): 257 | # close and reset progress bar 258 | if epoch != 0: 259 | progress_bar.close() 260 | 261 | progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process) 262 | loss_meter = AverageMeter() 263 | 264 | # Training ------ 265 | model.train() 266 | for step, batch in enumerate(train_dl): 267 | with accelerator.accumulate(model): # gives sync vs no sync context manager 268 | outputs = model(**batch) 269 | loss = outputs.loss 270 | accelerator.backward(loss) 271 | 272 | if accelerator.sync_gradients: 273 | accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm) 274 | 275 | optimizer.step() 276 | scheduler.step() 277 | optimizer.zero_grad() 278 | 279 | # check if loss.item() is okay for TPU 280 | # happening on all processes - values of loss meter in different processes are different 281 | loss_meter.update(loss.item()) # tracks loss in each batch, no accumulation 282 | 283 | if accelerator.sync_gradients: 284 | progress_bar.set_description( 285 | f"STEP: {current_iteration+1:5}/{num_training_steps:5}. " 286 | f"LR: {get_lr(optimizer):.4f}. " 287 | f"Loss: {loss_meter.avg:.4f}. " 288 | ) 289 | 290 | progress_bar.update(1) 291 | current_iteration += 1 292 | 293 | if cfg.use_wandb: 294 | accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration) # only on main process 295 | accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration) 296 | 297 | # >--------------------------------------------------| 298 | # >-- evaluation ------------------------------------| 299 | # >--------------------------------------------------| 300 | 301 | if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0): 302 | # set model in eval mode 303 | model.eval() 304 | scores_dict = run_evaluation(accelerator, model, valid_dl) 305 | lb = scores_dict["valid_loss"] 306 | 307 | print_line() 308 | et = as_minutes(time.time()-start_time) 309 | accelerator.print( 310 | f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}" 311 | ) 312 | print_line() 313 | accelerator.print(f">>> Current Valid Loss = {round(lb, 4)}") 314 | 315 | print_line() 316 | 317 | is_best = False 318 | if lb <= best_lb: 319 | best_lb = lb 320 | is_best = True 321 | patience_tracker = 0 322 | 323 | # ----- 324 | best_dict = dict() 325 | for k, v in scores_dict.items(): 326 | best_dict[f"{k}_at_best"] = v 327 | else: 328 | patience_tracker += 1 329 | 330 | # saving ----- 331 | accelerator.wait_for_everyone() 332 | unwrapped_model = accelerator.unwrap_model(model) 333 | 334 | unwrapped_model.save_pretrained( 335 | f"{cfg.outputs.model_dir}/last", 336 | state_dict=accelerator.get_state_dict(model), 337 | save_function=accelerator.save, 338 | ) 339 | 340 | if accelerator.is_main_process: 341 | tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/last") 342 | 343 | # logging ---- 344 | if cfg.use_wandb: 345 | accelerator.log({"lb": lb}, step=current_iteration) 346 | accelerator.log({"best_lb": best_lb}, step=current_iteration) 347 | 348 | # -- post eval 349 | model.train() 350 | torch.cuda.empty_cache() 351 | print_line() 352 | 353 | # early stopping ---- 354 | if patience_tracker >= cfg.train_params.patience: 355 | print("stopping early") 356 | model.eval() 357 | accelerator.end_training() 358 | return 359 | 360 | # --- end training 361 | accelerator.end_training() 362 | 363 | 364 | if __name__ == "__main__": 365 | run_training() 366 | -------------------------------------------------------------------------------- /code/train_r_detect.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import time 5 | from copy import deepcopy 6 | from dataclasses import asdict, dataclass, field 7 | from functools import partial 8 | 9 | import bitsandbytes as bnb 10 | import datasets 11 | import hydra 12 | import pandas as pd 13 | import torch 14 | import transformers 15 | from accelerate import Accelerator 16 | from accelerate.logging import get_logger 17 | from accelerate.utils import set_seed 18 | from omegaconf import OmegaConf 19 | from peft import (LoraConfig, TaskType, get_peft_model, 20 | prepare_model_for_kbit_training) 21 | from torch.utils.data import DataLoader 22 | from tqdm.auto import tqdm 23 | from transformers import (AutoModelForSequenceClassification, 24 | BitsAndBytesConfig, get_cosine_schedule_with_warmup) 25 | 26 | try: 27 | from r_detect.ai_dataset import AiDataset 28 | from r_detect.ai_loader import AiCollator, AiCollatorTrain, show_batch 29 | from r_detect.ai_model import (LlamaForDetectAI, MistralForDetectAI, 30 | PhiForDetectAI) 31 | from r_detect.ai_optimizer import get_optimizer 32 | from utils.metric_utils import compute_metrics 33 | from utils.train_utils import AverageMeter, as_minutes, get_lr 34 | 35 | 36 | except Exception as e: 37 | print(e) 38 | raise ImportError 39 | 40 | logger = get_logger(__name__) 41 | 42 | 43 | def run_evaluation(accelerator, model, valid_dl, valid_ids): 44 | model.eval() 45 | 46 | all_predictions = [] 47 | all_truths = [] 48 | 49 | progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process) 50 | 51 | for step, batch in enumerate(valid_dl): 52 | with torch.no_grad(): 53 | outputs = model(**batch) 54 | 55 | logits = outputs.logits 56 | 57 | predictions = torch.sigmoid(logits) 58 | predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"].to(torch.long).reshape(-1))) 59 | predictions, references = predictions.cpu().numpy().tolist(), references.cpu().numpy().tolist() 60 | 61 | all_predictions.extend(predictions) 62 | all_truths.extend(references) 63 | 64 | progress_bar.update(1) 65 | progress_bar.close() 66 | 67 | # compute metric 68 | eval_dict = compute_metrics(all_predictions, all_truths) 69 | 70 | result_df = pd.DataFrame() 71 | result_df["id"] = valid_ids 72 | result_df["predictions"] = all_predictions 73 | result_df["truths"] = all_truths 74 | 75 | oof_df = deepcopy(result_df) 76 | oof_df = oof_df.rename(columns={"predictions": "generated"}) 77 | oof_df = oof_df[["id", "generated"]].copy() 78 | 79 | to_return = { 80 | "scores": eval_dict, 81 | "result_df": result_df, 82 | "oof_df": oof_df, 83 | } 84 | 85 | return to_return 86 | 87 | 88 | @hydra.main(version_base=None, config_path="../conf/r_detect", config_name="conf_r_detect") 89 | def run_training(cfg): 90 | # ------- Accelerator ---------------------------------------------------------------# 91 | if cfg.use_wandb: 92 | accelerator = Accelerator( 93 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 94 | log_with="wandb", 95 | # mixed_precision='fp16', 96 | ) 97 | 98 | accelerator.init_trackers( 99 | cfg.wandb.project, 100 | config=OmegaConf.to_container(cfg, resolve=True), 101 | ) 102 | 103 | else: 104 | accelerator = Accelerator( 105 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 106 | # mixed_precision='fp16', 107 | ) 108 | 109 | cfg_dict = OmegaConf.to_container(cfg, resolve=True) 110 | 111 | # Make one log on every process with the configuration for debugging. 112 | logging.basicConfig( 113 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 114 | datefmt="%m/%d/%Y %H:%M:%S", 115 | level=logging.INFO, 116 | ) 117 | logger.info(accelerator.state, main_process_only=False) 118 | 119 | # print_line = partial(print_line, accelerator) 120 | 121 | def print_line(): 122 | prefix, unit, suffix = "#", "~~", "#" 123 | accelerator.print(prefix + unit*50 + suffix) 124 | 125 | if accelerator.is_local_main_process: 126 | datasets.utils.logging.set_verbosity_warning() 127 | transformers.utils.logging.set_verbosity_info() 128 | else: 129 | datasets.utils.logging.set_verbosity_error() 130 | transformers.utils.logging.set_verbosity_error() 131 | 132 | # ------- Runtime Configs -----------------------------------------------------------# 133 | print_line() 134 | accelerator.print(f"setting seed: {cfg.seed}") 135 | set_seed(cfg.seed) 136 | 137 | if accelerator.is_main_process: 138 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 139 | print_line() 140 | 141 | # ------- load data -----------------------------------------------------------------# 142 | print_line() 143 | data_dir = cfg.input_data_dir 144 | 145 | try: 146 | essay_df = pd.read_csv(os.path.join(data_dir, "train_essays.csv")) 147 | except Exception as e: 148 | essay_df = pd.read_parquet(os.path.join(data_dir, "train_essays.parquet")) 149 | 150 | essay_df = essay_df[~essay_df['text'].isna()].copy() 151 | essay_df = essay_df.reset_index(drop=True) 152 | 153 | # train_df = pd.read_parquet(os.path.join(data_dir, "train_essays.parquet")) 154 | # train_df = train_df[~train_df['text'].isna()].copy() 155 | 156 | # valid_df = pd.read_parquet(os.path.join(data_dir, "valid_essays.parquet")) 157 | # valid_df = valid_df[~valid_df['text'].isna()].copy() 158 | 159 | rng = random.Random(cfg.seed) 160 | essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.99 else 'valid') 161 | train_df = essay_df[essay_df['fold'] == 'train'].copy() 162 | valid_df = essay_df[essay_df['fold'] == 'valid'].copy() 163 | 164 | train_df = train_df.reset_index(drop=True) 165 | valid_df = valid_df.reset_index(drop=True) 166 | 167 | accelerator.print(f"shape of train data: {train_df.shape}") 168 | accelerator.print(f"{train_df.head()}") 169 | accelerator.print(f"shape of validation data: {valid_df.shape}") 170 | 171 | with accelerator.main_process_first(): 172 | dataset_creator = AiDataset(cfg) 173 | 174 | train_ds = dataset_creator.get_dataset(train_df) 175 | valid_ds = dataset_creator.get_dataset(valid_df) 176 | 177 | tokenizer = dataset_creator.tokenizer 178 | 179 | train_ds.set_format( 180 | type=None, 181 | columns=[ 182 | 'id', 183 | 'input_ids', 184 | 'attention_mask', 185 | 'generated' 186 | ] 187 | ) 188 | 189 | valid_ds = valid_ds.sort("input_length") 190 | 191 | valid_ds.set_format( 192 | type=None, 193 | columns=[ 194 | 'id', 195 | 'input_ids', 196 | 'attention_mask', 197 | 'generated' 198 | ] 199 | ) 200 | valid_ids = valid_df["id"] # .tolist() 201 | 202 | data_collator = AiCollator( 203 | tokenizer=tokenizer, 204 | pad_to_multiple_of=64 205 | ) 206 | data_collator_train = AiCollatorTrain( 207 | tokenizer=tokenizer, 208 | pad_to_multiple_of=64, 209 | kwargs=dict(cfg=cfg) 210 | ) 211 | 212 | train_dl = DataLoader( 213 | train_ds, 214 | batch_size=cfg.train_params.per_device_train_batch_size, 215 | shuffle=True, 216 | collate_fn=data_collator_train, 217 | ) 218 | 219 | valid_dl = DataLoader( 220 | valid_ds, 221 | batch_size=cfg.train_params.per_device_eval_batch_size, 222 | shuffle=False, 223 | collate_fn=data_collator, 224 | ) 225 | 226 | accelerator.print("data preparation done...") 227 | print_line() 228 | 229 | # --- show batch -------------------------------------------------------------------# 230 | print_line() 231 | 232 | for b in train_dl: 233 | break 234 | show_batch(b, tokenizer, task='training', print_fn=accelerator.print) 235 | 236 | print_line() 237 | 238 | for b in valid_dl: 239 | break 240 | show_batch(b, tokenizer, task='training', print_fn=accelerator.print) 241 | 242 | # --- model -------------------------------------------------------------------------# 243 | print_line() 244 | bnb_config = BitsAndBytesConfig( 245 | load_in_4bit=True, 246 | bnb_4bit_quant_type="nf4", 247 | bnb_4bit_use_double_quant=True, 248 | bnb_4bit_compute_dtype=torch.float16 249 | ) 250 | 251 | if 'solar' in cfg.model.backbone_path.lower(): 252 | base_model = LlamaForDetectAI.from_pretrained( 253 | cfg.model.backbone_path, 254 | num_labels=cfg.model.num_labels, # 2 255 | quantization_config=bnb_config, 256 | ) 257 | elif 'phi' in cfg.model.backbone_path.lower(): 258 | base_model = PhiForDetectAI.from_pretrained( 259 | cfg.model.backbone_path, 260 | num_labels=cfg.model.num_labels, # 2 261 | quantization_config=bnb_config, 262 | trust_remote_code=True, # IMP 263 | ) 264 | else: 265 | base_model = MistralForDetectAI.from_pretrained( 266 | cfg.model.backbone_path, 267 | num_labels=cfg.model.num_labels, # 2 268 | quantization_config=bnb_config, 269 | ) 270 | # base_model.peft_config = dict() 271 | 272 | base_model.config.pretraining_tp = 1 273 | # base_model.config.pad_token_id = tokenizer.pad_token_id 274 | 275 | # # base_model = prepare_model_for_kbit_training(base_model, use_gradient_checkpointing=True) 276 | # for param in base_model.parameters(): 277 | # if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16): 278 | # param.data = param.data.to(torch.float32) 279 | 280 | # lora --- 281 | peft_config = LoraConfig( 282 | r=cfg.model.lora.r, 283 | lora_alpha=cfg.model.lora.lora_alpha, 284 | lora_dropout=cfg.model.lora.lora_dropout, 285 | bias="none", 286 | task_type=TaskType.SEQ_CLS, 287 | inference_mode=False, 288 | target_modules=cfg_dict["model"]["lora"]["target_modules"], 289 | modules_to_save=cfg_dict["model"]["lora"]["modules_to_save"], 290 | ) 291 | 292 | model = get_peft_model(base_model, peft_config) 293 | print(model.device) 294 | model.print_trainable_parameters() 295 | accelerator.wait_for_everyone() 296 | 297 | # --- optimizer ---------------------------------------------------------------------# 298 | print_line() 299 | optimizer = get_optimizer(cfg, model, print_fn=accelerator.print) 300 | 301 | # ------- Prepare -------------------------------------------------------------------# 302 | 303 | model, optimizer, train_dl, valid_dl = accelerator.prepare( 304 | model, optimizer, train_dl, valid_dl 305 | ) 306 | 307 | # ------- Scheduler -----------------------------------------------------------------# 308 | print_line() 309 | num_epochs = cfg.train_params.num_train_epochs 310 | grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps 311 | warmup_pct = cfg.train_params.warmup_pct 312 | 313 | num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps 314 | num_training_steps = num_epochs * num_update_steps_per_epoch 315 | num_warmup_steps = int(warmup_pct*num_training_steps) 316 | 317 | accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}") 318 | accelerator.print(f"# training steps: {num_training_steps}") 319 | accelerator.print(f"# warmup steps: {num_warmup_steps}") 320 | 321 | scheduler = get_cosine_schedule_with_warmup( 322 | optimizer=optimizer, 323 | num_warmup_steps=num_warmup_steps, 324 | num_training_steps=num_training_steps 325 | ) 326 | 327 | # scheduler = accelerator.prepare(scheduler) 328 | 329 | # ------- training setup --------------------------------------------------------------# 330 | best_lb = -1. 331 | save_trigger = cfg.train_params.save_trigger 332 | 333 | patience_tracker = 0 334 | current_iteration = 0 335 | 336 | # ------- training --------------------------------------------------------------------# 337 | start_time = time.time() 338 | accelerator.wait_for_everyone() 339 | 340 | for epoch in range(num_epochs): 341 | # close and reset progress bar 342 | if epoch != 0: 343 | progress_bar.close() 344 | 345 | progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process) 346 | loss_meter = AverageMeter() 347 | 348 | # Training ------ 349 | model.train() 350 | for step, batch in enumerate(train_dl): 351 | with accelerator.accumulate(model): # gives sync vs no sync context manager 352 | outputs = model(**batch) 353 | loss = outputs.loss 354 | accelerator.backward(loss) 355 | 356 | if accelerator.sync_gradients: 357 | # Q: why need this check? 358 | # A: gradient_state.sync_gradients check is NOT performed inside clip_grad_norm_ 359 | accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm) 360 | 361 | optimizer.step() # gradient_state.sync_gradients check is performed inside optimizer.step 362 | scheduler.step() 363 | optimizer.zero_grad() 364 | 365 | # check if loss.item() is okay for TPU 366 | # happening on all processes - values of loss meter in different processes are different 367 | loss_meter.update(loss.item()) # tracks loss in each batch, no accumulation 368 | 369 | if accelerator.sync_gradients: 370 | progress_bar.set_description( 371 | f"STEP: {current_iteration+1:5}/{num_training_steps:5}. " 372 | f"LR: {get_lr(optimizer):.4f}. " 373 | f"Loss: {loss_meter.avg:.4f}. " 374 | ) 375 | 376 | progress_bar.update(1) 377 | current_iteration += 1 378 | 379 | if cfg.use_wandb: 380 | accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration) # only on main process 381 | accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration) 382 | 383 | # >--------------------------------------------------| 384 | # >-- evaluation ------------------------------------| 385 | # >--------------------------------------------------| 386 | 387 | if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0): 388 | # set model in eval mode 389 | model.eval() 390 | eval_response = run_evaluation(accelerator, model, valid_dl, valid_ids) 391 | 392 | scores_dict = eval_response["scores"] 393 | result_df = eval_response["result_df"] 394 | oof_df = eval_response["oof_df"] 395 | 396 | lb = scores_dict["lb"] 397 | 398 | print_line() 399 | et = as_minutes(time.time()-start_time) 400 | accelerator.print( 401 | f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}" 402 | ) 403 | print_line() 404 | accelerator.print(f">>> Current LB (AUC) = {round(lb, 4)}") 405 | 406 | print_line() 407 | 408 | is_best = False 409 | if lb >= best_lb: 410 | best_lb = lb 411 | is_best = True 412 | patience_tracker = 0 413 | 414 | # ----- 415 | best_dict = dict() 416 | for k, v in scores_dict.items(): 417 | best_dict[f"{k}_at_best"] = v 418 | else: 419 | patience_tracker += 1 420 | 421 | if is_best: # do in main process 422 | oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_best.csv"), index=False) 423 | result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_best.csv"), index=False) 424 | else: 425 | accelerator.print(f">>> patience reached {patience_tracker}/{cfg.train_params.patience}") 426 | accelerator.print(f">>> current best score: {round(best_lb, 4)}") 427 | 428 | oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_last.csv"), index=False) 429 | result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_last.csv"), index=False) 430 | 431 | # saving ----- 432 | accelerator.wait_for_everyone() 433 | unwrapped_model = accelerator.unwrap_model(model) 434 | 435 | # # debug -- 436 | # selected_adapters = list(unwrapped_model.peft_config.keys()) 437 | # accelerator.print(f"selected adapters: {selected_adapters}") 438 | # for adapter_name in selected_adapters: 439 | # peft_config = unwrapped_model.peft_config[adapter_name] 440 | # peft_config = asdict(peft_config) 441 | # accelerator.print(f"adapter: {adapter_name}") 442 | # accelerator.print(peft_config) 443 | # for k, v in peft_config.items(): 444 | # accelerator.print(f"{k}: {v} ({type(v)})") 445 | # # ------ 446 | unwrapped_model.save_pretrained( 447 | f"{cfg.outputs.model_dir}/last", 448 | state_dict=accelerator.get_state_dict(model), 449 | save_function=accelerator.save, 450 | ) 451 | 452 | if accelerator.is_main_process: 453 | tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/last") 454 | 455 | if best_lb > save_trigger: 456 | if accelerator.is_main_process: 457 | tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/best") 458 | unwrapped_model.save_pretrained( 459 | f"{cfg.outputs.model_dir}/best", 460 | state_dict=accelerator.get_state_dict(model), 461 | save_function=accelerator.save, 462 | ) 463 | if accelerator.is_main_process: 464 | tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/best") 465 | 466 | # logging ---- 467 | if cfg.use_wandb: 468 | accelerator.log({"lb": lb}, step=current_iteration) 469 | accelerator.log({"best_lb": best_lb}, step=current_iteration) 470 | 471 | # -- log scores dict 472 | for k, v in scores_dict.items(): 473 | accelerator.log({k: round(v, 4)}, step=current_iteration) 474 | 475 | # --- log best scores dict 476 | for k, v in best_dict.items(): 477 | accelerator.log({k: round(v, 4)}, step=current_iteration) 478 | 479 | # -- post eval 480 | model.train() 481 | torch.cuda.empty_cache() 482 | print_line() 483 | 484 | # early stopping ---- 485 | if patience_tracker >= cfg.train_params.patience: 486 | print("stopping early") 487 | model.eval() 488 | accelerator.end_training() 489 | return 490 | 491 | # --- end training 492 | accelerator.end_training() 493 | 494 | 495 | if __name__ == "__main__": 496 | run_training() 497 | -------------------------------------------------------------------------------- /code/train_r_dpo.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_dpo.py 2 | 3 | import argparse 4 | import os 5 | 6 | import pandas as pd 7 | import torch 8 | from accelerate import Accelerator 9 | from accelerate.utils import set_seed 10 | from datasets import Dataset, DatasetDict 11 | from omegaconf import OmegaConf 12 | from peft import LoraConfig, PeftConfig, PeftModel, TaskType 13 | from transformers import (AutoModelForCausalLM, AutoTokenizer, 14 | BitsAndBytesConfig, TrainingArguments) 15 | from trl import DPOTrainer 16 | 17 | 18 | def get_datasets(cfg): 19 | """ 20 | prepare training and test datasets for DPO 21 | """ 22 | raw_datasets = DatasetDict() 23 | train_df = pd.read_parquet(cfg.train_path) # prompt, chosen, rejected 24 | test_df = pd.read_parquet(cfg.test_path) 25 | 26 | train_ds = Dataset.from_pandas(train_df) 27 | test_ds = Dataset.from_pandas(test_df) 28 | 29 | train_ds = train_ds.remove_columns(["dpo_id", "diff"]) 30 | test_ds = test_ds.remove_columns(["dpo_id", "diff"]) 31 | 32 | raw_datasets["train"] = train_ds 33 | raw_datasets["test"] = test_ds 34 | 35 | return raw_datasets 36 | 37 | 38 | def get_tokenizer(cfg): 39 | 40 | tokenizer = AutoTokenizer.from_pretrained( 41 | cfg.sft_model_path, 42 | use_fast=True, 43 | padding_side='left', 44 | truncation_side='left', 45 | ) 46 | 47 | if tokenizer.pad_token is None: 48 | if tokenizer.unk_token is not None: 49 | tokenizer.pad_token = tokenizer.unk_token 50 | else: 51 | tokenizer.pad_token = tokenizer.eos_token 52 | return tokenizer 53 | 54 | 55 | def main(cfg): 56 | cfg_dict = OmegaConf.to_container(cfg, resolve=True) 57 | 58 | # Set seed for reproducibility 59 | set_seed(cfg.seed) 60 | 61 | # set up accelerator 62 | accelerator = Accelerator() 63 | 64 | # datasets --- 65 | raw_datasets = get_datasets(cfg) 66 | tokenizer = get_tokenizer(cfg) 67 | 68 | quantization_config = BitsAndBytesConfig( 69 | load_in_4bit=True, 70 | bnb_4bit_quant_type="nf4", 71 | bnb_4bit_use_double_quant=True, 72 | bnb_4bit_compute_dtype=torch.float16, 73 | ) 74 | 75 | # model ---- 76 | accelerator.print(f"Merging peft adapters for {cfg.sft_model_path}") 77 | peft_config = PeftConfig.from_pretrained(cfg.sft_model_path) 78 | base_model = AutoModelForCausalLM.from_pretrained( 79 | peft_config.base_model_name_or_path, 80 | quantization_config=quantization_config, 81 | ) 82 | 83 | model = PeftModel.from_pretrained(base_model, cfg.sft_model_path) 84 | model.eval() 85 | model = model.merge_and_unload() 86 | model_kwargs = None # {"use_cache": False} 87 | 88 | peft_config = LoraConfig( 89 | r=cfg.lora.r, 90 | lora_alpha=cfg.lora.lora_alpha, 91 | lora_dropout=cfg.lora.lora_dropout, 92 | bias="none", 93 | task_type=TaskType.CAUSAL_LM, 94 | inference_mode=False, 95 | target_modules=cfg_dict["lora"]["target_modules"], 96 | ) 97 | 98 | ref_model = None 99 | ref_model_kwargs = None # {"use_cache": False} 100 | 101 | # Training args --- 102 | training_args = TrainingArguments( 103 | output_dir=cfg.output_dir, 104 | learning_rate=cfg.learning_rate, 105 | # lr_scheduler_type=cfg.lr_scheduler_type, 106 | per_device_train_batch_size=cfg.per_device_train_batch_size, 107 | per_device_eval_batch_size=cfg.per_device_eval_batch_size, 108 | gradient_accumulation_steps=cfg.gradient_accumulation_steps, 109 | max_grad_norm=cfg.max_grad_norm, 110 | optim=cfg.dpo.optim, 111 | num_train_epochs=cfg.num_train_epochs, 112 | evaluation_strategy="steps", 113 | save_strategy="steps", 114 | eval_steps=50, 115 | save_steps=50, 116 | save_total_limit=None, 117 | warmup_steps=cfg.warmup_ratio, 118 | logging_steps=1, 119 | report_to='wandb', 120 | # gradient_checkpointing=True, 121 | ) 122 | 123 | # DPO Trainer --- 124 | dpo_trainer = DPOTrainer( 125 | model, 126 | ref_model, 127 | model_init_kwargs=model_kwargs, 128 | ref_model_init_kwargs=ref_model_kwargs, 129 | args=training_args, 130 | beta=cfg.dpo.beta, 131 | train_dataset=raw_datasets["train"], 132 | eval_dataset=raw_datasets["test"], 133 | tokenizer=tokenizer, 134 | max_length=cfg.dpo.max_length, 135 | max_prompt_length=cfg.dpo.max_prompt_length, 136 | peft_config=peft_config, 137 | ) 138 | 139 | # Training loop --- 140 | train_result = dpo_trainer.train() 141 | metrics = train_result.metrics 142 | # max_train_samples = int(0.25*len(raw_datasets["train"])) 143 | # metrics["train_samples"] = min(max_train_samples, len(raw_datasets["train"])) 144 | dpo_trainer.log_metrics("train", metrics) 145 | dpo_trainer.save_metrics("train", metrics) 146 | dpo_trainer.save_state() 147 | 148 | accelerator.print("*** Training complete ***") 149 | 150 | # Evaluation loop --- 151 | accelerator.print("*** Evaluate ***") 152 | metrics = dpo_trainer.evaluate() 153 | dpo_trainer.log_metrics("eval", metrics) 154 | dpo_trainer.save_metrics("eval", metrics) 155 | 156 | # Save model --- 157 | dpo_trainer.save_model(cfg.output_dir) 158 | 159 | # Ensure we don't timeout on model save / push to Hub 160 | accelerator.print("*** Waiting for all processes to finish ***") 161 | accelerator.wait_for_everyone() 162 | 163 | accelerator.print("*** Run complete! ***") 164 | 165 | 166 | if __name__ == "__main__": 167 | ap = argparse.ArgumentParser() 168 | ap.add_argument('--config_path', type=str, required=True) 169 | args = ap.parse_args() 170 | cfg = OmegaConf.load(args.config_path) 171 | 172 | os.makedirs(cfg.output_dir, exist_ok=True) 173 | main(cfg) 174 | -------------------------------------------------------------------------------- /code/train_r_embed.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import random 5 | import time 6 | from copy import deepcopy 7 | 8 | import datasets 9 | import hydra 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | import transformers 14 | import wandb 15 | from accelerate import Accelerator 16 | from accelerate.logging import get_logger 17 | from accelerate.utils import set_seed 18 | from omegaconf import OmegaConf 19 | from torch.utils.data import DataLoader 20 | from tqdm.auto import tqdm 21 | from transformers import get_cosine_schedule_with_warmup 22 | 23 | try: 24 | from r_embed.ai_dataset import AiDataset 25 | from r_embed.ai_loader import AiCollator, AiCollatorTrain, show_batch 26 | from r_embed.ai_model import AiModel 27 | from r_embed.ai_optimizer import get_optimizer 28 | from utils.train_utils import (AverageMeter, as_minutes, get_lr, 29 | save_checkpoint) 30 | 31 | except Exception as e: 32 | print(e) 33 | raise ImportError 34 | 35 | logger = get_logger(__name__) 36 | 37 | 38 | pd.options.display.max_colwidth = 1000 39 | 40 | # -------- Evaluation -------------------------------------------------------------# 41 | 42 | 43 | def run_evaluation(accelerator, model, valid_dl): 44 | model.eval() 45 | 46 | all_losses = [] 47 | 48 | progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process) 49 | 50 | for batch in valid_dl: 51 | with torch.no_grad(): 52 | loss = model(**batch) 53 | 54 | batch_losses = accelerator.gather_for_metrics(loss) 55 | batch_losses = batch_losses.cpu().numpy().tolist() 56 | all_losses.extend(batch_losses) 57 | 58 | progress_bar.update(1) 59 | progress_bar.close() 60 | 61 | # compute metric 62 | eval_dict = dict() 63 | eval_dict['valid_loss'] = np.mean(all_losses) 64 | 65 | return eval_dict 66 | 67 | 68 | # -------- Main Function ---------------------------------------------------------# 69 | 70 | 71 | @hydra.main(version_base=None, config_path="../conf/r_embed", config_name="conf_r_embed") 72 | def run_training(cfg): 73 | # ------- Accelerator ---------------------------------------------------------------# 74 | if cfg.use_wandb: 75 | accelerator = Accelerator( 76 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 77 | log_with="wandb", 78 | ) 79 | 80 | accelerator.init_trackers( 81 | cfg.wandb.project, 82 | config=OmegaConf.to_container(cfg, resolve=True), 83 | ) 84 | 85 | else: 86 | accelerator = Accelerator( 87 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 88 | ) 89 | 90 | cfg_dict = OmegaConf.to_container(cfg, resolve=True) 91 | 92 | # Make one log on every process with the configuration for debugging. 93 | logging.basicConfig( 94 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 95 | datefmt="%m/%d/%Y %H:%M:%S", 96 | level=logging.INFO, 97 | ) 98 | logger.info(accelerator.state, main_process_only=False) 99 | 100 | def print_line(): 101 | prefix, unit, suffix = "#", "~~", "#" 102 | accelerator.print(prefix + unit*50 + suffix) 103 | 104 | if accelerator.is_local_main_process: 105 | datasets.utils.logging.set_verbosity_warning() 106 | transformers.utils.logging.set_verbosity_info() 107 | else: 108 | datasets.utils.logging.set_verbosity_error() 109 | transformers.utils.logging.set_verbosity_error() 110 | 111 | # ------- Runtime Configs -----------------------------------------------------------# 112 | print_line() 113 | accelerator.print(f"setting seed: {cfg.seed}") 114 | set_seed(cfg.seed) 115 | 116 | if accelerator.is_main_process: 117 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 118 | print_line() 119 | 120 | # ------- load data ----------------------------------------------------------# 121 | print_line() 122 | data_dir = cfg.input_data_dir 123 | 124 | train_df = pd.read_parquet(os.path.join(data_dir, "train_essays.parquet")) 125 | train_df = train_df[~train_df['text'].isna()].copy() 126 | 127 | valid_df = pd.read_parquet(os.path.join(data_dir, "valid_essays.parquet")) 128 | valid_df = valid_df[~valid_df['text'].isna()].copy() 129 | 130 | train_df = train_df.reset_index(drop=True) 131 | valid_df = valid_df.reset_index(drop=True) 132 | 133 | prompt_ids = train_df["prompt_id"].unique().tolist() 134 | prompt_ids = [p for p in prompt_ids if p <= 8] 135 | 136 | pos_df = train_df[train_df["generated"] == 1].copy() 137 | neg_df = train_df[train_df["generated"] == 0].copy() 138 | 139 | pos_gdf = pos_df.groupby("prompt_id")["id"].apply(list).reset_index() 140 | prompt2ids_pos = dict(zip(pos_gdf["prompt_id"], pos_gdf["id"])) 141 | 142 | neg_gdf = neg_df.groupby("prompt_id")["id"].apply(list).reset_index() 143 | prompt2ids_neg = dict(zip(neg_gdf["prompt_id"], neg_gdf["id"])) 144 | 145 | accelerator.print(f"shape of train data: {train_df.shape}") 146 | accelerator.print(f"{train_df.head()}") 147 | accelerator.print(f"shape of validation data: {valid_df.shape}") 148 | accelerator.print(f"Prompts: {prompt_ids}") 149 | 150 | with accelerator.main_process_first(): 151 | dataset_creator = AiDataset(cfg) 152 | 153 | train_ds = dataset_creator.get_dataset(train_df) 154 | valid_ds = dataset_creator.get_dataset(valid_df) 155 | 156 | tokenizer = dataset_creator.tokenizer 157 | 158 | # ------- data loaders ----------------------------------------------------------------# 159 | train_ds.set_format( 160 | type=None, 161 | columns=[ 162 | 'id', 163 | 'input_ids', 164 | 'attention_mask', 165 | 'generated' 166 | ] 167 | ) 168 | 169 | valid_ds = valid_ds.sort("input_length") 170 | 171 | valid_ds.set_format( 172 | type=None, 173 | columns=[ 174 | 'id', 175 | 'input_ids', 176 | 'attention_mask', 177 | 'generated' 178 | ] 179 | ) 180 | valid_ids = valid_df["id"] 181 | 182 | # --- 183 | kwargs = dict( 184 | train_ds=train_ds, 185 | prompt_ids=prompt_ids, 186 | prompt2ids_pos=prompt2ids_pos, 187 | prompt2ids_neg=prompt2ids_neg, 188 | ) 189 | 190 | data_collector_train = AiCollatorTrain( 191 | tokenizer=tokenizer, 192 | pad_to_multiple_of=64, 193 | kwargs=kwargs, 194 | ) 195 | 196 | data_collector = AiCollator( 197 | tokenizer=tokenizer, 198 | pad_to_multiple_of=64 199 | ) 200 | 201 | train_dl = DataLoader( 202 | train_ds, 203 | batch_size=cfg.train_params.per_device_train_batch_size, 204 | shuffle=True, 205 | collate_fn=data_collector_train, 206 | ) 207 | 208 | valid_dl = DataLoader( 209 | valid_ds, 210 | batch_size=cfg.train_params.per_device_eval_batch_size, 211 | shuffle=False, 212 | collate_fn=data_collector, 213 | ) 214 | 215 | accelerator.print("data preparation done...") 216 | print_line() 217 | 218 | # --- show batch -------------------------------------------------------------------# 219 | print_line() 220 | 221 | for b in train_dl: 222 | break 223 | show_batch(b, tokenizer, task='training', print_fn=print, n_examples=4) 224 | 225 | print_line() 226 | 227 | for b in valid_dl: 228 | break 229 | show_batch(b, tokenizer, task='validation', print_fn=accelerator.print) 230 | 231 | print_line() 232 | 233 | # ------- Config -------------------------------------------------------------------# 234 | accelerator.print("config for the current run:") 235 | accelerator.print(json.dumps(cfg_dict, indent=4)) 236 | print_line() 237 | 238 | # ------- Model --------------------------------------------------------------------# 239 | print_line() 240 | print("creating the LLM Detection model...") 241 | model = AiModel(cfg, accelerator.device) 242 | print_line() 243 | 244 | # ------- Optimizer ----------------------------------------------------------------# 245 | print_line() 246 | print("creating the optimizer...") 247 | optimizer = get_optimizer(model, cfg) 248 | # ------- Prepare -------------------------------------------------------------------# 249 | 250 | model, optimizer, train_dl, valid_dl = accelerator.prepare( 251 | model, optimizer, train_dl, valid_dl 252 | ) 253 | 254 | # ------- Scheduler -----------------------------------------------------------------# 255 | print_line() 256 | num_epochs = cfg.train_params.num_train_epochs 257 | grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps 258 | warmup_pct = cfg.train_params.warmup_pct 259 | 260 | num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps 261 | num_training_steps = num_epochs * num_update_steps_per_epoch 262 | num_warmup_steps = int(warmup_pct*num_training_steps) 263 | 264 | accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}") 265 | accelerator.print(f"# training steps: {num_training_steps}") 266 | accelerator.print(f"# warmup steps: {num_warmup_steps}") 267 | 268 | scheduler = get_cosine_schedule_with_warmup( 269 | optimizer=optimizer, 270 | num_warmup_steps=num_warmup_steps, 271 | num_training_steps=num_training_steps 272 | ) 273 | 274 | # ------- training setup --------------------------------------------------------------# 275 | best_lb = 1e6 # track recall@1000 276 | 277 | patience_tracker = 0 278 | current_iteration = 0 279 | 280 | # ------- training --------------------------------------------------------------------# 281 | start_time = time.time() 282 | accelerator.wait_for_everyone() 283 | 284 | for epoch in range(num_epochs): 285 | # close and reset progress bar 286 | if epoch != 0: 287 | progress_bar.close() 288 | 289 | progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process) 290 | loss_meter = AverageMeter() 291 | 292 | # Training ------ 293 | model.train() 294 | for step, batch in enumerate(train_dl): 295 | with accelerator.accumulate(model): 296 | loss = model(**batch) 297 | accelerator.backward(loss) 298 | 299 | if accelerator.sync_gradients: 300 | 301 | accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm) 302 | 303 | optimizer.step() # gradient_state.sync_gradients check is performed inside optimizer.step 304 | scheduler.step() 305 | optimizer.zero_grad() 306 | 307 | loss_meter.update(loss.item()) 308 | 309 | if accelerator.sync_gradients: 310 | progress_bar.set_description( 311 | f"STEP: {current_iteration+1:5}/{num_update_steps_per_epoch:5}. " 312 | f"LR: {get_lr(optimizer):.4f}. " 313 | f"Loss: {loss_meter.avg:.4f}. " 314 | ) 315 | 316 | progress_bar.update(1) 317 | current_iteration += 1 318 | 319 | if cfg.use_wandb: 320 | accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration) 321 | accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration) 322 | 323 | # >--------------------------------------------------| 324 | # >-- evaluation ------------------------------------| 325 | # >--------------------------------------------------| 326 | 327 | if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0): 328 | # set model in eval mode 329 | model.eval() 330 | scores_dict = run_evaluation(accelerator, model, valid_dl) 331 | lb = scores_dict["valid_loss"] 332 | 333 | print_line() 334 | et = as_minutes(time.time()-start_time) 335 | accelerator.print( 336 | f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}" 337 | ) 338 | print_line() 339 | accelerator.print(f">>> Current LB (valid_loss) = {round(lb, 4)}") 340 | 341 | print_line() 342 | 343 | is_best = False 344 | if lb <= best_lb: 345 | best_lb = lb 346 | is_best = True 347 | patience_tracker = 0 348 | 349 | # ----- 350 | best_dict = dict() 351 | for k, v in scores_dict.items(): 352 | best_dict[f"{k}_at_best"] = v 353 | else: 354 | patience_tracker += 1 355 | 356 | # saving ----- 357 | accelerator.wait_for_everyone() 358 | unwrapped_model = accelerator.unwrap_model(model) 359 | model_state = { 360 | 'step': current_iteration, 361 | 'epoch': epoch + 1, 362 | 'state_dict': unwrapped_model.state_dict(), 363 | 'lb': lb, 364 | } 365 | 366 | if accelerator.is_main_process: 367 | save_checkpoint(cfg, model_state, is_best=is_best) 368 | 369 | # -- post eval 370 | model.train() 371 | torch.cuda.empty_cache() 372 | print_line() 373 | 374 | # early stopping ---- 375 | if patience_tracker >= cfg.train_params.patience: 376 | print("stopping early") 377 | model.eval() 378 | accelerator.end_training() 379 | return 380 | 381 | 382 | if __name__ == "__main__": 383 | run_training() 384 | -------------------------------------------------------------------------------- /code/train_r_ranking.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import random 5 | import time 6 | from copy import deepcopy 7 | 8 | import datasets 9 | import hydra 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | import transformers 14 | import wandb 15 | from accelerate import Accelerator 16 | from accelerate.logging import get_logger 17 | from accelerate.utils import set_seed 18 | from omegaconf import OmegaConf 19 | from torch.utils.data import DataLoader 20 | from tqdm.auto import tqdm 21 | from transformers import get_cosine_schedule_with_warmup 22 | 23 | try: 24 | from r_ranking.ai_dataset import AiDataset 25 | from r_ranking.ai_loader import AiCollator, AiCollatorTrain, show_batch 26 | from r_ranking.ai_model import AiModel 27 | from r_ranking.ai_optimizer import get_optimizer 28 | from utils.metric_utils import compute_metrics 29 | from utils.train_utils import (AverageMeter, as_minutes, get_lr, 30 | save_checkpoint) 31 | 32 | except Exception as e: 33 | print(e) 34 | raise ImportError 35 | 36 | logger = get_logger(__name__) 37 | 38 | 39 | pd.options.display.max_colwidth = 1000 40 | 41 | # -------- Evaluation -------------------------------------------------------------# 42 | 43 | 44 | def run_evaluation(accelerator, model, valid_dl, valid_ids): 45 | model.eval() 46 | 47 | all_predictions = [] 48 | all_truths = [] 49 | 50 | progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process) 51 | 52 | for batch in valid_dl: 53 | with torch.no_grad(): 54 | logits, _ = model(**batch) 55 | logits = logits.reshape(-1) 56 | predictions = torch.sigmoid(logits) 57 | predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"].to(torch.long).reshape(-1))) 58 | predictions, references = predictions.cpu().numpy().tolist(), references.cpu().numpy().tolist() 59 | 60 | all_predictions.extend(predictions) 61 | all_truths.extend(references) 62 | 63 | progress_bar.update(1) 64 | progress_bar.close() 65 | 66 | # compute metric 67 | eval_dict = compute_metrics(all_predictions, all_truths) 68 | 69 | result_df = pd.DataFrame() 70 | result_df["id"] = valid_ids 71 | result_df["predictions"] = all_predictions 72 | result_df["truths"] = all_truths 73 | 74 | oof_df = deepcopy(result_df) 75 | oof_df = oof_df.rename(columns={"predictions": "generated"}) 76 | oof_df = oof_df[["id", "generated"]].copy() 77 | 78 | to_return = { 79 | "scores": eval_dict, 80 | "result_df": result_df, 81 | "oof_df": oof_df, 82 | } 83 | 84 | return to_return 85 | 86 | 87 | # -------- Main Function ---------------------------------------------------------# 88 | 89 | 90 | @hydra.main(version_base=None, config_path="../conf/r_ranking", config_name="conf_r_ranking") 91 | def run_training(cfg): 92 | # ------- Accelerator ---------------------------------------------------------------# 93 | if cfg.use_wandb: 94 | accelerator = Accelerator( 95 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 96 | log_with="wandb", 97 | ) 98 | 99 | accelerator.init_trackers( 100 | cfg.wandb.project, 101 | config=OmegaConf.to_container(cfg, resolve=True), 102 | ) 103 | 104 | else: 105 | accelerator = Accelerator( 106 | gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps, 107 | ) 108 | 109 | cfg_dict = OmegaConf.to_container(cfg, resolve=True) 110 | 111 | # Make one log on every process with the configuration for debugging. 112 | logging.basicConfig( 113 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 114 | datefmt="%m/%d/%Y %H:%M:%S", 115 | level=logging.INFO, 116 | ) 117 | logger.info(accelerator.state, main_process_only=False) 118 | 119 | def print_line(): 120 | prefix, unit, suffix = "#", "~~", "#" 121 | accelerator.print(prefix + unit*50 + suffix) 122 | 123 | if accelerator.is_local_main_process: 124 | datasets.utils.logging.set_verbosity_warning() 125 | transformers.utils.logging.set_verbosity_info() 126 | else: 127 | datasets.utils.logging.set_verbosity_error() 128 | transformers.utils.logging.set_verbosity_error() 129 | 130 | # ------- Runtime Configs -----------------------------------------------------------# 131 | print_line() 132 | accelerator.print(f"setting seed: {cfg.seed}") 133 | set_seed(cfg.seed) 134 | 135 | if accelerator.is_main_process: 136 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 137 | print_line() 138 | 139 | # ------- load data ----------------------------------------------------------# 140 | print_line() 141 | data_dir = cfg.input_data_dir 142 | 143 | # load query dataframe 144 | essay_df = pd.read_csv(os.path.join(data_dir, "train_essays.csv")) 145 | essay_df = essay_df[~essay_df['text'].isna()].copy() 146 | essay_df = essay_df.reset_index(drop=True) 147 | 148 | # ------- Data Split ----------------------------------------------------------------# 149 | 150 | # sample validation data 151 | rng = random.Random(cfg.seed) 152 | essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.99 else 'valid') 153 | train_df = essay_df[essay_df['fold'] == 'train'].copy() 154 | valid_df = essay_df[essay_df['fold'] == 'valid'].copy() 155 | 156 | # train_df = train_df.sort_values(by="prompt_id", ascending=True) 157 | train_df = train_df.reset_index(drop=True) 158 | valid_df = valid_df.reset_index(drop=True) 159 | 160 | prompt_ids = train_df["prompt_id"].unique().tolist() 161 | gdf = train_df.groupby("prompt_id")["id"].apply(list).reset_index() 162 | prompt2ids = dict(zip(gdf["prompt_id"], gdf["id"])) 163 | 164 | accelerator.print(f"shape of train data: {train_df.shape}") 165 | accelerator.print(f"{train_df.head()}") 166 | accelerator.print(f"shape of validation data: {valid_df.shape}") 167 | accelerator.print(f"Prompts: {prompt_ids}") 168 | 169 | with accelerator.main_process_first(): 170 | dataset_creator = AiDataset(cfg) 171 | 172 | train_ds = dataset_creator.get_dataset(train_df) 173 | valid_ds = dataset_creator.get_dataset(valid_df) 174 | 175 | tokenizer = dataset_creator.tokenizer 176 | 177 | # ------- data loaders ----------------------------------------------------------------# 178 | train_ds.set_format( 179 | type=None, 180 | columns=[ 181 | 'id', 182 | 'input_ids', 183 | 'attention_mask', 184 | 'generated' 185 | ] 186 | ) 187 | 188 | # sort valid dataset for faster evaluation 189 | valid_ds = valid_ds.sort("input_length") 190 | 191 | valid_ds.set_format( 192 | type=None, 193 | columns=[ 194 | 'id', 195 | 'input_ids', 196 | 'attention_mask', 197 | 'generated' 198 | ] 199 | ) 200 | valid_ids = valid_df["id"] 201 | 202 | # --- 203 | kwargs = dict( 204 | train_ds=train_ds, 205 | prompt_ids=prompt_ids, 206 | prompt2ids=prompt2ids, 207 | ) 208 | 209 | data_collector_train = AiCollatorTrain( 210 | tokenizer=tokenizer, 211 | pad_to_multiple_of=64, 212 | kwargs=kwargs, 213 | ) 214 | 215 | data_collector = AiCollator( 216 | tokenizer=tokenizer, 217 | pad_to_multiple_of=64 218 | ) 219 | 220 | train_dl = DataLoader( 221 | train_ds, 222 | batch_size=cfg.train_params.per_device_train_batch_size, 223 | shuffle=True, 224 | collate_fn=data_collector_train, 225 | ) 226 | 227 | valid_dl = DataLoader( 228 | valid_ds, 229 | batch_size=cfg.train_params.per_device_eval_batch_size, 230 | shuffle=False, 231 | collate_fn=data_collector, 232 | ) 233 | 234 | accelerator.print("data preparation done...") 235 | print_line() 236 | 237 | # --- show batch -------------------------------------------------------------------# 238 | print_line() 239 | 240 | for b in train_dl: 241 | break 242 | show_batch(b, tokenizer, task='training', print_fn=print, n_examples=4) 243 | 244 | print_line() 245 | 246 | for b in valid_dl: 247 | break 248 | show_batch(b, tokenizer, task='validation', print_fn=accelerator.print) 249 | 250 | print_line() 251 | 252 | # ------- Config -------------------------------------------------------------------# 253 | accelerator.print("config for the current run:") 254 | accelerator.print(json.dumps(cfg_dict, indent=4)) 255 | print_line() 256 | 257 | # ------- Model --------------------------------------------------------------------# 258 | print_line() 259 | print("creating the LLM Detection model...") 260 | model = AiModel(cfg, accelerator.device) 261 | print_line() 262 | 263 | # ------- Optimizer ----------------------------------------------------------------# 264 | print_line() 265 | print("creating the optimizer...") 266 | optimizer = get_optimizer(model, cfg) 267 | # ------- Prepare -------------------------------------------------------------------# 268 | 269 | model, optimizer, train_dl, valid_dl = accelerator.prepare( 270 | model, optimizer, train_dl, valid_dl 271 | ) 272 | 273 | # ------- Scheduler -----------------------------------------------------------------# 274 | print_line() 275 | num_epochs = cfg.train_params.num_train_epochs 276 | grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps 277 | warmup_pct = cfg.train_params.warmup_pct 278 | 279 | num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps 280 | num_training_steps = num_epochs * num_update_steps_per_epoch 281 | num_warmup_steps = int(warmup_pct*num_training_steps) 282 | 283 | accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}") 284 | accelerator.print(f"# training steps: {num_training_steps}") 285 | accelerator.print(f"# warmup steps: {num_warmup_steps}") 286 | 287 | scheduler = get_cosine_schedule_with_warmup( 288 | optimizer=optimizer, 289 | num_warmup_steps=num_warmup_steps, 290 | num_training_steps=num_training_steps 291 | ) 292 | 293 | # ------- training setup --------------------------------------------------------------# 294 | best_lb = -1 # track recall@1000 295 | 296 | patience_tracker = 0 297 | current_iteration = 0 298 | 299 | # ------- training --------------------------------------------------------------------# 300 | start_time = time.time() 301 | accelerator.wait_for_everyone() 302 | 303 | for epoch in range(num_epochs): 304 | # close and reset progress bar 305 | if epoch != 0: 306 | progress_bar.close() 307 | 308 | progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process) 309 | loss_meter = AverageMeter() 310 | 311 | # Training ------ 312 | model.train() 313 | for step, batch in enumerate(train_dl): 314 | with accelerator.accumulate(model): 315 | _, loss = model(**batch) 316 | accelerator.backward(loss) 317 | 318 | if accelerator.sync_gradients: 319 | 320 | accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm) 321 | 322 | optimizer.step() # gradient_state.sync_gradients check is performed inside optimizer.step 323 | scheduler.step() 324 | optimizer.zero_grad() 325 | 326 | loss_meter.update(loss.item()) 327 | 328 | if accelerator.sync_gradients: 329 | progress_bar.set_description( 330 | f"STEP: {current_iteration+1:5}/{num_update_steps_per_epoch:5}. " 331 | f"LR: {get_lr(optimizer):.4f}. " 332 | f"Loss: {loss_meter.avg:.4f}. " 333 | ) 334 | 335 | progress_bar.update(1) 336 | current_iteration += 1 337 | 338 | if cfg.use_wandb: 339 | accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration) 340 | accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration) 341 | 342 | # >--------------------------------------------------| 343 | # >-- evaluation ------------------------------------| 344 | # >--------------------------------------------------| 345 | 346 | if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0): 347 | # set model in eval mode 348 | model.eval() 349 | eval_response = run_evaluation(accelerator, model, valid_dl, valid_ids) 350 | 351 | scores_dict = eval_response["scores"] 352 | result_df = eval_response["result_df"] 353 | oof_df = eval_response["oof_df"] 354 | lb = scores_dict["lb"] 355 | 356 | print_line() 357 | et = as_minutes(time.time()-start_time) 358 | accelerator.print( 359 | f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}" 360 | ) 361 | print_line() 362 | accelerator.print(f">>> Current LB (AUC) = {round(lb, 4)}") 363 | 364 | print_line() 365 | 366 | is_best = False 367 | if lb >= best_lb: 368 | best_lb = lb 369 | is_best = True 370 | patience_tracker = 0 371 | 372 | # ----- 373 | best_dict = dict() 374 | for k, v in scores_dict.items(): 375 | best_dict[f"{k}_at_best"] = v 376 | else: 377 | patience_tracker += 1 378 | 379 | if is_best: 380 | oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_best.csv"), index=False) 381 | result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_best.csv"), index=False) 382 | else: 383 | accelerator.print(f">>> patience reached {patience_tracker}/{cfg_dict['train_params']['patience']}") 384 | accelerator.print(f">>> current best score: {round(best_lb, 4)}") 385 | 386 | oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_last.csv"), index=False) 387 | result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_last.csv"), index=False) 388 | 389 | # saving ----- 390 | accelerator.wait_for_everyone() 391 | unwrapped_model = accelerator.unwrap_model(model) 392 | model_state = { 393 | 'step': current_iteration, 394 | 'epoch': epoch + 1, 395 | 'state_dict': unwrapped_model.state_dict(), 396 | 'lb': lb, 397 | } 398 | 399 | if accelerator.is_main_process: 400 | save_checkpoint(cfg, model_state, is_best=is_best) 401 | 402 | # logging ---- 403 | if cfg.use_wandb: 404 | accelerator.log({"lb": lb}, step=current_iteration) 405 | accelerator.log({"best_lb": best_lb}, step=current_iteration) 406 | 407 | # -- log scores dict 408 | for k, v in scores_dict.items(): 409 | accelerator.log({k: round(v, 4)}, step=current_iteration) 410 | 411 | # --- log best scores dict 412 | for k, v in best_dict.items(): 413 | accelerator.log({k: round(v, 4)}, step=current_iteration) 414 | 415 | # -- post eval 416 | model.train() 417 | torch.cuda.empty_cache() 418 | print_line() 419 | 420 | # early stopping ---- 421 | if patience_tracker >= cfg.train_params.patience: 422 | print("stopping early") 423 | model.eval() 424 | accelerator.end_training() 425 | return 426 | 427 | 428 | if __name__ == "__main__": 429 | run_training() 430 | -------------------------------------------------------------------------------- /code/trainer_ranking_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from datasets import Dataset 4 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer 5 | 6 | loss_fct = torch.nn.MarginRankingLoss(margin=0.7) 7 | class BCETrainer(Trainer): 8 | def compute_loss(self, model, inputs, return_outputs=False): 9 | human_outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]) 10 | ai_outputs = model(input_ids=inputs["ai_input_ids"], attention_mask=inputs["ai_attention_mask"]) 11 | 12 | human_outputs = human_outputs.get("logits").view(-1) 13 | ai_outputs = ai_outputs.get("logits").view(-1) 14 | 15 | loss = loss_fct(ai_outputs, human_outputs, torch.ones_like(ai_outputs)) 16 | 17 | return (loss, ai_outputs) if return_outputs else loss 18 | 19 | essay_df = pd.read_csv("train_essays_pos_neg.csv").sample(200_000) 20 | train_df = essay_df.copy().reset_index(drop=True) 21 | train_df["human"] = train_df["human"].str.strip() 22 | train_df["ai"] = train_df["ai"].str.strip() 23 | 24 | tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large") 25 | model = AutoModelForSequenceClassification.from_pretrained( 26 | "microsoft/deberta-v3-large", 27 | num_labels=1 28 | ) 29 | 30 | train_ds = Dataset.from_pandas(train_df) 31 | 32 | def preprocess_function(examples, max_length=1280): 33 | tokenized_samples = tokenizer(examples["human"], truncation=True, max_length=max_length) 34 | tokenized_samples_ai = tokenizer(examples["ai"], truncation=True, max_length=max_length) 35 | 36 | tokenized_samples["ai_input_ids"] = tokenized_samples_ai["input_ids"] 37 | tokenized_samples["ai_attention_mask"] = tokenized_samples_ai["attention_mask"] 38 | 39 | return tokenized_samples 40 | 41 | train_tokenized_ds = train_ds.map(preprocess_function, batched=True, remove_columns=train_ds.column_names) 42 | 43 | training_args = TrainingArguments( 44 | output_dir=f"checkpoint/deberta-v3-large-v18-margin", 45 | learning_rate=1e-5, 46 | per_device_train_batch_size=1, 47 | gradient_accumulation_steps=4, 48 | max_grad_norm=1, 49 | optim='adamw_8bit', 50 | num_train_epochs=1, 51 | weight_decay=0.1, 52 | fp16=True, 53 | save_strategy="epoch", 54 | remove_unused_columns=False, 55 | warmup_steps=0.1, 56 | logging_steps=100, 57 | gradient_checkpointing=False, 58 | report_to='tensorboard' 59 | ) 60 | 61 | trainer = BCETrainer( 62 | model=model, 63 | args=training_args, 64 | train_dataset=train_tokenized_ds, 65 | tokenizer=tokenizer, 66 | ) 67 | 68 | trainer.train() -------------------------------------------------------------------------------- /code/utils/metric_utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import roc_auc_score 2 | 3 | 4 | def compute_metrics(predictions, truths): 5 | """ 6 | ROC AUC SCORE 7 | """ 8 | 9 | assert len(predictions) == len(truths) 10 | score = roc_auc_score(truths, predictions) 11 | 12 | to_return = { 13 | "lb": round(score, 4), 14 | } 15 | 16 | return to_return 17 | -------------------------------------------------------------------------------- /code/utils/train_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import shutil 5 | import string 6 | from copy import deepcopy 7 | 8 | import numpy as np 9 | import torch 10 | import wandb 11 | from omegaconf import OmegaConf 12 | from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, 13 | nvmlInit) 14 | 15 | 16 | def generate_random_string(): 17 | chars = string.ascii_lowercase + string.digits 18 | return ''.join(random.choice(chars) for _ in range(6)) 19 | 20 | 21 | def get_desired_dtype(dtype): 22 | if dtype == 'fp16': 23 | return torch.float16 24 | elif dtype == 'bf16': 25 | return torch.bfloat16 26 | else: 27 | return torch.float32 28 | 29 | 30 | def print_line(logger=None): 31 | prefix, unit, suffix = "#", "~~", "#" 32 | if logger is None: 33 | print(prefix + unit*50 + suffix) 34 | else: 35 | logger.print(prefix + unit*50 + suffix) 36 | 37 | 38 | def as_minutes(s): 39 | m = math.floor(s / 60) 40 | s -= m * 60 41 | return '%dm%ds' % (m, s) 42 | 43 | 44 | def execution_setup(cfg): 45 | print_line() 46 | if cfg.use_random_seed: 47 | seed = random.randint(401, 999) 48 | cfg.seed = seed 49 | 50 | print(f"setting seed: {cfg.seed}") 51 | seed_everything(cfg.seed) 52 | 53 | # folder --- 54 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 55 | 56 | return cfg 57 | 58 | 59 | def seed_everything(seed: int): 60 | random.seed(seed) 61 | os.environ["PYTHONHASHSEED"] = str(seed) 62 | np.random.seed(seed) 63 | torch.manual_seed(seed) 64 | torch.cuda.manual_seed(seed) 65 | torch.backends.cudnn.deterministic = True 66 | torch.backends.cudnn.benchmark = True 67 | 68 | 69 | def init_wandb(cfg): 70 | project = cfg.wandb.project 71 | tags = cfg.wandb.tags 72 | 73 | if cfg.wandb.all_data_flag: 74 | run_id = f"{cfg.wandb.run_name}-all-data" 75 | else: 76 | run_id = f"{cfg.wandb.run_name}" 77 | 78 | run = wandb.init( 79 | project=project, 80 | config=OmegaConf.to_container(cfg, resolve=True), 81 | tags=tags, 82 | name=run_id, 83 | anonymous="must", 84 | job_type="Train", 85 | ) 86 | 87 | return run 88 | 89 | 90 | def print_gpu_utilization(): 91 | nvmlInit() 92 | handle = nvmlDeviceGetHandleByIndex(0) 93 | info = nvmlDeviceGetMemoryInfo(handle) 94 | print(f"GPU memory occupied: {info.used//1024**2} MB.") 95 | 96 | 97 | def get_lr(optimizer): 98 | return optimizer.param_groups[0]['lr']*1e6 99 | 100 | 101 | class AverageMeter(object): 102 | """Computes and stores the average and current value 103 | Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262 104 | """ 105 | 106 | def __init__(self): 107 | self.reset() 108 | 109 | def reset(self): 110 | self.val = 0 111 | self.avg = 0 112 | self.sum = 0 113 | self.count = 0 114 | 115 | def update(self, val, n=1): 116 | self.val = val 117 | self.sum += val * n 118 | self.count += n 119 | self.avg = self.sum / self.count 120 | 121 | 122 | def save_checkpoint(cfg, state, is_best): 123 | os.makedirs(cfg.outputs.model_dir, exist_ok=True) 124 | name = f"detect_ai_model" 125 | 126 | filename = f'{cfg.outputs.model_dir}/{name}_last.pth.tar' 127 | torch.save(state, filename, _use_new_zipfile_serialization=False) 128 | 129 | if is_best: 130 | shutil.copyfile(filename, f'{cfg.outputs.model_dir}/{name}_best.pth.tar') 131 | 132 | 133 | class EMA(): 134 | """ 135 | credit: https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/332567 136 | """ 137 | 138 | def __init__(self, model, decay): 139 | self.model = model 140 | self.decay = decay 141 | self.shadow = {} 142 | self.backup = {} 143 | 144 | def register(self): 145 | for name, param in self.model.named_parameters(): 146 | if param.requires_grad: 147 | self.shadow[name] = param.data.clone() 148 | 149 | def update(self): 150 | for name, param in self.model.named_parameters(): 151 | if param.requires_grad: 152 | assert name in self.shadow 153 | new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name] 154 | self.shadow[name] = new_average.clone() 155 | 156 | def apply_shadow(self): 157 | for name, param in self.model.named_parameters(): 158 | if param.requires_grad: 159 | assert name in self.shadow 160 | self.backup[name] = param.data 161 | param.data = self.shadow[name] 162 | 163 | def restore(self): 164 | for name, param in self.model.named_parameters(): 165 | if param.requires_grad: 166 | assert name in self.backup 167 | param.data = self.backup[name] 168 | self.backup = {} 169 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: mistralai/Mistral-7B-v0.1 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | - v_proj 20 | - o_proj 21 | - gate_proj 22 | - up_proj 23 | - down_proj 24 | r: 16 25 | lora_alpha: 32 26 | lora_dropout: 0.1 27 | modules_to_save: 28 | - lm_head 29 | 30 | train_params: 31 | per_device_train_batch_size: 1 # 512 # 512 32 | per_device_eval_batch_size: 1 33 | num_train_epochs: 2 # 16 34 | gradient_accumulation_steps: 4 35 | 36 | warmup_pct: 0.1 37 | eval_frequency: 300 # 300 # 600 38 | patience: 10 39 | 40 | optimizer: 41 | name: AdamW8bit 42 | head_lr: 5e-5 43 | lr: 5e-5 44 | weight_decay: 1e-2 45 | max_grad_norm: 0.3 46 | 47 | outputs: 48 | model_dir: ../models/r_clm_v2 49 | 50 | wandb: 51 | project: detect-ai-a1 52 | run_name: exp006-r-clm 53 | tags: 54 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_bloom.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: bigscience/bloom-560m 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - query_key_value 18 | r: 64 19 | lora_alpha: 64 20 | lora_dropout: 0.1 21 | modules_to_save: 22 | - lm_head 23 | 24 | train_params: 25 | per_device_train_batch_size: 1 # 512 # 512 26 | per_device_eval_batch_size: 1 27 | num_train_epochs: 1 # 16 28 | gradient_accumulation_steps: 4 29 | 30 | warmup_pct: 0.1 31 | eval_frequency: 300 # 300 # 600 32 | patience: 10 33 | 34 | optimizer: 35 | name: AdamW8bit 36 | head_lr: 5e-5 37 | lr: 5e-5 38 | weight_decay: 1e-2 39 | max_grad_norm: 0.3 40 | 41 | outputs: 42 | model_dir: ../models/r_clm_bloom 43 | 44 | wandb: 45 | project: detect-ai-a1 46 | run_name: exp006-r-clm 47 | tags: 48 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_falcon.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: tiiuae/falcon-7b 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - query_key_value 18 | r: 16 19 | lora_alpha: 32 20 | lora_dropout: 0.1 21 | modules_to_save: 22 | - lm_head 23 | 24 | train_params: 25 | per_device_train_batch_size: 1 # 512 # 512 26 | per_device_eval_batch_size: 1 27 | num_train_epochs: 1 # 16 28 | gradient_accumulation_steps: 4 29 | 30 | warmup_pct: 0.1 31 | eval_frequency: 300 # 300 # 600 32 | patience: 10 33 | 34 | optimizer: 35 | name: AdamW8bit 36 | head_lr: 5e-5 37 | lr: 5e-5 38 | weight_decay: 1e-2 39 | max_grad_norm: 0.3 40 | 41 | outputs: 42 | model_dir: ../models/r_clm_falcon 43 | 44 | wandb: 45 | project: detect-ai-a1 46 | run_name: exp006-r-clm 47 | tags: 48 | - falcon -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: mistralai/Mistral-7B-v0.1 3 | adapter_path: ../models/models/r_clm_v2 4 | max_num_tokens: 1024 5 | output_dir: ../data/scaling/mistral/v0 6 | n_examples: 256 7 | n_gen_per_prompt: 4 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_bloom.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: bigscience/bloom-560m 3 | adapter_path: ../models/models/r_clm_bloom/last 4 | max_num_tokens: 1024 5 | output_dir: ../data/scaling/bloom_560/v2 6 | n_examples: 256 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_falcon.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: tiiuae/falcon-7b 3 | adapter_path: ../models/models/r_clm_falcon/last 4 | max_num_tokens: 1024 5 | output_dir: ../data/scaling/falcon_7b/v6 6 | n_examples: 256 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_gpt2.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: distilgpt2 3 | adapter_path: ../models/models/r_clm_gpt2/last 4 | max_num_tokens: 1296 5 | output_dir: ../data/scaling/gpt2/v0 6 | n_examples: 256 7 | n_gen_per_prompt: 1 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_lite_llama.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: ahxt/LiteLlama-460M-1T 3 | adapter_path: ../models/models/r_clm_lite_llama/last 4 | max_num_tokens: 768 5 | output_dir: ../data/scaling/lite_llama/v1 6 | n_examples: 256 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_llama13b.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: KoboldAI/LLaMA2-13B-Tiefighter 3 | adapter_path: ../models/models/r_clm_llama_13b/last 4 | max_num_tokens: 768 5 | output_dir: ../data/scaling/llama_13b/v3 6 | n_examples: 256 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_mistral_persuade.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | model_path: ../models/models/mistral_persuade_ft/last 3 | max_num_tokens: 1024 4 | output_dir: ../data/scaling/mistral_persuade_low_vocab/v2 5 | n_examples: 256 6 | n_gen_per_prompt: 2 7 | 8 | 9 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_mpt.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: mosaicml/mpt-7b 3 | adapter_path: ../models/models/r_clm_mpt_7b/last 4 | max_num_tokens: 1024 5 | output_dir: ../data/scaling/mpt_7b/v0 6 | n_examples: 1024 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_opt.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: facebook/opt-125m 3 | adapter_path: ../models/models/r_clm_opt_125m/last 4 | max_num_tokens: 1024 5 | output_dir: ../data/scaling/opt_125m/v0 6 | n_examples: 1024 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_pythia.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: EleutherAI/pythia-1b 3 | adapter_path: ../models/models/r_clm_pythia/last 4 | max_num_tokens: 768 5 | output_dir: ../data/scaling/pythia_1b/v2 6 | n_examples: 256 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_generate_tiny_llama.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0 3 | adapter_path: ../models/models/r_clm_v2/last 4 | max_num_tokens: 768 5 | output_dir: ../data/scaling/tiny_llama/v1 6 | n_examples: 256 7 | n_gen_per_prompt: 2 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_gpt2.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: gpt2 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - c_attn 18 | r: 64 19 | lora_alpha: 64 20 | lora_dropout: 0.1 21 | modules_to_save: 22 | - lm_head 23 | 24 | train_params: 25 | per_device_train_batch_size: 1 # 512 # 512 26 | per_device_eval_batch_size: 1 27 | num_train_epochs: 1 # 16 28 | gradient_accumulation_steps: 4 29 | 30 | warmup_pct: 0.1 31 | eval_frequency: 300 # 300 # 600 32 | patience: 10 33 | 34 | optimizer: 35 | name: AdamW8bit 36 | head_lr: 5e-5 37 | lr: 5e-5 38 | weight_decay: 1e-2 39 | max_grad_norm: 0.3 40 | 41 | outputs: 42 | model_dir: ../models/r_clm_gpt2 43 | 44 | wandb: 45 | project: detect-ai-a1 46 | run_name: exp006-r-clm 47 | tags: 48 | - distilgpt2 -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_lite_llama.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: ahxt/LiteLlama-460M-1T # mistralai/Mistral-7B-v0.1 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | - v_proj 20 | - o_proj 21 | - gate_proj 22 | - up_proj 23 | - down_proj 24 | r: 16 25 | lora_alpha: 32 26 | lora_dropout: 0.1 27 | modules_to_save: 28 | - lm_head 29 | 30 | train_params: 31 | per_device_train_batch_size: 1 # 512 # 512 32 | per_device_eval_batch_size: 1 33 | num_train_epochs: 2 # 16 34 | gradient_accumulation_steps: 4 35 | 36 | warmup_pct: 0.1 37 | eval_frequency: 300 # 300 # 600 38 | patience: 10 39 | 40 | optimizer: 41 | name: AdamW8bit 42 | head_lr: 5e-5 43 | lr: 5e-5 44 | weight_decay: 1e-2 45 | max_grad_norm: 0.3 46 | 47 | outputs: 48 | model_dir: ../models/r_clm_lite_llama 49 | 50 | wandb: 51 | project: detect-ai-a1 52 | run_name: exp006-r-clm 53 | tags: 54 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_llama13b.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: KoboldAI/LLaMA2-13B-Tiefighter 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | - v_proj 20 | - o_proj 21 | - gate_proj 22 | - up_proj 23 | - down_proj 24 | r: 16 25 | lora_alpha: 32 26 | lora_dropout: 0.1 27 | modules_to_save: 28 | - lm_head 29 | 30 | train_params: 31 | per_device_train_batch_size: 1 # 512 # 512 32 | per_device_eval_batch_size: 1 33 | num_train_epochs: 1 # 16 34 | gradient_accumulation_steps: 4 35 | 36 | warmup_pct: 0.1 37 | eval_frequency: 300 # 300 # 600 38 | patience: 10 39 | 40 | optimizer: 41 | name: AdamW8bit 42 | head_lr: 5e-5 43 | lr: 5e-5 44 | weight_decay: 1e-2 45 | max_grad_norm: 0.3 46 | 47 | outputs: 48 | model_dir: ../models/r_clm_llama_13b 49 | 50 | wandb: 51 | project: detect-ai-a1 52 | run_name: exp006-r-clm 53 | tags: 54 | - falcon -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_mistral_persuade.yaml: -------------------------------------------------------------------------------- 1 | seed: 425 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: ../models/mistral_persuade 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | train_params: 16 | per_device_train_batch_size: 2 # 512 # 512 17 | per_device_eval_batch_size: 2 18 | num_train_epochs: 16 # 16 19 | gradient_accumulation_steps: 4 20 | 21 | warmup_pct: 0.01 22 | eval_frequency: 300 # 300 # 600 23 | patience: 10 24 | 25 | optimizer: 26 | name: AdamW8bit 27 | head_lr: 5e-5 28 | lr: 5e-5 29 | weight_decay: 1e-2 30 | max_grad_norm: 0.3 31 | 32 | outputs: 33 | model_dir: ../models/mistral_persuade_ft 34 | 35 | wandb: 36 | project: detect-ai-a1 37 | run_name: exp006-r-clm 38 | tags: 39 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_mpt.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: mosaicml/mpt-7b 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - Wqkv 18 | r: 64 19 | lora_alpha: 64 20 | lora_dropout: 0.1 21 | modules_to_save: 22 | - lm_head 23 | 24 | train_params: 25 | per_device_train_batch_size: 1 # 512 # 512 26 | per_device_eval_batch_size: 1 27 | num_train_epochs: 1 # 16 28 | gradient_accumulation_steps: 4 29 | 30 | warmup_pct: 0.1 31 | eval_frequency: 300 # 300 # 600 32 | patience: 10 33 | 34 | optimizer: 35 | name: AdamW8bit 36 | head_lr: 5e-5 37 | lr: 5e-5 38 | weight_decay: 1e-2 39 | max_grad_norm: 0.3 40 | 41 | outputs: 42 | model_dir: ../models/r_clm_mpt_7b 43 | 44 | wandb: 45 | project: detect-ai-a1 46 | run_name: exp006-r-clm 47 | tags: 48 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_opt.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: facebook/opt-125m 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | - v_proj 20 | r: 64 21 | lora_alpha: 64 22 | lora_dropout: 0.1 23 | modules_to_save: 24 | - lm_head 25 | 26 | train_params: 27 | per_device_train_batch_size: 1 # 512 # 512 28 | per_device_eval_batch_size: 1 29 | num_train_epochs: 5 # 16 30 | gradient_accumulation_steps: 4 31 | 32 | warmup_pct: 0.1 33 | eval_frequency: 300 # 300 # 600 34 | patience: 10 35 | 36 | optimizer: 37 | name: AdamW8bit 38 | head_lr: 5e-5 39 | lr: 5e-5 40 | weight_decay: 1e-2 41 | max_grad_norm: 0.3 42 | 43 | outputs: 44 | model_dir: ../models/r_clm_opt_125m 45 | 46 | wandb: 47 | project: detect-ai-a1 48 | run_name: exp006-r-clm 49 | tags: 50 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_pythia.yaml: -------------------------------------------------------------------------------- 1 | seed: 420 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: EleutherAI/pythia-12b 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - query_key_value 18 | r: 256 19 | lora_alpha: 64 20 | lora_dropout: 0.1 21 | modules_to_save: 22 | - lm_head 23 | 24 | train_params: 25 | per_device_train_batch_size: 1 # 512 # 512 26 | per_device_eval_batch_size: 1 27 | num_train_epochs: 3 # 16 28 | gradient_accumulation_steps: 4 29 | 30 | warmup_pct: 0.1 31 | eval_frequency: 300 # 300 # 600 32 | patience: 10 33 | 34 | optimizer: 35 | name: AdamW8bit 36 | head_lr: 5e-5 37 | lr: 5e-5 38 | weight_decay: 1e-2 39 | max_grad_norm: 0.3 40 | 41 | outputs: 42 | model_dir: ../models/r_clm_pythia_12b 43 | 44 | wandb: 45 | project: detect-ai-a1 46 | run_name: exp006-r-clm 47 | tags: 48 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_clm_tiny_llama.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 4 | 5 | model: 6 | backbone_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # mistralai/Mistral-7B-v0.1 7 | max_length: 1024 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | - v_proj 20 | - o_proj 21 | - gate_proj 22 | - up_proj 23 | - down_proj 24 | r: 16 25 | lora_alpha: 32 26 | lora_dropout: 0.1 27 | modules_to_save: 28 | - lm_head 29 | 30 | train_params: 31 | per_device_train_batch_size: 1 # 512 # 512 32 | per_device_eval_batch_size: 1 33 | num_train_epochs: 1 # 16 34 | gradient_accumulation_steps: 4 35 | 36 | warmup_pct: 0.1 37 | eval_frequency: 300 # 300 # 600 38 | patience: 10 39 | 40 | optimizer: 41 | name: AdamW8bit 42 | head_lr: 5e-5 43 | lr: 5e-5 44 | weight_decay: 1e-2 45 | max_grad_norm: 0.3 46 | 47 | outputs: 48 | model_dir: ../models/r_clm_v2 49 | 50 | wandb: 51 | project: detect-ai-a1 52 | run_name: exp006-r-clm 53 | tags: 54 | - mistral -------------------------------------------------------------------------------- /conf/r_clm/conf_r_dpo.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | sft_model_path: ../models/r_clm_v2/last 3 | train_path: ../datasets/dpo/dpo_train.parquet 4 | test_path: ../datasets/dpo/dpo_test.parquet 5 | 6 | dpo: 7 | beta: 0.05 8 | logging_first_step: true 9 | max_prompt_length: 64 10 | max_length: 840 11 | optim: rmsprop 12 | remove_unused_columns: false 13 | 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | - v_proj 20 | - out_proj 21 | r: 16 22 | lora_alpha: 16 23 | lora_dropout: 0.1 24 | modules_to_save: 25 | - lm_head 26 | 27 | output_dir: ../models/r_dpo_v2 28 | 29 | learning_rate: 1.0e-5 30 | # lr_scheduler_type: linear 31 | per_device_train_batch_size: 1 32 | per_device_eval_batch_size: 1 33 | gradient_accumulation_steps: 8 34 | num_train_epochs: 1 35 | warmup_ratio: 0.05 36 | max_grad_norm: 1.0 -------------------------------------------------------------------------------- /conf/r_clm/conf_r_dpo_generate.yaml: -------------------------------------------------------------------------------- 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv 2 | base_model_path: mistralai/Mistral-7B-v0.1 3 | sft_adapter_path: ../models/r_clm_v2/last 4 | dpo_adapter_path: ../models/r_dpo_v2/checkpoint-250 5 | max_num_tokens: 1296 6 | output_dir: ../data/custom_gen_dpo_v2 7 | n_examples: 256 8 | n_gen_per_prompt: 4 9 | 10 | 11 | -------------------------------------------------------------------------------- /conf/r_detect/conf_r_detect_mix_v16.yaml: -------------------------------------------------------------------------------- 1 | seed: 424 2 | use_wandb: false 3 | input_data_dir: ../datasets/external/ai_mix_v16 4 | 5 | model: 6 | backbone_path: mistralai/Mistral-7B-v0.1 7 | max_length: 1296 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | r: 8 20 | lora_alpha: 16 21 | lora_dropout: 0.1 22 | modules_to_save: 23 | - classification_head 24 | 25 | train_params: 26 | per_device_train_batch_size: 1 # run on 4x A100 27 | per_device_eval_batch_size: 1 28 | num_train_epochs: 1 # 16 29 | gradient_accumulation_steps: 4 30 | 31 | warmup_pct: 0.1 32 | eval_frequency: 500 33 | patience: 20 34 | save_trigger: 0.0 35 | 36 | use_mask_aug: false # false 37 | mask_aug_prob: 0.0 38 | 39 | optimizer: 40 | name: AdamW8bit 41 | head_lr: 2e-6 42 | lr: 2e-5 43 | weight_decay: 1e-2 44 | max_grad_norm: 0.5 45 | 46 | outputs: 47 | model_dir: ../models/r_detect_mix_v16 48 | 49 | wandb: 50 | project: detect-ai-a1 51 | run_name: exp010-r-detect 52 | tags: 53 | - mistral -------------------------------------------------------------------------------- /conf/r_detect/conf_r_detect_mix_v26.yaml: -------------------------------------------------------------------------------- 1 | seed: 424 2 | use_wandb: false 3 | input_data_dir: ../datasets/external/ai_mix_v26 4 | 5 | model: 6 | backbone_path: mistralai/Mistral-7B-v0.1 7 | max_length: 1296 8 | num_labels: 1 9 | 10 | tokenizer: 11 | padding_side: left 12 | truncation_side: left 13 | use_fast: true 14 | 15 | lora: 16 | target_modules: 17 | - q_proj 18 | - k_proj 19 | r: 16 20 | lora_alpha: 16 21 | lora_dropout: 0.1 22 | modules_to_save: 23 | - classification_head 24 | 25 | train_params: 26 | per_device_train_batch_size: 1 # 512 # 512 27 | per_device_eval_batch_size: 1 28 | num_train_epochs: 1 # 16 29 | gradient_accumulation_steps: 4 30 | 31 | warmup_pct: 0.1 32 | eval_frequency: 500 # 300 # 600 33 | patience: 20 34 | save_trigger: 0.0 35 | 36 | use_mask_aug: false # false 37 | mask_aug_prob: 0.0 38 | 39 | optimizer: 40 | name: AdamW8bit 41 | head_lr: 2e-6 42 | lr: 2e-5 43 | weight_decay: 1e-2 44 | max_grad_norm: 0.5 45 | 46 | outputs: 47 | model_dir: ../models/r_detect_mix_v16 48 | 49 | wandb: 50 | project: detect-ai-a1 51 | run_name: exp010-r-detect 52 | tags: 53 | - mistral -------------------------------------------------------------------------------- /conf/r_embed/conf_r_embed.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_dir: ../datasets/external/ai_mix_v26 4 | 5 | model: 6 | backbone_path: microsoft/deberta-v3-base 7 | max_length: 768 # 1024 8 | dropout_rate: 0.01 9 | gradient_checkpointing: true 10 | projection_dim: 512 11 | temperature: 0.1 12 | 13 | train_params: 14 | per_device_train_batch_size: 64 # 512 # 512 15 | per_device_eval_batch_size: 64 16 | num_train_epochs: 3 # 16 17 | gradient_accumulation_steps: 1 18 | 19 | warmup_pct: 0.02 20 | eval_frequency: 100 # 500 # 300 # 600 21 | patience: 10 22 | save_trigger: 0.0 23 | 24 | optimizer: 25 | head_lr: 4e-5 26 | lr: 4e-5 27 | weight_decay: 1e-3 28 | max_grad_norm: 1.0 29 | 30 | eps: 1e-8 31 | beta1: 0.9 32 | beta2: 0.999 33 | 34 | use_bnb: true 35 | use_llrd: true 36 | llrd: 0.9 37 | 38 | outputs: 39 | model_dir: ../models/r_embed 40 | 41 | wandb: 42 | project: detect-ai-a1 43 | run_name: exp002-r-embed 44 | tags: 45 | - deberta -------------------------------------------------------------------------------- /conf/r_ranking/conf_r_ranking_large.yaml: -------------------------------------------------------------------------------- 1 | seed: 42 2 | use_wandb: false 3 | input_data_dir: ../datasets/external/ai_mix_for_ranking 4 | 5 | model: 6 | backbone_path: microsoft/deberta-v3-large 7 | max_length: 1024 # 1024 8 | dropout_rate: 0.05 9 | gradient_checkpointing: true 10 | 11 | train_params: 12 | per_device_train_batch_size: 32 # 512 # 512 13 | per_device_eval_batch_size: 32 14 | num_train_epochs: 2 # 16 15 | gradient_accumulation_steps: 1 16 | 17 | warmup_pct: 0.1 18 | eval_frequency: 100 # 500 # 300 # 600 19 | patience: 10 20 | save_trigger: 0.0 21 | 22 | optimizer: 23 | head_lr: 2e-5 24 | lr: 2e-5 25 | weight_decay: 1e-3 26 | max_grad_norm: 1.0 27 | 28 | eps: 1e-8 29 | beta1: 0.9 30 | beta2: 0.999 31 | 32 | use_bnb: true 33 | use_llrd: true 34 | llrd: 0.9 35 | 36 | outputs: 37 | model_dir: ../models/r_ranking 38 | 39 | wandb: 40 | project: detect-ai-a1 41 | run_name: exp002-r-embed 42 | tags: 43 | - deberta -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.36.1 2 | accelerate==0.24.0 3 | bitsandbytes==0.41.3.post2 4 | datasets==2.15.0 5 | peft==0.7.0 6 | trl==0.7.4 7 | sentence-transformers==2.2.2 8 | hydra-core 9 | pynvml 10 | sentencepiece 11 | einops -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | hdir=$(pwd) 2 | cd .. 3 | 4 | mkdir datasets 5 | mkdir models 6 | mkdir datasets/external 7 | 8 | cd datasets 9 | 10 | kaggle competitions download -c llm-detect-ai-generated-text 11 | unzip llm-detect-ai-generated-text.zip -d llm-detect-ai-generated-text 12 | rm llm-detect-ai-generated-text.zip 13 | 14 | kaggle datasets download -d nbroad/persaude-corpus-2 15 | unzip persaude-corpus-2.zip -d ./ 16 | rm persaude-corpus-2.zip 17 | 18 | kaggle datasets download -d conjuring92/ai-mix-v16 19 | unzip ai-mix-v16.zip -d ./external/ai_mix_v16 20 | rm ai-mix-v16.zip 21 | 22 | kaggle datasets download -d conjuring92/ai-mix-v26 23 | unzip ai-mix-v26.zip -d ./external/ai_mix_v26 24 | rm ai-mix-v26.zip 25 | 26 | kaggle datasets download -d conjuring92/ai-bin7-mix-v1 27 | unzip ai-bin7-mix-v1.zip -d ./external/ai_mix_for_ranking 28 | rm ai-bin7-mix-v1.zip 29 | 30 | cd ../models 31 | kaggle datasets download -d conjuring92/detect-ai-persuade-clm-ckpts 32 | unzip detect-ai-persuade-clm-ckpts.zip -d ./ 33 | rm detect-ai-persuade-clm-ckpts.zip 34 | 35 | cd $hdir 36 | --------------------------------------------------------------------------------