├── .gitignore
├── LICENSE
├── README.md
├── code
    ├── generate_r_clm.py
    ├── generate_r_clm_from_scratch.py
    ├── ghostbuster
    │   ├── README.md
    │   ├── n_gram.py
    │   ├── symbolic.py
    │   └── train_lr.py
    ├── r_clm
    │   ├── ai_dataset.py
    │   ├── ai_loader.py
    │   └── ai_optimizer.py
    ├── r_detect
    │   ├── ai_dataset.py
    │   ├── ai_loader.py
    │   ├── ai_model.py
    │   └── ai_optimizer.py
    ├── r_embed
    │   ├── ai_dataset.py
    │   ├── ai_loader.py
    │   ├── ai_model.py
    │   └── ai_optimizer.py
    ├── r_ranking
    │   ├── ai_dataset.py
    │   ├── ai_loader.py
    │   ├── ai_model.py
    │   └── ai_optimizer.py
    ├── train_r_clm.py
    ├── train_r_clm_from_scratch.py
    ├── train_r_detect.py
    ├── train_r_dpo.py
    ├── train_r_embed.py
    ├── train_r_ranking.py
    ├── trainer_ranking_loss.py
    └── utils
    │   ├── metric_utils.py
    │   └── train_utils.py
├── conf
    ├── r_clm
    │   ├── conf_r_clm.yaml
    │   ├── conf_r_clm_bloom.yaml
    │   ├── conf_r_clm_falcon.yaml
    │   ├── conf_r_clm_generate.yaml
    │   ├── conf_r_clm_generate_bloom.yaml
    │   ├── conf_r_clm_generate_falcon.yaml
    │   ├── conf_r_clm_generate_gpt2.yaml
    │   ├── conf_r_clm_generate_lite_llama.yaml
    │   ├── conf_r_clm_generate_llama13b.yaml
    │   ├── conf_r_clm_generate_mistral_persuade.yaml
    │   ├── conf_r_clm_generate_mpt.yaml
    │   ├── conf_r_clm_generate_opt.yaml
    │   ├── conf_r_clm_generate_pythia.yaml
    │   ├── conf_r_clm_generate_tiny_llama.yaml
    │   ├── conf_r_clm_gpt2.yaml
    │   ├── conf_r_clm_lite_llama.yaml
    │   ├── conf_r_clm_llama13b.yaml
    │   ├── conf_r_clm_mistral_persuade.yaml
    │   ├── conf_r_clm_mpt.yaml
    │   ├── conf_r_clm_opt.yaml
    │   ├── conf_r_clm_pythia.yaml
    │   ├── conf_r_clm_tiny_llama.yaml
    │   ├── conf_r_dpo.yaml
    │   └── conf_r_dpo_generate.yaml
    ├── r_detect
    │   ├── conf_r_detect_mix_v16.yaml
    │   └── conf_r_detect_mix_v26.yaml
    ├── r_embed
    │   └── conf_r_embed.yaml
    └── r_ranking
    │   └── conf_r_ranking_large.yaml
├── requirements.txt
└── setup.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Raja Biswas
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | This repo contains our code and configurations for the **LLM - Detect AI Generated Text** competition. The summary of the solution is posted [here](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/470121). Please refer to the following sections for details on training and dependencies. 
  2 | 
  3 | ## Section 1: Setup
  4 | ### 1.1 Hardware
  5 | **Jarvislabs.ai** was our primary source of compute. Specifically, models were trained on the following instance:
  6 | 
  7 | Ubuntu 20.04.5 LTS (128 GB boot disk)
  8 | Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz (7 vCPUs)
  9 | 4 x NVIDIA A100 40GB GPU OR 4 x NVIDIA A6000 48GB GPU
 10 | 
 11 | ### 1.2 Software
 12 | I used PyTorch-2.1 image from Jarvislabs.ai, which comes with:
 13 | 
 14 | * Python 3.10.11
 15 | * CUDA 12.3
 16 | 
 17 | ### 1.3 Dependencies
 18 | Please clone the repository and install the required packages using the following commands:
 19 | 
 20 | ```
 21 | git clone https://github.com/rbiswasfc/llm-detect-ai.git
 22 | cd llm-detect-ai
 23 | pip install -r requirements.txt
 24 | ```
 25 | 
 26 | ### 1.4 Datasets
 27 | 
 28 | Please make sure Kaggle API is installed. Then run the following script to download the required datasets:
 29 | 
 30 | ```
 31 | chmod +x ./setup.sh
 32 | ./setup.sh
 33 | ```
 34 | 
 35 | Please note that the above script will create `datasets` and `models` folder in the directory located one level above the current directory. The external datasets will be downloaded in the `datasets` folder. Instruction-tuned LLMs, which can be used to generate adversarial essays, will be downloaded in the `models` folder. Total size of downloaded data and model files is ~8GB. 
 36 | 
 37 | ## Section 2: Training
 38 | Training scripts and configurations are located in the `code` and `conf` folders respectively. We leveraged HF `accelerate` library to execute training runs with DDP on multiple GPUs (4x A100). Specifically, we used the following configurations for training:
 39 | 
 40 | ```yaml
 41 | compute_environment: LOCAL_MACHINE                                            
 42 | debug: false                                                                           
 43 | distributed_type: MULTI_GPU
 44 | downcast_bf16: 'no'
 45 | gpu_ids: all
 46 | machine_rank: 0
 47 | main_training_function: main
 48 | mixed_precision: 'no'
 49 | num_machines: 1
 50 | num_processes: 4
 51 | rdzv_backend: static
 52 | same_network: true
 53 | tpu_env: []
 54 | tpu_use_cluster: false
 55 | tpu_use_sudo: false
 56 | use_cpu: false
 57 | ```
 58 | 
 59 | ### 2.1 LLM Models
 60 | For (Q)LoRA fine-tuning of the LLM models, please run the following commands:
 61 | 
 62 | ```bash
 63 | accelerate launch ./code/train_r_detect.py \
 64 | --config-name conf_r_detect_mix_v16 \
 65 | use_wandb=false
 66 | ```
 67 | 
 68 | ```bash
 69 | accelerate launch ./code/train_r_detect.py \
 70 | --config-name conf_r_detect_mix_v26 \
 71 | use_wandb=false
 72 | ```
 73 | 
 74 | Please note that training takes ~3 hours for `mix_v16` and ~4 hours for `mix_v26`.
 75 | 
 76 | ### 2.2 DeBERTa Ranking Models
 77 | 
 78 | To training the `deberta-v3-large` model with ranking loss, please run the following command:
 79 | 
 80 | ```bash
 81 | accelerate launch ./code/train_r_ranking.py \
 82 | --config-name conf_r_ranking_large \
 83 | use_wandb=false
 84 | ```
 85 | 
 86 | ### 2.3 Embedding model
 87 | 
 88 | We trained an embedding model with supervised contrastive loss to find similar essays (KNN neighbors) for a given essay in the test set.
 89 | 
 90 | ```bash
 91 | accelerate launch ./code/train_r_embed.py \
 92 | --config-name conf_r_embed \
 93 | use_wandb=false
 94 | ```
 95 | 
 96 | ## Section 3: Text Generation
 97 | 
 98 | We fine-tuned a wide variety of LLMs using the CLM objective on [PERSUADE](https://www.kaggle.com/datasets/nbroad/persaude-corpus-2) corpus to produce student like essays. The fine-tuned checkpoints were uploaded as a Kaggle Dataset `conjuring92/detect-ai-persuade-clm-ckpts`. These checkpoints can be used to generate essays using the following commands:
 99 | 
100 | ```bash
101 | accelerate launch ./code/generate_r_clm.py \
102 | --config_path ./conf/r_clm/conf_r_clm_generate.yaml
103 | 
104 | accelerate launch ./code/generate_r_clm.py \
105 | --config_path ./conf/r_clm/conf_r_clm_generate_tiny_llama.yaml
106 | 
107 | accelerate launch ./code/generate_r_clm.py \
108 | --config_path ./conf/r_clm/conf_r_clm_generate_pythia.yaml
109 | 
110 | accelerate launch ./code/generate_r_clm.py \
111 | --config_path ./conf/r_clm/conf_r_clm_generate_bloom.yaml
112 | 
113 | accelerate launch ./code/generate_r_clm.py \
114 | --config_path ./conf/r_clm/conf_r_clm_generate_gpt2.yaml
115 | 
116 | accelerate launch ./code/generate_r_clm.py \
117 | --config_path ./conf/r_clm/conf_r_clm_generate_opt.yaml
118 | 
119 | accelerate launch ./code/generate_r_clm.py \
120 | --config_path ./conf/r_clm/conf_r_clm_generate_falcon.yaml
121 | 
122 | accelerate launch ./code/generate_r_clm.py \
123 | --config_path ./conf/r_clm/conf_r_clm_generate_mpt.yaml
124 | 
125 | accelerate launch ./code/generate_r_clm.py \
126 | --config_path ./conf/r_clm/conf_r_clm_generate_llama13b.yaml
127 | 
128 | accelerate launch ./code/generate_r_clm_from_scratch.py \
129 | --config_path ./conf/r_clm/conf_r_clm_generate_mistral_persuade.yaml
130 | ```
131 | 
132 | Optionally, the fine-tuning of LLMs for text generation can be done using the following commands:
133 | 
134 | ```bash
135 | accelerate launch ./code/train_r_clm.py \
136 | --config-name conf_r_clm_tiny_llama \
137 | use_wandb=false
138 | 
139 | accelerate launch ./code/train_r_clm.py \
140 | --config-name conf_r_clm_pythia \
141 | use_wandb=false
142 | 
143 | accelerate launch ./code/train_r_clm.py \
144 | --config-name conf_r_clm_bloom \
145 | use_wandb=false
146 | 
147 | accelerate launch ./code/train_r_clm.py \
148 | --config-name conf_r_clm_gpt2 \
149 | use_wandb=false
150 | 
151 | accelerate launch ./code/train_r_clm.py \
152 | --config-name conf_r_clm_opt \
153 | use_wandb=false
154 | 
155 | accelerate launch ./code/train_r_clm.py \
156 | --config-name conf_r_clm_falcon \
157 | use_wandb=false
158 | 
159 | accelerate launch ./code/train_r_clm.py \
160 | --config-name conf_r_clm_mpt \
161 | use_wandb=false
162 | 
163 | accelerate launch ./code/train_r_clm.py \
164 | --config-name conf_r_clm_llama13b \
165 | use_wandb=false
166 | 
167 | accelerate launch ./code/train_r_clm_from_scratch.py \
168 | --config-name conf_r_clm_mistral_persuade \
169 | use_wandb=false
170 | ```


--------------------------------------------------------------------------------
/code/generate_r_clm.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | import random
  5 | import string
  6 | from itertools import chain
  7 | 
  8 | import pandas as pd
  9 | import torch
 10 | from accelerate import Accelerator
 11 | from omegaconf import OmegaConf
 12 | from peft import PeftModel
 13 | from tqdm.auto import tqdm
 14 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 15 | 
 16 | 
 17 | def generate_random_string():
 18 |     chars = string.ascii_lowercase + string.digits
 19 |     return 'e_' + ''.join(random.choice(chars) for _ in range(8))
 20 | 
 21 | 
 22 | def get_instruction(inputs):
 23 |     ret = f"""
 24 | Prompt: {inputs['prompt_name']}
 25 | Task: {inputs['task']}
 26 | Score: {inputs['holistic_essay_score']}
 27 | Student Grade Level: {inputs['grade_level']}
 28 | English Language Learner: {inputs['ell_status']}
 29 | Disability Status: {inputs['student_disability_status']}
 30 |     """.strip()
 31 |     n_chars = random.randint(16, 64)
 32 | 
 33 |     start = inputs['text'][:n_chars]
 34 | 
 35 |     ret = f"### Instruction:\n{ret}\n\n### Response: {start}"
 36 |     return ret
 37 | 
 38 | 
 39 | def get_inputs(prompt, tokenizer, n=1):
 40 |     return tokenizer([prompt]*n, return_tensors="pt")
 41 | 
 42 | 
 43 | def process_response(texts):
 44 |     ret = []
 45 | 
 46 |     for text in texts:
 47 |         if "</s>" in text:
 48 |             text = text.split("### Response:")[-1].split("</s>")[0].strip()
 49 |         else:
 50 |             text = text.split("### Response:")[-1].split("<|endoftext|>")[0].strip()
 51 |         text = text.replace("<unk>", "")
 52 |         ret.append(text)
 53 |     return ret
 54 | 
 55 | 
 56 | def pre_process_essay(essay_df):
 57 | 
 58 |     essay_df = essay_df[~essay_df['text'].isna()].copy()
 59 |     essay_df = essay_df.reset_index(drop=True)
 60 | 
 61 |     essay_df["student_disability_status"] = essay_df["student_disability_status"].fillna("Unknown")
 62 |     essay_df["ell_status"] = essay_df["ell_status"].fillna("Unknown")
 63 |     essay_df["grade_level"] = essay_df["grade_level"].fillna(-1)
 64 |     essay_df["holistic_essay_score"] = essay_df["holistic_essay_score"].fillna(-1)
 65 | 
 66 |     essay_df["prompt"] = essay_df.apply(get_instruction, axis=1)
 67 |     return essay_df
 68 | 
 69 | 
 70 | def generate(cfg):
 71 |     accelerator = Accelerator()
 72 | 
 73 |     essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"})
 74 |     essay_df = pre_process_essay(essay_df)
 75 | 
 76 |     prompts = essay_df["prompt"].values.tolist()
 77 | 
 78 |     # ---------------------------------------------------
 79 |     # uncomment for oversampling of certain prompts---
 80 |     # prompts = [p for p in prompts if "Task: Text dependent" not in p]
 81 |     # prompts = [p for p in prompts if "car-free" not in p.lower()]
 82 |     # prompts = [p for p in prompts if "facial action" not in p.lower()]
 83 |     # prompts = [p for p in prompts if "electoral" not in p.lower()]
 84 |     # ---------------------------------------------------
 85 | 
 86 |     print(f"Number of prompts: {len(prompts)}")
 87 | 
 88 |     # model & tokenizer ---
 89 |     tokenizer = AutoTokenizer.from_pretrained(
 90 |         cfg.base_model_path,
 91 |         use_fast=True,
 92 |         padding_side="left",
 93 |         truncation_side="left",
 94 |     )
 95 | 
 96 |     if tokenizer.pad_token is None:
 97 |         tokenizer.pad_token = tokenizer.unk_token
 98 |         tokenizer.pad_token_id = tokenizer.unk_token_id
 99 | 
100 |     # bnb_config = BitsAndBytesConfig(
101 |     #     load_in_4bit=True,
102 |     #     bnb_4bit_quant_type="nf4",
103 |     #     bnb_4bit_use_double_quant=True,
104 |     #     bnb_4bit_compute_dtype=torch.bfloat16
105 |     # )
106 | 
107 |     # compute allowable tokens ---
108 |     tokenized_corpus = tokenizer(essay_df['text'].values.tolist())
109 |     all_tok_ids = set(chain(*tokenized_corpus['input_ids']))
110 |     accelerator.print(f"Number of unique tokens: {len(all_tok_ids)}")
111 |     out_of_corpus_token_ids = list(set(range(tokenizer.vocab_size)).difference(all_tok_ids))
112 |     accelerator.print(f"Number of out of scope tokens: {len(out_of_corpus_token_ids)}")
113 | 
114 |     # ---
115 | 
116 |     base_model = AutoModelForCausalLM.from_pretrained(
117 |         cfg.base_model_path,
118 |         low_cpu_mem_usage=True,
119 |         torch_dtype=torch.bfloat16,
120 |         # quantization_config=bnb_config,
121 |         # attn_implementation="flash_attention_2",
122 |     )
123 | 
124 |     model = PeftModel.from_pretrained(base_model, cfg.adapter_path)
125 |     model = model.merge_and_unload()
126 |     model = accelerator.prepare(model)
127 |     model.eval()
128 | 
129 |     n_examples = cfg.n_examples
130 |     n_gen_per_prompt = cfg.n_gen_per_prompt
131 |     output_dir = cfg.output_dir
132 | 
133 |     # progress_bar = tqdm(range(n_examples), disable=not accelerator.is_local_main_process)
134 |     progress_bar = tqdm(range(n_examples))
135 | 
136 |     for i in range(n_examples):
137 |         # print(f"---- Example {i+1}/{n_examples} ------")
138 |         temperature = 0.5 + 1.5 * random.random()
139 |         top_k = random.randint(128, 256)
140 |         penalty_alpha = random.random()
141 |         guidance_scale = 1.0 + 0.25 * random.random()
142 |         eta_cutoff = 1e-4 + 5e-4 * random.random()
143 |         repetition_penalty = 1.2  # 1.0 + 0.2 * random.random()
144 | 
145 |         try:
146 |             generation_config = GenerationConfig.from_pretrained(
147 |                 cfg.base_model_path,
148 |                 do_sample=True,
149 |                 temperature=temperature,
150 |                 top_k=top_k,
151 |                 penalty_alpha=penalty_alpha,
152 |                 guidance_scale=guidance_scale,
153 |                 max_new_tokens=cfg.max_num_tokens,
154 |                 pad_token_id=tokenizer.pad_token_id,
155 |                 eta_cutoff=eta_cutoff,
156 |                 # repetition_penalty=repetition_penalty,
157 |                 suppress_tokens=out_of_corpus_token_ids,
158 |             )
159 | 
160 |         except Exception as e:
161 |             print(e)
162 |             generation_config = GenerationConfig(
163 |                 # cfg.base_model_path,
164 |                 do_sample=True,
165 |                 temperature=temperature,
166 |                 top_k=top_k,
167 |                 # penalty_alpha=penalty_alpha,
168 |                 # guidance_scale=guidance_scale,
169 |                 max_new_tokens=cfg.max_num_tokens,
170 |                 pad_token_id=tokenizer.pad_token_id,
171 |                 eta_cutoff=eta_cutoff,
172 |                 suppress_tokens=out_of_corpus_token_ids,
173 |             )
174 | 
175 |         try:
176 |             prompt = random.choice(prompts)
177 |             this_example = dict()
178 |             this_id = generate_random_string()
179 |             this_example['id'] = this_id
180 |             this_example['prompt'] = prompt
181 |             this_example['temperature'] = temperature
182 |             this_example['top_k'] = top_k
183 |             this_example['guidance_scale'] = guidance_scale
184 |             this_example['penalty_alpha'] = penalty_alpha
185 | 
186 |             inputs = get_inputs(prompt, tokenizer, n=n_gen_per_prompt)
187 |             device = accelerator.device
188 |             inputs = {k: v.to(device) for k, v in inputs.items()}
189 | 
190 |             with torch.no_grad():
191 |                 output = model.generate(**inputs, generation_config=generation_config)
192 |             output = tokenizer.batch_decode(output)
193 | 
194 |             output = process_response(output)
195 |             this_example['responses'] = output
196 | 
197 |             with open(f"{output_dir}/{this_id}.json", "w") as f:
198 |                 json.dump(this_example, f)
199 | 
200 |         except Exception as e:
201 |             print(e)
202 |         progress_bar.update(1)
203 |     progress_bar.close()
204 | 
205 | 
206 | if __name__ == "__main__":
207 | 
208 |     ap = argparse.ArgumentParser()
209 |     ap.add_argument('--config_path', type=str, required=True)
210 | 
211 |     args = ap.parse_args()
212 |     cfg = OmegaConf.load(args.config_path)
213 | 
214 |     os.makedirs(cfg.output_dir, exist_ok=True)
215 | 
216 |     # execution
217 |     generate(cfg)
218 | 


--------------------------------------------------------------------------------
/code/generate_r_clm_from_scratch.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | import random
  5 | import string
  6 | 
  7 | import pandas as pd
  8 | import torch
  9 | from accelerate import Accelerator
 10 | from omegaconf import OmegaConf
 11 | from peft import PeftModel
 12 | from tqdm.auto import tqdm
 13 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
 14 |                           BitsAndBytesConfig, GenerationConfig)
 15 | 
 16 | 
 17 | def generate_random_string():
 18 |     chars = string.ascii_lowercase + string.digits
 19 |     return 'e_' + ''.join(random.choice(chars) for _ in range(8))
 20 | 
 21 | 
 22 | def get_instruction(inputs):
 23 |     ret = f"""
 24 | Prompt: {inputs['prompt_name']}
 25 | Task: {inputs['task']}
 26 | Score: {inputs['holistic_essay_score']}
 27 | Student Grade Level: {inputs['grade_level']}
 28 | English Language Learner: {inputs['ell_status']}
 29 | Disability Status: {inputs['student_disability_status']}
 30 |     """.strip()
 31 |     n_chars = random.randint(64, 128)
 32 | 
 33 |     start = inputs['text'][:n_chars]
 34 | 
 35 |     ret = f"### Instruction:\n{ret}\n\n### Response: {start}"
 36 |     return ret
 37 | 
 38 | 
 39 | def get_inputs(prompt, tokenizer, n=1):
 40 |     return tokenizer([prompt]*n, return_tensors="pt")
 41 | 
 42 | 
 43 | def process_response(texts):
 44 |     ret = []
 45 | 
 46 |     for text in texts:
 47 |         if "</s>" in text:
 48 |             text = text.split("### Response:")[-1].split("</s>")[0].strip()
 49 |         else:
 50 |             text = text.split("### Response:")[-1].split("<|endoftext|>")[0].strip()
 51 |         text = text.replace("<unk>", "")
 52 |         ret.append(text)
 53 |     return ret
 54 | 
 55 | 
 56 | def pre_process_essay(essay_df):
 57 | 
 58 |     essay_df = essay_df[~essay_df['text'].isna()].copy()
 59 |     essay_df = essay_df.reset_index(drop=True)
 60 | 
 61 |     essay_df["student_disability_status"] = essay_df["student_disability_status"].fillna("Unknown")
 62 |     essay_df["ell_status"] = essay_df["ell_status"].fillna("Unknown")
 63 |     essay_df["grade_level"] = essay_df["grade_level"].fillna(-1)
 64 |     essay_df["holistic_essay_score"] = essay_df["holistic_essay_score"].fillna(-1)
 65 | 
 66 |     essay_df["prompt"] = essay_df.apply(get_instruction, axis=1)
 67 |     return essay_df
 68 | 
 69 | 
 70 | def generate(cfg):
 71 |     accelerator = Accelerator()
 72 | 
 73 |     essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"})
 74 |     essay_df = pre_process_essay(essay_df)
 75 | 
 76 |     prompts = essay_df["prompt"].values.tolist()
 77 |     # prompts = [p for p in prompts if "Task: Text dependent" in p]
 78 | 
 79 |     # prompts = [p for p in prompts if "car-free" not in p.lower()]
 80 |     # prompts = [p for p in prompts if "facial action" not in p.lower()]
 81 |     # prompts = [p for p in prompts if "electoral" not in p.lower()]
 82 | 
 83 |     print(f"Number of prompts: {len(prompts)}")
 84 | 
 85 |     # model & tokenizer ---
 86 |     tokenizer = AutoTokenizer.from_pretrained(
 87 |         cfg.model_path,
 88 |         use_fast=True,
 89 |         padding_side="left",
 90 |         truncation_side="left",
 91 |     )
 92 | 
 93 |     if tokenizer.pad_token is None:
 94 |         tokenizer.pad_token = tokenizer.unk_token
 95 |         tokenizer.pad_token_id = tokenizer.unk_token_id
 96 | 
 97 |     # bnb_config = BitsAndBytesConfig(
 98 |     #     load_in_4bit=True,
 99 |     #     bnb_4bit_quant_type="nf4",
100 |     #     bnb_4bit_use_double_quant=True,
101 |     #     bnb_4bit_compute_dtype=torch.bfloat16
102 |     # )
103 | 
104 |     model = AutoModelForCausalLM.from_pretrained(
105 |         cfg.model_path,
106 |         torch_dtype=torch.bfloat16,
107 |     )
108 | 
109 |     model = accelerator.prepare(model)
110 |     model.eval()
111 | 
112 |     n_examples = cfg.n_examples
113 |     n_gen_per_prompt = cfg.n_gen_per_prompt
114 |     output_dir = cfg.output_dir
115 | 
116 |     progress_bar = tqdm(range(n_examples))
117 | 
118 |     for i in range(n_examples):
119 |         # print(f"---- Example {i+1}/{n_examples} ------")
120 |         temperature = 1.5  # + 0.75 * random.random()
121 |         top_k = 512  # random.randint(4, 8)
122 |         penalty_alpha = 0.5  # random.random()
123 |         guidance_scale = 1.1  # + 0.5 * random.random()
124 |         eta_cutoff = 1e-4 + 5e-4 * random.random()
125 |         repetition_penalty = 1.2  # 1.0 + 0.2 * random.random()
126 | 
127 |         try:
128 |             generation_config = GenerationConfig.from_pretrained(
129 |                 cfg.model_path,
130 |                 do_sample=True,
131 |                 temperature=temperature,
132 |                 top_k=top_k,
133 |                 penalty_alpha=penalty_alpha,
134 |                 guidance_scale=guidance_scale,
135 |                 max_new_tokens=cfg.max_num_tokens,
136 |                 pad_token_id=tokenizer.pad_token_id,
137 |                 # eta_cutoff=eta_cutoff,
138 |                 # repetition_penalty=repetition_penalty,
139 |             )
140 |         except Exception as e:
141 |             print(e)
142 |             generation_config = GenerationConfig(
143 |                 # cfg.base_model_path,
144 |                 do_sample=True,
145 |                 temperature=temperature,
146 |                 top_k=top_k,
147 |                 penalty_alpha=penalty_alpha,
148 |                 guidance_scale=guidance_scale,
149 |                 max_new_tokens=cfg.max_num_tokens,
150 |                 pad_token_id=tokenizer.pad_token_id,
151 |                 eta_cutoff=eta_cutoff,
152 |             )
153 | 
154 |         try:
155 |             prompt = random.choice(prompts)
156 |             this_example = dict()
157 |             this_id = generate_random_string()
158 |             this_example['id'] = this_id
159 |             this_example['prompt'] = prompt
160 |             this_example['temperature'] = temperature
161 |             this_example['top_k'] = top_k
162 |             this_example['guidance_scale'] = guidance_scale
163 |             this_example['penalty_alpha'] = penalty_alpha
164 |             # this_example['typical_p'] = typical_p
165 | 
166 |             inputs = get_inputs(prompt, tokenizer, n=n_gen_per_prompt)
167 |             device = accelerator.device
168 |             inputs = {k: v.to(device) for k, v in inputs.items()}
169 | 
170 |             with torch.no_grad():
171 |                 output = model.module.generate(**inputs, generation_config=generation_config)
172 |             output = tokenizer.batch_decode(output)
173 | 
174 |             output = process_response(output)
175 |             this_example['responses'] = output
176 | 
177 |             with open(f"{output_dir}/{this_id}.json", "w") as f:
178 |                 json.dump(this_example, f)
179 | 
180 |         except Exception as e:
181 |             print(e)
182 |         progress_bar.update(1)
183 |     progress_bar.close()
184 | 
185 | 
186 | if __name__ == "__main__":
187 | 
188 |     ap = argparse.ArgumentParser()
189 |     ap.add_argument('--config_path', type=str, required=True)
190 | 
191 |     args = ap.parse_args()
192 |     cfg = OmegaConf.load(args.config_path)
193 | 
194 |     os.makedirs(cfg.output_dir, exist_ok=True)
195 | 
196 |     # execution
197 |     generate(cfg)
198 | 


--------------------------------------------------------------------------------
/code/ghostbuster/README.md:
--------------------------------------------------------------------------------
 1 | # Ghostbuster instructions
 2 | 
 3 | 
 4 | 1. First get the logprobs for all of your texts using two models with the same tokenizers. I used tinyllama and llama7b. These should be saved in a directory where each text has a separate file. The format for the files is `token logprobs\ntoken logprobs` and so on.
 5 | 
 6 | 2. Then run `run.py` to get the features for the texts.
 7 | 3. Finally, run `train_lr.py` to train the model on the features. Below is the command used to run the script.
 8 | 
 9 | ```sh
10 | python train_lr.py \
11 | --feature_path "../custom_7b-tl-ft-ignore-25-cmin4-cmax4-m20" \
12 | --model_type "vote" \
13 | --C 100 \
14 | --train_on_all_data
15 | ```
16 | 
17 | Note, use `--train_on_all_data` only after finding a good value of C


--------------------------------------------------------------------------------
/code/ghostbuster/n_gram.py:
--------------------------------------------------------------------------------
  1 | import tqdm
  2 | from collections import defaultdict, Counter
  3 | from transformers import PreTrainedTokenizerBase
  4 | import numpy as np
  5 | from nltk import ngrams
  6 | 
  7 | 
  8 | # NGramModels from here: https://github.com/vivek3141/ghostbuster/blob/9831b53a8ecbfe401d47616db95b9256b9cbaadd/utils/n_gram.py#L1
  9 | class NGramModel:
 10 |     """
 11 |     An n-gram model, where alpha is the laplace smoothing parameter.
 12 |     """
 13 | 
 14 |     def __init__(self, train_text, n=2, alpha=3e-3, vocab_size=None):
 15 |         self.n = n
 16 |         if vocab_size is None:
 17 |             # Assume GPT tokenizer
 18 |             self.vocab_size = 50257
 19 |         else:
 20 |             self.vocab_size = vocab_size
 21 | 
 22 |         self.smoothing = alpha
 23 |         self.smoothing_f = alpha * self.vocab_size
 24 | 
 25 |         self.c = defaultdict(lambda: [0, Counter()])
 26 |         for i in tqdm.tqdm(range(len(train_text) - n)):
 27 |             n_gram = tuple(train_text[i : i + n])
 28 |             self.c[n_gram[:-1]][1][n_gram[-1]] += 1
 29 |             self.c[n_gram[:-1]][0] += 1
 30 |         self.n_size = len(self.c)
 31 | 
 32 |     def n_gram_probability(self, n_gram):
 33 |         assert len(n_gram) == self.n
 34 |         it = self.c[tuple(n_gram[:-1])]
 35 |         prob = (it[1][n_gram[-1]] + self.smoothing) / (it[0] + self.smoothing_f)
 36 |         return prob
 37 | 
 38 | 
 39 | class DiscountBackoffModel(NGramModel):
 40 |     """
 41 |     An n-gram model with discounting and backoff. Delta is the discounting parameter.
 42 |     """
 43 | 
 44 |     def __init__(self, train_text, lower_order_model, n=2, delta=0.9, vocab_size=None):
 45 |         super().__init__(train_text, n=n, vocab_size=vocab_size)
 46 |         self.lower_order_model = lower_order_model
 47 |         self.discount = delta
 48 | 
 49 |     def n_gram_probability(self, n_gram):
 50 |         assert len(n_gram) == self.n
 51 |         it = self.c[tuple(n_gram[:-1])]
 52 | 
 53 |         if it[0] == 0:
 54 |             return self.lower_order_model.n_gram_probability(n_gram[1:])
 55 | 
 56 |         prob = (
 57 |             self.discount
 58 |             * (len(it[1]) / it[0])
 59 |             * self.lower_order_model.n_gram_probability(n_gram[1:])
 60 |         )
 61 |         if it[1][n_gram[-1]] != 0:
 62 |             prob += max(it[1][n_gram[-1]] - self.discount, 0) / it[0]
 63 | 
 64 |         return prob
 65 | 
 66 | 
 67 | class KneserNeyBaseModel(NGramModel):
 68 |     """
 69 |     A Kneser-Ney base model, where n=1.
 70 |     """
 71 | 
 72 |     def __init__(self, train_text, vocab_size=None):
 73 |         super().__init__(train_text, n=1, vocab_size=vocab_size)
 74 | 
 75 |         base_cnt = defaultdict(set)
 76 |         for i in range(1, len(train_text)):
 77 |             base_cnt[train_text[i]].add(train_text[i - 1])
 78 | 
 79 |         cnt = 0
 80 |         for word in base_cnt:
 81 |             cnt += len(base_cnt[word])
 82 | 
 83 |         self.prob = defaultdict(float)
 84 |         for word in base_cnt:
 85 |             self.prob[word] = len(base_cnt[word]) / cnt
 86 | 
 87 |     def n_gram_probability(self, n_gram):
 88 |         assert len(n_gram) == 1
 89 |         ret_prob = self.prob[n_gram[0]]
 90 | 
 91 |         if ret_prob == 0:
 92 |             return 1 / self.vocab_size
 93 |         else:
 94 |             return ret_prob
 95 | 
 96 | 
 97 | class TrigramBackoff:
 98 |     """
 99 |     A trigram model with discounting and backoff. Uses a Kneser-Ney base model.
100 |     """
101 | 
102 |     def __init__(self, train_text, delta=0.9, vocab_size=None):
103 |         self.base = KneserNeyBaseModel(train_text, vocab_size=vocab_size)
104 |         self.bigram = DiscountBackoffModel(
105 |             train_text, self.base, n=2, delta=delta, vocab_size=vocab_size
106 |         )
107 |         self.trigram = DiscountBackoffModel(
108 |             train_text, self.bigram, n=3, delta=delta, vocab_size=vocab_size
109 |         )
110 | 
111 |     def n_gram_probability(self, n_gram):
112 |         assert len(n_gram) == 3
113 |         return self.trigram.n_gram_probability(n_gram)
114 | 
115 | 
116 | def score_ngram(doc, model, tokenizer, n=3, strip_first=False, bos_token_id=50256):
117 |     """
118 |     Returns vector of ngram probabilities given document, model and tokenizer
119 | 
120 |     Slightly modified from here: https://github.com/vivek3141/ghostbuster/blob/9831b53a8ecbfe401d47616db95b9256b9cbaadd/utils/featurize.py#L65-L75
121 |     """
122 |     scores = []
123 |     if strip_first:
124 |         doc = " ".join(doc.split()[:1000])
125 | 
126 |     if isinstance(tokenizer.__self__, PreTrainedTokenizerBase):
127 |         tokens = tokenizer(doc.strip(), add_special_tokens=True)
128 | 
129 |         # tokens[0] is bos token
130 |         tokens = (n - 1) * [tokens[0]] + tokens
131 |     else:
132 |         eos_token_id = 50256  # eos/bos token for davinci model
133 |         tokens = (n - 1) * [eos_token_id] + tokenizer(doc.strip())
134 | 
135 |     # for k tokens and ngrams of size n, need to add n-1 tokens to the beginning
136 |     # to ensure that there are k ngrams
137 |     for i in ngrams(tokens, n):
138 |         scores.append(model.n_gram_probability(i))
139 | 
140 |     return np.array(scores)
141 | 


--------------------------------------------------------------------------------
/code/ghostbuster/symbolic.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | from pathlib import Path
  3 | from collections import defaultdict
  4 | from functools import partial
  5 | 
  6 | from datasets import Dataset
  7 | from tqdm.auto import tqdm
  8 | from nltk.corpus import brown
  9 | import numpy as np
 10 | import torch
 11 | 
 12 | from n_gram import score_ngram, TrigramBackoff
 13 | 
 14 | # Some code copied/modified from here: https://github.com/vivek3141/ghostbuster/blob/9831b53a8ecbfe401d47616db95b9256b9cbaadd/utils/symbolic.py#L16
 15 | 
 16 | 
 17 | def train_trigram(tokenizer_name, verbose=True, return_tokenizer=False):
 18 |     """
 19 |     Trains and returns a trigram model on the brown corpus
 20 |     """
 21 | 
 22 |     if tokenizer_name == "davinci":
 23 |         import tiktoken
 24 | 
 25 |         enc = tiktoken.encoding_for_model("davinci")
 26 |         tokenizer = enc.encode
 27 |         vocab_size = enc.n_vocab
 28 | 
 29 |     else:
 30 |         from transformers import AutoTokenizer
 31 | 
 32 |         enc = AutoTokenizer.from_pretrained(tokenizer_name)
 33 |         tokenizer = enc.encode
 34 |         vocab_size = len(enc)
 35 | 
 36 |     # We use the brown corpus to train the n-gram model
 37 |     sentences = brown.sents()
 38 | 
 39 |     if verbose:
 40 |         print("Tokenizing corpus...")
 41 |     tokenized_corpus = []
 42 |     for sentence in tqdm(sentences):
 43 |         tokens = tokenizer(" ".join(sentence))
 44 |         tokenized_corpus += tokens
 45 | 
 46 |     if verbose:
 47 |         print("\nTraining n-gram model...")
 48 | 
 49 |     if return_tokenizer:
 50 |         return TrigramBackoff(tokenized_corpus, vocab_size=vocab_size), tokenizer
 51 |     else:
 52 |         return TrigramBackoff(tokenized_corpus, vocab_size=vocab_size)
 53 | 
 54 | 
 55 | def ds_from_files(
 56 |     file_dir,
 57 |     model1_name,
 58 |     model2_name,
 59 |     tokenizer,
 60 |     trigram,
 61 |     num_tokens=2047,
 62 |     num_proc=4,
 63 | ):
 64 |     """
 65 |     file_dir should be a path to raw text files.
 66 |     the logprob directory should be in file_dir
 67 | 
 68 |     raw_text_files should have filename `{id}.txt`
 69 | 
 70 |     logprob files should have filename `{id}-{model_name}.txt`
 71 |     """
 72 | 
 73 |     file_dir = Path(file_dir)
 74 | 
 75 |     ds = Dataset.from_dict(
 76 |         {"raw_text_filepath": list(map(str, file_dir.glob("*.txt")))}
 77 |     )
 78 | 
 79 |     # ds = ds.select(range(1000))
 80 | 
 81 |     ds = ds.map(
 82 |         lambda x: {"text": open(x["raw_text_filepath"]).read()},
 83 |         num_proc=num_proc,
 84 |         desc="Reading raw text files",
 85 |     )
 86 |     ds = ds.map(
 87 |         lambda x: {"id": Path(x["raw_text_filepath"]).stem},
 88 |         num_proc=num_proc,
 89 |         desc="Adding id",
 90 |     )
 91 | 
 92 |     def load_probs(example, model_name):
 93 |         with open(file_dir / "logprobs" / f"{example['id']}-{model_name}.txt") as fp:
 94 |             data = fp.read().strip().split("\n")
 95 | 
 96 |         tokens, logprobs = [], []
 97 |         for row in data:
 98 |             if len(row.split()) != 2:
 99 |                 print([row])
100 | 
101 |                 if row[0] != " ":
102 |                     row = "Ġ" * len(row.split(" ")[0]) + row.split(" ")[1]
103 | 
104 |                 row = "Ġ" * len(row.split(" ")) + row[1:]
105 | 
106 |             tokens.append(row.split()[0])
107 |             logprobs.append(row.split()[1])
108 | 
109 |         probs = np.exp(np.array(list(map(float, logprobs))[:num_tokens]))
110 | 
111 |         return {"tokens": tokens, f"{model_name}-probs": probs}
112 | 
113 |     for m in [model1_name, model2_name]:
114 |         ds = ds.map(
115 |             load_probs,
116 |             num_proc=num_proc,
117 |             fn_kwargs={"model_name": m},
118 |             desc=f"Getting probs for {m}",
119 |         )
120 | 
121 |     def add_ngrams(example, n):
122 |         model = trigram if n == 3 else trigram.base
123 | 
124 |         prefix = "uni" if n == 1 else "tri"
125 | 
126 |         ng = score_ngram(example["text"], model, tokenizer, n=n)
127 |         other = len(example[f"{model1_name}-probs"])
128 |         if len(ng) > other:
129 |             ng = ng[1 : other + 1]
130 | 
131 |         return {f"{prefix}gram-probs": ng}
132 | 
133 |     for n in [1, 3]:
134 |         ds = ds.map(
135 |             add_ngrams,
136 |             num_proc=1,
137 |             fn_kwargs={"n": n},
138 |             desc=f"Adding {n}-gram probabilities",
139 |         )
140 | 
141 |     return ds.with_format("numpy")
142 | 
143 | 
144 | vec_functions = {
145 |     "v-add": lambda a, b: a + b,
146 |     "v-sub": lambda a, b: a - b,
147 |     "v-mul": lambda a, b: a * b,
148 |     "v-div": lambda a, b: np.divide(
149 |         a, b, out=np.zeros_like(a), where=(b != 0), casting="unsafe"
150 |     ),
151 |     "v->": lambda a, b: a > b,
152 |     "v-<": lambda a, b: a < b,
153 | }
154 | 
155 | scalar_functions = {
156 |     "s-max": max,
157 |     "s-min": min,
158 |     "s-avg": lambda x: sum(x) / len(x),
159 |     "s-avg-top-25": lambda x: sum(sorted(x, reverse=True)[:25])
160 |     / len(sorted(x, reverse=True)[:25]),
161 |     "s-len": len,
162 |     "s-var": np.var,
163 |     "s-l2": np.linalg.norm,
164 | }
165 | 
166 | vectors = ["llm1-probs", "llm2-probs", "trigram-probs", "unigram-probs"]
167 | 
168 | # Get vec_combinations
169 | vec_combinations = defaultdict(list)
170 | for vec1 in range(len(vectors)):
171 |     for vec2 in range(vec1):
172 |         for func in vec_functions:
173 |             if func != "v-div":
174 |                 vec_combinations[vectors[vec1]].append(f"{func} {vectors[vec2]}")
175 | 
176 | for vec1 in vectors:
177 |     for vec2 in vectors:
178 |         if vec1 != vec2:
179 |             vec_combinations[vec1].append(f"v-div {vec2}")
180 | 
181 | 
182 | def get_words(exp):
183 |     """
184 |     Splits up expression into words, to be individually processed
185 |     """
186 |     return exp.split(" ")
187 | 
188 | 
189 | def backtrack_functions(prev="", max_depth=2):
190 |     """
191 |     Backtrack all possible features.
192 |     """
193 | 
194 |     def helper(prev, depth):
195 |         if depth >= max_depth:
196 |             return []
197 | 
198 |         all_funcs = []
199 |         prev_word = get_words(prev)[-1]
200 | 
201 |         for func in scalar_functions:
202 |             all_funcs.append(f"{prev} {func}")
203 | 
204 |         for comb in vec_combinations[prev_word]:
205 |             all_funcs += helper(f"{prev} {comb}", depth + 1)
206 | 
207 |         return all_funcs
208 | 
209 |     ret = []
210 |     for vec in vectors:
211 |         ret += helper(vec, 0)
212 |     return ret
213 | 
214 | 
215 | def generate_symbolic_data(
216 |     ds,
217 |     max_depth=2,
218 |     output_file="symbolic_data",
219 |     verbose=True,
220 |     model1="llama-7b",
221 |     model2="tinyllama",
222 |     tokenizer_name="davinci",
223 |     num_proc=50,
224 |     limit=100,
225 | ):
226 |     """
227 |     Brute forces and generates symbolic data from a dataset of text files.
228 |     """
229 | 
230 |     ds = ds.with_format("numpy")
231 | 
232 |     def calc_feats(example, exp):
233 | 
234 |         name_map = {
235 |             "llm1-probs": f"{model1}-probs",
236 |             "llm2-probs": f"{model2}-probs",
237 |         }
238 | 
239 |         exp_tokens = exp.split(" ")
240 |         # exp_tokens will be operations and the vectors to operate on
241 |         # e.g.
242 |         # unigram-logprobs v-sub davinci-logprobs v-div ada-logprobs s-avg
243 | 
244 |         model_probs_key = exp_tokens[0]
245 |         if model_probs_key.startswith("llm"):
246 |             model_probs_key = name_map[model_probs_key]
247 | 
248 |         curr = example[model_probs_key]
249 | 
250 |         for i in range(1, len(exp_tokens)):
251 |             if exp_tokens[i] in vec_functions:
252 |                 model_probs_key = exp_tokens[i + 1]
253 | 
254 |                 if model_probs_key.startswith("llm"):
255 |                     model_probs_key = name_map[model_probs_key]
256 |                 next_vec = example[model_probs_key]
257 |                 curr = vec_functions[exp_tokens[i]](curr, next_vec)
258 |             elif exp_tokens[i] in scalar_functions:
259 |                 final_value = scalar_functions[exp_tokens[i]](curr)
260 | 
261 |         return {
262 |             "feat": final_value,
263 |         }
264 | 
265 |     all_funcs = backtrack_functions(max_depth=max_depth)
266 | 
267 |     if verbose:
268 |         print(f"\nTotal # of Features: {len(all_funcs)}.")
269 |         print("Sampling 5 features:")
270 |         for i in range(5):
271 |             print(all_funcs[np.random.randint(0, len(all_funcs))])
272 |         print("\nGenerating datasets...")
273 | 
274 |     exp_to_data = {}
275 | 
276 |     import random
277 | 
278 |     if limit is not None:
279 |         to_run = random.sample(all_funcs, k=limit)
280 |     else:
281 |         to_run = all_funcs
282 | 
283 |     for exp in tqdm(to_run):
284 |         exp_to_data[exp] = np.array(
285 |             ds.map(
286 |                 calc_feats,
287 |                 fn_kwargs={"exp": exp},
288 |                 num_proc=num_proc,
289 |                 keep_in_memory=True,
290 |             )["feat"]
291 |         ).reshape(-1, 1)
292 | 
293 |     pickle.dump((exp_to_data, ds["label"]), open(output_file, "wb"))
294 | 
295 | 
296 | def generate_custom_data(
297 |     ds,
298 |     output_file="custom_data",
299 |     model1="llama-7b",
300 |     model2="tinyllama",
301 |     num_proc=50,
302 |     clip_min=1e-4,
303 |     clip_max=1e5,
304 |     ignore_first=25,
305 | ):
306 |     """
307 |     Brute forces and generates symbolic data from a dataset of text files.
308 | 
309 |     For each sequence and model (llm1, llm2, unigram, trigram), get
310 |     - min
311 |     - max
312 |     - mean
313 |     - median
314 |     - 25% quantile
315 |     - 75% quantile
316 |     - l2 norm
317 |     - variance
318 | 
319 |     Will also get the ratio of llm1/llm2, llm1/unigram, llm1/trigram, llm2/unigram, llm2/trigram, unigram/trigram
320 | 
321 | 
322 |     Saves features to pickle file as tuple (features, labels, ids).
323 | 
324 |     Args:
325 |     - ds: Dataset object
326 |     - output_file: str, path to save the output
327 |     - model1: str, name of model1
328 |     - model2: str, name of model2
329 |     - num_proc: int, number of processes to use
330 |     - clip_min: float, minimum value to clip to
331 |     - clip_max: float, maximum value to clip to
332 |     - ignore_first: int, number of tokens to ignore from the beginning
333 | 
334 |     Returns:
335 |     - None
336 |     """
337 | 
338 |     ds = ds.with_format("numpy")
339 | 
340 |     def calc_feats(example):
341 | 
342 |         feats = []
343 | 
344 |         funcs = [
345 |             min,
346 |             max,
347 |             np.mean,
348 |             np.median,
349 |             partial(np.percentile, q=0.25),
350 |             partial(np.percentile, q=0.75),
351 |             partial(np.percentile, q=0.10),
352 |             partial(np.percentile, q=0.90),
353 |             np.linalg.norm,
354 |             np.var,
355 |         ]
356 | 
357 |         models = [
358 |             f"{model1}-probs",
359 |             f"{model2}-probs",
360 |             "unigram-probs",
361 |             "trigram-probs",
362 |         ]
363 | 
364 |         def ff(x, f):
365 |             if len(x) <= ignore_first:
366 |                 return f(x)
367 |             return f(x[ignore_first:])
368 | 
369 |         for m in models:
370 |             feats.extend([ff(example[m], f) for f in funcs])
371 | 
372 |         def c(x):
373 |             if len(x) <= ignore_first:
374 |                 return np.clip(x, clip_min, clip_max)
375 |             return np.clip(x[ignore_first:], clip_min, clip_max)
376 | 
377 |         feats.extend(
378 |             [
379 |                 f(c(example[f"{model1}-probs"]) / c(example[f"{model2}-probs"]))
380 |                 for f in funcs
381 |             ]
382 |         )
383 |         feats.extend(
384 |             [
385 |                 f(c(example[f"{model1}-probs"]) / c(example["unigram-probs"]))
386 |                 for f in funcs
387 |             ]
388 |         )
389 |         feats.extend(
390 |             [
391 |                 f(c(example[f"{model1}-probs"]) / c(example["trigram-probs"]))
392 |                 for f in funcs
393 |             ]
394 |         )
395 |         feats.extend(
396 |             [
397 |                 f(c(example[f"{model2}-probs"]) / c(example["unigram-probs"]))
398 |                 for f in funcs
399 |             ]
400 |         )
401 |         feats.extend(
402 |             [
403 |                 f(c(example[f"{model2}-probs"]) / c(example["trigram-probs"]))
404 |                 for f in funcs
405 |             ]
406 |         )
407 |         feats.extend(
408 |             [
409 |                 f(c(example["unigram-probs"]) / c(example["trigram-probs"]))
410 |                 for f in funcs
411 |             ]
412 |         )
413 | 
414 |         return {
415 |             "feat": feats,
416 |         }
417 | 
418 |     all_features = np.array(
419 |         ds.map(
420 |             calc_feats,
421 |             num_proc=num_proc,
422 |             keep_in_memory=True,
423 |         )["feat"]
424 |     )
425 | 
426 |     pickle.dump((all_features, ds["label"], ds["id"]), open(output_file, "wb"))
427 | 


--------------------------------------------------------------------------------
/code/ghostbuster/train_lr.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Using the features created in `run.py` train a model on the data.
  3 | 
  4 | Using cuml makes it much, much faster on gpu.
  5 | """
  6 | 
  7 | import argparse
  8 | import math
  9 | import numpy as np
 10 | import json
 11 | 
 12 | # import tiktoken
 13 | import dill as pickle
 14 | from functools import partial
 15 | 
 16 | from sklearn.linear_model import LogisticRegression, LinearRegression
 17 | from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
 18 | from sklearn.calibration import CalibratedClassifierCV
 19 | from sklearn.svm import SVC
 20 | from sklearn.ensemble import VotingClassifier
 21 | from sklearn.linear_model import SGDClassifier
 22 | from sklearn.naive_bayes import MultinomialNB
 23 | 
 24 | from transformers import AutoTokenizer
 25 | 
 26 | from tabulate import tabulate
 27 | 
 28 | from featurize import normalize
 29 | 
 30 | from cuml.svm import SVC, SVR
 31 | from cuml import LogisticRegression
 32 | from cuml.linear_model import ElasticNet
 33 | from cuml.solvers import SGD
 34 | from cuml.ensemble import RandomForestClassifier
 35 | from cuml.neighbors import KNeighborsClassifier
 36 | 
 37 | if __name__ == "__main__":
 38 |     parser = argparse.ArgumentParser()
 39 |     parser.add_argument("--train_on_all_data", action="store_true")
 40 | 
 41 |     parser.add_argument("--feature_path", type=str)
 42 | 
 43 |     parser.add_argument(
 44 |         "--model1",
 45 |         type=str,
 46 |         help="name of model1 (used for folders)",
 47 |         default="llama-7b",
 48 |     )
 49 |     parser.add_argument(
 50 |         "--model2",
 51 |         type=str,
 52 |         help="name of model2 (used for folders)",
 53 |         default="tinyllama",
 54 |     )
 55 | 
 56 |     parser.add_argument("--log_reg", action="store_true")
 57 | 
 58 |     parser.add_argument("--model_type", type=str)
 59 |     parser.add_argument("--binary_labels", action="store_true")
 60 |     parser.add_argument("--C", type=int)
 61 | 
 62 |     args = parser.parse_args()
 63 | 
 64 |     with open(args.feature_path, "rb") as fp:
 65 |         features, labels, ids = pickle.load(fp)
 66 | 
 67 |     if args.binary_labels:
 68 |         labels = np.array([int(x > 0.5) for x in labels])
 69 | 
 70 |     indices = np.arange(len(labels))
 71 | 
 72 |     train_frac = 0.997 if args.train_on_all_data else 0.8
 73 | 
 74 |     np.random.shuffle(indices)
 75 |     train, test = (
 76 |         indices[: math.floor(train_frac * len(indices))],
 77 |         indices[math.floor(train_frac * len(indices)) :],
 78 |     )
 79 |     print("Train/Test Split", train, test)
 80 |     print("Train Size:", len(train), "Valid Size:", len(test))
 81 |     print(f"Positive Labels: {sum(labels[indices])}, Total Labels: {len(indices)}")
 82 | 
 83 |     data, mu, sigma = normalize(
 84 |         features,
 85 |         ret_mu_sigma=True,
 86 |     )
 87 | 
 88 |     if args.model_type == "log_reg":
 89 |         base = LogisticRegression(C=args.C, max_iter=10000)
 90 | 
 91 |     elif args.model_type == "svc":
 92 |         base = SVC(C=args.C, probability=True)
 93 | 
 94 |     elif args.model_type == "svr":
 95 |         base = SVR(C=args.C)
 96 | 
 97 |     elif args.model_type == "elastic":
 98 | 
 99 |         base = ElasticNet()
100 | 
101 |     elif args.model_type == "sgd":
102 | 
103 |         base = SGD()
104 | 
105 |     elif args.model_type == "rfc":
106 | 
107 |         base = RandomForestClassifier(max_depth=32, n_estimators=100, n_bins=100)
108 | 
109 |     elif args.model_type == "knnc":
110 | 
111 |         base = KNeighborsClassifier(n_neighbors=args.C)
112 | 
113 |     elif args.model_type == "vote":
114 | 
115 |         base = VotingClassifier(
116 |             estimators=[
117 |                 ("svc", SVC(C=args.C, probability=True)),
118 |                 (
119 |                     "rfc",
120 |                     RandomForestClassifier(max_depth=64, n_estimators=100, n_bins=200),
121 |                 ),
122 |             ],
123 |             voting="soft",
124 |         )
125 | 
126 |     if args.binary_labels:
127 |         model = CalibratedClassifierCV(base, cv=5)
128 |     else:
129 |         model = base
130 | 
131 |     if args.train_on_all_data:
132 |         model.fit(data, labels)
133 | 
134 |         pickle.dump(model, open("model/model", "wb"))
135 |         pickle.dump(mu, open("model/mu", "wb"))
136 |         pickle.dump(sigma, open("model/sigma", "wb"))
137 | 
138 |         texts = [open(f"../../data/m20/{id}.txt").read() for id in np.array(ids)[test]]
139 |         json.dump(texts, open("model/test_texts.json", "w"))
140 |         json.dump(labels[test].tolist(), open("model/test_labels.json", "w"))
141 | 
142 |         pickle.dump((data, train, test), open("model/data.pkl", "wb"))
143 | 
144 |         print("Saved model to model/")
145 |     else:
146 |         model.fit(data[train], labels[train])
147 | 
148 |     predictions = model.predict(data[test])
149 |     if args.binary_labels:
150 |         probs = model.predict_proba(data[test])[:, 1]
151 |     else:
152 |         probs = predictions
153 |         predictions = predictions > 0.5
154 | 
155 |         labels = np.array([int(x > 0.5) for x in labels])
156 | 
157 |     result_table = [["F1", "Accuracy", "AUC"]]
158 | 
159 |     result_table.append(
160 |         [
161 |             round(f1_score(labels[test], predictions), 3),
162 |             round(accuracy_score(labels[test], predictions), 3),
163 |             round(roc_auc_score(labels[test], probs), 3),
164 |         ]
165 |     )
166 | 
167 |     print(tabulate(result_table, headers="firstrow", tablefmt="grid"))
168 | 
169 |     json.dump(result_table, open("model/results.json", "w"))
170 | 


--------------------------------------------------------------------------------
/code/r_clm/ai_dataset.py:
--------------------------------------------------------------------------------
  1 | from copy import deepcopy
  2 | 
  3 | from datasets import Dataset
  4 | from transformers import AutoTokenizer
  5 | 
  6 | IGNORE_INDEX = -100
  7 | 
  8 | 
  9 | def get_tokenizer(cfg):
 10 | 
 11 |     tokenizer = AutoTokenizer.from_pretrained(
 12 |         cfg.model.backbone_path,
 13 |         use_fast=cfg.model.tokenizer.use_fast,
 14 |         padding_side=cfg.model.tokenizer.padding_side,
 15 |         truncation_side=cfg.model.tokenizer.truncation_side,
 16 |     )
 17 | 
 18 |     if tokenizer.pad_token is None:
 19 |         if tokenizer.unk_token is not None:
 20 |             tokenizer.pad_token = tokenizer.unk_token
 21 |         else:
 22 |             tokenizer.pad_token = tokenizer.eos_token
 23 |     return tokenizer
 24 | 
 25 | # --------------- Dataset ----------------------------------------------#
 26 | 
 27 | 
 28 | def get_instruction(inputs):
 29 |     ret = f"""
 30 | Prompt: {inputs['prompt_name']}
 31 | Task: {inputs['task']}
 32 | Score: {inputs['holistic_essay_score']}
 33 | Student Grade Level: {inputs['grade_level']}
 34 | English Language Learner: {inputs['ell_status']}
 35 | Disability Status: {inputs['student_disability_status']}
 36 |     """.strip()
 37 |     return ret
 38 | 
 39 | 
 40 | class AiDataset:
 41 |     """
 42 |     Dataset class for LLM Detect AI Generated Text competition
 43 |     """
 44 | 
 45 |     def __init__(self, cfg):
 46 |         self.cfg = cfg
 47 |         self.tokenizer = get_tokenizer(cfg)
 48 | 
 49 |     def format_source(self, instruction):
 50 |         ret = f"### Instruction:\n{instruction}\n\n### Response: "
 51 |         return ret
 52 | 
 53 |     def format_target(self, response):
 54 |         return f"{response} {self.tokenizer.eos_token}"
 55 | 
 56 |     def tokenize_function(self, examples):
 57 |         sources = [self.format_source(s) for s in examples["instruction"]]
 58 |         targets = [self.format_target(t) for t in examples["text"]]
 59 |         chats = [s + t for s, t in zip(sources, targets)]
 60 | 
 61 |         ex_tokenized_inputs = self.tokenizer(
 62 |             chats,
 63 |             padding=False,
 64 |             truncation=True,
 65 |             max_length=self.cfg.model.max_length,
 66 |         )
 67 | 
 68 |         src_tokenized_inputs = self.tokenizer(
 69 |             sources,
 70 |             padding=False,
 71 |             truncation=False,
 72 |         )
 73 | 
 74 |         src_lens = [len(s)-1 for s in src_tokenized_inputs["input_ids"]]
 75 |         input_ids = ex_tokenized_inputs["input_ids"]
 76 |         attention_mask = ex_tokenized_inputs["attention_mask"]
 77 |         labels = deepcopy(input_ids)
 78 | 
 79 |         for idx, src_len in enumerate(src_lens):
 80 |             labels[idx][:src_len] = [IGNORE_INDEX] * src_len
 81 | 
 82 |         to_return = {
 83 |             "input_ids": input_ids,
 84 |             "attention_mask": attention_mask,
 85 |             "labels": labels,
 86 |         }
 87 | 
 88 |         return to_return
 89 | 
 90 |     def preprocess_function(self, persuade_df):
 91 |         persuade_df["student_disability_status"] = persuade_df["student_disability_status"].fillna("Unknown")
 92 |         persuade_df["ell_status"] = persuade_df["ell_status"].fillna("Unknown")
 93 |         persuade_df["grade_level"] = persuade_df["grade_level"].fillna(-1)
 94 |         persuade_df["holistic_essay_score"] = persuade_df["holistic_essay_score"].fillna(-1)
 95 |         persuade_df["instruction"] = persuade_df.apply(get_instruction, axis=1)
 96 |         return persuade_df
 97 | 
 98 |     def get_dataset(self, df):
 99 |         df = deepcopy(df)
100 |         df = self.preprocess_function(df)
101 |         task_dataset = Dataset.from_pandas(df)
102 |         task_dataset = task_dataset.map(
103 |             self.tokenize_function,
104 |             batched=True,
105 |             remove_columns=task_dataset.column_names
106 |         )
107 |         return task_dataset
108 | 


--------------------------------------------------------------------------------
/code/r_clm/ai_loader.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | import torch
 4 | from transformers import DataCollatorWithPadding
 5 | 
 6 | 
 7 | @dataclass
 8 | class AiCollator(DataCollatorWithPadding):
 9 |     """
10 |     data collector for LLM Detect AI Generated Text task
11 |     """
12 | 
13 |     tokenizer = None
14 |     padding = True
15 |     max_length = None
16 |     pad_to_multiple_of = None
17 |     return_tensors = "pt"
18 | 
19 |     def __call__(self, features):
20 |         labels = None
21 |         if "labels" in features[0].keys():
22 |             labels = [feature["labels"] for feature in features]
23 | 
24 |         features = [
25 |             {
26 |                 "input_ids": feature["input_ids"],
27 |                 "attention_mask": feature["attention_mask"],
28 |             } for feature in features
29 |         ]
30 | 
31 |         batch = self.tokenizer.pad(
32 |             features,
33 |             padding='longest',
34 |             max_length=self.max_length,
35 |             pad_to_multiple_of=self.pad_to_multiple_of,
36 |             return_tensors=None,
37 |         )
38 | 
39 |         tensor_keys = [
40 |             "input_ids",
41 |             "attention_mask",
42 |         ]
43 | 
44 |         for key in tensor_keys:
45 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
46 | 
47 |         seq_len = batch["input_ids"].size(1)
48 | 
49 |         if labels is not None:
50 |             padded_labels = []
51 |             for label in labels:
52 |                 padded_label = [-100] * (seq_len - len(label)) + label  # left pad
53 |                 padded_labels.append(padded_label)
54 |             batch["labels"] = torch.tensor(padded_labels, dtype=torch.int64)
55 | 
56 |         return batch
57 | 
58 | 
59 | # ---
60 | 
61 | 
62 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print):
63 |     bs = batch['input_ids'].size(0)
64 |     print_fn(f"batch size: {bs}")
65 | 
66 |     print_fn(f"shape of input_ids: {batch['input_ids'].shape}")
67 | 
68 |     n_examples = min(n_examples, bs)
69 |     print_fn(f"Showing {n_examples} from a {task} batch...")
70 | 
71 |     print_fn("\n\n")
72 |     for idx in range(n_examples):
73 |         print_fn(f"Example {idx+1}")
74 |         print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
75 |         print_fn(f"Input ids:\n\n{batch['input_ids'][idx]}")
76 |         if 'labels' in batch:
77 |             print_fn(f"Labels:\n\n{batch['labels'][idx]}")
78 |         print_fn('~~'*40)
79 | 


--------------------------------------------------------------------------------
/code/r_clm/ai_optimizer.py:
--------------------------------------------------------------------------------
 1 | import bitsandbytes as bnb
 2 | from torch import optim
 3 | 
 4 | 
 5 | def get_optimizer(cfg, model, print_fn=None):
 6 |     _optimizers = {
 7 |         "Adam": optim.Adam,
 8 |         "AdamW": optim.AdamW,
 9 |         "AdamW8bit": bnb.optim.Adam8bit,
10 |     }
11 |     assert cfg.optimizer.name in _optimizers, f"Optimizer {cfg.optimizer.name} not supported"
12 | 
13 |     no_decay = ["bias", "LayerNorm.weight"]
14 |     head_layer_name = "lm_head"
15 | 
16 |     # start with all of the candidate parameters
17 |     param_dict = {name: param for name, param in model.named_parameters()}
18 |     # filter out those that do not require grad
19 |     param_dict = {name: param for name, param in param_dict.items() if param.requires_grad}
20 | 
21 |     # head & body params
22 |     param_dict_head = {
23 |         name: param for name, param in param_dict.items() if head_layer_name in name
24 |     }
25 |     param_dict_body = {
26 |         name: param for name, param in param_dict.items() if head_layer_name not in name
27 |     }
28 | 
29 |     # create groups ---
30 |     head_params_no_decay = [
31 |         param for name, param in param_dict_head.items() if any(nd in name for nd in no_decay)
32 |     ]
33 |     head_params_decay = [
34 |         param for name, param in param_dict_head.items() if not any(nd in name for nd in no_decay)
35 |     ]
36 |     body_params_no_decay = [
37 |         param for name, param in param_dict_body.items() if any(nd in name for nd in no_decay)
38 |     ]
39 |     body_params_decay = [
40 |         param for name, param in param_dict_body.items() if not any(nd in name for nd in no_decay)
41 |     ]
42 | 
43 |     optim_groups = [
44 |         {'params': head_params_no_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': 0},
45 |         {'params': head_params_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': cfg.optimizer.weight_decay},
46 |         {'params': body_params_no_decay, 'lr': cfg.optimizer.lr, 'weight_decay': 0},
47 |         {'params': body_params_decay, 'lr': cfg.optimizer.lr,
48 |             'weight_decay': cfg.optimizer.weight_decay * 1e-1},  # less weight decay for body
49 |     ]
50 | 
51 |     if print_fn is not None:
52 |         n_head_params_no_decay = sum(p.numel() for p in head_params_no_decay)
53 |         n_head_params_decay = sum(p.numel() for p in head_params_decay)
54 |         n_body_params_no_decay = sum(p.numel() for p in body_params_no_decay)
55 |         n_body_params_decay = sum(p.numel() for p in body_params_decay)
56 | 
57 |         print_fn(f"n_head_params_no_decay: {n_head_params_no_decay}")
58 |         print_fn(f"n_head_params_decay: {n_head_params_decay}")
59 |         print_fn(f"n_body_params_no_decay: {n_body_params_no_decay}")
60 |         print_fn(f"n_body_params_decay: {n_body_params_decay}")
61 | 
62 |     # Create AdamW optimizer and use the fused version if it is available
63 |     optimizer = _optimizers[cfg.optimizer.name](
64 |         optim_groups,
65 |         lr=cfg.optimizer.lr,
66 |         weight_decay=cfg.optimizer.weight_decay,
67 |     )
68 |     return optimizer
69 | 


--------------------------------------------------------------------------------
/code/r_detect/ai_dataset.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | 
 3 | from datasets import Dataset
 4 | from transformers import AutoTokenizer
 5 | 
 6 | 
 7 | def get_tokenizer(cfg):
 8 | 
 9 |     tokenizer = AutoTokenizer.from_pretrained(
10 |         cfg.model.backbone_path,
11 |         use_fast=cfg.model.tokenizer.use_fast,
12 |         padding_side=cfg.model.tokenizer.padding_side,
13 |         truncation_side=cfg.model.tokenizer.truncation_side,
14 |     )
15 | 
16 |     # if the eos token is an empty string, we assign it to a token
17 |     if tokenizer.eos_token == "":
18 |         tokenizer.add_special_tokens({"eos_token": "</s>"})
19 |         tokenizer.eos_token = "</s>"
20 | 
21 |     if tokenizer.pad_token is None:
22 |         if tokenizer.unk_token is not None:
23 |             tokenizer.pad_token = tokenizer.unk_token
24 |         else:
25 |             tokenizer.pad_token = tokenizer.eos_token
26 |     return tokenizer
27 | 
28 | # --------------- Dataset ----------------------------------------------#
29 | 
30 | 
31 | class AiDataset:
32 |     """
33 |     Dataset class for LLM Detect AI Generated Text competition
34 |     """
35 | 
36 |     def __init__(self, cfg):
37 |         self.cfg = cfg
38 |         self.tokenizer = get_tokenizer(cfg)
39 | 
40 |     def tokenize_function(self, examples):
41 |         tz = self.tokenizer(
42 |             examples["text"],
43 |             padding=False,
44 |             truncation=True,
45 |             max_length=self.cfg.model.max_length,
46 |             add_special_tokens=True,
47 |         )
48 | 
49 |         return tz
50 | 
51 |     def compute_input_length(self, examples):
52 |         return {"input_length": [len(x) for x in examples["input_ids"]]}
53 | 
54 |     def preprocess_function(self, df):
55 |         df['text'] = df['text'].apply(lambda x: x.strip() + "\n###\nIs the essay generated by AI?")
56 |         return df
57 | 
58 |     def get_dataset(self, df):
59 |         """
60 |         Main api for creating the Science Exam dataset
61 |         :param df: input dataframe
62 |         :type df: pd.DataFrame
63 |         :return: the created dataset
64 |         :rtype: Dataset
65 |         """
66 |         df = deepcopy(df)
67 |         df = self.preprocess_function(df)
68 |         task_dataset = Dataset.from_pandas(df)
69 | 
70 |         task_dataset = task_dataset.map(self.tokenize_function, batched=True)
71 |         task_dataset = task_dataset.map(self.compute_input_length, batched=True)
72 | 
73 |         return task_dataset
74 | 


--------------------------------------------------------------------------------
/code/r_detect/ai_loader.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from copy import deepcopy
  3 | from dataclasses import dataclass, field
  4 | 
  5 | import torch
  6 | from transformers import DataCollatorWithPadding
  7 | 
  8 | 
  9 | def apply_mask_augmentation(input_ids, tokenizer, mask_prob=0.1):
 10 |     input_ids = deepcopy(input_ids)
 11 |     input_ids = torch.tensor(input_ids, dtype=torch.int64)
 12 |     indices_mask = torch.bernoulli(torch.full(input_ids.shape, mask_prob)).bool()
 13 | 
 14 |     do_not_mask_tokens = list(set(tokenizer.all_special_ids))
 15 |     pass_gate = [
 16 |         [0 if token_id in do_not_mask_tokens else 1 for token_id in token_id_seq] for token_id_seq in input_ids
 17 |     ]
 18 |     pass_gate = torch.tensor(pass_gate, dtype=torch.bool)
 19 | 
 20 |     indices_mask = torch.logical_and(indices_mask, pass_gate)
 21 |     input_ids[indices_mask] = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 22 |     return input_ids
 23 | 
 24 | 
 25 | @dataclass
 26 | class AiCollator(DataCollatorWithPadding):
 27 |     """
 28 |     data collector for LLM Detect AI Generated Text task
 29 |     """
 30 | 
 31 |     tokenizer = None
 32 |     padding = True
 33 |     max_length = None
 34 |     pad_to_multiple_of = None
 35 |     return_tensors = "pt"
 36 | 
 37 |     def __call__(self, features):
 38 | 
 39 |         buffer_dict = dict()
 40 |         buffer_keys = ["id"]
 41 | 
 42 |         for key in buffer_keys:
 43 |             if key in features[0].keys():
 44 |                 value = [feature[key] for feature in features]
 45 |                 buffer_dict[key] = value
 46 | 
 47 |         labels = None
 48 |         if "generated" in features[0].keys():
 49 |             labels = [feature["generated"] for feature in features]
 50 | 
 51 |         features = [
 52 |             {
 53 |                 "input_ids": feature["input_ids"],
 54 |                 "attention_mask": feature["attention_mask"],
 55 |             } for feature in features
 56 |         ]
 57 | 
 58 |         batch = self.tokenizer.pad(
 59 |             features,
 60 |             padding='longest',  # self.padding,
 61 |             max_length=self.max_length,
 62 |             pad_to_multiple_of=self.pad_to_multiple_of,
 63 |             return_tensors=None,
 64 |         )
 65 | 
 66 |         # for key, value in buffer_dict.items():
 67 |         #     batch[key] = value
 68 | 
 69 |         if labels is not None:
 70 |             batch["labels"] = labels
 71 | 
 72 |         tensor_keys = [
 73 |             "input_ids",
 74 |             "attention_mask",
 75 |         ]
 76 | 
 77 |         for key in tensor_keys:
 78 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
 79 | 
 80 |         if labels is not None:
 81 |             batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32)
 82 | 
 83 |         return batch
 84 | 
 85 | 
 86 | @dataclass
 87 | class AiCollatorTrain(DataCollatorWithPadding):
 88 |     """
 89 |     data collector for LLM Detect AI Generated Text task
 90 |     """
 91 | 
 92 |     tokenizer = None
 93 |     padding = True
 94 |     max_length = None
 95 |     pad_to_multiple_of = None
 96 |     return_tensors = "pt"
 97 |     kwargs: field(default_factory=dict) = None
 98 | 
 99 |     def __post_init__(self):
100 |         [setattr(self, k, v) for k, v in self.kwargs.items()]
101 | 
102 |     def __call__(self, features):
103 | 
104 |         buffer_dict = dict()
105 |         buffer_keys = ["id"]
106 | 
107 |         for key in buffer_keys:
108 |             if key in features[0].keys():
109 |                 value = [feature[key] for feature in features]
110 |                 buffer_dict[key] = value
111 | 
112 |         labels = None
113 |         if "generated" in features[0].keys():
114 |             labels = [feature["generated"] for feature in features]
115 | 
116 |         features = [
117 |             {
118 |                 "input_ids": feature["input_ids"],
119 |                 "attention_mask": feature["attention_mask"],
120 |             } for feature in features
121 |         ]
122 | 
123 |         batch = self.tokenizer.pad(
124 |             features,
125 |             padding='longest',  # self.padding,
126 |             max_length=self.max_length,
127 |             pad_to_multiple_of=self.pad_to_multiple_of,
128 |             return_tensors=None,
129 |         )
130 | 
131 |         # for key, value in buffer_dict.items():
132 |         #     batch[key] = value
133 | 
134 |         if self.cfg.train_params.use_mask_aug:
135 |             batch["input_ids"] = apply_mask_augmentation(
136 |                 batch["input_ids"], self.tokenizer, self.cfg.train_params.mask_aug_prob
137 |             )
138 | 
139 |         if labels is not None:
140 |             batch["labels"] = labels
141 | 
142 |         tensor_keys = [
143 |             "input_ids",
144 |             "attention_mask",
145 |         ]
146 | 
147 |         for key in tensor_keys:
148 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
149 | 
150 |         if labels is not None:
151 |             batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32)
152 | 
153 |         return batch
154 | # ---
155 | 
156 | 
157 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print):
158 |     bs = batch['input_ids'].size(0)
159 |     print_fn(f"batch size: {bs}")
160 | 
161 |     print_fn(f"shape of input_ids: {batch['input_ids'].shape}")
162 | 
163 |     n_examples = min(n_examples, bs)
164 |     print_fn(f"Showing {n_examples} from a {task} batch...")
165 | 
166 |     print_fn("\n\n")
167 |     for idx in range(n_examples):
168 |         print_fn(f"Example {idx+1}")
169 |         print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
170 |         # print("\n\n")
171 | 
172 |         if "infer" not in task.lower():
173 |             print_fn("--"*20)
174 |             labels = batch['labels'][idx]
175 |             print_fn(f"Label: {labels}")
176 |         print_fn('~~'*40)
177 | 


--------------------------------------------------------------------------------
/code/r_detect/ai_model.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Optional, Tuple, Union
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | from transformers import AutoConfig, AutoModel
  6 | from transformers.modeling_outputs import SequenceClassifierOutputWithPast
  7 | from transformers.models.llama.modeling_llama import (LlamaModel,
  8 |                                                       LlamaPreTrainedModel)
  9 | from transformers.models.mistral.modeling_mistral import (
 10 |     MistralModel, MistralPreTrainedModel)
 11 | from transformers.models.phi.modeling_phi import PhiModel, PhiPreTrainedModel
 12 | 
 13 | 
 14 | class MistralForDetectAI(MistralPreTrainedModel):
 15 |     def __init__(self, config):
 16 |         super().__init__(config)
 17 |         self.num_labels = config.num_labels
 18 |         self.model = MistralModel(config)
 19 |         # self.dropout = nn.Dropout(0.3)
 20 | 
 21 |         self.classification_head = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 22 | 
 23 |         self.loss_fn = nn.BCEWithLogitsLoss()
 24 | 
 25 |         # Initialize weights and apply final processing
 26 |         self.post_init()
 27 | 
 28 |     def forward(
 29 |         self,
 30 |         input_ids: torch.LongTensor = None,
 31 |         attention_mask: Optional[torch.Tensor] = None,
 32 |         position_ids: Optional[torch.LongTensor] = None,
 33 |         past_key_values: Optional[List[torch.FloatTensor]] = None,
 34 |         inputs_embeds: Optional[torch.FloatTensor] = None,
 35 |         labels: Optional[torch.LongTensor] = None,
 36 |         use_cache: Optional[bool] = None,
 37 |         output_attentions: Optional[bool] = None,
 38 |         output_hidden_states: Optional[bool] = None,
 39 |         return_dict: Optional[bool] = None,
 40 |     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
 41 | 
 42 |         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 43 | 
 44 |         transformer_outputs = self.model(
 45 |             input_ids,
 46 |             attention_mask=attention_mask,
 47 |             position_ids=position_ids,
 48 |             past_key_values=past_key_values,
 49 |             inputs_embeds=inputs_embeds,
 50 |             use_cache=use_cache,
 51 |             output_attentions=output_attentions,
 52 |             output_hidden_states=output_hidden_states,
 53 |             return_dict=return_dict,
 54 |         )
 55 |         hidden_states = transformer_outputs[0]  # (bs, seq_len, dim)
 56 |         # hidden_states = self.dropout(hidden_states)
 57 |         logits = self.classification_head(hidden_states[:, -1]).reshape(-1)  # (bs, num_labels)
 58 | 
 59 |         loss = None
 60 |         if labels is not None:
 61 |             labels = labels.to(logits.device).reshape(-1)
 62 |             loss = self.loss_fn(logits, labels)
 63 | 
 64 |         if not return_dict:
 65 |             output = (logits,) + transformer_outputs[1:]
 66 |             return ((loss,) + output) if loss is not None else output
 67 | 
 68 |         return SequenceClassifierOutputWithPast(
 69 |             loss=loss,
 70 |             logits=logits,
 71 |             past_key_values=transformer_outputs.past_key_values,
 72 |             hidden_states=transformer_outputs.hidden_states,
 73 |             attentions=transformer_outputs.attentions,
 74 |         )
 75 | 
 76 | 
 77 | class LlamaForDetectAI(LlamaPreTrainedModel):
 78 |     def __init__(self, config):
 79 |         super().__init__(config)
 80 |         self.num_labels = config.num_labels
 81 |         self.model = LlamaModel(config)
 82 |         self.classification_head = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 83 |         self.loss_fn = nn.BCEWithLogitsLoss()
 84 | 
 85 |         # Initialize weights and apply final processing
 86 |         self.post_init()
 87 | 
 88 |     def forward(
 89 |         self,
 90 |         input_ids: torch.LongTensor = None,
 91 |         attention_mask: Optional[torch.Tensor] = None,
 92 |         position_ids: Optional[torch.LongTensor] = None,
 93 |         past_key_values: Optional[List[torch.FloatTensor]] = None,
 94 |         inputs_embeds: Optional[torch.FloatTensor] = None,
 95 |         labels: Optional[torch.LongTensor] = None,
 96 |         use_cache: Optional[bool] = None,
 97 |         output_attentions: Optional[bool] = None,
 98 |         output_hidden_states: Optional[bool] = None,
 99 |         return_dict: Optional[bool] = None,
100 |     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
101 | 
102 |         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
103 | 
104 |         transformer_outputs = self.model(
105 |             input_ids,
106 |             attention_mask=attention_mask,
107 |             position_ids=position_ids,
108 |             past_key_values=past_key_values,
109 |             inputs_embeds=inputs_embeds,
110 |             use_cache=use_cache,
111 |             output_attentions=output_attentions,
112 |             output_hidden_states=output_hidden_states,
113 |             return_dict=return_dict,
114 |         )
115 |         hidden_states = transformer_outputs[0]  # (bs, seq_len, dim)
116 |         logits = self.classification_head(hidden_states[:, -1]).reshape(-1)  # (bs, num_labels)
117 | 
118 |         loss = None
119 |         if labels is not None:
120 |             labels = labels.to(logits.device).reshape(-1)
121 |             loss = self.loss_fn(logits, labels)
122 | 
123 |         if not return_dict:
124 |             output = (logits,) + transformer_outputs[1:]
125 |             return ((loss,) + output) if loss is not None else output
126 | 
127 |         return SequenceClassifierOutputWithPast(
128 |             loss=loss,
129 |             logits=logits,
130 |             past_key_values=transformer_outputs.past_key_values,
131 |             hidden_states=transformer_outputs.hidden_states,
132 |             attentions=transformer_outputs.attentions,
133 |         )
134 | 
135 | 
136 | class PhiForDetectAI(PhiPreTrainedModel):
137 |     def __init__(self, config):
138 |         super().__init__(config)
139 |         self.num_labels = config.num_labels
140 |         self.model = AutoModel(config)
141 |         self.classification_head = nn.Linear(config.hidden_size, self.num_labels, bias=False)
142 |         self.loss_fn = nn.BCEWithLogitsLoss()
143 | 
144 |         # Initialize weights and apply final processing
145 |         self.post_init()
146 | 
147 |     def forward(
148 |         self,
149 |         input_ids: torch.LongTensor = None,
150 |         attention_mask: Optional[torch.Tensor] = None,
151 |         position_ids: Optional[torch.LongTensor] = None,
152 |         past_key_values: Optional[List[torch.FloatTensor]] = None,
153 |         inputs_embeds: Optional[torch.FloatTensor] = None,
154 |         labels: Optional[torch.LongTensor] = None,
155 |         use_cache: Optional[bool] = None,
156 |         output_attentions: Optional[bool] = None,
157 |         output_hidden_states: Optional[bool] = None,
158 |         return_dict: Optional[bool] = None,
159 |     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
160 | 
161 |         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
162 | 
163 |         transformer_outputs = self.model(
164 |             input_ids,
165 |             attention_mask=attention_mask,
166 |             position_ids=position_ids,
167 |             past_key_values=past_key_values,
168 |             inputs_embeds=inputs_embeds,
169 |             use_cache=use_cache,
170 |             output_attentions=output_attentions,
171 |             output_hidden_states=output_hidden_states,
172 |             return_dict=return_dict,
173 |         )
174 |         hidden_states = transformer_outputs[0]  # (bs, seq_len, dim)
175 |         logits = self.classification_head(hidden_states[:, -1]).reshape(-1)  # (bs, num_labels)
176 | 
177 |         loss = None
178 |         if labels is not None:
179 |             labels = labels.to(logits.device).reshape(-1)
180 |             loss = self.loss_fn(logits, labels)
181 | 
182 |         if not return_dict:
183 |             output = (logits,) + transformer_outputs[1:]
184 |             return ((loss,) + output) if loss is not None else output
185 | 
186 |         return SequenceClassifierOutputWithPast(
187 |             loss=loss,
188 |             logits=logits,
189 |             past_key_values=transformer_outputs.past_key_values,
190 |             hidden_states=transformer_outputs.hidden_states,
191 |             attentions=transformer_outputs.attentions,
192 |         )
193 | 


--------------------------------------------------------------------------------
/code/r_detect/ai_optimizer.py:
--------------------------------------------------------------------------------
 1 | import bitsandbytes as bnb
 2 | from torch import optim
 3 | 
 4 | 
 5 | def get_optimizer(cfg, model, print_fn=None):
 6 |     _optimizers = {
 7 |         "Adam": optim.Adam,
 8 |         "AdamW": optim.AdamW,
 9 |         "AdamW8bit": bnb.optim.Adam8bit,
10 |     }
11 |     assert cfg.optimizer.name in _optimizers, f"Optimizer {cfg.optimizer.name} not supported"
12 | 
13 |     no_decay = ["bias", "LayerNorm.weight"]
14 |     head_layer_name = "classification_head"
15 | 
16 |     # start with all of the candidate parameters
17 |     param_dict = {name: param for name, param in model.named_parameters()}
18 |     # filter out those that do not require grad
19 |     param_dict = {name: param for name, param in param_dict.items() if param.requires_grad}
20 | 
21 |     # head & body params
22 |     param_dict_head = {
23 |         name: param for name, param in param_dict.items() if head_layer_name in name
24 |     }
25 |     param_dict_body = {
26 |         name: param for name, param in param_dict.items() if head_layer_name not in name
27 |     }
28 | 
29 |     # create groups ---
30 |     head_params_no_decay = [
31 |         param for name, param in param_dict_head.items() if any(nd in name for nd in no_decay)
32 |     ]
33 |     head_params_decay = [
34 |         param for name, param in param_dict_head.items() if not any(nd in name for nd in no_decay)
35 |     ]
36 |     body_params_no_decay = [
37 |         param for name, param in param_dict_body.items() if any(nd in name for nd in no_decay)
38 |     ]
39 |     body_params_decay = [
40 |         param for name, param in param_dict_body.items() if not any(nd in name for nd in no_decay)
41 |     ]
42 | 
43 |     optim_groups = [
44 |         {'params': head_params_no_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': 0},
45 |         {'params': head_params_decay, 'lr': cfg.optimizer.head_lr, 'weight_decay': cfg.optimizer.weight_decay},
46 |         {'params': body_params_no_decay, 'lr': cfg.optimizer.lr, 'weight_decay': 0},
47 |         {'params': body_params_decay, 'lr': cfg.optimizer.lr,
48 |             'weight_decay': cfg.optimizer.weight_decay * 1e-1},  # less weight decay for body
49 |     ]
50 | 
51 |     if print_fn is not None:
52 |         n_head_params_no_decay = sum(p.numel() for p in head_params_no_decay)
53 |         n_head_params_decay = sum(p.numel() for p in head_params_decay)
54 |         n_body_params_no_decay = sum(p.numel() for p in body_params_no_decay)
55 |         n_body_params_decay = sum(p.numel() for p in body_params_decay)
56 | 
57 |         print_fn(f"n_head_params_no_decay: {n_head_params_no_decay}")
58 |         print_fn(f"n_head_params_decay: {n_head_params_decay}")
59 |         print_fn(f"n_body_params_no_decay: {n_body_params_no_decay}")
60 |         print_fn(f"n_body_params_decay: {n_body_params_decay}")
61 | 
62 |     # Create AdamW optimizer and use the fused version if it is available
63 |     optimizer = _optimizers[cfg.optimizer.name](
64 |         optim_groups,
65 |         lr=cfg.optimizer.lr,
66 |         weight_decay=cfg.optimizer.weight_decay,
67 |     )
68 |     return optimizer
69 | 


--------------------------------------------------------------------------------
/code/r_embed/ai_dataset.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | 
 3 | from datasets import Dataset
 4 | from transformers import AutoTokenizer
 5 | 
 6 | # --------------- Dataset ----------------------------------------------#
 7 | 
 8 | 
 9 | class AiDataset:
10 |     """
11 |     Dataset class for LLM Detect AI Generated Text competition
12 |     """
13 | 
14 |     def __init__(self, cfg):
15 |         self.cfg = cfg
16 |         self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)
17 | 
18 |     def tokenize_function(self, examples):
19 |         tz = self.tokenizer(
20 |             examples["text"],
21 |             padding=False,
22 |             truncation=True,
23 |             max_length=self.cfg.model.max_length,
24 |             add_special_tokens=True,
25 |             return_token_type_ids=False,
26 |         )
27 | 
28 |         return tz
29 | 
30 |     def compute_input_length(self, examples):
31 |         return {"input_length": [len(x) for x in examples["input_ids"]]}
32 | 
33 |     def get_dataset(self, df):
34 |         """
35 |         Main api for creating the Science Exam dataset
36 |         :param df: input dataframe
37 |         :type df: pd.DataFrame
38 |         :return: the created dataset
39 |         :rtype: Dataset
40 |         """
41 |         df = deepcopy(df)
42 |         task_dataset = Dataset.from_pandas(df)
43 | 
44 |         task_dataset = task_dataset.map(self.tokenize_function, batched=True)
45 |         task_dataset = task_dataset.map(self.compute_input_length, batched=True)
46 | 
47 |         try:
48 |             task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"])
49 |         except Exception as e:
50 |             print(e)
51 | 
52 |         return task_dataset
53 | 


--------------------------------------------------------------------------------
/code/r_embed/ai_loader.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import random
  4 | import time
  5 | from dataclasses import dataclass, field
  6 | 
  7 | import torch
  8 | from transformers import DataCollatorWithPadding
  9 | 
 10 | 
 11 | @dataclass
 12 | class AiCollator(DataCollatorWithPadding):
 13 |     """
 14 |     data collector for LLM Detect AI Generated Text task
 15 |     """
 16 | 
 17 |     tokenizer = None
 18 |     padding = True
 19 |     max_length = None
 20 |     pad_to_multiple_of = None
 21 |     return_tensors = "pt"
 22 | 
 23 |     def __call__(self, features):
 24 |         labels = None
 25 |         if "generated" in features[0].keys():
 26 |             labels = [feature["generated"] for feature in features]
 27 | 
 28 |         features = [
 29 |             {
 30 |                 "input_ids": feature["input_ids"],
 31 |                 "attention_mask": feature["attention_mask"],
 32 |             } for feature in features
 33 |         ]
 34 | 
 35 |         batch = self.tokenizer.pad(
 36 |             features,
 37 |             padding=self.padding,
 38 |             max_length=self.max_length,
 39 |             pad_to_multiple_of=self.pad_to_multiple_of,
 40 |             return_tensors=None,
 41 |         )
 42 | 
 43 |         if labels is not None:
 44 |             batch["labels"] = labels
 45 | 
 46 |         tensor_keys = [
 47 |             "input_ids",
 48 |             "attention_mask",
 49 |         ]
 50 | 
 51 |         for key in tensor_keys:
 52 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
 53 | 
 54 |         if labels is not None:
 55 |             batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
 56 | 
 57 |         return batch
 58 | 
 59 | 
 60 | @dataclass
 61 | class AiCollatorTrain(DataCollatorWithPadding):
 62 |     """
 63 |     data collector for LLM Detect AI Generated Text task
 64 |     """
 65 | 
 66 |     tokenizer = None
 67 |     padding = True
 68 |     max_length = None
 69 |     pad_to_multiple_of = None
 70 |     return_tensors = "pt"
 71 |     kwargs: field(default_factory=dict) = None
 72 | 
 73 |     def __post_init__(self):
 74 |         [setattr(self, k, v) for k, v in self.kwargs.items()]
 75 | 
 76 |         # mappings
 77 |         example2idx = dict()
 78 |         example_ids = self.train_ds["id"]
 79 | 
 80 |         for idx in range(len(example_ids)):
 81 |             example2idx[example_ids[idx]] = idx
 82 |         self.example2idx = example2idx
 83 | 
 84 |         seed = seed = int(time.time() * 1000) + os.getpid()
 85 |         self.rng = random.Random(seed)
 86 | 
 87 |         print("=="*40)
 88 |         print(f"setting random seed in data collator as: {seed}")
 89 |         print("=="*40)
 90 | 
 91 |     def process_features(self, example_ids):
 92 |         updated_features = []
 93 |         for eid in example_ids:
 94 |             example = dict()
 95 | 
 96 |             example["id"] = eid
 97 |             ex_info = self.train_ds[self.example2idx[eid]]
 98 | 
 99 |             # use fields
100 |             example["input_ids"] = ex_info["input_ids"]
101 |             example["attention_mask"] = ex_info["attention_mask"]
102 |             example["generated"] = ex_info["generated"]
103 |             updated_features.append(example)
104 | 
105 |         return updated_features
106 | 
107 |     def __call__(self, features):
108 |         bs = len(features)
109 |         selected_prompt_id = self.rng.choice(self.prompt_ids)
110 |         selected_example_ids_pos = self.rng.sample(self.prompt2ids_pos[selected_prompt_id], k=bs//2)
111 |         selected_example_ids_neg = self.rng.sample(self.prompt2ids_neg[selected_prompt_id], k=bs//2)
112 |         selected_example_ids = selected_example_ids_pos + selected_example_ids_neg
113 |         features = self.process_features(selected_example_ids)
114 | 
115 |         labels = None
116 |         if "generated" in features[0].keys():
117 |             labels = [feature["generated"] for feature in features]
118 | 
119 |         features = [
120 |             {
121 |                 "input_ids": feature["input_ids"],
122 |                 "attention_mask": feature["attention_mask"],
123 |             } for feature in features
124 |         ]
125 | 
126 |         batch = self.tokenizer.pad(
127 |             features,
128 |             padding=self.padding,
129 |             max_length=self.max_length,
130 |             pad_to_multiple_of=self.pad_to_multiple_of,
131 |             return_tensors=None,
132 |         )
133 | 
134 |         if labels is not None:
135 |             batch["labels"] = labels
136 | 
137 |         tensor_keys = [
138 |             "input_ids",
139 |             "attention_mask",
140 |         ]
141 | 
142 |         for key in tensor_keys:
143 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
144 | 
145 |         if labels is not None:
146 |             batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
147 | 
148 |         return batch
149 | 
150 | # ---
151 | 
152 | 
153 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print):
154 |     print_fn("##"*40)
155 |     bs = batch['input_ids'].size(0)
156 |     print_fn(f"batch size: {bs}")
157 | 
158 |     print_fn(f"shape of input_ids: {batch['input_ids'].shape}")
159 | 
160 |     n_examples = min(n_examples, bs)
161 |     print_fn(f"Showing {n_examples} from a {task} batch...")
162 | 
163 |     print_fn("\n\n")
164 |     for idx in range(n_examples):
165 |         print_fn(f"Example {idx+1}")
166 |         print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
167 |         # print("\n\n")
168 | 
169 |         if "infer" not in task.lower():
170 |             print_fn("--"*20)
171 |             labels = batch['labels'][idx]
172 |             print_fn(f"Label: {labels}")
173 |         print_fn('=='*40)
174 |     print_fn("##"*40)
175 | 


--------------------------------------------------------------------------------
/code/r_embed/ai_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import torch.utils.checkpoint
  5 | from transformers import AutoConfig, AutoModel
  6 | 
  7 | 
  8 | class MeanPooling(nn.Module):
  9 |     def __init__(self):
 10 |         super(MeanPooling, self).__init__()
 11 | 
 12 |     def forward(self, last_hidden_state, attention_mask):
 13 |         input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
 14 |         sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
 15 |         sum_mask = input_mask_expanded.sum(1)
 16 |         sum_mask = torch.clamp(sum_mask, min=1e-9)
 17 |         mean_embeddings = sum_embeddings / sum_mask
 18 |         return mean_embeddings
 19 | 
 20 | # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10034742
 21 | 
 22 | 
 23 | class SupContrastiveLoss(nn.Module):
 24 |     def __init__(self, temperature, device):
 25 |         super(SupContrastiveLoss, self).__init__()
 26 |         self.temperature = temperature
 27 |         self.device = device
 28 | 
 29 |     def forward(self, outputs, labels):
 30 |         N = outputs.size()[0]
 31 |         labels = labels.reshape(N, 1)
 32 |         self_similarity_mask = torch.ones((N, N)).fill_diagonal_(0).to(self.device)
 33 | 
 34 |         pos_mask = torch.eq(labels, labels.T).float()
 35 |         neg_mask = torch.abs(pos_mask - 1)
 36 | 
 37 |         H = torch.matmul(outputs, outputs.T) * self_similarity_mask
 38 |         H_pos = H * pos_mask
 39 |         H_neg = H * neg_mask
 40 | 
 41 |         v_pos = torch.mean(torch.exp(torch.div(H_pos, self.temperature)), dim=1)
 42 |         v_neg = torch.mean(torch.exp(torch.div(H_neg, self.temperature)), dim=1)
 43 | 
 44 |         loss = (-1/N) * torch.sum(torch.log(v_pos/(v_pos + v_neg)))
 45 | 
 46 |         return loss
 47 | 
 48 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 49 | # Rank Model
 50 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 51 | 
 52 | 
 53 | class AiModel(nn.Module):
 54 |     """
 55 |     The LLM Detect AI Generated Text Model
 56 |     """
 57 | 
 58 |     def __init__(self, cfg, device):
 59 |         print("initializing the Rank Model...")
 60 | 
 61 |         super(AiModel, self).__init__()
 62 |         self.cfg = cfg
 63 | 
 64 |         # ----------------------------- Backbone -----------------------------------------#
 65 |         backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path)
 66 |         backbone_config.update(
 67 |             {
 68 |                 "use_cache": False,
 69 |             }
 70 |         )
 71 | 
 72 |         self.backbone = AutoModel.from_pretrained(
 73 |             self.cfg.model.backbone_path,
 74 |             config=backbone_config
 75 |         )
 76 |         if self.cfg.model.gradient_checkpointing:
 77 |             self.backbone.gradient_checkpointing_enable()
 78 | 
 79 |         self.dropout = nn.Dropout(self.cfg.model.dropout_rate)
 80 | 
 81 |         hidden_size = self.backbone.config.hidden_size
 82 |         project_dim = self.cfg.model.projection_dim
 83 |         self.pool = MeanPooling()
 84 | 
 85 |         self.projection_head = nn.Sequential(
 86 |             nn.Dropout(self.cfg.model.dropout_rate),
 87 |             nn.Linear(hidden_size, project_dim),
 88 |             nn.ReLU(),
 89 |             nn.Linear(project_dim, project_dim)
 90 |         )
 91 | 
 92 |         # loss function
 93 |         self.loss_fn = SupContrastiveLoss(
 94 |             temperature=self.cfg.model.temperature,
 95 |             device=device,
 96 |         )
 97 | 
 98 |     def encode(
 99 |         self,
100 |         input_ids,
101 |         attention_mask,
102 |     ):
103 |         outputs = self.backbone(
104 |             input_ids,
105 |             attention_mask=attention_mask,
106 |             output_hidden_states=False,
107 |         )
108 | 
109 |         encoder_layer = outputs.last_hidden_state
110 |         embeddings = self.pool(encoder_layer, attention_mask)
111 |         embeddings = self.projection_head(embeddings)
112 |         embeddings = F.normalize(embeddings, dim=-1)
113 | 
114 |         return embeddings
115 | 
116 |     def forward(self, input_ids, attention_mask, labels=None, **kwargs):
117 |         # features
118 |         embeddings = self.encode(
119 |             input_ids=input_ids,
120 |             attention_mask=attention_mask,
121 |         )  # (bs, num_features)
122 | 
123 |         # loss
124 |         loss = None
125 |         if labels is not None:
126 |             loss = self.loss_fn(embeddings, labels)
127 | 
128 |         return loss
129 | 


--------------------------------------------------------------------------------
/code/r_embed/ai_optimizer.py:
--------------------------------------------------------------------------------
 1 | from torch.optim import AdamW
 2 | 
 3 | 
 4 | def get_optimizer_grouped_parameters_no_llrd(model, cfg):
 5 | 
 6 |     no_decay = ['bias', "LayerNorm.bias", "LayerNorm.weight"]
 7 |     backbone_params = model.backbone.named_parameters()
 8 | 
 9 |     optimizer_parameters = [
10 |         {
11 |             "params": [p for n, p in model.named_parameters() if "backbone" not in n],
12 |             "lr": cfg.optimizer.lr,
13 |             "weight_decay": cfg.optimizer.weight_decay,
14 |         },
15 |         {
16 |             "params": [p for n, p in backbone_params if not any(nd in n for nd in no_decay)],
17 |             "lr": cfg.optimizer.lr,
18 |             "weight_decay": cfg.optimizer.weight_decay,
19 |         },
20 |         {
21 |             "params": [p for n, p in backbone_params if any(nd in n for nd in no_decay)],
22 |             "lr": cfg.optimizer.lr,
23 |             "weight_decay": 0.0,
24 |         },
25 |     ]
26 | 
27 |     return optimizer_parameters
28 | 
29 | 
30 | def get_optimizer_grouped_parameters_with_llrd(model, cfg):
31 |     """layerwise learning rate decay implementation
32 |     """
33 |     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
34 | 
35 |     # initialize lr for task specific layer
36 |     optimizer_grouped_parameters = [
37 |         {
38 |             "params": [p for n, p in model.named_parameters() if "backbone" not in n],
39 |             "lr": cfg.optimizer.head_lr,
40 |             "weight_decay": cfg.optimizer.weight_decay,
41 |         },
42 |     ]
43 | 
44 |     # initialize lrs for backbone layers
45 |     layers = [model.backbone.embeddings] + list(model.backbone.encoder.layer)
46 |     layers.reverse()
47 |     lr = cfg.optimizer.lr
48 | 
49 |     for layer in layers:
50 |         lr *= cfg.optimizer.llrd
51 | 
52 |         optimizer_grouped_parameters += [
53 |             {
54 |                 "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
55 |                 "weight_decay": cfg.optimizer.weight_decay,
56 |                 "lr": lr,
57 |             },
58 | 
59 |             {
60 |                 "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
61 |                 "weight_decay": 0.0,
62 |                 "lr": lr,
63 |             },
64 |         ]
65 | 
66 |     return optimizer_grouped_parameters
67 | 
68 | 
69 | def get_optimizer(model, cfg):
70 |     """optimizer for model training
71 |     """
72 | 
73 |     if cfg.optimizer.use_llrd:
74 |         optimizer_grouped_parameters = get_optimizer_grouped_parameters_with_llrd(model, cfg)
75 |     else:
76 |         optimizer_grouped_parameters = get_optimizer_grouped_parameters_no_llrd(model, cfg)
77 | 
78 |     optimizer_kwargs = {
79 |         "betas": (cfg.optimizer.beta1, cfg.optimizer.beta2),
80 |         "eps": cfg.optimizer.eps,
81 |         "lr": cfg.optimizer.lr
82 |     }
83 | 
84 |     if cfg.optimizer.use_bnb:
85 |         import bitsandbytes as bnb
86 | 
87 |         optimizer = bnb.optim.Adam8bit(
88 |             optimizer_grouped_parameters,
89 |             **optimizer_kwargs
90 |         )
91 |         return optimizer
92 |     else:
93 |         optimizer = AdamW(
94 |             optimizer_grouped_parameters,
95 |             **optimizer_kwargs
96 |         )
97 | 
98 |     return optimizer
99 | 


--------------------------------------------------------------------------------
/code/r_ranking/ai_dataset.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | 
 3 | from datasets import Dataset
 4 | from transformers import AutoTokenizer
 5 | 
 6 | # --------------- Dataset ----------------------------------------------#
 7 | 
 8 | 
 9 | class AiDataset:
10 |     """
11 |     Dataset class for LLM Detect AI Generated Text competition
12 |     """
13 | 
14 |     def __init__(self, cfg):
15 |         self.cfg = cfg
16 |         self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)
17 | 
18 |     def tokenize_function(self, examples):
19 |         tz = self.tokenizer(
20 |             examples["text"],
21 |             padding=False,
22 |             truncation=True,
23 |             max_length=self.cfg.model.max_length,
24 |             add_special_tokens=True,
25 |             return_token_type_ids=False,
26 |         )
27 | 
28 |         return tz
29 | 
30 |     def compute_input_length(self, examples):
31 |         return {"input_length": [len(x) for x in examples["input_ids"]]}
32 | 
33 |     def get_dataset(self, df):
34 |         """
35 |         Main api for creating the Science Exam dataset
36 |         :param df: input dataframe
37 |         :type df: pd.DataFrame
38 |         :return: the created dataset
39 |         :rtype: Dataset
40 |         """
41 |         df = deepcopy(df)
42 |         task_dataset = Dataset.from_pandas(df)
43 | 
44 |         task_dataset = task_dataset.map(self.tokenize_function, batched=True)
45 |         task_dataset = task_dataset.map(self.compute_input_length, batched=True)
46 | 
47 |         try:
48 |             task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"])
49 |         except Exception as e:
50 |             print(e)
51 | 
52 |         return task_dataset
53 | 


--------------------------------------------------------------------------------
/code/r_ranking/ai_loader.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import random
  4 | import time
  5 | from dataclasses import dataclass, field
  6 | 
  7 | import torch
  8 | from transformers import DataCollatorWithPadding
  9 | 
 10 | 
 11 | @dataclass
 12 | class AiCollator(DataCollatorWithPadding):
 13 |     """
 14 |     data collector for LLM Detect AI Generated Text task
 15 |     """
 16 | 
 17 |     tokenizer = None
 18 |     padding = True
 19 |     max_length = None
 20 |     pad_to_multiple_of = None
 21 |     return_tensors = "pt"
 22 | 
 23 |     def __call__(self, features):
 24 |         labels = None
 25 |         if "generated" in features[0].keys():
 26 |             labels = [feature["generated"] for feature in features]
 27 | 
 28 |         features = [
 29 |             {
 30 |                 "input_ids": feature["input_ids"],
 31 |                 "attention_mask": feature["attention_mask"],
 32 |             } for feature in features
 33 |         ]
 34 | 
 35 |         batch = self.tokenizer.pad(
 36 |             features,
 37 |             padding=self.padding,
 38 |             max_length=self.max_length,
 39 |             pad_to_multiple_of=self.pad_to_multiple_of,
 40 |             return_tensors=None,
 41 |         )
 42 | 
 43 |         if labels is not None:
 44 |             batch["labels"] = labels
 45 | 
 46 |         tensor_keys = [
 47 |             "input_ids",
 48 |             "attention_mask",
 49 |         ]
 50 | 
 51 |         for key in tensor_keys:
 52 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
 53 | 
 54 |         if labels is not None:
 55 |             batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
 56 | 
 57 |         return batch
 58 | 
 59 | 
 60 | @dataclass
 61 | class AiCollatorTrain(DataCollatorWithPadding):
 62 |     """
 63 |     data collector for LLM Detect AI Generated Text task
 64 |     """
 65 | 
 66 |     tokenizer = None
 67 |     padding = True
 68 |     max_length = None
 69 |     pad_to_multiple_of = None
 70 |     return_tensors = "pt"
 71 |     kwargs: field(default_factory=dict) = None
 72 | 
 73 |     def __post_init__(self):
 74 |         [setattr(self, k, v) for k, v in self.kwargs.items()]
 75 | 
 76 |         # mappings
 77 |         example2idx = dict()
 78 |         example_ids = self.train_ds["id"]
 79 | 
 80 |         for idx in range(len(example_ids)):
 81 |             example2idx[example_ids[idx]] = idx
 82 |         self.example2idx = example2idx
 83 | 
 84 |         seed = seed = int(time.time() * 1000) + os.getpid()  # random.randint(100, 1000)
 85 |         self.rng = random.Random(seed)
 86 | 
 87 |         print("=="*40)
 88 |         print(f"setting random seed in data collator as: {seed}")
 89 |         print("=="*40)
 90 | 
 91 |     def process_features(self, example_ids):
 92 |         updated_features = []
 93 |         for eid in example_ids:
 94 |             example = dict()
 95 | 
 96 |             example["id"] = eid
 97 |             ex_info = self.train_ds[self.example2idx[eid]]
 98 | 
 99 |             # use fields
100 |             example["input_ids"] = ex_info["input_ids"]
101 |             example["attention_mask"] = ex_info["attention_mask"]
102 |             example["generated"] = ex_info["generated"]
103 |             updated_features.append(example)
104 | 
105 |         return updated_features
106 | 
107 |     def __call__(self, features):
108 |         bs = len(features)
109 | 
110 |         if self.rng.random() < 0.8:
111 |             selected_prompt_id = self.rng.choice(self.prompt_ids)
112 |             selected_example_ids = self.rng.sample(self.prompt2ids[selected_prompt_id], k=bs)
113 |             features = self.process_features(selected_example_ids)
114 | 
115 |         labels = None
116 |         if "generated" in features[0].keys():
117 |             labels = [feature["generated"] for feature in features]
118 | 
119 |         features = [
120 |             {
121 |                 "input_ids": feature["input_ids"],
122 |                 "attention_mask": feature["attention_mask"],
123 |             } for feature in features
124 |         ]
125 | 
126 |         batch = self.tokenizer.pad(
127 |             features,
128 |             padding=self.padding,
129 |             max_length=self.max_length,
130 |             pad_to_multiple_of=self.pad_to_multiple_of,
131 |             return_tensors=None,
132 |         )
133 | 
134 |         if labels is not None:
135 |             batch["labels"] = labels
136 | 
137 |         tensor_keys = [
138 |             "input_ids",
139 |             "attention_mask",
140 |         ]
141 | 
142 |         for key in tensor_keys:
143 |             batch[key] = torch.tensor(batch[key], dtype=torch.int64)
144 | 
145 |         if labels is not None:
146 |             batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
147 | 
148 |         return batch
149 | 
150 | # ---
151 | 
152 | 
153 | def show_batch(batch, tokenizer, n_examples=16, task='training', print_fn=print):
154 |     print_fn("##"*40)
155 |     bs = batch['input_ids'].size(0)
156 |     print_fn(f"batch size: {bs}")
157 | 
158 |     print_fn(f"shape of input_ids: {batch['input_ids'].shape}")
159 | 
160 |     n_examples = min(n_examples, bs)
161 |     print_fn(f"Showing {n_examples} from a {task} batch...")
162 | 
163 |     print_fn("\n\n")
164 |     for idx in range(n_examples):
165 |         print_fn(f"Example {idx+1}")
166 |         print_fn(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
167 |         # print("\n\n")
168 | 
169 |         if "infer" not in task.lower():
170 |             print_fn("--"*20)
171 |             labels = batch['labels'][idx]
172 |             print_fn(f"Label: {labels}")
173 |         print_fn('=='*40)
174 |     print_fn("##"*40)
175 | 


--------------------------------------------------------------------------------
/code/r_ranking/ai_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import torch.utils.checkpoint
  5 | from transformers import AutoConfig, AutoModel
  6 | 
  7 | 
  8 | def get_ranking_loss(logits, labels, margin=0.7):
  9 |     logits = torch.sigmoid(logits)
 10 |     labels1 = labels.unsqueeze(1)
 11 |     labels2 = labels.unsqueeze(0)
 12 | 
 13 |     logits1 = logits.unsqueeze(1)
 14 |     logits2 = logits.unsqueeze(0)
 15 | 
 16 |     y_ij = torch.sign(labels1 - labels2)
 17 |     r_ij = logits1 - logits2
 18 | 
 19 |     loss = torch.clamp(-r_ij*y_ij + margin, min=0.0).mean()
 20 |     return loss
 21 | 
 22 | 
 23 | class MeanPooling(nn.Module):
 24 |     def __init__(self):
 25 |         super(MeanPooling, self).__init__()
 26 | 
 27 |     def forward(self, last_hidden_state, attention_mask):
 28 |         input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
 29 |         sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
 30 |         sum_mask = input_mask_expanded.sum(1)
 31 |         sum_mask = torch.clamp(sum_mask, min=1e-9)
 32 |         mean_embeddings = sum_embeddings / sum_mask
 33 |         return mean_embeddings
 34 | 
 35 | 
 36 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 37 | # Rank Model
 38 | # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 39 | 
 40 | 
 41 | class AiModel(nn.Module):
 42 |     """
 43 |     The LLM Detect AI Generated Text Model
 44 |     """
 45 | 
 46 |     def __init__(self, cfg, device):
 47 |         print("initializing the Rank Model...")
 48 | 
 49 |         super(AiModel, self).__init__()
 50 |         self.cfg = cfg
 51 | 
 52 |         # ----------------------------- Backbone -----------------------------------------#
 53 |         backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path)
 54 |         backbone_config.update(
 55 |             {
 56 |                 "use_cache": False,
 57 |             }
 58 |         )
 59 | 
 60 |         self.backbone = AutoModel.from_pretrained(
 61 |             self.cfg.model.backbone_path,
 62 |             config=backbone_config
 63 |         )
 64 |         if self.cfg.model.gradient_checkpointing:
 65 |             self.backbone.gradient_checkpointing_enable()
 66 | 
 67 |         self.dropout = nn.Dropout(self.cfg.model.dropout_rate)
 68 | 
 69 |         # classifier
 70 |         num_features = self.backbone.config.hidden_size
 71 |         self.classifier = nn.Linear(num_features, 1)
 72 | 
 73 |         self.pool = MeanPooling()
 74 | 
 75 |     def encode(
 76 |         self,
 77 |         input_ids,
 78 |         attention_mask,
 79 |     ):
 80 |         outputs = self.backbone(
 81 |             input_ids,
 82 |             attention_mask=attention_mask,
 83 |             output_hidden_states=False,
 84 |         )
 85 | 
 86 |         encoder_layer = outputs.last_hidden_state
 87 |         embeddings = self.pool(encoder_layer, attention_mask)
 88 | 
 89 |         return embeddings
 90 | 
 91 |     def forward(self, input_ids, attention_mask, labels=None, **kwargs):
 92 |         # features
 93 |         features = self.encode(
 94 |             input_ids=input_ids,
 95 |             attention_mask=attention_mask,
 96 |         )
 97 |         features = self.dropout(features)
 98 |         logits = self.classifier(features).reshape(-1)
 99 | 
100 |         # loss
101 |         loss = None
102 |         labels = labels.reshape(-1)
103 |         if labels is not None:
104 |             loss = get_ranking_loss(logits, labels)
105 | 
106 |         return logits, loss
107 | 


--------------------------------------------------------------------------------
/code/r_ranking/ai_optimizer.py:
--------------------------------------------------------------------------------
 1 | from torch.optim import AdamW
 2 | 
 3 | 
 4 | def get_optimizer_grouped_parameters_no_llrd(model, cfg):
 5 | 
 6 |     no_decay = ['bias', "LayerNorm.bias", "LayerNorm.weight"]
 7 |     backbone_params = model.backbone.named_parameters()
 8 | 
 9 |     optimizer_parameters = [
10 |         {
11 |             "params": [p for n, p in model.named_parameters() if "backbone" not in n],
12 |             "lr": cfg.optimizer.lr,
13 |             "weight_decay": cfg.optimizer.weight_decay,
14 |         },
15 |         {
16 |             "params": [p for n, p in backbone_params if not any(nd in n for nd in no_decay)],
17 |             "lr": cfg.optimizer.lr,
18 |             "weight_decay": cfg.optimizer.weight_decay,
19 |         },
20 |         {
21 |             "params": [p for n, p in backbone_params if any(nd in n for nd in no_decay)],
22 |             "lr": cfg.optimizer.lr,
23 |             "weight_decay": 0.0,
24 |         },
25 |     ]
26 | 
27 |     return optimizer_parameters
28 | 
29 | 
30 | def get_optimizer_grouped_parameters_with_llrd(model, cfg):
31 |     """layerwise learning rate decay implementation
32 |     """
33 |     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
34 | 
35 |     # initialize lr for task specific layer
36 |     optimizer_grouped_parameters = [
37 |         {
38 |             "params": [p for n, p in model.named_parameters() if "backbone" not in n],
39 |             "lr": cfg.optimizer.head_lr,
40 |             "weight_decay": cfg.optimizer.weight_decay,
41 |         },
42 |     ]
43 | 
44 |     # initialize lrs for backbone layers
45 |     layers = [model.backbone.embeddings] + list(model.backbone.encoder.layer)
46 |     layers.reverse()
47 |     lr = cfg.optimizer.lr
48 | 
49 |     for layer in layers:
50 |         lr *= cfg.optimizer.llrd
51 | 
52 |         optimizer_grouped_parameters += [
53 |             {
54 |                 "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
55 |                 "weight_decay": cfg.optimizer.weight_decay,
56 |                 "lr": lr,
57 |             },
58 | 
59 |             {
60 |                 "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
61 |                 "weight_decay": 0.0,
62 |                 "lr": lr,
63 |             },
64 |         ]
65 | 
66 |     return optimizer_grouped_parameters
67 | 
68 | 
69 | def get_optimizer(model, cfg):
70 |     """optimizer for model training
71 |     """
72 | 
73 |     if cfg.optimizer.use_llrd:
74 |         optimizer_grouped_parameters = get_optimizer_grouped_parameters_with_llrd(model, cfg)
75 |     else:
76 |         optimizer_grouped_parameters = get_optimizer_grouped_parameters_no_llrd(model, cfg)
77 | 
78 |     optimizer_kwargs = {
79 |         "betas": (cfg.optimizer.beta1, cfg.optimizer.beta2),
80 |         "eps": cfg.optimizer.eps,
81 |         "lr": cfg.optimizer.lr
82 |     }
83 | 
84 |     if cfg.optimizer.use_bnb:
85 |         import bitsandbytes as bnb
86 | 
87 |         optimizer = bnb.optim.Adam8bit(
88 |             optimizer_grouped_parameters,
89 |             **optimizer_kwargs
90 |         )
91 |         return optimizer
92 |     else:
93 |         optimizer = AdamW(
94 |             optimizer_grouped_parameters,
95 |             **optimizer_kwargs
96 |         )
97 | 
98 |     return optimizer
99 | 


--------------------------------------------------------------------------------
/code/train_r_clm.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import random
  4 | import time
  5 | 
  6 | import datasets
  7 | import hydra
  8 | import numpy as np
  9 | import pandas as pd
 10 | import torch
 11 | import transformers
 12 | from accelerate import Accelerator
 13 | from accelerate.logging import get_logger
 14 | from accelerate.utils import set_seed
 15 | from omegaconf import OmegaConf
 16 | from peft import LoraConfig, TaskType, get_peft_model
 17 | from torch.utils.data import DataLoader
 18 | from tqdm.auto import tqdm
 19 | from transformers import (AutoModelForCausalLM, BitsAndBytesConfig,
 20 |                           get_cosine_schedule_with_warmup)
 21 | 
 22 | try:
 23 |     from r_clm.ai_dataset import AiDataset
 24 |     from r_clm.ai_loader import AiCollator, show_batch
 25 |     from r_clm.ai_optimizer import get_optimizer
 26 |     from utils.train_utils import AverageMeter, as_minutes, get_lr
 27 | 
 28 | 
 29 | except Exception as e:
 30 |     print(e)
 31 |     raise ImportError
 32 | 
 33 | logger = get_logger(__name__)
 34 | 
 35 | 
 36 | def run_evaluation(accelerator, model, valid_dl):
 37 |     model.eval()
 38 | 
 39 |     all_losses = []
 40 | 
 41 |     progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process)
 42 | 
 43 |     for step, batch in enumerate(valid_dl):
 44 |         with torch.no_grad():
 45 |             outputs = model(**batch)
 46 | 
 47 |         loss = outputs.loss
 48 |         batch_losses = accelerator.gather_for_metrics(loss)
 49 |         batch_losses = batch_losses.cpu().numpy().tolist()
 50 | 
 51 |         all_losses.extend(batch_losses)
 52 |         progress_bar.update(1)
 53 |     progress_bar.close()
 54 | 
 55 |     # compute metric
 56 |     eval_dict = dict()  # compute_metrics(all_predictions, all_truths)
 57 |     eval_dict['valid_loss'] = np.mean(all_losses)
 58 | 
 59 |     return eval_dict
 60 | 
 61 | 
 62 | @hydra.main(version_base=None, config_path="../conf/r_clm", config_name="conf_r_clm")
 63 | def run_training(cfg):
 64 |     # ------- Accelerator ---------------------------------------------------------------#
 65 |     if cfg.use_wandb:
 66 |         accelerator = Accelerator(
 67 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 68 |             log_with="wandb",
 69 |         )
 70 | 
 71 |         accelerator.init_trackers(
 72 |             cfg.wandb.project,
 73 |             config=OmegaConf.to_container(cfg, resolve=True),
 74 |         )
 75 | 
 76 |     else:
 77 |         accelerator = Accelerator(
 78 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 79 |         )
 80 | 
 81 |     cfg_dict = OmegaConf.to_container(cfg, resolve=True)
 82 | 
 83 |     # Make one log on every process with the configuration for debugging.
 84 |     logging.basicConfig(
 85 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
 86 |         datefmt="%m/%d/%Y %H:%M:%S",
 87 |         level=logging.INFO,
 88 |     )
 89 |     logger.info(accelerator.state, main_process_only=False)
 90 | 
 91 |     # print_line = partial(print_line, accelerator)
 92 | 
 93 |     def print_line():
 94 |         prefix, unit, suffix = "#", "~~", "#"
 95 |         accelerator.print(prefix + unit*50 + suffix)
 96 | 
 97 |     if accelerator.is_local_main_process:
 98 |         datasets.utils.logging.set_verbosity_warning()
 99 |         transformers.utils.logging.set_verbosity_info()
100 |     else:
101 |         datasets.utils.logging.set_verbosity_error()
102 |         transformers.utils.logging.set_verbosity_error()
103 | 
104 |     # ------- Runtime Configs -----------------------------------------------------------#
105 |     print_line()
106 |     accelerator.print(f"setting seed: {cfg.seed}")
107 |     set_seed(cfg.seed)
108 | 
109 |     if accelerator.is_main_process:
110 |         os.makedirs(cfg.outputs.model_dir, exist_ok=True)
111 |     print_line()
112 | 
113 |     # ------- load data -----------------------------------------------------------------#
114 |     print_line()
115 | 
116 |     # load query dataframe ---
117 |     essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"})
118 |     essay_df = essay_df[~essay_df['text'].isna()].copy()
119 |     essay_df = essay_df.reset_index(drop=True)
120 | 
121 |     # ------- Data Split ----------------------------------------------------------------#
122 |     # sample validation data
123 |     rng = random.Random(cfg.seed)
124 |     essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.98 else 'valid')
125 |     train_df = essay_df[essay_df['fold'] == 'train'].copy()
126 |     valid_df = essay_df[essay_df['fold'] == 'valid'].copy()
127 | 
128 |     train_df = train_df.reset_index(drop=True)
129 |     valid_df = valid_df.reset_index(drop=True)
130 | 
131 |     accelerator.print(f"shape of train data: {train_df.shape}")
132 |     accelerator.print(f"{train_df.head()}")
133 |     accelerator.print(f"shape of validation data: {valid_df.shape}")
134 | 
135 |     with accelerator.main_process_first():
136 |         dataset_creator = AiDataset(cfg)
137 | 
138 |         train_ds = dataset_creator.get_dataset(train_df)
139 |         valid_ds = dataset_creator.get_dataset(valid_df)
140 | 
141 |     tokenizer = dataset_creator.tokenizer
142 | 
143 |     train_ds.set_format(
144 |         type=None,
145 |         columns=[
146 |             'input_ids',
147 |             'attention_mask',
148 |             'labels'
149 |         ]
150 |     )
151 | 
152 |     # valid_ds = valid_ds.sort("input_length")
153 | 
154 |     valid_ds.set_format(
155 |         type=None,
156 |         columns=[
157 |             'input_ids',
158 |             'attention_mask',
159 |             'labels'
160 |         ]
161 |     )
162 |     # valid_ids = valid_df["id"]  # .tolist()
163 | 
164 |     data_collator = AiCollator(
165 |         tokenizer=tokenizer,
166 |         pad_to_multiple_of=64
167 |     )
168 | 
169 |     train_dl = DataLoader(
170 |         train_ds,
171 |         batch_size=cfg.train_params.per_device_train_batch_size,
172 |         shuffle=True,
173 |         collate_fn=data_collator,
174 |     )
175 | 
176 |     valid_dl = DataLoader(
177 |         valid_ds,
178 |         batch_size=cfg.train_params.per_device_eval_batch_size,
179 |         shuffle=False,
180 |         collate_fn=data_collator,
181 |     )
182 | 
183 |     accelerator.print("data preparation done...")
184 |     print_line()
185 | 
186 |     # --- show batch -------------------------------------------------------------------#
187 |     print_line()
188 | 
189 |     for b in train_dl:
190 |         break
191 |     show_batch(b, tokenizer, task='training', print_fn=accelerator.print)
192 | 
193 |     print_line()
194 | 
195 |     for b in valid_dl:
196 |         break
197 |     show_batch(b, tokenizer, task='training', print_fn=accelerator.print)
198 | 
199 |     # --- model -------------------------------------------------------------------------#
200 |     print_line()
201 | 
202 |     # Note: avoid quantization for smaller models (e.g. opt-125m, bloom-560m) for training stability --
203 |     bnb_config = BitsAndBytesConfig(
204 |         load_in_4bit=True,
205 |         bnb_4bit_quant_type="nf4",
206 |         bnb_4bit_use_double_quant=True,
207 |         bnb_4bit_compute_dtype=torch.bfloat16,
208 |     )
209 | 
210 |     base_model = AutoModelForCausalLM.from_pretrained(
211 |         cfg.model.backbone_path,
212 |         quantization_config=bnb_config,
213 |     )
214 | 
215 |     base_model.config.pretraining_tp = 1
216 | 
217 |     # lora ---
218 |     peft_config = LoraConfig(
219 |         r=cfg.model.lora.r,
220 |         lora_alpha=cfg.model.lora.lora_alpha,
221 |         lora_dropout=cfg.model.lora.lora_dropout,
222 |         bias="none",
223 |         task_type=TaskType.CAUSAL_LM,
224 |         inference_mode=False,
225 |         target_modules=cfg_dict["model"]["lora"]["target_modules"],
226 |     )
227 | 
228 |     model = get_peft_model(base_model, peft_config)
229 |     model.print_trainable_parameters()
230 |     model.config.use_cache = False
231 | 
232 |     # --- optimizer ---------------------------------------------------------------------#
233 |     print_line()
234 |     optimizer = get_optimizer(cfg, model, print_fn=accelerator.print)
235 | 
236 |     # ------- Prepare -------------------------------------------------------------------#
237 | 
238 |     model, optimizer, train_dl, valid_dl = accelerator.prepare(
239 |         model, optimizer, train_dl, valid_dl
240 |     )
241 | 
242 |     # ------- Scheduler -----------------------------------------------------------------#
243 |     print_line()
244 |     num_epochs = cfg.train_params.num_train_epochs
245 |     grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps
246 |     warmup_pct = cfg.train_params.warmup_pct
247 | 
248 |     num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps
249 |     num_training_steps = num_epochs * num_update_steps_per_epoch
250 |     num_warmup_steps = int(warmup_pct*num_training_steps)
251 | 
252 |     accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
253 |     accelerator.print(f"# training steps: {num_training_steps}")
254 |     accelerator.print(f"# warmup steps: {num_warmup_steps}")
255 | 
256 |     scheduler = get_cosine_schedule_with_warmup(
257 |         optimizer=optimizer,
258 |         num_warmup_steps=num_warmup_steps,
259 |         num_training_steps=num_training_steps
260 |     )
261 | 
262 |     # scheduler = accelerator.prepare(scheduler)
263 | 
264 |     # ------- training setup --------------------------------------------------------------#
265 |     best_lb = 1e6
266 |     patience_tracker = 0
267 |     current_iteration = 0
268 | 
269 |     # ------- training  --------------------------------------------------------------------#
270 |     start_time = time.time()
271 |     accelerator.wait_for_everyone()
272 | 
273 |     for epoch in range(num_epochs):
274 |         # close and reset progress bar
275 |         if epoch != 0:
276 |             progress_bar.close()
277 | 
278 |         progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process)
279 |         loss_meter = AverageMeter()
280 | 
281 |         # Training ------
282 |         model.train()
283 |         for step, batch in enumerate(train_dl):
284 |             with accelerator.accumulate(model):  # gives sync vs no sync context manager
285 |                 outputs = model(**batch)
286 |                 loss = outputs.loss
287 |                 accelerator.backward(loss)
288 | 
289 |                 if accelerator.sync_gradients:
290 |                     accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm)
291 | 
292 |                     optimizer.step()
293 |                     scheduler.step()
294 |                     optimizer.zero_grad()
295 | 
296 |                 # check if loss.item() is okay for TPU
297 |                 # happening on all processes - values of loss meter in different processes are different
298 |                 loss_meter.update(loss.item())  # tracks loss in each batch, no accumulation
299 | 
300 |             if accelerator.sync_gradients:
301 |                 progress_bar.set_description(
302 |                     f"STEP: {current_iteration+1:5}/{num_training_steps:5}. "
303 |                     f"LR: {get_lr(optimizer):.4f}. "
304 |                     f"Loss: {loss_meter.avg:.4f}. "
305 |                 )
306 | 
307 |                 progress_bar.update(1)
308 |                 current_iteration += 1
309 | 
310 |                 if cfg.use_wandb:
311 |                     accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration)  # only on main process
312 |                     accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration)
313 | 
314 |             # >--------------------------------------------------|
315 |             # >-- evaluation ------------------------------------|
316 |             # >--------------------------------------------------|
317 | 
318 |             if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0):
319 |                 # set model in eval mode
320 |                 model.eval()
321 |                 scores_dict = run_evaluation(accelerator, model, valid_dl)
322 |                 lb = scores_dict["valid_loss"]
323 | 
324 |                 print_line()
325 |                 et = as_minutes(time.time()-start_time)
326 |                 accelerator.print(
327 |                     f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}"
328 |                 )
329 |                 print_line()
330 |                 accelerator.print(f">>> Current Valid Loss = {round(lb, 4)}")
331 | 
332 |                 print_line()
333 | 
334 |                 is_best = False
335 |                 if lb <= best_lb:
336 |                     best_lb = lb
337 |                     is_best = True
338 |                     patience_tracker = 0
339 | 
340 |                     # -----
341 |                     best_dict = dict()
342 |                     for k, v in scores_dict.items():
343 |                         best_dict[f"{k}_at_best"] = v
344 |                 else:
345 |                     patience_tracker += 1
346 | 
347 |                 # saving -----
348 |                 accelerator.wait_for_everyone()
349 |                 unwrapped_model = accelerator.unwrap_model(model)
350 | 
351 |                 unwrapped_model.save_pretrained(
352 |                     f"{cfg.outputs.model_dir}/last",
353 |                     state_dict=accelerator.get_state_dict(model),
354 |                     save_function=accelerator.save,
355 |                 )
356 | 
357 |                 if accelerator.is_main_process:
358 |                     tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/last")
359 | 
360 |                 # logging ----
361 |                 if cfg.use_wandb:
362 |                     accelerator.log({"lb": lb}, step=current_iteration)
363 |                     accelerator.log({"best_lb": best_lb}, step=current_iteration)
364 | 
365 |                 # -- post eval
366 |                 model.train()
367 |                 torch.cuda.empty_cache()
368 |                 print_line()
369 | 
370 |                 # early stopping ----
371 |                 if patience_tracker >= cfg.train_params.patience:
372 |                     print("stopping early")
373 |                     model.eval()
374 |                     accelerator.end_training()
375 |                     return
376 | 
377 |     # --- end training
378 |     accelerator.end_training()
379 | 
380 | 
381 | if __name__ == "__main__":
382 |     run_training()
383 | 


--------------------------------------------------------------------------------
/code/train_r_clm_from_scratch.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import random
  4 | import time
  5 | from copy import deepcopy
  6 | from dataclasses import asdict, dataclass, field
  7 | from functools import partial
  8 | 
  9 | import bitsandbytes as bnb
 10 | import datasets
 11 | import hydra
 12 | import numpy as np
 13 | import pandas as pd
 14 | import torch
 15 | import torch.nn as nn
 16 | import transformers
 17 | from accelerate import Accelerator
 18 | from accelerate.logging import get_logger
 19 | from accelerate.utils import set_seed
 20 | from omegaconf import OmegaConf
 21 | from peft import (LoraConfig, TaskType, get_peft_model,
 22 |                   prepare_model_for_kbit_training)
 23 | from torch.utils.data import DataLoader
 24 | from tqdm.auto import tqdm
 25 | from transformers import (AutoModelForCausalLM, BitsAndBytesConfig,
 26 |                           get_cosine_schedule_with_warmup)
 27 | 
 28 | try:
 29 |     from r_clm.ai_dataset import AiDataset
 30 |     from r_clm.ai_loader import AiCollator, show_batch
 31 |     from r_clm.ai_optimizer import get_optimizer
 32 |     from utils.train_utils import AverageMeter, as_minutes, get_lr
 33 | 
 34 | 
 35 | except Exception as e:
 36 |     print(e)
 37 |     raise ImportError
 38 | 
 39 | logger = get_logger(__name__)
 40 | 
 41 | 
 42 | def run_evaluation(accelerator, model, valid_dl):
 43 |     model.eval()
 44 | 
 45 |     all_losses = []
 46 | 
 47 |     progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process)
 48 | 
 49 |     for step, batch in enumerate(valid_dl):
 50 |         with torch.no_grad():
 51 |             outputs = model(**batch)
 52 | 
 53 |         loss = outputs.loss
 54 |         batch_losses = accelerator.gather_for_metrics(loss)
 55 |         batch_losses = batch_losses.cpu().numpy().tolist()
 56 | 
 57 |         all_losses.extend(batch_losses)
 58 |         progress_bar.update(1)
 59 |     progress_bar.close()
 60 | 
 61 |     # compute metric
 62 |     eval_dict = dict()  # compute_metrics(all_predictions, all_truths)
 63 |     eval_dict['valid_loss'] = np.mean(all_losses)
 64 | 
 65 |     return eval_dict
 66 | 
 67 | 
 68 | @hydra.main(version_base=None, config_path="../conf/r_clm", config_name="conf_r_clm")
 69 | def run_training(cfg):
 70 |     # ------- Accelerator ---------------------------------------------------------------#
 71 |     if cfg.use_wandb:
 72 |         accelerator = Accelerator(
 73 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 74 |             log_with="wandb",
 75 |         )
 76 | 
 77 |         accelerator.init_trackers(
 78 |             cfg.wandb.project,
 79 |             config=OmegaConf.to_container(cfg, resolve=True),
 80 |         )
 81 | 
 82 |     else:
 83 |         accelerator = Accelerator(
 84 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 85 |         )
 86 | 
 87 |     cfg_dict = OmegaConf.to_container(cfg, resolve=True)
 88 | 
 89 |     # Make one log on every process with the configuration for debugging.
 90 |     logging.basicConfig(
 91 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
 92 |         datefmt="%m/%d/%Y %H:%M:%S",
 93 |         level=logging.INFO,
 94 |     )
 95 |     logger.info(accelerator.state, main_process_only=False)
 96 | 
 97 |     # print_line = partial(print_line, accelerator)
 98 | 
 99 |     def print_line():
100 |         prefix, unit, suffix = "#", "~~", "#"
101 |         accelerator.print(prefix + unit*50 + suffix)
102 | 
103 |     if accelerator.is_local_main_process:
104 |         datasets.utils.logging.set_verbosity_warning()
105 |         transformers.utils.logging.set_verbosity_info()
106 |     else:
107 |         datasets.utils.logging.set_verbosity_error()
108 |         transformers.utils.logging.set_verbosity_error()
109 | 
110 |     # ------- Runtime Configs -----------------------------------------------------------#
111 |     print_line()
112 |     accelerator.print(f"setting seed: {cfg.seed}")
113 |     set_seed(cfg.seed)
114 | 
115 |     if accelerator.is_main_process:
116 |         os.makedirs(cfg.outputs.model_dir, exist_ok=True)
117 |     print_line()
118 | 
119 |     # ------- load data -----------------------------------------------------------------#
120 |     print_line()
121 | 
122 |     # load query dataframe ---
123 |     essay_df = pd.read_csv(cfg.input_data_path).rename(columns={"full_text": "text"})
124 |     essay_df = essay_df[~essay_df['text'].isna()].copy()
125 |     essay_df = essay_df.reset_index(drop=True)
126 | 
127 |     # ------- Data Split ----------------------------------------------------------------#
128 |     # sample validation data
129 |     rng = random.Random(cfg.seed)
130 |     essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.98 else 'valid')
131 |     train_df = essay_df[essay_df['fold'] == 'train'].copy()
132 |     valid_df = essay_df[essay_df['fold'] == 'valid'].copy()
133 | 
134 |     train_df = train_df.reset_index(drop=True)
135 |     valid_df = valid_df.reset_index(drop=True)
136 | 
137 |     accelerator.print(f"shape of train data: {train_df.shape}")
138 |     accelerator.print(f"{train_df.head()}")
139 |     accelerator.print(f"shape of validation data: {valid_df.shape}")
140 | 
141 |     with accelerator.main_process_first():
142 |         dataset_creator = AiDataset(cfg)
143 | 
144 |         train_ds = dataset_creator.get_dataset(train_df)
145 |         valid_ds = dataset_creator.get_dataset(valid_df)
146 | 
147 |     tokenizer = dataset_creator.tokenizer
148 | 
149 |     train_ds.set_format(
150 |         type=None,
151 |         columns=[
152 |             'input_ids',
153 |             'attention_mask',
154 |             'labels'
155 |         ]
156 |     )
157 | 
158 |     # valid_ds = valid_ds.sort("input_length")
159 | 
160 |     valid_ds.set_format(
161 |         type=None,
162 |         columns=[
163 |             'input_ids',
164 |             'attention_mask',
165 |             'labels'
166 |         ]
167 |     )
168 |     # valid_ids = valid_df["id"]  # .tolist()
169 | 
170 |     data_collator = AiCollator(
171 |         tokenizer=tokenizer,
172 |         pad_to_multiple_of=64
173 |     )
174 | 
175 |     train_dl = DataLoader(
176 |         train_ds,
177 |         batch_size=cfg.train_params.per_device_train_batch_size,
178 |         shuffle=True,
179 |         collate_fn=data_collator,
180 |     )
181 | 
182 |     valid_dl = DataLoader(
183 |         valid_ds,
184 |         batch_size=cfg.train_params.per_device_eval_batch_size,
185 |         shuffle=False,
186 |         collate_fn=data_collator,
187 |     )
188 | 
189 |     accelerator.print("data preparation done...")
190 |     print_line()
191 | 
192 |     # --- show batch -------------------------------------------------------------------#
193 |     print_line()
194 | 
195 |     for b in train_dl:
196 |         break
197 |     show_batch(b, tokenizer, task='training', print_fn=accelerator.print)
198 | 
199 |     print_line()
200 | 
201 |     for b in valid_dl:
202 |         break
203 |     show_batch(b, tokenizer, task='training', print_fn=accelerator.print)
204 | 
205 |     # --- model -------------------------------------------------------------------------#
206 |     print_line()
207 | 
208 |     model = AutoModelForCausalLM.from_pretrained(
209 |         cfg.model.backbone_path,
210 |         torch_dtype=torch.bfloat16,
211 |     )
212 | 
213 |     # model.config.pretraining_tp = 1
214 |     # model.print_trainable_parameters()
215 |     model.config.use_cache = False
216 | 
217 |     # --- optimizer ---------------------------------------------------------------------#
218 |     print_line()
219 |     optimizer = get_optimizer(cfg, model, print_fn=accelerator.print)
220 | 
221 |     # ------- Prepare -------------------------------------------------------------------#
222 | 
223 |     model, optimizer, train_dl, valid_dl = accelerator.prepare(
224 |         model, optimizer, train_dl, valid_dl
225 |     )
226 | 
227 |     # ------- Scheduler -----------------------------------------------------------------#
228 |     print_line()
229 |     num_epochs = cfg.train_params.num_train_epochs
230 |     grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps
231 |     warmup_pct = cfg.train_params.warmup_pct
232 | 
233 |     num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps
234 |     num_training_steps = num_epochs * num_update_steps_per_epoch
235 |     num_warmup_steps = int(warmup_pct*num_training_steps)
236 | 
237 |     accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
238 |     accelerator.print(f"# training steps: {num_training_steps}")
239 |     accelerator.print(f"# warmup steps: {num_warmup_steps}")
240 | 
241 |     scheduler = get_cosine_schedule_with_warmup(
242 |         optimizer=optimizer,
243 |         num_warmup_steps=num_warmup_steps,
244 |         num_training_steps=num_training_steps
245 |     )
246 | 
247 |     # ------- training setup --------------------------------------------------------------#
248 |     best_lb = 1e6
249 |     patience_tracker = 0
250 |     current_iteration = 0
251 | 
252 |     # ------- training  --------------------------------------------------------------------#
253 |     start_time = time.time()
254 |     accelerator.wait_for_everyone()
255 | 
256 |     for epoch in range(num_epochs):
257 |         # close and reset progress bar
258 |         if epoch != 0:
259 |             progress_bar.close()
260 | 
261 |         progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process)
262 |         loss_meter = AverageMeter()
263 | 
264 |         # Training ------
265 |         model.train()
266 |         for step, batch in enumerate(train_dl):
267 |             with accelerator.accumulate(model):  # gives sync vs no sync context manager
268 |                 outputs = model(**batch)
269 |                 loss = outputs.loss
270 |                 accelerator.backward(loss)
271 | 
272 |                 if accelerator.sync_gradients:
273 |                     accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm)
274 | 
275 |                     optimizer.step()
276 |                     scheduler.step()
277 |                     optimizer.zero_grad()
278 | 
279 |                 # check if loss.item() is okay for TPU
280 |                 # happening on all processes - values of loss meter in different processes are different
281 |                 loss_meter.update(loss.item())  # tracks loss in each batch, no accumulation
282 | 
283 |             if accelerator.sync_gradients:
284 |                 progress_bar.set_description(
285 |                     f"STEP: {current_iteration+1:5}/{num_training_steps:5}. "
286 |                     f"LR: {get_lr(optimizer):.4f}. "
287 |                     f"Loss: {loss_meter.avg:.4f}. "
288 |                 )
289 | 
290 |                 progress_bar.update(1)
291 |                 current_iteration += 1
292 | 
293 |                 if cfg.use_wandb:
294 |                     accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration)  # only on main process
295 |                     accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration)
296 | 
297 |             # >--------------------------------------------------|
298 |             # >-- evaluation ------------------------------------|
299 |             # >--------------------------------------------------|
300 | 
301 |             if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0):
302 |                 # set model in eval mode
303 |                 model.eval()
304 |                 scores_dict = run_evaluation(accelerator, model, valid_dl)
305 |                 lb = scores_dict["valid_loss"]
306 | 
307 |                 print_line()
308 |                 et = as_minutes(time.time()-start_time)
309 |                 accelerator.print(
310 |                     f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}"
311 |                 )
312 |                 print_line()
313 |                 accelerator.print(f">>> Current Valid Loss = {round(lb, 4)}")
314 | 
315 |                 print_line()
316 | 
317 |                 is_best = False
318 |                 if lb <= best_lb:
319 |                     best_lb = lb
320 |                     is_best = True
321 |                     patience_tracker = 0
322 | 
323 |                     # -----
324 |                     best_dict = dict()
325 |                     for k, v in scores_dict.items():
326 |                         best_dict[f"{k}_at_best"] = v
327 |                 else:
328 |                     patience_tracker += 1
329 | 
330 |                 # saving -----
331 |                 accelerator.wait_for_everyone()
332 |                 unwrapped_model = accelerator.unwrap_model(model)
333 | 
334 |                 unwrapped_model.save_pretrained(
335 |                     f"{cfg.outputs.model_dir}/last",
336 |                     state_dict=accelerator.get_state_dict(model),
337 |                     save_function=accelerator.save,
338 |                 )
339 | 
340 |                 if accelerator.is_main_process:
341 |                     tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/last")
342 | 
343 |                 # logging ----
344 |                 if cfg.use_wandb:
345 |                     accelerator.log({"lb": lb}, step=current_iteration)
346 |                     accelerator.log({"best_lb": best_lb}, step=current_iteration)
347 | 
348 |                 # -- post eval
349 |                 model.train()
350 |                 torch.cuda.empty_cache()
351 |                 print_line()
352 | 
353 |                 # early stopping ----
354 |                 if patience_tracker >= cfg.train_params.patience:
355 |                     print("stopping early")
356 |                     model.eval()
357 |                     accelerator.end_training()
358 |                     return
359 | 
360 |     # --- end training
361 |     accelerator.end_training()
362 | 
363 | 
364 | if __name__ == "__main__":
365 |     run_training()
366 | 


--------------------------------------------------------------------------------
/code/train_r_detect.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import random
  4 | import time
  5 | from copy import deepcopy
  6 | from dataclasses import asdict, dataclass, field
  7 | from functools import partial
  8 | 
  9 | import bitsandbytes as bnb
 10 | import datasets
 11 | import hydra
 12 | import pandas as pd
 13 | import torch
 14 | import transformers
 15 | from accelerate import Accelerator
 16 | from accelerate.logging import get_logger
 17 | from accelerate.utils import set_seed
 18 | from omegaconf import OmegaConf
 19 | from peft import (LoraConfig, TaskType, get_peft_model,
 20 |                   prepare_model_for_kbit_training)
 21 | from torch.utils.data import DataLoader
 22 | from tqdm.auto import tqdm
 23 | from transformers import (AutoModelForSequenceClassification,
 24 |                           BitsAndBytesConfig, get_cosine_schedule_with_warmup)
 25 | 
 26 | try:
 27 |     from r_detect.ai_dataset import AiDataset
 28 |     from r_detect.ai_loader import AiCollator, AiCollatorTrain, show_batch
 29 |     from r_detect.ai_model import (LlamaForDetectAI, MistralForDetectAI,
 30 |                                    PhiForDetectAI)
 31 |     from r_detect.ai_optimizer import get_optimizer
 32 |     from utils.metric_utils import compute_metrics
 33 |     from utils.train_utils import AverageMeter, as_minutes, get_lr
 34 | 
 35 | 
 36 | except Exception as e:
 37 |     print(e)
 38 |     raise ImportError
 39 | 
 40 | logger = get_logger(__name__)
 41 | 
 42 | 
 43 | def run_evaluation(accelerator, model, valid_dl, valid_ids):
 44 |     model.eval()
 45 | 
 46 |     all_predictions = []
 47 |     all_truths = []
 48 | 
 49 |     progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process)
 50 | 
 51 |     for step, batch in enumerate(valid_dl):
 52 |         with torch.no_grad():
 53 |             outputs = model(**batch)
 54 | 
 55 |         logits = outputs.logits
 56 | 
 57 |         predictions = torch.sigmoid(logits)
 58 |         predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"].to(torch.long).reshape(-1)))
 59 |         predictions, references = predictions.cpu().numpy().tolist(), references.cpu().numpy().tolist()
 60 | 
 61 |         all_predictions.extend(predictions)
 62 |         all_truths.extend(references)
 63 | 
 64 |         progress_bar.update(1)
 65 |     progress_bar.close()
 66 | 
 67 |     # compute metric
 68 |     eval_dict = compute_metrics(all_predictions, all_truths)
 69 | 
 70 |     result_df = pd.DataFrame()
 71 |     result_df["id"] = valid_ids
 72 |     result_df["predictions"] = all_predictions
 73 |     result_df["truths"] = all_truths
 74 | 
 75 |     oof_df = deepcopy(result_df)
 76 |     oof_df = oof_df.rename(columns={"predictions": "generated"})
 77 |     oof_df = oof_df[["id", "generated"]].copy()
 78 | 
 79 |     to_return = {
 80 |         "scores": eval_dict,
 81 |         "result_df": result_df,
 82 |         "oof_df": oof_df,
 83 |     }
 84 | 
 85 |     return to_return
 86 | 
 87 | 
 88 | @hydra.main(version_base=None, config_path="../conf/r_detect", config_name="conf_r_detect")
 89 | def run_training(cfg):
 90 |     # ------- Accelerator ---------------------------------------------------------------#
 91 |     if cfg.use_wandb:
 92 |         accelerator = Accelerator(
 93 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 94 |             log_with="wandb",
 95 |             # mixed_precision='fp16',
 96 |         )
 97 | 
 98 |         accelerator.init_trackers(
 99 |             cfg.wandb.project,
100 |             config=OmegaConf.to_container(cfg, resolve=True),
101 |         )
102 | 
103 |     else:
104 |         accelerator = Accelerator(
105 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
106 |             # mixed_precision='fp16',
107 |         )
108 | 
109 |     cfg_dict = OmegaConf.to_container(cfg, resolve=True)
110 | 
111 |     # Make one log on every process with the configuration for debugging.
112 |     logging.basicConfig(
113 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
114 |         datefmt="%m/%d/%Y %H:%M:%S",
115 |         level=logging.INFO,
116 |     )
117 |     logger.info(accelerator.state, main_process_only=False)
118 | 
119 |     # print_line = partial(print_line, accelerator)
120 | 
121 |     def print_line():
122 |         prefix, unit, suffix = "#", "~~", "#"
123 |         accelerator.print(prefix + unit*50 + suffix)
124 | 
125 |     if accelerator.is_local_main_process:
126 |         datasets.utils.logging.set_verbosity_warning()
127 |         transformers.utils.logging.set_verbosity_info()
128 |     else:
129 |         datasets.utils.logging.set_verbosity_error()
130 |         transformers.utils.logging.set_verbosity_error()
131 | 
132 |     # ------- Runtime Configs -----------------------------------------------------------#
133 |     print_line()
134 |     accelerator.print(f"setting seed: {cfg.seed}")
135 |     set_seed(cfg.seed)
136 | 
137 |     if accelerator.is_main_process:
138 |         os.makedirs(cfg.outputs.model_dir, exist_ok=True)
139 |     print_line()
140 | 
141 |     # ------- load data -----------------------------------------------------------------#
142 |     print_line()
143 |     data_dir = cfg.input_data_dir
144 | 
145 |     try:
146 |         essay_df = pd.read_csv(os.path.join(data_dir, "train_essays.csv"))
147 |     except Exception as e:
148 |         essay_df = pd.read_parquet(os.path.join(data_dir, "train_essays.parquet"))
149 | 
150 |     essay_df = essay_df[~essay_df['text'].isna()].copy()
151 |     essay_df = essay_df.reset_index(drop=True)
152 | 
153 |     # train_df = pd.read_parquet(os.path.join(data_dir, "train_essays.parquet"))
154 |     # train_df = train_df[~train_df['text'].isna()].copy()
155 | 
156 |     # valid_df = pd.read_parquet(os.path.join(data_dir, "valid_essays.parquet"))
157 |     # valid_df = valid_df[~valid_df['text'].isna()].copy()
158 | 
159 |     rng = random.Random(cfg.seed)
160 |     essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.99 else 'valid')
161 |     train_df = essay_df[essay_df['fold'] == 'train'].copy()
162 |     valid_df = essay_df[essay_df['fold'] == 'valid'].copy()
163 | 
164 |     train_df = train_df.reset_index(drop=True)
165 |     valid_df = valid_df.reset_index(drop=True)
166 | 
167 |     accelerator.print(f"shape of train data: {train_df.shape}")
168 |     accelerator.print(f"{train_df.head()}")
169 |     accelerator.print(f"shape of validation data: {valid_df.shape}")
170 | 
171 |     with accelerator.main_process_first():
172 |         dataset_creator = AiDataset(cfg)
173 | 
174 |         train_ds = dataset_creator.get_dataset(train_df)
175 |         valid_ds = dataset_creator.get_dataset(valid_df)
176 | 
177 |     tokenizer = dataset_creator.tokenizer
178 | 
179 |     train_ds.set_format(
180 |         type=None,
181 |         columns=[
182 |             'id',
183 |             'input_ids',
184 |             'attention_mask',
185 |             'generated'
186 |         ]
187 |     )
188 | 
189 |     valid_ds = valid_ds.sort("input_length")
190 | 
191 |     valid_ds.set_format(
192 |         type=None,
193 |         columns=[
194 |             'id',
195 |             'input_ids',
196 |             'attention_mask',
197 |             'generated'
198 |         ]
199 |     )
200 |     valid_ids = valid_df["id"]  # .tolist()
201 | 
202 |     data_collator = AiCollator(
203 |         tokenizer=tokenizer,
204 |         pad_to_multiple_of=64
205 |     )
206 |     data_collator_train = AiCollatorTrain(
207 |         tokenizer=tokenizer,
208 |         pad_to_multiple_of=64,
209 |         kwargs=dict(cfg=cfg)
210 |     )
211 | 
212 |     train_dl = DataLoader(
213 |         train_ds,
214 |         batch_size=cfg.train_params.per_device_train_batch_size,
215 |         shuffle=True,
216 |         collate_fn=data_collator_train,
217 |     )
218 | 
219 |     valid_dl = DataLoader(
220 |         valid_ds,
221 |         batch_size=cfg.train_params.per_device_eval_batch_size,
222 |         shuffle=False,
223 |         collate_fn=data_collator,
224 |     )
225 | 
226 |     accelerator.print("data preparation done...")
227 |     print_line()
228 | 
229 |     # --- show batch -------------------------------------------------------------------#
230 |     print_line()
231 | 
232 |     for b in train_dl:
233 |         break
234 |     show_batch(b, tokenizer, task='training', print_fn=accelerator.print)
235 | 
236 |     print_line()
237 | 
238 |     for b in valid_dl:
239 |         break
240 |     show_batch(b, tokenizer, task='training', print_fn=accelerator.print)
241 | 
242 |     # --- model -------------------------------------------------------------------------#
243 |     print_line()
244 |     bnb_config = BitsAndBytesConfig(
245 |         load_in_4bit=True,
246 |         bnb_4bit_quant_type="nf4",
247 |         bnb_4bit_use_double_quant=True,
248 |         bnb_4bit_compute_dtype=torch.float16
249 |     )
250 | 
251 |     if 'solar' in cfg.model.backbone_path.lower():
252 |         base_model = LlamaForDetectAI.from_pretrained(
253 |             cfg.model.backbone_path,
254 |             num_labels=cfg.model.num_labels,  # 2
255 |             quantization_config=bnb_config,
256 |         )
257 |     elif 'phi' in cfg.model.backbone_path.lower():
258 |         base_model = PhiForDetectAI.from_pretrained(
259 |             cfg.model.backbone_path,
260 |             num_labels=cfg.model.num_labels,  # 2
261 |             quantization_config=bnb_config,
262 |             trust_remote_code=True,  # IMP
263 |         )
264 |     else:
265 |         base_model = MistralForDetectAI.from_pretrained(
266 |             cfg.model.backbone_path,
267 |             num_labels=cfg.model.num_labels,  # 2
268 |             quantization_config=bnb_config,
269 |         )
270 |         # base_model.peft_config = dict()
271 | 
272 |     base_model.config.pretraining_tp = 1
273 |     # base_model.config.pad_token_id = tokenizer.pad_token_id
274 | 
275 |     # # base_model = prepare_model_for_kbit_training(base_model, use_gradient_checkpointing=True)
276 |     # for param in base_model.parameters():
277 |     #     if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
278 |     #         param.data = param.data.to(torch.float32)
279 | 
280 |     # lora ---
281 |     peft_config = LoraConfig(
282 |         r=cfg.model.lora.r,
283 |         lora_alpha=cfg.model.lora.lora_alpha,
284 |         lora_dropout=cfg.model.lora.lora_dropout,
285 |         bias="none",
286 |         task_type=TaskType.SEQ_CLS,
287 |         inference_mode=False,
288 |         target_modules=cfg_dict["model"]["lora"]["target_modules"],
289 |         modules_to_save=cfg_dict["model"]["lora"]["modules_to_save"],
290 |     )
291 | 
292 |     model = get_peft_model(base_model, peft_config)
293 |     print(model.device)
294 |     model.print_trainable_parameters()
295 |     accelerator.wait_for_everyone()
296 | 
297 |     # --- optimizer ---------------------------------------------------------------------#
298 |     print_line()
299 |     optimizer = get_optimizer(cfg, model, print_fn=accelerator.print)
300 | 
301 |     # ------- Prepare -------------------------------------------------------------------#
302 | 
303 |     model, optimizer, train_dl, valid_dl = accelerator.prepare(
304 |         model, optimizer, train_dl, valid_dl
305 |     )
306 | 
307 |     # ------- Scheduler -----------------------------------------------------------------#
308 |     print_line()
309 |     num_epochs = cfg.train_params.num_train_epochs
310 |     grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps
311 |     warmup_pct = cfg.train_params.warmup_pct
312 | 
313 |     num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps
314 |     num_training_steps = num_epochs * num_update_steps_per_epoch
315 |     num_warmup_steps = int(warmup_pct*num_training_steps)
316 | 
317 |     accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
318 |     accelerator.print(f"# training steps: {num_training_steps}")
319 |     accelerator.print(f"# warmup steps: {num_warmup_steps}")
320 | 
321 |     scheduler = get_cosine_schedule_with_warmup(
322 |         optimizer=optimizer,
323 |         num_warmup_steps=num_warmup_steps,
324 |         num_training_steps=num_training_steps
325 |     )
326 | 
327 |     # scheduler = accelerator.prepare(scheduler)
328 | 
329 |     # ------- training setup --------------------------------------------------------------#
330 |     best_lb = -1.
331 |     save_trigger = cfg.train_params.save_trigger
332 | 
333 |     patience_tracker = 0
334 |     current_iteration = 0
335 | 
336 |     # ------- training  --------------------------------------------------------------------#
337 |     start_time = time.time()
338 |     accelerator.wait_for_everyone()
339 | 
340 |     for epoch in range(num_epochs):
341 |         # close and reset progress bar
342 |         if epoch != 0:
343 |             progress_bar.close()
344 | 
345 |         progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process)
346 |         loss_meter = AverageMeter()
347 | 
348 |         # Training ------
349 |         model.train()
350 |         for step, batch in enumerate(train_dl):
351 |             with accelerator.accumulate(model):  # gives sync vs no sync context manager
352 |                 outputs = model(**batch)
353 |                 loss = outputs.loss
354 |                 accelerator.backward(loss)
355 | 
356 |                 if accelerator.sync_gradients:
357 |                     # Q: why need this check?
358 |                     # A: gradient_state.sync_gradients check is NOT performed inside clip_grad_norm_
359 |                     accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm)
360 | 
361 |                     optimizer.step()  # gradient_state.sync_gradients check is performed inside optimizer.step
362 |                     scheduler.step()
363 |                     optimizer.zero_grad()
364 | 
365 |                 # check if loss.item() is okay for TPU
366 |                 # happening on all processes - values of loss meter in different processes are different
367 |                 loss_meter.update(loss.item())  # tracks loss in each batch, no accumulation
368 | 
369 |             if accelerator.sync_gradients:
370 |                 progress_bar.set_description(
371 |                     f"STEP: {current_iteration+1:5}/{num_training_steps:5}. "
372 |                     f"LR: {get_lr(optimizer):.4f}. "
373 |                     f"Loss: {loss_meter.avg:.4f}. "
374 |                 )
375 | 
376 |                 progress_bar.update(1)
377 |                 current_iteration += 1
378 | 
379 |                 if cfg.use_wandb:
380 |                     accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration)  # only on main process
381 |                     accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration)
382 | 
383 |             # >--------------------------------------------------|
384 |             # >-- evaluation ------------------------------------|
385 |             # >--------------------------------------------------|
386 | 
387 |             if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0):
388 |                 # set model in eval mode
389 |                 model.eval()
390 |                 eval_response = run_evaluation(accelerator, model, valid_dl, valid_ids)
391 | 
392 |                 scores_dict = eval_response["scores"]
393 |                 result_df = eval_response["result_df"]
394 |                 oof_df = eval_response["oof_df"]
395 | 
396 |                 lb = scores_dict["lb"]
397 | 
398 |                 print_line()
399 |                 et = as_minutes(time.time()-start_time)
400 |                 accelerator.print(
401 |                     f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}"
402 |                 )
403 |                 print_line()
404 |                 accelerator.print(f">>> Current LB (AUC) = {round(lb, 4)}")
405 | 
406 |                 print_line()
407 | 
408 |                 is_best = False
409 |                 if lb >= best_lb:
410 |                     best_lb = lb
411 |                     is_best = True
412 |                     patience_tracker = 0
413 | 
414 |                     # -----
415 |                     best_dict = dict()
416 |                     for k, v in scores_dict.items():
417 |                         best_dict[f"{k}_at_best"] = v
418 |                 else:
419 |                     patience_tracker += 1
420 | 
421 |                 if is_best:  # do in main process
422 |                     oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_best.csv"), index=False)
423 |                     result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_best.csv"), index=False)
424 |                 else:
425 |                     accelerator.print(f">>> patience reached {patience_tracker}/{cfg.train_params.patience}")
426 |                     accelerator.print(f">>> current best score: {round(best_lb, 4)}")
427 | 
428 |                 oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_last.csv"), index=False)
429 |                 result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_last.csv"), index=False)
430 | 
431 |                 # saving -----
432 |                 accelerator.wait_for_everyone()
433 |                 unwrapped_model = accelerator.unwrap_model(model)
434 | 
435 |                 # # debug --
436 |                 # selected_adapters = list(unwrapped_model.peft_config.keys())
437 |                 # accelerator.print(f"selected adapters: {selected_adapters}")
438 |                 # for adapter_name in selected_adapters:
439 |                 #     peft_config = unwrapped_model.peft_config[adapter_name]
440 |                 #     peft_config = asdict(peft_config)
441 |                 #     accelerator.print(f"adapter: {adapter_name}")
442 |                 #     accelerator.print(peft_config)
443 |                 #     for k, v in peft_config.items():
444 |                 #         accelerator.print(f"{k}: {v} ({type(v)})")
445 |                 # # ------
446 |                 unwrapped_model.save_pretrained(
447 |                     f"{cfg.outputs.model_dir}/last",
448 |                     state_dict=accelerator.get_state_dict(model),
449 |                     save_function=accelerator.save,
450 |                 )
451 | 
452 |                 if accelerator.is_main_process:
453 |                     tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/last")
454 | 
455 |                 if best_lb > save_trigger:
456 |                     if accelerator.is_main_process:
457 |                         tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/best")
458 |                     unwrapped_model.save_pretrained(
459 |                         f"{cfg.outputs.model_dir}/best",
460 |                         state_dict=accelerator.get_state_dict(model),
461 |                         save_function=accelerator.save,
462 |                     )
463 |                     if accelerator.is_main_process:
464 |                         tokenizer.save_pretrained(f"{cfg.outputs.model_dir}/best")
465 | 
466 |                 # logging ----
467 |                 if cfg.use_wandb:
468 |                     accelerator.log({"lb": lb}, step=current_iteration)
469 |                     accelerator.log({"best_lb": best_lb}, step=current_iteration)
470 | 
471 |                     # -- log scores dict
472 |                     for k, v in scores_dict.items():
473 |                         accelerator.log({k: round(v, 4)}, step=current_iteration)
474 | 
475 |                     # --- log best scores dict
476 |                     for k, v in best_dict.items():
477 |                         accelerator.log({k: round(v, 4)}, step=current_iteration)
478 | 
479 |                 # -- post eval
480 |                 model.train()
481 |                 torch.cuda.empty_cache()
482 |                 print_line()
483 | 
484 |                 # early stopping ----
485 |                 if patience_tracker >= cfg.train_params.patience:
486 |                     print("stopping early")
487 |                     model.eval()
488 |                     accelerator.end_training()
489 |                     return
490 | 
491 |     # --- end training
492 |     accelerator.end_training()
493 | 
494 | 
495 | if __name__ == "__main__":
496 |     run_training()
497 | 


--------------------------------------------------------------------------------
/code/train_r_dpo.py:
--------------------------------------------------------------------------------
  1 | # adapted from https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_dpo.py
  2 | 
  3 | import argparse
  4 | import os
  5 | 
  6 | import pandas as pd
  7 | import torch
  8 | from accelerate import Accelerator
  9 | from accelerate.utils import set_seed
 10 | from datasets import Dataset, DatasetDict
 11 | from omegaconf import OmegaConf
 12 | from peft import LoraConfig, PeftConfig, PeftModel, TaskType
 13 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
 14 |                           BitsAndBytesConfig, TrainingArguments)
 15 | from trl import DPOTrainer
 16 | 
 17 | 
 18 | def get_datasets(cfg):
 19 |     """
 20 |     prepare training and test datasets for DPO
 21 |     """
 22 |     raw_datasets = DatasetDict()
 23 |     train_df = pd.read_parquet(cfg.train_path)  # prompt, chosen, rejected
 24 |     test_df = pd.read_parquet(cfg.test_path)
 25 | 
 26 |     train_ds = Dataset.from_pandas(train_df)
 27 |     test_ds = Dataset.from_pandas(test_df)
 28 | 
 29 |     train_ds = train_ds.remove_columns(["dpo_id", "diff"])
 30 |     test_ds = test_ds.remove_columns(["dpo_id", "diff"])
 31 | 
 32 |     raw_datasets["train"] = train_ds
 33 |     raw_datasets["test"] = test_ds
 34 | 
 35 |     return raw_datasets
 36 | 
 37 | 
 38 | def get_tokenizer(cfg):
 39 | 
 40 |     tokenizer = AutoTokenizer.from_pretrained(
 41 |         cfg.sft_model_path,
 42 |         use_fast=True,
 43 |         padding_side='left',
 44 |         truncation_side='left',
 45 |     )
 46 | 
 47 |     if tokenizer.pad_token is None:
 48 |         if tokenizer.unk_token is not None:
 49 |             tokenizer.pad_token = tokenizer.unk_token
 50 |         else:
 51 |             tokenizer.pad_token = tokenizer.eos_token
 52 |     return tokenizer
 53 | 
 54 | 
 55 | def main(cfg):
 56 |     cfg_dict = OmegaConf.to_container(cfg, resolve=True)
 57 | 
 58 |     # Set seed for reproducibility
 59 |     set_seed(cfg.seed)
 60 | 
 61 |     # set up accelerator
 62 |     accelerator = Accelerator()
 63 | 
 64 |     # datasets ---
 65 |     raw_datasets = get_datasets(cfg)
 66 |     tokenizer = get_tokenizer(cfg)
 67 | 
 68 |     quantization_config = BitsAndBytesConfig(
 69 |         load_in_4bit=True,
 70 |         bnb_4bit_quant_type="nf4",
 71 |         bnb_4bit_use_double_quant=True,
 72 |         bnb_4bit_compute_dtype=torch.float16,
 73 |     )
 74 | 
 75 |     # model ----
 76 |     accelerator.print(f"Merging peft adapters for {cfg.sft_model_path}")
 77 |     peft_config = PeftConfig.from_pretrained(cfg.sft_model_path)
 78 |     base_model = AutoModelForCausalLM.from_pretrained(
 79 |         peft_config.base_model_name_or_path,
 80 |         quantization_config=quantization_config,
 81 |     )
 82 | 
 83 |     model = PeftModel.from_pretrained(base_model, cfg.sft_model_path)
 84 |     model.eval()
 85 |     model = model.merge_and_unload()
 86 |     model_kwargs = None  # {"use_cache": False}
 87 | 
 88 |     peft_config = LoraConfig(
 89 |         r=cfg.lora.r,
 90 |         lora_alpha=cfg.lora.lora_alpha,
 91 |         lora_dropout=cfg.lora.lora_dropout,
 92 |         bias="none",
 93 |         task_type=TaskType.CAUSAL_LM,
 94 |         inference_mode=False,
 95 |         target_modules=cfg_dict["lora"]["target_modules"],
 96 |     )
 97 | 
 98 |     ref_model = None
 99 |     ref_model_kwargs = None  # {"use_cache": False}
100 | 
101 |     # Training args ---
102 |     training_args = TrainingArguments(
103 |         output_dir=cfg.output_dir,
104 |         learning_rate=cfg.learning_rate,
105 |         # lr_scheduler_type=cfg.lr_scheduler_type,
106 |         per_device_train_batch_size=cfg.per_device_train_batch_size,
107 |         per_device_eval_batch_size=cfg.per_device_eval_batch_size,
108 |         gradient_accumulation_steps=cfg.gradient_accumulation_steps,
109 |         max_grad_norm=cfg.max_grad_norm,
110 |         optim=cfg.dpo.optim,
111 |         num_train_epochs=cfg.num_train_epochs,
112 |         evaluation_strategy="steps",
113 |         save_strategy="steps",
114 |         eval_steps=50,
115 |         save_steps=50,
116 |         save_total_limit=None,
117 |         warmup_steps=cfg.warmup_ratio,
118 |         logging_steps=1,
119 |         report_to='wandb',
120 |         # gradient_checkpointing=True,
121 |     )
122 | 
123 |     # DPO Trainer ---
124 |     dpo_trainer = DPOTrainer(
125 |         model,
126 |         ref_model,
127 |         model_init_kwargs=model_kwargs,
128 |         ref_model_init_kwargs=ref_model_kwargs,
129 |         args=training_args,
130 |         beta=cfg.dpo.beta,
131 |         train_dataset=raw_datasets["train"],
132 |         eval_dataset=raw_datasets["test"],
133 |         tokenizer=tokenizer,
134 |         max_length=cfg.dpo.max_length,
135 |         max_prompt_length=cfg.dpo.max_prompt_length,
136 |         peft_config=peft_config,
137 |     )
138 | 
139 |     # Training loop ---
140 |     train_result = dpo_trainer.train()
141 |     metrics = train_result.metrics
142 |     # max_train_samples = int(0.25*len(raw_datasets["train"]))
143 |     # metrics["train_samples"] = min(max_train_samples, len(raw_datasets["train"]))
144 |     dpo_trainer.log_metrics("train", metrics)
145 |     dpo_trainer.save_metrics("train", metrics)
146 |     dpo_trainer.save_state()
147 | 
148 |     accelerator.print("*** Training complete ***")
149 | 
150 |     # Evaluation loop ---
151 |     accelerator.print("*** Evaluate ***")
152 |     metrics = dpo_trainer.evaluate()
153 |     dpo_trainer.log_metrics("eval", metrics)
154 |     dpo_trainer.save_metrics("eval", metrics)
155 | 
156 |     # Save model ---
157 |     dpo_trainer.save_model(cfg.output_dir)
158 | 
159 |     # Ensure we don't timeout on model save / push to Hub
160 |     accelerator.print("*** Waiting for all processes to finish ***")
161 |     accelerator.wait_for_everyone()
162 | 
163 |     accelerator.print("*** Run complete! ***")
164 | 
165 | 
166 | if __name__ == "__main__":
167 |     ap = argparse.ArgumentParser()
168 |     ap.add_argument('--config_path', type=str, required=True)
169 |     args = ap.parse_args()
170 |     cfg = OmegaConf.load(args.config_path)
171 | 
172 |     os.makedirs(cfg.output_dir, exist_ok=True)
173 |     main(cfg)
174 | 


--------------------------------------------------------------------------------
/code/train_r_embed.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | import random
  5 | import time
  6 | from copy import deepcopy
  7 | 
  8 | import datasets
  9 | import hydra
 10 | import numpy as np
 11 | import pandas as pd
 12 | import torch
 13 | import transformers
 14 | import wandb
 15 | from accelerate import Accelerator
 16 | from accelerate.logging import get_logger
 17 | from accelerate.utils import set_seed
 18 | from omegaconf import OmegaConf
 19 | from torch.utils.data import DataLoader
 20 | from tqdm.auto import tqdm
 21 | from transformers import get_cosine_schedule_with_warmup
 22 | 
 23 | try:
 24 |     from r_embed.ai_dataset import AiDataset
 25 |     from r_embed.ai_loader import AiCollator, AiCollatorTrain, show_batch
 26 |     from r_embed.ai_model import AiModel
 27 |     from r_embed.ai_optimizer import get_optimizer
 28 |     from utils.train_utils import (AverageMeter, as_minutes, get_lr,
 29 |                                    save_checkpoint)
 30 | 
 31 | except Exception as e:
 32 |     print(e)
 33 |     raise ImportError
 34 | 
 35 | logger = get_logger(__name__)
 36 | 
 37 | 
 38 | pd.options.display.max_colwidth = 1000
 39 | 
 40 | # -------- Evaluation -------------------------------------------------------------#
 41 | 
 42 | 
 43 | def run_evaluation(accelerator, model, valid_dl):
 44 |     model.eval()
 45 | 
 46 |     all_losses = []
 47 | 
 48 |     progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process)
 49 | 
 50 |     for batch in valid_dl:
 51 |         with torch.no_grad():
 52 |             loss = model(**batch)
 53 | 
 54 |         batch_losses = accelerator.gather_for_metrics(loss)
 55 |         batch_losses = batch_losses.cpu().numpy().tolist()
 56 |         all_losses.extend(batch_losses)
 57 | 
 58 |         progress_bar.update(1)
 59 |     progress_bar.close()
 60 | 
 61 |     # compute metric
 62 |     eval_dict = dict()
 63 |     eval_dict['valid_loss'] = np.mean(all_losses)
 64 | 
 65 |     return eval_dict
 66 | 
 67 | 
 68 | # -------- Main Function ---------------------------------------------------------#
 69 | 
 70 | 
 71 | @hydra.main(version_base=None, config_path="../conf/r_embed", config_name="conf_r_embed")
 72 | def run_training(cfg):
 73 |     # ------- Accelerator ---------------------------------------------------------------#
 74 |     if cfg.use_wandb:
 75 |         accelerator = Accelerator(
 76 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 77 |             log_with="wandb",
 78 |         )
 79 | 
 80 |         accelerator.init_trackers(
 81 |             cfg.wandb.project,
 82 |             config=OmegaConf.to_container(cfg, resolve=True),
 83 |         )
 84 | 
 85 |     else:
 86 |         accelerator = Accelerator(
 87 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 88 |         )
 89 | 
 90 |     cfg_dict = OmegaConf.to_container(cfg, resolve=True)
 91 | 
 92 |     # Make one log on every process with the configuration for debugging.
 93 |     logging.basicConfig(
 94 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
 95 |         datefmt="%m/%d/%Y %H:%M:%S",
 96 |         level=logging.INFO,
 97 |     )
 98 |     logger.info(accelerator.state, main_process_only=False)
 99 | 
100 |     def print_line():
101 |         prefix, unit, suffix = "#", "~~", "#"
102 |         accelerator.print(prefix + unit*50 + suffix)
103 | 
104 |     if accelerator.is_local_main_process:
105 |         datasets.utils.logging.set_verbosity_warning()
106 |         transformers.utils.logging.set_verbosity_info()
107 |     else:
108 |         datasets.utils.logging.set_verbosity_error()
109 |         transformers.utils.logging.set_verbosity_error()
110 | 
111 |     # ------- Runtime Configs -----------------------------------------------------------#
112 |     print_line()
113 |     accelerator.print(f"setting seed: {cfg.seed}")
114 |     set_seed(cfg.seed)
115 | 
116 |     if accelerator.is_main_process:
117 |         os.makedirs(cfg.outputs.model_dir, exist_ok=True)
118 |     print_line()
119 | 
120 |     # ------- load data ----------------------------------------------------------#
121 |     print_line()
122 |     data_dir = cfg.input_data_dir
123 | 
124 |     train_df = pd.read_parquet(os.path.join(data_dir, "train_essays.parquet"))
125 |     train_df = train_df[~train_df['text'].isna()].copy()
126 | 
127 |     valid_df = pd.read_parquet(os.path.join(data_dir, "valid_essays.parquet"))
128 |     valid_df = valid_df[~valid_df['text'].isna()].copy()
129 | 
130 |     train_df = train_df.reset_index(drop=True)
131 |     valid_df = valid_df.reset_index(drop=True)
132 | 
133 |     prompt_ids = train_df["prompt_id"].unique().tolist()
134 |     prompt_ids = [p for p in prompt_ids if p <= 8]
135 | 
136 |     pos_df = train_df[train_df["generated"] == 1].copy()
137 |     neg_df = train_df[train_df["generated"] == 0].copy()
138 | 
139 |     pos_gdf = pos_df.groupby("prompt_id")["id"].apply(list).reset_index()
140 |     prompt2ids_pos = dict(zip(pos_gdf["prompt_id"], pos_gdf["id"]))
141 | 
142 |     neg_gdf = neg_df.groupby("prompt_id")["id"].apply(list).reset_index()
143 |     prompt2ids_neg = dict(zip(neg_gdf["prompt_id"], neg_gdf["id"]))
144 | 
145 |     accelerator.print(f"shape of train data: {train_df.shape}")
146 |     accelerator.print(f"{train_df.head()}")
147 |     accelerator.print(f"shape of validation data: {valid_df.shape}")
148 |     accelerator.print(f"Prompts: {prompt_ids}")
149 | 
150 |     with accelerator.main_process_first():
151 |         dataset_creator = AiDataset(cfg)
152 | 
153 |         train_ds = dataset_creator.get_dataset(train_df)
154 |         valid_ds = dataset_creator.get_dataset(valid_df)
155 | 
156 |     tokenizer = dataset_creator.tokenizer
157 | 
158 |     # ------- data loaders ----------------------------------------------------------------#
159 |     train_ds.set_format(
160 |         type=None,
161 |         columns=[
162 |             'id',
163 |             'input_ids',
164 |             'attention_mask',
165 |             'generated'
166 |         ]
167 |     )
168 | 
169 |     valid_ds = valid_ds.sort("input_length")
170 | 
171 |     valid_ds.set_format(
172 |         type=None,
173 |         columns=[
174 |             'id',
175 |             'input_ids',
176 |             'attention_mask',
177 |             'generated'
178 |         ]
179 |     )
180 |     valid_ids = valid_df["id"]
181 | 
182 |     # ---
183 |     kwargs = dict(
184 |         train_ds=train_ds,
185 |         prompt_ids=prompt_ids,
186 |         prompt2ids_pos=prompt2ids_pos,
187 |         prompt2ids_neg=prompt2ids_neg,
188 |     )
189 | 
190 |     data_collector_train = AiCollatorTrain(
191 |         tokenizer=tokenizer,
192 |         pad_to_multiple_of=64,
193 |         kwargs=kwargs,
194 |     )
195 | 
196 |     data_collector = AiCollator(
197 |         tokenizer=tokenizer,
198 |         pad_to_multiple_of=64
199 |     )
200 | 
201 |     train_dl = DataLoader(
202 |         train_ds,
203 |         batch_size=cfg.train_params.per_device_train_batch_size,
204 |         shuffle=True,
205 |         collate_fn=data_collector_train,
206 |     )
207 | 
208 |     valid_dl = DataLoader(
209 |         valid_ds,
210 |         batch_size=cfg.train_params.per_device_eval_batch_size,
211 |         shuffle=False,
212 |         collate_fn=data_collector,
213 |     )
214 | 
215 |     accelerator.print("data preparation done...")
216 |     print_line()
217 | 
218 |     # --- show batch -------------------------------------------------------------------#
219 |     print_line()
220 | 
221 |     for b in train_dl:
222 |         break
223 |     show_batch(b, tokenizer, task='training', print_fn=print, n_examples=4)
224 | 
225 |     print_line()
226 | 
227 |     for b in valid_dl:
228 |         break
229 |     show_batch(b, tokenizer, task='validation', print_fn=accelerator.print)
230 | 
231 |     print_line()
232 | 
233 |     # ------- Config -------------------------------------------------------------------#
234 |     accelerator.print("config for the current run:")
235 |     accelerator.print(json.dumps(cfg_dict, indent=4))
236 |     print_line()
237 | 
238 |     # ------- Model --------------------------------------------------------------------#
239 |     print_line()
240 |     print("creating the LLM Detection model...")
241 |     model = AiModel(cfg, accelerator.device)
242 |     print_line()
243 | 
244 |     # ------- Optimizer ----------------------------------------------------------------#
245 |     print_line()
246 |     print("creating the optimizer...")
247 |     optimizer = get_optimizer(model, cfg)
248 |     # ------- Prepare -------------------------------------------------------------------#
249 | 
250 |     model, optimizer, train_dl, valid_dl = accelerator.prepare(
251 |         model, optimizer, train_dl, valid_dl
252 |     )
253 | 
254 |     # ------- Scheduler -----------------------------------------------------------------#
255 |     print_line()
256 |     num_epochs = cfg.train_params.num_train_epochs
257 |     grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps
258 |     warmup_pct = cfg.train_params.warmup_pct
259 | 
260 |     num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps
261 |     num_training_steps = num_epochs * num_update_steps_per_epoch
262 |     num_warmup_steps = int(warmup_pct*num_training_steps)
263 | 
264 |     accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
265 |     accelerator.print(f"# training steps: {num_training_steps}")
266 |     accelerator.print(f"# warmup steps: {num_warmup_steps}")
267 | 
268 |     scheduler = get_cosine_schedule_with_warmup(
269 |         optimizer=optimizer,
270 |         num_warmup_steps=num_warmup_steps,
271 |         num_training_steps=num_training_steps
272 |     )
273 | 
274 |     # ------- training setup --------------------------------------------------------------#
275 |     best_lb = 1e6  # track recall@1000
276 | 
277 |     patience_tracker = 0
278 |     current_iteration = 0
279 | 
280 |     # ------- training  --------------------------------------------------------------------#
281 |     start_time = time.time()
282 |     accelerator.wait_for_everyone()
283 | 
284 |     for epoch in range(num_epochs):
285 |         # close and reset progress bar
286 |         if epoch != 0:
287 |             progress_bar.close()
288 | 
289 |         progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process)
290 |         loss_meter = AverageMeter()
291 | 
292 |         # Training ------
293 |         model.train()
294 |         for step, batch in enumerate(train_dl):
295 |             with accelerator.accumulate(model):
296 |                 loss = model(**batch)
297 |                 accelerator.backward(loss)
298 | 
299 |                 if accelerator.sync_gradients:
300 | 
301 |                     accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm)
302 | 
303 |                     optimizer.step()  # gradient_state.sync_gradients check is performed inside optimizer.step
304 |                     scheduler.step()
305 |                     optimizer.zero_grad()
306 | 
307 |                 loss_meter.update(loss.item())
308 | 
309 |             if accelerator.sync_gradients:
310 |                 progress_bar.set_description(
311 |                     f"STEP: {current_iteration+1:5}/{num_update_steps_per_epoch:5}. "
312 |                     f"LR: {get_lr(optimizer):.4f}. "
313 |                     f"Loss: {loss_meter.avg:.4f}. "
314 |                 )
315 | 
316 |                 progress_bar.update(1)
317 |                 current_iteration += 1
318 | 
319 |                 if cfg.use_wandb:
320 |                     accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration)
321 |                     accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration)
322 | 
323 |             # >--------------------------------------------------|
324 |             # >-- evaluation ------------------------------------|
325 |             # >--------------------------------------------------|
326 | 
327 |             if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0):
328 |                 # set model in eval mode
329 |                 model.eval()
330 |                 scores_dict = run_evaluation(accelerator, model, valid_dl)
331 |                 lb = scores_dict["valid_loss"]
332 | 
333 |                 print_line()
334 |                 et = as_minutes(time.time()-start_time)
335 |                 accelerator.print(
336 |                     f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}"
337 |                 )
338 |                 print_line()
339 |                 accelerator.print(f">>> Current LB (valid_loss) = {round(lb, 4)}")
340 | 
341 |                 print_line()
342 | 
343 |                 is_best = False
344 |                 if lb <= best_lb:
345 |                     best_lb = lb
346 |                     is_best = True
347 |                     patience_tracker = 0
348 | 
349 |                     # -----
350 |                     best_dict = dict()
351 |                     for k, v in scores_dict.items():
352 |                         best_dict[f"{k}_at_best"] = v
353 |                 else:
354 |                     patience_tracker += 1
355 | 
356 |                 # saving -----
357 |                 accelerator.wait_for_everyone()
358 |                 unwrapped_model = accelerator.unwrap_model(model)
359 |                 model_state = {
360 |                     'step': current_iteration,
361 |                     'epoch': epoch + 1,
362 |                     'state_dict': unwrapped_model.state_dict(),
363 |                     'lb': lb,
364 |                 }
365 | 
366 |                 if accelerator.is_main_process:
367 |                     save_checkpoint(cfg, model_state, is_best=is_best)
368 | 
369 |                 # -- post eval
370 |                 model.train()
371 |                 torch.cuda.empty_cache()
372 |                 print_line()
373 | 
374 |                 # early stopping ----
375 |                 if patience_tracker >= cfg.train_params.patience:
376 |                     print("stopping early")
377 |                     model.eval()
378 |                     accelerator.end_training()
379 |                     return
380 | 
381 | 
382 | if __name__ == "__main__":
383 |     run_training()
384 | 


--------------------------------------------------------------------------------
/code/train_r_ranking.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | import random
  5 | import time
  6 | from copy import deepcopy
  7 | 
  8 | import datasets
  9 | import hydra
 10 | import numpy as np
 11 | import pandas as pd
 12 | import torch
 13 | import transformers
 14 | import wandb
 15 | from accelerate import Accelerator
 16 | from accelerate.logging import get_logger
 17 | from accelerate.utils import set_seed
 18 | from omegaconf import OmegaConf
 19 | from torch.utils.data import DataLoader
 20 | from tqdm.auto import tqdm
 21 | from transformers import get_cosine_schedule_with_warmup
 22 | 
 23 | try:
 24 |     from r_ranking.ai_dataset import AiDataset
 25 |     from r_ranking.ai_loader import AiCollator, AiCollatorTrain, show_batch
 26 |     from r_ranking.ai_model import AiModel
 27 |     from r_ranking.ai_optimizer import get_optimizer
 28 |     from utils.metric_utils import compute_metrics
 29 |     from utils.train_utils import (AverageMeter, as_minutes, get_lr,
 30 |                                    save_checkpoint)
 31 | 
 32 | except Exception as e:
 33 |     print(e)
 34 |     raise ImportError
 35 | 
 36 | logger = get_logger(__name__)
 37 | 
 38 | 
 39 | pd.options.display.max_colwidth = 1000
 40 | 
 41 | # -------- Evaluation -------------------------------------------------------------#
 42 | 
 43 | 
 44 | def run_evaluation(accelerator, model, valid_dl, valid_ids):
 45 |     model.eval()
 46 | 
 47 |     all_predictions = []
 48 |     all_truths = []
 49 | 
 50 |     progress_bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process)
 51 | 
 52 |     for batch in valid_dl:
 53 |         with torch.no_grad():
 54 |             logits, _ = model(**batch)
 55 |             logits = logits.reshape(-1)
 56 |         predictions = torch.sigmoid(logits)
 57 |         predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"].to(torch.long).reshape(-1)))
 58 |         predictions, references = predictions.cpu().numpy().tolist(), references.cpu().numpy().tolist()
 59 | 
 60 |         all_predictions.extend(predictions)
 61 |         all_truths.extend(references)
 62 | 
 63 |         progress_bar.update(1)
 64 |     progress_bar.close()
 65 | 
 66 |     # compute metric
 67 |     eval_dict = compute_metrics(all_predictions, all_truths)
 68 | 
 69 |     result_df = pd.DataFrame()
 70 |     result_df["id"] = valid_ids
 71 |     result_df["predictions"] = all_predictions
 72 |     result_df["truths"] = all_truths
 73 | 
 74 |     oof_df = deepcopy(result_df)
 75 |     oof_df = oof_df.rename(columns={"predictions": "generated"})
 76 |     oof_df = oof_df[["id", "generated"]].copy()
 77 | 
 78 |     to_return = {
 79 |         "scores": eval_dict,
 80 |         "result_df": result_df,
 81 |         "oof_df": oof_df,
 82 |     }
 83 | 
 84 |     return to_return
 85 | 
 86 | 
 87 | # -------- Main Function ---------------------------------------------------------#
 88 | 
 89 | 
 90 | @hydra.main(version_base=None, config_path="../conf/r_ranking", config_name="conf_r_ranking")
 91 | def run_training(cfg):
 92 |     # ------- Accelerator ---------------------------------------------------------------#
 93 |     if cfg.use_wandb:
 94 |         accelerator = Accelerator(
 95 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
 96 |             log_with="wandb",
 97 |         )
 98 | 
 99 |         accelerator.init_trackers(
100 |             cfg.wandb.project,
101 |             config=OmegaConf.to_container(cfg, resolve=True),
102 |         )
103 | 
104 |     else:
105 |         accelerator = Accelerator(
106 |             gradient_accumulation_steps=cfg.train_params.gradient_accumulation_steps,
107 |         )
108 | 
109 |     cfg_dict = OmegaConf.to_container(cfg, resolve=True)
110 | 
111 |     # Make one log on every process with the configuration for debugging.
112 |     logging.basicConfig(
113 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
114 |         datefmt="%m/%d/%Y %H:%M:%S",
115 |         level=logging.INFO,
116 |     )
117 |     logger.info(accelerator.state, main_process_only=False)
118 | 
119 |     def print_line():
120 |         prefix, unit, suffix = "#", "~~", "#"
121 |         accelerator.print(prefix + unit*50 + suffix)
122 | 
123 |     if accelerator.is_local_main_process:
124 |         datasets.utils.logging.set_verbosity_warning()
125 |         transformers.utils.logging.set_verbosity_info()
126 |     else:
127 |         datasets.utils.logging.set_verbosity_error()
128 |         transformers.utils.logging.set_verbosity_error()
129 | 
130 |     # ------- Runtime Configs -----------------------------------------------------------#
131 |     print_line()
132 |     accelerator.print(f"setting seed: {cfg.seed}")
133 |     set_seed(cfg.seed)
134 | 
135 |     if accelerator.is_main_process:
136 |         os.makedirs(cfg.outputs.model_dir, exist_ok=True)
137 |     print_line()
138 | 
139 |     # ------- load data ----------------------------------------------------------#
140 |     print_line()
141 |     data_dir = cfg.input_data_dir
142 | 
143 |     # load query dataframe
144 |     essay_df = pd.read_csv(os.path.join(data_dir, "train_essays.csv"))
145 |     essay_df = essay_df[~essay_df['text'].isna()].copy()
146 |     essay_df = essay_df.reset_index(drop=True)
147 | 
148 |     # ------- Data Split ----------------------------------------------------------------#
149 | 
150 |     # sample validation data
151 |     rng = random.Random(cfg.seed)
152 |     essay_df['fold'] = essay_df['text'].apply(lambda x: 'train' if rng.random() < 0.99 else 'valid')
153 |     train_df = essay_df[essay_df['fold'] == 'train'].copy()
154 |     valid_df = essay_df[essay_df['fold'] == 'valid'].copy()
155 | 
156 |     # train_df = train_df.sort_values(by="prompt_id", ascending=True)
157 |     train_df = train_df.reset_index(drop=True)
158 |     valid_df = valid_df.reset_index(drop=True)
159 | 
160 |     prompt_ids = train_df["prompt_id"].unique().tolist()
161 |     gdf = train_df.groupby("prompt_id")["id"].apply(list).reset_index()
162 |     prompt2ids = dict(zip(gdf["prompt_id"], gdf["id"]))
163 | 
164 |     accelerator.print(f"shape of train data: {train_df.shape}")
165 |     accelerator.print(f"{train_df.head()}")
166 |     accelerator.print(f"shape of validation data: {valid_df.shape}")
167 |     accelerator.print(f"Prompts: {prompt_ids}")
168 | 
169 |     with accelerator.main_process_first():
170 |         dataset_creator = AiDataset(cfg)
171 | 
172 |         train_ds = dataset_creator.get_dataset(train_df)
173 |         valid_ds = dataset_creator.get_dataset(valid_df)
174 | 
175 |     tokenizer = dataset_creator.tokenizer
176 | 
177 |     # ------- data loaders ----------------------------------------------------------------#
178 |     train_ds.set_format(
179 |         type=None,
180 |         columns=[
181 |             'id',
182 |             'input_ids',
183 |             'attention_mask',
184 |             'generated'
185 |         ]
186 |     )
187 | 
188 |     # sort valid dataset for faster evaluation
189 |     valid_ds = valid_ds.sort("input_length")
190 | 
191 |     valid_ds.set_format(
192 |         type=None,
193 |         columns=[
194 |             'id',
195 |             'input_ids',
196 |             'attention_mask',
197 |             'generated'
198 |         ]
199 |     )
200 |     valid_ids = valid_df["id"]
201 | 
202 |     # ---
203 |     kwargs = dict(
204 |         train_ds=train_ds,
205 |         prompt_ids=prompt_ids,
206 |         prompt2ids=prompt2ids,
207 |     )
208 | 
209 |     data_collector_train = AiCollatorTrain(
210 |         tokenizer=tokenizer,
211 |         pad_to_multiple_of=64,
212 |         kwargs=kwargs,
213 |     )
214 | 
215 |     data_collector = AiCollator(
216 |         tokenizer=tokenizer,
217 |         pad_to_multiple_of=64
218 |     )
219 | 
220 |     train_dl = DataLoader(
221 |         train_ds,
222 |         batch_size=cfg.train_params.per_device_train_batch_size,
223 |         shuffle=True,
224 |         collate_fn=data_collector_train,
225 |     )
226 | 
227 |     valid_dl = DataLoader(
228 |         valid_ds,
229 |         batch_size=cfg.train_params.per_device_eval_batch_size,
230 |         shuffle=False,
231 |         collate_fn=data_collector,
232 |     )
233 | 
234 |     accelerator.print("data preparation done...")
235 |     print_line()
236 | 
237 |     # --- show batch -------------------------------------------------------------------#
238 |     print_line()
239 | 
240 |     for b in train_dl:
241 |         break
242 |     show_batch(b, tokenizer, task='training', print_fn=print, n_examples=4)
243 | 
244 |     print_line()
245 | 
246 |     for b in valid_dl:
247 |         break
248 |     show_batch(b, tokenizer, task='validation', print_fn=accelerator.print)
249 | 
250 |     print_line()
251 | 
252 |     # ------- Config -------------------------------------------------------------------#
253 |     accelerator.print("config for the current run:")
254 |     accelerator.print(json.dumps(cfg_dict, indent=4))
255 |     print_line()
256 | 
257 |     # ------- Model --------------------------------------------------------------------#
258 |     print_line()
259 |     print("creating the LLM Detection model...")
260 |     model = AiModel(cfg, accelerator.device)
261 |     print_line()
262 | 
263 |     # ------- Optimizer ----------------------------------------------------------------#
264 |     print_line()
265 |     print("creating the optimizer...")
266 |     optimizer = get_optimizer(model, cfg)
267 |     # ------- Prepare -------------------------------------------------------------------#
268 | 
269 |     model, optimizer, train_dl, valid_dl = accelerator.prepare(
270 |         model, optimizer, train_dl, valid_dl
271 |     )
272 | 
273 |     # ------- Scheduler -----------------------------------------------------------------#
274 |     print_line()
275 |     num_epochs = cfg.train_params.num_train_epochs
276 |     grad_accumulation_steps = cfg.train_params.gradient_accumulation_steps
277 |     warmup_pct = cfg.train_params.warmup_pct
278 | 
279 |     num_update_steps_per_epoch = len(train_dl)//grad_accumulation_steps
280 |     num_training_steps = num_epochs * num_update_steps_per_epoch
281 |     num_warmup_steps = int(warmup_pct*num_training_steps)
282 | 
283 |     accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
284 |     accelerator.print(f"# training steps: {num_training_steps}")
285 |     accelerator.print(f"# warmup steps: {num_warmup_steps}")
286 | 
287 |     scheduler = get_cosine_schedule_with_warmup(
288 |         optimizer=optimizer,
289 |         num_warmup_steps=num_warmup_steps,
290 |         num_training_steps=num_training_steps
291 |     )
292 | 
293 |     # ------- training setup --------------------------------------------------------------#
294 |     best_lb = -1  # track recall@1000
295 | 
296 |     patience_tracker = 0
297 |     current_iteration = 0
298 | 
299 |     # ------- training  --------------------------------------------------------------------#
300 |     start_time = time.time()
301 |     accelerator.wait_for_everyone()
302 | 
303 |     for epoch in range(num_epochs):
304 |         # close and reset progress bar
305 |         if epoch != 0:
306 |             progress_bar.close()
307 | 
308 |         progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process)
309 |         loss_meter = AverageMeter()
310 | 
311 |         # Training ------
312 |         model.train()
313 |         for step, batch in enumerate(train_dl):
314 |             with accelerator.accumulate(model):
315 |                 _, loss = model(**batch)
316 |                 accelerator.backward(loss)
317 | 
318 |                 if accelerator.sync_gradients:
319 | 
320 |                     accelerator.clip_grad_norm_(model.parameters(), cfg.optimizer.max_grad_norm)
321 | 
322 |                     optimizer.step()  # gradient_state.sync_gradients check is performed inside optimizer.step
323 |                     scheduler.step()
324 |                     optimizer.zero_grad()
325 | 
326 |                 loss_meter.update(loss.item())
327 | 
328 |             if accelerator.sync_gradients:
329 |                 progress_bar.set_description(
330 |                     f"STEP: {current_iteration+1:5}/{num_update_steps_per_epoch:5}. "
331 |                     f"LR: {get_lr(optimizer):.4f}. "
332 |                     f"Loss: {loss_meter.avg:.4f}. "
333 |                 )
334 | 
335 |                 progress_bar.update(1)
336 |                 current_iteration += 1
337 | 
338 |                 if cfg.use_wandb:
339 |                     accelerator.log({"train_loss": round(loss_meter.avg, 5)}, step=current_iteration)
340 |                     accelerator.log({"lr": get_lr(optimizer)}, step=current_iteration)
341 | 
342 |             # >--------------------------------------------------|
343 |             # >-- evaluation ------------------------------------|
344 |             # >--------------------------------------------------|
345 | 
346 |             if (accelerator.sync_gradients) & (current_iteration % cfg.train_params.eval_frequency == 0):
347 |                 # set model in eval mode
348 |                 model.eval()
349 |                 eval_response = run_evaluation(accelerator, model, valid_dl, valid_ids)
350 | 
351 |                 scores_dict = eval_response["scores"]
352 |                 result_df = eval_response["result_df"]
353 |                 oof_df = eval_response["oof_df"]
354 |                 lb = scores_dict["lb"]
355 | 
356 |                 print_line()
357 |                 et = as_minutes(time.time()-start_time)
358 |                 accelerator.print(
359 |                     f">>> Epoch {epoch+1} | Step {step} | Total Step {current_iteration} | Time: {et}"
360 |                 )
361 |                 print_line()
362 |                 accelerator.print(f">>> Current LB (AUC) = {round(lb, 4)}")
363 | 
364 |                 print_line()
365 | 
366 |                 is_best = False
367 |                 if lb >= best_lb:
368 |                     best_lb = lb
369 |                     is_best = True
370 |                     patience_tracker = 0
371 | 
372 |                     # -----
373 |                     best_dict = dict()
374 |                     for k, v in scores_dict.items():
375 |                         best_dict[f"{k}_at_best"] = v
376 |                 else:
377 |                     patience_tracker += 1
378 | 
379 |                 if is_best:
380 |                     oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_best.csv"), index=False)
381 |                     result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_best.csv"), index=False)
382 |                 else:
383 |                     accelerator.print(f">>> patience reached {patience_tracker}/{cfg_dict['train_params']['patience']}")
384 |                     accelerator.print(f">>> current best score: {round(best_lb, 4)}")
385 | 
386 |                 oof_df.to_csv(os.path.join(cfg.outputs.model_dir, f"oof_df_last.csv"), index=False)
387 |                 result_df.to_csv(os.path.join(cfg.outputs.model_dir, f"result_df_last.csv"), index=False)
388 | 
389 |                 # saving -----
390 |                 accelerator.wait_for_everyone()
391 |                 unwrapped_model = accelerator.unwrap_model(model)
392 |                 model_state = {
393 |                     'step': current_iteration,
394 |                     'epoch': epoch + 1,
395 |                     'state_dict': unwrapped_model.state_dict(),
396 |                     'lb': lb,
397 |                 }
398 | 
399 |                 if accelerator.is_main_process:
400 |                     save_checkpoint(cfg, model_state, is_best=is_best)
401 | 
402 |                 # logging ----
403 |                 if cfg.use_wandb:
404 |                     accelerator.log({"lb": lb}, step=current_iteration)
405 |                     accelerator.log({"best_lb": best_lb}, step=current_iteration)
406 | 
407 |                     # -- log scores dict
408 |                     for k, v in scores_dict.items():
409 |                         accelerator.log({k: round(v, 4)}, step=current_iteration)
410 | 
411 |                     # --- log best scores dict
412 |                     for k, v in best_dict.items():
413 |                         accelerator.log({k: round(v, 4)}, step=current_iteration)
414 | 
415 |                 # -- post eval
416 |                 model.train()
417 |                 torch.cuda.empty_cache()
418 |                 print_line()
419 | 
420 |                 # early stopping ----
421 |                 if patience_tracker >= cfg.train_params.patience:
422 |                     print("stopping early")
423 |                     model.eval()
424 |                     accelerator.end_training()
425 |                     return
426 | 
427 | 
428 | if __name__ == "__main__":
429 |     run_training()
430 | 


--------------------------------------------------------------------------------
/code/trainer_ranking_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pandas as pd
 3 | from datasets import Dataset
 4 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
 5 | 
 6 | loss_fct = torch.nn.MarginRankingLoss(margin=0.7)
 7 | class BCETrainer(Trainer):
 8 |     def compute_loss(self, model, inputs, return_outputs=False):
 9 |         human_outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
10 |         ai_outputs = model(input_ids=inputs["ai_input_ids"], attention_mask=inputs["ai_attention_mask"])
11 | 
12 |         human_outputs = human_outputs.get("logits").view(-1)
13 |         ai_outputs = ai_outputs.get("logits").view(-1)
14 | 
15 |         loss = loss_fct(ai_outputs, human_outputs, torch.ones_like(ai_outputs))
16 | 
17 |         return (loss, ai_outputs) if return_outputs else loss
18 | 
19 | essay_df = pd.read_csv("train_essays_pos_neg.csv").sample(200_000)
20 | train_df = essay_df.copy().reset_index(drop=True)
21 | train_df["human"] = train_df["human"].str.strip()
22 | train_df["ai"] = train_df["ai"].str.strip()
23 | 
24 | tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
25 | model = AutoModelForSequenceClassification.from_pretrained(
26 |     "microsoft/deberta-v3-large",
27 |     num_labels=1
28 | )
29 | 
30 | train_ds = Dataset.from_pandas(train_df)
31 | 
32 | def preprocess_function(examples, max_length=1280):
33 |     tokenized_samples = tokenizer(examples["human"], truncation=True, max_length=max_length)
34 |     tokenized_samples_ai = tokenizer(examples["ai"], truncation=True, max_length=max_length)
35 | 
36 |     tokenized_samples["ai_input_ids"] = tokenized_samples_ai["input_ids"]
37 |     tokenized_samples["ai_attention_mask"] = tokenized_samples_ai["attention_mask"]
38 | 
39 |     return tokenized_samples
40 | 
41 | train_tokenized_ds = train_ds.map(preprocess_function, batched=True, remove_columns=train_ds.column_names)
42 | 
43 | training_args = TrainingArguments(
44 |     output_dir=f"checkpoint/deberta-v3-large-v18-margin",
45 |     learning_rate=1e-5,
46 |     per_device_train_batch_size=1,
47 |     gradient_accumulation_steps=4,
48 |     max_grad_norm=1,
49 |     optim='adamw_8bit',
50 |     num_train_epochs=1,
51 |     weight_decay=0.1,
52 |     fp16=True,
53 |     save_strategy="epoch",
54 |     remove_unused_columns=False,
55 |     warmup_steps=0.1,
56 |     logging_steps=100,
57 |     gradient_checkpointing=False,
58 |     report_to='tensorboard'
59 | )
60 | 
61 | trainer = BCETrainer(
62 |     model=model,
63 |     args=training_args,
64 |     train_dataset=train_tokenized_ds,
65 |     tokenizer=tokenizer,
66 | )
67 | 
68 | trainer.train()


--------------------------------------------------------------------------------
/code/utils/metric_utils.py:
--------------------------------------------------------------------------------
 1 | from sklearn.metrics import roc_auc_score
 2 | 
 3 | 
 4 | def compute_metrics(predictions, truths):
 5 |     """
 6 |     ROC AUC SCORE
 7 |     """
 8 | 
 9 |     assert len(predictions) == len(truths)
10 |     score = roc_auc_score(truths, predictions)
11 | 
12 |     to_return = {
13 |         "lb": round(score, 4),
14 |     }
15 | 
16 |     return to_return
17 | 


--------------------------------------------------------------------------------
/code/utils/train_utils.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | import random
  4 | import shutil
  5 | import string
  6 | from copy import deepcopy
  7 | 
  8 | import numpy as np
  9 | import torch
 10 | import wandb
 11 | from omegaconf import OmegaConf
 12 | from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
 13 |                     nvmlInit)
 14 | 
 15 | 
 16 | def generate_random_string():
 17 |     chars = string.ascii_lowercase + string.digits
 18 |     return ''.join(random.choice(chars) for _ in range(6))
 19 | 
 20 | 
 21 | def get_desired_dtype(dtype):
 22 |     if dtype == 'fp16':
 23 |         return torch.float16
 24 |     elif dtype == 'bf16':
 25 |         return torch.bfloat16
 26 |     else:
 27 |         return torch.float32
 28 | 
 29 | 
 30 | def print_line(logger=None):
 31 |     prefix, unit, suffix = "#", "~~", "#"
 32 |     if logger is None:
 33 |         print(prefix + unit*50 + suffix)
 34 |     else:
 35 |         logger.print(prefix + unit*50 + suffix)
 36 | 
 37 | 
 38 | def as_minutes(s):
 39 |     m = math.floor(s / 60)
 40 |     s -= m * 60
 41 |     return '%dm%ds' % (m, s)
 42 | 
 43 | 
 44 | def execution_setup(cfg):
 45 |     print_line()
 46 |     if cfg.use_random_seed:
 47 |         seed = random.randint(401, 999)
 48 |         cfg.seed = seed
 49 | 
 50 |     print(f"setting seed: {cfg.seed}")
 51 |     seed_everything(cfg.seed)
 52 | 
 53 |     # folder ---
 54 |     os.makedirs(cfg.outputs.model_dir, exist_ok=True)
 55 | 
 56 |     return cfg
 57 | 
 58 | 
 59 | def seed_everything(seed: int):
 60 |     random.seed(seed)
 61 |     os.environ["PYTHONHASHSEED"] = str(seed)
 62 |     np.random.seed(seed)
 63 |     torch.manual_seed(seed)
 64 |     torch.cuda.manual_seed(seed)
 65 |     torch.backends.cudnn.deterministic = True
 66 |     torch.backends.cudnn.benchmark = True
 67 | 
 68 | 
 69 | def init_wandb(cfg):
 70 |     project = cfg.wandb.project
 71 |     tags = cfg.wandb.tags
 72 | 
 73 |     if cfg.wandb.all_data_flag:
 74 |         run_id = f"{cfg.wandb.run_name}-all-data"
 75 |     else:
 76 |         run_id = f"{cfg.wandb.run_name}"
 77 | 
 78 |     run = wandb.init(
 79 |         project=project,
 80 |         config=OmegaConf.to_container(cfg, resolve=True),
 81 |         tags=tags,
 82 |         name=run_id,
 83 |         anonymous="must",
 84 |         job_type="Train",
 85 |     )
 86 | 
 87 |     return run
 88 | 
 89 | 
 90 | def print_gpu_utilization():
 91 |     nvmlInit()
 92 |     handle = nvmlDeviceGetHandleByIndex(0)
 93 |     info = nvmlDeviceGetMemoryInfo(handle)
 94 |     print(f"GPU memory occupied: {info.used//1024**2} MB.")
 95 | 
 96 | 
 97 | def get_lr(optimizer):
 98 |     return optimizer.param_groups[0]['lr']*1e6
 99 | 
100 | 
101 | class AverageMeter(object):
102 |     """Computes and stores the average and current value
103 |        Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
104 |     """
105 | 
106 |     def __init__(self):
107 |         self.reset()
108 | 
109 |     def reset(self):
110 |         self.val = 0
111 |         self.avg = 0
112 |         self.sum = 0
113 |         self.count = 0
114 | 
115 |     def update(self, val, n=1):
116 |         self.val = val
117 |         self.sum += val * n
118 |         self.count += n
119 |         self.avg = self.sum / self.count
120 | 
121 | 
122 | def save_checkpoint(cfg, state, is_best):
123 |     os.makedirs(cfg.outputs.model_dir, exist_ok=True)
124 |     name = f"detect_ai_model"
125 | 
126 |     filename = f'{cfg.outputs.model_dir}/{name}_last.pth.tar'
127 |     torch.save(state, filename, _use_new_zipfile_serialization=False)
128 | 
129 |     if is_best:
130 |         shutil.copyfile(filename, f'{cfg.outputs.model_dir}/{name}_best.pth.tar')
131 | 
132 | 
133 | class EMA():
134 |     """
135 |     credit: https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/332567
136 |     """
137 | 
138 |     def __init__(self, model, decay):
139 |         self.model = model
140 |         self.decay = decay
141 |         self.shadow = {}
142 |         self.backup = {}
143 | 
144 |     def register(self):
145 |         for name, param in self.model.named_parameters():
146 |             if param.requires_grad:
147 |                 self.shadow[name] = param.data.clone()
148 | 
149 |     def update(self):
150 |         for name, param in self.model.named_parameters():
151 |             if param.requires_grad:
152 |                 assert name in self.shadow
153 |                 new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
154 |                 self.shadow[name] = new_average.clone()
155 | 
156 |     def apply_shadow(self):
157 |         for name, param in self.model.named_parameters():
158 |             if param.requires_grad:
159 |                 assert name in self.shadow
160 |                 self.backup[name] = param.data
161 |                 param.data = self.shadow[name]
162 | 
163 |     def restore(self):
164 |         for name, param in self.model.named_parameters():
165 |             if param.requires_grad:
166 |                 assert name in self.backup
167 |                 param.data = self.backup[name]
168 |         self.backup = {}
169 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: mistralai/Mistral-7B-v0.1
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |       - v_proj
20 |       - o_proj
21 |       - gate_proj
22 |       - up_proj
23 |       - down_proj
24 |     r: 16
25 |     lora_alpha: 32
26 |     lora_dropout: 0.1
27 |     modules_to_save:
28 |       - lm_head
29 | 
30 | train_params:
31 |   per_device_train_batch_size: 1 # 512 # 512
32 |   per_device_eval_batch_size: 1
33 |   num_train_epochs: 2 # 16
34 |   gradient_accumulation_steps: 4
35 | 
36 |   warmup_pct: 0.1
37 |   eval_frequency: 300 # 300 # 600
38 |   patience: 10
39 | 
40 | optimizer:
41 |   name: AdamW8bit
42 |   head_lr: 5e-5
43 |   lr: 5e-5
44 |   weight_decay: 1e-2
45 |   max_grad_norm: 0.3
46 | 
47 | outputs:
48 |   model_dir: ../models/r_clm_v2
49 | 
50 | wandb:
51 |   project: detect-ai-a1
52 |   run_name: exp006-r-clm
53 |   tags:
54 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_bloom.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: bigscience/bloom-560m
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - query_key_value
18 |     r: 64
19 |     lora_alpha: 64
20 |     lora_dropout: 0.1
21 |     modules_to_save:
22 |       - lm_head
23 | 
24 | train_params:
25 |   per_device_train_batch_size: 1 # 512 # 512
26 |   per_device_eval_batch_size: 1
27 |   num_train_epochs: 1 # 16
28 |   gradient_accumulation_steps: 4
29 | 
30 |   warmup_pct: 0.1
31 |   eval_frequency: 300 # 300 # 600
32 |   patience: 10
33 | 
34 | optimizer:
35 |   name: AdamW8bit
36 |   head_lr: 5e-5
37 |   lr: 5e-5
38 |   weight_decay: 1e-2
39 |   max_grad_norm: 0.3
40 | 
41 | outputs:
42 |   model_dir: ../models/r_clm_bloom
43 | 
44 | wandb:
45 |   project: detect-ai-a1
46 |   run_name: exp006-r-clm
47 |   tags:
48 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_falcon.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: tiiuae/falcon-7b
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - query_key_value
18 |     r: 16
19 |     lora_alpha: 32
20 |     lora_dropout: 0.1
21 |     modules_to_save:
22 |       - lm_head
23 | 
24 | train_params:
25 |   per_device_train_batch_size: 1 # 512 # 512
26 |   per_device_eval_batch_size: 1
27 |   num_train_epochs: 1 # 16
28 |   gradient_accumulation_steps: 4
29 | 
30 |   warmup_pct: 0.1
31 |   eval_frequency: 300 # 300 # 600
32 |   patience: 10
33 | 
34 | optimizer:
35 |   name: AdamW8bit
36 |   head_lr: 5e-5
37 |   lr: 5e-5
38 |   weight_decay: 1e-2
39 |   max_grad_norm: 0.3
40 | 
41 | outputs:
42 |   model_dir: ../models/r_clm_falcon
43 | 
44 | wandb:
45 |   project: detect-ai-a1
46 |   run_name: exp006-r-clm
47 |   tags:
48 |     - falcon


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: mistralai/Mistral-7B-v0.1
 3 | adapter_path: ../models/models/r_clm_v2
 4 | max_num_tokens: 1024
 5 | output_dir: ../data/scaling/mistral/v0
 6 | n_examples: 256
 7 | n_gen_per_prompt: 4
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_bloom.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: bigscience/bloom-560m
 3 | adapter_path: ../models/models/r_clm_bloom/last
 4 | max_num_tokens: 1024
 5 | output_dir: ../data/scaling/bloom_560/v2
 6 | n_examples: 256
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_falcon.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: tiiuae/falcon-7b
 3 | adapter_path: ../models/models/r_clm_falcon/last
 4 | max_num_tokens: 1024
 5 | output_dir: ../data/scaling/falcon_7b/v6
 6 | n_examples: 256
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_gpt2.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: distilgpt2
 3 | adapter_path: ../models/models/r_clm_gpt2/last
 4 | max_num_tokens: 1296
 5 | output_dir: ../data/scaling/gpt2/v0
 6 | n_examples: 256
 7 | n_gen_per_prompt: 1
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_lite_llama.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: ahxt/LiteLlama-460M-1T
 3 | adapter_path: ../models/models/r_clm_lite_llama/last
 4 | max_num_tokens: 768
 5 | output_dir: ../data/scaling/lite_llama/v1
 6 | n_examples: 256
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_llama13b.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: KoboldAI/LLaMA2-13B-Tiefighter
 3 | adapter_path: ../models/models/r_clm_llama_13b/last
 4 | max_num_tokens: 768
 5 | output_dir: ../data/scaling/llama_13b/v3
 6 | n_examples: 256
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_mistral_persuade.yaml:
--------------------------------------------------------------------------------
1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
2 | model_path: ../models/models/mistral_persuade_ft/last
3 | max_num_tokens: 1024
4 | output_dir: ../data/scaling/mistral_persuade_low_vocab/v2
5 | n_examples: 256
6 | n_gen_per_prompt: 2
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_mpt.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: mosaicml/mpt-7b
 3 | adapter_path: ../models/models/r_clm_mpt_7b/last
 4 | max_num_tokens: 1024
 5 | output_dir: ../data/scaling/mpt_7b/v0
 6 | n_examples: 1024
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_opt.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: facebook/opt-125m
 3 | adapter_path: ../models/models/r_clm_opt_125m/last
 4 | max_num_tokens: 1024
 5 | output_dir: ../data/scaling/opt_125m/v0
 6 | n_examples: 1024
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_pythia.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: EleutherAI/pythia-1b
 3 | adapter_path: ../models/models/r_clm_pythia/last
 4 | max_num_tokens: 768
 5 | output_dir: ../data/scaling/pythia_1b/v2
 6 | n_examples: 256
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_generate_tiny_llama.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 3 | adapter_path: ../models/models/r_clm_v2/last
 4 | max_num_tokens: 768
 5 | output_dir: ../data/scaling/tiny_llama/v1
 6 | n_examples: 256
 7 | n_gen_per_prompt: 2
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_gpt2.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: gpt2
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - c_attn
18 |     r: 64
19 |     lora_alpha: 64
20 |     lora_dropout: 0.1
21 |     modules_to_save:
22 |       - lm_head
23 | 
24 | train_params:
25 |   per_device_train_batch_size: 1 # 512 # 512
26 |   per_device_eval_batch_size: 1
27 |   num_train_epochs: 1 # 16
28 |   gradient_accumulation_steps: 4
29 | 
30 |   warmup_pct: 0.1
31 |   eval_frequency: 300 # 300 # 600
32 |   patience: 10
33 | 
34 | optimizer:
35 |   name: AdamW8bit
36 |   head_lr: 5e-5
37 |   lr: 5e-5
38 |   weight_decay: 1e-2
39 |   max_grad_norm: 0.3
40 | 
41 | outputs:
42 |   model_dir: ../models/r_clm_gpt2
43 | 
44 | wandb:
45 |   project: detect-ai-a1
46 |   run_name: exp006-r-clm
47 |   tags:
48 |     - distilgpt2


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_lite_llama.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path:  ahxt/LiteLlama-460M-1T # mistralai/Mistral-7B-v0.1
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |       - v_proj
20 |       - o_proj
21 |       - gate_proj
22 |       - up_proj
23 |       - down_proj
24 |     r: 16
25 |     lora_alpha: 32
26 |     lora_dropout: 0.1
27 |     modules_to_save:
28 |       - lm_head
29 | 
30 | train_params:
31 |   per_device_train_batch_size: 1 # 512 # 512
32 |   per_device_eval_batch_size: 1
33 |   num_train_epochs: 2 # 16
34 |   gradient_accumulation_steps: 4
35 | 
36 |   warmup_pct: 0.1
37 |   eval_frequency: 300 # 300 # 600
38 |   patience: 10
39 | 
40 | optimizer:
41 |   name: AdamW8bit
42 |   head_lr: 5e-5
43 |   lr: 5e-5
44 |   weight_decay: 1e-2
45 |   max_grad_norm: 0.3
46 | 
47 | outputs:
48 |   model_dir: ../models/r_clm_lite_llama
49 | 
50 | wandb:
51 |   project: detect-ai-a1
52 |   run_name: exp006-r-clm
53 |   tags:
54 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_llama13b.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: KoboldAI/LLaMA2-13B-Tiefighter
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |       - v_proj
20 |       - o_proj
21 |       - gate_proj
22 |       - up_proj
23 |       - down_proj
24 |     r: 16
25 |     lora_alpha: 32
26 |     lora_dropout: 0.1
27 |     modules_to_save:
28 |       - lm_head
29 | 
30 | train_params:
31 |   per_device_train_batch_size: 1 # 512 # 512
32 |   per_device_eval_batch_size: 1
33 |   num_train_epochs: 1 # 16
34 |   gradient_accumulation_steps: 4
35 | 
36 |   warmup_pct: 0.1
37 |   eval_frequency: 300 # 300 # 600
38 |   patience: 10
39 | 
40 | optimizer:
41 |   name: AdamW8bit
42 |   head_lr: 5e-5
43 |   lr: 5e-5
44 |   weight_decay: 1e-2
45 |   max_grad_norm: 0.3
46 | 
47 | outputs:
48 |   model_dir: ../models/r_clm_llama_13b
49 | 
50 | wandb:
51 |   project: detect-ai-a1
52 |   run_name: exp006-r-clm
53 |   tags:
54 |     - falcon


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_mistral_persuade.yaml:
--------------------------------------------------------------------------------
 1 | seed: 425
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: ../models/mistral_persuade
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 | train_params:
16 |   per_device_train_batch_size: 2 # 512 # 512
17 |   per_device_eval_batch_size: 2
18 |   num_train_epochs: 16 # 16
19 |   gradient_accumulation_steps: 4
20 | 
21 |   warmup_pct: 0.01
22 |   eval_frequency: 300 # 300 # 600
23 |   patience: 10
24 | 
25 | optimizer:
26 |   name: AdamW8bit
27 |   head_lr: 5e-5
28 |   lr: 5e-5
29 |   weight_decay: 1e-2
30 |   max_grad_norm: 0.3
31 | 
32 | outputs:
33 |   model_dir: ../models/mistral_persuade_ft
34 | 
35 | wandb:
36 |   project: detect-ai-a1
37 |   run_name: exp006-r-clm
38 |   tags:
39 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_mpt.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: mosaicml/mpt-7b
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - Wqkv
18 |     r: 64
19 |     lora_alpha: 64
20 |     lora_dropout: 0.1
21 |     modules_to_save:
22 |       - lm_head
23 | 
24 | train_params:
25 |   per_device_train_batch_size: 1 # 512 # 512
26 |   per_device_eval_batch_size: 1
27 |   num_train_epochs: 1 # 16
28 |   gradient_accumulation_steps: 4
29 | 
30 |   warmup_pct: 0.1
31 |   eval_frequency: 300 # 300 # 600
32 |   patience: 10
33 | 
34 | optimizer:
35 |   name: AdamW8bit
36 |   head_lr: 5e-5
37 |   lr: 5e-5
38 |   weight_decay: 1e-2
39 |   max_grad_norm: 0.3
40 | 
41 | outputs:
42 |   model_dir: ../models/r_clm_mpt_7b
43 | 
44 | wandb:
45 |   project: detect-ai-a1
46 |   run_name: exp006-r-clm
47 |   tags:
48 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_opt.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: facebook/opt-125m
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |       - v_proj
20 |     r: 64
21 |     lora_alpha: 64
22 |     lora_dropout: 0.1
23 |     modules_to_save:
24 |       - lm_head
25 | 
26 | train_params:
27 |   per_device_train_batch_size: 1 # 512 # 512
28 |   per_device_eval_batch_size: 1
29 |   num_train_epochs: 5 # 16
30 |   gradient_accumulation_steps: 4
31 | 
32 |   warmup_pct: 0.1
33 |   eval_frequency: 300 # 300 # 600
34 |   patience: 10
35 | 
36 | optimizer:
37 |   name: AdamW8bit
38 |   head_lr: 5e-5
39 |   lr: 5e-5
40 |   weight_decay: 1e-2
41 |   max_grad_norm: 0.3
42 | 
43 | outputs:
44 |   model_dir: ../models/r_clm_opt_125m
45 | 
46 | wandb:
47 |   project: detect-ai-a1
48 |   run_name: exp006-r-clm
49 |   tags:
50 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_pythia.yaml:
--------------------------------------------------------------------------------
 1 | seed: 420
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: EleutherAI/pythia-12b
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - query_key_value
18 |     r: 256
19 |     lora_alpha: 64
20 |     lora_dropout: 0.1
21 |     modules_to_save:
22 |       - lm_head
23 | 
24 | train_params:
25 |   per_device_train_batch_size: 1 # 512 # 512
26 |   per_device_eval_batch_size: 1
27 |   num_train_epochs: 3 # 16
28 |   gradient_accumulation_steps: 4
29 | 
30 |   warmup_pct: 0.1
31 |   eval_frequency: 300 # 300 # 600
32 |   patience: 10
33 | 
34 | optimizer:
35 |   name: AdamW8bit
36 |   head_lr: 5e-5
37 |   lr: 5e-5
38 |   weight_decay: 1e-2
39 |   max_grad_norm: 0.3
40 | 
41 | outputs:
42 |   model_dir: ../models/r_clm_pythia_12b
43 | 
44 | wandb:
45 |   project: detect-ai-a1
46 |   run_name: exp006-r-clm
47 |   tags:
48 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_clm_tiny_llama.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 4 | 
 5 | model:
 6 |   backbone_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # mistralai/Mistral-7B-v0.1
 7 |   max_length: 1024
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |       - v_proj
20 |       - o_proj
21 |       - gate_proj
22 |       - up_proj
23 |       - down_proj
24 |     r: 16
25 |     lora_alpha: 32
26 |     lora_dropout: 0.1
27 |     modules_to_save:
28 |       - lm_head
29 | 
30 | train_params:
31 |   per_device_train_batch_size: 1 # 512 # 512
32 |   per_device_eval_batch_size: 1
33 |   num_train_epochs: 1 # 16
34 |   gradient_accumulation_steps: 4
35 | 
36 |   warmup_pct: 0.1
37 |   eval_frequency: 300 # 300 # 600
38 |   patience: 10
39 | 
40 | optimizer:
41 |   name: AdamW8bit
42 |   head_lr: 5e-5
43 |   lr: 5e-5
44 |   weight_decay: 1e-2
45 |   max_grad_norm: 0.3
46 | 
47 | outputs:
48 |   model_dir: ../models/r_clm_v2
49 | 
50 | wandb:
51 |   project: detect-ai-a1
52 |   run_name: exp006-r-clm
53 |   tags:
54 |     - mistral


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_dpo.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | sft_model_path: ../models/r_clm_v2/last
 3 | train_path: ../datasets/dpo/dpo_train.parquet
 4 | test_path: ../datasets/dpo/dpo_test.parquet
 5 | 
 6 | dpo:
 7 |   beta: 0.05
 8 |   logging_first_step: true
 9 |   max_prompt_length: 64
10 |   max_length: 840
11 |   optim: rmsprop
12 |   remove_unused_columns: false
13 | 
14 | 
15 | lora:
16 |   target_modules:
17 |     - q_proj
18 |     - k_proj
19 |     - v_proj
20 |     - out_proj
21 |   r: 16
22 |   lora_alpha: 16
23 |   lora_dropout: 0.1
24 |   modules_to_save:
25 |     - lm_head
26 | 
27 | output_dir: ../models/r_dpo_v2
28 | 
29 | learning_rate: 1.0e-5
30 | # lr_scheduler_type: linear
31 | per_device_train_batch_size: 1
32 | per_device_eval_batch_size: 1
33 | gradient_accumulation_steps: 8
34 | num_train_epochs: 1
35 | warmup_ratio: 0.05
36 | max_grad_norm: 1.0


--------------------------------------------------------------------------------
/conf/r_clm/conf_r_dpo_generate.yaml:
--------------------------------------------------------------------------------
 1 | input_data_path: ../datasets/persuade_2.0_human_scores_demo_id_github.csv
 2 | base_model_path: mistralai/Mistral-7B-v0.1
 3 | sft_adapter_path: ../models/r_clm_v2/last
 4 | dpo_adapter_path: ../models/r_dpo_v2/checkpoint-250
 5 | max_num_tokens: 1296
 6 | output_dir: ../data/custom_gen_dpo_v2
 7 | n_examples: 256
 8 | n_gen_per_prompt: 4
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/conf/r_detect/conf_r_detect_mix_v16.yaml:
--------------------------------------------------------------------------------
 1 | seed: 424
 2 | use_wandb: false
 3 | input_data_dir: ../datasets/external/ai_mix_v16
 4 | 
 5 | model:
 6 |   backbone_path: mistralai/Mistral-7B-v0.1
 7 |   max_length: 1296
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |     r: 8
20 |     lora_alpha: 16
21 |     lora_dropout: 0.1
22 |     modules_to_save:
23 |       - classification_head
24 | 
25 | train_params:
26 |   per_device_train_batch_size: 1 # run on 4x A100
27 |   per_device_eval_batch_size: 1
28 |   num_train_epochs: 1 # 16
29 |   gradient_accumulation_steps: 4
30 | 
31 |   warmup_pct: 0.1
32 |   eval_frequency: 500
33 |   patience: 20
34 |   save_trigger: 0.0
35 | 
36 |   use_mask_aug: false # false
37 |   mask_aug_prob: 0.0
38 | 
39 | optimizer:
40 |   name: AdamW8bit
41 |   head_lr: 2e-6
42 |   lr: 2e-5
43 |   weight_decay: 1e-2
44 |   max_grad_norm: 0.5
45 | 
46 | outputs:
47 |   model_dir: ../models/r_detect_mix_v16
48 | 
49 | wandb:
50 |   project: detect-ai-a1
51 |   run_name: exp010-r-detect
52 |   tags:
53 |     - mistral


--------------------------------------------------------------------------------
/conf/r_detect/conf_r_detect_mix_v26.yaml:
--------------------------------------------------------------------------------
 1 | seed: 424
 2 | use_wandb: false
 3 | input_data_dir: ../datasets/external/ai_mix_v26
 4 | 
 5 | model:
 6 |   backbone_path: mistralai/Mistral-7B-v0.1
 7 |   max_length: 1296
 8 |   num_labels: 1
 9 |   
10 |   tokenizer:
11 |     padding_side: left
12 |     truncation_side: left
13 |     use_fast: true
14 | 
15 |   lora:
16 |     target_modules:
17 |       - q_proj
18 |       - k_proj
19 |     r: 16
20 |     lora_alpha: 16
21 |     lora_dropout: 0.1
22 |     modules_to_save:
23 |       - classification_head
24 | 
25 | train_params:
26 |   per_device_train_batch_size: 1 # 512 # 512
27 |   per_device_eval_batch_size: 1
28 |   num_train_epochs: 1 # 16
29 |   gradient_accumulation_steps: 4
30 | 
31 |   warmup_pct: 0.1
32 |   eval_frequency: 500 # 300 # 600
33 |   patience: 20
34 |   save_trigger: 0.0
35 | 
36 |   use_mask_aug: false # false
37 |   mask_aug_prob: 0.0
38 | 
39 | optimizer:
40 |   name: AdamW8bit
41 |   head_lr: 2e-6
42 |   lr: 2e-5
43 |   weight_decay: 1e-2
44 |   max_grad_norm: 0.5
45 | 
46 | outputs:
47 |   model_dir: ../models/r_detect_mix_v16
48 | 
49 | wandb:
50 |   project: detect-ai-a1
51 |   run_name: exp010-r-detect
52 |   tags:
53 |     - mistral


--------------------------------------------------------------------------------
/conf/r_embed/conf_r_embed.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_dir: ../datasets/external/ai_mix_v26
 4 | 
 5 | model:
 6 |   backbone_path: microsoft/deberta-v3-base
 7 |   max_length: 768 # 1024
 8 |   dropout_rate: 0.01
 9 |   gradient_checkpointing: true
10 |   projection_dim: 512
11 |   temperature: 0.1
12 | 
13 | train_params:
14 |   per_device_train_batch_size: 64 # 512 # 512
15 |   per_device_eval_batch_size: 64
16 |   num_train_epochs: 3 # 16
17 |   gradient_accumulation_steps: 1
18 | 
19 |   warmup_pct: 0.02
20 |   eval_frequency: 100 # 500 # 300 # 600
21 |   patience: 10
22 |   save_trigger: 0.0
23 | 
24 | optimizer:
25 |   head_lr: 4e-5
26 |   lr: 4e-5
27 |   weight_decay: 1e-3
28 |   max_grad_norm: 1.0
29 | 
30 |   eps: 1e-8
31 |   beta1: 0.9
32 |   beta2: 0.999
33 | 
34 |   use_bnb: true
35 |   use_llrd: true
36 |   llrd: 0.9
37 | 
38 | outputs:
39 |   model_dir: ../models/r_embed
40 | 
41 | wandb:
42 |   project: detect-ai-a1
43 |   run_name: exp002-r-embed
44 |   tags:
45 |     - deberta


--------------------------------------------------------------------------------
/conf/r_ranking/conf_r_ranking_large.yaml:
--------------------------------------------------------------------------------
 1 | seed: 42
 2 | use_wandb: false
 3 | input_data_dir: ../datasets/external/ai_mix_for_ranking
 4 | 
 5 | model:
 6 |   backbone_path: microsoft/deberta-v3-large
 7 |   max_length: 1024 # 1024
 8 |   dropout_rate: 0.05
 9 |   gradient_checkpointing: true
10 | 
11 | train_params:
12 |   per_device_train_batch_size: 32 # 512 # 512
13 |   per_device_eval_batch_size: 32
14 |   num_train_epochs: 2 # 16
15 |   gradient_accumulation_steps: 1
16 | 
17 |   warmup_pct: 0.1
18 |   eval_frequency: 100 # 500 # 300 # 600
19 |   patience: 10
20 |   save_trigger: 0.0
21 | 
22 | optimizer:
23 |   head_lr: 2e-5
24 |   lr: 2e-5
25 |   weight_decay: 1e-3
26 |   max_grad_norm: 1.0
27 | 
28 |   eps: 1e-8
29 |   beta1: 0.9
30 |   beta2: 0.999
31 | 
32 |   use_bnb: true
33 |   use_llrd: true
34 |   llrd: 0.9
35 | 
36 | outputs:
37 |   model_dir: ../models/r_ranking
38 | 
39 | wandb:
40 |   project: detect-ai-a1
41 |   run_name: exp002-r-embed
42 |   tags:
43 |     - deberta


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | transformers==4.36.1
 2 | accelerate==0.24.0
 3 | bitsandbytes==0.41.3.post2
 4 | datasets==2.15.0
 5 | peft==0.7.0
 6 | trl==0.7.4
 7 | sentence-transformers==2.2.2
 8 | hydra-core
 9 | pynvml
10 | sentencepiece
11 | einops


--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
 1 | hdir=$(pwd)
 2 | cd ..
 3 | 
 4 | mkdir datasets
 5 | mkdir models
 6 | mkdir datasets/external
 7 | 
 8 | cd datasets
 9 | 
10 | kaggle competitions download -c llm-detect-ai-generated-text
11 | unzip llm-detect-ai-generated-text.zip -d llm-detect-ai-generated-text
12 | rm llm-detect-ai-generated-text.zip
13 | 
14 | kaggle datasets download -d nbroad/persaude-corpus-2
15 | unzip persaude-corpus-2.zip -d ./
16 | rm persaude-corpus-2.zip
17 | 
18 | kaggle datasets download -d conjuring92/ai-mix-v16
19 | unzip ai-mix-v16.zip -d ./external/ai_mix_v16
20 | rm ai-mix-v16.zip
21 | 
22 | kaggle datasets download -d conjuring92/ai-mix-v26
23 | unzip ai-mix-v26.zip -d ./external/ai_mix_v26
24 | rm ai-mix-v26.zip
25 | 
26 | kaggle datasets download -d conjuring92/ai-bin7-mix-v1
27 | unzip ai-bin7-mix-v1.zip -d ./external/ai_mix_for_ranking
28 | rm ai-bin7-mix-v1.zip
29 | 
30 | cd ../models
31 | kaggle datasets download -d conjuring92/detect-ai-persuade-clm-ckpts
32 | unzip detect-ai-persuade-clm-ckpts.zip -d ./
33 | rm detect-ai-persuade-clm-ckpts.zip
34 | 
35 | cd $hdir
36 | 


--------------------------------------------------------------------------------