├── .gitignore ├── LICENSE ├── README.md ├── analysis.py ├── correction_script.py ├── data_creator.py ├── dataloader.py ├── demo.ipynb ├── di.py ├── files └── llm-dataset-inference-overview.png ├── linear_di.py ├── metrics.py ├── requirements.txt ├── results_reader.py ├── scripts ├── data_creator.sh ├── di_launcher_individual.sh ├── di_mega_launcher.sh ├── launcher.sh └── mega_launcher.sh ├── selected_features.py ├── transform.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # project specific 2 | plots_p/ 3 | plots_p_w/ 4 | data/ 5 | scripts/slurm_output/ 6 | results/ 7 | aggregated_results/ 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 169 | #.idea/ 170 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pratyush Maini 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Dataset Inference: Did you train on my dataset? 2 | 3 | 4 | ![LLM Dataset Inference Overview](files/llm-dataset-inference-overview.png) 5 | 6 | 7 | The proliferation of large language models (LLMs) in the real world has come with a rise in copyright 8 | cases against companies for training their models on unlicensed data from the internet. Recent works 9 | have presented methods to identify if individual text sequences were members of the model’s training 10 | data, known as membership inference attacks (MIAs). We demonstrate that the apparent success of 11 | these MIAs is confounded by selecting non-members (text sequences not used for training) belonging to 12 | a different distribution from the members (e.g., temporally shifted recent Wikipedia articles compared 13 | with ones used to train the model). This distribution shift makes membership inference appear successful. 14 | However, most MIA methods perform no better than random guessing when discriminating between 15 | members and non-members from the same distribution (e.g., in this case, the same period of time). 16 | Even when MIAs work, we find that different MIAs succeed at inferring membership of samples from 17 | different distributions. Instead, we propose a new dataset inference method to accurately identify 18 | the datasets used to train large language models. This paradigm sits realistically in the modern-day 19 | copyright landscape, where authors claim that an LLM is trained over multiple documents (such as a 20 | book) written by them, rather than one particular paragraph. While dataset inference shares many 21 | of the challenges of membership inference, we solve it by selectively combining the MIAs that provide 22 | positive signal for a given distribution, and aggregating them to perform a statistical test on a given 23 | dataset. Our approach successfully distinguishes the train and test sets of different subsets of the Pile 24 | with statistically significant p-values < 0.1, without any false positives. 25 | 26 | ## Data Used 27 | 28 | This repository contains data different subsets of the PILE, divided into train and val sets. The data is in the form of a JSON file, with each entry containing the raw text, as well as various kinds of perturbations applied to it. The dataset is used to facilitate privacy research in language models, where the perturbed data can be used as reference detect the presence of a particular dataset in the training data of a language model. 29 | 30 | ## Quick Links 31 | 32 | - [**arXiv Paper**](): Detailed information about the Dataset Inference V2 project, including the dataset, results, and additional resources. 33 | - [**GitHub Repository**](): Access the source code, evaluation scripts, and additional resources for Dataset Inference. 34 | - [**Dataset on Hugging Face**](https://huggingface.co/datasets/pratyushmaini/llm_dataset_inference): Direct link to download the various versons of the PILE dataset. 35 | - [**Summary on Twitter**](): A concise summary and key takeaways from the project. 36 | 37 | 38 | ## Applicability 🚀 39 | 40 | The dataset is in text format and can be loaded using the Hugging Face `datasets` library. It can be used to evaluate any causal or masked language model for the presence of specific datasets in its training pool. The dataset is *not* intended for direct use in training models, but rather for evaluating the privacy of language models. Please keep the validation sets, and the perturbed train sets private, and do not use them for training models. 41 | 42 | ## Loading the Dataset 43 | 44 | To load the dataset, use the following code: 45 | 46 | ```python 47 | from datasets import load_dataset 48 | dataset = load_dataset("pratyushmaini/llm_dataset_inference", subset = "wikipedia", split = "train") 49 | ``` 50 | 51 | ### Available perturbations: 52 | 53 | We use the NL-Augmenter library to apply the following perturbations to the data: 54 | - `synonym_substitution`: Synonym substitution of words in the sentence. 55 | - `butter_fingers`: Randomly changing characters from the sentence. 56 | - `random_deletion`: Randomly deleting words from the sentence. 57 | - `change_char_case`: Randomly changing the case of characters in the sentence. 58 | - `whitespace_perturbation`: Randomly adding or removing whitespace from the sentence. 59 | - `underscore_trick`: Adding underscores to the sentence. 60 | 61 | ## Demo 62 | 63 | Run `Demo.ipynb` to conduct LLM Dataset Inference attacks on your own data and/or understand the code. 64 | 65 | ## Citing Our Work 66 | 67 | If you find our codebase and dataset beneficial, please cite our work: 68 | ``` 69 | @misc{mainidi2024, 70 | title={LLM Dataset Inference: Did you train on my dataset?}, 71 | author={Pratyush Maini and Hengrui Jia and Nicolas Papernot and Adam Dziedzic}, 72 | year={2024}, 73 | archivePrefix={arXiv}, 74 | primaryClass={cs.LG} 75 | } 76 | ``` 77 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file calculates p values by loading the json from results 3 | ''' 4 | import json, os 5 | import argparse 6 | import numpy as np 7 | from scipy.stats import ttest_ind, chi2, norm 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser(description='Dataset Inference on a language model') 12 | parser.add_argument('--model_name', type=str, default="EleutherAI/pythia-12b", help='The name of the model to use') 13 | parser.add_argument('--dataset_name', type=str, default="wikipedia", help='The name of the dataset to use') 14 | parser.add_argument('--num_samples', type=int, default=1000, help='The number of samples to use') 15 | parser.add_argument('--batch_size', type=int, default=32, help='The batch size to use') 16 | args = parser.parse_args() 17 | return args 18 | 19 | def fishers_method(p_values): 20 | statistic = -2 * np.sum(np.log(p_values)) 21 | combined_p_value = chi2.sf(statistic, 2 * len(p_values)) 22 | return combined_p_value 23 | 24 | def harmonic_mean(p_values): 25 | return len(p_values) / np.sum(1. / np.array(p_values)) 26 | 27 | def get_p_values_averaged(list1, list2): 28 | # make 10 random samples of the two lists by sampling without replacement 29 | num_elements = min(len(list1), len(list2)) 30 | num_elements_per_sample = int(num_elements/10) 31 | # randomly permute the two lists 32 | np.random.shuffle(list1) 33 | np.random.shuffle(list2) 34 | p_values = [] 35 | for i in range(10): 36 | sample1 = list1[i*num_elements_per_sample:(i+1)*num_elements_per_sample] 37 | sample2 = list2[i*num_elements_per_sample:(i+1)*num_elements_per_sample] 38 | t_stat, p_value = ttest_ind(sample1, sample2) 39 | p_values.append(p_value) 40 | 41 | return harmonic_mean(p_values) 42 | 43 | def get_p_values(list1, list2): 44 | t_stat, p_value = ttest_ind(list1, list2) 45 | return p_value 46 | 47 | def main(): 48 | args = get_args() 49 | with open(f"new_results/{args.model_name}/{args.dataset_name}_train_metrics.json", 'r') as f: 50 | metrics_train = json.load(f) 51 | with open(f"new_results/{args.model_name}/{args.dataset_name}_val_metrics.json", 'r') as f: 52 | metrics_val = json.load(f) 53 | 54 | keys = list(metrics_train.keys()) 55 | p_values = {} 56 | for key in keys: 57 | # remove the top 2.5% and bottom 2.5% of the data 58 | metrics_train_key = np.array(metrics_train[key]) 59 | metrics_val_key = np.array(metrics_val[key]) 60 | metrics_train_key = metrics_train_key[np.argsort(metrics_train_key)] 61 | metrics_val_key = metrics_val_key[np.argsort(metrics_val_key)] 62 | metrics_train_key = metrics_train_key[int(0.025*len(metrics_train_key)):int(0.975*len(metrics_train_key))] 63 | metrics_val_key = metrics_val_key[int(0.025*len(metrics_val_key)):int(0.975*len(metrics_val_key))] 64 | # shuffle the data 65 | np.random.shuffle(metrics_train_key) 66 | np.random.shuffle(metrics_val_key) 67 | # get the p value 68 | # t_stat, p_value = ttest_ind(metrics_train_key, metrics_val_key) 69 | 70 | 71 | p_values[key] = get_p_values(metrics_train[key], metrics_val[key]) 72 | 73 | # add the p_values to the csv in p_values_averaged/{args.model_name}/{key}.csv if it does not exist 74 | os.makedirs(f"p_values/{args.model_name}", exist_ok=True) 75 | for key in p_values: 76 | p_file = f"p_values/{args.model_name}/{key}.csv" 77 | if not os.path.exists(p_file): 78 | with open(p_file, 'w') as f: 79 | f.write("dataset_name,p_value\n") 80 | 81 | # check if the dataset_name is already in the file 82 | flag = 0 83 | with open(p_file, 'r') as f: 84 | lines = f.readlines() 85 | for line in lines: 86 | if args.dataset_name in line: 87 | print(f"Dataset {args.dataset_name} already in file {p_file}. Aborting...") 88 | flag = 1 89 | 90 | if flag == 0: 91 | with open(p_file, 'a') as f: 92 | f.write(f"{args.dataset_name},{p_values[key]}\n") 93 | 94 | if __name__ == "__main__": 95 | main() -------------------------------------------------------------------------------- /correction_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | There were certain inconsitencies in the use of ppl and likelihood in the code 3 | Correct all results to accommodate for the same 4 | """ 5 | 6 | import glob 7 | import json 8 | import os 9 | import torch 10 | 11 | # get all files in "results/EleutherAI/*/*.json" 12 | file_list = glob.glob("results/EleutherAI/pythia-410m/*.json") 13 | 14 | ''' 15 | dict_keys(['ppl', 'k_min_probs_0.05', 'k_min_probs_0.1', 'k_min_probs_0.2', 'k_min_probs_0.3', 'k_min_probs_0.4', 'k_min_probs_0.5', 'k_min_probs_0.6', 'k_max_probs_0.05', 'k_max_probs_0.1', 'k_max_probs_0.2', 'k_max_probs_0.3', 'k_max_probs_0.4', 'k_max_probs_0.5', 'k_max_probs_0.6', 'zlib_ratio', 'ppl_ratio_synonym_substitution', 'ppl_diff_synonym_substitution', 'ppl_ratio_butter_fingers', 'ppl_diff_butter_fingers', 'ppl_ratio_random_deletion', 'ppl_diff_random_deletion', 'ppl_ratio_change_char_case', 'ppl_diff_change_char_case', 'ppl_ratio_whitespace_perturbation', 'ppl_diff_whitespace_perturbation', 'ppl_ratio_underscore_trick', 'ppl_diff_underscore_trick', 'ref_ppl_ratio_silo', 'ref_ppl_diff_silo', 'ref_ppl_ratio_tinystories-33M', 'ref_ppl_diff_tinystories-33M', 'ref_ppl_ratio_tinystories-1M', 'ref_ppl_diff_tinystories-1M', 'ref_ppl_ratio_phi-1_5', 'ref_ppl_diff_phi-1_5']) 16 | ''' 17 | 18 | 19 | 20 | # iterate over all files 21 | for file in file_list: 22 | with open(file, 'r') as f: 23 | metrics = json.load(f) 24 | ppl_list = torch.tensor(metrics['ppl']) 25 | loss_list = torch.log(ppl_list) 26 | keys = list(metrics.keys()) 27 | for key in keys: 28 | if "ref_ppl_ratio" in key: 29 | current_ratio = torch.tensor(metrics[key]) # loss_list / ref_ppl 30 | ref_ppl = loss_list / current_ratio 31 | ppl_ratio = ppl_list / ref_ppl 32 | loss_ratio = torch.log(ref_ppl) / loss_list 33 | metrics[key] = ppl_ratio.tolist() 34 | metrics[key.replace("ppl", "loss")] = loss_ratio.tolist() 35 | elif "ref_ppl_diff" in key: 36 | current_diff = torch.tensor(metrics[key]) # loss_list - ref_ppl 37 | ref_ppl = loss_list - current_diff 38 | ppl_diff = ppl_list - ref_ppl 39 | loss_diff = torch.log(ref_ppl) - loss_list 40 | metrics[key] = ppl_diff.tolist() 41 | metrics[key.replace("ppl", "loss")] = loss_diff.tolist() 42 | elif "ppl_ratio" in key: 43 | current_ratio = torch.tensor(metrics[key]) 44 | perturbation_loss = loss_list / current_ratio 45 | perturbation_ppl = torch.exp(perturbation_loss) 46 | ppl_ratio = ppl_list / perturbation_ppl 47 | loss_ratio = perturbation_loss / loss_list 48 | metrics[key] = ppl_ratio.tolist() 49 | metrics[key.replace("ppl", "loss")] = loss_ratio.tolist() 50 | elif "ppl_diff" in key: 51 | current_diff = torch.tensor(metrics[key]) 52 | perturbation_loss = loss_list - current_diff 53 | perturbation_ppl = torch.exp(perturbation_loss) 54 | ppl_diff = ppl_list - perturbation_ppl 55 | loss_diff = perturbation_loss - loss_list 56 | metrics[key] = ppl_diff.tolist() 57 | metrics[key.replace("ppl", "loss")] = loss_diff.tolist() 58 | 59 | # save the new file at "new_results/EleutherAI/*/*.json" 60 | new_file = file.replace("results", "new_results") 61 | os.makedirs(os.path.dirname(new_file), exist_ok=True) 62 | with open(new_file, 'w') as f: 63 | json.dump(metrics, f) 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /data_creator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file is used to convert data from the PILE to a huggingface dataset. 3 | This file will also call various perturbations, and add perturbed versions of the data to the dataset as different subsets. 4 | ''' 5 | from dataloader import load_data, pile_mapper 6 | from transform import generate_perturbations 7 | import os 8 | import json 9 | 10 | 11 | def main(args): 12 | root = os.getcwd() + "/data" 13 | os.makedirs(root, exist_ok=True) 14 | 15 | if args.dataset_names == "all": 16 | dataset_names = pile_mapper.keys() 17 | else: 18 | dataset_names = [args.dataset_names] 19 | 20 | for dataset_name in dataset_names: 21 | for split in ["train", "val"]: 22 | file_name = f"{root}/{dataset_name}_{split}.jsonl" 23 | # load the data 24 | num_samples = 2000 25 | raw_texts = load_data(dataset_name, split, num_samples) 26 | print(f"Data loaded for {dataset_name} {split} | {len(raw_texts)} samples") 27 | # add the perturbations 28 | perturbed_texts_dictionary = generate_perturbations(raw_texts) 29 | perturbation_styles = list(perturbed_texts_dictionary.keys()) 30 | 31 | #save all the texts to a json lines file 32 | with open(file_name, "w") as f: 33 | for i, text in enumerate(raw_texts): 34 | json_line = {} 35 | json_line["text"] = text 36 | for style in perturbation_styles: 37 | json_line[style] = perturbed_texts_dictionary[style][i] 38 | f.write(json.dumps(json_line) + "\n") 39 | print(f"Data saved to {file_name}") 40 | 41 | 42 | 43 | if __name__ == "__main__": 44 | import argparse 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--dataset_names", type=str, default="all") 47 | args = parser.parse_args() 48 | 49 | main(args) 50 | -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import json 3 | 4 | import lm_dataformat 5 | import numpy as np 6 | import nltk 7 | 8 | # nltk.download('punkt') 9 | nltk_sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 10 | 11 | pile_mapper = { "stackexchange":"StackExchange", 12 | "wikipedia":"Wikipedia (en)", 13 | "cc":"Pile-CC", 14 | "github":"Github", 15 | "pubmed_abstracts":"PubMed Abstracts", 16 | "openwebtext2":"OpenWebText2", 17 | "freelaw":"FreeLaw", 18 | "math":"DM Mathematics", 19 | "nih":"NIH ExPorter", 20 | "uspto":"USPTO Backgrounds", 21 | "hackernews":"HackerNews", 22 | "enron":'Enron Emails', 23 | "books3": 'Books3', 24 | "pubmed_central": 'PubMed Central', 25 | "gutenberg":'Gutenberg (PG-19)', 26 | "arxiv":'ArXiv', 27 | "bookcorpus2":'BookCorpus2', 28 | "opensubtitles":'OpenSubtitles', 29 | "youtubesubtitles":'YoutubeSubtitles', 30 | "ubuntu":'Ubuntu IRC', 31 | "europarl":'EuroParl', 32 | "philpapers":'PhilPapers'} 33 | 34 | def split_paragraph(paragraph, max_sentences = 10): 35 | sentences = nltk_sentence_tokenizer.tokenize(paragraph) 36 | new_paragraphs = [] 37 | for i in range(0, len(sentences), max_sentences): 38 | new_para = " ".join(sentences[i:i + max_sentences]) 39 | new_paragraphs.append(new_para) 40 | return new_paragraphs 41 | 42 | def generate_pile_zst(subset, num_samples=5000, split = "val"): 43 | if subset.startswith("pile_"): 44 | subset = subset[5:] 45 | file_path = f"/data/the_pile/{split}.jsonl.zst" 46 | subset_key = pile_mapper[subset] 47 | texts = [] 48 | num_docs = 0 49 | reader = lm_dataformat.Reader(file_path) 50 | for count, doc in enumerate(tqdm(reader.stream_data(get_meta=True))): 51 | if doc[1]['pile_set_name'] == subset_key: 52 | if len(doc[0].split(" ")) < 10: 53 | continue 54 | texts.append(doc[0]) 55 | num_docs += 1 56 | if num_docs >= num_samples: 57 | break 58 | return texts 59 | 60 | def generate_pile_jsonl(subset, num_samples=5000): 61 | if subset.startswith("pile_"): 62 | subset = subset[5:] 63 | file_path = "/data/the_pile/combined.jsonl" 64 | subset_key = pile_mapper[subset] 65 | texts = [] 66 | num_texts = 0 67 | with open(file_path, 'r', encoding="utf-8") as json_file: 68 | for line in json_file: 69 | json_data = json.loads(line) 70 | if 'text' in json_data: 71 | if json_data['meta']['pile_set_name'] == subset_key: 72 | if len(json_data['text'].split(" ")) < 800: 73 | continue 74 | texts.append(json_data['text']) 75 | num_texts += 1 76 | if num_texts == num_samples: 77 | break 78 | return texts 79 | 80 | def generate_c4(num_samples=500): 81 | # trove mount dataset/C4_subset@1.0.0 ./data 82 | file = "data/C4_subset-1.0.0/data/raw/c4-train.00000-of-01024.json" 83 | texts = [] 84 | num_texts = 0 85 | with open(file, 'r', encoding="utf-8") as json_file: 86 | for line in json_file: 87 | json_data = json.loads(line) 88 | if 'text' in json_data: 89 | texts.append(json_data['text']) 90 | num_texts += 1 91 | if num_texts == num_samples: 92 | break 93 | return texts 94 | 95 | def split_long_texts_by_paragraph(texts, num_samples): 96 | if len(texts) < num_samples: 97 | print(f"initial texts {len(texts)} were less than num_samples {num_samples}. Further splitting") 98 | #split the sentences at every 1000 characters 99 | required_from_each = 2*(num_samples//len(texts) + 1) 100 | new_texts = [] 101 | for text in texts: 102 | new_texts += split_paragraph(text, max_sentences=3)[:required_from_each] 103 | 104 | texts = new_texts 105 | 106 | print(f"Length of texts {len(texts)}") 107 | return texts 108 | 109 | def split_long_texts(texts, num_samples, seq_length, tokenizer = None): 110 | ''' 111 | This function splits long texts into smaller texts of length seq_length 112 | 1. Concatenate all the texts together 113 | 2. Convert everything to tokens 114 | 3. divide into chunks of seq_length 115 | 4. Convert back to text 116 | 5. return the list of texts 117 | ''' 118 | if tokenizer is None: 119 | from transformers import AutoTokenizer 120 | tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped") 121 | 122 | #concatenate all the texts 123 | all_text = " ".join(texts) 124 | #tokenize 125 | tokens = tokenizer.encode(all_text, return_tensors="pt")[0] 126 | #divide into chunks 127 | chunk_length = seq_length 128 | num_chunks = len(tokens)//chunk_length 129 | new_texts = [] 130 | 131 | for i in range(num_chunks): 132 | chunk = tokens[i*chunk_length:(i+1)*chunk_length] 133 | text = tokenizer.decode(chunk) 134 | new_texts.append(text) 135 | 136 | # randomize the order and return only num_samples 137 | np.random.seed(11) 138 | np.random.shuffle(new_texts) 139 | new_texts = new_texts[:num_samples] 140 | 141 | return new_texts 142 | 143 | def load_data(dataset_name, split, num_samples = 1000, seq_length = 512): 144 | if "enron" in dataset_name: 145 | seq_length = 64 146 | if "nih" in dataset_name: 147 | seq_length = 64 148 | if "pubmed_abstracts" in dataset_name: 149 | seq_length = 32 150 | 151 | if split == "train": 152 | texts = generate_pile_jsonl(dataset_name, num_samples=num_samples*5) 153 | texts = split_long_texts(texts, num_samples,seq_length) 154 | else: 155 | assert split == "val" 156 | texts = generate_pile_zst(dataset_name, num_samples=num_samples*5) 157 | texts = split_long_texts(texts, num_samples,seq_length) 158 | print (f"Loaded {len(texts)} samples from {dataset_name} {split}") 159 | return texts 160 | -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [LLM Dataset Inference Demo](#toc0_)\n", 8 | "\n", 9 | "![LLM Dataset Inference Overview](files/llm-dataset-inference-overview.png \"LLM Dataset Inference Overview\")" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "**Table of contents** \n", 17 | "- [LLM Dataset Inference Demo](#toc1_) \n", 18 | "- [Step 0: Make Splits A and B](#toc2_) \n", 19 | "- [Step 1: Aggregate Features with MIAs](#toc3_) \n", 20 | "- [Step 2: Learn MIA Correlations](#toc4_) \n", 21 | " - [Step 2.1 Remove Outliers](#toc4_1_) \n", 22 | " - [Step 2.2: Learn the weights of each feature](#toc4_2_) \n", 23 | "- [Step 3: Dataset Inference](#toc5_) \n", 24 | "\n", 25 | "\n", 32 | "" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "This notebook will show you how to conduct LLM Dataset Inference on your own data.\n", 40 | "\n", 41 | "As you can see in the figure above, the process is divided into four steps:\n", 42 | "\n", 43 | "1. Generate Features with MIAs.\n", 44 | "2. Learn a linear classifier that assigns the importance of each feature to classify the membership of a text.\n", 45 | "3. Perform Dataset Inference on your Data\n", 46 | " - a. Generate MIA features\n", 47 | " - b. Run the linear classifier on those MIA features\n", 48 | " - c. Conduct statistical tests on the ouputs of the linear classifier to determine whether there is a significance difference between them. " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 1, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from utils import prepare_model\n", 58 | "from metrics import aggregate_metrics\n", 59 | "from datasets import load_dataset" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# model_name = \"EleutherAI/pythia-410m-deduped\"\n", 69 | "# model_name = \"EleutherAI/pythia-2.8b\"\n", 70 | "model_name = \"EleutherAI/pythia-6.9b\"\n", 71 | "# model_name = \"EleutherAI/pythia-12b\"\n", 72 | "cache_dir = \"/tmp\"" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stderr", 82 | "output_type": "stream", 83 | "text": [ 84 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 85 | ] 86 | }, 87 | { 88 | "data": { 89 | "application/vnd.jupyter.widget-view+json": { 90 | "model_id": "c3c524b4c3854fe18535935300c6a45a", 91 | "version_major": 2, 92 | "version_minor": 0 93 | }, 94 | "text/plain": [ 95 | "Loading checkpoint shards: 0%| | 0/2 [00:00[Step 0: Make Splits A and B](#toc0_)\n", 119 | "\n", 120 | "Splits A from members and non-members are used to trained the NN.\n", 121 | "\n", 122 | "Splits B from members and non-members are used to perform Dataset Inference (DI)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from datasets import load_dataset\n", 132 | "\n", 133 | "ds = load_dataset(\"haritzpuerto/the_pile_arxiv_50k_sample\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "DatasetDict({\n", 145 | " train: Dataset({\n", 146 | " features: ['text', 'meta'],\n", 147 | " num_rows: 50000\n", 148 | " })\n", 149 | " validation: Dataset({\n", 150 | " features: ['text', 'meta'],\n", 151 | " num_rows: 2434\n", 152 | " })\n", 153 | " test: Dataset({\n", 154 | " features: ['text', 'meta'],\n", 155 | " num_rows: 2407\n", 156 | " })\n", 157 | "})" 158 | ] 159 | }, 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "ds" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 6, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "A_members = ds['train'].select(range(0, 1000))\n", 176 | "A_nonmembers = ds['validation'].select(range(1000))\n", 177 | "\n", 178 | "B_members = ds['train'].select(range(1000, 2000))\n", 179 | "B_nonmembers = ds['validation'].select(range(1000, 2000))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "# [Step 1: Aggregate Features with MIAs](#toc0_)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 7, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "metric_list = [\"k_min_probs\", \"ppl\", \"zlib_ratio\", \"k_max_probs\"]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 8, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "batch_size = 2" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stderr", 214 | "output_type": "stream", 215 | "text": [ 216 | "100%|██████████| 500/500 [09:40<00:00, 1.16s/it]\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "A_members_metrics = aggregate_metrics(llm, tokenizer, A_members, metric_list, None, batch_size=batch_size)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "1000" 233 | ] 234 | }, 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "len(A_members_metrics['ppl'])" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 11, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | "100%|██████████| 500/500 [09:39<00:00, 1.16s/it]\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "A_nonmembers_metrics = aggregate_metrics(llm, tokenizer, A_nonmembers, metric_list, None, batch_size=batch_size)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "# [Step 2: Learn MIA Correlations](#toc0_)\n", 266 | "\n", 267 | "In this stage, we train a linear regressor to learn the importance of weights for different MIA attacks to use for the final dataset inference procedure. " 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "import numpy as np\n", 277 | "import pandas as pd\n", 278 | "from scipy.stats import ttest_ind, chi2, norm\n", 279 | "import torch\n", 280 | "import torch.nn as nn\n", 281 | "from tqdm import tqdm\n", 282 | "from selected_features import feature_list" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "def split_train_val(metrics):\n", 292 | " keys = list(metrics.keys())\n", 293 | " num_elements = len(metrics[keys[0]])\n", 294 | " print (f\"Using {num_elements} elements\")\n", 295 | " # select a random subset of val_metrics (50% of ids)\n", 296 | " ids_train = np.random.choice(num_elements, num_elements//2, replace=False)\n", 297 | " ids_val = np.array([i for i in range(num_elements) if i not in ids_train])\n", 298 | " new_metrics_train = {}\n", 299 | " new_metrics_val = {}\n", 300 | " for key in keys:\n", 301 | " new_metrics_train[key] = np.array(metrics[key])[ids_train]\n", 302 | " new_metrics_val[key] = np.array(metrics[key])[ids_val]\n", 303 | " return new_metrics_train, new_metrics_val\n", 304 | "\n", 305 | "def remove_outliers(metrics, remove_frac=0.05, outliers = \"zero\"):\n", 306 | " # Sort the array to work with ordered data\n", 307 | " sorted_ids = np.argsort(metrics)\n", 308 | " \n", 309 | " # Calculate the number of elements to remove from each side\n", 310 | " total_elements = len(metrics)\n", 311 | " elements_to_remove_each_side = int(total_elements * remove_frac / 2) \n", 312 | " \n", 313 | " # Ensure we're not attempting to remove more elements than are present\n", 314 | " if elements_to_remove_each_side * 2 > total_elements:\n", 315 | " raise ValueError(\"remove_frac is too large, resulting in no elements left.\")\n", 316 | " \n", 317 | " # Change the removed metrics to 0.\n", 318 | " lowest_ids = sorted_ids[:elements_to_remove_each_side]\n", 319 | " highest_ids = sorted_ids[-elements_to_remove_each_side:]\n", 320 | " all_ids = np.concatenate((lowest_ids, highest_ids))\n", 321 | "\n", 322 | " # import pdb; pdb.set_trace()\n", 323 | " \n", 324 | " trimmed_metrics = np.copy(metrics)\n", 325 | " \n", 326 | " if outliers == \"zero\":\n", 327 | " trimmed_metrics[all_ids] = 0\n", 328 | " elif outliers == \"mean\" or outliers == \"mean+p-value\":\n", 329 | " trimmed_metrics[all_ids] = np.mean(trimmed_metrics)\n", 330 | " elif outliers == \"clip\":\n", 331 | " highest_val_permissible = trimmed_metrics[highest_ids[0]]\n", 332 | " lowest_val_permissible = trimmed_metrics[lowest_ids[-1]]\n", 333 | " trimmed_metrics[highest_ids] = highest_val_permissible\n", 334 | " trimmed_metrics[lowest_ids] = lowest_val_permissible\n", 335 | " elif outliers == \"randomize\":\n", 336 | " #this will randomize the order of metrics\n", 337 | " trimmed_metrics = np.delete(trimmed_metrics, all_ids)\n", 338 | " else:\n", 339 | " assert outliers in [\"keep\", \"p-value\"]\n", 340 | " pass\n", 341 | " \n", 342 | " \n", 343 | " return trimmed_metrics\n", 344 | "\n", 345 | "def normalize_and_stack(train_metrics, val_metrics, normalize=\"train\"):\n", 346 | " '''\n", 347 | " excpects an input list of list of metrics\n", 348 | " normalize val with corre\n", 349 | " '''\n", 350 | " new_train_metrics = []\n", 351 | " new_val_metrics = []\n", 352 | " for (tm, vm) in zip(train_metrics, val_metrics):\n", 353 | " if normalize == \"combined\":\n", 354 | " combined_m = np.concatenate((tm, vm))\n", 355 | " mean_tm = np.mean(combined_m)\n", 356 | " std_tm = np.std(combined_m)\n", 357 | " else:\n", 358 | " mean_tm = np.mean(tm)\n", 359 | " std_tm = np.std(tm)\n", 360 | " \n", 361 | " if normalize == \"no\":\n", 362 | " normalized_vm = vm\n", 363 | " normalized_tm = tm\n", 364 | " else:\n", 365 | " #normalization should be done with respect to the train set statistics\n", 366 | " normalized_vm = (vm - mean_tm) / std_tm\n", 367 | " normalized_tm = (tm - mean_tm) / std_tm\n", 368 | " \n", 369 | " new_train_metrics.append(normalized_tm)\n", 370 | " new_val_metrics.append(normalized_vm)\n", 371 | "\n", 372 | " train_metrics = np.stack(new_train_metrics, axis=1)\n", 373 | " val_metrics = np.stack(new_val_metrics, axis=1)\n", 374 | " return train_metrics, val_metrics" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## [Step 2.1 Remove Outliers](#toc0_)\n", 382 | "\n", 383 | "Across each MIA feature value, we first modify the top 5% outliers by changing their values to the mean of the distribution. This step is crucial to prevent issues in Step 3, where the model might learn skewed correlations due to a few outlier samples. " 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 14, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "def prepare_metrics(members_metrics, nonmembers_metrics, outliers=\"clip\", return_tensors=False):\n", 393 | " keys = list(members_metrics.keys())\n", 394 | " np_members_metrics = []\n", 395 | " np_nonmembers_metrics = []\n", 396 | " for key in keys:\n", 397 | " members_metric_key = np.array(members_metrics[key])\n", 398 | " nonmembers_metric_key = np.array(nonmembers_metrics[key])\n", 399 | " \n", 400 | " if outliers is not None:\n", 401 | " # remove the top 2.5% and bottom 2.5% of the data\n", 402 | " members_metric_key = remove_outliers(members_metric_key, remove_frac = 0.05, outliers = outliers)\n", 403 | " nonmembers_metric_key = remove_outliers(nonmembers_metric_key, remove_frac = 0.05, outliers = outliers)\n", 404 | "\n", 405 | " np_members_metrics.append(members_metric_key)\n", 406 | " np_nonmembers_metrics.append(nonmembers_metric_key)\n", 407 | "\n", 408 | " # concatenate the train and val metrics by stacking them\n", 409 | " np_members_metrics, np_nonmembers_metrics = normalize_and_stack(np_members_metrics, np_nonmembers_metrics)\n", 410 | " if return_tensors:\n", 411 | " np_members_metrics = torch.tensor(np_members_metrics, dtype=torch.float32)\n", 412 | " np_nonmembers_metrics = torch.tensor(np_nonmembers_metrics, dtype=torch.float32)\n", 413 | "\n", 414 | " return np_members_metrics, np_nonmembers_metrics" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 15, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "name": "stdout", 424 | "output_type": "stream", 425 | "text": [ 426 | "(1000, 16)\n", 427 | "(999, 16)\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "train_metrics, val_metrics = prepare_metrics(A_members_metrics, A_nonmembers_metrics, outliers=\"clip\")\n", 433 | "\n", 434 | "print(train_metrics.shape)\n", 435 | "print(val_metrics.shape)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 16, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stderr", 445 | "output_type": "stream", 446 | "text": [ 447 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 448 | "To disable this warning, you can either:\n", 449 | "\t- Avoid using `tokenizers` before the fork if possible\n", 450 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "# aux functions about MIA classifier\n", 456 | "\n", 457 | "def get_dataset_splits(_train_metrics, _val_metrics, num_samples):\n", 458 | " # get the train and val sets\n", 459 | " for_train_train_metrics = _train_metrics[:num_samples]\n", 460 | " for_train_val_metrics = _val_metrics[:num_samples]\n", 461 | " for_val_train_metrics = _train_metrics[num_samples:]\n", 462 | " for_val_val_metrics = _val_metrics[num_samples:]\n", 463 | "\n", 464 | "\n", 465 | " # create the train and val sets\n", 466 | " train_x = np.concatenate((for_train_train_metrics, for_train_val_metrics), axis=0)\n", 467 | " train_y = np.concatenate((-1*np.zeros(for_train_train_metrics.shape[0]), np.ones(for_train_val_metrics.shape[0])))\n", 468 | " val_x = np.concatenate((for_val_train_metrics, for_val_val_metrics), axis=0)\n", 469 | " val_y = np.concatenate((-1*np.zeros(for_val_train_metrics.shape[0]), np.ones(for_val_val_metrics.shape[0])))\n", 470 | " \n", 471 | " # return tensors\n", 472 | " train_x = torch.tensor(train_x, dtype=torch.float32)\n", 473 | " train_y = torch.tensor(train_y, dtype=torch.float32)\n", 474 | " val_x = torch.tensor(val_x, dtype=torch.float32)\n", 475 | " val_y = torch.tensor(val_y, dtype=torch.float32)\n", 476 | " \n", 477 | " return (train_x, train_y), (val_x, val_y)\n", 478 | "\n", 479 | "def train_model(inputs, y, num_epochs=10000):\n", 480 | " num_features = inputs.shape[1]\n", 481 | " model = get_model(num_features)\n", 482 | " \n", 483 | " criterion = nn.BCEWithLogitsLoss() # Binary Cross Entropy Loss for binary classification\n", 484 | " optimizer = torch.optim.Adam(model.parameters(), lr=0.01)\n", 485 | " \n", 486 | " # Convert y to float tensor for BCEWithLogitsLoss\n", 487 | " y_float = y.float()\n", 488 | "\n", 489 | " with tqdm(range(num_epochs)) as pbar:\n", 490 | " for epoch in pbar:\n", 491 | " optimizer.zero_grad()\n", 492 | " outputs = model(inputs).squeeze() # Squeeze the output to remove singleton dimension\n", 493 | " loss = criterion(outputs, y_float)\n", 494 | " loss.backward()\n", 495 | " optimizer.step()\n", 496 | " pbar.set_description('loss {}'.format(loss.item()))\n", 497 | " return model\n", 498 | "\n", 499 | "def get_model(num_features, linear = True):\n", 500 | " if linear:\n", 501 | " model = nn.Linear(num_features, 1)\n", 502 | " else:\n", 503 | " model = nn.Sequential(\n", 504 | " nn.Linear(num_features, 10),\n", 505 | " nn.ReLU(),\n", 506 | " nn.Linear(10, 1) # Single output neuron\n", 507 | " )\n", 508 | " return model\n", 509 | "\n", 510 | "def get_predictions(model, val, y):\n", 511 | " with torch.no_grad():\n", 512 | " preds = model(val).detach().squeeze()\n", 513 | " criterion = nn.BCEWithLogitsLoss()\n", 514 | " loss = criterion(preds, y.float())\n", 515 | " return preds.numpy(), loss.item()\n", 516 | "\n", 517 | "from sklearn.metrics import roc_curve, auc\n", 518 | "import matplotlib.pyplot as plt\n", 519 | "\n", 520 | "def plot_roc_curve(model, val, y):\n", 521 | " # get auc and plot roc curve\n", 522 | " from sklearn.metrics import roc_auc_score\n", 523 | " preds, _ = get_predictions(model, val, y)\n", 524 | " auc_score = roc_auc_score(y, preds)\n", 525 | " \n", 526 | " # Compute ROC curve\n", 527 | " fpr, tpr, thresholds = roc_curve(y, preds)\n", 528 | " \n", 529 | " # Compute AUC\n", 530 | " roc_auc = auc(fpr, tpr)\n", 531 | " \n", 532 | " # Plot ROC curve\n", 533 | " plt.figure()\n", 534 | " plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)\n", 535 | " plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", 536 | " plt.xlim([0.0, 1.0])\n", 537 | " plt.ylim([0.0, 1.05])\n", 538 | " plt.xlabel('False Positive Rate')\n", 539 | " plt.ylabel('True Positive Rate')\n", 540 | " plt.title('Receiver Operating Characteristic')\n", 541 | " plt.legend(loc=\"lower right\")\n", 542 | " plt.show()\n", 543 | " \n", 544 | " return auc_score" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "## [Step 2.2: Learn the weights of each feature](#toc0_)\n", 552 | "\n", 553 | "We then pass the data through a linear regression model to learn weights for each feature.\n", 554 | "\n", 555 | "\n", 556 | "⚠️ **Members are classified as 0, while non-members as 1.**" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 17, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "# aux functions about p-values\n", 566 | "list_number_samples = [2, 5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]\n", 567 | "\n", 568 | "def get_p_value_list(heldout_train, heldout_val, list_number_samples):\n", 569 | " # list_number_samples is used to see how the p-values changes across different number of samples\n", 570 | " p_value_list = []\n", 571 | " for num_samples in list_number_samples:\n", 572 | " heldout_train_curr = heldout_train[:num_samples]\n", 573 | " heldout_val_curr = heldout_val[:num_samples]\n", 574 | " t, p_value = ttest_ind(heldout_train_curr, heldout_val_curr, alternative='less')\n", 575 | " p_value_list.append(p_value)\n", 576 | " return p_value_list\n", 577 | " \n", 578 | " \n", 579 | "\n", 580 | "def split_train_val(metrics):\n", 581 | " keys = list(metrics.keys())\n", 582 | " num_elements = len(metrics[keys[0]])\n", 583 | " print (f\"Using {num_elements} elements\")\n", 584 | " # select a random subset of val_metrics (50% of ids)\n", 585 | " ids_train = np.random.choice(num_elements, num_elements//2, replace=False)\n", 586 | " ids_val = np.array([i for i in range(num_elements) if i not in ids_train])\n", 587 | " new_metrics_train = {}\n", 588 | " new_metrics_val = {}\n", 589 | " for key in keys:\n", 590 | " new_metrics_train[key] = np.array(metrics[key])[ids_train]\n", 591 | " new_metrics_val[key] = np.array(metrics[key])[ids_val]\n", 592 | " return new_metrics_train, new_metrics_val" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 18, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "name": "stderr", 602 | "output_type": "stream", 603 | "text": [ 604 | "loss 0.6681151986122131: 100%|██████████| 1000/1000 [00:01<00:00, 659.06it/s]\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "num_samples = 250 # How many samples to use for training and validation?\n", 610 | "\n", 611 | "np.random.shuffle(train_metrics)\n", 612 | "np.random.shuffle(val_metrics)\n", 613 | "\n", 614 | "# train a model by creating a train set and a held out set\n", 615 | "(train_x, train_y), (val_x, val_y) = get_dataset_splits(train_metrics, val_metrics, num_samples)\n", 616 | "\n", 617 | "model = train_model(train_x, train_y, num_epochs = 1000)\n", 618 | "\n", 619 | "# using the model weights, get importance of each feature, and save to csv\n", 620 | "weights = model.weight.data.squeeze().tolist() \n", 621 | "features = list(A_members_metrics.keys())\n", 622 | "feature_importance = {feature: weight for feature, weight in zip(features, weights)}\n", 623 | "df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance'])" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 19, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "data": { 633 | "text/html": [ 634 | "
\n", 635 | "\n", 648 | "\n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | "
FeatureImportance
0ppl0.187106
1k_min_probs_0.050.051939
2k_min_probs_0.1-0.513615
3k_min_probs_0.21.180070
4k_min_probs_0.3-1.855523
5k_min_probs_0.41.458794
6k_min_probs_0.5-0.619759
7k_min_probs_0.60.117170
8k_max_probs_0.05-0.679361
9k_max_probs_0.11.334020
10k_max_probs_0.20.610850
11k_max_probs_0.3-2.398481
12k_max_probs_0.41.521971
13k_max_probs_0.52.084856
14k_max_probs_0.6-2.399661
15zlib_ratio0.112112
\n", 739 | "
" 740 | ], 741 | "text/plain": [ 742 | " Feature Importance\n", 743 | "0 ppl 0.187106\n", 744 | "1 k_min_probs_0.05 0.051939\n", 745 | "2 k_min_probs_0.1 -0.513615\n", 746 | "3 k_min_probs_0.2 1.180070\n", 747 | "4 k_min_probs_0.3 -1.855523\n", 748 | "5 k_min_probs_0.4 1.458794\n", 749 | "6 k_min_probs_0.5 -0.619759\n", 750 | "7 k_min_probs_0.6 0.117170\n", 751 | "8 k_max_probs_0.05 -0.679361\n", 752 | "9 k_max_probs_0.1 1.334020\n", 753 | "10 k_max_probs_0.2 0.610850\n", 754 | "11 k_max_probs_0.3 -2.398481\n", 755 | "12 k_max_probs_0.4 1.521971\n", 756 | "13 k_max_probs_0.5 2.084856\n", 757 | "14 k_max_probs_0.6 -2.399661\n", 758 | "15 zlib_ratio 0.112112" 759 | ] 760 | }, 761 | "execution_count": 19, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "df" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 20, 773 | "metadata": {}, 774 | "outputs": [ 775 | { 776 | "data": { 777 | "image/png": "", 778 | "text/plain": [ 779 | "
" 780 | ] 781 | }, 782 | "metadata": {}, 783 | "output_type": "display_data" 784 | } 785 | ], 786 | "source": [ 787 | "auc = plot_roc_curve(model, val_x, val_y)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "markdown", 792 | "metadata": {}, 793 | "source": [ 794 | "# [Step 3: Dataset Inference](#toc0_)\n", 795 | "\n", 796 | "⚠️ **Remember: Members are classified as 0, while non-members as 1.**" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": 21, 802 | "metadata": {}, 803 | "outputs": [ 804 | { 805 | "name": "stderr", 806 | "output_type": "stream", 807 | "text": [ 808 | "100%|██████████| 500/500 [09:41<00:00, 1.16s/it]\n", 809 | "100%|██████████| 500/500 [09:40<00:00, 1.16s/it]\n" 810 | ] 811 | } 812 | ], 813 | "source": [ 814 | "B_members_metrics = aggregate_metrics(llm, tokenizer, B_members, metric_list, None, batch_size=batch_size)\n", 815 | "B_nonmembers_metrics = aggregate_metrics(llm, tokenizer, B_nonmembers, metric_list, None, batch_size=batch_size)" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 22, 821 | "metadata": {}, 822 | "outputs": [ 823 | { 824 | "name": "stdout", 825 | "output_type": "stream", 826 | "text": [ 827 | "The null hypothesis is that the B_members scores are larger or equal than B_nonmembers.\n", 828 | "The alternative hypothesis is that B_members (0) are lower than B_nonmembers (1) . The p-value is 0.00331255002904568\n" 829 | ] 830 | } 831 | ], 832 | "source": [ 833 | "B_members_metrics_tensor, B_nonmembers_metrics_ternsor = prepare_metrics(B_members_metrics, B_nonmembers_metrics, outliers=None, return_tensors=True)\n", 834 | "B_members_preds, _ = get_predictions(model, B_members_metrics_tensor, torch.tensor([0]*B_members_metrics_tensor.shape[0]))\n", 835 | "B_nonmembers_preds, _ = get_predictions(model, B_nonmembers_metrics_ternsor, torch.tensor([1]*B_nonmembers_metrics_ternsor.shape[0]))\n", 836 | "\n", 837 | "p_value_list = get_p_value_list(B_members_preds, B_nonmembers_preds, list_number_samples=[1000])\n", 838 | "\n", 839 | "print(f\"The null hypothesis is that the B_members scores are larger or equal than B_nonmembers.\\nThe alternative hypothesis is that B_members (0) are lower than B_nonmembers (1) . The p-value is {p_value_list[-1]}\")" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": {}, 846 | "outputs": [], 847 | "source": [] 848 | } 849 | ], 850 | "metadata": { 851 | "kernelspec": { 852 | "display_name": "Python 3", 853 | "language": "python", 854 | "name": "python3" 855 | }, 856 | "language_info": { 857 | "codemirror_mode": { 858 | "name": "ipython", 859 | "version": 3 860 | }, 861 | "file_extension": ".py", 862 | "mimetype": "text/x-python", 863 | "name": "python", 864 | "nbconvert_exporter": "python", 865 | "pygments_lexer": "ipython3", 866 | "version": "3.8.10" 867 | } 868 | }, 869 | "nbformat": 4, 870 | "nbformat_minor": 2 871 | } 872 | -------------------------------------------------------------------------------- /di.py: -------------------------------------------------------------------------------- 1 | from utils import prepare_model 2 | from metrics import aggregate_metrics, reference_model_registry 3 | import json, os 4 | import argparse 5 | from datasets import load_dataset 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser(description='Dataset Inference on a language model') 9 | parser.add_argument('--model_name', type=str, default="EleutherAI/pythia-410m-deduped", help='The name of the model to use') 10 | parser.add_argument('--dataset_name', type=str, default="wikipedia", help='The name of the dataset to use') 11 | parser.add_argument('--split', type=str, default="train", help='The split of the dataset to use') 12 | parser.add_argument('--num_samples', type=int, default=1000, help='The number of samples to use') 13 | parser.add_argument('--batch_size', type=int, default=32, help='The batch size to use') 14 | parser.add_argument('--from_hf', type=int, default=1, help='If set, will load the dataset from huggingface') 15 | parser.add_argument('--cache_dir', type=str, default="/data/locus/llm_weights", help='The directory to cache the model') 16 | args = parser.parse_args() 17 | return args 18 | 19 | 20 | 21 | def main(): 22 | args = get_args() 23 | results_file = f"results/{args.model_name}/{args.dataset_name}_{args.split}_metrics.json" 24 | # if os.path.exists(results_file): 25 | # print(f"Results file {results_file} already exists. Aborting...") 26 | # return 27 | model_name = args.model_name 28 | 29 | if model_name in ["microsoft/phi-1_5", "EleutherAI/pythia-12b", "EleutherAI/pythia-6.9b", "EleutherAI/pythia-410m"]: 30 | args.cache_dir = "/data/locus/llm_weights/pratyush" 31 | 32 | model, tokenizer = prepare_model(model_name, cache_dir= args.cache_dir) 33 | 34 | # load the data 35 | dataset_name = args.dataset_name 36 | split = args.split 37 | 38 | if not args.from_hf: 39 | from dataloader import load_data 40 | # if you want to load data directly from the PILE, use the following line 41 | num_samples = args.num_samples 42 | dataset = load_data(dataset_name, split, num_samples) 43 | else: 44 | dataset_path = f"data/{dataset_name}_{split}.jsonl" 45 | dataset = load_dataset("json", data_files=dataset_path, split="train") 46 | print("Data loaded") 47 | 48 | # get the metrics 49 | if model_name in reference_model_registry.values(): 50 | metric_list = ["ppl"] 51 | else: 52 | metric_list = ["k_min_probs", "ppl", "zlib_ratio", "k_max_probs", "perturbation", "reference_model"] 53 | metrics = aggregate_metrics(model, tokenizer, dataset, metric_list, args, batch_size = args.batch_size) 54 | 55 | # save the metrics 56 | os.makedirs(f"results/{model_name}", exist_ok=True) 57 | with open(results_file, 'w') as f: 58 | json.dump(metrics, f) 59 | 60 | if __name__ == "__main__": 61 | main() 62 | 63 | -------------------------------------------------------------------------------- /files/llm-dataset-inference-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pratyushmaini/llm_dataset_inference/6f25d41f133c94b1272341f11224ae6f628c7b5e/files/llm-dataset-inference-overview.png -------------------------------------------------------------------------------- /linear_di.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loads various features for the train and val sets. 3 | Trains a linear model on the train set and evaluates it on the val set. 4 | 5 | Tests p value of differentiating train versus val on held out features. 6 | """ 7 | 8 | import os 9 | import sys 10 | import json 11 | import numpy as np 12 | import pandas as pd 13 | from scipy.stats import ttest_ind, chi2, norm 14 | import torch 15 | import torch.nn as nn 16 | import argparse 17 | from tqdm import tqdm 18 | from selected_features import feature_list 19 | 20 | p_sample_list = [2, 5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000] 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser(description='Dataset Inference on a language model') 24 | parser.add_argument('--model_name', type=str, default="EleutherAI/pythia-12b", help='The name of the model to use') 25 | parser.add_argument('--dataset_name', type=str, default="wikipedia", help='The name of the dataset to use') 26 | parser.add_argument('--num_samples', type=int, default=1000, help='The number of samples to use') 27 | parser.add_argument("--normalize", type=str, default="train", help="Should you normalize?", choices=["no", "train", "combined"]) 28 | parser.add_argument("--outliers", type=str, default="clip", help="The ablation to use", choices=["randomize", "keep", "zero", "mean", "clip", "mean+p-value", "p-value"]) 29 | parser.add_argument("--features", type=str, default="all", help="The features to use", choices=["all", "selected"]) 30 | parser.add_argument("--false_positive", type=int, default=0, help="What if you gave two val splits?", choices=[0, 1]) 31 | parser.add_argument("--num_random", type=int, default=1, help="How many random runs to do?") 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | def get_model(num_features, linear = True): 37 | if linear: 38 | model = nn.Linear(num_features, 1) 39 | else: 40 | model = nn.Sequential( 41 | nn.Linear(num_features, 10), 42 | nn.ReLU(), 43 | nn.Linear(10, 1) # Single output neuron 44 | ) 45 | return model 46 | 47 | 48 | def train_model(inputs, y, num_epochs=10000): 49 | num_features = inputs.shape[1] 50 | model = get_model(num_features) 51 | 52 | criterion = nn.BCEWithLogitsLoss() # Binary Cross Entropy Loss for binary classification 53 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 54 | 55 | # Convert y to float tensor for BCEWithLogitsLoss 56 | y_float = y.float() 57 | 58 | with tqdm(range(num_epochs)) as pbar: 59 | for epoch in pbar: 60 | optimizer.zero_grad() 61 | outputs = model(inputs).squeeze() # Squeeze the output to remove singleton dimension 62 | loss = criterion(outputs, y_float) 63 | loss.backward() 64 | optimizer.step() 65 | pbar.set_description('loss {}'.format(loss.item())) 66 | return model 67 | 68 | def get_predictions(model, val, y): 69 | with torch.no_grad(): 70 | preds = model(val).detach().squeeze() 71 | criterion = nn.BCEWithLogitsLoss() 72 | loss = criterion(preds, y.float()) 73 | return preds.numpy(), loss.item() 74 | 75 | def get_dataset_splits(_train_metrics, _val_metrics, num_samples): 76 | # get the train and val sets 77 | for_train_train_metrics = _train_metrics[:num_samples] 78 | for_train_val_metrics = _val_metrics[:num_samples] 79 | for_val_train_metrics = _train_metrics[num_samples:] 80 | for_val_val_metrics = _val_metrics[num_samples:] 81 | 82 | 83 | # create the train and val sets 84 | train_x = np.concatenate((for_train_train_metrics, for_train_val_metrics), axis=0) 85 | train_y = np.concatenate((-1*np.zeros(for_train_train_metrics.shape[0]), np.ones(for_train_val_metrics.shape[0]))) 86 | val_x = np.concatenate((for_val_train_metrics, for_val_val_metrics), axis=0) 87 | val_y = np.concatenate((-1*np.zeros(for_val_train_metrics.shape[0]), np.ones(for_val_val_metrics.shape[0]))) 88 | 89 | # return tensors 90 | train_x = torch.tensor(train_x, dtype=torch.float32) 91 | train_y = torch.tensor(train_y, dtype=torch.float32) 92 | val_x = torch.tensor(val_x, dtype=torch.float32) 93 | val_y = torch.tensor(val_y, dtype=torch.float32) 94 | 95 | return (train_x, train_y), (val_x, val_y) 96 | 97 | def normalize_and_stack(train_metrics, val_metrics, normalize="train"): 98 | ''' 99 | excpects an input list of list of metrics 100 | normalize val with corre 101 | ''' 102 | new_train_metrics = [] 103 | new_val_metrics = [] 104 | for (tm, vm) in zip(train_metrics, val_metrics): 105 | if normalize == "combined": 106 | combined_m = np.concatenate((tm, vm)) 107 | mean_tm = np.mean(combined_m) 108 | std_tm = np.std(combined_m) 109 | else: 110 | mean_tm = np.mean(tm) 111 | std_tm = np.std(tm) 112 | 113 | if normalize == "no": 114 | normalized_vm = vm 115 | normalized_tm = tm 116 | else: 117 | #normalization should be done with respect to the train set statistics 118 | normalized_vm = (vm - mean_tm) / std_tm 119 | normalized_tm = (tm - mean_tm) / std_tm 120 | 121 | new_train_metrics.append(normalized_tm) 122 | new_val_metrics.append(normalized_vm) 123 | 124 | train_metrics = np.stack(new_train_metrics, axis=1) 125 | val_metrics = np.stack(new_val_metrics, axis=1) 126 | return train_metrics, val_metrics 127 | 128 | def remove_outliers(metrics, remove_frac=0.05, outliers = "zero"): 129 | # Sort the array to work with ordered data 130 | sorted_ids = np.argsort(metrics) 131 | 132 | # Calculate the number of elements to remove from each side 133 | total_elements = len(metrics) 134 | elements_to_remove_each_side = int(total_elements * remove_frac / 2) 135 | 136 | # Ensure we're not attempting to remove more elements than are present 137 | if elements_to_remove_each_side * 2 > total_elements: 138 | raise ValueError("remove_frac is too large, resulting in no elements left.") 139 | 140 | # Change the removed metrics to 0. 141 | lowest_ids = sorted_ids[:elements_to_remove_each_side] 142 | highest_ids = sorted_ids[-elements_to_remove_each_side:] 143 | all_ids = np.concatenate((lowest_ids, highest_ids)) 144 | 145 | # import pdb; pdb.set_trace() 146 | 147 | trimmed_metrics = np.copy(metrics) 148 | 149 | if outliers == "zero": 150 | trimmed_metrics[all_ids] = 0 151 | elif outliers == "mean" or outliers == "mean+p-value": 152 | trimmed_metrics[all_ids] = np.mean(trimmed_metrics) 153 | elif outliers == "clip": 154 | highest_val_permissible = trimmed_metrics[highest_ids[0]] 155 | lowest_val_permissible = trimmed_metrics[lowest_ids[-1]] 156 | trimmed_metrics[highest_ids] = highest_val_permissible 157 | trimmed_metrics[lowest_ids] = lowest_val_permissible 158 | elif outliers == "randomize": 159 | #this will randomize the order of metrics 160 | trimmed_metrics = np.delete(trimmed_metrics, all_ids) 161 | else: 162 | assert outliers in ["keep", "p-value"] 163 | pass 164 | 165 | 166 | return trimmed_metrics 167 | 168 | 169 | def get_p_value_list(heldout_train, heldout_val): 170 | p_value_list = [] 171 | for num_samples in p_sample_list: 172 | heldout_train_curr = heldout_train[:num_samples] 173 | heldout_val_curr = heldout_val[:num_samples] 174 | t, p_value = ttest_ind(heldout_train_curr, heldout_val_curr, alternative='less') 175 | p_value_list.append(p_value) 176 | return p_value_list 177 | 178 | 179 | 180 | def split_train_val(metrics): 181 | keys = list(metrics.keys()) 182 | num_elements = len(metrics[keys[0]]) 183 | print (f"Using {num_elements} elements") 184 | # select a random subset of val_metrics (50% of ids) 185 | ids_train = np.random.choice(num_elements, num_elements//2, replace=False) 186 | ids_val = np.array([i for i in range(num_elements) if i not in ids_train]) 187 | new_metrics_train = {} 188 | new_metrics_val = {} 189 | for key in keys: 190 | new_metrics_train[key] = np.array(metrics[key])[ids_train] 191 | new_metrics_val[key] = np.array(metrics[key])[ids_val] 192 | return new_metrics_train, new_metrics_val 193 | 194 | def main(): 195 | args = get_args() 196 | with open(f"new_results/{args.model_name}/{args.dataset_name}_train_metrics.json", 'r') as f: 197 | metrics_train = json.load(f) 198 | with open(f"new_results/{args.model_name}/{args.dataset_name}_val_metrics.json", 'r') as f: 199 | metrics_val = json.load(f) 200 | 201 | if args.false_positive: 202 | metrics_train, metrics_val = split_train_val(metrics_val) 203 | 204 | keys = list(metrics_train.keys()) 205 | train_metrics = [] 206 | val_metrics = [] 207 | for key in keys: 208 | if args.features != "all": 209 | if key not in feature_list: 210 | continue 211 | metrics_train_key = np.array(metrics_train[key]) 212 | metrics_val_key = np.array(metrics_val[key]) 213 | 214 | # remove the top 2.5% and bottom 2.5% of the data 215 | 216 | metrics_train_key = remove_outliers(metrics_train_key, remove_frac = 0.05, outliers = args.outliers) 217 | metrics_val_key = remove_outliers(metrics_val_key, remove_frac = 0.05, outliers = args.outliers) 218 | 219 | train_metrics.append(metrics_train_key) 220 | val_metrics.append(metrics_val_key) 221 | 222 | # concatenate the train and val metrics by stacking them 223 | 224 | # train_metrics, val_metrics = new_train_metrics, new_val_metrics 225 | train_metrics, val_metrics = normalize_and_stack(train_metrics, val_metrics) 226 | 227 | for i in range(args.num_random): 228 | np.random.shuffle(train_metrics) 229 | np.random.shuffle(val_metrics) 230 | 231 | # train a model by creating a train set and a held out set 232 | num_samples = args.num_samples 233 | (train_x, train_y), (val_x, val_y) = get_dataset_splits(train_metrics, val_metrics, num_samples) 234 | 235 | model = train_model(train_x, train_y, num_epochs = 1000) 236 | preds, loss = get_predictions(model, val_x, val_y) 237 | preds_train, loss_train = get_predictions(model, train_x, train_y) 238 | og_train = preds_train[train_y == 0] 239 | og_val = preds_train[train_y == 1] 240 | 241 | heldout_train = preds[val_y == 0] 242 | heldout_val = preds[val_y == 1] 243 | # alternate hypothesis: heldout_train < heldout_val 244 | 245 | if args.outliers == "p-value" or args.outliers == "mean+p-value": 246 | heldout_train = remove_outliers(heldout_train, remove_frac = 0.05, outliers = "randomize") 247 | heldout_val = remove_outliers(heldout_val, remove_frac = 0.05, outliers = "randomize") 248 | 249 | p_value_list = get_p_value_list(heldout_train, heldout_val) 250 | 251 | # using the model weights, get importance of each feature, and save to csv 252 | weights = model.weight.data.squeeze().tolist() 253 | features = keys 254 | feature_importance = {feature: weight for feature, weight in zip(features, weights)} 255 | df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance']) 256 | import os 257 | path_to_append = f"{args.outliers}-outliers/{args.normalize}-normalize" 258 | if args.features == "selected": 259 | path_to_append += "-selected_features" 260 | if args.false_positive: 261 | path_to_append += f"-{args.false_positive}-false_positive" 262 | 263 | model_name = args.model_name.replace("/", "_") 264 | os.makedirs(f"aggregated_results/feature_importance/{path_to_append}/{model_name}", exist_ok=True) 265 | df.to_csv(f'aggregated_results/feature_importance/{path_to_append}/{model_name}/{args.dataset_name}_seed_{i}.csv', index=False) 266 | 267 | 268 | # add the to the csv in p_values/{model_name}.csv if it does not exist 269 | os.makedirs(f"aggregated_results/p_values/{path_to_append}/{model_name}", exist_ok=True) 270 | 271 | p_file = f"aggregated_results/p_values/{path_to_append}/{model_name}/{args.dataset_name}.csv" 272 | print(f"Writing to {p_file}") 273 | if not os.path.exists(p_file): 274 | with open(p_file, 'w') as f: 275 | to_write = "seed," + ",".join([f"p_{str(p)}" for p in p_sample_list]) + "\n" 276 | f.write(to_write) 277 | 278 | # check if the dataset_name is already in the file 279 | flag = 0 280 | seed = f"seed_{i}" 281 | with open(p_file, 'r') as f: 282 | lines = f.readlines() 283 | for line in lines: 284 | if seed in line: 285 | print(f"Dataset {args.dataset_name} already in file {p_file}. Aborting...\n{p_value_list}") 286 | flag = 1 287 | 288 | if flag == 0: 289 | with open(p_file, 'a') as f: 290 | to_write = seed + "," + ",".join([str(p) for p in p_value_list]) + "\n" 291 | f.write(to_write) 292 | 293 | if __name__ == "__main__": 294 | main() -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import zlib 3 | import tqdm, json 4 | 5 | loss_fct = torch.nn.CrossEntropyLoss(reduction="none") 6 | 7 | def raw_values_batch(model, tokenizer, example_list): 8 | ''' 9 | This function takes a list of strings and returns the loss values for each token in the string 10 | input: 11 | model: the language model 12 | tokenizer: the tokenizer 13 | example_list: a list of strings 14 | 15 | output: 16 | loss_list: a list of lists. 17 | Each list contains the loss values for each token in the string 18 | 19 | ''' 20 | max_length = tokenizer.model_max_length 21 | input_ids = tokenizer(example_list, return_tensors="pt", padding=True, truncation=True, max_length=max_length) 22 | 23 | if model.device.type == "cuda": 24 | input_ids = {k: v.cuda() for k, v in input_ids.items()} 25 | 26 | # forward pass with no grad 27 | with torch.no_grad(): 28 | outputs = model(**input_ids) 29 | 30 | labels = input_ids["input_ids"] 31 | labels[labels == tokenizer.pad_token_id] = -100 32 | 33 | # shift the labels 34 | shifted_labels = labels[..., 1:].contiguous().view(-1) 35 | 36 | # shift the logits 37 | shifted_logits = outputs.logits[..., :-1, :].contiguous() 38 | shifted_logits = shifted_logits.view(-1, shifted_logits.size(-1)) 39 | 40 | loss = loss_fct(shifted_logits, shifted_labels) 41 | 42 | # reshape the loss to the original shape 43 | loss = loss.view(labels.size(0), labels.size(1) - 1) 44 | 45 | # now remove the 0 values and create loss as a list of lists 46 | loss_list = loss.tolist() 47 | 48 | for i,entry in enumerate(loss_list): 49 | # remove the 0 values 50 | entry = [x for x in entry if x != 0] 51 | loss_list[i] = entry 52 | 53 | # if any list is empty, remove it 54 | loss_list = [entry for entry in loss_list if len(entry) > 0] 55 | 56 | return loss_list 57 | 58 | def raw_values(model, tokenizer, example_list, batch_size = 32): 59 | ''' 60 | This function takes a list of strings and returns the loss values for each token in the string 61 | input: 62 | model: the language model 63 | tokenizer: the tokenizer 64 | example_list: a list of strings 65 | batch_size: the batch size 66 | output: 67 | loss_list: a list of lists. 68 | Each list contains the loss values for each token in the string 69 | ''' 70 | loss_list = [] 71 | for i in tqdm.tqdm(range(0, len(example_list), batch_size)): 72 | batch = example_list[i:i + batch_size] 73 | loss_list += raw_values_batch(model, tokenizer, batch) 74 | return loss_list 75 | 76 | def k_min_probs(loss_list, k=0.05, reverse=False): 77 | ''' 78 | This function takes a list of lists and returns the ppl of the k fraction smallest values in each list 79 | input: 80 | loss_list: a list of lists 81 | k: the fraction of smallest values to return 82 | 83 | output: 84 | k_min_prob: the mean probability of the k fraction smallest values in each list 85 | ''' 86 | # sort each list. if reverse is true, sort in reverse order (descending) 87 | sorted_list = [sorted(entry) for entry in loss_list] 88 | if reverse: 89 | sorted_list = [entry[::-1] for entry in sorted_list] 90 | k_min_probs = [] 91 | for entry in sorted_list: 92 | # get the k fraction smallest values 93 | num_values = max(1, int(len(entry)*k)) 94 | k_min = entry[:num_values] 95 | k_min_prob = sum(k_min)/len(k_min) 96 | k_min_probs.append(k_min_prob) 97 | return k_min_probs 98 | 99 | def perplexity(loss_list): 100 | ''' 101 | This function takes a list of lists and returns the perplexity of each list 102 | input: 103 | loss_list: a list of lists 104 | 105 | output: 106 | perplexity: the perplexity of each list 107 | ''' 108 | perplexity = [] 109 | for entry in loss_list: 110 | # calculate the mean of each list 111 | mean = sum(entry)/len(entry) 112 | # ppl is the exponent of the mean 113 | ppl = torch.exp(torch.tensor(mean)).item() 114 | perplexity.append(ppl) 115 | 116 | return perplexity 117 | 118 | def zlib_ratio(loss_list, example_list): 119 | ''' 120 | This function takes a list of lists and returns the ratio of the mean loss to the zlib compression of the input string 121 | input: 122 | loss_list: a list of lists 123 | example_list: a list of strings 124 | 125 | output: 126 | zlib_ratio: the ratio of the mean loss to the zlib compression of the input string 127 | ''' 128 | zlib_ratios = [] 129 | for i,entry in enumerate(loss_list): 130 | # calculate the mean of each list 131 | mean = sum(entry)/len(entry) 132 | # calculate the zlib compression of the input string 133 | zlib_entropy = len(zlib.compress(bytes(example_list[i], 'utf-8'))) 134 | # calculate the ratio 135 | ratio = mean/zlib_entropy 136 | zlib_ratios.append(ratio) 137 | return zlib_ratios 138 | 139 | def ppl_ratio(loss_list, reference_list): 140 | ''' 141 | This function takes a list of lists and returns the ratio of the mean loss to the perplexity of a reference model 142 | input: 143 | loss_list: a list of lists 144 | reference_list: a list of perplexity values, or a list of lists of loss values 145 | 146 | output: 147 | ratio: the ratio of the mean loss to the perplexity of the reference model 148 | ''' 149 | ratios = [] 150 | for (entry, entry_ref) in zip(loss_list, reference_list): 151 | # calculate the mean of each list 152 | mean_model = sum(entry)/len(entry) 153 | if type(entry_ref) == list: 154 | mean_ref = sum(entry_ref)/len(entry_ref) 155 | else: 156 | mean_ref = entry_ref 157 | # calculate the ratio 158 | ratio = mean_model/mean_ref 159 | ratios.append(ratio) 160 | 161 | return ratios 162 | 163 | def ppl_diff(loss_list, reference_list): 164 | ''' 165 | This function takes a list of lists and returns the difference of the mean loss to the perplexity of a reference model 166 | input: 167 | loss_list: a list of lists 168 | reference_list: a list of perplexity values, or a list of lists of loss values 169 | 170 | output: 171 | diff: the difference of the mean loss to the perplexity of the reference model 172 | ''' 173 | diffs = [] 174 | for (entry, entry_ref) in zip(loss_list, reference_list): 175 | # calculate the mean of each list 176 | mean_model = sum(entry)/len(entry) 177 | if type(entry_ref) == list: 178 | mean_ref = sum(entry_ref)/len(entry_ref) 179 | else: 180 | mean_ref = entry_ref 181 | # calculate the ratio 182 | diff = mean_model - mean_ref 183 | diffs.append(diff) 184 | 185 | return diffs 186 | 187 | 188 | def perturbation_ratio(model, tokenizer, dataset, loss_list, batch_size = 32): 189 | ''' 190 | Dataset({ 191 | features: ['text', 'synonym_substitution', 'butter_fingers', 'random_deletion', 'change_char_case', 'whitespace_perturbation', 'underscore_trick'], 192 | num_rows: 2000 193 | }) 194 | ''' 195 | result = {} 196 | for perturbation in dataset.column_names: 197 | if perturbation != "text": 198 | perturbed_list = dataset[perturbation] 199 | perturbed_loss_list = raw_values(model, tokenizer, perturbed_list, batch_size = batch_size) 200 | ratios = ppl_ratio(loss_list, perturbed_loss_list) 201 | diffs = ppl_diff(loss_list, perturbed_loss_list) 202 | result[f"ppl_ratio_{perturbation}"] = ratios 203 | result[f"ppl_diff_{perturbation}"] = diffs 204 | return result 205 | 206 | 207 | 208 | 209 | reference_model_registry = { 210 | "silo":"kernelmachine/silo-pdswby-1.3b", 211 | "tinystories-33M": "roneneldan/TinyStories-33M", 212 | "tinystories-1M": "roneneldan/TinyStories-1M", 213 | "phi-1_5": "microsoft/phi-1_5", 214 | # "phi-1": "microsoft/phi-1", 215 | } 216 | 217 | 218 | 219 | def aggregate_metrics(model, tokenizer, dataset, metric_list, args, batch_size = 32): 220 | ''' 221 | This function takes a list of strings and returns a dictionary of metrics 222 | input: 223 | model: the language model 224 | tokenizer: the tokenizer 225 | dataset: a huggingface dataset, with key "text" containing the strings 226 | metric_list: a list of metrics to calculate 227 | 228 | output: 229 | metrics: a dictionary of metrics 230 | ''' 231 | example_list = dataset["text"] 232 | loss_list = raw_values(model, tokenizer, example_list, batch_size = batch_size) 233 | 234 | metrics = {} 235 | if "ppl" in metric_list: 236 | metrics["ppl"] = perplexity(loss_list) 237 | if "k_min_probs" in metric_list: 238 | for k in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]: 239 | metrics[f"k_min_probs_{k}"] = k_min_probs(loss_list, k=k) 240 | if "k_max_probs" in metric_list: 241 | for k in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]: 242 | metrics[f"k_max_probs_{k}"] = k_min_probs(loss_list, k=k, reverse=True) 243 | if "zlib_ratio" in metric_list: 244 | metrics["zlib_ratio"] = zlib_ratio(loss_list, example_list) 245 | 246 | if "perturbation" in metric_list: 247 | ratios_dict = perturbation_ratio(model, tokenizer, dataset, loss_list, batch_size) 248 | metrics.update(ratios_dict) 249 | 250 | if "reference_model" in metric_list: 251 | # for computation efficiency, we now enforce that the reference model should already have been run and its ppl saved 252 | for model_name in reference_model_registry: 253 | hf_path = reference_model_registry[model_name] 254 | with open(f"results/{hf_path}/{args.dataset_name}_{args.split}_metrics.json", 'r') as f: 255 | metrics_train = json.load(f) 256 | ref_ppl = metrics_train["ppl"] 257 | ref_ratios = ppl_ratio(loss_list, ref_ppl) 258 | ref_diffs = ppl_diff(loss_list, ref_ppl) 259 | metrics[f"ref_ppl_ratio_{model_name}"] = ref_ratios 260 | metrics[f"ref_ppl_diff_{model_name}"] = ref_diffs 261 | 262 | ''' 263 | old code to run reference models on the fly 264 | from utils import prepare_model 265 | for model_name in reference_model_registry: 266 | hf_path = reference_model_registry[model_name] 267 | model, tokenizer = prepare_model(hf_path) 268 | 269 | reference_list = raw_values(model, tokenizer, example_list, batch_size = batch_size) 270 | metrics[f"ref_ppl_ratio_{model_name}"] = ppl_ratio(loss_list, reference_list) 271 | metrics[f"ref_ppl_diff_{model_name}"] = ppl_diff(loss_list, reference_list) 272 | ''' 273 | 274 | return metrics -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lm_dataformat 2 | nltk 3 | git+https://github.com/huggingface/transformers 4 | datasets -------------------------------------------------------------------------------- /results_reader.py: -------------------------------------------------------------------------------- 1 | # go to p_values/{model} 2 | # read every csv file 3 | # print: {csv_name}: Number of values < 0.1 = {number of values < 0.1} 4 | 5 | import sys 6 | model_name = sys.argv[1] 7 | import os 8 | import pandas as pd 9 | 10 | p_values_dir = f"p_values/{model_name}" 11 | p_values = {} 12 | for file in sorted(os.listdir(p_values_dir)): 13 | if file.endswith(".csv"): 14 | p_values[file] = pd.read_csv(f"{p_values_dir}/{file}") 15 | 16 | for key in p_values: 17 | print(f"{key}: Number of values < 0.1 = {len(p_values[key][p_values[key]['p_value'] < 0.1])}") -------------------------------------------------------------------------------- /scripts/data_creator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd .. 3 | 4 | for dataset in stackexchange wikipedia cc github pubmed_abstracts openwebtext2 freelaw math nih uspto hackernews enron books3 pubmed_central gutenberg arxiv bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers 5 | do 6 | echo "dataset: $dataset" 7 | python data_creator.py --dataset_name $dataset 8 | done -------------------------------------------------------------------------------- /scripts/di_launcher_individual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #SBATCH --output=slurm_output/slurm_%j.out # Standard output 3 | #SBATCH --error=slurm_output/slurm_%j.err # Standard error 4 | #SBATCH --cpus-per-task=4 5 | #SBATCH --gpus-per-node=A6000:1 6 | #SBATCH --tasks-per-node=1 7 | #SBATCH --mem=50G 8 | #SBATCH --time=20:00:00 9 | #SBATCH --mail-user=pratyus2@cs.cmu.edu 10 | #SBATCH --partition=general 11 | 12 | model_name=$1 13 | split_name=$2 14 | gpu_id=$3 15 | batch_size=$4 16 | dataset=$5 17 | 18 | source ~/.bashrc 19 | conda init 20 | 21 | # if model_name is "kernelmachine/silo-pdswby-1.3b", then conda activate di_silo 22 | # conda activate di 23 | 24 | if [ $model_name = "kernelmachine/silo-pdswby-1.3b" ] 25 | then 26 | conda activate di_silo 27 | else 28 | conda activate di 29 | fi 30 | 31 | cd /home/pratyus2/projects/llm_dataset_inference 32 | 33 | echo "model_name: $model_name" split_name: $split_name gpu_id: $gpu_id 34 | 35 | echo "dataset: $dataset" 36 | CUDA_VISIBLE_DEVICES=$gpu_id python di.py --split $split_name --dataset_name $dataset --model_name $model_name --batch_size $batch_size -------------------------------------------------------------------------------- /scripts/di_mega_launcher.sh: -------------------------------------------------------------------------------- 1 | # launch all the models on the inference dataset 2 | # "kernelmachine/silo-pdswby-1.3b", 3 | # "roneneldan/TinyStories-33M", 4 | # "roneneldan/TinyStories-1M", 5 | # "microsoft/phi-1_5", 6 | # "microsoft/phi-1", 7 | 8 | # for model_name in "roneneldan/TinyStories-33M" "roneneldan/TinyStories-1M" #"microsoft/phi-1_5" 9 | # for model_name in "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-1.3b-deduped" "EleutherAI/pythia-6.9b-deduped" 10 | # for model_name in "kernelmachine/silo-pdswby-1.3b" #(need different git repo for this model) 11 | num_jobs=0 12 | for model_name in "EleutherAI/pythia-410m" 13 | do 14 | if [ $model_name = "EleutherAI/pythia-6.9b" ] 15 | then 16 | batch_size=8 17 | else 18 | batch_size=32 19 | fi 20 | # batch_size=1 if model_name = "EleutherAI/pythia-12b" 21 | if [ $model_name = "EleutherAI/pythia-12b-deduped" ] 22 | then 23 | batch_size=1 24 | fi 25 | 26 | for dataset in bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers pubmed_abstracts math nih enron stackexchange wikipedia cc github openwebtext2 freelaw uspto hackernews books3 pubmed_central gutenberg arxiv # 27 | do 28 | for split_name in "train" "val" 29 | do 30 | num_jobs=$((num_jobs+1)) 31 | # wait if 8 jobs submitted 32 | if [ $num_jobs -eq 24 ] 33 | then 34 | echo "waiting for 8 jobs to complete" 35 | wait 36 | sleep 100s 37 | # check squeue if any process is running by user pratyus2 38 | while [ $(squeue -u pratyus2 | wc -l) -gt 1 ] 39 | do 40 | echo "waiting for 8 jobs to complete" 41 | sleep 10s 42 | done 43 | num_jobs=0 44 | fi 45 | sbatch di_launcher_individual.sh $model_name $split_name 0 $batch_size $dataset 46 | done 47 | # sbatch di_launcher_b.sh $model_name $split_name 0 $batch_size 48 | # sbatch di_launcher_c.sh $model_name $split_name 0 $batch_size 49 | # sbatch di_launcher_d.sh $model_name $split_name 0 $batch_size 50 | # sbatch di_launcher_e.sh $model_name $split_name 0 $batch_size 51 | # sbatch di_launcher_f.sh $model_name $split_name 0 $batch_size 52 | done 53 | done -------------------------------------------------------------------------------- /scripts/launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #SBATCH --output=slurm_output/slurm_%j.out # Standard output 3 | #SBATCH --error=slurm_output/slurm_%j.err # Standard error 4 | #SBATCH --cpus-per-task=4 5 | #SBATCH --tasks-per-node=1 6 | #SBATCH --mem=10G 7 | #SBATCH --time=1-10:00:00 8 | #SBATCH --partition=general 9 | #SBATCH --array=0-999 10 | 11 | 12 | datasets=(stackexchange wikipedia cc github pubmed_abstracts openwebtext2 freelaw math nih uspto hackernews enron books3 pubmed_central gutenberg arxiv bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers) 13 | outliers=("mean" "p-value") 14 | normalizes=("combined" "train" "no") 15 | features=("all" "selected") 16 | false_positives=(1 0) 17 | models=("EleutherAI/pythia-12b-deduped" "EleutherAI/pythia-12b" "EleutherAI/pythia-6.9b-deduped" "EleutherAI/pythia-6.9b" "EleutherAI/pythia-1.3b-deduped" "EleutherAI/pythia-1.3b" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-410m") 18 | 19 | #add 6000 to the array index to get the next model 20 | SLURM_ARRAY_TASK_ID=$((SLURM_ARRAY_TASK_ID + 1000)) 21 | 22 | # Calculate the array index 23 | dataset_idx=$((SLURM_ARRAY_TASK_ID % ${#datasets[@]})) 24 | outlier_idx=$((SLURM_ARRAY_TASK_ID / ${#datasets[@]} % ${#outliers[@]})) 25 | normalize_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]}) % ${#normalizes[@]})) 26 | feature_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]} * ${#normalizes[@]}) % ${#features[@]})) 27 | false_positive_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]} * ${#normalizes[@]} * ${#features[@]}) % ${#false_positives[@]})) 28 | model_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]} * ${#normalizes[@]} * ${#features[@]} * ${#false_positives[@]}))) 29 | 30 | dataset=${datasets[$dataset_idx]} 31 | outlier=${outliers[$outlier_idx]} 32 | normalize=${normalizes[$normalize_idx]} 33 | features=${features[$feature_idx]} 34 | false_positive=${false_positives[$false_positive_idx]} 35 | model_name=${models[$model_idx]} 36 | 37 | if [ $false_positive -eq 1 ]; then 38 | num_samples=500 39 | else 40 | num_samples=1000 41 | fi 42 | 43 | # model_name=$1 44 | # outlier=$2 45 | # normalize=$3 46 | # features=$4 47 | # false_positive=$5 48 | # num_samples=$6 49 | # dataset=$7 50 | 51 | echo model_name: $model_name outliers: $outlier normalize: $normalize features: $features false_positive: $false_positive num_samples: $num_samples dataset: $dataset 52 | 53 | 54 | source ~/.bashrc 55 | conda init 56 | conda activate di 57 | 58 | cd /home/pratyus2/projects/llm_dataset_inference 59 | 60 | python linear_di.py --num_random 10 --dataset_name $dataset --model_name $model_name --normalize $normalize --outliers $outlier --features $features --false_positive $false_positive --num_samples $num_samples 61 | -------------------------------------------------------------------------------- /scripts/mega_launcher.sh: -------------------------------------------------------------------------------- 1 | 2 | for dataset in stackexchange wikipedia cc github pubmed_abstracts openwebtext2 freelaw math nih uspto hackernews enron books3 pubmed_central gutenberg arxiv bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers 3 | do 4 | for outliers in "mean+p-value" "mean" "p-value" #"clip" "zero" "keep" "randomize" # 5 | do 6 | for normalize in "combined" "train" "no" 7 | do 8 | for features in "all" "selected" 9 | do 10 | for false_positive in 1 0 11 | do 12 | for model in "EleutherAI/pythia-12b-deduped" "EleutherAI/pythia-12b" "EleutherAI/pythia-6.9b-deduped" "EleutherAI/pythia-6.9b" "EleutherAI/pythia-1.3b-deduped" "EleutherAI/pythia-1.3b" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-410m" 13 | do 14 | # num_samples=500 if false_positive=1 else 1000 15 | if [ $false_positive -eq 1 ] 16 | then 17 | num_samples=500 18 | else 19 | num_samples=1000 20 | fi 21 | sbatch launcher.sh $model $outliers $normalize $features $false_positive $num_samples $dataset 22 | done 23 | wait 24 | done 25 | done 26 | 27 | done 28 | done 29 | done -------------------------------------------------------------------------------- /selected_features.py: -------------------------------------------------------------------------------- 1 | feature_list = [ 2 | "ppl", 3 | "k_min_probs_0.05", 4 | "k_min_probs_0.1", 5 | "k_min_probs_0.2", 6 | "k_min_probs_0.3", 7 | "k_min_probs_0.4", 8 | "k_min_probs_0.5", 9 | "k_min_probs_0.6", 10 | "k_max_probs_0.05", 11 | "k_max_probs_0.1", 12 | "k_max_probs_0.2", 13 | "k_max_probs_0.3", 14 | "k_max_probs_0.4", 15 | "k_max_probs_0.5", 16 | "k_max_probs_0.6", 17 | "zlib_ratio", 18 | "ppl_ratio_synonym_substitution", 19 | "ppl_ratio_butter_fingers", 20 | "ppl_ratio_random_deletion", 21 | "ppl_ratio_change_char_case", 22 | "ppl_ratio_whitespace_perturbation", 23 | "ppl_ratio_underscore_trick", 24 | "ref_ppl_ratio_silo", 25 | "ref_ppl_ratio_tinystories-33M", 26 | "ref_ppl_ratio_tinystories-1M", 27 | "ref_ppl_ratio_phi-1_5", 28 | ] -------------------------------------------------------------------------------- /transform.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | sys.path.append("NL-Augmenter") 3 | 4 | # pip install spacy torchtext cucco fastpunct sacremoses 5 | # python -m spacy download en_core_web_sm 6 | 7 | 8 | from nlaugmenter.transformations.butter_fingers_perturbation.transformation import ButterFingersPerturbation 9 | from nlaugmenter.transformations.random_deletion.transformation import RandomDeletion 10 | from nlaugmenter.transformations.synonym_substitution.transformation import SynonymSubstitution 11 | from nlaugmenter.transformations.back_translation.transformation import BackTranslation 12 | from nlaugmenter.transformations.change_char_case.transformation import ChangeCharCase 13 | from nlaugmenter.transformations.whitespace_perturbation.transformation import WhitespacePerturbation 14 | from nlaugmenter.transformations.underscore_trick.transformation import UnderscoreTrick 15 | from nlaugmenter.transformations.style_paraphraser.transformation import StyleTransferParaphraser 16 | from nlaugmenter.transformations.punctuation.transformation import PunctuationWithRules 17 | 18 | 19 | 20 | 21 | def aug_generator(text_list, aug_style): 22 | 23 | if aug_style == "butter_fingers": 24 | t1 = ButterFingersPerturbation(max_outputs=1) 25 | return [t1.generate(text_list[i], prob = 0.1)[0] for i in range(len(text_list))] 26 | elif aug_style == "random_deletion": 27 | t1 = RandomDeletion(prob=0.25) 28 | return [t1.generate(text_list[i])[0] for i in range(len(text_list))] 29 | elif aug_style == "synonym_substitution": 30 | syn = SynonymSubstitution(max_outputs=1, prob = 0.2) 31 | return [syn.generate(text_list[i])[0] for i in range(len(text_list))] 32 | elif aug_style == "back_translation": 33 | trans = BackTranslation() 34 | return [trans.generate(text_list[i])[0] for i in range(len(text_list))] 35 | elif aug_style == "change_char_case": 36 | t1 = ChangeCharCase() 37 | return [t1.generate(text_list[i], prob = 0.25)[0] for i in range(len(text_list))] 38 | elif aug_style == "whitespace_perturbation": 39 | t1 = WhitespacePerturbation() 40 | return [t1.generate(text_list[i], prob = 0.25)[0] for i in range(len(text_list))] 41 | elif aug_style == "underscore_trick": 42 | t1 = UnderscoreTrick(prob = 0.25) 43 | return [t1.generate(text_list[i])[0] for i in range(len(text_list))] 44 | elif aug_style == "style_paraphraser": 45 | t1 = StyleTransferParaphraser(style = "Basic", upper_length="same_5") 46 | return [t1.generate(text_list[i])[0] for i in range(len(text_list))] 47 | elif aug_style == "punctuation_perturbation": 48 | normalizations = ['remove_extra_white_spaces', ('replace_characters', {'characters': 'was', 'replacement': 'TZ'}), 49 | ('replace_emojis', {'replacement': 'TESTO'})] 50 | punc = PunctuationWithRules(rules=normalizations) 51 | return [punc.generate(text_list[i])[0] for i in range(len(text_list))] 52 | else: 53 | raise ValueError("Augmentation style not found. Please check the available styles.") 54 | 55 | def generate_perturbations(text_list): 56 | augmentation_styles = ["synonym_substitution", "butter_fingers", "random_deletion", "change_char_case", "whitespace_perturbation", "underscore_trick"] 57 | all_augmented = {} 58 | for style in augmentation_styles: 59 | start = time.time() 60 | aug_list = aug_generator(text_list, style) 61 | all_augmented[style] = aug_list 62 | print(f"Perturbing with {style} took {time.time() - start} seconds") 63 | return all_augmented 64 | 65 | if __name__ == "__main__": 66 | text_list = ["This is a test sentence. It is a good sentence.", "This is another test sentence. It is a bad sentence."] 67 | print(generate_perturbations(text_list)) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | import torch 3 | 4 | def prepare_model(model_name, cache_dir, quant=None): 5 | tokenizer = AutoTokenizer.from_pretrained(model_name) 6 | # pad token 7 | tokenizer.pad_token = tokenizer.eos_token 8 | tokenizer.padding_side = "right" 9 | tokenizer.model_max_length = 512 10 | 11 | if quant is not None: 12 | model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True).cuda() 13 | elif quant == "fp16": 14 | model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda() 15 | elif quant == "8bit": 16 | model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True, load_in_8bit=True).cuda() 17 | 18 | print("Model loaded") 19 | return model, tokenizer --------------------------------------------------------------------------------