├── .gitignore
├── LICENSE
├── README.md
├── analysis.py
├── correction_script.py
├── data_creator.py
├── dataloader.py
├── demo.ipynb
├── di.py
├── files
│   └── llm-dataset-inference-overview.png
├── linear_di.py
├── metrics.py
├── requirements.txt
├── results_reader.py
├── scripts
│   ├── data_creator.sh
│   ├── di_launcher_individual.sh
│   ├── di_mega_launcher.sh
│   ├── launcher.sh
│   └── mega_launcher.sh
├── selected_features.py
├── transform.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # project specific
2 | plots_p/
3 | plots_p_w/
4 | data/
5 | scripts/slurm_output/
6 | results/
7 | aggregated_results/
8 |
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | share/python-wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .nox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | *.py,cover
59 | .hypothesis/
60 | .pytest_cache/
61 | cover/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | .pybuilder/
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | # For a library or package, you might want to ignore these files since the code is
96 | # intended to run in multiple environments; otherwise, check them in:
97 | # .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # poetry
107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more
109 | # commonly ignored for libraries.
110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 |
113 | # pdm
114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | # in version control.
118 | # https://pdm.fming.dev/#use-with-ide
119 | .pdm.toml
120 |
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 |
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 |
128 | # SageMath parsed files
129 | *.sage.py
130 |
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 |
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 |
144 | # Rope project settings
145 | .ropeproject
146 |
147 | # mkdocs documentation
148 | /site
149 |
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 |
155 | # Pyre type checker
156 | .pyre/
157 |
158 | # pytype static type analyzer
159 | .pytype/
160 |
161 | # Cython debug symbols
162 | cython_debug/
163 |
164 | # PyCharm
165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | # and can be added to the global gitignore or merged into this file. For a more nuclear
168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Pratyush Maini
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM Dataset Inference: Did you train on my dataset?
2 |
3 |
4 | 
5 |
6 |
7 | The proliferation of large language models (LLMs) in the real world has come with a rise in copyright
8 | cases against companies for training their models on unlicensed data from the internet. Recent works
9 | have presented methods to identify if individual text sequences were members of the model’s training
10 | data, known as membership inference attacks (MIAs). We demonstrate that the apparent success of
11 | these MIAs is confounded by selecting non-members (text sequences not used for training) belonging to
12 | a different distribution from the members (e.g., temporally shifted recent Wikipedia articles compared
13 | with ones used to train the model). This distribution shift makes membership inference appear successful.
14 | However, most MIA methods perform no better than random guessing when discriminating between
15 | members and non-members from the same distribution (e.g., in this case, the same period of time).
16 | Even when MIAs work, we find that different MIAs succeed at inferring membership of samples from
17 | different distributions. Instead, we propose a new dataset inference method to accurately identify
18 | the datasets used to train large language models. This paradigm sits realistically in the modern-day
19 | copyright landscape, where authors claim that an LLM is trained over multiple documents (such as a
20 | book) written by them, rather than one particular paragraph. While dataset inference shares many
21 | of the challenges of membership inference, we solve it by selectively combining the MIAs that provide
22 | positive signal for a given distribution, and aggregating them to perform a statistical test on a given
23 | dataset. Our approach successfully distinguishes the train and test sets of different subsets of the Pile
24 | with statistically significant p-values < 0.1, without any false positives.
25 |
26 | ## Data Used
27 |
28 | This repository contains data from different subsets of the PILE, divided into train and val sets. The data is in the form of a JSON file, with each entry containing the raw text, as well as various kinds of perturbations applied to it. The dataset is intended to facilitate privacy research in language models, where the perturbed data can be used as a reference to detect the presence of a particular dataset in the training data of a language model.
29 |
30 | ## Quick Links
31 |
32 | - [**arXiv Paper**](): Detailed information about the Dataset Inference V2 project, including the dataset, results, and additional resources.
33 | - [**GitHub Repository**](): Access the source code, evaluation scripts, and additional resources for Dataset Inference.
34 | - [**Dataset on Hugging Face**](https://huggingface.co/datasets/pratyushmaini/llm_dataset_inference): Direct link to download the various versions of the PILE dataset.
35 | - [**Summary on Twitter**](): A concise summary and key takeaways from the project.
36 |
37 |
38 | ## Applicability 🚀
39 |
40 | The dataset is in text format and can be loaded using the Hugging Face `datasets` library. It can be used to evaluate any causal or masked language model for the presence of specific datasets in its training pool. The dataset is *not* intended for direct use in training models, but rather for evaluating the privacy of language models. Please keep the validation sets and the perturbed train sets private, and do not use them to train models.
41 |
42 | ## Loading the Dataset
43 |
44 | To load the dataset, use the following code:
45 |
46 | ```python
47 | from datasets import load_dataset
48 | dataset = load_dataset("pratyushmaini/llm_dataset_inference", subset="wikipedia", split="train")
49 | ```
50 |
51 | ### Available perturbations:
52 |
53 | We use the NL-Augmenter library to apply the following perturbations to the data (an example of reading them back follows the list):
54 | - `synonym_substitution`: Synonym substitution of words in the sentence.
55 | - `butter_fingers`: Randomly changing characters from the sentence.
56 | - `random_deletion`: Randomly deleting words from the sentence.
57 | - `change_char_case`: Randomly changing the case of characters in the sentence.
58 | - `whitespace_perturbation`: Randomly adding or removing whitespace from the sentence.
59 | - `underscore_trick`: Adding underscores to the sentence.
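
Each perturbed variant is stored alongside the original `text` field, mirroring the JSON lines written by `data_creator.py`. A minimal sketch of reading one back (it reuses the loading call above, and assumes the column names match the perturbation names listed here):

```python
from datasets import load_dataset

dataset = load_dataset("pratyushmaini/llm_dataset_inference", subset="wikipedia", split="train")
example = dataset[0]
print(example["text"])                  # original text
print(example["synonym_substitution"])  # perturbed variant of the same text
```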
60 |
61 | ## Demo
62 |
63 | Run `demo.ipynb` to conduct LLM Dataset Inference attacks on your own data and/or to understand the code.
64 |
65 | ## Citing Our Work
66 |
67 | If you find our codebase and dataset beneficial, please cite our work:
68 | ```
69 | @misc{mainidi2024,
70 | title={LLM Dataset Inference: Did you train on my dataset?},
71 | author={Pratyush Maini and Hengrui Jia and Nicolas Papernot and Adam Dziedzic},
72 | year={2024},
73 | archivePrefix={arXiv},
74 | primaryClass={cs.LG}
75 | }
76 | ```
77 |
--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file calculates p values by loading the json from results
3 | '''
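# Example invocation (paths follow the defaults used below; assumes the per-sample
# metrics JSON files have already been produced):
#   python analysis.py --model_name EleutherAI/pythia-12b --dataset_name wikipedia
# This reads new_results/<model_name>/<dataset_name>_{train,val}_metrics.json and
# appends the resulting p-values to p_values/<model_name>/<metric>.csv.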
4 | import json, os
5 | import argparse
6 | import numpy as np
7 | from scipy.stats import ttest_ind, chi2, norm
8 |
9 |
10 | def get_args():
11 | parser = argparse.ArgumentParser(description='Dataset Inference on a language model')
12 | parser.add_argument('--model_name', type=str, default="EleutherAI/pythia-12b", help='The name of the model to use')
13 | parser.add_argument('--dataset_name', type=str, default="wikipedia", help='The name of the dataset to use')
14 | parser.add_argument('--num_samples', type=int, default=1000, help='The number of samples to use')
15 | parser.add_argument('--batch_size', type=int, default=32, help='The batch size to use')
16 | args = parser.parse_args()
17 | return args
18 |
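# Fisher's method: under the null hypothesis, -2 * sum(log p_i) follows a chi-square
# distribution with 2k degrees of freedom (k = number of p-values), so the combined
# p-value is the chi-square survival function evaluated at that statistic.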
19 | def fishers_method(p_values):
20 | statistic = -2 * np.sum(np.log(p_values))
21 | combined_p_value = chi2.sf(statistic, 2 * len(p_values))
22 | return combined_p_value
23 |
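# Harmonic mean of p-values: a single summary of the per-split p-values computed by
# get_p_values_averaged below.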
24 | def harmonic_mean(p_values):
25 | return len(p_values) / np.sum(1. / np.array(p_values))
26 |
27 | def get_p_values_averaged(list1, list2):
28 | # make 10 random samples of the two lists by sampling without replacement
29 | num_elements = min(len(list1), len(list2))
30 | num_elements_per_sample = int(num_elements/10)
31 | # randomly permute the two lists
32 | np.random.shuffle(list1)
33 | np.random.shuffle(list2)
34 | p_values = []
35 | for i in range(10):
36 | sample1 = list1[i*num_elements_per_sample:(i+1)*num_elements_per_sample]
37 | sample2 = list2[i*num_elements_per_sample:(i+1)*num_elements_per_sample]
38 | t_stat, p_value = ttest_ind(sample1, sample2)
39 | p_values.append(p_value)
40 |
41 | return harmonic_mean(p_values)
42 |
43 | def get_p_values(list1, list2):
44 | t_stat, p_value = ttest_ind(list1, list2)
45 | return p_value
46 |
47 | def main():
48 | args = get_args()
49 | with open(f"new_results/{args.model_name}/{args.dataset_name}_train_metrics.json", 'r') as f:
50 | metrics_train = json.load(f)
51 | with open(f"new_results/{args.model_name}/{args.dataset_name}_val_metrics.json", 'r') as f:
52 | metrics_val = json.load(f)
53 |
54 | keys = list(metrics_train.keys())
55 | p_values = {}
56 | for key in keys:
57 | # remove the top 2.5% and bottom 2.5% of the data (note: these trimmed, shuffled copies are not used for the final p-value below, which is computed on the full lists)
58 | metrics_train_key = np.array(metrics_train[key])
59 | metrics_val_key = np.array(metrics_val[key])
60 | metrics_train_key = metrics_train_key[np.argsort(metrics_train_key)]
61 | metrics_val_key = metrics_val_key[np.argsort(metrics_val_key)]
62 | metrics_train_key = metrics_train_key[int(0.025*len(metrics_train_key)):int(0.975*len(metrics_train_key))]
63 | metrics_val_key = metrics_val_key[int(0.025*len(metrics_val_key)):int(0.975*len(metrics_val_key))]
64 | # shuffle the data
65 | np.random.shuffle(metrics_train_key)
66 | np.random.shuffle(metrics_val_key)
67 | # get the p value
68 | # t_stat, p_value = ttest_ind(metrics_train_key, metrics_val_key)
69 |
70 |
71 | p_values[key] = get_p_values(metrics_train[key], metrics_val[key])
72 |
73 | # add the p_values to the csv at p_values/{args.model_name}/{key}.csv, creating the file if it does not exist
74 | os.makedirs(f"p_values/{args.model_name}", exist_ok=True)
75 | for key in p_values:
76 | p_file = f"p_values/{args.model_name}/{key}.csv"
77 | if not os.path.exists(p_file):
78 | with open(p_file, 'w') as f:
79 | f.write("dataset_name,p_value\n")
80 |
81 | # check if the dataset_name is already in the file
82 | flag = 0
83 | with open(p_file, 'r') as f:
84 | lines = f.readlines()
85 | for line in lines:
86 | if args.dataset_name in line:
87 | print(f"Dataset {args.dataset_name} already in file {p_file}. Aborting...")
88 | flag = 1
89 |
90 | if flag == 0:
91 | with open(p_file, 'a') as f:
92 | f.write(f"{args.dataset_name},{p_values[key]}\n")
93 |
94 | if __name__ == "__main__":
95 | main()
--------------------------------------------------------------------------------
/correction_script.py:
--------------------------------------------------------------------------------
1 | """
2 | There were certain inconsistencies in the use of ppl and likelihood in the code.
3 | This script corrects all saved results to account for them.
4 | """
5 |
6 | import glob
7 | import json
8 | import os
9 | import torch
10 |
11 | # get all result files for the model being corrected (here, "results/EleutherAI/pythia-410m/*.json")
12 | file_list = glob.glob("results/EleutherAI/pythia-410m/*.json")
13 |
14 | '''
15 | dict_keys(['ppl', 'k_min_probs_0.05', 'k_min_probs_0.1', 'k_min_probs_0.2', 'k_min_probs_0.3', 'k_min_probs_0.4', 'k_min_probs_0.5', 'k_min_probs_0.6', 'k_max_probs_0.05', 'k_max_probs_0.1', 'k_max_probs_0.2', 'k_max_probs_0.3', 'k_max_probs_0.4', 'k_max_probs_0.5', 'k_max_probs_0.6', 'zlib_ratio', 'ppl_ratio_synonym_substitution', 'ppl_diff_synonym_substitution', 'ppl_ratio_butter_fingers', 'ppl_diff_butter_fingers', 'ppl_ratio_random_deletion', 'ppl_diff_random_deletion', 'ppl_ratio_change_char_case', 'ppl_diff_change_char_case', 'ppl_ratio_whitespace_perturbation', 'ppl_diff_whitespace_perturbation', 'ppl_ratio_underscore_trick', 'ppl_diff_underscore_trick', 'ref_ppl_ratio_silo', 'ref_ppl_diff_silo', 'ref_ppl_ratio_tinystories-33M', 'ref_ppl_diff_tinystories-33M', 'ref_ppl_ratio_tinystories-1M', 'ref_ppl_diff_tinystories-1M', 'ref_ppl_ratio_phi-1_5', 'ref_ppl_diff_phi-1_5'])
16 | '''
17 |
18 |
19 |
20 | # iterate over all files
21 | for file in file_list:
22 | with open(file, 'r') as f:
23 | metrics = json.load(f)
24 | ppl_list = torch.tensor(metrics['ppl'])
25 | loss_list = torch.log(ppl_list)
26 | keys = list(metrics.keys())
27 | for key in keys:
28 | if "ref_ppl_ratio" in key:
29 | current_ratio = torch.tensor(metrics[key]) # loss_list / ref_ppl
30 | ref_ppl = loss_list / current_ratio
31 | ppl_ratio = ppl_list / ref_ppl
32 | loss_ratio = torch.log(ref_ppl) / loss_list
33 | metrics[key] = ppl_ratio.tolist()
34 | metrics[key.replace("ppl", "loss")] = loss_ratio.tolist()
35 | elif "ref_ppl_diff" in key:
36 | current_diff = torch.tensor(metrics[key]) # loss_list - ref_ppl
37 | ref_ppl = loss_list - current_diff
38 | ppl_diff = ppl_list - ref_ppl
39 | loss_diff = torch.log(ref_ppl) - loss_list
40 | metrics[key] = ppl_diff.tolist()
41 | metrics[key.replace("ppl", "loss")] = loss_diff.tolist()
42 | elif "ppl_ratio" in key:
43 | current_ratio = torch.tensor(metrics[key])
44 | perturbation_loss = loss_list / current_ratio
45 | perturbation_ppl = torch.exp(perturbation_loss)
46 | ppl_ratio = ppl_list / perturbation_ppl
47 | loss_ratio = perturbation_loss / loss_list
48 | metrics[key] = ppl_ratio.tolist()
49 | metrics[key.replace("ppl", "loss")] = loss_ratio.tolist()
50 | elif "ppl_diff" in key:
51 | current_diff = torch.tensor(metrics[key])
52 | perturbation_loss = loss_list - current_diff
53 | perturbation_ppl = torch.exp(perturbation_loss)
54 | ppl_diff = ppl_list - perturbation_ppl
55 | loss_diff = perturbation_loss - loss_list
56 | metrics[key] = ppl_diff.tolist()
57 | metrics[key.replace("ppl", "loss")] = loss_diff.tolist()
58 |
59 | # save the new file at "new_results/EleutherAI/*/*.json"
60 | new_file = file.replace("results", "new_results")
61 | os.makedirs(os.path.dirname(new_file), exist_ok=True)
62 | with open(new_file, 'w') as f:
63 | json.dump(metrics, f)
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/data_creator.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file is used to convert data from the PILE to a huggingface dataset.
3 | This file will also call various perturbations, and add perturbed versions of the data to the dataset as different subsets.
4 | '''
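# Each output line in data/<dataset_name>_{train,val}.jsonl has the form
#   {"text": <original text>, "<perturbation_style>": <perturbed text>, ...}
# Example invocation (a single subset instead of the default "all"):
#   python data_creator.py --dataset_names wikipedia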
5 | from dataloader import load_data, pile_mapper
6 | from transform import generate_perturbations
7 | import os
8 | import json
9 |
10 |
11 | def main(args):
12 | root = os.getcwd() + "/data"
13 | os.makedirs(root, exist_ok=True)
14 |
15 | if args.dataset_names == "all":
16 | dataset_names = pile_mapper.keys()
17 | else:
18 | dataset_names = [args.dataset_names]
19 |
20 | for dataset_name in dataset_names:
21 | for split in ["train", "val"]:
22 | file_name = f"{root}/{dataset_name}_{split}.jsonl"
23 | # load the data
24 | num_samples = 2000
25 | raw_texts = load_data(dataset_name, split, num_samples)
26 | print(f"Data loaded for {dataset_name} {split} | {len(raw_texts)} samples")
27 | # add the perturbations
28 | perturbed_texts_dictionary = generate_perturbations(raw_texts)
29 | perturbation_styles = list(perturbed_texts_dictionary.keys())
30 |
31 | #save all the texts to a json lines file
32 | with open(file_name, "w") as f:
33 | for i, text in enumerate(raw_texts):
34 | json_line = {}
35 | json_line["text"] = text
36 | for style in perturbation_styles:
37 | json_line[style] = perturbed_texts_dictionary[style][i]
38 | f.write(json.dumps(json_line) + "\n")
39 | print(f"Data saved to {file_name}")
40 |
41 |
42 |
43 | if __name__ == "__main__":
44 | import argparse
45 | parser = argparse.ArgumentParser()
46 | parser.add_argument("--dataset_names", type=str, default="all")
47 | args = parser.parse_args()
48 |
49 | main(args)
50 |
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import json
3 |
4 | import lm_dataformat
5 | import numpy as np
6 | import nltk
7 |
8 | # nltk.download('punkt')  # uncomment and run once if the punkt tokenizer is not installed
9 | nltk_sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
10 |
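# pile_mapper maps the short dataset names used on the command line to the
# 'pile_set_name' values stored in each Pile document's metadata.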
11 | pile_mapper = { "stackexchange":"StackExchange",
12 | "wikipedia":"Wikipedia (en)",
13 | "cc":"Pile-CC",
14 | "github":"Github",
15 | "pubmed_abstracts":"PubMed Abstracts",
16 | "openwebtext2":"OpenWebText2",
17 | "freelaw":"FreeLaw",
18 | "math":"DM Mathematics",
19 | "nih":"NIH ExPorter",
20 | "uspto":"USPTO Backgrounds",
21 | "hackernews":"HackerNews",
22 | "enron":'Enron Emails',
23 | "books3": 'Books3',
24 | "pubmed_central": 'PubMed Central',
25 | "gutenberg":'Gutenberg (PG-19)',
26 | "arxiv":'ArXiv',
27 | "bookcorpus2":'BookCorpus2',
28 | "opensubtitles":'OpenSubtitles',
29 | "youtubesubtitles":'YoutubeSubtitles',
30 | "ubuntu":'Ubuntu IRC',
31 | "europarl":'EuroParl',
32 | "philpapers":'PhilPapers'}
33 |
34 | def split_paragraph(paragraph, max_sentences = 10):
35 | sentences = nltk_sentence_tokenizer.tokenize(paragraph)
36 | new_paragraphs = []
37 | for i in range(0, len(sentences), max_sentences):
38 | new_para = " ".join(sentences[i:i + max_sentences])
39 | new_paragraphs.append(new_para)
40 | return new_paragraphs
41 |
42 | def generate_pile_zst(subset, num_samples=5000, split = "val"):
43 | if subset.startswith("pile_"):
44 | subset = subset[5:]
45 | file_path = f"/data/the_pile/{split}.jsonl.zst"
46 | subset_key = pile_mapper[subset]
47 | texts = []
48 | num_docs = 0
49 | reader = lm_dataformat.Reader(file_path)
50 | for count, doc in enumerate(tqdm(reader.stream_data(get_meta=True))):
51 | if doc[1]['pile_set_name'] == subset_key:
52 | if len(doc[0].split(" ")) < 10:
53 | continue
54 | texts.append(doc[0])
55 | num_docs += 1
56 | if num_docs >= num_samples:
57 | break
58 | return texts
59 |
60 | def generate_pile_jsonl(subset, num_samples=5000):
61 | if subset.startswith("pile_"):
62 | subset = subset[5:]
63 | file_path = "/data/the_pile/combined.jsonl"
64 | subset_key = pile_mapper[subset]
65 | texts = []
66 | num_texts = 0
67 | with open(file_path, 'r', encoding="utf-8") as json_file:
68 | for line in json_file:
69 | json_data = json.loads(line)
70 | if 'text' in json_data:
71 | if json_data['meta']['pile_set_name'] == subset_key:
72 | if len(json_data['text'].split(" ")) < 800:
73 | continue
74 | texts.append(json_data['text'])
75 | num_texts += 1
76 | if num_texts == num_samples:
77 | break
78 | return texts
79 |
80 | def generate_c4(num_samples=500):
81 | # trove mount dataset/C4_subset@1.0.0 ./data
82 | file = "data/C4_subset-1.0.0/data/raw/c4-train.00000-of-01024.json"
83 | texts = []
84 | num_texts = 0
85 | with open(file, 'r', encoding="utf-8") as json_file:
86 | for line in json_file:
87 | json_data = json.loads(line)
88 | if 'text' in json_data:
89 | texts.append(json_data['text'])
90 | num_texts += 1
91 | if num_texts == num_samples:
92 | break
93 | return texts
94 |
95 | def split_long_texts_by_paragraph(texts, num_samples):
96 | if len(texts) < num_samples:
97 | print(f"initial texts {len(texts)} were less than num_samples {num_samples}. Further splitting")
98 | # split each text into smaller chunks of at most 3 sentences each
99 | required_from_each = 2*(num_samples//len(texts) + 1)
100 | new_texts = []
101 | for text in texts:
102 | new_texts += split_paragraph(text, max_sentences=3)[:required_from_each]
103 |
104 | texts = new_texts
105 |
106 | print(f"Length of texts {len(texts)}")
107 | return texts
108 |
109 | def split_long_texts(texts, num_samples, seq_length, tokenizer = None):
110 | '''
111 | This function splits long texts into smaller texts of length seq_length
112 | 1. Concatenate all the texts together
113 | 2. Convert everything to tokens
114 | 3. divide into chunks of seq_length
115 | 4. Convert back to text
116 | 5. return the list of texts
117 | '''
118 | if tokenizer is None:
119 | from transformers import AutoTokenizer
120 | tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped")
121 |
122 | #concatenate all the texts
123 | all_text = " ".join(texts)
124 | #tokenize
125 | tokens = tokenizer.encode(all_text, return_tensors="pt")[0]
126 | #divide into chunks
127 | chunk_length = seq_length
128 | num_chunks = len(tokens)//chunk_length
129 | new_texts = []
130 |
131 | for i in range(num_chunks):
132 | chunk = tokens[i*chunk_length:(i+1)*chunk_length]
133 | text = tokenizer.decode(chunk)
134 | new_texts.append(text)
135 |
136 | # randomize the order and return only num_samples
137 | np.random.seed(11)
138 | np.random.shuffle(new_texts)
139 | new_texts = new_texts[:num_samples]
140 |
141 | return new_texts
142 |
143 | def load_data(dataset_name, split, num_samples = 1000, seq_length = 512):
144 | if "enron" in dataset_name:
145 | seq_length = 64
146 | if "nih" in dataset_name:
147 | seq_length = 64
148 | if "pubmed_abstracts" in dataset_name:
149 | seq_length = 32
150 |
151 | if split == "train":
152 | texts = generate_pile_jsonl(dataset_name, num_samples=num_samples*5)
153 | texts = split_long_texts(texts, num_samples,seq_length)
154 | else:
155 | assert split == "val"
156 | texts = generate_pile_zst(dataset_name, num_samples=num_samples*5)
157 | texts = split_long_texts(texts, num_samples,seq_length)
158 | print (f"Loaded {len(texts)} samples from {dataset_name} {split}")
159 | return texts
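
# Example (assumes the local Pile dumps referenced in generate_pile_jsonl / generate_pile_zst exist):
#   texts = load_data("wikipedia", "train", num_samples=1000)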
160 |
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# [LLM Dataset Inference Demo](#toc0_)\n",
8 | "\n",
9 | ""
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "**Table of contents** \n",
17 | "- [LLM Dataset Inference Demo](#toc1_) \n",
18 | "- [Step 0: Make Splits A and B](#toc2_) \n",
19 | "- [Step 1: Aggregate Features with MIAs](#toc3_) \n",
20 | "- [Step 2: Learn MIA Correlations](#toc4_) \n",
21 | " - [Step 2.1 Remove Outliers](#toc4_1_) \n",
22 | " - [Step 2.2: Learn the weights of each feature](#toc4_2_) \n",
23 | "- [Step 3: Dataset Inference](#toc5_) \n",
24 | "\n",
25 | "\n",
32 | ""
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "This notebook will show you how to conduct LLM Dataset Inference on your own data.\n",
40 | "\n",
41 | "As you can see in the figure above, the process is divided into four steps:\n",
42 | "\n",
43 | "1. Generate Features with MIAs.\n",
44 | "2. Learn a linear classifier that assigns the importance of each feature to classify the membership of a text.\n",
45 | "3. Perform Dataset Inference on your Data\n",
46 | " - a. Generate MIA features\n",
47 | " - b. Run the linear classifier on those MIA features\n",
48 | " - c. Conduct statistical tests on the ouputs of the linear classifier to determine whether there is a significance difference between them. "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 1,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "from utils import prepare_model\n",
58 | "from metrics import aggregate_metrics\n",
59 | "from datasets import load_dataset"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# model_name = \"EleutherAI/pythia-410m-deduped\"\n",
69 | "# model_name = \"EleutherAI/pythia-2.8b\"\n",
70 | "model_name = \"EleutherAI/pythia-6.9b\"\n",
71 | "# model_name = \"EleutherAI/pythia-12b\"\n",
72 | "cache_dir = \"/tmp\""
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stderr",
82 | "output_type": "stream",
83 | "text": [
84 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
85 | ]
86 | },
87 | {
88 | "data": {
89 | "application/vnd.jupyter.widget-view+json": {
90 | "model_id": "c3c524b4c3854fe18535935300c6a45a",
91 | "version_major": 2,
92 | "version_minor": 0
93 | },
94 | "text/plain": [
95 | "Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
96 | ]
97 | },
98 | "metadata": {},
99 | "output_type": "display_data"
100 | },
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "Model loaded\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "# quantization options: None, fp16, 8bit (needs accelerate)\n",
111 | "llm, tokenizer = prepare_model(model_name, cache_dir=cache_dir, quant=\"fp16\")"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "# [Step 0: Make Splits A and B](#toc0_)\n",
119 | "\n",
120 | "Splits A from members and non-members are used to trained the NN.\n",
121 | "\n",
122 | "Splits B from members and non-members are used to perform Dataset Inference (DI)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 4,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from datasets import load_dataset\n",
132 | "\n",
133 | "ds = load_dataset(\"haritzpuerto/the_pile_arxiv_50k_sample\")"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "DatasetDict({\n",
145 | " train: Dataset({\n",
146 | " features: ['text', 'meta'],\n",
147 | " num_rows: 50000\n",
148 | " })\n",
149 | " validation: Dataset({\n",
150 | " features: ['text', 'meta'],\n",
151 | " num_rows: 2434\n",
152 | " })\n",
153 | " test: Dataset({\n",
154 | " features: ['text', 'meta'],\n",
155 | " num_rows: 2407\n",
156 | " })\n",
157 | "})"
158 | ]
159 | },
160 | "execution_count": 5,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "ds"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 6,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "A_members = ds['train'].select(range(0, 1000))\n",
176 | "A_nonmembers = ds['validation'].select(range(1000))\n",
177 | "\n",
178 | "B_members = ds['train'].select(range(1000, 2000))\n",
179 | "B_nonmembers = ds['validation'].select(range(1000, 2000))"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "# [Step 1: Aggregate Features with MIAs](#toc0_)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 7,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "metric_list = [\"k_min_probs\", \"ppl\", \"zlib_ratio\", \"k_max_probs\"]"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 8,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "batch_size = 2"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 9,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stderr",
214 | "output_type": "stream",
215 | "text": [
216 | "100%|██████████| 500/500 [09:40<00:00, 1.16s/it]\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "A_members_metrics = aggregate_metrics(llm, tokenizer, A_members, metric_list, None, batch_size=batch_size)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 10,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "data": {
231 | "text/plain": [
232 | "1000"
233 | ]
234 | },
235 | "execution_count": 10,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "len(A_members_metrics['ppl'])"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 11,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "name": "stderr",
251 | "output_type": "stream",
252 | "text": [
253 | "100%|██████████| 500/500 [09:39<00:00, 1.16s/it]\n"
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "A_nonmembers_metrics = aggregate_metrics(llm, tokenizer, A_nonmembers, metric_list, None, batch_size=batch_size)"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "# [Step 2: Learn MIA Correlations](#toc0_)\n",
266 | "\n",
267 | "In this stage, we train a linear regressor to learn the importance of weights for different MIA attacks to use for the final dataset inference procedure. "
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 12,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "import numpy as np\n",
277 | "import pandas as pd\n",
278 | "from scipy.stats import ttest_ind, chi2, norm\n",
279 | "import torch\n",
280 | "import torch.nn as nn\n",
281 | "from tqdm import tqdm\n",
282 | "from selected_features import feature_list"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 13,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "def split_train_val(metrics):\n",
292 | " keys = list(metrics.keys())\n",
293 | " num_elements = len(metrics[keys[0]])\n",
294 | " print (f\"Using {num_elements} elements\")\n",
295 | " # select a random subset of val_metrics (50% of ids)\n",
296 | " ids_train = np.random.choice(num_elements, num_elements//2, replace=False)\n",
297 | " ids_val = np.array([i for i in range(num_elements) if i not in ids_train])\n",
298 | " new_metrics_train = {}\n",
299 | " new_metrics_val = {}\n",
300 | " for key in keys:\n",
301 | " new_metrics_train[key] = np.array(metrics[key])[ids_train]\n",
302 | " new_metrics_val[key] = np.array(metrics[key])[ids_val]\n",
303 | " return new_metrics_train, new_metrics_val\n",
304 | "\n",
305 | "def remove_outliers(metrics, remove_frac=0.05, outliers = \"zero\"):\n",
306 | " # Sort the array to work with ordered data\n",
307 | " sorted_ids = np.argsort(metrics)\n",
308 | " \n",
309 | " # Calculate the number of elements to remove from each side\n",
310 | " total_elements = len(metrics)\n",
311 | " elements_to_remove_each_side = int(total_elements * remove_frac / 2) \n",
312 | " \n",
313 | " # Ensure we're not attempting to remove more elements than are present\n",
314 | " if elements_to_remove_each_side * 2 > total_elements:\n",
315 | " raise ValueError(\"remove_frac is too large, resulting in no elements left.\")\n",
316 | " \n",
317 | " # Change the removed metrics to 0.\n",
318 | " lowest_ids = sorted_ids[:elements_to_remove_each_side]\n",
319 | " highest_ids = sorted_ids[-elements_to_remove_each_side:]\n",
320 | " all_ids = np.concatenate((lowest_ids, highest_ids))\n",
321 | "\n",
322 | " # import pdb; pdb.set_trace()\n",
323 | " \n",
324 | " trimmed_metrics = np.copy(metrics)\n",
325 | " \n",
326 | " if outliers == \"zero\":\n",
327 | " trimmed_metrics[all_ids] = 0\n",
328 | " elif outliers == \"mean\" or outliers == \"mean+p-value\":\n",
329 | " trimmed_metrics[all_ids] = np.mean(trimmed_metrics)\n",
330 | " elif outliers == \"clip\":\n",
331 | " highest_val_permissible = trimmed_metrics[highest_ids[0]]\n",
332 | " lowest_val_permissible = trimmed_metrics[lowest_ids[-1]]\n",
333 | " trimmed_metrics[highest_ids] = highest_val_permissible\n",
334 | " trimmed_metrics[lowest_ids] = lowest_val_permissible\n",
335 | " elif outliers == \"randomize\":\n",
336 | " #this will randomize the order of metrics\n",
337 | " trimmed_metrics = np.delete(trimmed_metrics, all_ids)\n",
338 | " else:\n",
339 | " assert outliers in [\"keep\", \"p-value\"]\n",
340 | " pass\n",
341 | " \n",
342 | " \n",
343 | " return trimmed_metrics\n",
344 | "\n",
345 | "def normalize_and_stack(train_metrics, val_metrics, normalize=\"train\"):\n",
346 | " '''\n",
347 | " excpects an input list of list of metrics\n",
348 | " normalize val with corre\n",
349 | " '''\n",
350 | " new_train_metrics = []\n",
351 | " new_val_metrics = []\n",
352 | " for (tm, vm) in zip(train_metrics, val_metrics):\n",
353 | " if normalize == \"combined\":\n",
354 | " combined_m = np.concatenate((tm, vm))\n",
355 | " mean_tm = np.mean(combined_m)\n",
356 | " std_tm = np.std(combined_m)\n",
357 | " else:\n",
358 | " mean_tm = np.mean(tm)\n",
359 | " std_tm = np.std(tm)\n",
360 | " \n",
361 | " if normalize == \"no\":\n",
362 | " normalized_vm = vm\n",
363 | " normalized_tm = tm\n",
364 | " else:\n",
365 | " #normalization should be done with respect to the train set statistics\n",
366 | " normalized_vm = (vm - mean_tm) / std_tm\n",
367 | " normalized_tm = (tm - mean_tm) / std_tm\n",
368 | " \n",
369 | " new_train_metrics.append(normalized_tm)\n",
370 | " new_val_metrics.append(normalized_vm)\n",
371 | "\n",
372 | " train_metrics = np.stack(new_train_metrics, axis=1)\n",
373 | " val_metrics = np.stack(new_val_metrics, axis=1)\n",
374 | " return train_metrics, val_metrics"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "## [Step 2.1 Remove Outliers](#toc0_)\n",
382 | "\n",
383 | "Across each MIA feature value, we first modify the top 5% outliers by changing their values to the mean of the distribution. This step is crucial to prevent issues in Step 3, where the model might learn skewed correlations due to a few outlier samples. "
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 14,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "def prepare_metrics(members_metrics, nonmembers_metrics, outliers=\"clip\", return_tensors=False):\n",
393 | " keys = list(members_metrics.keys())\n",
394 | " np_members_metrics = []\n",
395 | " np_nonmembers_metrics = []\n",
396 | " for key in keys:\n",
397 | " members_metric_key = np.array(members_metrics[key])\n",
398 | " nonmembers_metric_key = np.array(nonmembers_metrics[key])\n",
399 | " \n",
400 | " if outliers is not None:\n",
401 | " # remove the top 2.5% and bottom 2.5% of the data\n",
402 | " members_metric_key = remove_outliers(members_metric_key, remove_frac = 0.05, outliers = outliers)\n",
403 | " nonmembers_metric_key = remove_outliers(nonmembers_metric_key, remove_frac = 0.05, outliers = outliers)\n",
404 | "\n",
405 | " np_members_metrics.append(members_metric_key)\n",
406 | " np_nonmembers_metrics.append(nonmembers_metric_key)\n",
407 | "\n",
408 | " # concatenate the train and val metrics by stacking them\n",
409 | " np_members_metrics, np_nonmembers_metrics = normalize_and_stack(np_members_metrics, np_nonmembers_metrics)\n",
410 | " if return_tensors:\n",
411 | " np_members_metrics = torch.tensor(np_members_metrics, dtype=torch.float32)\n",
412 | " np_nonmembers_metrics = torch.tensor(np_nonmembers_metrics, dtype=torch.float32)\n",
413 | "\n",
414 | " return np_members_metrics, np_nonmembers_metrics"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 15,
420 | "metadata": {},
421 | "outputs": [
422 | {
423 | "name": "stdout",
424 | "output_type": "stream",
425 | "text": [
426 | "(1000, 16)\n",
427 | "(999, 16)\n"
428 | ]
429 | }
430 | ],
431 | "source": [
432 | "train_metrics, val_metrics = prepare_metrics(A_members_metrics, A_nonmembers_metrics, outliers=\"clip\")\n",
433 | "\n",
434 | "print(train_metrics.shape)\n",
435 | "print(val_metrics.shape)"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 16,
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "name": "stderr",
445 | "output_type": "stream",
446 | "text": [
447 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
448 | "To disable this warning, you can either:\n",
449 | "\t- Avoid using `tokenizers` before the fork if possible\n",
450 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "# aux functions about MIA classifier\n",
456 | "\n",
457 | "def get_dataset_splits(_train_metrics, _val_metrics, num_samples):\n",
458 | " # get the train and val sets\n",
459 | " for_train_train_metrics = _train_metrics[:num_samples]\n",
460 | " for_train_val_metrics = _val_metrics[:num_samples]\n",
461 | " for_val_train_metrics = _train_metrics[num_samples:]\n",
462 | " for_val_val_metrics = _val_metrics[num_samples:]\n",
463 | "\n",
464 | "\n",
465 | " # create the train and val sets\n",
466 | " train_x = np.concatenate((for_train_train_metrics, for_train_val_metrics), axis=0)\n",
467 | " train_y = np.concatenate((-1*np.zeros(for_train_train_metrics.shape[0]), np.ones(for_train_val_metrics.shape[0])))\n",
468 | " val_x = np.concatenate((for_val_train_metrics, for_val_val_metrics), axis=0)\n",
469 | " val_y = np.concatenate((-1*np.zeros(for_val_train_metrics.shape[0]), np.ones(for_val_val_metrics.shape[0])))\n",
470 | " \n",
471 | " # return tensors\n",
472 | " train_x = torch.tensor(train_x, dtype=torch.float32)\n",
473 | " train_y = torch.tensor(train_y, dtype=torch.float32)\n",
474 | " val_x = torch.tensor(val_x, dtype=torch.float32)\n",
475 | " val_y = torch.tensor(val_y, dtype=torch.float32)\n",
476 | " \n",
477 | " return (train_x, train_y), (val_x, val_y)\n",
478 | "\n",
479 | "def train_model(inputs, y, num_epochs=10000):\n",
480 | " num_features = inputs.shape[1]\n",
481 | " model = get_model(num_features)\n",
482 | " \n",
483 | " criterion = nn.BCEWithLogitsLoss() # Binary Cross Entropy Loss for binary classification\n",
484 | " optimizer = torch.optim.Adam(model.parameters(), lr=0.01)\n",
485 | " \n",
486 | " # Convert y to float tensor for BCEWithLogitsLoss\n",
487 | " y_float = y.float()\n",
488 | "\n",
489 | " with tqdm(range(num_epochs)) as pbar:\n",
490 | " for epoch in pbar:\n",
491 | " optimizer.zero_grad()\n",
492 | " outputs = model(inputs).squeeze() # Squeeze the output to remove singleton dimension\n",
493 | " loss = criterion(outputs, y_float)\n",
494 | " loss.backward()\n",
495 | " optimizer.step()\n",
496 | " pbar.set_description('loss {}'.format(loss.item()))\n",
497 | " return model\n",
498 | "\n",
499 | "def get_model(num_features, linear = True):\n",
500 | " if linear:\n",
501 | " model = nn.Linear(num_features, 1)\n",
502 | " else:\n",
503 | " model = nn.Sequential(\n",
504 | " nn.Linear(num_features, 10),\n",
505 | " nn.ReLU(),\n",
506 | " nn.Linear(10, 1) # Single output neuron\n",
507 | " )\n",
508 | " return model\n",
509 | "\n",
510 | "def get_predictions(model, val, y):\n",
511 | " with torch.no_grad():\n",
512 | " preds = model(val).detach().squeeze()\n",
513 | " criterion = nn.BCEWithLogitsLoss()\n",
514 | " loss = criterion(preds, y.float())\n",
515 | " return preds.numpy(), loss.item()\n",
516 | "\n",
517 | "from sklearn.metrics import roc_curve, auc\n",
518 | "import matplotlib.pyplot as plt\n",
519 | "\n",
520 | "def plot_roc_curve(model, val, y):\n",
521 | " # get auc and plot roc curve\n",
522 | " from sklearn.metrics import roc_auc_score\n",
523 | " preds, _ = get_predictions(model, val, y)\n",
524 | " auc_score = roc_auc_score(y, preds)\n",
525 | " \n",
526 | " # Compute ROC curve\n",
527 | " fpr, tpr, thresholds = roc_curve(y, preds)\n",
528 | " \n",
529 | " # Compute AUC\n",
530 | " roc_auc = auc(fpr, tpr)\n",
531 | " \n",
532 | " # Plot ROC curve\n",
533 | " plt.figure()\n",
534 | " plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)\n",
535 | " plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n",
536 | " plt.xlim([0.0, 1.0])\n",
537 | " plt.ylim([0.0, 1.05])\n",
538 | " plt.xlabel('False Positive Rate')\n",
539 | " plt.ylabel('True Positive Rate')\n",
540 | " plt.title('Receiver Operating Characteristic')\n",
541 | " plt.legend(loc=\"lower right\")\n",
542 | " plt.show()\n",
543 | " \n",
544 | " return auc_score"
545 | ]
546 | },
547 | {
548 | "cell_type": "markdown",
549 | "metadata": {},
550 | "source": [
551 | "## [Step 2.2: Learn the weights of each feature](#toc0_)\n",
552 | "\n",
553 | "We then pass the data through a linear regression model to learn weights for each feature.\n",
554 | "\n",
555 | "\n",
556 | "⚠️ **Members are classified as 0, while non-members as 1.**"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 17,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "# aux functions about p-values\n",
566 | "list_number_samples = [2, 5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]\n",
567 | "\n",
568 | "def get_p_value_list(heldout_train, heldout_val, list_number_samples):\n",
569 | " # list_number_samples is used to see how the p-values changes across different number of samples\n",
570 | " p_value_list = []\n",
571 | " for num_samples in list_number_samples:\n",
572 | " heldout_train_curr = heldout_train[:num_samples]\n",
573 | " heldout_val_curr = heldout_val[:num_samples]\n",
574 | " t, p_value = ttest_ind(heldout_train_curr, heldout_val_curr, alternative='less')\n",
575 | " p_value_list.append(p_value)\n",
576 | " return p_value_list\n",
577 | " \n",
578 | " \n",
579 | "\n",
580 | "def split_train_val(metrics):\n",
581 | " keys = list(metrics.keys())\n",
582 | " num_elements = len(metrics[keys[0]])\n",
583 | " print (f\"Using {num_elements} elements\")\n",
584 | " # select a random subset of val_metrics (50% of ids)\n",
585 | " ids_train = np.random.choice(num_elements, num_elements//2, replace=False)\n",
586 | " ids_val = np.array([i for i in range(num_elements) if i not in ids_train])\n",
587 | " new_metrics_train = {}\n",
588 | " new_metrics_val = {}\n",
589 | " for key in keys:\n",
590 | " new_metrics_train[key] = np.array(metrics[key])[ids_train]\n",
591 | " new_metrics_val[key] = np.array(metrics[key])[ids_val]\n",
592 | " return new_metrics_train, new_metrics_val"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 18,
598 | "metadata": {},
599 | "outputs": [
600 | {
601 | "name": "stderr",
602 | "output_type": "stream",
603 | "text": [
604 | "loss 0.6681151986122131: 100%|██████████| 1000/1000 [00:01<00:00, 659.06it/s]\n"
605 | ]
606 | }
607 | ],
608 | "source": [
609 | "num_samples = 250 # How many samples to use for training and validation?\n",
610 | "\n",
611 | "np.random.shuffle(train_metrics)\n",
612 | "np.random.shuffle(val_metrics)\n",
613 | "\n",
614 | "# train a model by creating a train set and a held out set\n",
615 | "(train_x, train_y), (val_x, val_y) = get_dataset_splits(train_metrics, val_metrics, num_samples)\n",
616 | "\n",
617 | "model = train_model(train_x, train_y, num_epochs = 1000)\n",
618 | "\n",
619 | "# using the model weights, get importance of each feature, and save to csv\n",
620 | "weights = model.weight.data.squeeze().tolist() \n",
621 | "features = list(A_members_metrics.keys())\n",
622 | "feature_importance = {feature: weight for feature, weight in zip(features, weights)}\n",
623 | "df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance'])"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 19,
629 | "metadata": {},
630 | "outputs": [
631 | {
632 | "data": {
741 | "text/plain": [
742 | " Feature Importance\n",
743 | "0 ppl 0.187106\n",
744 | "1 k_min_probs_0.05 0.051939\n",
745 | "2 k_min_probs_0.1 -0.513615\n",
746 | "3 k_min_probs_0.2 1.180070\n",
747 | "4 k_min_probs_0.3 -1.855523\n",
748 | "5 k_min_probs_0.4 1.458794\n",
749 | "6 k_min_probs_0.5 -0.619759\n",
750 | "7 k_min_probs_0.6 0.117170\n",
751 | "8 k_max_probs_0.05 -0.679361\n",
752 | "9 k_max_probs_0.1 1.334020\n",
753 | "10 k_max_probs_0.2 0.610850\n",
754 | "11 k_max_probs_0.3 -2.398481\n",
755 | "12 k_max_probs_0.4 1.521971\n",
756 | "13 k_max_probs_0.5 2.084856\n",
757 | "14 k_max_probs_0.6 -2.399661\n",
758 | "15 zlib_ratio 0.112112"
759 | ]
760 | },
761 | "execution_count": 19,
762 | "metadata": {},
763 | "output_type": "execute_result"
764 | }
765 | ],
766 | "source": [
767 | "df"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": 20,
773 | "metadata": {},
774 | "outputs": [
775 | {
776 | "data": {
777 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkIAAAHHCAYAAABTMjf2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAACKxElEQVR4nOzdd3xN9//A8VcSGRJZRMyQ2HvF3qsotWsUsRU1anQYNTrQVtVo1Swxa6tRq9RuvrT2jBKpGcTIkiH3fn5/5OfqdW8i4Y6M9/PxyIPzPp9zzjv3ivvO53zO52OjlFIIIYQQQmRBttZOQAghhBDCWqQQEkIIIUSWJYWQEEIIIbIsKYSEEEIIkWVJISSEEEKILEsKISGEEEJkWVIICSGEECLLkkJICCGEEFmWFEJCCCGEyLKkEBLCRHx9fendu7e108hyGjZsSMOGDa2dxitNnjwZGxsbwsPDrZ1KumNjY8PkyZNNcq7Q0FBsbGwIDAw0yflE5ieFkMgQAgMDsbGx0X1ly5aNAgUK0Lt3b27fvm3t9NK1mJgYvvzySypUqICzszPu7u7Uq1eP5cuXk1FW2Ll48SKTJ08mNDTU2qkY0Gg0LF26lIYNG5IzZ04cHR3x9fWlT58+/P3339ZOzyRWr17NrFmzrJ2GnvSYk8iYslk7ASHS4osvvsDPz4+4uDj+97//ERgYyJEjRzh//jxOTk5WzS04OBhb2/T1u8W9e/do0qQJly5domvXrgwdOpS4uDg2btxIr1692LFjB6tWrcLOzs7aqabo4sWLfP755zRs2BBfX1+9fXv27LFOUkBsbCwdOnRg165d1K9fn3HjxpEzZ05CQ0NZt24dy5Yt48aNGxQsWNBqOZrC6tWrOX/+PCNGjDDL+WNjY8mWLW0fR8nlVLhwYWJjY7G3tzdhhiIzk0JIZChvv/02VatWBaB///54eXnxzTffsHXrVjp37mzV3BwdHS1+zbi4OBwcHJItwHr16sWlS5fYvHkzbdq00cWHDx/Oxx9/zHfffUflypX59NNPLZUykNRL5eLiYpJzOTg4mOQ8r+Pjjz9m165dzJw50+ADedKkScycOdOi+SiliIuLI3v27Ba97uvQarUkJCTg5ORk0l9ibGxsrP5LkchglBAZwNKlSxWg/vrrL7349u3bFaCmTp2qF7906ZLq2LGj8vT0VI6Ojsrf319t2bLF4LyPHz9WI0aMUIULF1YODg6qQIECKiAgQD148EDXJi4uTk2cOFEVLVpUOTg4qIIFC6qPP/5YxcXF6Z2rcOHCqlevXkoppf766y8FqMDAQINr7tq1SwFq27ZtutitW7dUnz59lLe3t3JwcFBlypRRP//8s95x+/fvV4D65Zdf1Pjx41X+/PmVjY2Nevz4sdHXLCgoSAGqb9++Rvc/e/ZMFS9eXHl6eqqnT58qpZS6fv26AtT06dPV999/rwoVKqScnJxU/fr11blz5wzOkZrX+fl7d+DAATV48GCVO3du5eHhoZRSKjQ0VA0ePFiVKFFCOTk5qZw5c6p3331XXb9+3eD4l7/279+vlFKqQYMGqkGDBgav09q1a9VXX32lChQooBwdHVXjxo3VP//8Y/A9/Pjjj8rPz085OTmpatWqqUOHDhmc05ibN2+qbNmyqbfeeivFds9NmjRJAeqff/5RvXr1Uu7u7srNzU317t1bxcTE6LVdsmSJatSokcqdO7dycHBQpUuXVj/99JPBOQsXLqxatWqldu3apfz9/ZWjo6OaOXNmms6hlFI7duxQ9evXVzly5FCurq6qatWqatWqVUqppNf35de+cOHCumNT+/MBqCFDhqiVK1eqMmXKqGzZsqnNmzfr9k2aNEnXNjIyUn344Ye6n8vcuXOrpk2bqhMnTrwyp+f/hpcuXap3/UuXLqlOnTopLy8v5eTkpEqUKKHGjRuX0lsmsgjpERIZ2vMxI56enrrYhQsXqFOnDgUKFGDMmDG4uLiwbt062rVrx8aNG2nfvj0A0dHR1KtXj0uXLtG3b1+qVKlCeHg4W7du5datW3h5eaHVamnTpg1Hjhzh/fffp3Tp0pw7d46ZM2dy5coVfv31V6N5Va1alSJFirBu3Tp69eqlt2/t2rV4enrSvHlzIOn2Vc2aNbGxsWHo0KHkzp2bnTt30q9fPyIjIw16Gr788kscHBz46KOPiI+PT7ZHZNu2bQD07NnT6P5s2bLRrVs3Pv/8c44ePUrTpk11+5YvX05UVBRDhgwhLi6O2bNn07hxY86dO0eePHnS9Do/98EHH5A7d24mTpxITEwMAH/99Rd//vknXbt2pWDBgoSGhjJv3jwaNmzIxYsXcXZ2pn79+gwfPpw5c+Ywbtw4SpcuDaD7Mzlff/01tra2fPTRR0RERPDtt9/SvXt3jh07pmszb948hg4dSr169Rg5ciShoaG0a9cOT0/PV97O2rlzJ4mJiQQEBKTY7mWdO3fGz8+PadOmcfLkSRYvXoy3tzfffPONXl5ly5alTZs2ZMuWjW3btvHBBx+g1WoZMmSI3vmCg4N57733GDhwIAMGDKBkyZJpOkdgYCB9+/albNmyjB07Fg8PD06dOsWuXbvo1q0b48ePJyIiglu3bul6uHLkyAGQ5p+PP/74g3Xr1jF06FC8vLwMbnM+N2jQIDZs2MDQoUMpU6YMDx8+5MiRI1y6dIkqVaqkmJMxZ8+epV69etjb2/P+++/j6+vLtWvX2LZtG1OmTEndGycyL2tXYkKkxvNegb1796oHDx6omzdvqg0bNqjcuXMrR0dHdfPmTV3bJk2aqPLly+v9RqrValXt2rVV8eLFdbGJEycqQG3atMngelqtViml1IoVK5Stra06fPiw3v758+crQB09elQX+2+PkFJKjR07Vtnb26tHjx7pYvHx8crDw0Ovl6Zfv34qX758Kjw8XO8aXbt2Ve7u7rremuc9HUWKFNHFUtKuXTsFJNtjpJRSmzZtUoCaM2eOUurFb9PZs2dXt27d0rU7duyYAtTIkSN1sdS+zs/fu7p166rExES96xv7Pp73ZC1fvlwXW79+vV4v0H8l1yNUunRpFR8fr4vPnj1bAbqerfj4eJUrVy5VrVo19ezZM127wMBABbyyR2jkyJEKUKdOnUqx3XPPe4Re7qFr3769ypUrl17M2OvSvHlzVaRIEb1Y4cKFFaB27dpl0D4153jy5IlydXVVNWrUULGxsXptn/8MKKVUq1at9HqBnkvLzwegbG1t1YULFwzOw0s9Qu7u7mrIkCEG7f4ruZyM9QjVr19fubq6qn///TfZ71FkXelrZKcQr9C0aVNy586Nj48P7777Li4uLmzdulX32/ujR4/4448/6Ny5M1FRUYSHhxMeHs7Dhw9p3rw5//zzj+4ps40bN1KxYkWDngtIGmcAsH79ekqXLk2pUqV05woPD6dx48YA7N+/P9lcu3TpwrNnz9i0aZMutmfPHp48eUKXLl2ApDEdGzdupHXr1iil9K7RvHlzIiIiOHnypN55e/XqlaoxIFFRUQC4urom2+b
...(base64-encoded PNG data for the ROC curve produced by plot_roc_curve truncated)...",
778 | "text/plain": [
779 | ""
780 | ]
781 | },
782 | "metadata": {},
783 | "output_type": "display_data"
784 | }
785 | ],
786 | "source": [
787 | "auc = plot_roc_curve(model, val_x, val_y)"
788 | ]
789 | },
790 | {
791 | "cell_type": "markdown",
792 | "metadata": {},
793 | "source": [
794 | "# [Step 3: Dataset Inference](#toc0_)\n",
795 | "\n",
796 | "⚠️ **Remember: Members are classified as 0, while non-members as 1.**"
797 | ]
798 | },
799 | {
800 | "cell_type": "code",
801 | "execution_count": 21,
802 | "metadata": {},
803 | "outputs": [
804 | {
805 | "name": "stderr",
806 | "output_type": "stream",
807 | "text": [
808 | "100%|██████████| 500/500 [09:41<00:00, 1.16s/it]\n",
809 | "100%|██████████| 500/500 [09:40<00:00, 1.16s/it]\n"
810 | ]
811 | }
812 | ],
813 | "source": [
814 | "B_members_metrics = aggregate_metrics(llm, tokenizer, B_members, metric_list, None, batch_size=batch_size)\n",
815 | "B_nonmembers_metrics = aggregate_metrics(llm, tokenizer, B_nonmembers, metric_list, None, batch_size=batch_size)"
816 | ]
817 | },
818 | {
819 | "cell_type": "code",
820 | "execution_count": 22,
821 | "metadata": {},
822 | "outputs": [
823 | {
824 | "name": "stdout",
825 | "output_type": "stream",
826 | "text": [
827 | "The null hypothesis is that the B_members scores are larger or equal than B_nonmembers.\n",
828 | "The alternative hypothesis is that B_members (0) are lower than B_nonmembers (1) . The p-value is 0.00331255002904568\n"
829 | ]
830 | }
831 | ],
832 | "source": [
833 | "B_members_metrics_tensor, B_nonmembers_metrics_ternsor = prepare_metrics(B_members_metrics, B_nonmembers_metrics, outliers=None, return_tensors=True)\n",
834 | "B_members_preds, _ = get_predictions(model, B_members_metrics_tensor, torch.tensor([0]*B_members_metrics_tensor.shape[0]))\n",
835 | "B_nonmembers_preds, _ = get_predictions(model, B_nonmembers_metrics_ternsor, torch.tensor([1]*B_nonmembers_metrics_ternsor.shape[0]))\n",
836 | "\n",
837 | "p_value_list = get_p_value_list(B_members_preds, B_nonmembers_preds, list_number_samples=[1000])\n",
838 | "\n",
839 | "print(f\"The null hypothesis is that the B_members scores are larger or equal than B_nonmembers.\\nThe alternative hypothesis is that B_members (0) are lower than B_nonmembers (1) . The p-value is {p_value_list[-1]}\")"
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": null,
845 | "metadata": {},
846 | "outputs": [],
847 | "source": []
848 | }
849 | ],
850 | "metadata": {
851 | "kernelspec": {
852 | "display_name": "Python 3",
853 | "language": "python",
854 | "name": "python3"
855 | },
856 | "language_info": {
857 | "codemirror_mode": {
858 | "name": "ipython",
859 | "version": 3
860 | },
861 | "file_extension": ".py",
862 | "mimetype": "text/x-python",
863 | "name": "python",
864 | "nbconvert_exporter": "python",
865 | "pygments_lexer": "ipython3",
866 | "version": "3.8.10"
867 | }
868 | },
869 | "nbformat": 4,
870 | "nbformat_minor": 2
871 | }
872 |
--------------------------------------------------------------------------------
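
The Step 3 markdown cell above fixes the convention that members are labeled 0 and non-members 1, so the one-sided test asks whether member scores are systematically lower. Below is a minimal sketch of that test using the same scipy.stats.ttest_ind call that get_p_value_list makes in linear_di.py; the synthetic scores, their means, and the variable names are illustrative assumptions only.

import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
# Hypothetical predicted scores: members (label 0) trend lower than non-members (label 1).
member_preds = rng.normal(loc=-0.2, scale=1.0, size=1000)
nonmember_preds = rng.normal(loc=0.2, scale=1.0, size=1000)

# Alternative hypothesis: mean(member_preds) < mean(nonmember_preds).
t_stat, p_value = ttest_ind(member_preds, nonmember_preds, alternative="less")
print(f"one-sided p-value: {p_value:.3g}")
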
/di.py:
--------------------------------------------------------------------------------
1 | from utils import prepare_model
2 | from metrics import aggregate_metrics, reference_model_registry
3 | import json, os
4 | import argparse
5 | from datasets import load_dataset
6 |
7 | def get_args():
8 | parser = argparse.ArgumentParser(description='Dataset Inference on a language model')
9 | parser.add_argument('--model_name', type=str, default="EleutherAI/pythia-410m-deduped", help='The name of the model to use')
10 | parser.add_argument('--dataset_name', type=str, default="wikipedia", help='The name of the dataset to use')
11 | parser.add_argument('--split', type=str, default="train", help='The split of the dataset to use')
12 | parser.add_argument('--num_samples', type=int, default=1000, help='The number of samples to use')
13 | parser.add_argument('--batch_size', type=int, default=32, help='The batch size to use')
14 | parser.add_argument('--from_hf', type=int, default=1, help='If set, load the prepared JSONL from data/ with the datasets library; otherwise load directly from the Pile via dataloader.load_data')
15 | parser.add_argument('--cache_dir', type=str, default="/data/locus/llm_weights", help='The directory to cache the model')
16 | args = parser.parse_args()
17 | return args
18 |
19 |
20 |
21 | def main():
22 | args = get_args()
23 | results_file = f"results/{args.model_name}/{args.dataset_name}_{args.split}_metrics.json"
24 | # if os.path.exists(results_file):
25 | # print(f"Results file {results_file} already exists. Aborting...")
26 | # return
27 | model_name = args.model_name
28 |
29 | if model_name in ["microsoft/phi-1_5", "EleutherAI/pythia-12b", "EleutherAI/pythia-6.9b", "EleutherAI/pythia-410m"]:
30 | args.cache_dir = "/data/locus/llm_weights/pratyush"
31 |
32 | model, tokenizer = prepare_model(model_name, cache_dir= args.cache_dir)
33 |
34 | # load the data
35 | dataset_name = args.dataset_name
36 | split = args.split
37 |
38 | if not args.from_hf:
39 | from dataloader import load_data
40 | # if you want to load data directly from the PILE, use the following line
41 | num_samples = args.num_samples
42 | dataset = load_data(dataset_name, split, num_samples)
43 | else:
44 | dataset_path = f"data/{dataset_name}_{split}.jsonl"
45 | dataset = load_dataset("json", data_files=dataset_path, split="train")
46 | print("Data loaded")
47 |
48 | # get the metrics
49 | if model_name in reference_model_registry.values():
50 | metric_list = ["ppl"]
51 | else:
52 | metric_list = ["k_min_probs", "ppl", "zlib_ratio", "k_max_probs", "perturbation", "reference_model"]
53 | metrics = aggregate_metrics(model, tokenizer, dataset, metric_list, args, batch_size = args.batch_size)
54 |
55 | # save the metrics
56 | os.makedirs(f"results/{model_name}", exist_ok=True)
57 | with open(results_file, 'w') as f:
58 | json.dump(metrics, f)
59 |
60 | if __name__ == "__main__":
61 | main()
62 |
63 |
--------------------------------------------------------------------------------
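
For reference, a typical invocation of di.py built from the flags defined in get_args above might look like the following; the model, dataset, and split shown are simply the script defaults, not a required configuration:

python di.py --model_name EleutherAI/pythia-410m-deduped --dataset_name wikipedia --split train --batch_size 32 --from_hf 1

With these values the per-example metrics are written to results/EleutherAI/pythia-410m-deduped/wikipedia_train_metrics.json.
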
/files/llm-dataset-inference-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pratyushmaini/llm_dataset_inference/6f25d41f133c94b1272341f11224ae6f628c7b5e/files/llm-dataset-inference-overview.png
--------------------------------------------------------------------------------
/linear_di.py:
--------------------------------------------------------------------------------
1 | """
2 | Loads various features for the train and val sets.
3 | Trains a linear model on the train set and evaluates it on the val set.
4 |
5 | Tests p value of differentiating train versus val on held out features.
6 | """
7 |
8 | import os
9 | import sys
10 | import json
11 | import numpy as np
12 | import pandas as pd
13 | from scipy.stats import ttest_ind, chi2, norm
14 | import torch
15 | import torch.nn as nn
16 | import argparse
17 | from tqdm import tqdm
18 | from selected_features import feature_list
19 |
20 | p_sample_list = [2, 5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser(description='Dataset Inference on a language model')
24 | parser.add_argument('--model_name', type=str, default="EleutherAI/pythia-12b", help='The name of the model to use')
25 | parser.add_argument('--dataset_name', type=str, default="wikipedia", help='The name of the dataset to use')
26 | parser.add_argument('--num_samples', type=int, default=1000, help='The number of samples to use')
27 | parser.add_argument("--normalize", type=str, default="train", help="Should you normalize?", choices=["no", "train", "combined"])
28 | parser.add_argument("--outliers", type=str, default="clip", help="The ablation to use", choices=["randomize", "keep", "zero", "mean", "clip", "mean+p-value", "p-value"])
29 | parser.add_argument("--features", type=str, default="all", help="The features to use", choices=["all", "selected"])
30 | parser.add_argument("--false_positive", type=int, default=0, help="What if you gave two val splits?", choices=[0, 1])
31 | parser.add_argument("--num_random", type=int, default=1, help="How many random runs to do?")
32 | args = parser.parse_args()
33 | return args
34 |
35 |
36 | def get_model(num_features, linear = True):
37 | if linear:
38 | model = nn.Linear(num_features, 1)
39 | else:
40 | model = nn.Sequential(
41 | nn.Linear(num_features, 10),
42 | nn.ReLU(),
43 | nn.Linear(10, 1) # Single output neuron
44 | )
45 | return model
46 |
47 |
48 | def train_model(inputs, y, num_epochs=10000):
49 | num_features = inputs.shape[1]
50 | model = get_model(num_features)
51 |
52 | criterion = nn.BCEWithLogitsLoss() # Binary Cross Entropy Loss for binary classification
53 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
54 |
55 | # Convert y to float tensor for BCEWithLogitsLoss
56 | y_float = y.float()
57 |
58 | with tqdm(range(num_epochs)) as pbar:
59 | for epoch in pbar:
60 | optimizer.zero_grad()
61 | outputs = model(inputs).squeeze() # Squeeze the output to remove singleton dimension
62 | loss = criterion(outputs, y_float)
63 | loss.backward()
64 | optimizer.step()
65 | pbar.set_description('loss {}'.format(loss.item()))
66 | return model
67 |
68 | def get_predictions(model, val, y):
69 | with torch.no_grad():
70 | preds = model(val).detach().squeeze()
71 | criterion = nn.BCEWithLogitsLoss()
72 | loss = criterion(preds, y.float())
73 | return preds.numpy(), loss.item()
74 |
75 | def get_dataset_splits(_train_metrics, _val_metrics, num_samples):
76 | # get the train and val sets
77 | for_train_train_metrics = _train_metrics[:num_samples]
78 | for_train_val_metrics = _val_metrics[:num_samples]
79 | for_val_train_metrics = _train_metrics[num_samples:]
80 | for_val_val_metrics = _val_metrics[num_samples:]
81 |
82 |
83 | # create the train and val sets
84 | train_x = np.concatenate((for_train_train_metrics, for_train_val_metrics), axis=0)
85 | train_y = np.concatenate((-1*np.zeros(for_train_train_metrics.shape[0]), np.ones(for_train_val_metrics.shape[0])))
86 | val_x = np.concatenate((for_val_train_metrics, for_val_val_metrics), axis=0)
87 | val_y = np.concatenate((-1*np.zeros(for_val_train_metrics.shape[0]), np.ones(for_val_val_metrics.shape[0])))
88 |
89 | # return tensors
90 | train_x = torch.tensor(train_x, dtype=torch.float32)
91 | train_y = torch.tensor(train_y, dtype=torch.float32)
92 | val_x = torch.tensor(val_x, dtype=torch.float32)
93 | val_y = torch.tensor(val_y, dtype=torch.float32)
94 |
95 | return (train_x, train_y), (val_x, val_y)
96 |
97 | def normalize_and_stack(train_metrics, val_metrics, normalize="train"):
98 | '''
99 | Expects lists of per-feature metric arrays for the train and val sets.
100 | Normalizes the val metrics with the corresponding train-set (or combined) statistics and stacks each side into an array of shape (num_samples, num_features).
101 | '''
102 | new_train_metrics = []
103 | new_val_metrics = []
104 | for (tm, vm) in zip(train_metrics, val_metrics):
105 | if normalize == "combined":
106 | combined_m = np.concatenate((tm, vm))
107 | mean_tm = np.mean(combined_m)
108 | std_tm = np.std(combined_m)
109 | else:
110 | mean_tm = np.mean(tm)
111 | std_tm = np.std(tm)
112 |
113 | if normalize == "no":
114 | normalized_vm = vm
115 | normalized_tm = tm
116 | else:
117 | #normalization should be done with respect to the train set statistics
118 | normalized_vm = (vm - mean_tm) / std_tm
119 | normalized_tm = (tm - mean_tm) / std_tm
120 |
121 | new_train_metrics.append(normalized_tm)
122 | new_val_metrics.append(normalized_vm)
123 |
124 | train_metrics = np.stack(new_train_metrics, axis=1)
125 | val_metrics = np.stack(new_val_metrics, axis=1)
126 | return train_metrics, val_metrics
127 |
128 | def remove_outliers(metrics, remove_frac=0.05, outliers = "zero"):
129 | # Sort the array to work with ordered data
130 | sorted_ids = np.argsort(metrics)
131 |
132 | # Calculate the number of elements to remove from each side
133 | total_elements = len(metrics)
134 | elements_to_remove_each_side = int(total_elements * remove_frac / 2)
135 |
136 | # Ensure we're not attempting to remove more elements than are present
137 | if elements_to_remove_each_side * 2 > total_elements:
138 | raise ValueError("remove_frac is too large, resulting in no elements left.")
139 |
140 | # Change the removed metrics to 0.
141 | lowest_ids = sorted_ids[:elements_to_remove_each_side]
142 | highest_ids = sorted_ids[-elements_to_remove_each_side:]
143 | all_ids = np.concatenate((lowest_ids, highest_ids))
144 |
145 | # import pdb; pdb.set_trace()
146 |
147 | trimmed_metrics = np.copy(metrics)
148 |
149 | if outliers == "zero":
150 | trimmed_metrics[all_ids] = 0
151 | elif outliers == "mean" or outliers == "mean+p-value":
152 | trimmed_metrics[all_ids] = np.mean(trimmed_metrics)
153 | elif outliers == "clip":
154 | highest_val_permissible = trimmed_metrics[highest_ids[0]]
155 | lowest_val_permissible = trimmed_metrics[lowest_ids[-1]]
156 | trimmed_metrics[highest_ids] = highest_val_permissible
157 | trimmed_metrics[lowest_ids] = lowest_val_permissible
158 | elif outliers == "randomize":
159 | #this will randomize the order of metrics
160 | trimmed_metrics = np.delete(trimmed_metrics, all_ids)
161 | else:
162 | assert outliers in ["keep", "p-value"]
163 | pass
164 |
165 |
166 | return trimmed_metrics
167 |
168 |
169 | def get_p_value_list(heldout_train, heldout_val):
170 | p_value_list = []
171 | for num_samples in p_sample_list:
172 | heldout_train_curr = heldout_train[:num_samples]
173 | heldout_val_curr = heldout_val[:num_samples]
174 | t, p_value = ttest_ind(heldout_train_curr, heldout_val_curr, alternative='less')
175 | p_value_list.append(p_value)
176 | return p_value_list
177 |
178 |
179 |
180 | def split_train_val(metrics):
181 | keys = list(metrics.keys())
182 | num_elements = len(metrics[keys[0]])
183 | print (f"Using {num_elements} elements")
184 | # select a random subset of val_metrics (50% of ids)
185 | ids_train = np.random.choice(num_elements, num_elements//2, replace=False)
186 | ids_val = np.array([i for i in range(num_elements) if i not in ids_train])
187 | new_metrics_train = {}
188 | new_metrics_val = {}
189 | for key in keys:
190 | new_metrics_train[key] = np.array(metrics[key])[ids_train]
191 | new_metrics_val[key] = np.array(metrics[key])[ids_val]
192 | return new_metrics_train, new_metrics_val
193 |
194 | def main():
195 | args = get_args()
196 | with open(f"new_results/{args.model_name}/{args.dataset_name}_train_metrics.json", 'r') as f:
197 | metrics_train = json.load(f)
198 | with open(f"new_results/{args.model_name}/{args.dataset_name}_val_metrics.json", 'r') as f:
199 | metrics_val = json.load(f)
200 |
201 | if args.false_positive:
202 | metrics_train, metrics_val = split_train_val(metrics_val)
203 |
204 | keys = list(metrics_train.keys())
205 | train_metrics = []
206 | val_metrics = []
207 | for key in keys:
208 | if args.features != "all":
209 | if key not in feature_list:
210 | continue
211 | metrics_train_key = np.array(metrics_train[key])
212 | metrics_val_key = np.array(metrics_val[key])
213 |
214 | # remove the top 2.5% and bottom 2.5% of the data
215 |
216 | metrics_train_key = remove_outliers(metrics_train_key, remove_frac = 0.05, outliers = args.outliers)
217 | metrics_val_key = remove_outliers(metrics_val_key, remove_frac = 0.05, outliers = args.outliers)
218 |
219 | train_metrics.append(metrics_train_key)
220 | val_metrics.append(metrics_val_key)
221 |
222 | # concatenate the train and val metrics by stacking them
223 |
224 | # train_metrics, val_metrics = new_train_metrics, new_val_metrics
225 | train_metrics, val_metrics = normalize_and_stack(train_metrics, val_metrics)
226 |
227 | for i in range(args.num_random):
228 | np.random.shuffle(train_metrics)
229 | np.random.shuffle(val_metrics)
230 |
231 | # train a model by creating a train set and a held out set
232 | num_samples = args.num_samples
233 | (train_x, train_y), (val_x, val_y) = get_dataset_splits(train_metrics, val_metrics, num_samples)
234 |
235 | model = train_model(train_x, train_y, num_epochs = 1000)
236 | preds, loss = get_predictions(model, val_x, val_y)
237 | preds_train, loss_train = get_predictions(model, train_x, train_y)
238 | og_train = preds_train[train_y == 0]
239 | og_val = preds_train[train_y == 1]
240 |
241 | heldout_train = preds[val_y == 0]
242 | heldout_val = preds[val_y == 1]
243 | # alternate hypothesis: heldout_train < heldout_val
244 |
245 | if args.outliers == "p-value" or args.outliers == "mean+p-value":
246 | heldout_train = remove_outliers(heldout_train, remove_frac = 0.05, outliers = "randomize")
247 | heldout_val = remove_outliers(heldout_val, remove_frac = 0.05, outliers = "randomize")
248 |
249 | p_value_list = get_p_value_list(heldout_train, heldout_val)
250 |
251 | # using the model weights, get importance of each feature, and save to csv
252 | weights = model.weight.data.squeeze().tolist()
253 |         features = used_keys  # only the keys actually stacked into the model input (matches the weight order)
254 | feature_importance = {feature: weight for feature, weight in zip(features, weights)}
255 | df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance'])
256 | import os
257 | path_to_append = f"{args.outliers}-outliers/{args.normalize}-normalize"
258 | if args.features == "selected":
259 | path_to_append += "-selected_features"
260 | if args.false_positive:
261 | path_to_append += f"-{args.false_positive}-false_positive"
262 |
263 | model_name = args.model_name.replace("/", "_")
264 | os.makedirs(f"aggregated_results/feature_importance/{path_to_append}/{model_name}", exist_ok=True)
265 | df.to_csv(f'aggregated_results/feature_importance/{path_to_append}/{model_name}/{args.dataset_name}_seed_{i}.csv', index=False)
266 |
267 |
268 |         # create the p-values csv for this model/dataset if it does not exist, then append this seed's row
269 | os.makedirs(f"aggregated_results/p_values/{path_to_append}/{model_name}", exist_ok=True)
270 |
271 | p_file = f"aggregated_results/p_values/{path_to_append}/{model_name}/{args.dataset_name}.csv"
272 | print(f"Writing to {p_file}")
273 | if not os.path.exists(p_file):
274 | with open(p_file, 'w') as f:
275 | to_write = "seed," + ",".join([f"p_{str(p)}" for p in p_sample_list]) + "\n"
276 | f.write(to_write)
277 |
278 |         # check if this seed already has a row in the file
279 | flag = 0
280 | seed = f"seed_{i}"
281 | with open(p_file, 'r') as f:
282 | lines = f.readlines()
283 | for line in lines:
284 | if seed in line:
285 |                     print(f"Seed {seed} already in file {p_file}. Skipping write.\n{p_value_list}")
286 | flag = 1
287 |
288 | if flag == 0:
289 | with open(p_file, 'a') as f:
290 | to_write = seed + "," + ",".join([str(p) for p in p_value_list]) + "\n"
291 | f.write(to_write)
292 |
293 | if __name__ == "__main__":
294 | main()
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import zlib
3 | import tqdm, json
4 |
5 | loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
6 |
7 | def raw_values_batch(model, tokenizer, example_list):
8 | '''
9 | This function takes a list of strings and returns the loss values for each token in the string
10 | input:
11 | model: the language model
12 | tokenizer: the tokenizer
13 | example_list: a list of strings
14 |
15 | output:
16 | loss_list: a list of lists.
17 | Each list contains the loss values for each token in the string
18 |
19 | '''
20 | max_length = tokenizer.model_max_length
21 | input_ids = tokenizer(example_list, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
22 |
23 | if model.device.type == "cuda":
24 | input_ids = {k: v.cuda() for k, v in input_ids.items()}
25 |
26 | # forward pass with no grad
27 | with torch.no_grad():
28 | outputs = model(**input_ids)
29 |
30 | labels = input_ids["input_ids"]
31 | labels[labels == tokenizer.pad_token_id] = -100
32 |
33 | # shift the labels
34 | shifted_labels = labels[..., 1:].contiguous().view(-1)
35 |
36 | # shift the logits
37 | shifted_logits = outputs.logits[..., :-1, :].contiguous()
38 | shifted_logits = shifted_logits.view(-1, shifted_logits.size(-1))
39 |
40 | loss = loss_fct(shifted_logits, shifted_labels)
41 |
42 | # reshape the loss to the original shape
43 | loss = loss.view(labels.size(0), labels.size(1) - 1)
44 |
45 | # now remove the 0 values and create loss as a list of lists
46 | loss_list = loss.tolist()
47 |
48 | for i,entry in enumerate(loss_list):
49 | # remove the 0 values
50 | entry = [x for x in entry if x != 0]
51 | loss_list[i] = entry
52 |
53 | # if any list is empty, remove it
54 | loss_list = [entry for entry in loss_list if len(entry) > 0]
55 |
56 | return loss_list
57 |
58 | def raw_values(model, tokenizer, example_list, batch_size = 32):
59 | '''
60 | This function takes a list of strings and returns the loss values for each token in the string
61 | input:
62 | model: the language model
63 | tokenizer: the tokenizer
64 | example_list: a list of strings
65 | batch_size: the batch size
66 | output:
67 | loss_list: a list of lists.
68 | Each list contains the loss values for each token in the string
69 | '''
70 | loss_list = []
71 | for i in tqdm.tqdm(range(0, len(example_list), batch_size)):
72 | batch = example_list[i:i + batch_size]
73 | loss_list += raw_values_batch(model, tokenizer, batch)
74 | return loss_list
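# Illustrative usage of raw_values (a sketch; assumes a CUDA-capable GPU and the
# prepare_model helper from utils.py, with a placeholder model name and cache dir):
#   model, tokenizer = prepare_model("EleutherAI/pythia-410m", cache_dir="~/.cache")
#   losses = raw_values(model, tokenizer, ["first document", "second document"], batch_size=2)
#   # len(losses) == 2; each entry holds one cross-entropy value per non-padding token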
75 |
76 | def k_min_probs(loss_list, k=0.05, reverse=False):
77 | '''
 78 |     This function takes a list of lists of token losses and returns the mean loss over the k fraction smallest values in each list
 79 |     input:
 80 |         loss_list: a list of lists
 81 |         k: the fraction of smallest values to average over
 82 |         reverse: if True, average over the k fraction largest values instead
 83 |     output:
 84 |         k_min_probs: a list with the mean loss over the selected values for each entry
85 | '''
86 | # sort each list. if reverse is true, sort in reverse order (descending)
87 | sorted_list = [sorted(entry) for entry in loss_list]
88 | if reverse:
89 | sorted_list = [entry[::-1] for entry in sorted_list]
90 | k_min_probs = []
91 | for entry in sorted_list:
92 | # get the k fraction smallest values
93 | num_values = max(1, int(len(entry)*k))
94 | k_min = entry[:num_values]
95 | k_min_prob = sum(k_min)/len(k_min)
96 | k_min_probs.append(k_min_prob)
97 | return k_min_probs
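# k_min_probs worked example (hypothetical loss values): for entry = [0.5, 1.0, 2.0, 4.0]
# and k = 0.5, the two smallest losses [0.5, 1.0] give a mean of 0.75; with
# reverse=True the two largest losses [4.0, 2.0] give 3.0 instead.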
98 |
99 | def perplexity(loss_list):
100 | '''
101 | This function takes a list of lists and returns the perplexity of each list
102 | input:
103 | loss_list: a list of lists
104 |
105 | output:
106 | perplexity: the perplexity of each list
107 | '''
108 | perplexity = []
109 | for entry in loss_list:
110 | # calculate the mean of each list
111 | mean = sum(entry)/len(entry)
112 | # ppl is the exponent of the mean
113 | ppl = torch.exp(torch.tensor(mean)).item()
114 | perplexity.append(ppl)
115 |
116 | return perplexity
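# perplexity worked example (hypothetical): a mean token loss of 2.0 corresponds to
# exp(2.0) ~= 7.39, while a mean loss of 1.0 gives ~2.72; lower loss, lower perplexity.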
117 |
118 | def zlib_ratio(loss_list, example_list):
119 | '''
120 |     This function takes a list of lists and returns the ratio of the mean loss to the zlib compressed length of the input string
121 | input:
122 | loss_list: a list of lists
123 | example_list: a list of strings
124 |
125 | output:
126 |         zlib_ratio: the ratio of the mean loss to the length (in bytes) of the zlib-compressed input string
127 | '''
128 | zlib_ratios = []
129 | for i,entry in enumerate(loss_list):
130 | # calculate the mean of each list
131 | mean = sum(entry)/len(entry)
132 | # calculate the zlib compression of the input string
133 | zlib_entropy = len(zlib.compress(bytes(example_list[i], 'utf-8')))
134 | # calculate the ratio
135 | ratio = mean/zlib_entropy
136 | zlib_ratios.append(ratio)
137 | return zlib_ratios
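# zlib_ratio worked example (hypothetical): a string whose zlib-compressed form is
# 400 bytes and whose mean token loss is 2.0 yields 2.0 / 400 = 0.005; text that is
# easy to compress but hard for the model to predict scores higher.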
138 |
139 | def ppl_ratio(loss_list, reference_list):
140 | '''
141 | This function takes a list of lists and returns the ratio of the mean loss to the perplexity of a reference model
142 | input:
143 | loss_list: a list of lists
144 | reference_list: a list of perplexity values, or a list of lists of loss values
145 |
146 | output:
147 | ratio: the ratio of the mean loss to the perplexity of the reference model
148 | '''
149 | ratios = []
150 | for (entry, entry_ref) in zip(loss_list, reference_list):
151 | # calculate the mean of each list
152 | mean_model = sum(entry)/len(entry)
153 | if type(entry_ref) == list:
154 | mean_ref = sum(entry_ref)/len(entry_ref)
155 | else:
156 | mean_ref = entry_ref
157 | # calculate the ratio
158 | ratio = mean_model/mean_ref
159 | ratios.append(ratio)
160 |
161 | return ratios
162 |
163 | def ppl_diff(loss_list, reference_list):
164 | '''
165 | This function takes a list of lists and returns the difference of the mean loss to the perplexity of a reference model
166 | input:
167 | loss_list: a list of lists
168 | reference_list: a list of perplexity values, or a list of lists of loss values
169 |
170 | output:
171 | diff: the difference of the mean loss to the perplexity of the reference model
172 | '''
173 | diffs = []
174 | for (entry, entry_ref) in zip(loss_list, reference_list):
175 | # calculate the mean of each list
176 | mean_model = sum(entry)/len(entry)
177 | if type(entry_ref) == list:
178 | mean_ref = sum(entry_ref)/len(entry_ref)
179 | else:
180 | mean_ref = entry_ref
181 |         # calculate the difference
182 | diff = mean_model - mean_ref
183 | diffs.append(diff)
184 |
185 | return diffs
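# ppl_ratio / ppl_diff worked example (hypothetical): with a target-model mean loss
# of 2.0 and a reference mean loss of 2.5 on the same text, the ratio is
# 2.0 / 2.5 = 0.8 and the difference is -0.5; values below 1 (or below 0) indicate
# that the target model finds the text easier than the reference does.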
186 |
187 |
188 | def perturbation_ratio(model, tokenizer, dataset, loss_list, batch_size = 32):
189 |     '''Expects a huggingface dataset with a "text" column plus perturbed variants of it, e.g.:
190 | Dataset({
191 | features: ['text', 'synonym_substitution', 'butter_fingers', 'random_deletion', 'change_char_case', 'whitespace_perturbation', 'underscore_trick'],
192 | num_rows: 2000
193 | })
194 | '''
195 | result = {}
196 | for perturbation in dataset.column_names:
197 | if perturbation != "text":
198 | perturbed_list = dataset[perturbation]
199 | perturbed_loss_list = raw_values(model, tokenizer, perturbed_list, batch_size = batch_size)
200 | ratios = ppl_ratio(loss_list, perturbed_loss_list)
201 | diffs = ppl_diff(loss_list, perturbed_loss_list)
202 | result[f"ppl_ratio_{perturbation}"] = ratios
203 | result[f"ppl_diff_{perturbation}"] = diffs
204 | return result
205 |
206 |
207 |
208 |
209 | reference_model_registry = {
210 | "silo":"kernelmachine/silo-pdswby-1.3b",
211 | "tinystories-33M": "roneneldan/TinyStories-33M",
212 | "tinystories-1M": "roneneldan/TinyStories-1M",
213 | "phi-1_5": "microsoft/phi-1_5",
214 | # "phi-1": "microsoft/phi-1",
215 | }
216 |
217 |
218 |
219 | def aggregate_metrics(model, tokenizer, dataset, metric_list, args, batch_size = 32):
220 | '''
221 | This function takes a list of strings and returns a dictionary of metrics
222 | input:
223 | model: the language model
224 | tokenizer: the tokenizer
225 | dataset: a huggingface dataset, with key "text" containing the strings
226 | metric_list: a list of metrics to calculate
227 |
228 | output:
229 | metrics: a dictionary of metrics
230 | '''
231 | example_list = dataset["text"]
232 | loss_list = raw_values(model, tokenizer, example_list, batch_size = batch_size)
233 |
234 | metrics = {}
235 | if "ppl" in metric_list:
236 | metrics["ppl"] = perplexity(loss_list)
237 | if "k_min_probs" in metric_list:
238 | for k in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
239 | metrics[f"k_min_probs_{k}"] = k_min_probs(loss_list, k=k)
240 | if "k_max_probs" in metric_list:
241 | for k in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
242 | metrics[f"k_max_probs_{k}"] = k_min_probs(loss_list, k=k, reverse=True)
243 | if "zlib_ratio" in metric_list:
244 | metrics["zlib_ratio"] = zlib_ratio(loss_list, example_list)
245 |
246 | if "perturbation" in metric_list:
247 | ratios_dict = perturbation_ratio(model, tokenizer, dataset, loss_list, batch_size)
248 | metrics.update(ratios_dict)
249 |
250 | if "reference_model" in metric_list:
251 | # for computation efficiency, we now enforce that the reference model should already have been run and its ppl saved
252 | for model_name in reference_model_registry:
253 | hf_path = reference_model_registry[model_name]
254 | with open(f"results/{hf_path}/{args.dataset_name}_{args.split}_metrics.json", 'r') as f:
255 | metrics_train = json.load(f)
256 | ref_ppl = metrics_train["ppl"]
257 | ref_ratios = ppl_ratio(loss_list, ref_ppl)
258 | ref_diffs = ppl_diff(loss_list, ref_ppl)
259 | metrics[f"ref_ppl_ratio_{model_name}"] = ref_ratios
260 | metrics[f"ref_ppl_diff_{model_name}"] = ref_diffs
261 |
262 | '''
263 | old code to run reference models on the fly
264 | from utils import prepare_model
265 | for model_name in reference_model_registry:
266 | hf_path = reference_model_registry[model_name]
267 | model, tokenizer = prepare_model(hf_path)
268 |
269 | reference_list = raw_values(model, tokenizer, example_list, batch_size = batch_size)
270 | metrics[f"ref_ppl_ratio_{model_name}"] = ppl_ratio(loss_list, reference_list)
271 | metrics[f"ref_ppl_diff_{model_name}"] = ppl_diff(loss_list, reference_list)
272 | '''
273 |
274 | return metrics
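# Illustrative call (a sketch, not taken from the repo's launch scripts; the model
# name, cache_dir and metric_list below are assumptions):
#   from utils import prepare_model
#   model, tokenizer = prepare_model("EleutherAI/pythia-410m", cache_dir="~/.cache")
#   metrics = aggregate_metrics(model, tokenizer, dataset, ["ppl", "k_min_probs", "zlib_ratio"], args, batch_size=16)
#   # metrics["ppl"] then holds one perplexity value per example in dataset["text"]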
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lm_dataformat
2 | nltk
3 | git+https://github.com/huggingface/transformers
4 | datasets
5 | # additional packages imported by the scripts in this repository
6 | torch
7 | numpy
8 | pandas
9 | scipy
10 | tqdm
--------------------------------------------------------------------------------
/results_reader.py:
--------------------------------------------------------------------------------
1 | # go to p_values/{model}
2 | # read every csv file
3 | # print: {csv_name}: Number of values < 0.1 = {number of values < 0.1}
4 |
5 | import sys
6 | model_name = sys.argv[1]
7 | import os
8 | import pandas as pd
9 |
10 | p_values_dir = f"p_values/{model_name}"
11 | p_values = {}
12 | for file in sorted(os.listdir(p_values_dir)):
13 | if file.endswith(".csv"):
14 | p_values[file] = pd.read_csv(f"{p_values_dir}/{file}")
15 |
16 | for key in p_values:
17 | print(f"{key}: Number of values < 0.1 = {len(p_values[key][p_values[key]['p_value'] < 0.1])}")
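# Note (assumption about intended use): this reader expects csv files with a single
# 'p_value' column under p_values/{model}; the per-seed files written by linear_di.py
# live under aggregated_results/p_values/... with one p_{n} column per sample size,
# so the path and column name would need to be adapted to read those directly.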
--------------------------------------------------------------------------------
/scripts/data_creator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd ..
3 |
4 | for dataset in stackexchange wikipedia cc github pubmed_abstracts openwebtext2 freelaw math nih uspto hackernews enron books3 pubmed_central gutenberg arxiv bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers
5 | do
6 | echo "dataset: $dataset"
7 | python data_creator.py --dataset_name $dataset
8 | done
--------------------------------------------------------------------------------
/scripts/di_launcher_individual.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #SBATCH --output=slurm_output/slurm_%j.out # Standard output
3 | #SBATCH --error=slurm_output/slurm_%j.err # Standard error
4 | #SBATCH --cpus-per-task=4
5 | #SBATCH --gpus-per-node=A6000:1
6 | #SBATCH --tasks-per-node=1
7 | #SBATCH --mem=50G
8 | #SBATCH --time=20:00:00
9 | #SBATCH --mail-user=pratyus2@cs.cmu.edu
10 | #SBATCH --partition=general
11 |
12 | model_name=$1
13 | split_name=$2
14 | gpu_id=$3
15 | batch_size=$4
16 | dataset=$5
17 |
18 | source ~/.bashrc
19 | conda init
20 |
21 | # if model_name is "kernelmachine/silo-pdswby-1.3b", then conda activate di_silo
22 | # conda activate di
23 |
24 | if [ $model_name = "kernelmachine/silo-pdswby-1.3b" ]
25 | then
26 | conda activate di_silo
27 | else
28 | conda activate di
29 | fi
30 |
31 | cd /home/pratyus2/projects/llm_dataset_inference
32 |
33 | echo "model_name: $model_name split_name: $split_name gpu_id: $gpu_id"
34 |
35 | echo "dataset: $dataset"
36 | CUDA_VISIBLE_DEVICES=$gpu_id python di.py --split $split_name --dataset_name $dataset --model_name $model_name --batch_size $batch_size
--------------------------------------------------------------------------------
/scripts/di_mega_launcher.sh:
--------------------------------------------------------------------------------
1 | # launch all the models on the inference dataset
2 | # "kernelmachine/silo-pdswby-1.3b",
3 | # "roneneldan/TinyStories-33M",
4 | # "roneneldan/TinyStories-1M",
5 | # "microsoft/phi-1_5",
6 | # "microsoft/phi-1",
7 |
8 | # for model_name in "roneneldan/TinyStories-33M" "roneneldan/TinyStories-1M" #"microsoft/phi-1_5"
9 | # for model_name in "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-1.3b-deduped" "EleutherAI/pythia-6.9b-deduped"
10 | # for model_name in "kernelmachine/silo-pdswby-1.3b" #(need different git repo for this model)
11 | num_jobs=0
12 | for model_name in "EleutherAI/pythia-410m"
13 | do
14 | if [ $model_name = "EleutherAI/pythia-6.9b" ]
15 | then
16 | batch_size=8
17 | else
18 | batch_size=32
19 | fi
20 | # batch_size=1 if model_name = "EleutherAI/pythia-12b"
21 | if [ $model_name = "EleutherAI/pythia-12b-deduped" ]
22 | then
23 | batch_size=1
24 | fi
25 |
26 | for dataset in bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers pubmed_abstracts math nih enron stackexchange wikipedia cc github openwebtext2 freelaw uspto hackernews books3 pubmed_central gutenberg arxiv #
27 | do
28 | for split_name in "train" "val"
29 | do
30 | num_jobs=$((num_jobs+1))
31 |             # wait once 24 jobs have been submitted
32 |             if [ $num_jobs -eq 24 ]
33 |             then
34 |                 echo "waiting for submitted jobs to complete"
35 | wait
36 | sleep 100s
37 | # check squeue if any process is running by user pratyus2
38 | while [ $(squeue -u pratyus2 | wc -l) -gt 1 ]
39 | do
40 |                 echo "waiting for submitted jobs to complete"
41 | sleep 10s
42 | done
43 | num_jobs=0
44 | fi
45 | sbatch di_launcher_individual.sh $model_name $split_name 0 $batch_size $dataset
46 | done
47 | # sbatch di_launcher_b.sh $model_name $split_name 0 $batch_size
48 | # sbatch di_launcher_c.sh $model_name $split_name 0 $batch_size
49 | # sbatch di_launcher_d.sh $model_name $split_name 0 $batch_size
50 | # sbatch di_launcher_e.sh $model_name $split_name 0 $batch_size
51 | # sbatch di_launcher_f.sh $model_name $split_name 0 $batch_size
52 | done
53 | done
--------------------------------------------------------------------------------
/scripts/launcher.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #SBATCH --output=slurm_output/slurm_%j.out # Standard output
3 | #SBATCH --error=slurm_output/slurm_%j.err # Standard error
4 | #SBATCH --cpus-per-task=4
5 | #SBATCH --tasks-per-node=1
6 | #SBATCH --mem=10G
7 | #SBATCH --time=1-10:00:00
8 | #SBATCH --partition=general
9 | #SBATCH --array=0-999
10 |
11 |
12 | datasets=(stackexchange wikipedia cc github pubmed_abstracts openwebtext2 freelaw math nih uspto hackernews enron books3 pubmed_central gutenberg arxiv bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers)
13 | outliers=("mean" "p-value")
14 | normalizes=("combined" "train" "no")
15 | features=("all" "selected")
16 | false_positives=(1 0)
17 | models=("EleutherAI/pythia-12b-deduped" "EleutherAI/pythia-12b" "EleutherAI/pythia-6.9b-deduped" "EleutherAI/pythia-6.9b" "EleutherAI/pythia-1.3b-deduped" "EleutherAI/pythia-1.3b" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-410m")
18 |
19 | # offset the array index by 1000 (the SLURM array covers task ids 0-999; this run addresses the configurations numbered 1000-1999)
20 | SLURM_ARRAY_TASK_ID=$((SLURM_ARRAY_TASK_ID + 1000))
21 |
22 | # Calculate the array index
23 | dataset_idx=$((SLURM_ARRAY_TASK_ID % ${#datasets[@]}))
24 | outlier_idx=$((SLURM_ARRAY_TASK_ID / ${#datasets[@]} % ${#outliers[@]}))
25 | normalize_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]}) % ${#normalizes[@]}))
26 | feature_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]} * ${#normalizes[@]}) % ${#features[@]}))
27 | false_positive_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]} * ${#normalizes[@]} * ${#features[@]}) % ${#false_positives[@]}))
28 | model_idx=$((SLURM_ARRAY_TASK_ID / (${#datasets[@]} * ${#outliers[@]} * ${#normalizes[@]} * ${#features[@]} * ${#false_positives[@]})))
29 |
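# Worked example of the decoding above: raw array index 0 becomes 1000 after the
# offset, which (with 22 datasets, 2 outlier modes, 3 normalizations, 2 feature sets,
# 2 false-positive flags, 8 models) maps to dataset "hackernews", outlier "p-value",
# normalize "train", features "selected", false_positive 0, model "EleutherAI/pythia-12b".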
30 | dataset=${datasets[$dataset_idx]}
31 | outlier=${outliers[$outlier_idx]}
32 | normalize=${normalizes[$normalize_idx]}
33 | features=${features[$feature_idx]}
34 | false_positive=${false_positives[$false_positive_idx]}
35 | model_name=${models[$model_idx]}
36 |
37 | if [ $false_positive -eq 1 ]; then
38 | num_samples=500
39 | else
40 | num_samples=1000
41 | fi
42 |
43 | # model_name=$1
44 | # outlier=$2
45 | # normalize=$3
46 | # features=$4
47 | # false_positive=$5
48 | # num_samples=$6
49 | # dataset=$7
50 |
51 | echo model_name: $model_name outliers: $outlier normalize: $normalize features: $features false_positive: $false_positive num_samples: $num_samples dataset: $dataset
52 |
53 |
54 | source ~/.bashrc
55 | conda init
56 | conda activate di
57 |
58 | cd /home/pratyus2/projects/llm_dataset_inference
59 |
60 | python linear_di.py --num_random 10 --dataset_name $dataset --model_name $model_name --normalize $normalize --outliers $outlier --features $features --false_positive $false_positive --num_samples $num_samples
61 |
--------------------------------------------------------------------------------
/scripts/mega_launcher.sh:
--------------------------------------------------------------------------------
1 |
2 | for dataset in stackexchange wikipedia cc github pubmed_abstracts openwebtext2 freelaw math nih uspto hackernews enron books3 pubmed_central gutenberg arxiv bookcorpus2 opensubtitles youtubesubtitles ubuntu europarl philpapers
3 | do
4 | for outliers in "mean+p-value" "mean" "p-value" #"clip" "zero" "keep" "randomize" #
5 | do
6 | for normalize in "combined" "train" "no"
7 | do
8 | for features in "all" "selected"
9 | do
10 | for false_positive in 1 0
11 | do
12 | for model in "EleutherAI/pythia-12b-deduped" "EleutherAI/pythia-12b" "EleutherAI/pythia-6.9b-deduped" "EleutherAI/pythia-6.9b" "EleutherAI/pythia-1.3b-deduped" "EleutherAI/pythia-1.3b" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-410m"
13 | do
14 | # num_samples=500 if false_positive=1 else 1000
15 | if [ $false_positive -eq 1 ]
16 | then
17 | num_samples=500
18 | else
19 | num_samples=1000
20 | fi
21 | sbatch launcher.sh $model $outliers $normalize $features $false_positive $num_samples $dataset
22 | done
23 | wait
24 | done
25 | done
26 |
27 | done
28 | done
29 | done
--------------------------------------------------------------------------------
/selected_features.py:
--------------------------------------------------------------------------------
1 | feature_list = [
2 | "ppl",
3 | "k_min_probs_0.05",
4 | "k_min_probs_0.1",
5 | "k_min_probs_0.2",
6 | "k_min_probs_0.3",
7 | "k_min_probs_0.4",
8 | "k_min_probs_0.5",
9 | "k_min_probs_0.6",
10 | "k_max_probs_0.05",
11 | "k_max_probs_0.1",
12 | "k_max_probs_0.2",
13 | "k_max_probs_0.3",
14 | "k_max_probs_0.4",
15 | "k_max_probs_0.5",
16 | "k_max_probs_0.6",
17 | "zlib_ratio",
18 | "ppl_ratio_synonym_substitution",
19 | "ppl_ratio_butter_fingers",
20 | "ppl_ratio_random_deletion",
21 | "ppl_ratio_change_char_case",
22 | "ppl_ratio_whitespace_perturbation",
23 | "ppl_ratio_underscore_trick",
24 | "ref_ppl_ratio_silo",
25 | "ref_ppl_ratio_tinystories-33M",
26 | "ref_ppl_ratio_tinystories-1M",
27 | "ref_ppl_ratio_phi-1_5",
28 | ]
--------------------------------------------------------------------------------
/transform.py:
--------------------------------------------------------------------------------
1 | import sys, time
2 | sys.path.append("NL-Augmenter")
3 |
4 | # pip install spacy torchtext cucco fastpunct sacremoses
5 | # python -m spacy download en_core_web_sm
6 |
7 |
8 | from nlaugmenter.transformations.butter_fingers_perturbation.transformation import ButterFingersPerturbation
9 | from nlaugmenter.transformations.random_deletion.transformation import RandomDeletion
10 | from nlaugmenter.transformations.synonym_substitution.transformation import SynonymSubstitution
11 | from nlaugmenter.transformations.back_translation.transformation import BackTranslation
12 | from nlaugmenter.transformations.change_char_case.transformation import ChangeCharCase
13 | from nlaugmenter.transformations.whitespace_perturbation.transformation import WhitespacePerturbation
14 | from nlaugmenter.transformations.underscore_trick.transformation import UnderscoreTrick
15 | from nlaugmenter.transformations.style_paraphraser.transformation import StyleTransferParaphraser
16 | from nlaugmenter.transformations.punctuation.transformation import PunctuationWithRules
17 |
18 |
19 |
20 |
21 | def aug_generator(text_list, aug_style):
22 |
23 | if aug_style == "butter_fingers":
24 | t1 = ButterFingersPerturbation(max_outputs=1)
25 | return [t1.generate(text_list[i], prob = 0.1)[0] for i in range(len(text_list))]
26 | elif aug_style == "random_deletion":
27 | t1 = RandomDeletion(prob=0.25)
28 | return [t1.generate(text_list[i])[0] for i in range(len(text_list))]
29 | elif aug_style == "synonym_substitution":
30 | syn = SynonymSubstitution(max_outputs=1, prob = 0.2)
31 | return [syn.generate(text_list[i])[0] for i in range(len(text_list))]
32 | elif aug_style == "back_translation":
33 | trans = BackTranslation()
34 | return [trans.generate(text_list[i])[0] for i in range(len(text_list))]
35 | elif aug_style == "change_char_case":
36 | t1 = ChangeCharCase()
37 | return [t1.generate(text_list[i], prob = 0.25)[0] for i in range(len(text_list))]
38 | elif aug_style == "whitespace_perturbation":
39 | t1 = WhitespacePerturbation()
40 | return [t1.generate(text_list[i], prob = 0.25)[0] for i in range(len(text_list))]
41 | elif aug_style == "underscore_trick":
42 | t1 = UnderscoreTrick(prob = 0.25)
43 | return [t1.generate(text_list[i])[0] for i in range(len(text_list))]
44 | elif aug_style == "style_paraphraser":
45 | t1 = StyleTransferParaphraser(style = "Basic", upper_length="same_5")
46 | return [t1.generate(text_list[i])[0] for i in range(len(text_list))]
47 | elif aug_style == "punctuation_perturbation":
48 | normalizations = ['remove_extra_white_spaces', ('replace_characters', {'characters': 'was', 'replacement': 'TZ'}),
49 | ('replace_emojis', {'replacement': 'TESTO'})]
50 | punc = PunctuationWithRules(rules=normalizations)
51 | return [punc.generate(text_list[i])[0] for i in range(len(text_list))]
52 | else:
53 | raise ValueError("Augmentation style not found. Please check the available styles.")
54 |
55 | def generate_perturbations(text_list):
56 | augmentation_styles = ["synonym_substitution", "butter_fingers", "random_deletion", "change_char_case", "whitespace_perturbation", "underscore_trick"]
57 | all_augmented = {}
58 | for style in augmentation_styles:
59 | start = time.time()
60 | aug_list = aug_generator(text_list, style)
61 | all_augmented[style] = aug_list
62 | print(f"Perturbing with {style} took {time.time() - start} seconds")
63 | return all_augmented
64 |
65 | if __name__ == "__main__":
66 | text_list = ["This is a test sentence. It is a good sentence.", "This is another test sentence. It is a bad sentence."]
67 | print(generate_perturbations(text_list))
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | import torch
3 |
4 | def prepare_model(model_name, cache_dir, quant=None):
5 | tokenizer = AutoTokenizer.from_pretrained(model_name)
6 | # pad token
7 | tokenizer.pad_token = tokenizer.eos_token
8 | tokenizer.padding_side = "right"
9 | tokenizer.model_max_length = 512
10 |
11 |     if quant is None:
12 |         model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True).cuda()
13 |     elif quant == "fp16":
14 |         model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()  # note: loads bfloat16 weights
15 |     elif quant == "8bit":
16 |         model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True, load_in_8bit=True)  # 8-bit weights are placed on GPU at load time; no .cuda() call
17 |
18 | print("Model loaded")
19 | return model, tokenizer
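# Illustrative usage (model name and cache path are placeholders):
#   model, tokenizer = prepare_model("EleutherAI/pythia-410m", cache_dir="/tmp/hf_cache")
#   model8bit, tokenizer8bit = prepare_model("EleutherAI/pythia-6.9b", cache_dir="/tmp/hf_cache", quant="8bit")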
--------------------------------------------------------------------------------