├── human_eval ├── bigcode_eval │ ├── __init__.py │ ├── tasks │ │ ├── custom_metrics │ │ │ ├── __init__.py │ │ │ ├── pal_metric │ │ │ │ ├── __init__.py │ │ │ │ └── pal_code_exec.py │ │ │ └── multiple_metrics │ │ │ │ ├── __init__.py │ │ │ │ ├── safe_subprocess │ │ │ │ ├── .gitignore │ │ │ │ ├── evil_programs │ │ │ │ │ ├── block_on_inputs.py │ │ │ │ │ ├── fork_bomb.py │ │ │ │ │ ├── sleep_forever.py │ │ │ │ │ ├── unbounded_output.py │ │ │ │ │ ├── fork_once.py │ │ │ │ │ └── close_outputs.py │ │ │ │ ├── __init__.py │ │ │ │ └── module_test.py │ │ │ │ ├── eval_lua.py │ │ │ │ ├── eval_php.py │ │ │ │ ├── eval_python.py │ │ │ │ ├── eval_pl.py │ │ │ │ ├── single_experiment_pass_k.py │ │ │ │ ├── eval_julia.py │ │ │ │ ├── eval_sh.py │ │ │ │ ├── eval_swift.py │ │ │ │ ├── eval_ts.py │ │ │ │ ├── eval_go.py │ │ │ │ ├── eval_cpp.py │ │ │ │ ├── eval_racket.py │ │ │ │ ├── eval_scala.py │ │ │ │ ├── libeval.py │ │ │ │ ├── eval_ruby.py │ │ │ │ ├── eval_r.py │ │ │ │ ├── eval_java.py │ │ │ │ ├── eval_javascript.py │ │ │ │ ├── eval_rust.py │ │ │ │ ├── eval_cs.py │ │ │ │ ├── eval_dlang.py │ │ │ │ ├── evaluation.py │ │ │ │ ├── containerized_eval.py │ │ │ │ └── generic_eval.py │ │ ├── few_shot_examples │ │ │ ├── conala_few_shot_prompts.json │ │ │ ├── concode_few_shot_prompts.json │ │ │ ├── codexglue_text_to_text_few_shot_prompts.json │ │ │ └── gsm8k_few_shot_prompts.json │ │ ├── __init__.py │ │ ├── concode.py │ │ ├── mbpp.py │ │ ├── conala.py │ │ ├── codexglue_text_to_text.py │ │ ├── apps.py │ │ ├── python_bugs.py │ │ └── humaneval.py │ ├── arguments.py │ ├── base.py │ └── evaluator.py ├── requirements.txt ├── Dockerfile ├── tests │ ├── data │ │ ├── mbpp_eval_gens.json │ │ ├── mbpp_gen_refs.json │ │ ├── humaneval_eval_gens.json │ │ ├── pal-gsm8k-greedy_eval_gens.json │ │ ├── humaneval_gen_refs.json │ │ ├── humaneval_gen_gens.json │ │ ├── mbpp_gen_gens.json │ │ └── pal-gsm8k-greedy_prompt.json │ ├── test_prompts.py │ └── test_generation_evaluation.py ├── eval.sh ├── makefile ├── finetuning │ ├── CodeComplex │ │ ├── README.md │ │ └── train.py │ ├── CodeDefect │ │ ├── README.md │ │ └── train.py │ ├── CodeClone │ │ ├── README.md │ │ └── train.py │ ├── APPS │ │ ├── README.md │ │ └── apps_train.py │ ├── Code-to-text │ │ ├── README.md │ │ └── train.py │ └── README.md ├── leaderboard │ ├── throughput_config.yaml │ ├── group_jsons.py │ └── multiple_eval.slurm ├── setup.py ├── Dockerfile-multiple ├── templates │ └── new_task.py └── .gitignore ├── sensitive_memorization ├── utils │ └── __init__.py ├── analyze.py ├── tokenize_secrets_and_prefixes.py ├── filter.py └── generate_secret_mask.py ├── assets ├── Unlearning.jpg ├── Illustration.jpg ├── MemorizationDistribution.jpg └── SensitiveMemorizationDetection.jpg ├── .gitignore ├── memorization_thresholds ├── TopLists │ ├── Ruby-top-repos.txt │ ├── Lua-top-repos.txt │ ├── PHP-top-repos.txt │ └── Rust-top-repos.txt ├── clean.sh ├── deduplicate.py ├── collect_data.sh ├── humaneval_mbpp_get.py ├── clone_repo.sh ├── extract_code.py ├── sample.py └── gh_crawler.py ├── cache_models.py ├── LICENSE ├── unlearning_preparation ├── forgotten_data_sample.py └── retained_data_sample.py └── unlearning └── dataset.py /human_eval/bigcode_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sensitive_memorization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/pal_metric/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/Unlearning.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/Unlearning.jpg -------------------------------------------------------------------------------- /assets/Illustration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/Illustration.jpg -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/.gitignore: -------------------------------------------------------------------------------- 1 | /__pycache__ 2 | /.pytest_cache -------------------------------------------------------------------------------- /assets/MemorizationDistribution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/MemorizationDistribution.jpg -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/block_on_inputs.py: -------------------------------------------------------------------------------- 1 | while True: 2 | input() 3 | -------------------------------------------------------------------------------- /assets/SensitiveMemorizationDetection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/SensitiveMemorizationDetection.jpg -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/fork_bomb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | while True: 4 | os.fork() 5 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/sleep_forever.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | while True: 4 | time.sleep(60) 5 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/unbounded_output.py: -------------------------------------------------------------------------------- 1 | b = True 2 | while True: 3 | print(b) 4 | b = not b 5 | -------------------------------------------------------------------------------- 
/human_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.25.1 2 | accelerate>=0.13.2 3 | datasets>=2.6.1 4 | evaluate>=0.3.0 5 | pyext==0.7 6 | mosestokenizer==1.0.0 7 | huggingface_hub>=0.11.1 8 | fsspec<2023.10.0 9 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/fork_once.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | if os.fork() == 0: 5 | while True: 6 | time.sleep(60) 7 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/close_outputs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | print("This is the end") 4 | sys.stdout.close() 5 | sys.stderr.close() 6 | while True: 7 | pass 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | memorization_thresholds/Code/ 4 | memorization_thresholds/Repos/ 5 | 6 | sensitive_memorization/codeparrot-clean-train-secrets-* 7 | sensitive_memorization/.codeparrot-clean-train-cache 8 | 9 | unlearning_preparation/benchmark.feather 10 | 11 | unlearning/ckpts/ 12 | -------------------------------------------------------------------------------- /human_eval/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt-get update && apt-get install -y python3 python3-pip 4 | 5 | COPY . /app 6 | 7 | WORKDIR /app 8 | 9 | RUN test -f /app/generations.json && rm /app/generations.json || true 10 | 11 | RUN pip3 install . 
12 | 13 | CMD ["python3", "main.py"] 14 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/conala_few_shot_prompts.json: -------------------------------------------------------------------------------- 1 | {"instruction1": "convert a list of integers into a single integer", "instruction2": "how to convert a datetime string back to datetime object?", "solution1": "r = int(''.join(map(str, x)))", "solution2": "datetime.datetime.strptime(str, '%m/%d/%Y')"} -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/Ruby-top-repos.txt: -------------------------------------------------------------------------------- 1 | 28270 https://github.com/maybe-finance/maybe 2 | 5489 https://github.com/docusealco/docuseal 3 | 707 https://github.com/rage-rb/rage 4 | 694 https://github.com/Multiwoven/multiwoven 5 | 616 https://github.com/darwin-containers/homebrew-formula 6 | 584 https://github.com/wouterken/crystalruby 7 | 516 https://github.com/Freika/dawarich 8 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/concode_few_shot_prompts.json: -------------------------------------------------------------------------------- 1 | {"instruction1": "get the distance of map coordinates to the center ", "instruction2": "check if details are parsed", "solution1": "float function ( int arg0 , int arg1 ) { int loc0 = arg0 - cx ; int loc1 = arg1 - cy ; return getSquaredDistance ( loc0 , loc1 ) ; }", "solution2": "boolean function ( ) { return isParsed ; }"} -------------------------------------------------------------------------------- /human_eval/tests/data/mbpp_eval_gens.json: -------------------------------------------------------------------------------- 1 | [["def remove_Occ(s,ch): \r\n for i in range(len(s)): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n for i in range(len(s) - 1,-1,-1): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n return s ", "This is some random text"], ["This is some random text", "This is some random text"]] -------------------------------------------------------------------------------- /memorization_thresholds/clean.sh: -------------------------------------------------------------------------------- 1 | cd Repos/Python/ 2 | find . -maxdepth 2 -type d -empty | xargs -i sudo rm -rf {} 3 | find . -maxdepth 1 -type d -empty | xargs -i sudo rm -rf {} 4 | 5 | cd ../../Code/Python/ 6 | find . -maxdepth 2 -type d -empty | xargs -i sudo rm -rf {} 7 | find . 
-maxdepth 1 -type d -empty | xargs -i sudo rm -rf {} 8 | 9 | ls ../../Repos/Python/ | xargs -i sudo rm -rf {} 10 | sudo rm -rf ../../Repos/Python/* 11 | -------------------------------------------------------------------------------- /human_eval/tests/data/mbpp_gen_refs.json: -------------------------------------------------------------------------------- 1 | ["assert remove_Occ(\"hello\",\"l\") == \"heo\"\nassert remove_Occ(\"abcda\",\"a\") == \"bcd\"\nassert remove_Occ(\"PHP\",\"P\") == \"H\"", "assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]\nassert sort_matrix([[1, 2, 3], [-2, 4, -5], [1, -1, 1]])==[[-2, 4, -5], [1, -1, 1], [1, 2, 3]]\nassert sort_matrix([[5,8,9],[6,4,3],[2,1,4]])==[[2, 1, 4], [6, 4, 3], [5, 8, 9]]"] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_lua.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["lua", str(path)]) 8 | if r.timeout: 9 | status = "Timeout" 10 | elif r.exit_code == 0: 11 | status = "OK" 12 | else: 13 | status = "Exception" 14 | return { 15 | "status": status, 16 | "exit_code": r.exit_code, 17 | "stdout": r.stdout, 18 | "stderr": r.stderr, 19 | } 20 | -------------------------------------------------------------------------------- /human_eval/eval.sh: -------------------------------------------------------------------------------- 1 | model=$1 2 | batch_size=$2 3 | 4 | accelerate launch main.py \ 5 | --model $model \ 6 | --tasks humaneval \ 7 | --batch_size $batch_size \ 8 | --max_length_generation 512 \ 9 | --precision fp16 \ 10 | --allow_code_execution \ 11 | --metric_output_path $model/humaneval_evaluation_results.json \ 12 | --save_generations --save_generations_path $model/humaneval_generations.json \ 13 | --max_memory_per_gpu auto \ 14 | --do_sample True \ 15 | --temperature 0.2 \ 16 | --top_p 0.95 \ 17 | --n_samples 50 \ 18 | --seed 42 19 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_php.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | LANG_NAME = "PHP" 6 | LANG_EXT = ".php" 7 | 8 | 9 | def eval_script(path: Path): 10 | r = run(["php", path]) 11 | if "PHP Parse error" in r.stdout: 12 | status = "SyntaxError" 13 | elif r.exit_code != 0: 14 | status = "Exception" 15 | else: 16 | status = "OK" 17 | return { 18 | "status": status, 19 | "exit_code": r.exit_code, 20 | "stdout": r.stdout, 21 | "stderr": r.stderr, 22 | } 23 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_python.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["python3", str(path)]) 8 | if r.timeout: 9 | status = "Timeout" 10 | elif r.exit_code == 0: 11 | status = "OK" 12 | elif "SyntaxError" in r.stderr: 13 | status = "SyntaxError" 14 | else: 15 | status = "Exception" 16 | return { 17 | "status": status, 18 | "exit_code": r.exit_code, 19 | "stdout": r.stdout, 20 | "stderr": r.stderr, 21 | } 22 | 
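# The eval_* modules in this directory share one contract: eval_script(path)
# runs a single generated program and returns a dict with "status", "exit_code",
# "stdout" and "stderr". A minimal driver sketch (the candidate path below is a
# hypothetical example, not something the harness creates):
#
#   from pathlib import Path
#   from .eval_python import eval_script
#
#   result = eval_script(Path("/tmp/candidate.py"))
#   assert result["status"] in {"OK", "Timeout", "SyntaxError", "Exception"}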
-------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_pl.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["perl", path]) 8 | 9 | if r.timeout: 10 | status = "Timeout" 11 | elif r.exit_code != 0: 12 | status = "Exception" 13 | elif "ERROR" in r.stdout or "ERROR" in r.stderr: 14 | status = "Exception" 15 | else: 16 | status = "OK" 17 | return { 18 | "status": status, 19 | "exit_code": r.exit_code, 20 | "stdout": r.stdout, 21 | "stderr": r.stderr, 22 | } 23 | -------------------------------------------------------------------------------- /human_eval/tests/test_prompts.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bigcode_eval import tasks 4 | 5 | TASKS = ["pal-gsm8k-greedy"] 6 | 7 | sample_doc = {"pal-gsm8k-greedy": {"question": "test"}} 8 | 9 | 10 | def load_reference_prompt(task_name): 11 | with open(f"tests/data/{task_name}_prompt.json") as fp: 12 | prompts = json.load(fp) 13 | return prompts["prompt"] 14 | 15 | 16 | def test_gsm_prompt(): 17 | for task_name in TASKS: 18 | task = tasks.get_task(task_name) 19 | task_prompt = task.get_prompt(sample_doc[task_name]) 20 | ref_prompt = load_reference_prompt(task_name) 21 | assert task_prompt == ref_prompt 22 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/single_experiment_pass_k.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | 6 | def estimator(n: int, c: int, k: int) -> float: 7 | """ 8 | Calculates 1 - comb(n - c, k) / comb(n, k). 9 | """ 10 | if n - c < k: 11 | return 1.0 12 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 13 | 14 | 15 | def for_file(path): 16 | with open(path, "r") as f: 17 | data = json.load(f) 18 | n = len(data["results"]) 19 | c = len( 20 | [True for r in data["results"] if r["status"] == "OK" and r["exit_code"] == 0] 21 | ) 22 | return np.array([estimator(n, c, 1), estimator(n, c, 10), estimator(n, c, 100)]) 23 | -------------------------------------------------------------------------------- /human_eval/makefile: -------------------------------------------------------------------------------- 1 | # There are two dockerfiles: for all benchmarks, and for MultiPL-E 2 | DOCKERFILE=Dockerfile 3 | 4 | ifeq ($(DOCKERFILE), Dockerfile) 5 | IMAGE_NAME=evaluation-harness 6 | else 7 | IMAGE_NAME=evaluation-harness-multiple 8 | endif 9 | 10 | build: 11 | docker build -f $(DOCKERFILE) -t $(IMAGE_NAME) . 
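# Tip: `make build DOCKERFILE=Dockerfile-multiple` builds the MultiPL-E image instead (named evaluation-harness-multiple via the ifeq above).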
12 | 13 | test: 14 | docker run -v $(CURDIR)/tests/docker_test/test_generations.json:/app/test_generations.json:ro \ 15 | -it $(IMAGE_NAME) python3 main.py --model dummy_model --tasks humaneval --limit 4 \ 16 | --load_generations_path /app/test_generations.json --allow_code_execution 17 | 18 | @echo "If pass@1 is 0.25 then your configuration for standard benchmarks is correct" 19 | 20 | all: build test -------------------------------------------------------------------------------- /human_eval/tests/data/humaneval_eval_gens.json: -------------------------------------------------------------------------------- 1 | [["from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "This is some random text"], ["This is some random text", "This is some random text"]] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_julia.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | result = run(["julia", str(path)], timeout_seconds=5) 8 | if result.timeout: 9 | status = "Timeout" 10 | elif result.exit_code == 0: 11 | status = "OK" 12 | # TODO(arjun): I would like this to be reviewed more carefully by John. 13 | elif len(result.stderr) < 1: 14 | status = "Exception" 15 | else: 16 | status = "SyntaxError" 17 | 18 | return { 19 | "status": status, 20 | "exit_code": result.exit_code, 21 | "stdout": result.stdout, 22 | "stderr": result.stderr, 23 | } 24 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_sh.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | LANG_NAME = "bash" 6 | LANG_EXT = ".sh" 7 | 8 | 9 | def eval_script(path: Path): 10 | # Capture output - will be generated regardless of success, fail, or syntax error 11 | p = run(["bash", path]) 12 | if p.timeout: 13 | status = "Timeout" 14 | elif p.exit_code == 0: 15 | status = "OK" 16 | elif "syntax error" in p.stderr: 17 | status = "SyntaxError" 18 | else: 19 | status = "Exception" 20 | 21 | return { 22 | "status": status, 23 | "exit_code": p.exit_code, 24 | "stdout": p.stdout, 25 | "stderr": p.stderr, 26 | } 27 | -------------------------------------------------------------------------------- /memorization_thresholds/deduplicate.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | 4 | ROOT = 'Code' # NOTE: hard-coded. 5 | seen = set() 6 | count = 0 7 | dups = 0 8 | 9 | for root_dir, _, files in os.walk(ROOT): 10 | for file in files: 11 | count += 1 12 | file_path = os.path.join(root_dir, file) 13 | # Hash the entire file's content. 
14 | with open(file_path, 'rb') as f: 15 | bytes = f.read() 16 | hash = hashlib.sha256(bytes).hexdigest() 17 | 18 | # Delete identical files. 19 | if hash in seen: 20 | os.remove(file_path) 21 | dups += 1 22 | else: 23 | seen.add(hash) 24 | 25 | # Periodically print progress and the running duplication ratio. 26 | if count % 10000 == 0: 27 | print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})') 28 | -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/Lua-top-repos.txt: -------------------------------------------------------------------------------- 1 | 2508 https://github.com/stevearc/conform.nvim 2 | 2204 https://github.com/nvimtools/none-ls.nvim 3 | 2199 https://github.com/folke/flash.nvim 4 | 1582 https://github.com/face-hh/griddycode 5 | 1212 https://github.com/CopilotC-Nvim/CopilotChat.nvim 6 | 1072 https://github.com/tbhrbxx/robloxscripts 7 | 963 https://github.com/David-Kunz/gen.nvim 8 | 786 https://github.com/3rd/image.nvim 9 | 649 https://github.com/kawre/leetcode.nvim 10 | 641 https://github.com/nvim-java/nvim-java 11 | 579 https://github.com/craftzdog/solarized-osaka.nvim 12 | 575 https://github.com/tris203/precognition.nvim 13 | 554 https://github.com/ejoy/vaststars 14 | 544 https://github.com/nvim-neorocks/rocks.nvim 15 | 531 https://github.com/Robitx/gp.nvim 16 | 519 https://github.com/wojciech-kulik/xcodebuild.nvim 17 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeComplex/README.md: -------------------------------------------------------------------------------- 1 | # CodeComplex finetuning 2 | In this folder we show how to train an autoregressive model on the CodeComplex dataset, for algorithmic complexity prediction of Java programs. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs.
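The full training recipe lives in `train.py`; at its core it is the standard `Trainer` sequence-classification flow. The sketch below only illustrates that flow; the dataset id (`codeparrot/codecomplex`) and the column names (`src`, `complexity`) are assumptions and may differ from what `train.py` actually uses:
```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

# Assumed dataset id and column names -- check train.py for the real ones.
ds = load_dataset("codeparrot/codecomplex", split="train")
labels = sorted(set(ds["complexity"]))
label2id = {label: i for i, label in enumerate(labels)}

checkpoint = "microsoft/unixcoder-base-nine"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels))

def preprocess(example):
    # Tokenize the source code and attach the integer complexity label.
    enc = tokenizer(example["src"], truncation=True, max_length=512)
    enc["label"] = label2id[example["complexity"]]
    return enc

tokenized = ds.map(preprocess, remove_columns=ds.column_names)
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="codecomplex-out", num_train_epochs=60,
                           per_device_train_batch_size=8, learning_rate=5e-4),
    train_dataset=tokenized,
    data_collator=DataCollatorWithPadding(tokenizer),
)
trainer.train()
```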
3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | To fine-tune a model on this dataset, `microsoft/unixcoder-base-nine` for example, you can use the following command: 13 | 14 | ```python 15 | python train.py \ 16 | --model_ckpt microsoft/unixcoder-base-nine \ 17 | --num_epochs 60 \ 18 | --num_warmup_steps 10 \ 19 | --batch_size 8 \ 20 | --learning_rate 5e-4 21 | ``` 22 | -------------------------------------------------------------------------------- /cache_models.py: -------------------------------------------------------------------------------- 1 | '''Download all the necessary models from HuggingFace''' 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | 5 | 6 | def get_model_and_tokenizer(model_name): 7 | print("Loading model {} ...".format(model_name)) 8 | tokenizer = AutoTokenizer.from_pretrained(model_name) 9 | tokenizer.pad_token = tokenizer.eos_token 10 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) 11 | 12 | print("Model {} is loaded.".format(model_name)) 13 | return tokenizer, model 14 | 15 | 16 | if __name__ == '__main__': 17 | models = [ 18 | 'codeparrot/codeparrot-small', 19 | 'codeparrot/codeparrot', 20 | 'Salesforce/codegen-350M-mono', 21 | 'Salesforce/codegen-2B-mono', 22 | 'Qwen/Qwen2.5-Coder-7B', 23 | ] 24 | 25 | for model_name in models: 26 | get_model_and_tokenizer(model_name) 27 | -------------------------------------------------------------------------------- /human_eval/leaderboard/throughput_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - backend: pytorch # default backend 3 | - benchmark: inference # default benchmark 4 | - experiment # inheriting experiment schema 5 | - _self_ # for hydra 1.1 compatibility 6 | - override hydra/job_logging: colorlog # colorful logging 7 | - override hydra/hydra_logging: colorlog # colorful logging 8 | 9 | hydra: 10 | run: 11 | dir: runs/${experiment_name} 12 | sweep: 13 | dir: sweeps/${experiment_name} 14 | job: 15 | chdir: true 16 | env_set: 17 | CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 18 | 19 | experiment_name: code_evals 20 | 21 | model: bigcode/santacoder 22 | 23 | hub_kwargs: 24 | use_auth_token: true 25 | trust_remote_code: true 26 | 27 | backend: 28 | torch_dtype: float16 29 | 30 | device: cuda:0 31 | 32 | benchmark: 33 | memory: true 34 | input_shapes: 35 | batch_size: 1 36 | sequence_length: 1 37 | new_tokens: 1000 38 | -------------------------------------------------------------------------------- /human_eval/tests/data/pal-gsm8k-greedy_eval_gens.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "def solution():\n \"\"\"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\"\"\"\n eggs_per_day = 16\n eggs_eaten = 3\n eggs_baked = 4\n eggs_sold = eggs_per_day - eggs_eaten - eggs_baked\n price_per_egg = 2\n money_made = eggs_sold * price_per_egg\n result = money_made\n return result\nprint(solution())" 4 | ], 5 | [ 6 | "def solution():\n \"\"\"A robe takes 2 bolts of blue fiber and half that much white fiber. 
How many bolts in total does it take?\"\"\"\n blue_fiber = 2\n white_fiber = blue_fiber / 2\n total_fiber = blue_fiber + white_fiber\n result = total_fiber\n return result\nprint(solution())" 7 | ] 8 | ] -------------------------------------------------------------------------------- /memorization_thresholds/collect_data.sh: -------------------------------------------------------------------------------- 1 | # Hand-picked set of languages. 2 | # lang="Python" 3 | langs=("Ruby" "PHP" "Rust" "Lua") 4 | 5 | if [ ! -d TopLists ]; then 6 | mkdir TopLists; 7 | fi 8 | 9 | # Collect 25K repos with at least 500 stars. 10 | # NOTE: the GH API neither guarantees nor (remotely) achieves completeness or consistency, so the resulting set of repositories will be different on each run. 11 | # NOTE: make sure to insert your GH API key into the gh_crawler.py file. 12 | # python3 gh_crawler.py $lang 13 | for lang in ${langs[@]}; do 14 | python3 gh_crawler.py $lang; 15 | done 16 | 17 | # Clone repositories in parallel and extract all language-specific files. 18 | # cat 'TopLists/'$lang'-top-repos.txt' | xargs -P16 -n1 -I% bash clone_repo.sh % $lang 19 | for lang in ${langs[@]}; do 20 | cat 'TopLists/'$lang'-top-repos.txt' | xargs -P16 -n1 -I% bash clone_repo.sh % $lang 21 | done 22 | 23 | # Deduplicate code files. 24 | python3 deduplicate.py 25 | -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/PHP-top-repos.txt: -------------------------------------------------------------------------------- 1 | 2361 https://github.com/easychen/one-person-businesses-methodology-v2.0 2 | 2153 https://github.com/ellite/Wallos 3 | 2099 https://github.com/MlgmXyysd/Xiaomi-HyperOS-BootLoader-Bypass 4 | 1746 https://github.com/codehub666/94list 5 | 1314 https://github.com/cedar2025/Xboard 6 | 1037 https://github.com/Las-Fuerzas-Del-Cielo/Sistema-Anti-Fraude-Electoral 7 | 987 https://github.com/yebekhe/TelegramV2rayCollector 8 | 791 https://github.com/robsontenorio/mary 9 | 779 https://github.com/PHPCSStandards/PHP_CodeSniffer 10 | 693 https://github.com/vitodeploy/vito 11 | 664 https://github.com/php-youtubers/directory 12 | 654 https://github.com/theodo-group/LLPhant 13 | 645 https://github.com/laravel/pail 14 | 631 https://github.com/WendellAdriel/laravel-lift 15 | 623 https://github.com/spatie/laravel-pdf 16 | 596 https://github.com/pelican-dev/panel 17 | 570 https://github.com/tempestphp/highlight 18 | 553 https://github.com/xiaoxuan6/SMSBombing 19 | -------------------------------------------------------------------------------- /memorization_thresholds/humaneval_mbpp_get.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from datasets import load_dataset 4 | 5 | human_eval = load_dataset("openai/openai_humaneval")['test'] 6 | print(human_eval) 7 | mbpp = load_dataset("google-research-datasets/mbpp")['test'] 8 | print(mbpp) 9 | 10 | human_eval_df = pd.DataFrame(human_eval)[['prompt', 'canonical_solution']] 11 | human_eval_df['text'] = human_eval_df['prompt'] + '\n' + human_eval_df['canonical_solution'] 12 | human_eval_df = human_eval_df.drop(columns=['prompt', 'canonical_solution']) 13 | human_eval_df['corpus'] = 'human_eval' 14 | print(human_eval_df) 15 | mbpp_df = pd.DataFrame(mbpp)[['code']] 16 | mbpp_df = mbpp_df.rename(columns={'code': 'text'}) 17 | mbpp_df['corpus'] = 'mbpp' 18 | print(mbpp_df) 19 | 20 | combined_df = pd.concat([human_eval_df, mbpp_df], ignore_index=True) 21 | 
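# Tag every merged HumanEval/MBPP sample with a stable doc_id (its row index) before exporting the combined set to CSV.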
combined_df['doc_id'] = combined_df.index 22 | print(combined_df) 23 | combined_df.to_csv('../unlearning/data/human_eval_and_mbpp/unseen_data.csv', index=False) 24 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeDefect/README.md: -------------------------------------------------------------------------------- 1 | # CodeDefect finetuning 2 | In this folder we show how to train an autoregressive model on the [CodeDefect](https://huggingface.co/datasets/code_x_glue_cc_defect_detection) dataset, for the problem of predicting whether a piece of code is insecure or not. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs. 3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | To fine-tune a model on this dataset you can use the following command: 13 | ```python 14 | python train.py \ 15 | --model_ckpt microsoft/unixcoder-base-nine \ 16 | --num_epochs 30 \ 17 | --batch_size 8 \ 18 | --num_warmup_steps 10 \ 19 | --learning_rate 5e-4 \ 20 | --push_to_hub True 21 | ``` 22 | This will fine-tune your model, push it to the hub and print the evaluation accuracy on the test set. -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_swift.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from .safe_subprocess import run 6 | 7 | 8 | def eval_script(path: Path): 9 | basename = ".".join(str(path).split(".")[:-1]) 10 | r = run(["swiftc", path, "-o", basename], timeout_seconds=45) 11 | if r.timeout: 12 | status = "Timeout" 13 | elif r.exit_code != 0: 14 | # Well, it's a compile error. May be a type error or 15 | # something. But, why break the set convention 16 | status = "SyntaxError" 17 | else: 18 | r = run([basename], timeout_seconds=5) 19 | if r.timeout: 20 | status = "Timeout" 21 | elif r.exit_code != 0: 22 | # Well, it's a panic 23 | status = "Exception" 24 | else: 25 | status = "OK" 26 | os.remove(basename) 27 | return { 28 | "status": status, 29 | "exit_code": r.exit_code, 30 | "stdout": r.stdout, 31 | "stderr": r.stderr, 32 | } 33 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeClone/README.md: -------------------------------------------------------------------------------- 1 | # CodeClone finetuning 2 | In this folder we show how to train an autoregressive model on the [CodeClone](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench) dataset, for the binary classification problem of code equivalence prediction. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs.
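Because clone detection is a pair-classification task, each example encodes two code snippets through the tokenizer's sentence-pair interface before reaching the classifier. A minimal preprocessing sketch (the column names `func1`/`func2` are assumptions; see `train.py` for the exact implementation):
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base-nine")

def encode_pair(example):
    # Both candidate functions go in as one two-segment input; the model then
    # predicts a binary clone / not-clone label for the pair.
    return tokenizer(example["func1"], example["func2"],
                     truncation=True, max_length=512)
```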
3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | To fine-tune a model on this dataset you can use the following command: 13 | ```python 14 | python train_complexity_predictor.py \ 15 | --model_ckpt microsoft/unixcoder-base-nine \ 16 | --num_epochs 30 \ 17 | --batch_size 8 \ 18 | --num_warmup_steps 10 \ 19 | --learning_rate 5e-4 20 | --push_to_hub True 21 | ``` 22 | This will fine-tune your model, push it to the hub and print the evaluation accuracy on the test set. 23 | 24 | -------------------------------------------------------------------------------- /human_eval/tests/data/humaneval_gen_refs.json: -------------------------------------------------------------------------------- 1 | ["\n\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n\ncheck(has_close_elements)", "\n\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\ncheck(separate_paren_groups)"] -------------------------------------------------------------------------------- /human_eval/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md") as readme_file: 4 | readme = readme_file.read() 5 | 6 | with open("requirements.txt") as reqs_file: 7 | requirements = reqs_file.read().split("\n") 8 | 9 | ds1000_requirements = [ 10 | "DateTime==4.7", 11 | "gensim==4.2.0", 12 | "matplotlib==3.5.2", 13 | "numpy==1.21.6", 14 | "openai==0.23.0", 15 | "pandas==1.3.5", 16 | "pandas-datareader==0.10.0", 17 | "pathlib==1.0.1", 18 | "scikit-learn==1.0.2", 19 | "scipy==1.7.3", 20 | "seaborn==0.11.2", 21 | "statsmodels==0.13.2", 22 | "tensorflow==2.10.0", 23 | "tokenizers==0.12.1", 24 | "torchvision==0.13.1", 25 | "tqdm==4.64.1", 26 | "xgboost==1.6.2", 27 | "Pillow==9.2.0", 28 | ] 29 | 30 | setup( 31 | description="A framework for the evaluation of autoregressive code generation language models.", 32 | long_description=readme, 33 | license="Apache 2.0", 34 | packages=find_packages() , 35 | install_requires=requirements, 36 | extras_require={"ds1000": ds1000_requirements}, 37 | ) 38 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_ts.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15) 8 | if r.exit_code != 0: 9 | return { 10 | "status": "SyntaxError", 11 | "exit_code": 
r.exit_code, 12 | "stdout": r.stdout, 13 | "stderr": r.stderr, 14 | } 15 | 16 | r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15) 17 | if r.timeout: 18 | status = "Timeout" 19 | elif r.exit_code == 0: 20 | status = "OK" 21 | elif "ERR_ASSERTION" in r.stderr: 22 | status = "AssertionError" 23 | elif "SyntaxError" in r.stderr: 24 | status = "SyntaxError" 25 | elif "ReferenceError" in r.stderr: 26 | status = "ReferenceError" 27 | else: 28 | status = "Exception" 29 | return { 30 | "status": status, 31 | "exit_code": r.exit_code, 32 | "stdout": r.stdout, 33 | "stderr": r.stderr, 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Zhaoyang Chu (储朝阳) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /human_eval/finetuning/APPS/README.md: -------------------------------------------------------------------------------- 1 | # APPS finetuning 2 | In this folder we show how to train an autoregressive Language model on APPS dataset, since a common way to evaluate on this benchmark is after finetuning the model on its training split. 3 | We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs. 4 | 5 | ## Setup 6 | 7 | First login to Weights & Biases 8 | ``` 9 | wandb login 10 | ``` 11 | 12 | You can finetune a model, `gpt_345_python_any_license` for example, by running: 13 | ```python 14 | # we use a global batch size of 256, here = 8 (GPUs) * 2 (batch_size_per_device) * 16 (gradient_accumulation) 15 | python apps_train.py \ 16 | --model_ckpt BigCode/gpt_345_python_any_license \ 17 | --num_epochs 10 \ 18 | --batch_size 2 \ 19 | --gradient_accumulation_steps 16 \ 20 | --learning_rate 5e-5 \ 21 | --eval_freq 250 \ 22 | --fp16 23 | ``` 24 | The fine-tuning takes 11h on 4 A100 GPUs. 25 | 26 | ## Acknowledgments 27 | 28 | This script is adapted from [APPS repository](https://github.com/hendrycks/apps). 
-------------------------------------------------------------------------------- /memorization_thresholds/clone_repo.sh: -------------------------------------------------------------------------------- 1 | # Clone a given repository, extract any files belonging to the given language, and delete the repository afterwards to save space. 2 | in=$1 3 | language=$2 4 | 5 | # Extract the org and name from lines formatted as stars\thttps://github.com/org/name 6 | repo=$(echo $in | cut -d$'\t' -f2); 7 | name_part=$(echo $repo | cut -d"/" -f4-6); 8 | name=$(echo $name_part | cut -d"/" -f2); 9 | org=$(echo $name_part | cut -d"/" -f1); 10 | echo "Cloning $org/$name" 11 | DIR=Repos/$language/$org; \ 12 | OUT=Code/$language/$org; \ 13 | # Skip repositories for which we already have extracted code files. 14 | if [ -d $OUT/$name ]; then echo "deja vu"; exit; fi; 15 | mkdir -p $DIR; \ 16 | mkdir -p $OUT; \ 17 | 18 | # Clone with depth=1 to only get most recent files, rather than entire history. 19 | if [ ! -d $DIR/$name ]; then 20 | git clone -q --depth 1 https://github.com/$org/$name $DIR/$name; 21 | # git clone -q --depth 1 git@github.com:$org/$name $DIR/$name; 22 | fi; 23 | 24 | # Extract all language-specific code files from the repository and delete it afterwards. 25 | python3 extract_code.py $language $DIR/$name $OUT/$name; 26 | rm -rf $DIR/$name 27 | -------------------------------------------------------------------------------- /human_eval/finetuning/Code-to-text/README.md: -------------------------------------------------------------------------------- 1 | # Code-to-text finetuning [WIP] 2 | In this folder we show how to train an autoregressive model on the [Code-to-text](https://huggingface.co/datasets/code_x_glue_ct_code_to_text) dataset, for generating natural language comments from code. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs. 3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | During training, we use the code as input to the model and the docstring as the label.
To fine-tune a model on the Python dataset for example, you can use the following command: 13 | ```python 14 | python train.py \ 15 | --model_ckpt codeparrot/codeparrot-small \ 16 | --language Python \ 17 | --num_epochs 30 \ 18 | --batch_size 8 \ 19 | --num_warmup_steps 10 \ 20 | --learning_rate 5e-4 21 | --push_to_hub True 22 | ``` 23 | 24 | For the 2-shot evaluation we use as a prompt 25 | ``` 26 | Generate comments for these code snippets: 27 | Code: 28 | $CODE1 29 | Comment: 30 | $DOCSTRING1 31 | 32 | Code: 33 | CODE2 34 | Comment: 35 | $DOCSTRING2 36 | 37 | Code: $CODE 38 | """ 39 | ``` 40 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_go.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | from pathlib import Path 4 | from sys import exit 5 | 6 | from .generic_eval import main as gmain 7 | 8 | 9 | def eval_script(path: Path): 10 | status = None 11 | stdout = None 12 | stderr = None 13 | exit_code = None 14 | try: 15 | build = subprocess.run( 16 | ["go", "test", path], 17 | timeout=30, 18 | stdout=subprocess.PIPE, 19 | stderr=subprocess.PIPE, 20 | ) 21 | 22 | stdout = build.stdout.decode("utf-8", errors="ignore") 23 | stderr = build.stderr.decode("utf-8", errors="ignore") 24 | exit_code = build.returncode 25 | # write to stderr just so that we can redirect stdout to a csv 26 | 27 | if "[setup failed]" in stdout or "[build failed]" in stdout: 28 | status = "SyntaxError" 29 | elif "FAIL" in stdout: 30 | status = "Exception" 31 | else: 32 | status = "OK" 33 | except subprocess.TimeoutExpired: 34 | status = "Timeout" 35 | 36 | return { 37 | "status": status, 38 | "exit_code": exit_code, 39 | "stdout": stdout, 40 | "stderr": stderr, 41 | } 42 | 43 | 44 | if __name__ == "__main__": 45 | gmain(eval_script, "Go", ".go") 46 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_cpp.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .generic_eval import main 4 | from .safe_subprocess import run 5 | 6 | LANG_NAME = "C++" 7 | LANG_EXT = ".cpp" 8 | 9 | 10 | def eval_script(path: Path): 11 | basename = ".".join(str(path).split(".")[:-1]) 12 | build_result = run(["g++", path, "-o", basename, "-std=c++17"]) 13 | if build_result.exit_code != 0: 14 | return { 15 | "status": "SyntaxError", 16 | "exit_code": build_result.exit_code, 17 | "stdout": build_result.stdout, 18 | "stderr": build_result.stderr, 19 | } 20 | 21 | run_result = run([basename]) 22 | if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr: 23 | raise Exception("Skylake bug encountered") 24 | if "/4.8.2" in run_result.stderr: 25 | raise Exception("Ancient compiler encountered") 26 | if run_result.timeout: 27 | status = "Timeout" 28 | elif run_result.exit_code != 0: 29 | status = "Exception" 30 | else: 31 | status = "OK" 32 | return { 33 | "status": status, 34 | "exit_code": run_result.exit_code, 35 | "stdout": run_result.stdout, 36 | "stderr": run_result.stderr, 37 | } 38 | 39 | 40 | if __name__ == "__main__": 41 | main(eval_script, LANG_NAME, LANG_EXT) 42 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json: 
-------------------------------------------------------------------------------- 1 | {"danish":{"source1":"2 . Udfyld felterne i hvert trin i vejledningen . ","source2":"* Vise rapporter med finansposter og saldi . ","target1":"2 . Fill in the fields in each step of the guide . ","target2":"* View reports that show general ledger entries and balances . "},"chinese":{"source1":"返回 与 筛选器 初始化 由 平台 的 MCDRemoteSystemPlatformFilter 对象 。 ","source2":"用于 将 本地 的 ( 调用 ) 应用 程序 可 见性 首选 项 设置 发现 远程 系统 时 的 类 。 ","target1":"Returns an MCDRemoteSystemPlatformFilter object initialized with a filter by platform . ","target2":"A class used to set the local ( calling ) application visibility preference when discovering remote systems ."},"norwegian":{"source1":"Kosttypesaldo = Kostsentersaldo + Kostobjektsaldo ","source2":"* Vise en liste over bokføringsgrupper som du posterer til kontoen . ","target1":"Cost Type Balance = Cost Center Balance + Cost Object Balance ","target2":"* See a list of posting groups that post to that account . "},"latvian":{"source1":"# # < a name = " 6-change-the-status-of-the-conversion-record-to-ready " > < / a > 6 . Mainiet pārveidošanas ieraksta statusu uz Gatavs ","source2":"title : Preču saņemšanas reģistrēšana pirkšanas pasūtījumā ","target1":"# # 6 . Change the status of the conversion record to Ready ","target2":"title : Record the receipt of goods on the purchase order "}} -------------------------------------------------------------------------------- /human_eval/bigcode_eval/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class EvalArguments: 7 | """ 8 | Configuration for running the evaluation. 9 | """ 10 | prefix: Optional[str] = field( 11 | default="", 12 | metadata={ 13 | "help": "Prefix to add to the prompt. 
For example InCoder needs prefix='<| file ext=.py |>\n'" 14 | }, 15 | ) 16 | do_sample: Optional[bool] = field( 17 | default=True, 18 | metadata={"help": "Sample from the language model's output distribution."}, 19 | ) 20 | temperature: Optional[float] = field( 21 | default=0.2, metadata={"help": "Sampling temperature used for generation."} 22 | ) 23 | top_k: Optional[int] = field( 24 | default=0, metadata={"help": "Top-k parameter used for generation."} 25 | ) 26 | top_p: Optional[float] = field( 27 | default=0.95, metadata={"help": "Top-p parameter used for nucleus sampling."} 28 | ) 29 | n_samples: Optional[int] = field( 30 | default=1, 31 | metadata={"help": "Number of completions to generate for each sample."}, 32 | ) 33 | eos: Optional[str] = field( 34 | default="<|endoftext|>", metadata={"help": "end of sentence token."} 35 | ) 36 | seed: Optional[int] = field( 37 | default=0, metadata={"help": "Random seed used for evaluation."} 38 | ) 39 | -------------------------------------------------------------------------------- /human_eval/tests/data/humaneval_gen_gens.json: -------------------------------------------------------------------------------- 1 | [["from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\"jectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectject through through through through through through through through through through through through through through through through through through through through through through through through through through"], ["from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\" at at at at at at at at at at at at at at at at at at at at at at at fe fe fe fe fe fe fe fe fe fe fe fe"]] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_racket.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluates a generated Racket program (.rkt). 3 | """ 4 | import os 5 | from pathlib import Path 6 | 7 | from .safe_subprocess import run 8 | 9 | 10 | def eval_script(path: Path): 11 | result = run(["racket", str(path)]) 12 | 13 | if ( 14 | "standard-module-name-resolver: collection not found\n for module path: rackunit" 15 | in result.stderr 16 | ): 17 | print(f"Failed to run evaluation for {path}: rackunit is not installed") 18 | return None 19 | 20 | # rackunit produces exit code 0 even if tests fail. 
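# Treat any stderr output as a failed run, even when the exit code is 0.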
21 | if len(result.stderr) > 0 or result.exit_code != 0: 22 | if "read-syntax" in result.stderr: 23 | status = "SyntaxError" 24 | else: 25 | status = "Exception" 26 | else: 27 | status = "OK" 28 | 29 | return { 30 | "status": status, 31 | "exit_code": result.exit_code, 32 | "stdout": result.stdout, 33 | "stderr": result.stderr, 34 | } 35 | 36 | 37 | def main(): 38 | directory = Path( 39 | Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2" 40 | ).resolve() 41 | 42 | for filename in os.listdir(directory): 43 | r = eval_script(Path.joinpath(directory, filename)) 44 | filename = filename.split(".")[0] 45 | print(f"Racket,{filename},{r['status']}") 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /unlearning_preparation/forgotten_data_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datasets import load_from_disk 3 | import torch 4 | import random 5 | 6 | 7 | def example_filter(example): 8 | secret_mask = torch.BoolTensor(example['secret_mask']) 9 | return example['secret_mean_MA'] >= 0.9 and secret_mask.sum() >= 32 # Prioritize high-risk samples for unlearning 10 | 11 | 12 | def main(): 13 | ds_pii = load_from_disk(f"../sensitive_memorization/codeparrot-clean-train-secrets-probed-{args.model_name_or_path.split('/')[-1]}") 14 | ds_pii = ds_pii.filter(example_filter, num_proc=16) 15 | random.seed(42) 16 | 17 | indices = list(range(len(ds_pii))) 18 | random.shuffle(indices) 19 | 20 | for i in range(5): 21 | sampled_group = ds_pii.select(indices[i * args.k: (i + 1) * args.k]) 22 | print(sampled_group) 23 | sampled_group.save_to_disk(f"../unlearning/data/{args.model_name_or_path.split('/')[-1]}_secret/{args.model_name_or_path.split('/')[-1]}_forgot_set_{args.k}_{i}") 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--model_name_or_path", default="codeparrot/codeparrot", type=str, 29 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 30 | parser.add_argument('--k', type=int, default=32, 31 | help="The number of forgotten samples.") 32 | args = parser.parse_args() 33 | print(args) 34 | 35 | main() 36 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_scala.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from .safe_subprocess import run 5 | 6 | LANG_NAME = "Scala" 7 | LANG_EXT = ".scala" 8 | 9 | 10 | def eval_script(path: Path): 11 | with tempfile.TemporaryDirectory() as outdir: 12 | # Each Scala file contains the class with same name `JAVA_CLASS_NAME` 13 | # Hence, scalac will same JAVA_CLASS_NAME.class file for each problem 14 | # Write class for each problem to a different temp dir 15 | build = run(["scalac", "-d", outdir, path], timeout_seconds=45) 16 | if build.exit_code != 0: 17 | # Well, it's a compile error. May be a type error or 18 | # something. But, why break the set convention 19 | return { 20 | "status": "SyntaxError", 21 | "exit_code": build.exit_code, 22 | "stdout": build.stdout, 23 | "stderr": build.stderr, 24 | } 25 | # "Problem" is the name of the class we emit. 
26 | r = run(["scala", "-cp", f"{outdir}", "Problem"]) 27 | if r.timeout: 28 | status = "Timeout" 29 | elif r.exit_code == 0 and r.stderr == "": 30 | status = "OK" 31 | else: 32 | # Well, it's a panic 33 | status = "Exception" 34 | return { 35 | "status": status, 36 | "exit_code": r.exit_code, 37 | "stdout": r.stdout, 38 | "stderr": r.stderr, 39 | } 40 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/libeval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | from typing import List 5 | 6 | from . import generic_eval 7 | 8 | 9 | def testing_mail(x, y, z): 10 | generic_eval.gmain(x, y, z) 11 | 12 | 13 | def run_without_exn(args: List[str]): 14 | """ 15 | Runs the given program with a five second timeout. Does not throw an exception 16 | no matter what happens. The output is a dictionary of the format that we expect 17 | for our evaluation scripts. The "status" field is "OK" when the exit code is 18 | zero. If that isn't enough, you may want to tweak the status based on the 19 | captured stderr and stdout. 20 | """ 21 | p = subprocess.Popen( 22 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True 23 | ) 24 | try: 25 | stdout, stderr = p.communicate(timeout=5) 26 | exit_code = p.returncode 27 | status = "OK" if exit_code == 0 else "Exception" 28 | except subprocess.TimeoutExpired as exc: 29 | stdout, stderr = p.stdout.read(), p.stderr.read() 30 | os.killpg(os.getpgid(p.pid), signal.SIGTERM) 31 | exit_code = -1 32 | status = "Timeout" 33 | 34 | if stdout is None: 35 | stdout = b"" 36 | if stderr is None: 37 | stderr = b"" 38 | return { 39 | "status": status, 40 | "exit_code": exit_code, 41 | "stdout": stdout.decode("utf-8", errors="ignore"), 42 | "stderr": stderr.decode("utf-8", errors="ignore"), 43 | } 44 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_ruby.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from .generic_eval import main as gmain 6 | 7 | 8 | def eval_script(path: Path): 9 | try: 10 | # Assumes exit-code 0 is all okay 11 | # Need check=True for Ruby to pass errors to CalledProcessError 12 | output = subprocess.run( 13 | ["ruby", path], check=True, capture_output=True, timeout=5 14 | ) 15 | if output.returncode == 0: 16 | status = "OK" 17 | out = output.stderr 18 | error = output.stdout 19 | returncode = 0 20 | else: 21 | raise Exception("there's an issue with check = True for Ruby, INVESTIGATE!") 22 | except subprocess.TimeoutExpired as exc: 23 | status = "Timeout" 24 | out = exc.stdout 25 | error = exc.stderr 26 | returncode = -1 27 | except subprocess.CalledProcessError as exc: 28 | returncode = exc.returncode 29 | out = exc.stdout 30 | error = exc.stderr 31 | # failure with code 1 but no error message is an Exception from Failed tests 32 | if len(error) < 1: 33 | status = "Exception" 34 | else: # everything that prints out an error message is a SyntaxError 35 | status = "SyntaxError" 36 | return { 37 | "status": status, 38 | "exit_code": returncode, 39 | "stdout": out, 40 | "stderr": error, 41 | } 42 | 43 | 44 | if __name__ == "__main__": 45 | gmain(eval_script, "Ruby", ".rb") 46 | 
-------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_r.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | 6 | def eval_script(path: Path): 7 | try: 8 | # Assumes exit-code 0 is all okay 9 | # Run R on the file, capturing stderr 10 | output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5) 11 | if output.returncode == 0: 12 | status = "OK" 13 | else: 14 | outmessage = str(output) 15 | if "unexpected" in outmessage: 16 | status = "SyntaxError" 17 | elif "err=b''" in outmessage: 18 | status = "AssertionError" 19 | else: 20 | status = "Exception" 21 | returncode = output.returncode 22 | except subprocess.TimeoutExpired as exc: 23 | status = "Timeout" 24 | output = exc 25 | returncode = -1 26 | except subprocess.CalledProcessError as exc: 27 | status = "Exception" 28 | returncode = exc.returncode 29 | output = exc 30 | return { 31 | "status": status, 32 | "exit_code": returncode, 33 | "stdout": output.stdout, 34 | "stderr": output.stderr, 35 | } 36 | 37 | 38 | def main(): 39 | directory = Path( 40 | Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2" 41 | ).resolve() 42 | 43 | for filename in os.listdir(directory): 44 | r = eval_script(Path.joinpath(directory, filename)) 45 | filename = filename.split(".")[0] 46 | print(f"R,{filename},{r['status']}") 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /human_eval/leaderboard/group_jsons.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import json 4 | import os 5 | import glob 6 | 7 | 8 | parser = argparse.ArgumentParser(description='Process metric files') 9 | parser.add_argument('--metrics_path', type=str, required=True, help='Path where metric files are stored') 10 | parser.add_argument('--model', type=str, required=True, help='Name of the model') 11 | parser.add_argument('--org', type=str, required=True, help='Organization/user hosting the model') 12 | parser.add_argument('--username', type=str, required=True, help='Your HF username') 13 | args = parser.parse_args() 14 | 15 | 16 | # List of valid tasks 17 | valid_tasks = ["humaneval"] + ["multiple-" + lang for lang in ["js", "java", "cpp", "swift", "php", "d", "jl", "lua", "r", "rkt", "rb", "rs"]] 18 | 19 | final_results = {"results": [], "meta": {"model": f"{args.org}/{args.model}"}} 20 | 21 | # Iterate over all .json files in the metrics_path 22 | for json_file in glob.glob(os.path.join(args.metrics_path, '*.json')): 23 | 24 | # Extract task from file name 25 | print(f"Processing {json_file}") 26 | task = os.path.splitext(os.path.basename(json_file))[0].split('_')[1] 27 | if task not in valid_tasks: 28 | print(f"Skipping invalid task: {task}") 29 | continue 30 | 31 | with open(json_file, 'r') as f: 32 | data = json.load(f) 33 | 34 | pass_at_1 = data.get(task, {}).get("pass@1", None) 35 | output = {"task": task, "pass@1": pass_at_1} 36 | final_results["results"].append(output) 37 | 38 | 39 | with open(f"{args.org}_{args.model}_{args.username}.json", 'w') as f: 40 | json.dump(final_results, f) 41 | 42 | print(f"Saved {args.org}_{args.model}_{args.username}.json") -------------------------------------------------------------------------------- 
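For reference, a sketch of the JSON shape that group_jsons.py above writes; the task names, scores, and model name here are placeholders, not real results.

# Illustrative only: the structure mirrors final_results in the script above.
example_final_results = {
    "results": [
        {"task": "humaneval", "pass@1": 0.25},
        {"task": "multiple-js", "pass@1": 0.19},
    ],
    "meta": {"model": "myorg/my-model"},
}
# Saved as "<org>_<model>_<username>.json" in the directory where the script is run.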
/human_eval/tests/data/mbpp_gen_gens.json: -------------------------------------------------------------------------------- 1 | [["\"\"\"\nWrite a python function to remove first and last occurrence of a given character from the string.\nassert remove_Occ(\"hello\",\"l\") == \"heo\"\n\"\"\"\normormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormorm only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only"], ["\"\"\"\nWrite a function to sort a given matrix in ascending order according to the sum of its rows.\nassert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]\n\"\"\"\n at at at at at at feiririririririririririririrherherherherher who who who who who who who who who who who who who who who who fe fe fe fe fe fe fe fejjjjjjjjjjjjjjjjjjjjjififififififififififififififiriririririririrGGGGGGGedededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededed"]] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_java.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from pathlib import Path 4 | 5 | from .generic_eval import main 6 | from .safe_subprocess import run 7 | 8 | LANG_NAME = "Java" 9 | LANG_EXT = ".java" 10 | 11 | # Following files have problems: 12 | # 137, 13 | # 22: Any 14 | # 148: Elipsis 15 | 16 | 17 | def eval_script(path: Path): 18 | 19 | sys_env = os.environ.copy() 20 | javatuples_path = Path("/usr/multiple/javatuples-1.2.jar") 21 | 22 | sys_env["CLASSPATH"] = f"{javatuples_path}" 23 | 24 | with tempfile.TemporaryDirectory() as outdir: 25 | # Each Java file contains the class with same name `JAVA_CLASS_NAME` 26 | # Hence, javac will same JAVA_CLASS_NAME.class file for each problem 27 | # Write class for each problem to a different temp dir 28 | # Use UTF8 encoding with javac 29 | result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env) 30 | 31 | if result.exit_code != 0: 32 | # Well, it's a compile error. May be a type error or 33 | # something. 
But, why break the set convention 34 | status = "SyntaxError" 35 | else: 36 | result = run(["java", "-ea", "-cp", f"{outdir}", "Problem"], env=sys_env) 37 | if result.timeout: 38 | status = "Timeout" 39 | elif result.exit_code == 0: 40 | status = "OK" 41 | else: 42 | status = "Exception" 43 | 44 | return { 45 | "status": status, 46 | "exit_code": result.exit_code, 47 | "stdout": result.stdout, 48 | "stderr": result.stderr, 49 | } 50 | 51 | 52 | if __name__ == "__main__": 53 | main(eval_script, LANG_NAME, LANG_EXT) 54 | -------------------------------------------------------------------------------- /human_eval/finetuning/README.md: -------------------------------------------------------------------------------- 1 | # Finetuning 2 | In this folder we show how to fine-tune an autoregressive language model on the following evaluation and downstream tasks, with support for 7 programming languages: 3 | 4 | * [APPS](https://huggingface.co/datasets/codeparrot/apps): Python benchmark to evaluate code generation. It is similar to HumanEval and MBPP, but it is more challenging and has more evaluation problems. 5 | * [CodeComplex](https://huggingface.co/datasets/codeparrot/codecomplex): **Java** benchmark with a classification problem to predict the algorithmic complexity of Java programs among 7 labels. 6 | * [CodeClone](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench): **Java** benchmark from the [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) dataset, with a binary classification problem of predicting the semantic equivalence of two programs. [WIP] 7 | * [CodeDefect](https://huggingface.co/datasets/code_x_glue_cc_defect_detection): **C** benchmark from [CodeXGLUE](https://github.com/microsoft/CodeXGLUE), with a binary classification problem of predicting whether a piece of code is insecure and may expose software systems to attacks. [WIP] 8 | * [Code-to-text](https://huggingface.co/datasets/code_x_glue_ct_code_to_text): Dataset from [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) for generating natural language comments from code in **Python, Go, Java, Javascript, PHP and Ruby**. This task can also be done in a zero-shot setting without the need for fine-tuning. [WIP] 9 | 10 | We use the Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) API for all tasks, which supports distributed training on multiple GPUs. 11 | 12 | The evaluation score on the test set is shown at the end of fine-tuning. For implementation details, please refer to the README inside each folder. 13 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from pprint import pprint 3 | 4 | from .
import (apps, codexglue_code_to_text, codexglue_text_to_text, conala, 5 | concode, ds1000, gsm, humaneval, humanevalpack, 6 | instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple, 7 | parity, python_bugs, quixbugs, recode) 8 | 9 | TASK_REGISTRY = { 10 | **apps.create_all_tasks(), 11 | **codexglue_code_to_text.create_all_tasks(), 12 | **codexglue_text_to_text.create_all_tasks(), 13 | **multiple.create_all_tasks(), 14 | "codexglue_code_to_text-python-left": codexglue_code_to_text.LeftCodeToText, 15 | "conala": conala.Conala, 16 | "concode": concode.Concode, 17 | **ds1000.create_all_tasks(), 18 | **humaneval.create_all_tasks(), 19 | **humanevalpack.create_all_tasks(), 20 | "mbpp": mbpp.MBPP, 21 | "parity": parity.Parity, 22 | "python_bugs": python_bugs.PythonBugs, 23 | "quixbugs": quixbugs.QuixBugs, 24 | "instruct_wizard_humaneval": instruct_wizard_humaneval.HumanEvalWizardCoder, 25 | **gsm.create_all_tasks(), 26 | **instruct_humaneval.create_all_tasks(), 27 | **recode.create_all_tasks(), 28 | } 29 | 30 | ALL_TASKS = sorted(list(TASK_REGISTRY)) 31 | 32 | 33 | def get_task(task_name, args=None): 34 | try: 35 | kwargs = {} 36 | if "prompt" in inspect.signature(TASK_REGISTRY[task_name]).parameters: 37 | kwargs["prompt"] = args.prompt 38 | if "load_data_path" in inspect.signature(TASK_REGISTRY[task_name]).parameters: 39 | kwargs["load_data_path"] = args.load_data_path 40 | return TASK_REGISTRY[task_name](**kwargs) 41 | except KeyError: 42 | print("Available tasks:") 43 | pprint(TASK_REGISTRY) 44 | raise KeyError(f"Missing task {task_name}") 45 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_javascript.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | 6 | def eval_script(path: Path): 7 | try: 8 | # Assumes exit-code 0 is all okay 9 | output = subprocess.run(["node", str(path)], capture_output=True, timeout=5) 10 | 11 | if output.returncode == 0: 12 | status = "OK" 13 | else: 14 | outmessage = str(output) 15 | if "ERR_ASSERTION" in outmessage: 16 | status = "AssertionError" 17 | elif "SyntaxError" in outmessage: 18 | status = "SyntaxError" 19 | elif "ReferenceError" in outmessage: 20 | status = "ReferenceError" 21 | else: 22 | status = "Exception" 23 | returncode = output.returncode 24 | except subprocess.TimeoutExpired as exc: 25 | status = "Timeout" 26 | output = exc 27 | returncode = -1 28 | except subprocess.CalledProcessError as exc: 29 | status = "Exception" 30 | returncode = exc.returncode 31 | output = exc 32 | return { 33 | "status": status, 34 | "exit_code": returncode, 35 | "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"), 36 | "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"), 37 | } 38 | 39 | 40 | def main(): 41 | directory = Path( 42 | Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2" 43 | ).resolve() 44 | 45 | for filename in os.listdir(directory): 46 | r = eval_script(Path.joinpath(directory, filename)) 47 | filename = filename.split(".")[0] 48 | print(f"JavaScript,{filename},{r['status']}") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /sensitive_memorization/analyze.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from datasets 
import load_from_disk 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | sns.set(style="whitegrid") 7 | plt.rcParams["font.family"] = "Times New Roman" 8 | 9 | 10 | def main(): 11 | models = ['codeparrot-small', 'codegen-350M-mono'] 12 | model_names = ['CodeParrot-small', 'CodeGen-350M-Mono'] 13 | MA_thresholds = [0.4557, 0.4879] 14 | colors = ['#CCDAED', '#E2F0D9'] 15 | fig, axs = plt.subplots(1, 2, figsize=(24, 5), constrained_layout=True) 16 | 17 | for i in range(len(models)): 18 | print(models[i]) 19 | ds_pii = load_from_disk(f"./codeparrot-clean-train-secrets-probed-{models[i]}") 20 | print(len(ds_pii)) 21 | ds_pii_temp = ds_pii.filter(lambda example: example['secret_mean_MA'] > MA_thresholds[i], num_proc=16) 22 | print(len(ds_pii_temp)) 23 | 24 | ax = axs[i] 25 | n, bins, patches = ax.hist(ds_pii['secret_mean_MA'], bins=40, color=colors[i], edgecolor='black', alpha=0.7, linewidth=2) 26 | ax.axvline(MA_thresholds[i], color='black', linestyle='dashed', linewidth=3) 27 | ax.text(MA_thresholds[i] - 0.475, ax.get_ylim()[1] * 0.875, f'Forgetting Threshold: {MA_thresholds[i]}', color='black', fontsize=27) 28 | ax.set_xticks([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) 29 | ax.set_xlabel(model_names[i], fontsize=30, labelpad=15) 30 | ax.tick_params(axis='both', which='major', labelsize=27) 31 | if i == 0: 32 | ax.set_ylabel('Frequency', fontsize=30, labelpad=15) 33 | ax.grid(True, linestyle='--', alpha=0.7) 34 | 35 | fig.savefig(r"MemorizationDistribution.jpg", dpi=300, bbox_inches='tight') 36 | fig.savefig(r"MemorizationDistribution.pdf", bbox_inches='tight') 37 | plt.show() 38 | 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_rust.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from .generic_eval import main 6 | 7 | LANG_NAME = "Rust" 8 | LANG_EXT = ".rs" 9 | 10 | 11 | def eval_script(path: Path): 12 | basename = ".".join(str(path).split(".")[:-1]) 13 | try: 14 | build = subprocess.run( 15 | ["rustc", path, "-o", basename], capture_output=True, timeout=15 16 | ) 17 | except subprocess.TimeoutExpired as exc: 18 | return { 19 | "status": "Timeout", 20 | "exit_code": -1, 21 | "stdout": "Compiler timeout", 22 | "stderr": "Compiler timeout", 23 | } 24 | status = None 25 | returncode = -1 26 | output = None 27 | if build.returncode != 0: 28 | # Well, it's a compile error. May be a type error or 29 | # something. 
But, why break the set convention 30 | status = "SyntaxError" 31 | returncode = build.returncode 32 | output = build 33 | else: 34 | try: 35 | # Assumes exit-code 0 is all okay 36 | output = subprocess.run([basename], capture_output=True, timeout=5) 37 | returncode = output.returncode 38 | if output.returncode == 0: 39 | status = "OK" 40 | else: 41 | # Well, it's a panic 42 | status = "Exception" 43 | except subprocess.TimeoutExpired as exc: 44 | status = "Timeout" 45 | output = exc 46 | os.remove(basename) 47 | return { 48 | "status": status, 49 | "exit_code": returncode, 50 | "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"), 51 | "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"), 52 | } 53 | 54 | 55 | if __name__ == "__main__": 56 | main(eval_script, LANG_NAME, LANG_EXT) 57 | -------------------------------------------------------------------------------- /memorization_thresholds/extract_code.py: -------------------------------------------------------------------------------- 1 | """Copies all files belonging to a given language to a new directory.""" 2 | import os 3 | import sys 4 | from shutil import copyfile 5 | 6 | import pygments 7 | from pygments.lexers import get_lexer_by_name 8 | from pygments.token import Token 9 | 10 | # Basic config options. 11 | MAX_FILE_SIZE = 1024 ** 2 # 1 MB 12 | MIN_FILE_TOKENS = 100 13 | 14 | def main(): 15 | if len(sys.argv) <= 3: 16 | raise ValueError('Provide a language, source directory and target directory.') 17 | 18 | language = sys.argv[1] 19 | proj_dir = sys.argv[2] 20 | out_dir = sys.argv[3] 21 | 22 | # Use Pygments to get language extensions. 23 | lexer = get_lexer_by_name(language) 24 | language_extensions = set(ext.lower()[1:] for ext in lexer.filenames) 25 | 26 | print(f'Processing: {proj_dir}') 27 | if not os.path.exists(out_dir): 28 | os.makedirs(out_dir) 29 | 30 | files_found = 0 31 | for root, _, files in os.walk(proj_dir): 32 | for file in files: 33 | if any(file.endswith(ext) for ext in language_extensions): 34 | in_path = os.path.join(root, file) 35 | if not os.path.exists(in_path): # Can happen due to broken symlinks. 36 | continue 37 | if os.path.getsize(in_path) > MAX_FILE_SIZE: # Drop excessively long files. 38 | continue 39 | with open(in_path, errors='ignore') as f_in: 40 | text = f_in.read() 41 | if sum(1 for _ in pygments.lex(text, lexer)) < MIN_FILE_TOKENS: # Drop files with too few tokens. 42 | continue 43 | 44 | # Copy all other files to the target directory using a simplified path. 
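# For illustration (hypothetical values): with proj_dir="Code/Ruby/org/repo" and
# root="Code/Ruby/org/repo/lib/util", rel_path becomes "lib__util" and a file named
# "helpers.rb" is copied to "<out_dir>/lib__util__helpers.rb"; files sitting directly
# in proj_dir get no "__" prefix.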
45 | rel_path = root[len(proj_dir)+1:].replace('/', '__') 46 | out_path = os.path.join(out_dir, rel_path + ('__' if rel_path else '') + file) 47 | if not os.path.exists(out_path): 48 | try: 49 | copyfile(in_path, out_path) 50 | except Exception as e: 51 | print(f'Skipping problematic file {in_path} due to: {e}') 52 | files_found += 1 53 | print(f'Done processing; copied {files_found} files.') 54 | 55 | 56 | if __name__ == '__main__': 57 | main() -------------------------------------------------------------------------------- /human_eval/Dockerfile-multiple: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | RUN apt-get update -yqq && apt-get install -yqq curl build-essential python3-pip python3-tqdm 3 | RUN apt-get install racket -yqq 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | ENV TZ=Etc/UTC 6 | RUN apt-get install -yqq \ 7 | default-jdk-headless \ 8 | golang-go \ 9 | php-cli \ 10 | ruby \ 11 | lua5.3 \ 12 | r-base \ 13 | rustc \ 14 | scala 15 | 16 | RUN apt-get install -yqq libtest-deep-perl 17 | RUN apt-get install -yqq wget 18 | 19 | # JS/TS 20 | RUN curl -fsSL https://deb.nodesource.com/setup_current.x | bash - 21 | RUN apt-get install -y nodejs 22 | RUN npm install -g typescript 23 | 24 | # Dlang 25 | RUN wget https://netcologne.dl.sourceforge.net/project/d-apt/files/d-apt.list -O /etc/apt/sources.list.d/d-apt.list 26 | RUN apt-get update --allow-insecure-repositories 27 | RUN apt-get -y --allow-unauthenticated install --reinstall d-apt-keyring 28 | RUN apt-get update && apt-get install -yqq dmd-compiler dub 29 | 30 | # C# 31 | RUN apt install gnupg ca-certificates 32 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF 33 | RUN echo "deb https://download.mono-project.com/repo/ubuntu stable-focal main" | tee /etc/apt/sources.list.d/mono-official-stable.list 34 | RUN apt update 35 | RUN apt install -yqq mono-devel 36 | 37 | # Post-processing 38 | 39 | # Julia 40 | RUN curl https://julialang-s3.julialang.org/bin/linux/x64/1.8/julia-1.8.2-linux-x86_64.tar.gz | tar xz 41 | ENV PATH="/julia-1.8.2/bin:${PATH}" 42 | # Swift 43 | RUN curl https://download.swift.org/swift-5.7-release/ubuntu2204/swift-5.7-RELEASE/swift-5.7-RELEASE-ubuntu22.04.tar.gz | tar xz 44 | ENV PATH="/swift-5.7-RELEASE-ubuntu22.04/usr/bin:${PATH}" 45 | # Javatuples 46 | RUN mkdir /usr/multiple && wget https://repo.mavenlibs.com/maven/org/javatuples/javatuples/1.2/javatuples-1.2.jar -O /usr/multiple/javatuples-1.2.jar 47 | # Luaunit 48 | RUN apt-get update -yqq && apt-get install -yqq lua-unit 49 | 50 | # Standard requirements 51 | COPY . /app 52 | WORKDIR /app 53 | RUN test -f /app/generations.json && rm /app/generations.json || true 54 | 55 | RUN pip3 install . 56 | CMD ["python3", "main.py"] 57 | -------------------------------------------------------------------------------- /human_eval/leaderboard/multiple_eval.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=1 3 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
4 | #SBATCH --cpus-per-task=48 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --partition=production-cluster 7 | #SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out 8 | 9 | set -x -e 10 | source /admin/home/loubna/.bashrc 11 | 12 | conda activate brr4 13 | 14 | # File Path setup 15 | echo "START TIME: $(date)" 16 | 17 | GPUS_PER_NODE=4 18 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 19 | MASTER_PORT=6000 20 | NNODES=$SLURM_NNODES 21 | NODE_RANK=$SLURM_PROCID 22 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 23 | 24 | 25 | model=$1 26 | task=$2 27 | org=$3 28 | out_path=$4 29 | 30 | CMD="\ 31 | /fsx/loubna/code/bigcode-evaluation-harness/main.py \ 32 | --model $org/$model \ 33 | --tasks $task \ 34 | --max_length_generation 512 \ 35 | --batch_size 50 \ 36 | --n_samples 50 \ 37 | --temperature 0.2 \ 38 | --precision bf16 \ 39 | --allow_code_execution \ 40 | --trust_remote_code \ 41 | --save_generations \ 42 | --use_auth_token \ 43 | --generation_only \ 44 | --save_generations_path $out_path/generations_$task\_$model.json \ 45 | " 46 | 47 | export LAUNCHER="accelerate launch \ 48 | --multi_gpu \ 49 | --num_machines $NNODES \ 50 | --num_processes $WORLD_SIZE \ 51 | --main_process_ip "$MASTER_ADDR" \ 52 | --main_process_port $MASTER_PORT \ 53 | --num_processes $WORLD_SIZE \ 54 | --machine_rank \$SLURM_PROCID \ 55 | --role $SLURMD_NODENAME: \ 56 | --rdzv_conf rdzv_backend=c10d \ 57 | --max_restarts 0 \ 58 | --tee 3 \ 59 | " 60 | 61 | # force crashing on nccl issues like hanging broadcast 62 | export NCCL_ASYNC_ERROR_HANDLING=1 63 | 64 | # AWS specific 65 | export NCCL_PROTO=simple 66 | export RDMAV_FORK_SAFE=1 67 | export FI_EFA_FORK_SAFE=1 68 | export FI_EFA_USE_DEVICE_RDMA=1 69 | export FI_PROVIDER=efa 70 | export FI_LOG_LEVEL=1 71 | export NCCL_IB_DISABLE=1 72 | export NCCL_SOCKET_IFNAME=ens 73 | 74 | echo $CMD 75 | 76 | SRUN_ARGS=" \ 77 | --wait=60 \ 78 | --kill-on-bad-exit=1 \ 79 | " 80 | 81 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER $CMD" 82 | 83 | echo "END TIME: $(date)" -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_cs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | from .generic_eval import main 5 | 6 | LANG_NAME = "CSharp" 7 | LANG_EXT = ".cs" 8 | 9 | # Following files have problems: 10 | # 137, 11 | # 22: Any 12 | # 148: Elipsis 13 | 14 | 15 | def eval_script(path: str): 16 | if ".cs" not in path.name: 17 | return 18 | basename = ".".join(str(path).split(".")[:-1]) 19 | binaryname = basename + ".exe" 20 | build = subprocess.run( 21 | ["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"], 22 | capture_output=True, 23 | ) 24 | status = None 25 | returncode = -1 26 | output = None 27 | if build.returncode != 0: 28 | # Well, it's a compile error. May be a type error or 29 | # something. 
But, why break the set convention 30 | status = "SyntaxError" 31 | returncode = build.returncode 32 | output = build 33 | else: 34 | try: 35 | output = subprocess.run( 36 | ["mono", binaryname], 37 | env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER": "Console.Error"}, 38 | capture_output=True, 39 | timeout=5, 40 | ) 41 | returncode = output.returncode 42 | output.stderr = str(output.stderr, "utf-8") 43 | # mono return 0 even when failing 44 | fail = ( 45 | "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr 46 | or "Unhandled Exception" in output.stderr 47 | ) 48 | output.returncode = 1 if fail else 0 49 | if output.returncode == 0: 50 | status = "OK" 51 | else: 52 | # Well, it's a panic 53 | status = "Exception" 54 | except subprocess.TimeoutExpired as exc: 55 | status = "Timeout" 56 | output = exc 57 | os.remove(binaryname) 58 | 59 | if output.stdout is not None: 60 | output.stdout = output.stdout.decode("utf-8") 61 | else: 62 | output.stdout = "None" 63 | 64 | if output.stderr == "": 65 | output.stderr = "None" 66 | 67 | return { 68 | "status": status, 69 | "exit_code": returncode, 70 | "stdout": output.stdout, 71 | "stderr": output.stderr, 72 | } 73 | 74 | 75 | if __name__ == "__main__": 76 | main(eval_script, LANG_NAME, LANG_EXT) 77 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_dlang.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from pathlib import Path 4 | 5 | from .safe_subprocess import run 6 | 7 | ENABLE_SYNTAX_CHECK = False 8 | 9 | 10 | def eval_script(path: Path): 11 | result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15) 12 | if "might not be correctly installed" in result.stderr: 13 | raise Exception("D is not correctly installed") 14 | 15 | if result.timeout: 16 | status = "Timeout" 17 | elif result.exit_code == 0: 18 | status = "OK" 19 | elif "Error:" in result.stderr: 20 | status = "SyntaxError" 21 | else: 22 | status = "Exception" 23 | 24 | return { 25 | "status": status, 26 | "exit_code": result.exit_code, 27 | "stdout": result.stdout, 28 | "stderr": result.stderr, 29 | } 30 | 31 | 32 | DIR = "d-keep-code_davinci_001_temp_0.2" 33 | 34 | 35 | def main(): 36 | directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve() 37 | 38 | count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0} 39 | for filename in os.listdir(directory): 40 | path = Path.joinpath(directory, filename) 41 | r = eval_script(path) 42 | status = r["status"] 43 | count[status] += 1 44 | 45 | if ENABLE_SYNTAX_CHECK and status == "SyntaxError": 46 | error_msgs = r["stderr"].split("\n") 47 | with open(path) as source_file: 48 | lines = source_file.readlines() 49 | unittest_line_start = lines.index("unittest\n") 50 | unittest_line_end = len(lines) 51 | for err_msg_line in error_msgs: 52 | matched_parts = re.match( 53 | r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)", 54 | err_msg_line[2:-1], 55 | ) 56 | _file, line_num = matched_parts[1], int(matched_parts[2]) 57 | if ( 58 | unittest_line_start <= line_num 59 | and line_num <= unittest_line_end 60 | ): 61 | print("===============") 62 | print(path, "contains error in unit test part") 63 | print(error_msgs) 64 | print("===============") 65 | 66 | filename = filename.split(".")[0] 67 | print(f"Dlang,{filename},{status}") 68 | 69 | print(DIR + ":" + str(count)) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | 
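The per-language evaluators above all return the same result shape ({"status", "exit_code", "stdout", "stderr"}), with statuses such as "OK", "SyntaxError", "Exception", and "Timeout". A small aggregation sketch (not part of the harness), similar in spirit to the count dict used in main() above:

from collections import Counter


def summarize_statuses(results):
    """results: an iterable of dicts shaped like the eval_script return values."""
    return Counter(r["status"] for r in results)


# Example:
# summarize_statuses([{"status": "OK"}, {"status": "Timeout"}, {"status": "OK"}])
# -> Counter({'OK': 2, 'Timeout': 1})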
-------------------------------------------------------------------------------- /sensitive_memorization/tokenize_secrets_and_prefixes.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from datasets import load_from_disk 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | 7 | 8 | def tokenize_secret(example): 9 | content = example['content'] 10 | secrets = eval(example['secrets']) 11 | example['secret_token_ids'] = [] 12 | for index in range(len(secrets)): 13 | secret = secrets[index]['value'] 14 | secret_token_ids = tokenizer.encode(secret, return_tensors='pt', max_length=max_secret_len, truncation=True, padding='max_length') 15 | example['secret_token_ids'].append(secret_token_ids) 16 | example['secret_token_ids'] = torch.cat(example['secret_token_ids'], dim=0) 17 | return example 18 | 19 | 20 | def tokenize_secret_prefix(example): 21 | content = example['content'] 22 | secrets = eval(example['secrets']) 23 | example['secret_prefix_token_ids'] = [] 24 | for index in range(len(secrets)): 25 | secret_prefix = content[:secrets[index]['start']] # Extract the context leading up to the secret 26 | # secret_prefix_token_ids = tokenizer.encode(secret_prefix, return_tensors='pt')[..., -1 * max_prefix_len:] 27 | secret_prefix_token_ids = tokenizer.encode(secret_prefix, return_tensors='pt', max_length=max_prefix_len, truncation=True, padding='max_length') 28 | example['secret_prefix_token_ids'].append(secret_prefix_token_ids) 29 | example['secret_prefix_token_ids'] = torch.cat(example['secret_prefix_token_ids'], dim=0) 30 | return example 31 | 32 | 33 | def main(): 34 | dataset_path = f"./codeparrot-clean-train-secrets-tokenized-{args.model_name_or_path.split('/')[-1]}" 35 | if os.path.exists(dataset_path): 36 | ds_pii = load_from_disk(dataset_path) 37 | else: 38 | ds_pii = load_from_disk(f"codeparrot-clean-train-secrets-masked-{args.model_name_or_path.split('/')[-1]}") 39 | ds_pii = ds_pii.map(tokenize_secret, num_proc=32) 40 | tokenizer.truncation_side = "left" 41 | tokenizer.padding_side = "left" 42 | ds_pii = ds_pii.map(tokenize_secret_prefix, num_proc=16) 43 | ds_pii.save_to_disk(dataset_path) 44 | print(ds_pii) 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("--model_name_or_path", default="codeparrot/codeparrot", type=str, 50 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 51 | args = parser.parse_args() 52 | print(args) 53 | 54 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, local_files_only=True) 55 | # tokenizer.pad_token = tokenizer.eos_token 56 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 57 | 58 | max_secret_len = 32 59 | max_prefix_len = 128 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/gsm8k_few_shot_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "questions": ["Olivia has $23. She bought five bagels for $3 each. How much money does she have left?", 3 | "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 4 | "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. 
How many computers are now in the server room?", 5 | "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?", 6 | "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?", 7 | "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?", 8 | "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 9 | "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?"], 10 | "solutions": [" money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result", 11 | " golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result", 12 | " computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result", 13 | " toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result", 14 | " jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result", 15 | " leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result", 16 | " cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result", 17 | " trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result"] 18 | } -------------------------------------------------------------------------------- /human_eval/bigcode_eval/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from warnings import warn 3 | 4 | from datasets import load_dataset 5 | 6 | 7 | class Task(ABC): 8 | """A task represents an entire benchmark including its dataset, problems, 9 | answers, generation settings and evaluation methods. 10 | """ 11 | 12 | # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub 13 | DATASET_PATH: str = None 14 | 15 | # The name of a subset within `DATASET_PATH`. 16 | DATASET_NAME: str = None 17 | 18 | def __init__(self, stop_words=None, requires_execution=True): 19 | """ 20 | :param stop_words: list 21 | list of stop words if the generation uses a stopping criteria during generation 22 | :param requires_execution: bool 23 | wheter the task requires code execution during evaluation or not 24 | """ 25 | self.stop_words = stop_words 26 | self.requires_execution = requires_execution 27 | try: 28 | self.dataset = load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME) 29 | except Exception as e: 30 | warn( 31 | f"Loading the dataset failed with {str(e)}. 
This task will use a locally downloaded dataset, not from the HF hub." 32 | ) 33 | 34 | @abstractmethod 35 | def get_dataset(self): 36 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 37 | return [] 38 | 39 | def fewshot_examples(self): 40 | """Loads and returns the few-shot examples for the task if they exist.""" 41 | pass 42 | 43 | @abstractmethod 44 | def get_prompt(self, doc): 45 | """Builds the prompt for the LM to generate from. 46 | :param doc: dict[str: str] 47 | sample from the test dataset 48 | """ 49 | pass 50 | 51 | @abstractmethod 52 | def get_reference(self, doc): 53 | """Builds the reference solution for the doc. 54 | :param doc: dict[str: str] 55 | sample from the test dataset 56 | """ 57 | pass 58 | 59 | @abstractmethod 60 | def postprocess_generation(self, generation, idx): 61 | """Defines the postprocessing for a LM generation. 62 | :param generation: str 63 | code generation from LM 64 | :param idx: int 65 | index of doc in the dataset to which the generation belongs 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | def process_results(self, generations, references): 71 | """Takes the list of LM generations and evaluates them against ground truth references, 72 | returning the metric for the generations as in {"metric_name": result}. 73 | :param generations: list(list(str)) 74 | list of lists containing generations 75 | :param references: list(str) 76 | list of str containing refrences 77 | :return: dict[str: float] 78 | """ 79 | pass 80 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from concurrent.futures import ThreadPoolExecutor 5 | from pathlib import Path 6 | from threading import Lock 7 | from typing import Optional 8 | 9 | from .containerized_eval import eval_string_script 10 | 11 | # Get working directory 12 | WORKING_DIR = Path(__file__).parent.parent 13 | 14 | # program: str => Result 15 | CACHE = dict() 16 | CACHE_LOCK = Lock() 17 | 18 | 19 | def cache_get(program: str) -> Optional[dict]: 20 | if program in CACHE: 21 | result = CACHE[program] 22 | return result 23 | else: 24 | return None 25 | 26 | 27 | def cache_set(program: str, result: dict): 28 | if program in CACHE: 29 | print("Setting already-existing cache") 30 | CACHE[program] = result 31 | 32 | 33 | def cached_eval_script(problem, index) -> dict: 34 | # here prompt is already included in completions 35 | program = problem["completions"][index] + "\n" + problem["tests"] 36 | CACHE_LOCK.acquire(True) 37 | cached = cache_get(program) 38 | if cached is not None: 39 | CACHE_LOCK.release() 40 | return cached 41 | else: 42 | result_yaml = dict() 43 | cache_set(program, result_yaml) 44 | CACHE_LOCK.release() 45 | result_dict = eval_string_script(problem["language"], program) 46 | for k in result_dict.keys(): 47 | result_yaml[k] = result_dict[k] 48 | result_yaml["timestamp"] = int(time.time()) 49 | return result_yaml 50 | 51 | 52 | def get_test_results_json_path( 53 | output_dir: str, problem_json_path: str, input_dir: Path 54 | ) -> Path: 55 | suffixes = ".results.json" 56 | problem_name = problem_json_path[: -len(".json")] 57 | if input_dir: 58 | raise ValueError("input dir given") 59 | return Path(output_dir) / ( 60 | problem_json_path.relative_to(input_dir).parent / (problem_name + suffixes) 61 | ) 62 | return 
Path(output_dir) / (problem_name + suffixes) 63 | 64 | 65 | def evaluate_problem( 66 | output_dir: str, problem_json_path: str, max_workers: int, input_dir: Path = None 67 | ): 68 | with open(problem_json_path, "r") as f: 69 | problem = json.load(f) 70 | test_results_path = get_test_results_json_path( 71 | output_dir, problem_json_path, input_dir 72 | ) 73 | test_results_path.parent.mkdir(mode=0o755, parents=True, exist_ok=True) 74 | 75 | test_results = problem.copy() 76 | del test_results["completions"] 77 | test_results["results"] = [] 78 | 79 | num_problems = len(problem["completions"]) 80 | min_problem = len(test_results["results"]) 81 | 82 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 83 | for j in executor.map( 84 | lambda index: cached_eval_script(problem, index), 85 | range(min_problem, num_problems), 86 | ): 87 | test_results["results"].append(j) 88 | with open(test_results_path, "w") as f: 89 | f.write(json.dumps(test_results, indent=2)) 90 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/__init__.py: -------------------------------------------------------------------------------- 1 | import fcntl 2 | import os 3 | import signal 4 | import subprocess 5 | import time 6 | from typing import List 7 | 8 | MAX_BYTES_PER_READ = 1024 9 | SLEEP_BETWEEN_READS = 0.1 10 | 11 | 12 | class Result: 13 | timeout: int 14 | exit_code: int 15 | stdout: str 16 | stderr: str 17 | 18 | def __init__(self, timeout, exit_code, stdout, stderr): 19 | self.timeout = timeout 20 | self.exit_code = exit_code 21 | self.stdout = stdout 22 | self.stderr = stderr 23 | 24 | 25 | def set_nonblocking(reader): 26 | fd = reader.fileno() 27 | fl = fcntl.fcntl(fd, fcntl.F_GETFL) 28 | fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) 29 | 30 | 31 | def run( 32 | args: List[str], 33 | timeout_seconds: int = 15, 34 | max_output_size: int = 2048, 35 | env=None, 36 | ) -> Result: 37 | """ 38 | Runs the given program with arguments. After the timeout elapses, kills the process 39 | and all other processes in the process group. Captures at most max_output_size bytes 40 | of stdout and stderr each, and discards any output beyond that. 41 | """ 42 | p = subprocess.Popen( 43 | args, 44 | env=env, 45 | stdin=subprocess.DEVNULL, 46 | stdout=subprocess.PIPE, 47 | stderr=subprocess.PIPE, 48 | start_new_session=True, 49 | bufsize=MAX_BYTES_PER_READ, 50 | ) 51 | set_nonblocking(p.stdout) 52 | set_nonblocking(p.stderr) 53 | 54 | process_group_id = os.getpgid(p.pid) 55 | 56 | # We sleep for 0.1 seconds in each iteration. 57 | max_iterations = timeout_seconds * 10 58 | stdout_saved_bytes = [] 59 | stderr_saved_bytes = [] 60 | stdout_bytes_read = 0 61 | stderr_bytes_read = 0 62 | 63 | for _ in range(max_iterations): 64 | this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ) 65 | this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ) 66 | # this_stdout_read and this_stderr_read may be None if stdout or stderr 67 | # are closed. Without these checks, test_close_output fails. 
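# The pipes are drained on every iteration so the child cannot block on a full pipe
# buffer; chunks are appended only while fewer than max_output_size bytes have been
# saved for that stream, and any further output is read but discarded.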
68 | if this_stdout_read is not None and stdout_bytes_read < max_output_size: 69 | stdout_saved_bytes.append(this_stdout_read) 70 | stdout_bytes_read += len(this_stdout_read) 71 | if this_stderr_read is not None and stderr_bytes_read < max_output_size: 72 | stderr_saved_bytes.append(this_stderr_read) 73 | stderr_bytes_read += len(this_stderr_read) 74 | exit_code = p.poll() 75 | if exit_code is not None: 76 | break 77 | time.sleep(SLEEP_BETWEEN_READS) 78 | 79 | try: 80 | # Kills the process group. Without this line, test_fork_once fails. 81 | os.killpg(process_group_id, signal.SIGKILL) 82 | except ProcessLookupError: 83 | pass 84 | 85 | timeout = exit_code is None 86 | exit_code = exit_code if exit_code is not None else -1 87 | stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore") 88 | stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore") 89 | return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr) 90 | -------------------------------------------------------------------------------- /memorization_thresholds/sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import random 5 | from transformers import AutoTokenizer 6 | 7 | 8 | def remove_comments_from_string(code_str): 9 | lines = code_str.splitlines() 10 | 11 | # Skipping the opening blank line and comment line 12 | start = 0 13 | in_block_comment = False 14 | for i, line in enumerate(lines): 15 | stripped = line.strip() 16 | if in_block_comment: 17 | if stripped.endswith('"""') or stripped.endswith("'''"): 18 | in_block_comment = False 19 | continue 20 | if stripped.startswith('#') or not stripped: 21 | continue 22 | if stripped.startswith('"""') or stripped.startswith("'''"): 23 | in_block_comment = True 24 | continue 25 | start = i 26 | break 27 | 28 | return '\n'.join(lines[start:]) 29 | 30 | 31 | def main(): 32 | files = [] 33 | for language in ['Ruby', 'PHP', 'Rust', 'Lua']: 34 | with open(f'TopLists/{language}-top-repos.txt', 'r') as fr: 35 | for line in fr.readlines(): 36 | line = line.strip() 37 | temp1 = line.split('\t') 38 | star = temp1[0] 39 | github_link = temp1[1] 40 | temp2 = github_link.split('/') 41 | github_org, github_repo = temp2[-2], temp2[-1] 42 | data_dir = f'Code/{language}/{github_org}/{github_repo}' 43 | if not os.path.exists(data_dir): 44 | continue 45 | for file_name in os.listdir(data_dir): 46 | if os.path.isfile(os.path.join(data_dir, file_name)): 47 | files.append(os.path.join(data_dir, file_name)) 48 | print(f"Obtained {len(files)} deduplicated files from GitHub.") 49 | 50 | random.seed(42) 51 | random.shuffle(files) 52 | 53 | tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot") 54 | target_csv_file = f'../unlearning/data/github/unseen_data.csv' 55 | if not os.path.exists(target_csv_file): 56 | directory = os.path.dirname(target_csv_file) 57 | if not os.path.exists(directory): 58 | os.makedirs(directory) 59 | 60 | max_sample_num = 10000 61 | current_sample_num = 0 62 | with open(target_csv_file, 'w') as fw: 63 | writer = csv.writer(fw) 64 | writer.writerow(['doc_id', 'corpus', 'text']) 65 | for file_path in files: 66 | try: 67 | with open(file_path, 'r', encoding='utf-8') as data_fr: 68 | if current_sample_num >= max_sample_num: 69 | return 70 | data = data_fr.read().strip() 71 | data = remove_comments_from_string(data) 72 | length = len(tokenizer(data)['input_ids']) 73 | if length > 200 and length < 1000: 74 | corpus = file_path.replace('Code', 
'GitHub') 75 | writer.writerow([current_sample_num, corpus, data]) 76 | current_sample_num += 1 77 | except Exception: 78 | continue 79 | # print(current_sample_num) 80 | 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /sensitive_memorization/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datasets import load_from_disk 3 | 4 | ip_filter_func = lambda secret: (secret['value'][:4] == '127.' or secret['value'][:3] == '10.' or secret['value'][:8] == '192.168.' 5 | or secret['value'][:8] == '169.254.' or (secret['value'][:4] == '172.' and eval(secret['value'][4:6]) >= 16 and eval(secret['value'][4:6]) <= 31)) 6 | email_filter_func = lambda secret: ('example' in secret['value'] or 'test' in secret['value'] or 7 | 'user' in secret['value'] or 'aaa' in secret['value'] or 'bbb' in secret['value'] or 'ccc' in secret['value']) 8 | key_filter_func = lambda secret: (secret['value'] == 'ghp' or 'aaaaaaa' in secret['value'] or 'AAAAAAA' in secret['value'] or 9 | 'xxxxxxx' in secret['value'] or 'XXXXXXX' in secret['value'] or 'https' in secret['value'] or 'dummy' in secret['value'] or 10 | 'placeholder' in secret['value'] or 'changeme' in secret['value']) 11 | 12 | secret_filter_func = lambda secret: ((secret['tag'] == 'IP_ADDRESS' and ip_filter_func(secret)) or 13 | (secret['tag'] == 'EMAIL' and email_filter_func(secret)) or 14 | (secret['tag'] == 'KEY' and key_filter_func(secret))) 15 | 16 | 17 | def filter_secrets(example): 18 | if type(example['secrets']) != str: 19 | return False 20 | secrets = eval(example['secrets']) 21 | if example['number_secrets'] == 1: 22 | secret = secrets[0] 23 | # Check if the only secret is either a local IP or an email containing "example" 24 | if secret_filter_func(secret): 25 | # if secret['start'] < 512 or secret_filter_func(secret): 26 | return False # This will remove the example 27 | elif example['number_secrets'] > 1: 28 | # Filter out specific secrets 29 | filtered_secrets = [secret for secret in secrets 30 | if not secret_filter_func(secret)] 31 | # filtered_secrets = [secret for secret in secrets 32 | # if not (secret['start'] < 512 or secret_filter_func(secret))] 33 | if len(filtered_secrets) == 0: 34 | return False # This will remove the example 35 | return True 36 | 37 | 38 | def update_example(example): 39 | secrets = eval(example['secrets']) 40 | # Filter out specific secrets 41 | filtered_secrets = [secret for secret in secrets 42 | if not secret_filter_func(secret)] 43 | # filtered_secrets = [secret for secret in secrets 44 | # if not ((secret['start'] < 512 or secret_filter_func(secret)))] 45 | example['secrets'] = str(filtered_secrets) 46 | example['number_secrets'] = len(filtered_secrets) 47 | return example 48 | 49 | 50 | def main(): 51 | dataset_path = './codeparrot-clean-train-secrets-filtered' 52 | if os.path.exists(dataset_path): 53 | ds_pii = load_from_disk(dataset_path) 54 | else: 55 | ds_pii = load_from_disk('codeparrot-clean-train-secrets') 56 | ds_pii = ds_pii.filter(filter_secrets, num_proc=48) 57 | ds_pii = ds_pii.map(update_example, num_proc=48) 58 | ds_pii.save_to_disk(dataset_path) 59 | print(ds_pii) 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/module_test.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from pathlib import Path 3 | 4 | from safe_subprocess import run 5 | 6 | ROOT = Path(__file__).resolve().parent / "evil_programs" 7 | 8 | 9 | def assert_no_running_evil(): 10 | result = run(["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024) 11 | assert ( 12 | result.exit_code == 1 13 | ), f"There are still evil processes running: {result.stdout}" 14 | assert len(result.stderr) == 0 15 | assert len(result.stdout) == 0 16 | 17 | 18 | def test_fork_once(): 19 | # The program exits cleanly and immediately. But, it forks a child that runs 20 | # forever. 21 | result = run( 22 | ["python3", ROOT / "fork_once.py"], 23 | timeout_seconds=2, 24 | max_output_size=1024, 25 | ) 26 | assert result.exit_code == 0 27 | assert result.timeout == False 28 | assert len(result.stderr) == 0 29 | assert len(result.stdout) == 0 30 | assert_no_running_evil() 31 | 32 | 33 | def test_close_outputs(): 34 | # The program prints to stdout, closes its output, and then runs forever. 35 | result = run( 36 | ["python3", ROOT / "close_outputs.py"], 37 | timeout_seconds=2, 38 | max_output_size=1024, 39 | ) 40 | assert result.exit_code == -1 41 | assert result.timeout == True 42 | assert len(result.stderr) == 0 43 | assert result.stdout == "This is the end\n" 44 | assert_no_running_evil() 45 | 46 | 47 | def test_unbounded_output(): 48 | result = run( 49 | ["python3", ROOT / "unbounded_output.py"], 50 | timeout_seconds=3, 51 | max_output_size=1024, 52 | ) 53 | assert result.exit_code == -1 54 | assert result.timeout == True 55 | assert len(result.stderr) == 0 56 | assert len(result.stdout) == 1024 57 | assert_no_running_evil() 58 | 59 | 60 | def test_sleep_forever(): 61 | result = run( 62 | ["python3", ROOT / "sleep_forever.py"], 63 | timeout_seconds=2, 64 | max_output_size=1024, 65 | ) 66 | assert result.exit_code == -1 67 | assert result.timeout == True 68 | assert len(result.stderr) == 0 69 | assert len(result.stdout) == 0 70 | assert_no_running_evil() 71 | 72 | 73 | def test_fork_bomb(): 74 | result = run( 75 | ["python3", ROOT / "fork_bomb.py"], 76 | timeout_seconds=2, 77 | max_output_size=1024, 78 | ) 79 | assert result.exit_code == -1 80 | assert result.timeout == True 81 | assert len(result.stderr) == 0 82 | assert len(result.stdout) == 0 83 | # Unfortunately, this sleep seems to be necessary. My theories: 84 | # 1. os.killpg doesn't block until the whole process group is dead. 85 | # 2. pgrep can produce stale output 86 | time.sleep(2) 87 | assert_no_running_evil() 88 | 89 | 90 | def test_block_on_inputs(): 91 | # We run the subprocess with /dev/null as input. So, any program that tries 92 | # to read input will error. 93 | result = run( 94 | ["python3", ROOT / "block_on_inputs.py"], 95 | timeout_seconds=2, 96 | max_output_size=1024, 97 | ) 98 | assert result.exit_code == 1 99 | assert result.timeout == False 100 | assert len(result.stdout) == 0 101 | assert "EOF when reading a line" in result.stderr 102 | assert_no_running_evil() 103 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/containerized_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | NOTE: Nothing containerized about this any more. This is just a helper 3 | for problem_evaluator.py. 4 | """ 5 | 6 | import tempfile 7 | from pathlib import Path 8 | 9 | from . 
import (eval_cpp, eval_dlang, eval_java, eval_javascript, eval_julia, 10 | eval_lua, eval_php, eval_python, eval_r, eval_racket, eval_ruby, 11 | eval_rust, eval_swift, eval_ts, eval_go, eval_pl, eval_sh, eval_scala, eval_cs) 12 | 13 | EVALUATORS = { 14 | "rb": (eval_ruby.eval_script, ".rb"), 15 | "lua": (eval_lua.eval_script, ".lua"), 16 | "python": (eval_python.eval_script, ".py"), 17 | "py": (eval_python.eval_script, ".py"), 18 | "notypes.py": (eval_python.eval_script, ".py"), 19 | "julia": (eval_julia.eval_script, ".jl"), 20 | "java": (eval_java.eval_script, ".java"), 21 | "rust": (eval_rust.eval_script, ".rs"), 22 | "rs": (eval_rust.eval_script, ".rs"), 23 | "swift": (eval_swift.eval_script, ".swift"), 24 | "lua": (eval_lua.eval_script, ".lua"), 25 | "racket": (eval_racket.eval_script, ".rkt"), 26 | "rkt": (eval_racket.eval_script, ".rkt"), 27 | "javascript": (eval_javascript.eval_script, ".js"), 28 | "js": (eval_javascript.eval_script, ".js"), 29 | "cpp": (eval_cpp.eval_script, ".cpp"), 30 | "cs": (eval_cs.eval_script, ".cs"), 31 | "php": (eval_php.eval_script, ".php"), 32 | "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"), 33 | "d": (eval_dlang.eval_script, ".d"), 34 | "r": (eval_r.eval_script, ".r"), 35 | "humaneval_to_r.py": (eval_r.eval_script, ".r"), 36 | "jl": (eval_julia.eval_script, ".jl"), 37 | "ts": (eval_ts.eval_script, ".ts"), 38 | "go": (eval_go.eval_script, ".go"), 39 | "pl": (eval_pl.eval_script, ".pl"), 40 | "sh": (eval_sh.eval_script, ".sh"), 41 | "scala": (eval_scala.eval_script, ".scala"), 42 | } 43 | 44 | 45 | def eval_string_script(language, program): 46 | if language in EVALUATORS: 47 | (eval_script, file_ext) = EVALUATORS[language] 48 | else: 49 | eval_module = __import__( 50 | f"eval_{language}" if language != "go_test.go" else "eval_go" 51 | ) 52 | eval_script = eval_module.eval_script 53 | file_ext = f".{language}" if language != "go_test.go" else "_test.go" 54 | with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f: 55 | f.write(program.encode("utf-8")) 56 | f.flush() 57 | result = eval_script(Path(f.name)) 58 | # Only save the first 2K of output from the running program. Any futher 59 | # output is very likely an exceptionally long stack trace or a long 60 | # series of prints. 
61 | if type(result["stdout"]) == bytes: 62 | result["stdout"] = result["stdout"].decode("utf-8", errors="ignore") 63 | if result["stdout"] is None: 64 | result["stdout"] = "" 65 | if result["stderr"] is None: 66 | result["stderr"] = "" 67 | if type(result["stderr"]) == bytes: 68 | result["stderr"] = result["stderr"].decode("utf-8", errors="ignore") 69 | assert type(result["stdout"]) == str 70 | assert type(result["stderr"]) == str 71 | return { 72 | "program": program, 73 | "stdout": result["stdout"].replace("!!int", "")[:2048], 74 | "stderr": result["stderr"][:2048], 75 | "exit_code": result["exit_code"], 76 | "status": result["status"], 77 | } 78 | -------------------------------------------------------------------------------- /human_eval/tests/test_generation_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | 5 | from accelerate import Accelerator 6 | from accelerate.utils import write_basic_config 7 | from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed 8 | 9 | from bigcode_eval.arguments import EvalArguments 10 | from bigcode_eval.evaluator import Evaluator 11 | 12 | # TODO add more tasks 13 | 14 | # Tasks for generation test 15 | GEN_TASKS = ["humaneval", "mbpp"] 16 | # Tasks for evaluator tests 17 | EVAL_TASKS = ["humaneval", "mbpp", "pal-gsm8k-greedy"] 18 | TMPDIR = tempfile.mkdtemp() 19 | TEST_MODEL = "hf-internal-testing/tiny-random-gpt2" 20 | REF_EVAL_SCORES = { 21 | "humaneval": {"pass@1": 0.25}, 22 | "mbpp": {"pass@1": 0.25}, 23 | "pal-gsm8k-greedy": {"accuracy": 1.0, "num_failed_execution": 0}, 24 | } 25 | 26 | 27 | def update_args(args): 28 | args.model = "hf-internal-testing/tiny-random-gpt2" 29 | # the executed code for the tests is safe (see tests/data/*_eval_gens.json) 30 | args.allow_code_execution = True 31 | args.save_generations = False 32 | args.save_generations_path = "" 33 | args.save_references = False 34 | args.metric_output_path = TMPDIR 35 | args.load_generations_path = None 36 | args.generation_only = False 37 | args.check_references = False 38 | # postprocessing for HumanEval and MBPP makes generations 39 | # with dummy model not distinctive 40 | args.postprocess = False 41 | args.instruction_tokens = None 42 | 43 | args.limit = 2 44 | args.limit_start = 0 45 | args.batch_size = 1 46 | args.max_length_generation = 300 47 | args.do_sample = False 48 | args.top_p = 0 49 | args.n_samples = 1 50 | args.seed = 0 51 | args.prompt = None 52 | args.precision = None 53 | args.modeltype = None 54 | args.max_memory_per_gpu = None 55 | return args 56 | 57 | 58 | def setup(): 59 | model = AutoModelForCausalLM.from_pretrained(TEST_MODEL) 60 | tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL) 61 | tokenizer.pad_token = tokenizer.eos_token 62 | configPath = os.path.join(TMPDIR, "default_config.yml") 63 | write_basic_config(save_location=configPath) 64 | accelerator = Accelerator() 65 | return model, tokenizer, accelerator 66 | 67 | 68 | def load_generation_examples(task): 69 | # generations for testing the generation feature of dummy test model 70 | with open(f"tests/data/{task}_gen_gens.json") as fp: 71 | gens = json.load(fp) 72 | with open(f"tests/data/{task}_gen_refs.json") as fp: 73 | refs = json.load(fp) 74 | return gens, refs 75 | 76 | 77 | args = update_args(EvalArguments()) 78 | set_seed(args.seed) 79 | model, tokenizer, accelerator = setup() 80 | 81 | 82 | def test_generation(): 83 | args.generation_only = True 84 | evaluator = 
Evaluator(accelerator, model, tokenizer, args) 85 | for task in GEN_TASKS: 86 | print(f"testing task {task}") 87 | generations, references = evaluator.generate_text(task) 88 | true_gens, true_refs = load_generation_examples(task) 89 | assert generations == true_gens 90 | assert references == true_refs 91 | print("passed gen") 92 | 93 | 94 | def test_evaluation(): 95 | # TODO add scores for each task 96 | args.n_samples = 2 97 | for task in EVAL_TASKS: 98 | print(f"testing task {task}") 99 | # path to generation examples to evaluate 100 | args.load_generations_path = f"tests/data/{task}_eval_gens.json" 101 | evaluator = Evaluator(accelerator, None, None, args) 102 | results = evaluator.evaluate(task) 103 | assert results == REF_EVAL_SCORES[task] 104 | print("passed eval") 105 | -------------------------------------------------------------------------------- /human_eval/templates/new_task.py: -------------------------------------------------------------------------------- 1 | # This template file is adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/master/templates/new_task.py 2 | 3 | # TODO: Remove all TODO comments once the implementation is complete. 4 | """ 5 | TODO: Add the Paper Title on this line. 6 | TODO: Add the paper's PDF URL (preferably from arXiv) on this line. 7 | TODO: Write a Short Description of the task. 8 | Homepage: TODO: Add the URL to the task's Homepage here. 9 | """ 10 | from bigcode_eval.base import Task 11 | 12 | # TODO: Add the BibTeX citation for the task. 13 | _CITATION = """ 14 | """ 15 | 16 | 17 | # TODO: Replace `NewTask` with the name of your Task. 18 | class NewTask(Task): 19 | # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` 20 | # dataset as denoted in HuggingFace `datasets`. 21 | DATASET_PATH = "" 22 | # TODO: Add the `DATASET_NAME` string. This is the name of a subset within 23 | # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. 24 | DATASET_NAME = None 25 | 26 | def __init__(self): 27 | super().__init__( 28 | # TODO: Specify the list of stop words in `stop_words` for the code generation task \ 29 | # and if the evaluation requires executing the generated code in `requires_execution`. 30 | stop_words=[], 31 | requires_execution=False, 32 | ) 33 | 34 | def get_dataset(self): 35 | # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`) 36 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 37 | return [] 38 | 39 | def fewshot_examples(self): 40 | # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist 41 | """Loads and returns the few-shot examples for the task if they exist.""" 42 | pass 43 | 44 | def get_prompt(self, doc): 45 | # TODO: build the prompt for the language model from a sample `doc` from the dataset 46 | """ 47 | Builds the prompt for the LM to generate from. 48 | :param doc: dict[str: str] 49 | sample from the test dataset 50 | :return: str 51 | """ 52 | return "" 53 | 54 | def get_reference(self, doc): 55 | # TODO: get the reference solution from a sample `doc` from the dataset 56 | """ 57 | Builds the reference solution for the doc (sample from the test dataset). 58 | :param doc: dict[str: str] 59 | sample from the test dataset 60 | :return: str 61 | """ 62 | return "" 63 | 64 | def postprocess_generation(self, generation, idx): 65 | # TODO: define the postprocessing for the LM generation 66 | """ 67 | Defines the postprocessing for a LM generation. 
68 | :param generation: str 69 | code generation from LM 70 | :param idx: int (if needed) 71 | index of doc in the dataset to which the generation belongs 72 | :return: str 73 | """ 74 | return "" 75 | 76 | def process_results(self, generations, references): 77 | # TODO: define how the evaluation score is computed from list of \ 78 | # generations and reference solutions 79 | """ 80 | Takes the list of LM generations and evaluates them against ground truth references, 81 | returning the metric for the generations as in {"metric_name": result}. 82 | We encourage to directly load the metric from `evaluate` library to keep the code concise. 83 | :param generations: list(list(str)) 84 | list of lists containing generations 85 | :param references: list(str) 86 | list of str containing refrences 87 | :return: dict[str: float] 88 | """ 89 | return {} 90 | -------------------------------------------------------------------------------- /human_eval/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .vscode/ 163 | .trunk 164 | .DS_Store 165 | 166 | # Script outputs 167 | evaluation*.json 168 | generations*.json 169 | -------------------------------------------------------------------------------- /human_eval/finetuning/APPS/apps_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fine-Tune LM on APPS train split 3 | """ 4 | 5 | import argparse 6 | import os 7 | 8 | import torch 9 | from apps_dataset import APPSBaseDataset 10 | from datasets import load_dataset 11 | from transformers import (AutoModelForCausalLM, Trainer, TrainingArguments, 12 | logging, set_seed) 13 | 14 | 15 | def get_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--model_ckpt", type=str, default="codeparrot/codeparrot-small") 18 | parser.add_argument("--max_length", type=int, default=1024) 19 | parser.add_argument("--num_epochs", type=int, default=10) 20 | parser.add_argument("--max_steps", type=int, default=-1) 21 | parser.add_argument("--batch_size", type=int, default=8) 22 | parser.add_argument("--gradient_accumulation_steps", type=int, default=8) 23 | 24 | parser.add_argument("--learning_rate", type=float, default=5e-5) 25 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 26 | parser.add_argument("--num_warmup_steps", type=int, default=100) 27 | parser.add_argument("--weight_decay", type=float, default=0.05) 28 | 29 | parser.add_argument("--fp16", default=False, action="store_true") 30 | parser.add_argument("--seed", type=int, default=0) 31 | parser.add_argument("--output_dir", type=str, default="./checkpoints") 32 | parser.add_argument("--log_freq", default=1, type=int) 33 | parser.add_argument("--eval_freq", default=250, type=int) 34 | parser.add_argument("--save_freq", default=250, type=int) 35 | return parser.parse_args() 36 | 37 | 38 | def get_dataset(dataset, args): 39 | 40 | train_data = APPSBaseDataset( 41 | dataset=dataset, max_tokens=args.max_length, tokenizer_path=args.model_ckpt 42 | ) 43 | 44 | return train_data 45 | 46 | 47 | def run_training(args, train_data, val_data): 48 | 49 | model = 
AutoModelForCausalLM.from_pretrained(args.model_ckpt, use_auth_token=True) 50 | train_data.start_iteration = 0 51 | 52 | print(f"Starting main loop") 53 | 54 | training_args = TrainingArguments( 55 | output_dir=args.output_dir, 56 | dataloader_drop_last=True, 57 | evaluation_strategy="steps", 58 | num_train_epochs=args.num_epochs, 59 | max_steps=args.max_steps, 60 | eval_steps=args.eval_freq, 61 | save_steps=args.save_freq, 62 | logging_steps=args.log_freq, 63 | per_device_train_batch_size=args.batch_size, 64 | per_device_eval_batch_size=args.batch_size, 65 | learning_rate=args.learning_rate, 66 | lr_scheduler_type=args.lr_scheduler_type, 67 | warmup_steps=args.num_warmup_steps, 68 | gradient_accumulation_steps=args.gradient_accumulation_steps, 69 | weight_decay=args.weight_decay, 70 | fp16=args.fp16, 71 | run_name="apps-train", 72 | report_to="wandb", 73 | ) 74 | 75 | trainer = Trainer( 76 | model=model, 77 | args=training_args, 78 | train_dataset=train_data, 79 | eval_dataset=val_data, 80 | ) 81 | 82 | print("Training...") 83 | trainer.train() 84 | 85 | print("saving last checkpoint of the model") 86 | model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/")) 87 | 88 | 89 | def main(args): 90 | 91 | dataset = load_dataset("codeparrot/apps", split="train") 92 | dataset.shuffle(seed=args.seed) 93 | data = get_dataset(dataset, args) 94 | train_size = int(0.95 * len(data)) 95 | train_data, val_data = torch.utils.data.random_split( 96 | data, 97 | [train_size, len(data) - train_size], 98 | generator=torch.Generator().manual_seed(args.seed), 99 | ) 100 | print( 101 | f"size of training data {len(train_data)}\nsize of validation data {len(val_data)}" 102 | ) 103 | run_training(args, train_data, val_data) 104 | 105 | 106 | if __name__ == "__main__": 107 | 108 | args = get_args() 109 | set_seed(args.seed) 110 | os.makedirs(args.output_dir, exist_ok=True) 111 | 112 | logging.set_verbosity_error() 113 | 114 | main(args) 115 | -------------------------------------------------------------------------------- /human_eval/tests/data/pal-gsm8k-greedy_prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt":"Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\"\"\"\n money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result\n\n\n\n\n\nQ: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\"\"\"\n golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result\n\n\n\n\n\nQ: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. 
How many computers are now in the server room?\"\"\"\n computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result\n\n\n\n\n\nQ: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\"\"\"\n toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result\n\n\n\n\n\nQ: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\"\"\"\n jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result\n\n\n\n\n\nQ: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\"\"\"\n leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result\n\n\n\n\n\nQ: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\"\"\"\n cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result\n\n\n\n\n\nQ: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\"\"\"\n trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result\n\n\n\n\n\nQ: test\n\n# solution in Python:\n\n\n" 3 | } -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/concode.py: -------------------------------------------------------------------------------- 1 | """Mapping Language to Code in Programmatic Context (Concode) 2 | https://arxiv.org/abs/1808.09588 3 | 4 | CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation 5 | https://arxiv.org/abs/2102.04664 6 | 7 | Java code generation in CodeXGLUE text-to-code dataset (built from Concode dataset) 8 | Available at https://huggingface.co/datasets/code_x_glue_ct_code_to_text 9 | 2000 samples are available in the test set. 
10 | 11 | Here we use two-shot evaluation (the original paper evaluates finetuned models) 12 | """ 13 | import json 14 | 15 | from evaluate import load 16 | 17 | from bigcode_eval.base import Task 18 | 19 | _CITATION = """ 20 | @article{iyer2018mapping, 21 | title={Mapping language to code in programmatic context}, 22 | author={Iyer, Srinivasan and Konstas, Ioannis and Cheung, Alvin and Zettlemoyer, Luke}, 23 | journal={arXiv preprint arXiv:1808.09588}, 24 | year={2018} 25 | } 26 | """ 27 | 28 | 29 | class Concode(Task): 30 | """A task represents an entire benchmark including its dataset, problems, 31 | answers, generation settings and evaluation methods. 32 | """ 33 | 34 | DATASET_PATH = "code_x_glue_tc_text_to_code" 35 | 36 | def __init__(self): 37 | super().__init__( 38 | stop_words=["\n"], 39 | requires_execution=False, 40 | ) 41 | 42 | def get_dataset(self): 43 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 44 | # test split of the dataset doesn't have targets 45 | return self.dataset["validation"] 46 | 47 | def fewshot_examples(self): 48 | """Loads and returns the few-shot examples for the task if they exist.""" 49 | with open( 50 | "bigcode_eval/tasks/few_shot_examples/concode_few_shot_prompts.json", "r" 51 | ) as file: 52 | examples = json.load(file) 53 | return examples 54 | 55 | @staticmethod 56 | def two_shot_prompt(entry, text, examples): 57 | """Two shot prompt format as instructions & solutions""" 58 | prompt = f"\nInstruction:\n{examples['instruction1']}\ 59 | \nSolution:\n{examples['solution1']}\ 60 | \nInstruction:\n{examples['instruction2']}\ 61 | \nSolution:\n{examples['solution2']}\ 62 | \nInstruction:\n{text}\ 63 | \nSolution:\n" 64 | assert ( 65 | prompt.count("Solution:\n") == 3 66 | ), "Splitting operation in postprocess_generation is invalid" 67 | return entry + prompt 68 | 69 | def get_prompt(self, doc): 70 | """Builds the prompt for the LM to generate from.""" 71 | examples = self.fewshot_examples() 72 | text = doc["nl"].split("concode_field_sep")[0].strip() 73 | if text.endswith("."): 74 | text = text[:-1].strip() 75 | entry = "Answer the following instructions in a one line of Java code:\n" 76 | prompt = self.two_shot_prompt(entry, text, examples) 77 | return prompt 78 | 79 | def get_reference(self, doc): 80 | """Builds the reference solution for the doc (sample from the test dataset).""" 81 | return doc["code"] 82 | 83 | def postprocess_generation(self, generation, idx): 84 | """Defines the postprocessing for a LM generation. 85 | :param generation: str 86 | code generation from LM 87 | :param idx: int 88 | index of doc in the dataset to which the generation belongs 89 | (not used for this task) 90 | """ 91 | output = generation.split("Solution:\n", 3)[-1].strip() 92 | return output 93 | 94 | def process_results(self, generations, references): 95 | """Takes the list of LM generations and evaluates them against ground truth references, 96 | returning the metric for the generations. 
97 | :param generations: list(list(str)) 98 | list of lists containing generations 99 | :param references: list(str) 100 | list of str containing references 101 | """ 102 | bleu = load("bleu") 103 | gens = [gen[0] for gen in generations] 104 | results = bleu.compute( 105 | references=references, predictions=gens, max_order=4, smooth=True 106 | ) 107 | return results 108 | -------------------------------------------------------------------------------- /memorization_thresholds/gh_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import sys 3 | import time 4 | from requests.packages import urllib3 5 | 6 | # Insert GitHub API token here, in place of *TOKEN*. 7 | headers = {"Authorization": "token *TOKEN*"} 8 | 9 | # Constants & language argument. 10 | NUM_REPOS = 25_000 11 | MIN_STARS = 500 12 | CREATED = '2023-06-01' 13 | LANGUAGE = "python" if len(sys.argv) <= 1 else sys.argv[1] # Default to Python, if none passed. 14 | 15 | 16 | def main(): 17 | urllib3.disable_warnings() 18 | repositories = set() # Keep track of a set of repositories seen to avoid duplicate entries across pages. 19 | next_max_stars = 1_000_000_000 # Initialize to a very high value. 20 | with open(f'TopLists/{LANGUAGE}-top-repos.txt', 'a+') as f: 21 | while len(repositories) < NUM_REPOS: 22 | results = run_query(next_max_stars) # Get the next set of pages. 23 | if not results: 24 | break 25 | new_repositories = [repository for repository, _ in results] 26 | next_max_stars = min([stars for _, stars in results]) 27 | 28 | # If a query returns no new repositories, drop it. 29 | if len(repositories | set(new_repositories)) == len(repositories): 30 | break 31 | for repository, stars in sorted(results, key=lambda e: e[1], reverse=True): 32 | if repository not in repositories: 33 | repositories.add(repository) 34 | f.write(f'{stars}\t{repository}\n') 35 | f.flush() 36 | print(f'Collected {len(repositories):,} repositories so far; lowest number of stars: {next_max_stars:,}') 37 | 38 | 39 | def run_query(max_stars): 40 | end_cursor = None # Used to track pagination. 41 | repositories = set() 42 | 43 | while end_cursor != "": 44 | # Extracts non-fork, recently active repositories in the provided language, in groups of 100. 45 | # Leaves placeholders for maximum stars and page cursor. The former allows us to retrieve more than 1,000 repositories 46 | # by repeatedly lowering the bar. 47 | query = f""" 48 | {{ 49 | search(query: "language:{LANGUAGE} fork:false created:>{CREATED} sort:stars stars:<{max_stars}", type: REPOSITORY, first: 100 {', after: "' + end_cursor + '"' if end_cursor else ''}) {{ 50 | edges {{ 51 | node {{ 52 | ... on Repository {{ 53 | url 54 | isPrivate 55 | isDisabled 56 | isLocked 57 | stargazers {{ 58 | totalCount 59 | }} 60 | }} 61 | }} 62 | }} 63 | pageInfo {{ 64 | hasNextPage 65 | endCursor 66 | }} 67 | }} 68 | }} 69 | """ 70 | print(f' Retrieving next page; {len(repositories)} repositories in this batch so far.') 71 | # Attempt a query up to three times, pausing when a query limit is hit. 72 | attempts = 0 73 | success = False 74 | while not success and attempts < 3: 75 | request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers) 76 | content = request.json() 77 | if 'data' not in content or 'search' not in content['data']: 78 | # If this is simply a signal to pause querying, wait two minutes. 
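                    # Illustrative only -- the payload shape here is an assumption, not a captured
                    # response: a rate-limited reply is expected to look roughly like
                    #     {"message": "... please wait a few minutes before you try again ..."}
                    # which is why the branch below looks for the word 'wait' in the message
                    # rather than for a specific error code.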
79 | if 'message' in content and 'wait' in content['message']: 80 | attempts += 1 81 | time.sleep(120) 82 | # Otherwise, assume we've hit the end of the stream. 83 | else: 84 | break 85 | else: 86 | success = True 87 | if not success: 88 | break 89 | end_cursor = get_end_cursor(content) 90 | new_repositories, is_done = get_repositories(content) 91 | repositories.update(new_repositories) 92 | if len(repositories) > NUM_REPOS or is_done: 93 | break 94 | return repositories 95 | 96 | 97 | def get_end_cursor(content): 98 | page_info = content['data']['search']['pageInfo'] 99 | has_next_page = page_info['hasNextPage'] 100 | if has_next_page: 101 | return page_info['endCursor'] 102 | return "" 103 | 104 | 105 | def get_repositories(content): 106 | edges = content['data']['search']['edges'] 107 | repositories_with_stars = [] 108 | for edge in edges: 109 | if edge['node']['isPrivate'] is False and edge['node']['isDisabled'] is False and edge['node']['isLocked'] is False: 110 | repository = edge['node']['url'] 111 | star_count = edge['node']['stargazers']['totalCount'] 112 | if star_count < MIN_STARS: 113 | return repositories_with_stars, True 114 | repositories_with_stars.append((repository, star_count)) 115 | return repositories_with_stars, False 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/mbpp.py: -------------------------------------------------------------------------------- 1 | """Program Synthesis with Large Language Models 2 | https://arxiv.org/abs/2108.07732 3 | 4 | The benchmark consists of around 1,000 crowd-sourced Python programming problems, 5 | designed to be solvable by entry level programmers, covering programming fundamentals, 6 | standard library functionality, and so on. Each problem consists of a task description, 7 | code solution and 3 automated test cases. As described in the paper, a subset of the data 8 | has been hand-verified by the authors. 9 | 10 | Homepage:: https://github.com/google-research/google-research/tree/master/mbpp 11 | """ 12 | 13 | import re 14 | 15 | from evaluate import load 16 | 17 | from bigcode_eval.base import Task 18 | 19 | _CITATION = """ 20 | @article{austin2021program, 21 | title={Program Synthesis with Large Language Models}, 22 | author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others}, 23 | journal={arXiv preprint arXiv:2108.07732}, 24 | year={2021} 25 | } 26 | """ 27 | 28 | 29 | class MBPP(Task): 30 | """A task represents an entire benchmark including its dataset, problems, 31 | answers, generation settings and evaluation methods. 32 | """ 33 | 34 | DATASET_PATH = "mbpp" 35 | 36 | def __init__(self): 37 | super().__init__( 38 | stop_words=["\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/", "\n```"], 39 | requires_execution=True, 40 | ) 41 | 42 | def get_dataset(self): 43 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 44 | dataset = self.dataset["test"] 45 | # the wrong split of mbpp can be loaded with old datasets cache 46 | assert ( 47 | len(dataset) == 500 48 | ), "please ensure you have the latest version of MBPP dataset, try deleting its old cache" 49 | return dataset 50 | 51 | def get_prompt(self, doc): 52 | """Builds the prompt for the LM to generate from. 
53 | MBPP prompt is built following to InCoder (Fried et al.) approach 54 | prompt = docstring that includes one test 55 | """ 56 | description = doc["text"] 57 | test_example = doc["test_list"][0] 58 | prompt = f'"""\n{description}\n{test_example}\n"""\n' 59 | return prompt 60 | 61 | def get_reference(self, doc): 62 | """Builds the reference solution for the doc (sample from the test dataset).""" 63 | return "\n".join(doc["test_list"]) 64 | 65 | @staticmethod 66 | def _stop_at_stop_token(decoded_string, stop_tokens): 67 | """ 68 | Produces the prefix of decoded_string that ends at the first occurrence of 69 | a stop_token. 70 | WARNING: the decoded_string *must not* include the prompt, which may have stop tokens 71 | itself. 72 | """ 73 | min_stop_index = len(decoded_string) 74 | for stop_token in stop_tokens: 75 | stop_index = decoded_string.find(stop_token) 76 | if stop_index != -1 and stop_index < min_stop_index: 77 | min_stop_index = stop_index 78 | return decoded_string[:min_stop_index] 79 | 80 | def postprocess_generation(self, generation, idx): 81 | """Defines the postprocessing for a LM generation. 82 | :param generation: str 83 | code generation from LM 84 | :param idx: int 85 | index of doc in the dataset to which the generation belongs 86 | """ 87 | prompt = self.get_prompt(self.dataset["test"][idx]) 88 | generation = generation[len(prompt) :] 89 | return prompt + self._stop_at_stop_token(generation, self.stop_words) 90 | 91 | def process_results(self, generations, references): 92 | """Takes the list of LM generations and evaluates them against ground truth references, 93 | returning the metric for the generations. 94 | :param generations: list(list(str)) 95 | list of lists containing generations 96 | :param references: list(str) 97 | list of str containing refrences 98 | """ 99 | code_metric = load("code_eval") 100 | results, _ = code_metric.compute( 101 | references=references, 102 | predictions=generations, 103 | ) 104 | return results 105 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/conala.py: -------------------------------------------------------------------------------- 1 | """Learning to Mine Aligned Code and Natural Language Pairs from Stack Overflow 2 | https://arxiv.org/pdf/1805.08949.pdf 3 | 4 | Python Code generation with CoNaLa. It is a benchmark of code and natural language pairs, for the evaluation of code generation tasks. 5 | The dataset was crawled from Stack Overflow, automatically filtered, then curated by annotators, 6 | split into 2,379 training and 500 test examples. 7 | 8 | Homepage: https://conala-corpus.github.io/ 9 | Here we use two-shot evaluation (the original paper evaluates finetuned models) 10 | """ 11 | 12 | import json 13 | 14 | from evaluate import load 15 | 16 | from bigcode_eval.base import Task 17 | 18 | _CITATION = """ 19 | @inproceedings{yin2018learning, 20 | title={Learning to mine aligned code and natural language pairs from stack overflow}, 21 | author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham}, 22 | booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)}, 23 | pages={476--486}, 24 | year={2018}, 25 | organization={IEEE} 26 | } 27 | """ 28 | 29 | 30 | class Conala(Task): 31 | """A task represents an entire benchmark including its dataset, problems, 32 | answers, generation settings and evaluation methods. 
33 | """ 34 | 35 | DATASET_PATH = "neulab/conala" 36 | 37 | def __init__(self): 38 | super().__init__( 39 | stop_words=["\n"], 40 | requires_execution=False, 41 | ) 42 | 43 | def get_dataset(self): 44 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 45 | return self.dataset["test"] 46 | 47 | def fewshot_examples(self): 48 | """Loads and returns the few-shot examples for the task if they exist.""" 49 | with open( 50 | "bigcode_eval/tasks/few_shot_examples/conala_few_shot_prompts.json", "r" 51 | ) as file: 52 | examples = json.load(file) 53 | return examples 54 | 55 | @staticmethod 56 | def two_shot_prompt(entry, text, examples): 57 | """Two shot prompt format as instructions & solutions""" 58 | prompt = f"\nInstruction:\n{examples['instruction1']}\ 59 | \nSolution:\n{examples['solution1']}\ 60 | \nInstruction:\n{examples['instruction2']}\ 61 | \nSolution:\n{examples['solution2']}\ 62 | \nInstruction:\n{text}\ 63 | \nSolution:\n" 64 | assert ( 65 | prompt.count("Solution:\n") == 3 66 | ), "Splitting operation in postprocess_generation is invalid" 67 | return entry + prompt 68 | 69 | def get_prompt(self, doc): 70 | """Builds the prompt for the LM to generate from.""" 71 | examples = self.fewshot_examples() 72 | text_column = "rewritten_intent" if doc["rewritten_intent"] else "intent" 73 | text = doc[text_column].strip() 74 | entry = "Answer the following instructions in one line of Python code:\n" 75 | prompt = self.two_shot_prompt(entry, text, examples) 76 | return prompt 77 | 78 | def get_reference(self, doc): 79 | """Builds the reference solution for the doc (sample from the test dataset).""" 80 | return doc["snippet"] 81 | 82 | def postprocess_generation(self, generation, idx): 83 | """Defines the postprocessing for a LM generation. 84 | :param generation: str 85 | code generation from LM 86 | :param idx: int 87 | index of doc in the dataset to which the generation belongs 88 | (not used for this task) 89 | """ 90 | output = generation.split("Solution:\n", 3)[-1].strip() 91 | return output 92 | 93 | def process_results(self, generations, references): 94 | """Takes the list of LM generations and evaluates them against ground truth references, 95 | returning the metric for the generations. 
96 | :param generations: list(list(str)) 97 | list of lists containing generations 98 | :param references: list(str) 99 | list of str containing references 100 | """ 101 | bleu = load("bleu") 102 | gens = [gen[0] for gen in generations] 103 | results = bleu.compute( 104 | references=references, predictions=gens, max_order=4, smooth=True 105 | ) 106 | return results 107 | -------------------------------------------------------------------------------- /human_eval/finetuning/Code-to-text/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from datasets import load_dataset 4 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 5 | Trainer, TrainingArguments, set_seed) 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 12 | ) 13 | parser.add_argument("--language", type=str, default="Python") 14 | parser.add_argument("--max_length", type=int, default=1024) 15 | parser.add_argument("--num_epochs", type=int, default=5) 16 | parser.add_argument("--batch_size", type=int, default=6) 17 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 18 | parser.add_argument("--freeze", type=bool, default=True) 19 | parser.add_argument("--learning_rate", type=float, default=5e-4) 20 | parser.add_argument("--seed", type=int, default=0) 21 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 22 | parser.add_argument("--num_warmup_steps", type=int, default=10) 23 | parser.add_argument("--weight_decay", type=float, default=0.01) 24 | parser.add_argument("--output_dir", type=str, default="./results") 25 | parser.add_argument("--push_to_hub", type=bool, default=False) 26 | parser.add_argument("--model_hub_name", type=str, default="codeclone_model") 27 | return parser.parse_args() 28 | 29 | 30 | def main(): 31 | args = get_args() 32 | set_seed(args.seed) 33 | 34 | ds = load_dataset("code_x_glue_ct_code_to_text", args.language) 35 | 36 | print("Loading tokenizer and model") 37 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 38 | tokenizer.pad_token = tokenizer.eos_token 39 | model = AutoModelForSequenceClassification.from_pretrained( 40 | args.model_ckpt, num_labels=2 41 | ) 42 | model.config.pad_token_id = model.config.eos_token_id 43 | 44 | if args.freeze: 45 | for param in model.roberta.parameters(): 46 | param.requires_grad = False 47 | 48 | def tokenize(example): 49 | if args.language == "Python": 50 | # remove docstring from code 51 | chunks = example["code"].split('"""') 52 | code = chunks[0].strip() + chunks[2] 53 | else: 54 | code = example["code"] 55 | inputs = tokenizer( 56 | code, padding="max_length", truncation=True, max_length=args.max_length 57 | ) 58 | labels = tokenizer( 59 | example["docstring"], 60 | padding="max_length", 61 | truncation=True, 62 | max_length=args.max_length, 63 | ).input_ids 64 | labels_with_ignore_index = [] 65 | for labels_example in labels: 66 | labels_example = [label if label != 0 else -100 for label in labels_example] 67 | labels_with_ignore_index.append(labels_example) 68 | 69 | return { 70 | "input_ids": inputs["input_ids"], 71 | "attention_mask": inputs["attention_mask"], 72 | "label": labels_with_ignore_index, 73 | } 74 | 75 | tokenized_datasets = ds.map( 76 | tokenize, 77 | batched=True, 78 | remove_columns=ds["train"].column_names, 79 | ) 80 | 81 | training_args = TrainingArguments( 82 | 
output_dir=args.output_dir, 83 | learning_rate=args.learning_rate, 84 | lr_scheduler_type=args.lr_scheduler_type, 85 | evaluation_strategy="epoch", 86 | save_strategy="epoch", 87 | logging_strategy="epoch", 88 | per_device_train_batch_size=args.batch_size, 89 | per_device_eval_batch_size=args.batch_size, 90 | num_train_epochs=args.num_epochs, 91 | gradient_accumulation_steps=args.gradient_accumulation_steps, 92 | weight_decay=args.weight_decay, 93 | run_name=f"code-to-text-{args.language}", 94 | report_to="wandb", 95 | ) 96 | 97 | trainer = Trainer( 98 | model=model, 99 | args=training_args, 100 | train_dataset=tokenized_datasets["train"], 101 | eval_dataset=tokenized_datasets["validation"], 102 | tokenizer=tokenizer, 103 | ) 104 | 105 | print("Training...") 106 | trainer.train() 107 | 108 | # push the model to the Hugging Face hub 109 | if args.push_to_hub: 110 | model.push_to_hub(args.model_hub_name) 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | import os 4 | import warnings 5 | 6 | from bigcode_eval import tasks 7 | from bigcode_eval.generation import parallel_generations 8 | 9 | _WARNING = """ 10 | ################################################################################ 11 | !!!WARNING!!! 12 | ################################################################################ 13 | The "code_eval"/"apps_metric" you are about to use, execute untrusted 14 | model-generated code in Python. 15 | Although it is highly unlikely that model-generated code will do something 16 | overtly malicious in response to this test suite, model-generated code may act 17 | destructively due to a lack of model capability or alignment. 18 | Users are strongly encouraged to sandbox this evaluation suite so that it 19 | does not perform destructive actions on their host or network. For more 20 | information on how OpenAI sandboxes its code, see the paper "Evaluating Large 21 | Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). 22 | Once you have read this disclaimer and taken appropriate precautions, set the argument 23 | "allow_code_execution" to True. 
24 | ################################################################################\ 25 | """ 26 | 27 | 28 | class Evaluator: 29 | def __init__(self, accelerator, model, tokenizer, args): 30 | self.accelerator = accelerator 31 | self.model = model 32 | self.tokenizer = tokenizer 33 | self.args = args 34 | 35 | # setup arguments 36 | self.metric_output_path = args.metric_output_path 37 | 38 | # code evaluation permission 39 | self.allow_code_execution = args.allow_code_execution 40 | 41 | def generate_text(self, task_name): 42 | task = tasks.get_task(task_name, self.args) 43 | dataset = task.get_dataset() 44 | # if args.limit is None, use all samples 45 | n_tasks = self.args.limit if self.args.limit else len(dataset) 46 | references = [task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start+n_tasks)] 47 | 48 | if self.args.check_references: 49 | if "get_solution" in inspect.signature(task.get_reference).parameters: 50 | solutions = [[task.get_reference(dataset[i], get_solution=True)] for i in range(self.args.limit_start, self.args.limit_start+n_tasks)] 51 | else: 52 | solutions = [[ref] for ref in references] 53 | return solutions, references 54 | 55 | generations = parallel_generations( 56 | task, 57 | dataset, 58 | self.accelerator, 59 | self.model, 60 | self.tokenizer, 61 | n_tasks=n_tasks, 62 | args=self.args, 63 | ) 64 | if len(generations[0]) > self.args.n_samples: 65 | generations = [l[: self.args.n_samples] for l in generations] 66 | warnings.warn( 67 | f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}" 68 | ) 69 | return generations, references 70 | 71 | def evaluate(self, task_name): 72 | task = tasks.get_task(task_name, self.args) 73 | if task.requires_execution and not self.allow_code_execution: 74 | raise ValueError(_WARNING) 75 | 76 | generations, references = self.generate_text(task_name) 77 | 78 | if self.accelerator.is_main_process: 79 | if not self.args.load_generations_path: 80 | if self.args.save_generations: 81 | with open(self.args.save_generations_path, "w") as fp: 82 | json.dump(generations, fp) 83 | print( 84 | f"generations were saved at {self.args.save_generations_path}" 85 | ) 86 | if self.args.save_references: 87 | with open("references.json", "w") as fp: 88 | json.dump(references, fp) 89 | print("references were saved at references.json") 90 | 91 | # make sure tokenizer plays nice with multiprocessing 92 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 93 | if self.allow_code_execution and task.requires_execution: 94 | os.environ["HF_ALLOW_CODE_EVAL"] = "1" 95 | print("Evaluating generations...") 96 | results = task.process_results(generations, references) 97 | return results 98 | -------------------------------------------------------------------------------- /sensitive_memorization/generate_secret_mask.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from datasets import load_from_disk 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | 7 | 8 | def mask_example(example): 9 | content = example['content'].encode('utf-8', 'ignore').decode('utf-8') 10 | secrets = eval(example['secrets']) 11 | sorted_secrets = sorted(secrets, key=lambda secret: secret['start']) 12 | # print(sorted_secrets) 13 | 14 | encoding = tokenizer(content, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length', return_offsets_mapping=True) 
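    # A worked illustration of the span-overlap test applied a few lines below (numbers made up,
    # not taken from the dataset): a token whose offset_mapping span is (10, 14) is flagged for a
    # secret covering characters 12..20, since 10 <= 20 and 14 >= 12, while a token spanning (0, 5)
    # is not, since 5 < 12. OR-ing these per-secret checks together is what builds secret_mask.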
15 | content_token_ids = encoding.input_ids 16 | offset_mapping = encoding.offset_mapping 17 | 18 | secret_mask = torch.zeros_like(content_token_ids, dtype=torch.bool) 19 | for secret in sorted_secrets: 20 | secret_start = secret['start'] 21 | secret_end = secret['end'] 22 | offset_mapping_start_check = offset_mapping[..., 0] <= secret_end # !!! < or <= 23 | offset_mapping_end_check = offset_mapping[..., 1] >= secret_start # !!! > or >= 24 | secret_mask = secret_mask + (offset_mapping_start_check.int() * offset_mapping_end_check.int()).bool() 25 | secret_token_ids = content_token_ids[secret_mask] 26 | # print(secret_token_ids) 27 | # print(tokenizer.convert_ids_to_tokens(secret_token_ids)) 28 | if secret_token_ids.shape[0] == 0: 29 | example['keep_flag'] = False 30 | else: 31 | example['keep_flag'] = True 32 | 33 | example['content'] = content 34 | example['secrets'] = str(sorted_secrets) 35 | example['content_token_ids'] = content_token_ids 36 | example['offset_mapping'] = offset_mapping 37 | example['secret_mask'] = secret_mask 38 | 39 | return example 40 | 41 | 42 | def filter_example(example): 43 | content = example['content'] 44 | secrets = eval(example['secrets']) 45 | 46 | encoding = tokenizer(content, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length', return_offsets_mapping=True) 47 | content_token_ids = encoding.input_ids 48 | offset_mapping = encoding.offset_mapping 49 | if secrets[0]['start'] > offset_mapping[0, -1, -1]: 50 | return False 51 | else: 52 | return True 53 | 54 | 55 | def update_example(example): 56 | content = example['content'] 57 | secrets = eval(example['secrets']) 58 | 59 | encoding = tokenizer(content, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length', return_offsets_mapping=True) 60 | content_token_ids = encoding.input_ids 61 | offset_mapping = encoding.offset_mapping 62 | if offset_mapping[0, -1, -1] == 0: 63 | return example 64 | elif offset_mapping[0, -1, -1] >= secrets[-1]['start']: 65 | return example 66 | else: 67 | truncation_index = len(secrets) 68 | for index in range(len(secrets)): 69 | if secrets[index]['start'] > offset_mapping[0, -1, -1]: 70 | truncation_index = index 71 | break 72 | example['secrets'] = str(secrets[:truncation_index]) 73 | example['number_secrets'] = truncation_index 74 | return example 75 | 76 | 77 | def main(): 78 | dataset_path = f"./codeparrot-clean-train-secrets-masked-{args.model_name_or_path.split('/')[-1]}" 79 | if os.path.exists(dataset_path): 80 | ds_pii = load_from_disk(dataset_path) 81 | else: 82 | ds_pii = load_from_disk('codeparrot-clean-train-secrets-filtered') 83 | ds_pii = ds_pii.map(mask_example, num_proc=48) 84 | ds_pii = ds_pii.filter(lambda example: example['keep_flag'], batched=True, batch_size=1000, num_proc=48) 85 | ds_pii = ds_pii.filter(filter_example, num_proc=32) 86 | ds_pii = ds_pii.map(update_example, num_proc=32) 87 | ds_pii.save_to_disk(dataset_path) 88 | print(ds_pii) 89 | 90 | 91 | if __name__ == '__main__': 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--model_name_or_path", default="codeparrot/codeparrot", type=str, 94 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 95 | args = parser.parse_args() 96 | print(args) 97 | 98 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, local_files_only=True) 99 | # tokenizer.pad_token = tokenizer.eos_token 100 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 101 | 102 | # max_seq_len = 1024 103 | 
max_seq_len = 512 104 | 105 | main() 106 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/codexglue_text_to_text.py: -------------------------------------------------------------------------------- 1 | """ 2 | CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation 3 | https://arxiv.org/abs/2102.04664 4 | 5 | Text to text task from CodeXGlue (documentation translation) 6 | """ 7 | 8 | import json 9 | import os 10 | import re 11 | 12 | from evaluate import load 13 | 14 | from bigcode_eval.base import Task 15 | 16 | _CITATION = """ 17 | @article{CodeXGLUE, 18 | title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, 19 | year={2020},} 20 | """ 21 | 22 | SOURCE_LANG = { 23 | "da_en": "danish", 24 | "zh_en": "chinese", 25 | "no_en": "norwegian", 26 | "lv_en": "latvian", 27 | } 28 | 29 | 30 | def create_all_tasks(): 31 | """Creates a dictionary of tasks from a list of languages 32 | :return: {task_name: task} 33 | e.g. {codexglue_text_to_text-da_en: Task, codexglue_text_to_text-zh_en: Task} 34 | """ 35 | return { 36 | f"codexglue_text_to_text-{translation_task}": create_task(translation_task) 37 | for translation_task in SOURCE_LANG 38 | } 39 | 40 | 41 | def create_task(translation_task): 42 | class CodexglueTextToTextTask(CodexglueTextToText): 43 | def __init__(self): 44 | super().__init__(translation_task) 45 | 46 | return CodexglueTextToTextTask 47 | 48 | 49 | class CodexglueTextToText(Task): 50 | 51 | DATASET_PATH = "code_x_glue_tt_text_to_text" 52 | DATASET_NAME = None 53 | 54 | def __init__(self, translation_task): 55 | self.DATASET_NAME = translation_task 56 | stop_words = ["\n"] 57 | requires_execution = False 58 | super().__init__(stop_words, requires_execution) 59 | 60 | def get_dataset(self): 61 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 62 | return self.dataset["test"] 63 | 64 | def fewshot_examples(self): 65 | """Loads and returns the few-shot examples for the task if they exist.""" 66 | with open( 67 | "bigcode_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json", 68 | "r", 69 | ) as file: 70 | examples = json.load(file) 71 | return examples 72 | 73 | @staticmethod 74 | def two_shot_prompt(entry, text, examples, language): 75 | """Two shot prompt format as source & target language documentation""" 76 | prompt = f"\n{language.title()}:\n{examples['source1']}\ 77 | \nEnglish:\n{examples['target1']}\ 78 | \n{language.title()}:\n{examples['source2']}\ 79 | \nEnglish:\n{examples['target2']}\ 80 | \n{language.title()}:\n{text}\ 81 | \nEnglish:\n" 82 | return entry + prompt 83 | 84 | def get_prompt(self, doc): 85 | """Builds the prompt for the LM to generate from.""" 86 | language = SOURCE_LANG[self.DATASET_NAME] 87 | text = doc["source"] 88 | entry = f"Translate the following documentation from {language.title()} to English:\n" 89 | examples = self.fewshot_examples() 90 | examples = examples[language] 91 | prompt = self.two_shot_prompt(entry, text, examples, language) 92 | return prompt 93 | 94 | def get_reference(self, doc): 95 | """Builds the reference solution for the doc (sample from the test dataset).""" 96 | return doc["target"].strip() 97 | 98 | def postprocess_generation(self, generation, idx): 99 | """Defines the postprocessing for a LM generation. 
100 | :param generation: str 101 | code generation from LM 102 | :param idx: int 103 | index of doc in the dataset to which the generation belongs 104 | (not used for this task) 105 | """ 106 | output = generation.split("\nEnglish:\n", 3)[-1].strip() 107 | return output 108 | 109 | def process_results(self, generations, references): 110 | """Takes the list of LM generations and evaluates them against ground truth references, 111 | returning the metric for the generations. 112 | :param generations: list(list(str)) 113 | list of lists containing generations 114 | :param references: list(str) 115 | list of str containing references 116 | """ 117 | bleu = load("bleu") 118 | gens = [gen[0] for gen in generations] 119 | results = bleu.compute( 120 | references=references, predictions=gens, max_order=4, smooth=True 121 | ) 122 | return results 123 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/apps.py: -------------------------------------------------------------------------------- 1 | """Measuring Coding Challenge Competence With APPS 2 | https://arxiv.org/abs/2105.09938 3 | 4 | APPS is a benchmark for code generation with 10000 problems. With three difficulty levels: introductory, interview and competition. 5 | It can be used to evaluate the ability of language models to generate code from natural language specifications. 6 | 7 | Homepage: https://github.com/hendrycks/apps 8 | """ 9 | 10 | import json 11 | 12 | from evaluate import load 13 | 14 | from bigcode_eval.base import Task 15 | 16 | _CITATION = """ 17 | @article{hendrycksapps2021, 18 | title={Measuring Coding Challenge Competence With APPS}, 19 | author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, 20 | journal={NeurIPS}, 21 | year={2021} 22 | } 23 | """ 24 | 25 | 26 | LEVELS = ["introductory", "interview", "competition"] 27 | 28 | 29 | def create_all_tasks(): 30 | """Creates a dictionary of tasks from a list of levels 31 | :return: {task_name: task} 32 | e.g. {apps-interview: Task, apps-competitoon: Task} 33 | """ 34 | return {f"apps-{level}": create_task(level) for level in LEVELS} 35 | 36 | 37 | def create_task(level): 38 | class APPS(GeneralAPPS): 39 | def __init__(self): 40 | super().__init__(level) 41 | 42 | return APPS 43 | 44 | 45 | class GeneralAPPS(Task): 46 | """A task represents an entire benchmark including its dataset, problems, 47 | answers, generation settings and evaluation methods. 48 | """ 49 | 50 | DATASET_PATH = "codeparrot/apps" 51 | DATASET_NAME = None 52 | 53 | def __init__(self, level): 54 | self.DATASET_NAME = level 55 | super().__init__( 56 | stop_words=["\nQUESTION", "\n---", "\nANSWER"], 57 | requires_execution=True, 58 | ) 59 | 60 | def get_dataset(self): 61 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 62 | return self.dataset["test"] 63 | 64 | def get_prompt(self, doc): 65 | """Generate prompts for APPS 66 | Finetuning setup: prompt=question with some starter code and function name if they exist. 67 | We also specify the type of the prompt, i.e. whether it is call-based or standard input-based. 
68 | """ 69 | starter_code = None if len(doc["starter_code"]) == 0 else doc["starter_code"] 70 | try: 71 | input_outpout = json.loads(doc["input_output"]) 72 | fn_name = ( 73 | None if not input_outpout.get("fn_name") else input_outpout["fn_name"] 74 | ) 75 | except ValueError: 76 | fn_name = None 77 | prompt = "\nQUESTION:\n" 78 | prompt += doc["question"] 79 | if starter_code: 80 | prompt += starter_code 81 | if not fn_name: 82 | call_format = "\nUse Standard Input format" 83 | prompt += call_format 84 | else: 85 | call_format = "\nUse Call-Based format" 86 | prompt += call_format 87 | prompt += "\nANSWER:\n" 88 | return prompt 89 | 90 | def get_reference(self, doc): 91 | """Builds the reference solution for the doc (sample from the test dataset).""" 92 | return None 93 | 94 | def postprocess_generation(self, generation, idx): 95 | """Defines the postprocessing for a LM generation. 96 | :param generation: str 97 | code generation from LM 98 | :param idx: int 99 | index of doc in the dataset to which the generation belongs 100 | (not used for APPS) 101 | """ 102 | try: 103 | generation = generation.split("\nANSWER:", 1)[1] 104 | except IndexError: 105 | # happens when prompts were very long and got truncated 106 | pass 107 | return generation 108 | 109 | def process_results(self, generations, references): 110 | """Takes the list of LM generations and evaluates them against ground truth references, 111 | returning the metric for the generations. 112 | :param generations: list(list(str)) 113 | list of lists containing generations 114 | :param references: list(str) 115 | list of str containing refrences (not needed for APPS Task) 116 | """ 117 | code_metric = load("codeparrot/apps_metric") 118 | results = code_metric.compute( 119 | predictions=generations, k_list=[1, 10, 100], level=self.DATASET_NAME 120 | ) 121 | return results 122 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/pal_metric/pal_code_exec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from collections import Counter, defaultdict 4 | from concurrent.futures import ThreadPoolExecutor, as_completed 5 | 6 | from bigcode_eval.tasks.custom_metrics.pal_metric.python_executor import run_program 7 | 8 | # adapted from https://github.com/huggingface/evaluate/blob/main/metrics/code_eval/code_eval.py 9 | 10 | _WARNING = """ 11 | ################################################################################ 12 | !!!WARNING!!! 13 | ################################################################################ 14 | The "code_eval" metric executes untrusted model-generated code in Python. 15 | Although it is highly unlikely that model-generated code will do something 16 | overtly malicious in response to this test suite, model-generated code may act 17 | destructively due to a lack of model capability or alignment. 18 | Users are strongly encouraged to sandbox this evaluation suite so that it 19 | does not perform destructive actions on their host or network. For more 20 | information on how OpenAI sandboxes its code, see the paper "Evaluating Large 21 | Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). 22 | Once you have read this disclaimer and taken appropriate precautions, 23 | set the environment variable HF_ALLOW_CODE_EVAL="1". 
Within Python you can do this 24 | with: 25 | >>> import os 26 | >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1" 27 | ################################################################################\ 28 | """ 29 | 30 | 31 | def compute( 32 |     predictions, 33 |     references, 34 |     num_workers=4, 35 |     timeout=3.0, 36 |     majority_voting=False, 37 |     answer_symbol=None, 38 | ): 39 |     """ 40 |     Returns the scores 41 | 42 |     :param majority_voting: bool 43 |         Takes the majority voted answer to evaluate against the reference, defaults to False 44 | 45 |     :param answer_symbol: str 46 |         If specified, the result of execution is fetched from the program's global context, 47 |         the program is expected to have the variable named in `answer_symbol` available in its globals. 48 |         if not specified, the result is fetched from the stdout of the execution 49 |         defaults to None. 50 | 51 |     """ 52 | 53 |     if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1": 54 |         raise ValueError(_WARNING) 55 | 56 |     if os.name == "nt": 57 |         raise NotImplementedError("This metric is currently not supported on Windows.") 58 | 59 |     with ThreadPoolExecutor(max_workers=num_workers) as executor: 60 |         futures = [] 61 |         completion_id = Counter() 62 |         n_samples = 0 63 |         results = defaultdict(list) 64 | 65 |         for task_id, candidates in enumerate(predictions): 66 |             for candidate in candidates: 67 |                 args = (candidate, timeout, task_id, completion_id[task_id]) 68 |                 if answer_symbol: 69 |                     args += (answer_symbol,) 70 |                 future = executor.submit(run_program, *args) 71 |                 futures.append(future) 72 |                 completion_id[task_id] += 1 73 |                 n_samples += 1 74 | 75 |         for future in as_completed(futures): 76 |             result = future.result() 77 |             results[result["task_id"]].append((result["completion_id"], result)) 78 | 79 |     answers = [None] * len(results) 80 |     for result in results.values(): 81 |         result.sort() 82 |         task_id = result[0][1]["task_id"] 83 |         # filtering the failed generations to avoid influencing majority voting 84 |         eval_answers = [ 85 |             r[1]["result"] 86 |             for r in result 87 |             if isinstance(r[1]["result"], str) 88 |             and not r[1]["result"].startswith("failed:") 89 |         ] 90 |         # if all generations failed - default to empty str for scoring 91 |         eval_answers = [""] if len(eval_answers) == 0 else eval_answers 92 |         if majority_voting: 93 |             counter = Counter(eval_answers) 94 |             eval_answers = [counter.most_common()[0][0]] 95 | 96 |         if not majority_voting and len(eval_answers) > 1: 97 |             warnings.warn( 98 |                 f"Multiple generations found for a task without setting `majority_voting` to True, defaulting to the answer from the first generation" 99 |             ) 100 |         answers[task_id] = eval_answers[0] 101 | 102 |     scores = [] 103 |     # Number of code generations that failed execution.
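    # Illustrative values (assumed, not from a real run): the tolerance check below scores
    #     ans = "8.0", ref = "8"  ->  abs(8.0 - 8) < 1e-3  ->  score = 1
    # while a non-numeric answer such as "" raises ValueError and is counted in `errored`.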
104 | errored = 0 105 | for task_id, (ans, ref) in enumerate(zip(answers, references)): 106 | try: 107 | score = 1 if abs(float(ans) - float(ref)) < 1e-3 else 0 108 | except ValueError as e: 109 | errored += 1 110 | score = 0 111 | 112 | scores.append(score) 113 | 114 | return {"accuracy": sum(scores) / len(scores), "num_failed_execution": errored} 115 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeDefect/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from datasets import ClassLabel, load_dataset 6 | from evaluate import load 7 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 8 | DataCollatorWithPadding, Trainer, TrainerCallback, 9 | TrainingArguments, set_seed) 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 16 | ) 17 | parser.add_argument("--max_length", type=int, default=1024) 18 | parser.add_argument("--num_epochs", type=int, default=5) 19 | parser.add_argument("--batch_size", type=int, default=6) 20 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 21 | parser.add_argument("--freeze", type=bool, default=True) 22 | parser.add_argument("--learning_rate", type=float, default=5e-4) 23 | parser.add_argument("--seed", type=int, default=0) 24 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 25 | parser.add_argument("--num_warmup_steps", type=int, default=10) 26 | parser.add_argument("--weight_decay", type=float, default=0.01) 27 | parser.add_argument("--output_dir", type=str, default="./results") 28 | parser.add_argument("--push_to_hub", type=bool, default=False) 29 | parser.add_argument("--model_hub_name", type=str, default="codedefect_model") 30 | return parser.parse_args() 31 | 32 | 33 | metric = load("accuracy") 34 | 35 | 36 | def compute_metrics(eval_pred): 37 | predictions, labels = eval_pred 38 | predictions = np.argmax(predictions, axis=1) 39 | return metric.compute(predictions=predictions, references=labels) 40 | 41 | 42 | class CustomCallback(TrainerCallback): 43 | def __init__(self, trainer) -> None: 44 | super().__init__() 45 | self._trainer = trainer 46 | 47 | def on_epoch_end(self, args, state, control, **kwargs): 48 | if control.should_evaluate: 49 | control_copy = deepcopy(control) 50 | self._trainer.evaluate( 51 | eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" 52 | ) 53 | return control_copy 54 | 55 | 56 | def main(): 57 | args = get_args() 58 | set_seed(args.seed) 59 | 60 | ds = load_dataset("code_x_glue_cc_defect_detection") 61 | labels = ClassLabel(num_classes=2, names=[True, False]) 62 | ds = ds.cast_column("target", labels) 63 | ds = ds.rename_column("target", "label") 64 | 65 | print("Loading tokenizer and model") 66 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 67 | tokenizer.pad_token = tokenizer.eos_token 68 | model = AutoModelForSequenceClassification.from_pretrained( 69 | args.model_ckpt, num_labels=2 70 | ) 71 | model.config.pad_token_id = model.config.eos_token_id 72 | 73 | if args.freeze: 74 | for param in model.roberta.parameters(): 75 | param.requires_grad = False 76 | 77 | def tokenize(example): 78 | inputs = tokenizer(example["func"], truncation=True, max_length=args.max_length) 79 | return { 80 | "input_ids": inputs["input_ids"], 81 | 
"attention_mask": inputs["attention_mask"], 82 | "label": example["target"], 83 | } 84 | 85 | tokenized_datasets = ds.map( 86 | tokenize, 87 | batched=True, 88 | remove_columns=["id", "func", "project", "commit_id"], 89 | ) 90 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 91 | 92 | training_args = TrainingArguments( 93 | output_dir=args.output_dir, 94 | learning_rate=args.learning_rate, 95 | lr_scheduler_type=args.lr_scheduler_type, 96 | evaluation_strategy="epoch", 97 | save_strategy="epoch", 98 | logging_strategy="epoch", 99 | per_device_train_batch_size=args.batch_size, 100 | per_device_eval_batch_size=args.batch_size, 101 | num_train_epochs=args.num_epochs, 102 | gradient_accumulation_steps=args.gradient_accumulation_steps, 103 | weight_decay=args.weight_decay, 104 | metric_for_best_model="accuracy", 105 | run_name="code-defect-c", 106 | report_to="wandb", 107 | ) 108 | 109 | trainer = Trainer( 110 | model=model, 111 | args=training_args, 112 | train_dataset=tokenized_datasets["train"], 113 | eval_dataset=tokenized_datasets["validation"], 114 | tokenizer=tokenizer, 115 | data_collator=data_collator, 116 | compute_metrics=compute_metrics, 117 | ) 118 | 119 | print("Training...") 120 | trainer.add_callback(CustomCallback(trainer)) 121 | trainer.train() 122 | 123 | result = trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 124 | print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}") 125 | 126 | # push the model to the Hugging Face hub 127 | if args.push_to_hub: 128 | model.push_to_hub(args.model_hub_name) 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeClone/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from datasets import ClassLabel, load_dataset 6 | from evaluate import load 7 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 8 | DataCollatorWithPadding, Trainer, TrainerCallback, 9 | TrainingArguments, set_seed) 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 16 | ) 17 | parser.add_argument("--max_length", type=int, default=1024) 18 | parser.add_argument("--num_epochs", type=int, default=5) 19 | parser.add_argument("--batch_size", type=int, default=6) 20 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 21 | parser.add_argument("--freeze", type=bool, default=True) 22 | parser.add_argument("--learning_rate", type=float, default=5e-4) 23 | parser.add_argument("--seed", type=int, default=0) 24 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 25 | parser.add_argument("--num_warmup_steps", type=int, default=10) 26 | parser.add_argument("--weight_decay", type=float, default=0.01) 27 | parser.add_argument("--output_dir", type=str, default="./results") 28 | parser.add_argument("--push_to_hub", type=bool, default=False) 29 | parser.add_argument("--model_hub_name", type=str, default="codeclone_model") 30 | return parser.parse_args() 31 | 32 | 33 | metric = load("accuracy") 34 | 35 | 36 | def compute_metrics(eval_pred): 37 | predictions, labels = eval_pred 38 | predictions = np.argmax(predictions, axis=1) 39 | return metric.compute(predictions=predictions, references=labels) 40 | 41 | 42 | class 
CustomCallback(TrainerCallback): 43 | def __init__(self, trainer) -> None: 44 | super().__init__() 45 | self._trainer = trainer 46 | 47 | def on_epoch_end(self, args, state, control, **kwargs): 48 | if control.should_evaluate: 49 | control_copy = deepcopy(control) 50 | self._trainer.evaluate( 51 | eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" 52 | ) 53 | return control_copy 54 | 55 | 56 | def main(): 57 | args = get_args() 58 | set_seed(args.seed) 59 | 60 | ds = load_dataset("code_x_glue_cc_clone_detection_big_clone_bench") 61 | labels = ClassLabel(num_classes=2, names=[True, False]) 62 | ds = ds.cast_column("label", labels) 63 | 64 | print("Loading tokenizer and model") 65 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 66 | tokenizer.pad_token = tokenizer.eos_token 67 | model = AutoModelForSequenceClassification.from_pretrained( 68 | args.model_ckpt, num_labels=2 69 | ) 70 | model.config.pad_token_id = model.config.eos_token_id 71 | 72 | if args.freeze: 73 | for param in model.roberta.parameters(): 74 | param.requires_grad = False 75 | 76 | def tokenize(example): 77 | inputs = tokenizer( 78 | example["func1"], 79 | example["func2"], 80 | truncation=True, 81 | max_length=args.max_length, 82 | ) 83 | return { 84 | "input_ids": inputs["input_ids"], 85 | "attention_mask": inputs["attention_mask"], 86 | } 87 | 88 | tokenized_datasets = ds.map( 89 | tokenize, 90 | batched=True, 91 | remove_columns=["id", "id1", "id2", "func1", "func2"], 92 | ) 93 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 94 | 95 | training_args = TrainingArguments( 96 | output_dir=args.output_dir, 97 | learning_rate=args.learning_rate, 98 | lr_scheduler_type=args.lr_scheduler_type, 99 | evaluation_strategy="epoch", 100 | save_strategy="epoch", 101 | logging_strategy="epoch", 102 | per_device_train_batch_size=args.batch_size, 103 | per_device_eval_batch_size=args.batch_size, 104 | num_train_epochs=args.num_epochs, 105 | gradient_accumulation_steps=args.gradient_accumulation_steps, 106 | weight_decay=args.weight_decay, 107 | metric_for_best_model="accuracy", 108 | run_name="code-clone-java", 109 | report_to="wandb", 110 | ) 111 | 112 | trainer = Trainer( 113 | model=model, 114 | args=training_args, 115 | train_dataset=tokenized_datasets["train"], 116 | eval_dataset=tokenized_datasets["validation"], 117 | tokenizer=tokenizer, 118 | data_collator=data_collator, 119 | compute_metrics=compute_metrics, 120 | ) 121 | 122 | print("Training...") 123 | trainer.add_callback(CustomCallback(trainer)) 124 | trainer.train() 125 | 126 | result = trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 127 | print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}") 128 | 129 | # push the model to the Hugging Face hub 130 | if args.push_to_hub: 131 | model.push_to_hub(args.model_hub_name) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/python_bugs.py: -------------------------------------------------------------------------------- 1 | """Python Bugs 2 | https://proceedings.mlr.press/v162/he22a.html 3 | 4 | This dataset is taken from the preprossing done by CarperAI (https://carper.ai/diff-models-a-new-way-to-edit-code). 5 | It is uploaded here: https://huggingface.co/datasets/Muennighoff/python-bugs 6 | 7 | Make sure to run with sufficient context length (512 is not enough for e.g. CodeGen). 
8 | """ 9 | 10 | import re 11 | 12 | from evaluate import load 13 | from bigcode_eval.base import Task 14 | import tqdm 15 | 16 | _CITATION = """ 17 | @inproceedings{he2022distribution, 18 | title={On distribution shift in learning-based bug detectors}, 19 | author={He, Jingxuan and Beurer-Kellner, Luca and Vechev, Martin}, 20 | booktitle={International Conference on Machine Learning}, 21 | pages={8559--8580}, 22 | year={2022}, 23 | organization={PMLR} 24 | } 25 | """ 26 | 27 | MUTATE_TO_TASK_TO_PROMPT = { 28 | "prompt_carper": { 29 | "bin-op": "# Fixed binary operator", 30 | "var-misuse": "# Fixed incorrect variable name", 31 | }, 32 | "prompt_present": { 33 | "bin-op": "# Fix binary operator", 34 | "var-misuse": "# Fix incorrect variable name", 35 | }, 36 | # Same as prompt_carper, but other parts are still different 37 | "prompt": { 38 | "bin-op": "# Fixed binary operator", 39 | "var-misuse": "# Fixed incorrect variable name", 40 | }, 41 | "edit": { 42 | "bin-op": "Fix binary operator", 43 | "var-misuse": "Fix incorrect variable name", 44 | }, 45 | } 46 | 47 | def mutate_code(input_code, task, prompt="prompt"): 48 | """ 49 | Create template for code mutation. 50 | Args: 51 | input_code: code to be mutated 52 | task: task to be performed 53 | prompt: (Optional) 'edit' or 'prompt' 54 | Returns: 55 | template for code mutation 56 | """ 57 | instruction = MUTATE_TO_TASK_TO_PROMPT[prompt][task] 58 | if prompt == "prompt_carper": 59 | return f"# A buggy implementation\n#!/usr/bin/python3\n{input_code}\n{instruction}\ndef" 60 | if prompt == "prompt": 61 | return f"#!/usr/bin/python3\n# A buggy implementation\n{input_code}\n{instruction}\ndef" 62 | if prompt == "edit": 63 | return f"{input_code}{instruction}" 64 | else: 65 | raise ValueError(f"Unknown prompt: {prompt}") 66 | 67 | 68 | class PythonBugs(Task): 69 | 70 | DATASET_PATH = "Muennighoff/python-bugs" 71 | 72 | def __init__(self, prompt="prompt"): 73 | super().__init__( 74 | # Correct code always starts with `def ...` and is a single function, so stop everything else 75 | # Since a function always has a tab, stop when the first line does not have a tab 76 | stop_words=[ 77 | "\nclass", "\n#", "\ndef", "\nassert", '\n"', "\nprint", "\nif", 78 | # Special cases for edit 79 | "", "", "", "<|endoftext|>", 80 | ], 81 | requires_execution=True, 82 | ) 83 | self.max_length_multiplier = 2.25 # Allow 2.25 times the length of the prompt 84 | self.prompt = prompt 85 | 86 | def get_dataset(self): 87 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 88 | dataset = self.dataset["train"] 89 | return dataset 90 | 91 | def get_prompt(self, doc): 92 | """Builds the prompt for the LM to generate from.""" 93 | return mutate_code(doc["prompt_code"], doc["task"], self.prompt) 94 | 95 | def get_reference(self, doc): 96 | """Builds the reference solution for the doc (sample from the test dataset).""" 97 | return doc["correct_code"] 98 | 99 | def postprocess_generation(self, generation, idx): 100 | """Defines the postprocessing for a LM generation. 
101 | :param generation: str 102 | code generation from LM 103 | :param idx: int 104 | index of doc in the dataset to which the generation belongs 105 | """ 106 | doc = self.get_dataset()[idx] 107 | prompt = self.get_prompt(doc) 108 | correct_code = self.get_reference(doc) 109 | output = generation[len(prompt):] 110 | if self.prompt.startswith("prompt"): 111 | output = "def" + output # Add def which is in the prompt back to the output 112 | return output[:len(correct_code)] 113 | 114 | def process_results(self, generations, references): 115 | """Takes the list of LM generations and evaluates them against ground truth references, 116 | returning the metric for the generations. 117 | :param generations: list(list(str)) 118 | list of lists containing generations 119 | :param references: list(str) 120 | list of str containing refrences 121 | """ 122 | num_correct = 0 123 | print("Scoring generations...") 124 | for i, ref in tqdm.tqdm(enumerate(references), total=len(references)): 125 | for gen in generations[i]: 126 | num_correct += int(gen == ref) 127 | accuracy = num_correct / len(references) / len(generations[0]) 128 | return {"mean exact match": accuracy} 129 | -------------------------------------------------------------------------------- /unlearning/dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | from torch.utils.data import Dataset 4 | from datasets import load_from_disk 5 | 6 | 7 | class CodeDataset(Dataset): 8 | def __init__(self, tokenizer, dataset_name, type_path, input_length, output_length, args): 9 | self.args = args 10 | self.tokenizer = tokenizer 11 | self.input_length = input_length 12 | self.output_length = output_length 13 | self.dataset_name = dataset_name 14 | self.type_path = type_path 15 | 16 | self.dataset = pd.read_csv(dataset_name, lineterminator='\n') 17 | self.dataset.columns = self.dataset.columns.str.replace('\r', '') 18 | if self.type_path == 'train': 19 | batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.ngpu 20 | if len(self.dataset) != batch_size: 21 | raise Exception("Effective batch size should be the same as length of train set.") 22 | 23 | def convert_to_features(self, example_batch): 24 | doc_id = torch.tensor(example_batch['doc_id'], dtype=torch.int) 25 | input_, target_ = example_batch['text'], example_batch['text'] 26 | 27 | source = self.tokenizer(input_, max_length=self.input_length, padding='max_length', truncation=True, return_tensors="pt") 28 | targets = self.tokenizer(target_, max_length=self.output_length, add_special_tokens=False, padding='max_length', truncation=True, return_tensors="pt") 29 | 30 | return source, targets, doc_id 31 | 32 | def __getitem__(self, index): 33 | data = self.dataset.iloc[index] 34 | source, targets, doc_id = self.convert_to_features(data) 35 | 36 | source_ids = source["input_ids"].squeeze() 37 | target_ids = targets["input_ids"].squeeze() 38 | 39 | src_mask = source["attention_mask"].squeeze() 40 | target_mask = targets["attention_mask"].squeeze() 41 | 42 | return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask, "doc_id": doc_id} 43 | 44 | def __len__(self): 45 | return len(self.dataset) 46 | 47 | 48 | class CodeSecretDataset(Dataset): 49 | def __init__(self, tokenizer, dataset_name, type_path, args): 50 | self.args = args 51 | self.tokenizer = tokenizer 52 | self.dataset_name = dataset_name 53 | self.type_path = type_path 54 | 55 
| self.dataset = load_from_disk(dataset_name) 56 | self.dataset = self.dataset.add_column('doc_id', list(range(len(self.dataset)))) 57 | if self.type_path == 'train': 58 | batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.ngpu 59 | if len(self.dataset) != batch_size: 60 | raise Exception("Effective batch size should be the same as length of train set.") 61 | 62 | def __getitem__(self, index): 63 | data = self.dataset[index] 64 | input_, target_ = data['content'], data['content'] 65 | 66 | # special_tokens = self.tokenizer.special_tokens_map 67 | # print("Special tokens:", special_tokens) 68 | 69 | source = self.tokenizer(input_, max_length=512, padding='max_length', truncation=True, return_tensors="pt") 70 | targets = self.tokenizer(target_, max_length=512, add_special_tokens=False, padding='max_length', truncation=True, return_tensors="pt") 71 | 72 | source_ids = source["input_ids"].squeeze() 73 | target_ids = targets["input_ids"].squeeze() 74 | 75 | source_mask = source["attention_mask"].squeeze() 76 | target_mask = targets["attention_mask"].squeeze() 77 | 78 | # secret_spans = [] 79 | # prefix_spans = [] 80 | # current_span = [] 81 | # for i, (token, is_secret) in enumerate(zip(source_ids, torch.BoolTensor(data['secret_mask']).squeeze())): 82 | # if is_secret: 83 | # if not current_span: 84 | # prefix_spans.append(source_ids[max(0, i-1):i]) 85 | # current_span.append(token) 86 | # elif current_span: 87 | # secret_spans.append(current_span) 88 | # current_span = [] 89 | # if current_span: 90 | # secret_spans.append(current_span) 91 | # for i, (prefix_span, secret_span) in enumerate(zip(prefix_spans, secret_spans)): 92 | # print(f"Secret Span {i+1}:") 93 | # print(f" Prefix Token IDs: {prefix_span}") 94 | # print(f" Secret Token IDs: {torch.stack(secret_span)}") 95 | 96 | # secret_token_ids = torch.LongTensor(data['secret_token_ids']) 97 | # print(secret_token_ids) 98 | # secret_prefix_token_ids = torch.LongTensor(data['secret_prefix_token_ids']) 99 | # print(secret_prefix_token_ids) 100 | 101 | item = { 102 | 'source_ids': source_ids, 'source_mask': source_mask, 'target_ids': target_ids, 'target_mask': target_mask, 103 | 'secret_mask': torch.BoolTensor(data['secret_mask']).squeeze(), 104 | # 'secret_mean_MA': torch.tensor(data['secret_mean_MA']), 105 | 'doc_id': torch.tensor(data['doc_id']) 106 | } 107 | # print(item) 108 | return item 109 | 110 | def __len__(self): 111 | return len(self.dataset) 112 | -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/Rust-top-repos.txt: -------------------------------------------------------------------------------- 1 | 16611 https://github.com/HigherOrderCO/Bend 2 | 14212 https://github.com/huggingface/candle 3 | 13004 https://github.com/astral-sh/uv 4 | 12248 https://github.com/biomejs/biome 5 | 11170 https://github.com/sxyazi/yazi 6 | 8723 https://github.com/eza-community/eza 7 | 7158 https://github.com/rolldown/rolldown 8 | 6795 https://github.com/LibNyanpasu/clash-nyanpasu 9 | 4792 https://github.com/microsoft/sudo 10 | 4678 https://github.com/paradedb/paradedb 11 | 4559 https://github.com/ynqa/jnv 12 | 4195 https://github.com/clockworklabs/SpacetimeDB 13 | 3688 https://github.com/Speykious/cve-rs 14 | 3587 https://github.com/loco-rs/loco 15 | 3132 https://github.com/AmrDeveloper/GQL 16 | 2979 https://github.com/mainmatter/100-exercises-to-learn-rust 17 | 2770 https://github.com/EricLBuehler/mistral.rs 18 | 2747 
https://github.com/getgrit/gritql 19 | 2561 https://github.com/YaLTeR/niri 20 | 2229 https://github.com/huggingface/text-embeddings-inference 21 | 2223 https://github.com/tembo-io/pgmq 22 | 2134 https://github.com/jsr-io/jsr 23 | 2061 https://github.com/gosub-browser/gosub-engine 24 | 2055 https://github.com/quarylabs/quary 25 | 2044 https://github.com/OwshenNetwork/owshen 26 | 1980 https://github.com/stacks-network/sbtc-developer-release 27 | 1883 https://github.com/rivet-gg/rivet 28 | 1840 https://github.com/microsoft/aici 29 | 1815 https://github.com/SilasMarvin/lsp-ai 30 | 1740 https://github.com/bionic-gpt/bionic-gpt 31 | 1696 https://github.com/Universal-Debloater-Alliance/universal-android-debloater-next-generation 32 | 1653 https://github.com/face-hh/webx 33 | 1635 https://github.com/altsem/gitu 34 | 1627 https://github.com/yuankunzhang/charming 35 | 1616 https://github.com/paritytech/polkadot-sdk 36 | 1440 https://github.com/memorysafety/river 37 | 1417 https://github.com/lapce/lapdev 38 | 1405 https://github.com/microsoft/windows-drivers-rs 39 | 1383 https://github.com/jafioti/luminal 40 | 1349 https://github.com/MatthiasGrandl/Loungy 41 | 1340 https://github.com/CADmium-Co/CADmium 42 | 1307 https://github.com/evilsocket/legba 43 | 1284 https://github.com/phodal/aigc 44 | 1244 https://github.com/FractalFir/rustc_codegen_clr 45 | 1229 https://github.com/ferrocene/ferrocene 46 | 1222 https://github.com/Julien-cpsn/ATAC 47 | 1218 https://github.com/ThousandBirdsInc/chidori 48 | 1207 https://github.com/cloudflare/foundations 49 | 1188 https://github.com/SeaQL/FireDBG.for.Rust 50 | 1055 https://github.com/shell-pool/shpool 51 | 1037 https://github.com/mfontanini/presenterm 52 | 1023 https://github.com/paradigmxyz/cryo 53 | 993 https://github.com/srush/llama2.rs 54 | 993 https://github.com/robertknight/ocrs 55 | 991 https://github.com/joaoviictorti/RustRedOps 56 | 988 https://github.com/orhun/daktilo 57 | 953 https://github.com/Ruddle/Fomos 58 | 939 https://github.com/Tencent/tquic 59 | 931 https://github.com/Whitecat18/Rust-for-Malware-Development 60 | 918 https://github.com/regolith-labs/ore-cli 61 | 913 https://github.com/spaceandtimelabs/sxt-proof-of-sql 62 | 911 https://github.com/aripiprazole/rinha-de-compiler 63 | 890 https://github.com/YiNNx/cmd-wrapped 64 | 867 https://github.com/mufeedvh/code2prompt 65 | 851 https://github.com/j-hc/zygisk-detach 66 | 850 https://github.com/Ragnt/AngryOxide 67 | 844 https://github.com/nvzqz/divan 68 | 836 https://github.com/samwho/spacer 69 | 815 https://github.com/redlib-org/redlib 70 | 815 https://github.com/Martichou/rquickshare 71 | 800 https://github.com/succinctlabs/sp1 72 | 787 https://github.com/LlamaEdge/LlamaEdge 73 | 773 https://github.com/FoxIO-LLC/ja4 74 | 759 https://github.com/helix-editor/nucleo 75 | 757 https://github.com/pnpm/pacquet 76 | 755 https://github.com/a2x/cs2-dumper 77 | 742 https://github.com/andyk/ht 78 | 741 https://github.com/get-convex/convex-backend 79 | 732 https://github.com/sunfishcode/eyra 80 | 730 https://github.com/every-day-things/citadel 81 | 729 https://github.com/Chleba/netscanner 82 | 718 https://github.com/moturus/motor-os 83 | 689 https://github.com/ogxd/gxhash 84 | 683 https://github.com/pipeless-ai/pipeless 85 | 675 https://github.com/tsukinaha/tsukimi 86 | 670 https://github.com/darthdeus/comfy 87 | 669 https://github.com/tembo-io/pg_vectorize 88 | 660 https://github.com/haileys/bark 89 | 645 https://github.com/apache/datafusion-comet 90 | 644 https://github.com/bitswired/rustgpt 91 | 638 
https://github.com/hcavarsan/kftray 92 | 633 https://github.com/prefix-dev/rip 93 | 615 https://github.com/wintermute-cell/ngrrram 94 | 606 https://github.com/Kobzol/cargo-wizard 95 | 582 https://github.com/nexus-xyz/nexus-zkvm 96 | 578 https://github.com/timescale/pgvectorscale 97 | 578 https://github.com/ThePrimeagen/htmx-lsp 98 | 573 https://github.com/zaghaghi/openapi-tui 99 | 556 https://github.com/streamdal/streamdal 100 | 550 https://github.com/junkdog/tachyonfx 101 | 543 https://github.com/xetdata/nfsserve 102 | 541 https://github.com/regolith-labs/ore 103 | 538 https://github.com/narrowlink/narrowlink 104 | 535 https://github.com/cncases/cases 105 | 534 https://github.com/facebook/dotslash 106 | 527 https://github.com/huggingface/llm-ls 107 | 523 https://github.com/ViporMiner/VIPORMiner 108 | 519 https://github.com/opensourcecheemsburgers/RustyTube 109 | 519 https://github.com/terhechte/Ebou 110 | 512 https://github.com/KipData/FnckSQL 111 | 506 https://github.com/ad-si/Rust-Flashcards 112 | 502 https://github.com/meteroid-oss/meteroid 113 | 501 https://github.com/ynqa/sig 114 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/humaneval.py: -------------------------------------------------------------------------------- 1 | """Evaluating Large Language Models Trained on Code 2 | https://arxiv.org/abs/2107.03374 3 | 4 | The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature, 5 | docstring, body, and several unit tests. 6 | They were handwritten to ensure not to be included in the training set of code generation models. 7 | 8 | Homepage: https://github.com/openai/human-eval 9 | """ 10 | 11 | import re 12 | 13 | from evaluate import load 14 | 15 | from bigcode_eval.base import Task 16 | 17 | _CITATION = """ 18 | @misc{chen2021evaluating, 19 | title={Evaluating Large Language Models Trained on Code}, 20 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba}, 21 | year={2021}, 22 | eprint={2107.03374}, 23 | archivePrefix={arXiv}, 24 | primaryClass={cs.LG} 25 | } 26 | """ 27 | 28 | 29 | def create_all_tasks(): 30 | """Creates a dictionary of tasks from a list of levels 31 | :return: {task_name: task} 32 | e.g. 
{multiple-py: Task, multiple-java: Task} 33 | """ 34 | return {"humaneval": create_task(True), "humaneval-unstripped": create_task(False)} 35 | 36 | 37 | def create_task(strip_prompt): 38 | class HumanEval(GeneralHumanEval): 39 | def __init__(self): 40 | super().__init__(strip_prompt) 41 | 42 | return HumanEval 43 | 44 | 45 | class GeneralHumanEval(Task): 46 | """A task represents an entire benchmark including its dataset, problems, 47 | answers, generation settings and evaluation methods. 48 | """ 49 | 50 | DATASET_PATH = "openai_humaneval" 51 | 52 | def __init__(self, strip_prompt): 53 | super().__init__( 54 | stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```"], 55 | requires_execution=True, 56 | ) 57 | self.strip_prompt = strip_prompt 58 | 59 | def get_dataset(self): 60 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 61 | return self.dataset["test"] 62 | 63 | def get_prompt(self, doc): 64 | """Builds the prompt for the LM to generate from.""" 65 | if self.strip_prompt: 66 | return doc["prompt"].strip() 67 | else: 68 | return doc["prompt"] 69 | 70 | def get_reference(self, doc): 71 | """Builds the reference solution for the doc (sample from the test dataset).""" 72 | test_func = doc["test"] 73 | entry_point = f"check({doc['entry_point']})" 74 | return "\n" + test_func + "\n" + entry_point 75 | 76 | @staticmethod 77 | def _stop_at_stop_token(decoded_string, stop_tokens): 78 | """ 79 | Produces the prefix of decoded_string that ends at the first occurrence of 80 | a stop_token. 81 | WARNING: the decoded_string *must not* include the prompt, which may have stop tokens 82 | itself. 83 | """ 84 | min_stop_index = len(decoded_string) 85 | for stop_token in stop_tokens: 86 | stop_index = decoded_string.find(stop_token) 87 | if stop_index != -1 and stop_index < min_stop_index: 88 | min_stop_index = stop_index 89 | return decoded_string[:min_stop_index] 90 | 91 | def postprocess_generation(self, generation, idx): 92 | """Defines the postprocessing for a LM generation. 93 | :param generation: str 94 | code generation from LM 95 | :param idx: int 96 | index of doc in the dataset to which the generation belongs 97 | (not used for Humaneval-Task) 98 | """ 99 | prompt = self.get_prompt(self.dataset["test"][idx]) 100 | generation = generation[len(prompt) :] 101 | return prompt + self._stop_at_stop_token(generation, self.stop_words) 102 | 103 | def process_results(self, generations, references): 104 | """Takes the list of LM generations and evaluates them against ground truth references, 105 | returning the metric for the generations. 
106 | :param generations: list(list(str)) 107 | list of lists containing generations 108 | :param references: list(str) 109 | list of str containing refrences 110 | """ 111 | code_metric = load("code_eval") 112 | results, _ = code_metric.compute( 113 | references=references, 114 | predictions=generations, 115 | k=[1, 5, 10] 116 | ) 117 | return results 118 | -------------------------------------------------------------------------------- /unlearning_preparation/retained_data_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import feather 3 | import pandas as pd 4 | import random 5 | from tqdm import tqdm 6 | import torch 7 | from transformers import AutoTokenizer, AutoModelForCausalLM 8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 9 | smoothie = SmoothingFunction().method4 10 | 11 | 12 | # calculate BLEU-4 score 13 | def calc_bleu4(tokenizer, sample, generated): 14 | ref = tokenizer.decode(sample) 15 | hyp = tokenizer.decode(generated) 16 | return sentence_bleu([ref], hyp, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie) 17 | 18 | 19 | def memorization_extraction(args): 20 | device = torch.device("cuda:" + str(args.gpu_id) if torch.cuda.is_available() else "cpu") 21 | 22 | base_tokenizer = AutoTokenizer.from_pretrained( 23 | 'Salesforce/codegen-350M-multi', 24 | padding_side='left', 25 | # add_special_tokens=True 26 | ) 27 | 28 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, local_files_only=True) 29 | # tokenizer.pad_token = tokenizer.eos_token 30 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 31 | model = AutoModelForCausalLM.from_pretrained( 32 | args.model_name_or_path, 33 | # resid_pdrop=0, embd_pdrop=0, attn_pdrop=0, 34 | attention_dropout=0, 35 | # pad_token_id=tokenizer.eos_token_id, 36 | local_files_only=True 37 | ) 38 | model.resize_token_embeddings(len(tokenizer)) 39 | model.config.pad_token_id = tokenizer.pad_token_id 40 | model.config.vocab_size = len(tokenizer) 41 | if hasattr(model, 'model'): 42 | print("Model has model.model.embed_tokens.padding_idx") 43 | model.model.embed_tokens.padding_idx = tokenizer.pad_token_id 44 | else: 45 | print("Model has get_input_embeddings.padding_idx") 46 | model.get_input_embeddings().padding_idx = tokenizer.pad_token_id 47 | if hasattr(model, 'lm_head'): 48 | print("Model has lm_head.padding_idx") 49 | model.lm_head.padding_idx = tokenizer.pad_token_id 50 | 51 | embedding_layer = model.get_input_embeddings() 52 | print(f"padding_idx: {embedding_layer.padding_idx}") 53 | 54 | assert model.config.pad_token_id == tokenizer.pad_token_id 55 | assert embedding_layer.num_embeddings == len(tokenizer) 56 | 57 | if args.fp16: 58 | model.half() 59 | model.to(device) 60 | 61 | df = feather.read_dataframe('benchmark.feather') 62 | if base_tokenizer.vocab != tokenizer.vocab: 63 | print('Different tokenizers: Re-encoding samples...') 64 | df['sample'] = df['sample'].apply(lambda x: tokenizer.encode(base_tokenizer.decode(x, skip_special_tokens=True))) 65 | df = df[df['sample'].apply(len) >= 100].reset_index(drop=True) # drop samples that are too short 66 | else: # same tokenizer 67 | print('Same tokenizers: No need to re-encode samples...') 68 | df['prefix'] = df['sample'].apply(lambda x: x[:64]) 69 | df['suffix'] = df['sample'].apply(lambda x: x[64:128]) 70 | 71 | gen_suffix = [] 72 | # iterate with batch size 73 | with torch.no_grad(): 74 | for i in tqdm(range(0, len(df), args.batch_size)): 75 | batch = 
torch.tensor(df.iloc[i: i + args.batch_size].prefix.tolist()).to(device) 76 | # output = model.generate(batch, max_length=128)[..., 64:].tolist() 77 | output = model.generate(batch, max_new_tokens=64)[..., 64:].tolist() 78 | gen_suffix.extend(output) 79 | 80 | df['gen_suffix'] = gen_suffix 81 | df['bleu4'] = df.apply(lambda x: calc_bleu4(tokenizer, x['suffix'], x['gen_suffix']), axis=1) 82 | 83 | memorization_df = df[df['bleu4'] >= 0.95] 84 | memorization_df.rename(columns={'index': 'doc_id'}, inplace=True) 85 | memorization_df['text'] = memorization_df['sample'].apply(lambda x: tokenizer.decode(x, skip_special_tokens=True)) 86 | memorization_df['corpus'] = 'BigQuery' 87 | memorization_df = memorization_df[['doc_id', 'hash', 'copies', 'corpus', 'text', 'bleu4']] 88 | print(memorization_df) 89 | model_name = args.model_name_or_path.split('/')[-1] 90 | memorization_df.to_csv(f'{model_name}_memorization.csv', index=False, encoding='utf-8') 91 | 92 | random.seed(42) 93 | memorization_df_indexes = list(range(len(memorization_df))) 94 | random.shuffle(memorization_df_indexes) 95 | sampled_df = memorization_df.iloc[memorization_df_indexes[:args.k], :] 96 | sampled_df.to_csv(f'../unlearning/data/{model_name}_secret/{model_name}_retained_set_{args.k}.csv', index=False, encoding='utf-8') 97 | 98 | 99 | def main(): 100 | # Parsing Arguments 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument("--model_name_or_path", default="Salesforce/codegen-350M-mono", type=str, 103 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 104 | parser.add_argument('--gpu_id', type=str, default="0", help="specify the GPU id") 105 | parser.add_argument('--batch_size', type=int, default=4, help="Batch size.") 106 | parser.add_argument("--fp16", action='store_true', 107 | help="Whether to use fp16 model precision.") 108 | parser.add_argument('--k', type=int, default=32, 109 | help="The number of forgotten samples.") 110 | args = parser.parse_args() 111 | 112 | memorization_extraction(args) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeComplex/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from datasets import ClassLabel, DatasetDict, load_dataset 6 | from evaluate import load 7 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 8 | DataCollatorWithPadding, Trainer, TrainerCallback, 9 | TrainingArguments, set_seed) 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 16 | ) 17 | parser.add_argument("--num_epochs", type=int, default=5) 18 | parser.add_argument("--batch_size", type=int, default=6) 19 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 20 | parser.add_argument("--freeze", type=bool, default=True) 21 | parser.add_argument("--learning_rate", type=float, default=5e-4) 22 | parser.add_argument("--seed", type=int, default=0) 23 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 24 | parser.add_argument("--num_warmup_steps", type=int, default=10) 25 | parser.add_argument("--weight_decay", type=float, default=0.01) 26 | parser.add_argument("--output_dir", type=str, default="./results") 27 | parser.add_argument("--push_to_hub", 
type=bool, default=False) 28 | parser.add_argument("--model_hub_name", type=str, default="codecomplex_model") 29 | return parser.parse_args() 30 | 31 | 32 | metric = load("accuracy") 33 | 34 | 35 | def compute_metrics(eval_pred): 36 | predictions, labels = eval_pred 37 | predictions = np.argmax(predictions, axis=1) 38 | return metric.compute(predictions=predictions, references=labels) 39 | 40 | 41 | class CustomCallback(TrainerCallback): 42 | def __init__(self, trainer) -> None: 43 | super().__init__() 44 | self._trainer = trainer 45 | 46 | def on_epoch_end(self, args, state, control, **kwargs): 47 | if control.should_evaluate: 48 | control_copy = deepcopy(control) 49 | self._trainer.evaluate( 50 | eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" 51 | ) 52 | return control_copy 53 | 54 | 55 | def main(): 56 | args = get_args() 57 | set_seed(args.seed) 58 | 59 | dataset = load_dataset("codeparrot/codecomplex", split="train") 60 | train_test = dataset.train_test_split(test_size=0.2) 61 | test_validation = train_test["test"].train_test_split(test_size=0.5) 62 | train_test_validation = DatasetDict( 63 | { 64 | "train": train_test["train"], 65 | "test": test_validation["train"], 66 | "valid": test_validation["test"], 67 | } 68 | ) 69 | 70 | print("Loading tokenizer and model") 71 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 72 | tokenizer.pad_token = tokenizer.eos_token 73 | model = AutoModelForSequenceClassification.from_pretrained( 74 | args.model_ckpt, num_labels=7 75 | ) 76 | model.config.pad_token_id = model.config.eos_token_id 77 | 78 | if args.freeze: 79 | for param in model.roberta.parameters(): 80 | param.requires_grad = False 81 | 82 | labels = ClassLabel( 83 | num_classes=7, names=list(set(train_test_validation["train"]["complexity"])) 84 | ) 85 | 86 | def tokenize(example): 87 | inputs = tokenizer(example["src"], truncation=True, max_length=1024) 88 | label = labels.str2int(example["complexity"]) 89 | return { 90 | "input_ids": inputs["input_ids"], 91 | "attention_mask": inputs["attention_mask"], 92 | "label": label, 93 | } 94 | 95 | tokenized_datasets = train_test_validation.map( 96 | tokenize, 97 | batched=True, 98 | remove_columns=train_test_validation["train"].column_names, 99 | ) 100 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 101 | 102 | training_args = TrainingArguments( 103 | output_dir=args.output_dir, 104 | learning_rate=args.learning_rate, 105 | lr_scheduler_type=args.lr_scheduler_type, 106 | evaluation_strategy="epoch", 107 | save_strategy="epoch", 108 | logging_strategy="epoch", 109 | per_device_train_batch_size=args.batch_size, 110 | per_device_eval_batch_size=args.batch_size, 111 | num_train_epochs=args.num_epochs, 112 | gradient_accumulation_steps=args.gradient_accumulation_steps, 113 | weight_decay=args.weight_decay, 114 | metric_for_best_model="accuracy", 115 | run_name="complexity-java", 116 | report_to="wandb", 117 | ) 118 | 119 | trainer = Trainer( 120 | model=model, 121 | args=training_args, 122 | train_dataset=tokenized_datasets["train"], 123 | eval_dataset=tokenized_datasets["valid"], 124 | tokenizer=tokenizer, 125 | data_collator=data_collator, 126 | compute_metrics=compute_metrics, 127 | ) 128 | 129 | print("Training...") 130 | trainer.add_callback(CustomCallback(trainer)) 131 | trainer.train() 132 | 133 | result = trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 134 | print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}") 135 | 136 | # push the model to the Hugging Face hub 
137 | if args.push_to_hub: 138 | model.push_to_hub(args.model_hub_name) 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/generic_eval.py: -------------------------------------------------------------------------------- 1 | # This is a helper script for evaluating benchmarks that have been translated to 2 | # different languages. 3 | # 4 | # To use this script, call eval_lang.py. 5 | # The --directory argument is required, and tells the script where the benchmarks are located. 6 | # The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated. 7 | # 8 | # The script will print the results on each benchmark, and also write to results/lang.csv. 9 | # When the script completes, it will print a summary. 10 | # 11 | # Examples 12 | # 13 | # To run the entire benchmark suite: 14 | # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ 15 | # 16 | # To run benchmarks 1, 2, and 3: 17 | # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3 18 | 19 | import argparse 20 | import sys 21 | from pathlib import Path 22 | from sys import exit as sysexit 23 | 24 | 25 | def list_files(directory, ext): 26 | files_unsorted = directory.glob(f"HumanEval_*{ext}") 27 | # assumption: base filenames are in the format of HumanEval_X_* 28 | # Where X is a valid number 29 | def key(s): 30 | return int(str(s.name).split("_")[1]) 31 | 32 | files_sorted = sorted(files_unsorted, key=(lambda s: key(s))) 33 | 34 | # assumption: there may be missing files, but no extra files 35 | # so we build files_array where the index corresponds to the file's number, 36 | # and a missing file is represented by None 37 | size = key(files_sorted[-1]) + 1 38 | files_array = [None] * size 39 | for f in files_sorted: 40 | k = key(f) 41 | files_array[k] = f 42 | 43 | return files_array 44 | 45 | 46 | def main(eval_script, language, extension): 47 | args = argparse.ArgumentParser() 48 | 49 | args.add_argument( 50 | "--directory", type=str, required=True, help="Directory to read benchmarks from" 51 | ) 52 | args.add_argument( 53 | "--files", 54 | type=int, 55 | nargs="*", 56 | default=[], 57 | help="Specify the benchmarks to evaluate by their number, e.g. 
--files 0 1 2", 58 | ) 59 | args = args.parse_args() 60 | 61 | directory = Path(args.directory).resolve() 62 | 63 | files_sorted = list_files(directory, extension) 64 | 65 | # the directory you specified does not contain the right language 66 | if len(files_sorted) == 0: 67 | print(f"The specified directory does not contain files of type {extension}") 68 | sysexit(1) 69 | 70 | files_index = [] 71 | if len(args.files) > 0: 72 | files_index = args.files 73 | else: 74 | files_index = range(len(files_sorted)) 75 | 76 | total = 0 77 | passed = 0 78 | syntax_error = 0 79 | 80 | results_file = Path( 81 | Path(__file__).parent, "..", "results", language.lower() + ".csv" 82 | ).resolve() 83 | 84 | with open(results_file, "w") as f: 85 | for i in files_index: 86 | filepath = files_sorted[i] 87 | if filepath is None: 88 | print("File {} does not exist!".format(i)) 89 | continue 90 | res = eval_script(filepath) 91 | output = f"{language},{filepath.stem},{res['status']}\n" 92 | f.write(output) 93 | print(output, end="") 94 | total += 1 95 | if res["status"] == "OK": 96 | passed += 1 97 | elif res["status"] == "SyntaxError": 98 | syntax_error += 1 99 | print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}") 100 | 101 | 102 | def main_check_stubs(check_script, language, extension): 103 | args = argparse.ArgumentParser() 104 | 105 | args.add_argument( 106 | "--directory", type=str, required=True, help="Directory to read benchmarks from" 107 | ) 108 | args.add_argument( 109 | "--files", 110 | type=int, 111 | nargs="*", 112 | default=[], 113 | help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2", 114 | ) 115 | args = args.parse_args() 116 | 117 | directory = Path(args.directory).resolve() 118 | 119 | files_sorted = list_files(directory, extension) 120 | 121 | # the directory you specified does not contain the right language 122 | if len(files_sorted) == 0: 123 | print(f"The specified directory does not contain files of type {extension}") 124 | sysexit(1) 125 | 126 | files_index = [] 127 | if len(args.files) > 0: 128 | files_index = args.files 129 | else: 130 | files_index = range(len(files_sorted)) 131 | 132 | total = 0 133 | passed = 0 134 | 135 | results_file = Path( 136 | Path(__file__).parent, "..", "check_results", language.lower() + ".csv" 137 | ).resolve() 138 | 139 | with open(results_file, "w") as f: 140 | for i in files_index: 141 | filepath = files_sorted[i] 142 | if filepath is None: 143 | print("File {} does not exist!".format(i)) 144 | continue 145 | res = check_script(filepath) 146 | output = f"{language},{filepath.stem},{res['status']}\n" 147 | f.write(output) 148 | print(output, end="") 149 | total += 1 150 | if res["status"] == "OK": 151 | passed += 1 152 | print(f"Total {total}, Passed {passed}") 153 | 154 | if total != passed: 155 | sys.exit(1) 156 | --------------------------------------------------------------------------------