├── human_eval ├── bigcode_eval │ ├── __init__.py │ ├── tasks │ │ ├── custom_metrics │ │ │ ├── __init__.py │ │ │ ├── pal_metric │ │ │ │ ├── __init__.py │ │ │ │ └── pal_code_exec.py │ │ │ └── multiple_metrics │ │ │ │ ├── __init__.py │ │ │ │ ├── safe_subprocess │ │ │ │ ├── .gitignore │ │ │ │ ├── evil_programs │ │ │ │ │ ├── block_on_inputs.py │ │ │ │ │ ├── fork_bomb.py │ │ │ │ │ ├── sleep_forever.py │ │ │ │ │ ├── unbounded_output.py │ │ │ │ │ ├── fork_once.py │ │ │ │ │ └── close_outputs.py │ │ │ │ ├── __init__.py │ │ │ │ └── module_test.py │ │ │ │ ├── eval_lua.py │ │ │ │ ├── eval_php.py │ │ │ │ ├── eval_python.py │ │ │ │ ├── eval_pl.py │ │ │ │ ├── single_experiment_pass_k.py │ │ │ │ ├── eval_julia.py │ │ │ │ ├── eval_sh.py │ │ │ │ ├── eval_swift.py │ │ │ │ ├── eval_ts.py │ │ │ │ ├── eval_go.py │ │ │ │ ├── eval_cpp.py │ │ │ │ ├── eval_racket.py │ │ │ │ ├── eval_scala.py │ │ │ │ ├── libeval.py │ │ │ │ ├── eval_ruby.py │ │ │ │ ├── eval_r.py │ │ │ │ ├── eval_java.py │ │ │ │ ├── eval_javascript.py │ │ │ │ ├── eval_rust.py │ │ │ │ ├── eval_cs.py │ │ │ │ ├── eval_dlang.py │ │ │ │ ├── evaluation.py │ │ │ │ ├── containerized_eval.py │ │ │ │ └── generic_eval.py │ │ ├── few_shot_examples │ │ │ ├── conala_few_shot_prompts.json │ │ │ ├── concode_few_shot_prompts.json │ │ │ ├── codexglue_text_to_text_few_shot_prompts.json │ │ │ └── gsm8k_few_shot_prompts.json │ │ ├── __init__.py │ │ ├── concode.py │ │ ├── mbpp.py │ │ ├── conala.py │ │ ├── codexglue_text_to_text.py │ │ ├── apps.py │ │ ├── python_bugs.py │ │ └── humaneval.py │ ├── arguments.py │ ├── base.py │ └── evaluator.py ├── requirements.txt ├── Dockerfile ├── tests │ ├── data │ │ ├── mbpp_eval_gens.json │ │ ├── mbpp_gen_refs.json │ │ ├── humaneval_eval_gens.json │ │ ├── pal-gsm8k-greedy_eval_gens.json │ │ ├── humaneval_gen_refs.json │ │ ├── humaneval_gen_gens.json │ │ ├── mbpp_gen_gens.json │ │ └── pal-gsm8k-greedy_prompt.json │ ├── test_prompts.py │ └── test_generation_evaluation.py ├── eval.sh ├── makefile ├── finetuning │ ├── CodeComplex │ │ ├── README.md │ │ └── train.py │ ├── CodeDefect │ │ ├── README.md │ │ └── train.py │ ├── CodeClone │ │ ├── README.md │ │ └── train.py │ ├── APPS │ │ ├── README.md │ │ └── apps_train.py │ ├── Code-to-text │ │ ├── README.md │ │ └── train.py │ └── README.md ├── leaderboard │ ├── throughput_config.yaml │ ├── group_jsons.py │ └── multiple_eval.slurm ├── setup.py ├── Dockerfile-multiple ├── templates │ └── new_task.py └── .gitignore ├── sensitive_memorization ├── utils │ └── __init__.py ├── analyze.py ├── tokenize_secrets_and_prefixes.py ├── filter.py └── generate_secret_mask.py ├── assets ├── Unlearning.jpg ├── Illustration.jpg ├── MemorizationDistribution.jpg └── SensitiveMemorizationDetection.jpg ├── .gitignore ├── memorization_thresholds ├── TopLists │ ├── Ruby-top-repos.txt │ ├── Lua-top-repos.txt │ ├── PHP-top-repos.txt │ └── Rust-top-repos.txt ├── clean.sh ├── deduplicate.py ├── collect_data.sh ├── humaneval_mbpp_get.py ├── clone_repo.sh ├── extract_code.py ├── sample.py └── gh_crawler.py ├── cache_models.py ├── LICENSE ├── unlearning_preparation ├── forgotten_data_sample.py └── retained_data_sample.py └── unlearning └── dataset.py /human_eval/bigcode_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sensitive_memorization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/pal_metric/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/Unlearning.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/Unlearning.jpg -------------------------------------------------------------------------------- /assets/Illustration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/Illustration.jpg -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/.gitignore: -------------------------------------------------------------------------------- 1 | /__pycache__ 2 | /.pytest_cache -------------------------------------------------------------------------------- /assets/MemorizationDistribution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/MemorizationDistribution.jpg -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/block_on_inputs.py: -------------------------------------------------------------------------------- 1 | while True: 2 | input() 3 | -------------------------------------------------------------------------------- /assets/SensitiveMemorizationDetection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhaoyang-Chu/code-unlearning/HEAD/assets/SensitiveMemorizationDetection.jpg -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/fork_bomb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | while True: 4 | os.fork() 5 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/sleep_forever.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | while True: 4 | time.sleep(60) 5 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/unbounded_output.py: -------------------------------------------------------------------------------- 1 | b = True 2 | while True: 3 | print(b) 4 | b = not b 5 | -------------------------------------------------------------------------------- 
/human_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.25.1 2 | accelerate>=0.13.2 3 | datasets>=2.6.1 4 | evaluate>=0.3.0 5 | pyext==0.7 6 | mosestokenizer==1.0.0 7 | huggingface_hub>=0.11.1 8 | fsspec<2023.10.0 9 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/fork_once.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | if os.fork() == 0: 5 | while True: 6 | time.sleep(60) 7 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/close_outputs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | print("This is the end") 4 | sys.stdout.close() 5 | sys.stderr.close() 6 | while True: 7 | pass 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | memorization_thresholds/Code/ 4 | memorization_thresholds/Repos/ 5 | 6 | sensitive_memorization/codeparrot-clean-train-secrets-* 7 | sensitive_memorization/.codeparrot-clean-train-cache 8 | 9 | unlearning_preparation/benchmark.feather 10 | 11 | unlearning/ckpts/ 12 | -------------------------------------------------------------------------------- /human_eval/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt-get update && apt-get install -y python3 python3-pip 4 | 5 | COPY . /app 6 | 7 | WORKDIR /app 8 | 9 | RUN test -f /app/generations.json && rm /app/generations.json || true 10 | 11 | RUN pip3 install . 
12 | 13 | CMD ["python3", "main.py"] 14 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/conala_few_shot_prompts.json: -------------------------------------------------------------------------------- 1 | {"instruction1": "convert a list of integers into a single integer", "instruction2": "how to convert a datetime string back to datetime object?", "solution1": "r = int(''.join(map(str, x)))", "solution2": "datetime.datetime.strptime(str, '%m/%d/%Y')"} -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/Ruby-top-repos.txt: -------------------------------------------------------------------------------- 1 | 28270 https://github.com/maybe-finance/maybe 2 | 5489 https://github.com/docusealco/docuseal 3 | 707 https://github.com/rage-rb/rage 4 | 694 https://github.com/Multiwoven/multiwoven 5 | 616 https://github.com/darwin-containers/homebrew-formula 6 | 584 https://github.com/wouterken/crystalruby 7 | 516 https://github.com/Freika/dawarich 8 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/concode_few_shot_prompts.json: -------------------------------------------------------------------------------- 1 | {"instruction1": "get the distance of map coordinates to the center ", "instruction2": "check if details are parsed", "solution1": "float function ( int arg0 , int arg1 ) { int loc0 = arg0 - cx ; int loc1 = arg1 - cy ; return getSquaredDistance ( loc0 , loc1 ) ; }", "solution2": "boolean function ( ) { return isParsed ; }"} -------------------------------------------------------------------------------- /human_eval/tests/data/mbpp_eval_gens.json: -------------------------------------------------------------------------------- 1 | [["def remove_Occ(s,ch): \r\n for i in range(len(s)): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n for i in range(len(s) - 1,-1,-1): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n return s ", "This is some random text"], ["This is some random text", "This is some random text"]] -------------------------------------------------------------------------------- /memorization_thresholds/clean.sh: -------------------------------------------------------------------------------- 1 | cd Repos/Python/ 2 | find . -maxdepth 2 -type d -empty | xargs -i sudo rm -rf {} 3 | find . -maxdepth 1 -type d -empty | xargs -i sudo rm -rf {} 4 | 5 | cd ../../Code/Python/ 6 | find . -maxdepth 2 -type d -empty | xargs -i sudo rm -rf {} 7 | find . 
-maxdepth 1 -type d -empty | xargs -i sudo rm -rf {} 8 | 9 | ls ../../Repos/Python/ | xargs -i sudo rm -rf {} 10 | sudo rm -rf ../../Repos/Python/* 11 | -------------------------------------------------------------------------------- /human_eval/tests/data/mbpp_gen_refs.json: -------------------------------------------------------------------------------- 1 | ["assert remove_Occ(\"hello\",\"l\") == \"heo\"\nassert remove_Occ(\"abcda\",\"a\") == \"bcd\"\nassert remove_Occ(\"PHP\",\"P\") == \"H\"", "assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]\nassert sort_matrix([[1, 2, 3], [-2, 4, -5], [1, -1, 1]])==[[-2, 4, -5], [1, -1, 1], [1, 2, 3]]\nassert sort_matrix([[5,8,9],[6,4,3],[2,1,4]])==[[2, 1, 4], [6, 4, 3], [5, 8, 9]]"] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_lua.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["lua", str(path)]) 8 | if r.timeout: 9 | status = "Timeout" 10 | elif r.exit_code == 0: 11 | status = "OK" 12 | else: 13 | status = "Exception" 14 | return { 15 | "status": status, 16 | "exit_code": r.exit_code, 17 | "stdout": r.stdout, 18 | "stderr": r.stderr, 19 | } 20 | -------------------------------------------------------------------------------- /human_eval/eval.sh: -------------------------------------------------------------------------------- 1 | model=$1 2 | batch_size=$2 3 | 4 | accelerate launch main.py \ 5 | --model $model \ 6 | --tasks humaneval \ 7 | --batch_size $batch_size \ 8 | --max_length_generation 512 \ 9 | --precision fp16 \ 10 | --allow_code_execution \ 11 | --metric_output_path $model/humaneval_evaluation_results.json \ 12 | --save_generations --save_generations_path $model/humaneval_generations.json \ 13 | --max_memory_per_gpu auto \ 14 | --do_sample True \ 15 | --temperature 0.2 \ 16 | --top_p 0.95 \ 17 | --n_samples 50 \ 18 | --seed 42 19 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_php.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | LANG_NAME = "PHP" 6 | LANG_EXT = ".php" 7 | 8 | 9 | def eval_script(path: Path): 10 | r = run(["php", path]) 11 | if "PHP Parse error" in r.stdout: 12 | status = "SyntaxError" 13 | elif r.exit_code != 0: 14 | status = "Exception" 15 | else: 16 | status = "OK" 17 | return { 18 | "status": status, 19 | "exit_code": r.exit_code, 20 | "stdout": r.stdout, 21 | "stderr": r.stderr, 22 | } 23 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_python.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["python3", str(path)]) 8 | if r.timeout: 9 | status = "Timeout" 10 | elif r.exit_code == 0: 11 | status = "OK" 12 | elif "SyntaxError" in r.stderr: 13 | status = "SyntaxError" 14 | else: 15 | status = "Exception" 16 | return { 17 | "status": status, 18 | "exit_code": r.exit_code, 19 | "stdout": r.stdout, 20 | "stderr": r.stderr, 21 | } 22 | 
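# The eval_* modules in this directory share one contract: eval_script(path)
# runs a single generated program and returns a dict with "status", "exit_code",
# "stdout" and "stderr". A minimal driver sketch (the candidate path below is a
# hypothetical example, not something the harness creates):
#
#   from pathlib import Path
#   from .eval_python import eval_script
#
#   result = eval_script(Path("/tmp/candidate.py"))
#   assert result["status"] in {"OK", "Timeout", "SyntaxError", "Exception"}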
-------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_pl.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["perl", path]) 8 | 9 | if r.timeout: 10 | status = "Timeout" 11 | elif r.exit_code != 0: 12 | status = "Exception" 13 | elif "ERROR" in r.stdout or "ERROR" in r.stderr: 14 | status = "Exception" 15 | else: 16 | status = "OK" 17 | return { 18 | "status": status, 19 | "exit_code": r.exit_code, 20 | "stdout": r.stdout, 21 | "stderr": r.stderr, 22 | } 23 | -------------------------------------------------------------------------------- /human_eval/tests/test_prompts.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bigcode_eval import tasks 4 | 5 | TASKS = ["pal-gsm8k-greedy"] 6 | 7 | sample_doc = {"pal-gsm8k-greedy": {"question": "test"}} 8 | 9 | 10 | def load_reference_prompt(task_name): 11 | with open(f"tests/data/{task_name}_prompt.json") as fp: 12 | prompts = json.load(fp) 13 | return prompts["prompt"] 14 | 15 | 16 | def test_gsm_prompt(): 17 | for task_name in TASKS: 18 | task = tasks.get_task(task_name) 19 | task_prompt = task.get_prompt(sample_doc[task_name]) 20 | ref_prompt = load_reference_prompt(task_name) 21 | assert task_prompt == ref_prompt 22 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/single_experiment_pass_k.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | 6 | def estimator(n: int, c: int, k: int) -> float: 7 | """ 8 | Calculates 1 - comb(n - c, k) / comb(n, k). 9 | """ 10 | if n - c < k: 11 | return 1.0 12 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 13 | 14 | 15 | def for_file(path): 16 | with open(path, "r") as f: 17 | data = json.load(f) 18 | n = len(data["results"]) 19 | c = len( 20 | [True for r in data["results"] if r["status"] == "OK" and r["exit_code"] == 0] 21 | ) 22 | return np.array([estimator(n, c, 1), estimator(n, c, 10), estimator(n, c, 100)]) 23 | -------------------------------------------------------------------------------- /human_eval/makefile: -------------------------------------------------------------------------------- 1 | # There are two dockerfiles: for all benchmarks, and for MultiPL-E 2 | DOCKERFILE=Dockerfile 3 | 4 | ifeq ($(DOCKERFILE), Dockerfile) 5 | IMAGE_NAME=evaluation-harness 6 | else 7 | IMAGE_NAME=evaluation-harness-multiple 8 | endif 9 | 10 | build: 11 | docker build -f $(DOCKERFILE) -t $(IMAGE_NAME) . 
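# Tip: `make build DOCKERFILE=Dockerfile-multiple` builds the MultiPL-E image instead (named evaluation-harness-multiple via the ifeq above).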
12 | 13 | test: 14 | docker run -v $(CURDIR)/tests/docker_test/test_generations.json:/app/test_generations.json:ro \ 15 | -it $(IMAGE_NAME) python3 main.py --model dummy_model --tasks humaneval --limit 4 \ 16 | --load_generations_path /app/test_generations.json --allow_code_execution 17 | 18 | @echo "If pass@1 is 0.25 then your configuration for standard benchmarks is correct" 19 | 20 | all: build test -------------------------------------------------------------------------------- /human_eval/tests/data/humaneval_eval_gens.json: -------------------------------------------------------------------------------- 1 | [["from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "This is some random text"], ["This is some random text", "This is some random text"]] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_julia.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | result = run(["julia", str(path)], timeout_seconds=5) 8 | if result.timeout: 9 | status = "Timeout" 10 | elif result.exit_code == 0: 11 | status = "OK" 12 | # TODO(arjun): I would like this to be reviewed more carefully by John. 13 | elif len(result.stderr) < 1: 14 | status = "Exception" 15 | else: 16 | status = "SyntaxError" 17 | 18 | return { 19 | "status": status, 20 | "exit_code": result.exit_code, 21 | "stdout": result.stdout, 22 | "stderr": result.stderr, 23 | } 24 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_sh.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | LANG_NAME = "bash" 6 | LANG_EXT = ".sh" 7 | 8 | 9 | def eval_script(path: Path): 10 | # Capture output - will be generated regardless of success, fail, or syntax error 11 | p = run(["bash", path]) 12 | if p.timeout: 13 | status = "Timeout" 14 | elif p.exit_code == 0: 15 | status = "OK" 16 | elif "syntax error" in p.stderr: 17 | status = "SyntaxError" 18 | else: 19 | status = "Exception" 20 | 21 | return { 22 | "status": status, 23 | "exit_code": p.exit_code, 24 | "stdout": p.stdout, 25 | "stderr": p.stderr, 26 | } 27 | -------------------------------------------------------------------------------- /memorization_thresholds/deduplicate.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | 4 | ROOT = 'Code' # NOTE: hard-coded. 5 | seen = set() 6 | count = 0 7 | dups = 0 8 | 9 | for root_dir, _, files in os.walk(ROOT): 10 | for file in files: 11 | count += 1 12 | file_path = os.path.join(root_dir, file) 13 | # Hash the entire file's content. 
14 | with open(file_path, 'rb') as f: 15 | bytes = f.read() 16 | hash = hashlib.sha256(bytes).hexdigest() 17 | 18 | # Delete identical files. 19 | if hash in seen: 20 | os.remove(file_path) 21 | dups += 1 22 | else: 23 | seen.add(hash) 24 | 25 | # Periodically print progress and the running duplication ratio. 26 | if count % 10000 == 0: 27 | print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})') 28 | -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/Lua-top-repos.txt: -------------------------------------------------------------------------------- 1 | 2508 https://github.com/stevearc/conform.nvim 2 | 2204 https://github.com/nvimtools/none-ls.nvim 3 | 2199 https://github.com/folke/flash.nvim 4 | 1582 https://github.com/face-hh/griddycode 5 | 1212 https://github.com/CopilotC-Nvim/CopilotChat.nvim 6 | 1072 https://github.com/tbhrbxx/robloxscripts 7 | 963 https://github.com/David-Kunz/gen.nvim 8 | 786 https://github.com/3rd/image.nvim 9 | 649 https://github.com/kawre/leetcode.nvim 10 | 641 https://github.com/nvim-java/nvim-java 11 | 579 https://github.com/craftzdog/solarized-osaka.nvim 12 | 575 https://github.com/tris203/precognition.nvim 13 | 554 https://github.com/ejoy/vaststars 14 | 544 https://github.com/nvim-neorocks/rocks.nvim 15 | 531 https://github.com/Robitx/gp.nvim 16 | 519 https://github.com/wojciech-kulik/xcodebuild.nvim 17 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeComplex/README.md: -------------------------------------------------------------------------------- 1 | # CodeComplex finetuning 2 | In this folder we show how to train an autoregressive model on the CodeComplex dataset, for algorithmic complexity prediction of Java programs. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs.
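The full training recipe lives in `train.py`; at its core it is the standard `Trainer` sequence-classification flow. The sketch below only illustrates that flow; the dataset id (`codeparrot/codecomplex`) and the column names (`src`, `complexity`) are assumptions and may differ from what `train.py` actually uses:
```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

# Assumed dataset id and column names -- check train.py for the real ones.
ds = load_dataset("codeparrot/codecomplex", split="train")
labels = sorted(set(ds["complexity"]))
label2id = {label: i for i, label in enumerate(labels)}

checkpoint = "microsoft/unixcoder-base-nine"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels))

def preprocess(example):
    # Tokenize the source code and attach the integer complexity label.
    enc = tokenizer(example["src"], truncation=True, max_length=512)
    enc["label"] = label2id[example["complexity"]]
    return enc

tokenized = ds.map(preprocess, remove_columns=ds.column_names)
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="codecomplex-out", num_train_epochs=60,
                           per_device_train_batch_size=8, learning_rate=5e-4),
    train_dataset=tokenized,
    data_collator=DataCollatorWithPadding(tokenizer),
)
trainer.train()
```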
3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | To fine-tune a model on this dataset, `microsoft/unixcoder-base-nine` for example, you can use the following command: 13 | 14 | ```python 15 | python train.py \ 16 | --model_ckpt microsoft/unixcoder-base-nine \ 17 | --num_epochs 60 \ 18 | --num_warmup_steps 10 \ 19 | --batch_size 8 \ 20 | --learning_rate 5e-4 21 | ``` 22 | -------------------------------------------------------------------------------- /cache_models.py: -------------------------------------------------------------------------------- 1 | '''Download all the necessary models from HuggingFace''' 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | 5 | 6 | def get_model_and_tokenizer(model_name): 7 | print("Loading model {} ...".format(model_name)) 8 | tokenizer = AutoTokenizer.from_pretrained(model_name) 9 | tokenizer.pad_token = tokenizer.eos_token 10 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) 11 | 12 | print("Model {} is loaded.".format(model_name)) 13 | return tokenizer, model 14 | 15 | 16 | if __name__ == '__main__': 17 | models = [ 18 | 'codeparrot/codeparrot-small', 19 | 'codeparrot/codeparrot', 20 | 'Salesforce/codegen-350M-mono', 21 | 'Salesforce/codegen-2B-mono', 22 | 'Qwen/Qwen2.5-Coder-7B', 23 | ] 24 | 25 | for model_name in models: 26 | get_model_and_tokenizer(model_name) 27 | -------------------------------------------------------------------------------- /human_eval/leaderboard/throughput_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - backend: pytorch # default backend 3 | - benchmark: inference # default benchmark 4 | - experiment # inheriting experiment schema 5 | - _self_ # for hydra 1.1 compatibility 6 | - override hydra/job_logging: colorlog # colorful logging 7 | - override hydra/hydra_logging: colorlog # colorful logging 8 | 9 | hydra: 10 | run: 11 | dir: runs/${experiment_name} 12 | sweep: 13 | dir: sweeps/${experiment_name} 14 | job: 15 | chdir: true 16 | env_set: 17 | CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 18 | 19 | experiment_name: code_evals 20 | 21 | model: bigcode/santacoder 22 | 23 | hub_kwargs: 24 | use_auth_token: true 25 | trust_remote_code: true 26 | 27 | backend: 28 | torch_dtype: float16 29 | 30 | device: cuda:0 31 | 32 | benchmark: 33 | memory: true 34 | input_shapes: 35 | batch_size: 1 36 | sequence_length: 1 37 | new_tokens: 1000 38 | -------------------------------------------------------------------------------- /human_eval/tests/data/pal-gsm8k-greedy_eval_gens.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "def solution():\n \"\"\"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\"\"\"\n eggs_per_day = 16\n eggs_eaten = 3\n eggs_baked = 4\n eggs_sold = eggs_per_day - eggs_eaten - eggs_baked\n price_per_egg = 2\n money_made = eggs_sold * price_per_egg\n result = money_made\n return result\nprint(solution())" 4 | ], 5 | [ 6 | "def solution():\n \"\"\"A robe takes 2 bolts of blue fiber and half that much white fiber. 
How many bolts in total does it take?\"\"\"\n blue_fiber = 2\n white_fiber = blue_fiber / 2\n total_fiber = blue_fiber + white_fiber\n result = total_fiber\n return result\nprint(solution())" 7 | ] 8 | ] -------------------------------------------------------------------------------- /memorization_thresholds/collect_data.sh: -------------------------------------------------------------------------------- 1 | # Hand-picked set of languages. 2 | # lang="Python" 3 | langs=("Ruby" "PHP" "Rust" "Lua") 4 | 5 | if [ ! -d TopLists ]; then 6 | mkdir TopLists; 7 | fi 8 | 9 | # Collect 25K repos with at least 500 stars. 10 | # NOTE: the GH API neither guarantees nor (remotely) achieves completeness or consistency, so the resulting set of repositories will be different on each run. 11 | # NOTE: make sure to insert your GH API key into the gh_crawler.py file. 12 | # python3 gh_crawler.py $lang 13 | for lang in ${langs[@]}; do 14 | python3 gh_crawler.py $lang; 15 | done 16 | 17 | # Clone repositories in parallel and extract all language-specific files. 18 | # cat 'TopLists/'$lang'-top-repos.txt' | xargs -P16 -n1 -I% bash clone_repo.sh % $lang 19 | for lang in ${langs[@]}; do 20 | cat 'TopLists/'$lang'-top-repos.txt' | xargs -P16 -n1 -I% bash clone_repo.sh % $lang 21 | done 22 | 23 | # Deduplicate code files. 24 | python3 deduplicate.py 25 | -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/PHP-top-repos.txt: -------------------------------------------------------------------------------- 1 | 2361 https://github.com/easychen/one-person-businesses-methodology-v2.0 2 | 2153 https://github.com/ellite/Wallos 3 | 2099 https://github.com/MlgmXyysd/Xiaomi-HyperOS-BootLoader-Bypass 4 | 1746 https://github.com/codehub666/94list 5 | 1314 https://github.com/cedar2025/Xboard 6 | 1037 https://github.com/Las-Fuerzas-Del-Cielo/Sistema-Anti-Fraude-Electoral 7 | 987 https://github.com/yebekhe/TelegramV2rayCollector 8 | 791 https://github.com/robsontenorio/mary 9 | 779 https://github.com/PHPCSStandards/PHP_CodeSniffer 10 | 693 https://github.com/vitodeploy/vito 11 | 664 https://github.com/php-youtubers/directory 12 | 654 https://github.com/theodo-group/LLPhant 13 | 645 https://github.com/laravel/pail 14 | 631 https://github.com/WendellAdriel/laravel-lift 15 | 623 https://github.com/spatie/laravel-pdf 16 | 596 https://github.com/pelican-dev/panel 17 | 570 https://github.com/tempestphp/highlight 18 | 553 https://github.com/xiaoxuan6/SMSBombing 19 | -------------------------------------------------------------------------------- /memorization_thresholds/humaneval_mbpp_get.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from datasets import load_dataset 4 | 5 | human_eval = load_dataset("openai/openai_humaneval")['test'] 6 | print(human_eval) 7 | mbpp = load_dataset("google-research-datasets/mbpp")['test'] 8 | print(mbpp) 9 | 10 | human_eval_df = pd.DataFrame(human_eval)[['prompt', 'canonical_solution']] 11 | human_eval_df['text'] = human_eval_df['prompt'] + '\n' + human_eval_df['canonical_solution'] 12 | human_eval_df = human_eval_df.drop(columns=['prompt', 'canonical_solution']) 13 | human_eval_df['corpus'] = 'human_eval' 14 | print(human_eval_df) 15 | mbpp_df = pd.DataFrame(mbpp)[['code']] 16 | mbpp_df = mbpp_df.rename(columns={'code': 'text'}) 17 | mbpp_df['corpus'] = 'mbpp' 18 | print(mbpp_df) 19 | 20 | combined_df = pd.concat([human_eval_df, mbpp_df], ignore_index=True) 21 | 
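# Tag every merged HumanEval/MBPP sample with a stable doc_id (its row index) before exporting the combined set to CSV.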
combined_df['doc_id'] = combined_df.index 22 | print(combined_df) 23 | combined_df.to_csv('../unlearning/data/human_eval_and_mbpp/unseen_data.csv', index=False) 24 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeDefect/README.md: -------------------------------------------------------------------------------- 1 | # CodeDefect finetuning 2 | In this folder we show how to train an autoregressive model on the [CodeDefect](https://huggingface.co/datasets/code_x_glue_cc_defect_detection) dataset, for the problem of predicting whether a piece of code is insecure or not. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs. 3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | To fine-tune a model on this dataset you can use the following command: 13 | ```python 14 | python train.py \ 15 | --model_ckpt microsoft/unixcoder-base-nine \ 16 | --num_epochs 30 \ 17 | --batch_size 8 \ 18 | --num_warmup_steps 10 \ 19 | --learning_rate 5e-4 \ 20 | --push_to_hub True 21 | ``` 22 | This will fine-tune your model, push it to the hub and print the evaluation accuracy on the test set. -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_swift.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from .safe_subprocess import run 6 | 7 | 8 | def eval_script(path: Path): 9 | basename = ".".join(str(path).split(".")[:-1]) 10 | r = run(["swiftc", path, "-o", basename], timeout_seconds=45) 11 | if r.timeout: 12 | status = "Timeout" 13 | elif r.exit_code != 0: 14 | # Well, it's a compile error. May be a type error or 15 | # something. But, why break the set convention 16 | status = "SyntaxError" 17 | else: 18 | r = run([basename], timeout_seconds=5) 19 | if r.timeout: 20 | status = "Timeout" 21 | elif r.exit_code != 0: 22 | # Well, it's a panic 23 | status = "Exception" 24 | else: 25 | status = "OK" 26 | os.remove(basename) 27 | return { 28 | "status": status, 29 | "exit_code": r.exit_code, 30 | "stdout": r.stdout, 31 | "stderr": r.stderr, 32 | } 33 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeClone/README.md: -------------------------------------------------------------------------------- 1 | # CodeClone finetuning 2 | In this folder we show how to train an autoregressive model on the [CodeClone](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench) dataset, for the binary classification problem of code equivalence prediction. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs.
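Because clone detection is a pair-classification task, each example encodes two code snippets through the tokenizer's sentence-pair interface before reaching the classifier. A minimal preprocessing sketch (the column names `func1`/`func2` are assumptions; see `train.py` for the exact implementation):
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base-nine")

def encode_pair(example):
    # Both candidate functions go in as one two-segment input; the model then
    # predicts a binary clone / not-clone label for the pair.
    return tokenizer(example["func1"], example["func2"],
                     truncation=True, max_length=512)
```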
3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | To fine-tune a model on this dataset you can use the following command: 13 | ```python 14 | python train_complexity_predictor.py \ 15 | --model_ckpt microsoft/unixcoder-base-nine \ 16 | --num_epochs 30 \ 17 | --batch_size 8 \ 18 | --num_warmup_steps 10 \ 19 | --learning_rate 5e-4 20 | --push_to_hub True 21 | ``` 22 | This will fine-tune your model, push it to the hub and print the evaluation accuracy on the test set. 23 | 24 | -------------------------------------------------------------------------------- /human_eval/tests/data/humaneval_gen_refs.json: -------------------------------------------------------------------------------- 1 | ["\n\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n\ncheck(has_close_elements)", "\n\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\ncheck(separate_paren_groups)"] -------------------------------------------------------------------------------- /human_eval/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md") as readme_file: 4 | readme = readme_file.read() 5 | 6 | with open("requirements.txt") as reqs_file: 7 | requirements = reqs_file.read().split("\n") 8 | 9 | ds1000_requirements = [ 10 | "DateTime==4.7", 11 | "gensim==4.2.0", 12 | "matplotlib==3.5.2", 13 | "numpy==1.21.6", 14 | "openai==0.23.0", 15 | "pandas==1.3.5", 16 | "pandas-datareader==0.10.0", 17 | "pathlib==1.0.1", 18 | "scikit-learn==1.0.2", 19 | "scipy==1.7.3", 20 | "seaborn==0.11.2", 21 | "statsmodels==0.13.2", 22 | "tensorflow==2.10.0", 23 | "tokenizers==0.12.1", 24 | "torchvision==0.13.1", 25 | "tqdm==4.64.1", 26 | "xgboost==1.6.2", 27 | "Pillow==9.2.0", 28 | ] 29 | 30 | setup( 31 | description="A framework for the evaluation of autoregressive code generation language models.", 32 | long_description=readme, 33 | license="Apache 2.0", 34 | packages=find_packages() , 35 | install_requires=requirements, 36 | extras_require={"ds1000": ds1000_requirements}, 37 | ) 38 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_ts.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .safe_subprocess import run 4 | 5 | 6 | def eval_script(path: Path): 7 | r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15) 8 | if r.exit_code != 0: 9 | return { 10 | "status": "SyntaxError", 11 | "exit_code": 
r.exit_code, 12 | "stdout": r.stdout, 13 | "stderr": r.stderr, 14 | } 15 | 16 | r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15) 17 | if r.timeout: 18 | status = "Timeout" 19 | elif r.exit_code == 0: 20 | status = "OK" 21 | elif "ERR_ASSERTION" in r.stderr: 22 | status = "AssertionError" 23 | elif "SyntaxError" in r.stderr: 24 | status = "SyntaxError" 25 | elif "ReferenceError" in r.stderr: 26 | status = "ReferenceError" 27 | else: 28 | status = "Exception" 29 | return { 30 | "status": status, 31 | "exit_code": r.exit_code, 32 | "stdout": r.stdout, 33 | "stderr": r.stderr, 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Zhaoyang Chu (储朝阳) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /human_eval/finetuning/APPS/README.md: -------------------------------------------------------------------------------- 1 | # APPS finetuning 2 | In this folder we show how to train an autoregressive Language model on APPS dataset, since a common way to evaluate on this benchmark is after finetuning the model on its training split. 3 | We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs. 4 | 5 | ## Setup 6 | 7 | First login to Weights & Biases 8 | ``` 9 | wandb login 10 | ``` 11 | 12 | You can finetune a model, `gpt_345_python_any_license` for example, by running: 13 | ```python 14 | # we use a global batch size of 256, here = 8 (GPUs) * 2 (batch_size_per_device) * 16 (gradient_accumulation) 15 | python apps_train.py \ 16 | --model_ckpt BigCode/gpt_345_python_any_license \ 17 | --num_epochs 10 \ 18 | --batch_size 2 \ 19 | --gradient_accumulation_steps 16 \ 20 | --learning_rate 5e-5 \ 21 | --eval_freq 250 \ 22 | --fp16 23 | ``` 24 | The fine-tuning takes 11h on 4 A100 GPUs. 25 | 26 | ## Acknowledgments 27 | 28 | This script is adapted from [APPS repository](https://github.com/hendrycks/apps). 
-------------------------------------------------------------------------------- /memorization_thresholds/clone_repo.sh: -------------------------------------------------------------------------------- 1 | # Clone a given repository, extract any files belonging to the given language, and delete the repository afterwards to save space. 2 | in=$1 3 | language=$2 4 | 5 | # Extract the org and name from lines formatted as stars\thttps://github.com/org/name 6 | repo=$(echo $in | cut -d$'\t' -f2); 7 | name_part=$(echo $repo | cut -d"/" -f4-6); 8 | name=$(echo $name_part | cut -d"/" -f2); 9 | org=$(echo $name_part | cut -d"/" -f1); 10 | echo "Cloning $org/$name" 11 | DIR=Repos/$language/$org; \ 12 | OUT=Code/$language/$org; \ 13 | # Skip repositories for which we already have extracted code files. 14 | if [ -d $OUT/$name ]; then echo "deja vu"; exit; fi; 15 | mkdir -p $DIR; \ 16 | mkdir -p $OUT; \ 17 | 18 | # Clone with depth=1 to only get most recent files, rather than entire history. 19 | if [ ! -d $DIR/$name ]; then 20 | git clone -q --depth 1 https://github.com/$org/$name $DIR/$name; 21 | # git clone -q --depth 1 git@github.com:$org/$name $DIR/$name; 22 | fi; 23 | 24 | # Extract all language-specific code files from the repository and delete it afterwards. 25 | python3 extract_code.py $language $DIR/$name $OUT/$name; 26 | rm -rf $DIR/$name 27 | -------------------------------------------------------------------------------- /human_eval/finetuning/Code-to-text/README.md: -------------------------------------------------------------------------------- 1 | # Code-to-text finetuning [WIP] 2 | In this folder we show how to train an autoregressive model on the [Code-to-text](https://huggingface.co/datasets/code_x_glue_ct_code_to_text) dataset, for generating natural language comments from code. We use Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) which supports distributed training on multiple GPUs. 3 | 4 | ## Setup 5 | 6 | First login to Weights & Biases and to Hugging Face hub if you want to push your model to the hub: 7 | ``` 8 | wandb login 9 | huggingface-cli login 10 | ``` 11 | 12 | During training, we use the code as input to the model and the docstring as the label.
To fine-tune a model on the Python dataset for example, you can use the following command: 13 | ```python 14 | python train.py \ 15 | --model_ckpt codeparrot/codeparrot-small \ 16 | --language Python \ 17 | --num_epochs 30 \ 18 | --batch_size 8 \ 19 | --num_warmup_steps 10 \ 20 | --learning_rate 5e-4 21 | --push_to_hub True 22 | ``` 23 | 24 | For the 2-shot evaluation we use as a prompt 25 | ``` 26 | Generate comments for these code snippets: 27 | Code: 28 | $CODE1 29 | Comment: 30 | $DOCSTRING1 31 | 32 | Code: 33 | CODE2 34 | Comment: 35 | $DOCSTRING2 36 | 37 | Code: $CODE 38 | """ 39 | ``` 40 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_go.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | from pathlib import Path 4 | from sys import exit 5 | 6 | from .generic_eval import main as gmain 7 | 8 | 9 | def eval_script(path: Path): 10 | status = None 11 | stdout = None 12 | stderr = None 13 | exit_code = None 14 | try: 15 | build = subprocess.run( 16 | ["go", "test", path], 17 | timeout=30, 18 | stdout=subprocess.PIPE, 19 | stderr=subprocess.PIPE, 20 | ) 21 | 22 | stdout = build.stdout.decode("utf-8", errors="ignore") 23 | stderr = build.stderr.decode("utf-8", errors="ignore") 24 | exit_code = build.returncode 25 | # write to stderr just so that we can redirect stdout to a csv 26 | 27 | if "[setup failed]" in stdout or "[build failed]" in stdout: 28 | status = "SyntaxError" 29 | elif "FAIL" in stdout: 30 | status = "Exception" 31 | else: 32 | status = "OK" 33 | except subprocess.TimeoutExpired: 34 | status = "Timeout" 35 | 36 | return { 37 | "status": status, 38 | "exit_code": exit_code, 39 | "stdout": stdout, 40 | "stderr": stderr, 41 | } 42 | 43 | 44 | if __name__ == "__main__": 45 | gmain(eval_script, "Go", ".go") 46 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_cpp.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .generic_eval import main 4 | from .safe_subprocess import run 5 | 6 | LANG_NAME = "C++" 7 | LANG_EXT = ".cpp" 8 | 9 | 10 | def eval_script(path: Path): 11 | basename = ".".join(str(path).split(".")[:-1]) 12 | build_result = run(["g++", path, "-o", basename, "-std=c++17"]) 13 | if build_result.exit_code != 0: 14 | return { 15 | "status": "SyntaxError", 16 | "exit_code": build_result.exit_code, 17 | "stdout": build_result.stdout, 18 | "stderr": build_result.stderr, 19 | } 20 | 21 | run_result = run([basename]) 22 | if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr: 23 | raise Exception("Skylake bug encountered") 24 | if "/4.8.2" in run_result.stderr: 25 | raise Exception("Ancient compiler encountered") 26 | if run_result.timeout: 27 | status = "Timeout" 28 | elif run_result.exit_code != 0: 29 | status = "Exception" 30 | else: 31 | status = "OK" 32 | return { 33 | "status": status, 34 | "exit_code": run_result.exit_code, 35 | "stdout": run_result.stdout, 36 | "stderr": run_result.stderr, 37 | } 38 | 39 | 40 | if __name__ == "__main__": 41 | main(eval_script, LANG_NAME, LANG_EXT) 42 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json: 
-------------------------------------------------------------------------------- 1 | {"danish":{"source1":"2 . Udfyld felterne i hvert trin i vejledningen . ","source2":"* Vise rapporter med finansposter og saldi . ","target1":"2 . Fill in the fields in each step of the guide . ","target2":"* View reports that show general ledger entries and balances . "},"chinese":{"source1":"返回 与 筛选器 初始化 由 平台 的 MCDRemoteSystemPlatformFilter 对象 。 ","source2":"用于 将 本地 的 ( 调用 ) 应用 程序 可 见性 首选 项 设置 发现 远程 系统 时 的 类 。 ","target1":"Returns an MCDRemoteSystemPlatformFilter object initialized with a filter by platform . ","target2":"A class used to set the local ( calling ) application visibility preference when discovering remote systems ."},"norwegian":{"source1":"Kosttypesaldo = Kostsentersaldo + Kostobjektsaldo ","source2":"* Vise en liste over bokføringsgrupper som du posterer til kontoen . ","target1":"Cost Type Balance = Cost Center Balance + Cost Object Balance ","target2":"* See a list of posting groups that post to that account . "},"latvian":{"source1":"# # < a name = " 6-change-the-status-of-the-conversion-record-to-ready " > < / a > 6 . Mainiet pārveidošanas ieraksta statusu uz Gatavs ","source2":"title : Preču saņemšanas reģistrēšana pirkšanas pasūtījumā ","target1":"# # 6 . Change the status of the conversion record to Ready ","target2":"title : Record the receipt of goods on the purchase order "}} -------------------------------------------------------------------------------- /human_eval/bigcode_eval/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class EvalArguments: 7 | """ 8 | Configuration for running the evaluation. 9 | """ 10 | prefix: Optional[str] = field( 11 | default="", 12 | metadata={ 13 | "help": "Prefix to add to the prompt. 
For example InCoder needs prefix='<| file ext=.py |>\n'" 14 | }, 15 | ) 16 | do_sample: Optional[bool] = field( 17 | default=True, 18 | metadata={"help": "Sample from the language model's output distribution."}, 19 | ) 20 | temperature: Optional[float] = field( 21 | default=0.2, metadata={"help": "Sampling temperature used for generation."} 22 | ) 23 | top_k: Optional[int] = field( 24 | default=0, metadata={"help": "Top-k parameter used for generation."} 25 | ) 26 | top_p: Optional[float] = field( 27 | default=0.95, metadata={"help": "Top-p parameter used for nucleus sampling."} 28 | ) 29 | n_samples: Optional[int] = field( 30 | default=1, 31 | metadata={"help": "Number of completions to generate for each sample."}, 32 | ) 33 | eos: Optional[str] = field( 34 | default="<|endoftext|>", metadata={"help": "end of sentence token."} 35 | ) 36 | seed: Optional[int] = field( 37 | default=0, metadata={"help": "Random seed used for evaluation."} 38 | ) 39 | -------------------------------------------------------------------------------- /human_eval/tests/data/humaneval_gen_gens.json: -------------------------------------------------------------------------------- 1 | [["from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\"jectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectject through through through through through through through through through through through through through through through through through through through through through through through through through through"], ["from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\" at at at at at at at at at at at at at at at at at at at at at at at fe fe fe fe fe fe fe fe fe fe fe fe"]] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_racket.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluates a generated Racket program (.rkt). 3 | """ 4 | import os 5 | from pathlib import Path 6 | 7 | from .safe_subprocess import run 8 | 9 | 10 | def eval_script(path: Path): 11 | result = run(["racket", str(path)]) 12 | 13 | if ( 14 | "standard-module-name-resolver: collection not found\n for module path: rackunit" 15 | in result.stderr 16 | ): 17 | print(f"Failed to run evaluation for {path}: rackunit is not installed") 18 | return None 19 | 20 | # rackunit produces exit code 0 even if tests fail. 
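# Treat any stderr output as a failed run, even when the exit code is 0.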
21 | if len(result.stderr) > 0 or result.exit_code != 0: 22 | if "read-syntax" in result.stderr: 23 | status = "SyntaxError" 24 | else: 25 | status = "Exception" 26 | else: 27 | status = "OK" 28 | 29 | return { 30 | "status": status, 31 | "exit_code": result.exit_code, 32 | "stdout": result.stdout, 33 | "stderr": result.stderr, 34 | } 35 | 36 | 37 | def main(): 38 | directory = Path( 39 | Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2" 40 | ).resolve() 41 | 42 | for filename in os.listdir(directory): 43 | r = eval_script(Path.joinpath(directory, filename)) 44 | filename = filename.split(".")[0] 45 | print(f"Racket,{filename},{r['status']}") 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /unlearning_preparation/forgotten_data_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datasets import load_from_disk 3 | import torch 4 | import random 5 | 6 | 7 | def example_filter(example): 8 | secret_mask = torch.BoolTensor(example['secret_mask']) 9 | return example['secret_mean_MA'] >= 0.9 and secret_mask.sum() >= 32 # Prioritize high-risk samples for unlearning 10 | 11 | 12 | def main(): 13 | ds_pii = load_from_disk(f"../sensitive_memorization/codeparrot-clean-train-secrets-probed-{args.model_name_or_path.split('/')[-1]}") 14 | ds_pii = ds_pii.filter(example_filter, num_proc=16) 15 | random.seed(42) 16 | 17 | indices = list(range(len(ds_pii))) 18 | random.shuffle(indices) 19 | 20 | for i in range(5): 21 | sampled_group = ds_pii.select(indices[i * args.k: (i + 1) * args.k]) 22 | print(sampled_group) 23 | sampled_group.save_to_disk(f"../unlearning/data/{args.model_name_or_path.split('/')[-1]}_secret/{args.model_name_or_path.split('/')[-1]}_forgot_set_{args.k}_{i}") 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--model_name_or_path", default="codeparrot/codeparrot", type=str, 29 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 30 | parser.add_argument('--k', type=int, default=32, 31 | help="The number of forgotten samples.") 32 | args = parser.parse_args() 33 | print(args) 34 | 35 | main() 36 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_scala.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from .safe_subprocess import run 5 | 6 | LANG_NAME = "Scala" 7 | LANG_EXT = ".scala" 8 | 9 | 10 | def eval_script(path: Path): 11 | with tempfile.TemporaryDirectory() as outdir: 12 | # Each Scala file contains the class with same name `JAVA_CLASS_NAME` 13 | # Hence, scalac will same JAVA_CLASS_NAME.class file for each problem 14 | # Write class for each problem to a different temp dir 15 | build = run(["scalac", "-d", outdir, path], timeout_seconds=45) 16 | if build.exit_code != 0: 17 | # Well, it's a compile error. May be a type error or 18 | # something. But, why break the set convention 19 | return { 20 | "status": "SyntaxError", 21 | "exit_code": build.exit_code, 22 | "stdout": build.stdout, 23 | "stderr": build.stderr, 24 | } 25 | # "Problem" is the name of the class we emit. 
26 | r = run(["scala", "-cp", f"{outdir}", "Problem"]) 27 | if r.timeout: 28 | status = "Timeout" 29 | elif r.exit_code == 0 and r.stderr == "": 30 | status = "OK" 31 | else: 32 | # Well, it's a panic 33 | status = "Exception" 34 | return { 35 | "status": status, 36 | "exit_code": r.exit_code, 37 | "stdout": r.stdout, 38 | "stderr": r.stderr, 39 | } 40 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/libeval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | from typing import List 5 | 6 | from . import generic_eval 7 | 8 | 9 | def testing_mail(x, y, z): 10 | generic_eval.gmain(x, y, z) 11 | 12 | 13 | def run_without_exn(args: List[str]): 14 | """ 15 | Runs the given program with a five second timeout. Does not throw an exception 16 | no matter what happens. The output is a dictionary of the format that we expect 17 | for our evaluation scripts. The "status" field is "OK" when the exit code is 18 | zero. If that isn't enough, you may want to tweak the status based on the 19 | captured stderr and stdout. 20 | """ 21 | p = subprocess.Popen( 22 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True 23 | ) 24 | try: 25 | stdout, stderr = p.communicate(timeout=5) 26 | exit_code = p.returncode 27 | status = "OK" if exit_code == 0 else "Exception" 28 | except subprocess.TimeoutExpired as exc: 29 | stdout, stderr = p.stdout.read(), p.stderr.read() 30 | os.killpg(os.getpgid(p.pid), signal.SIGTERM) 31 | exit_code = -1 32 | status = "Timeout" 33 | 34 | if stdout is None: 35 | stdout = b"" 36 | if stderr is None: 37 | stderr = b"" 38 | return { 39 | "status": status, 40 | "exit_code": exit_code, 41 | "stdout": stdout.decode("utf-8", errors="ignore"), 42 | "stderr": stderr.decode("utf-8", errors="ignore"), 43 | } 44 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_ruby.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from .generic_eval import main as gmain 6 | 7 | 8 | def eval_script(path: Path): 9 | try: 10 | # Assumes exit-code 0 is all okay 11 | # Need check=True for Ruby to pass errors to CalledProcessError 12 | output = subprocess.run( 13 | ["ruby", path], check=True, capture_output=True, timeout=5 14 | ) 15 | if output.returncode == 0: 16 | status = "OK" 17 | out = output.stderr 18 | error = output.stdout 19 | returncode = 0 20 | else: 21 | raise Exception("there's an issue with check = True for Ruby, INVESTIGATE!") 22 | except subprocess.TimeoutExpired as exc: 23 | status = "Timeout" 24 | out = exc.stdout 25 | error = exc.stderr 26 | returncode = -1 27 | except subprocess.CalledProcessError as exc: 28 | returncode = exc.returncode 29 | out = exc.stdout 30 | error = exc.stderr 31 | # failure with code 1 but no error message is an Exception from Failed tests 32 | if len(error) < 1: 33 | status = "Exception" 34 | else: # everything that prints out an error message is a SyntaxError 35 | status = "SyntaxError" 36 | return { 37 | "status": status, 38 | "exit_code": returncode, 39 | "stdout": out, 40 | "stderr": error, 41 | } 42 | 43 | 44 | if __name__ == "__main__": 45 | gmain(eval_script, "Ruby", ".rb") 46 | 
-------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_r.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | 6 | def eval_script(path: Path): 7 | try: 8 | # Assumes exit-code 0 is all okay 9 | # Run R on the file, capturing stderr 10 | output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5) 11 | if output.returncode == 0: 12 | status = "OK" 13 | else: 14 | outmessage = str(output) 15 | if "unexpected" in outmessage: 16 | status = "SyntaxError" 17 | elif "err=b''" in outmessage: 18 | status = "AssertionError" 19 | else: 20 | status = "Exception" 21 | returncode = output.returncode 22 | except subprocess.TimeoutExpired as exc: 23 | status = "Timeout" 24 | output = exc 25 | returncode = -1 26 | except subprocess.CalledProcessError as exc: 27 | status = "Exception" 28 | returncode = exc.returncode 29 | output = exc 30 | return { 31 | "status": status, 32 | "exit_code": returncode, 33 | "stdout": output.stdout, 34 | "stderr": output.stderr, 35 | } 36 | 37 | 38 | def main(): 39 | directory = Path( 40 | Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2" 41 | ).resolve() 42 | 43 | for filename in os.listdir(directory): 44 | r = eval_script(Path.joinpath(directory, filename)) 45 | filename = filename.split(".")[0] 46 | print(f"R,{filename},{r['status']}") 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /human_eval/leaderboard/group_jsons.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import json 4 | import os 5 | import glob 6 | 7 | 8 | parser = argparse.ArgumentParser(description='Process metric files') 9 | parser.add_argument('--metrics_path', type=str, required=True, help='Path where metric files are stored') 10 | parser.add_argument('--model', type=str, required=True, help='Name of the model') 11 | parser.add_argument('--org', type=str, required=True, help='Organization/user hosting the model') 12 | parser.add_argument('--username', type=str, required=True, help='Your HF username') 13 | args = parser.parse_args() 14 | 15 | 16 | # List of valid tasks 17 | valid_tasks = ["humaneval"] + ["multiple-" + lang for lang in ["js", "java", "cpp", "swift", "php", "d", "jl", "lua", "r", "rkt", "rb", "rs"]] 18 | 19 | final_results = {"results": [], "meta": {"model": f"{args.org}/{args.model}"}} 20 | 21 | # Iterate over all .json files in the metrics_path 22 | for json_file in glob.glob(os.path.join(args.metrics_path, '*.json')): 23 | 24 | # Extract task from file name 25 | print(f"Processing {json_file}") 26 | task = os.path.splitext(os.path.basename(json_file))[0].split('_')[1] 27 | if task not in valid_tasks: 28 | print(f"Skipping invalid task: {task}") 29 | continue 30 | 31 | with open(json_file, 'r') as f: 32 | data = json.load(f) 33 | 34 | pass_at_1 = data.get(task, {}).get("pass@1", None) 35 | output = {"task": task, "pass@1": pass_at_1} 36 | final_results["results"].append(output) 37 | 38 | 39 | with open(f"{args.org}_{args.model}_{args.username}.json", 'w') as f: 40 | json.dump(final_results, f) 41 | 42 | print(f"Saved {args.org}_{args.model}_{args.username}.json") -------------------------------------------------------------------------------- 
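For reference, a sketch of the JSON shape that group_jsons.py above writes; the task names, scores, and model name here are placeholders, not real results.

# Illustrative only: the structure mirrors final_results in the script above.
example_final_results = {
    "results": [
        {"task": "humaneval", "pass@1": 0.25},
        {"task": "multiple-js", "pass@1": 0.19},
    ],
    "meta": {"model": "myorg/my-model"},
}
# Saved as "<org>_<model>_<username>.json" in the directory where the script is run.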
/human_eval/tests/data/mbpp_gen_gens.json: -------------------------------------------------------------------------------- 1 | [["\"\"\"\nWrite a python function to remove first and last occurrence of a given character from the string.\nassert remove_Occ(\"hello\",\"l\") == \"heo\"\n\"\"\"\normormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormorm only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only"], ["\"\"\"\nWrite a function to sort a given matrix in ascending order according to the sum of its rows.\nassert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]\n\"\"\"\n at at at at at at feiririririririririririririrherherherherher who who who who who who who who who who who who who who who who fe fe fe fe fe fe fe fejjjjjjjjjjjjjjjjjjjjjififififififififififififififiriririririririrGGGGGGGedededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededed"]] -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_java.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from pathlib import Path 4 | 5 | from .generic_eval import main 6 | from .safe_subprocess import run 7 | 8 | LANG_NAME = "Java" 9 | LANG_EXT = ".java" 10 | 11 | # Following files have problems: 12 | # 137, 13 | # 22: Any 14 | # 148: Elipsis 15 | 16 | 17 | def eval_script(path: Path): 18 | 19 | sys_env = os.environ.copy() 20 | javatuples_path = Path("/usr/multiple/javatuples-1.2.jar") 21 | 22 | sys_env["CLASSPATH"] = f"{javatuples_path}" 23 | 24 | with tempfile.TemporaryDirectory() as outdir: 25 | # Each Java file contains the class with same name `JAVA_CLASS_NAME` 26 | # Hence, javac will same JAVA_CLASS_NAME.class file for each problem 27 | # Write class for each problem to a different temp dir 28 | # Use UTF8 encoding with javac 29 | result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env) 30 | 31 | if result.exit_code != 0: 32 | # Well, it's a compile error. May be a type error or 33 | # something. 
But, why break the set convention 34 | status = "SyntaxError" 35 | else: 36 | result = run(["java", "-ea", "-cp", f"{outdir}", "Problem"], env=sys_env) 37 | if result.timeout: 38 | status = "Timeout" 39 | elif result.exit_code == 0: 40 | status = "OK" 41 | else: 42 | status = "Exception" 43 | 44 | return { 45 | "status": status, 46 | "exit_code": result.exit_code, 47 | "stdout": result.stdout, 48 | "stderr": result.stderr, 49 | } 50 | 51 | 52 | if __name__ == "__main__": 53 | main(eval_script, LANG_NAME, LANG_EXT) 54 | -------------------------------------------------------------------------------- /human_eval/finetuning/README.md: -------------------------------------------------------------------------------- 1 | # Finetuning 2 | In this folder we show how to fine-tune an autoregressive language model on the following evaluation and downstream tasks, with support for 7 programming languages: 3 | 4 | * [APPS](https://huggingface.co/datasets/codeparrot/apps): Python benchmark to evaluate code generation. It is similar to HumanEval and MBPP, but it is more challenging and has more evaluation problems. 5 | * [CodeComplex](https://huggingface.co/datasets/codeparrot/codecomplex): **Java** benchmark with a classification problem to predict the algorithmic complexity of Java programs among 7 labels. 6 | * [CodeClone](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench): **Java** benchmark from the [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) dataset, with a binary classification problem of predicting the semantic equivalence of two programs. [WIP] 7 | * [CodeDefect](https://huggingface.co/datasets/code_x_glue_cc_defect_detection): **C** benchmark from [CodeXGLUE](https://github.com/microsoft/CodeXGLUE), with a binary classification problem of predicting whether a piece of code is insecure and may expose software systems to attacks. [WIP] 8 | * [Code-to-text](https://huggingface.co/datasets/code_x_glue_ct_code_to_text): Dataset from [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) for generating natural language comments from code in **Python, Go, Java, Javascript, PHP and Ruby**. This task can also be done in a zero-shot setting without the need for fine-tuning. [WIP] 9 | 10 | We use the Hugging Face [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) API for all tasks, which supports distributed training on multiple GPUs. 11 | 12 | The evaluation score on the test set is shown at the end of fine-tuning. For implementation details, please refer to the README inside each folder. 13 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from pprint import pprint 3 | 4 | from .
import (apps, codexglue_code_to_text, codexglue_text_to_text, conala, 5 | concode, ds1000, gsm, humaneval, humanevalpack, 6 | instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple, 7 | parity, python_bugs, quixbugs, recode) 8 | 9 | TASK_REGISTRY = { 10 | **apps.create_all_tasks(), 11 | **codexglue_code_to_text.create_all_tasks(), 12 | **codexglue_text_to_text.create_all_tasks(), 13 | **multiple.create_all_tasks(), 14 | "codexglue_code_to_text-python-left": codexglue_code_to_text.LeftCodeToText, 15 | "conala": conala.Conala, 16 | "concode": concode.Concode, 17 | **ds1000.create_all_tasks(), 18 | **humaneval.create_all_tasks(), 19 | **humanevalpack.create_all_tasks(), 20 | "mbpp": mbpp.MBPP, 21 | "parity": parity.Parity, 22 | "python_bugs": python_bugs.PythonBugs, 23 | "quixbugs": quixbugs.QuixBugs, 24 | "instruct_wizard_humaneval": instruct_wizard_humaneval.HumanEvalWizardCoder, 25 | **gsm.create_all_tasks(), 26 | **instruct_humaneval.create_all_tasks(), 27 | **recode.create_all_tasks(), 28 | } 29 | 30 | ALL_TASKS = sorted(list(TASK_REGISTRY)) 31 | 32 | 33 | def get_task(task_name, args=None): 34 | try: 35 | kwargs = {} 36 | if "prompt" in inspect.signature(TASK_REGISTRY[task_name]).parameters: 37 | kwargs["prompt"] = args.prompt 38 | if "load_data_path" in inspect.signature(TASK_REGISTRY[task_name]).parameters: 39 | kwargs["load_data_path"] = args.load_data_path 40 | return TASK_REGISTRY[task_name](**kwargs) 41 | except KeyError: 42 | print("Available tasks:") 43 | pprint(TASK_REGISTRY) 44 | raise KeyError(f"Missing task {task_name}") 45 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_javascript.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | 6 | def eval_script(path: Path): 7 | try: 8 | # Assumes exit-code 0 is all okay 9 | output = subprocess.run(["node", str(path)], capture_output=True, timeout=5) 10 | 11 | if output.returncode == 0: 12 | status = "OK" 13 | else: 14 | outmessage = str(output) 15 | if "ERR_ASSERTION" in outmessage: 16 | status = "AssertionError" 17 | elif "SyntaxError" in outmessage: 18 | status = "SyntaxError" 19 | elif "ReferenceError" in outmessage: 20 | status = "ReferenceError" 21 | else: 22 | status = "Exception" 23 | returncode = output.returncode 24 | except subprocess.TimeoutExpired as exc: 25 | status = "Timeout" 26 | output = exc 27 | returncode = -1 28 | except subprocess.CalledProcessError as exc: 29 | status = "Exception" 30 | returncode = exc.returncode 31 | output = exc 32 | return { 33 | "status": status, 34 | "exit_code": returncode, 35 | "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"), 36 | "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"), 37 | } 38 | 39 | 40 | def main(): 41 | directory = Path( 42 | Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2" 43 | ).resolve() 44 | 45 | for filename in os.listdir(directory): 46 | r = eval_script(Path.joinpath(directory, filename)) 47 | filename = filename.split(".")[0] 48 | print(f"JavaScript,{filename},{r['status']}") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /sensitive_memorization/analyze.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from datasets 
import load_from_disk 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | sns.set(style="whitegrid") 7 | plt.rcParams["font.family"] = "Times New Roman" 8 | 9 | 10 | def main(): 11 | models = ['codeparrot-small', 'codegen-350M-mono'] 12 | model_names = ['CodeParrot-small', 'CodeGen-350M-Mono'] 13 | MA_thresholds = [0.4557, 0.4879] 14 | colors = ['#CCDAED', '#E2F0D9'] 15 | fig, axs = plt.subplots(1, 2, figsize=(24, 5), constrained_layout=True) 16 | 17 | for i in range(len(models)): 18 | print(models[i]) 19 | ds_pii = load_from_disk(f"./codeparrot-clean-train-secrets-probed-{models[i]}") 20 | print(len(ds_pii)) 21 | ds_pii_temp = ds_pii.filter(lambda example: example['secret_mean_MA'] > MA_thresholds[i], num_proc=16) 22 | print(len(ds_pii_temp)) 23 | 24 | ax = axs[i] 25 | n, bins, patches = ax.hist(ds_pii['secret_mean_MA'], bins=40, color=colors[i], edgecolor='black', alpha=0.7, linewidth=2) 26 | ax.axvline(MA_thresholds[i], color='black', linestyle='dashed', linewidth=3) 27 | ax.text(MA_thresholds[i] - 0.475, ax.get_ylim()[1] * 0.875, f'Forgetting Threshold: {MA_thresholds[i]}', color='black', fontsize=27) 28 | ax.set_xticks([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) 29 | ax.set_xlabel(model_names[i], fontsize=30, labelpad=15) 30 | ax.tick_params(axis='both', which='major', labelsize=27) 31 | if i == 0: 32 | ax.set_ylabel('Frequency', fontsize=30, labelpad=15) 33 | ax.grid(True, linestyle='--', alpha=0.7) 34 | 35 | fig.savefig(r"MemorizationDistribution.jpg", dpi=300, bbox_inches='tight') 36 | fig.savefig(r"MemorizationDistribution.pdf", bbox_inches='tight') 37 | plt.show() 38 | 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_rust.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from .generic_eval import main 6 | 7 | LANG_NAME = "Rust" 8 | LANG_EXT = ".rs" 9 | 10 | 11 | def eval_script(path: Path): 12 | basename = ".".join(str(path).split(".")[:-1]) 13 | try: 14 | build = subprocess.run( 15 | ["rustc", path, "-o", basename], capture_output=True, timeout=15 16 | ) 17 | except subprocess.TimeoutExpired as exc: 18 | return { 19 | "status": "Timeout", 20 | "exit_code": -1, 21 | "stdout": "Compiler timeout", 22 | "stderr": "Compiler timeout", 23 | } 24 | status = None 25 | returncode = -1 26 | output = None 27 | if build.returncode != 0: 28 | # Well, it's a compile error. May be a type error or 29 | # something. 
But, why break the set convention 30 | status = "SyntaxError" 31 | returncode = build.returncode 32 | output = build 33 | else: 34 | try: 35 | # Assumes exit-code 0 is all okay 36 | output = subprocess.run([basename], capture_output=True, timeout=5) 37 | returncode = output.returncode 38 | if output.returncode == 0: 39 | status = "OK" 40 | else: 41 | # Well, it's a panic 42 | status = "Exception" 43 | except subprocess.TimeoutExpired as exc: 44 | status = "Timeout" 45 | output = exc 46 | os.remove(basename) 47 | return { 48 | "status": status, 49 | "exit_code": returncode, 50 | "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"), 51 | "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"), 52 | } 53 | 54 | 55 | if __name__ == "__main__": 56 | main(eval_script, LANG_NAME, LANG_EXT) 57 | -------------------------------------------------------------------------------- /memorization_thresholds/extract_code.py: -------------------------------------------------------------------------------- 1 | """Copies all files belonging to a given language to a new directory.""" 2 | import os 3 | import sys 4 | from shutil import copyfile 5 | 6 | import pygments 7 | from pygments.lexers import get_lexer_by_name 8 | from pygments.token import Token 9 | 10 | # Basic config options. 11 | MAX_FILE_SIZE = 1024 ** 2 # 1 MB 12 | MIN_FILE_TOKENS = 100 13 | 14 | def main(): 15 | if len(sys.argv) <= 3: 16 | raise ValueError('Provide a language, source directory and target directory.') 17 | 18 | language = sys.argv[1] 19 | proj_dir = sys.argv[2] 20 | out_dir = sys.argv[3] 21 | 22 | # Use Pygments to get language extensions. 23 | lexer = get_lexer_by_name(language) 24 | language_extensions = set(ext.lower()[1:] for ext in lexer.filenames) 25 | 26 | print(f'Processing: {proj_dir}') 27 | if not os.path.exists(out_dir): 28 | os.makedirs(out_dir) 29 | 30 | files_found = 0 31 | for root, _, files in os.walk(proj_dir): 32 | for file in files: 33 | if any(file.endswith(ext) for ext in language_extensions): 34 | in_path = os.path.join(root, file) 35 | if not os.path.exists(in_path): # Can happen due to broken symlinks. 36 | continue 37 | if os.path.getsize(in_path) > MAX_FILE_SIZE: # Drop excessively long files. 38 | continue 39 | with open(in_path, errors='ignore') as f_in: 40 | text = f_in.read() 41 | if sum(1 for _ in pygments.lex(text, lexer)) < MIN_FILE_TOKENS: # Drop files with too few tokens. 42 | continue 43 | 44 | # Copy all other files to the target directory using a simplified path. 
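# For illustration (hypothetical values): with proj_dir="Code/Ruby/org/repo" and
# root="Code/Ruby/org/repo/lib/util", rel_path becomes "lib__util" and a file named
# "helpers.rb" is copied to "<out_dir>/lib__util__helpers.rb"; files sitting directly
# in proj_dir get no "__" prefix.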
45 | rel_path = root[len(proj_dir)+1:].replace('/', '__') 46 | out_path = os.path.join(out_dir, rel_path + ('__' if rel_path else '') + file) 47 | if not os.path.exists(out_path): 48 | try: 49 | copyfile(in_path, out_path) 50 | except Exception as e: 51 | print(f'Skipping problematic file {in_path} due to: {e}') 52 | files_found += 1 53 | print(f'Done processing; copied {files_found} files.') 54 | 55 | 56 | if __name__ == '__main__': 57 | main() -------------------------------------------------------------------------------- /human_eval/Dockerfile-multiple: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | RUN apt-get update -yqq && apt-get install -yqq curl build-essential python3-pip python3-tqdm 3 | RUN apt-get install racket -yqq 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | ENV TZ=Etc/UTC 6 | RUN apt-get install -yqq \ 7 | default-jdk-headless \ 8 | golang-go \ 9 | php-cli \ 10 | ruby \ 11 | lua5.3 \ 12 | r-base \ 13 | rustc \ 14 | scala 15 | 16 | RUN apt-get install -yqq libtest-deep-perl 17 | RUN apt-get install -yqq wget 18 | 19 | # JS/TS 20 | RUN curl -fsSL https://deb.nodesource.com/setup_current.x | bash - 21 | RUN apt-get install -y nodejs 22 | RUN npm install -g typescript 23 | 24 | # Dlang 25 | RUN wget https://netcologne.dl.sourceforge.net/project/d-apt/files/d-apt.list -O /etc/apt/sources.list.d/d-apt.list 26 | RUN apt-get update --allow-insecure-repositories 27 | RUN apt-get -y --allow-unauthenticated install --reinstall d-apt-keyring 28 | RUN apt-get update && apt-get install -yqq dmd-compiler dub 29 | 30 | # C# 31 | RUN apt install gnupg ca-certificates 32 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF 33 | RUN echo "deb https://download.mono-project.com/repo/ubuntu stable-focal main" | tee /etc/apt/sources.list.d/mono-official-stable.list 34 | RUN apt update 35 | RUN apt install -yqq mono-devel 36 | 37 | # Post-processing 38 | 39 | # Julia 40 | RUN curl https://julialang-s3.julialang.org/bin/linux/x64/1.8/julia-1.8.2-linux-x86_64.tar.gz | tar xz 41 | ENV PATH="/julia-1.8.2/bin:${PATH}" 42 | # Swift 43 | RUN curl https://download.swift.org/swift-5.7-release/ubuntu2204/swift-5.7-RELEASE/swift-5.7-RELEASE-ubuntu22.04.tar.gz | tar xz 44 | ENV PATH="/swift-5.7-RELEASE-ubuntu22.04/usr/bin:${PATH}" 45 | # Javatuples 46 | RUN mkdir /usr/multiple && wget https://repo.mavenlibs.com/maven/org/javatuples/javatuples/1.2/javatuples-1.2.jar -O /usr/multiple/javatuples-1.2.jar 47 | # Luaunit 48 | RUN apt-get update -yqq && apt-get install -yqq lua-unit 49 | 50 | # Standard requirements 51 | COPY . /app 52 | WORKDIR /app 53 | RUN test -f /app/generations.json && rm /app/generations.json || true 54 | 55 | RUN pip3 install . 56 | CMD ["python3", "main.py"] 57 | -------------------------------------------------------------------------------- /human_eval/leaderboard/multiple_eval.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=1 3 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
4 | #SBATCH --cpus-per-task=48 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --partition=production-cluster 7 | #SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out 8 | 9 | set -x -e 10 | source /admin/home/loubna/.bashrc 11 | 12 | conda activate brr4 13 | 14 | # File Path setup 15 | echo "START TIME: $(date)" 16 | 17 | GPUS_PER_NODE=4 18 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 19 | MASTER_PORT=6000 20 | NNODES=$SLURM_NNODES 21 | NODE_RANK=$SLURM_PROCID 22 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 23 | 24 | 25 | model=$1 26 | task=$2 27 | org=$3 28 | out_path=$4 29 | 30 | CMD="\ 31 | /fsx/loubna/code/bigcode-evaluation-harness/main.py \ 32 | --model $org/$model \ 33 | --tasks $task \ 34 | --max_length_generation 512 \ 35 | --batch_size 50 \ 36 | --n_samples 50 \ 37 | --temperature 0.2 \ 38 | --precision bf16 \ 39 | --allow_code_execution \ 40 | --trust_remote_code \ 41 | --save_generations \ 42 | --use_auth_token \ 43 | --generation_only \ 44 | --save_generations_path $out_path/generations_$task\_$model.json \ 45 | " 46 | 47 | export LAUNCHER="accelerate launch \ 48 | --multi_gpu \ 49 | --num_machines $NNODES \ 50 | --num_processes $WORLD_SIZE \ 51 | --main_process_ip "$MASTER_ADDR" \ 52 | --main_process_port $MASTER_PORT \ 53 | --num_processes $WORLD_SIZE \ 54 | --machine_rank \$SLURM_PROCID \ 55 | --role $SLURMD_NODENAME: \ 56 | --rdzv_conf rdzv_backend=c10d \ 57 | --max_restarts 0 \ 58 | --tee 3 \ 59 | " 60 | 61 | # force crashing on nccl issues like hanging broadcast 62 | export NCCL_ASYNC_ERROR_HANDLING=1 63 | 64 | # AWS specific 65 | export NCCL_PROTO=simple 66 | export RDMAV_FORK_SAFE=1 67 | export FI_EFA_FORK_SAFE=1 68 | export FI_EFA_USE_DEVICE_RDMA=1 69 | export FI_PROVIDER=efa 70 | export FI_LOG_LEVEL=1 71 | export NCCL_IB_DISABLE=1 72 | export NCCL_SOCKET_IFNAME=ens 73 | 74 | echo $CMD 75 | 76 | SRUN_ARGS=" \ 77 | --wait=60 \ 78 | --kill-on-bad-exit=1 \ 79 | " 80 | 81 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER $CMD" 82 | 83 | echo "END TIME: $(date)" -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_cs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | from .generic_eval import main 5 | 6 | LANG_NAME = "CSharp" 7 | LANG_EXT = ".cs" 8 | 9 | # Following files have problems: 10 | # 137, 11 | # 22: Any 12 | # 148: Elipsis 13 | 14 | 15 | def eval_script(path: str): 16 | if ".cs" not in path.name: 17 | return 18 | basename = ".".join(str(path).split(".")[:-1]) 19 | binaryname = basename + ".exe" 20 | build = subprocess.run( 21 | ["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"], 22 | capture_output=True, 23 | ) 24 | status = None 25 | returncode = -1 26 | output = None 27 | if build.returncode != 0: 28 | # Well, it's a compile error. May be a type error or 29 | # something. 
But, why break the set convention 30 | status = "SyntaxError" 31 | returncode = build.returncode 32 | output = build 33 | else: 34 | try: 35 | output = subprocess.run( 36 | ["mono", binaryname], 37 | env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER": "Console.Error"}, 38 | capture_output=True, 39 | timeout=5, 40 | ) 41 | returncode = output.returncode 42 | output.stderr = str(output.stderr, "utf-8") 43 | # mono return 0 even when failing 44 | fail = ( 45 | "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr 46 | or "Unhandled Exception" in output.stderr 47 | ) 48 | output.returncode = 1 if fail else 0 49 | if output.returncode == 0: 50 | status = "OK" 51 | else: 52 | # Well, it's a panic 53 | status = "Exception" 54 | except subprocess.TimeoutExpired as exc: 55 | status = "Timeout" 56 | output = exc 57 | os.remove(binaryname) 58 | 59 | if output.stdout is not None: 60 | output.stdout = output.stdout.decode("utf-8") 61 | else: 62 | output.stdout = "None" 63 | 64 | if output.stderr == "": 65 | output.stderr = "None" 66 | 67 | return { 68 | "status": status, 69 | "exit_code": returncode, 70 | "stdout": output.stdout, 71 | "stderr": output.stderr, 72 | } 73 | 74 | 75 | if __name__ == "__main__": 76 | main(eval_script, LANG_NAME, LANG_EXT) 77 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/eval_dlang.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from pathlib import Path 4 | 5 | from .safe_subprocess import run 6 | 7 | ENABLE_SYNTAX_CHECK = False 8 | 9 | 10 | def eval_script(path: Path): 11 | result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15) 12 | if "might not be correctly installed" in result.stderr: 13 | raise Exception("D is not correctly installed") 14 | 15 | if result.timeout: 16 | status = "Timeout" 17 | elif result.exit_code == 0: 18 | status = "OK" 19 | elif "Error:" in result.stderr: 20 | status = "SyntaxError" 21 | else: 22 | status = "Exception" 23 | 24 | return { 25 | "status": status, 26 | "exit_code": result.exit_code, 27 | "stdout": result.stdout, 28 | "stderr": result.stderr, 29 | } 30 | 31 | 32 | DIR = "d-keep-code_davinci_001_temp_0.2" 33 | 34 | 35 | def main(): 36 | directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve() 37 | 38 | count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0} 39 | for filename in os.listdir(directory): 40 | path = Path.joinpath(directory, filename) 41 | r = eval_script(path) 42 | status = r["status"] 43 | count[status] += 1 44 | 45 | if ENABLE_SYNTAX_CHECK and status == "SyntaxError": 46 | error_msgs = r["stderr"].split("\n") 47 | with open(path) as source_file: 48 | lines = source_file.readlines() 49 | unittest_line_start = lines.index("unittest\n") 50 | unittest_line_end = len(lines) 51 | for err_msg_line in error_msgs: 52 | matched_parts = re.match( 53 | r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)", 54 | err_msg_line[2:-1], 55 | ) 56 | _file, line_num = matched_parts[1], int(matched_parts[2]) 57 | if ( 58 | unittest_line_start <= line_num 59 | and line_num <= unittest_line_end 60 | ): 61 | print("===============") 62 | print(path, "contains error in unit test part") 63 | print(error_msgs) 64 | print("===============") 65 | 66 | filename = filename.split(".")[0] 67 | print(f"Dlang,{filename},{status}") 68 | 69 | print(DIR + ":" + str(count)) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | 
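The per-language evaluators above all return the same result shape ({"status", "exit_code", "stdout", "stderr"}), with statuses such as "OK", "SyntaxError", "Exception", and "Timeout". A small aggregation sketch (not part of the harness), similar in spirit to the count dict used in main() above:

from collections import Counter


def summarize_statuses(results):
    """results: an iterable of dicts shaped like the eval_script return values."""
    return Counter(r["status"] for r in results)


# Example:
# summarize_statuses([{"status": "OK"}, {"status": "Timeout"}, {"status": "OK"}])
# -> Counter({'OK': 2, 'Timeout': 1})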
-------------------------------------------------------------------------------- /sensitive_memorization/tokenize_secrets_and_prefixes.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from datasets import load_from_disk 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | 7 | 8 | def tokenize_secret(example): 9 | content = example['content'] 10 | secrets = eval(example['secrets']) 11 | example['secret_token_ids'] = [] 12 | for index in range(len(secrets)): 13 | secret = secrets[index]['value'] 14 | secret_token_ids = tokenizer.encode(secret, return_tensors='pt', max_length=max_secret_len, truncation=True, padding='max_length') 15 | example['secret_token_ids'].append(secret_token_ids) 16 | example['secret_token_ids'] = torch.cat(example['secret_token_ids'], dim=0) 17 | return example 18 | 19 | 20 | def tokenize_secret_prefix(example): 21 | content = example['content'] 22 | secrets = eval(example['secrets']) 23 | example['secret_prefix_token_ids'] = [] 24 | for index in range(len(secrets)): 25 | secret_prefix = content[:secrets[index]['start']] # Extract the context leading up to the secret 26 | # secret_prefix_token_ids = tokenizer.encode(secret_prefix, return_tensors='pt')[..., -1 * max_prefix_len:] 27 | secret_prefix_token_ids = tokenizer.encode(secret_prefix, return_tensors='pt', max_length=max_prefix_len, truncation=True, padding='max_length') 28 | example['secret_prefix_token_ids'].append(secret_prefix_token_ids) 29 | example['secret_prefix_token_ids'] = torch.cat(example['secret_prefix_token_ids'], dim=0) 30 | return example 31 | 32 | 33 | def main(): 34 | dataset_path = f"./codeparrot-clean-train-secrets-tokenized-{args.model_name_or_path.split('/')[-1]}" 35 | if os.path.exists(dataset_path): 36 | ds_pii = load_from_disk(dataset_path) 37 | else: 38 | ds_pii = load_from_disk(f"codeparrot-clean-train-secrets-masked-{args.model_name_or_path.split('/')[-1]}") 39 | ds_pii = ds_pii.map(tokenize_secret, num_proc=32) 40 | tokenizer.truncation_side = "left" 41 | tokenizer.padding_side = "left" 42 | ds_pii = ds_pii.map(tokenize_secret_prefix, num_proc=16) 43 | ds_pii.save_to_disk(dataset_path) 44 | print(ds_pii) 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("--model_name_or_path", default="codeparrot/codeparrot", type=str, 50 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 51 | args = parser.parse_args() 52 | print(args) 53 | 54 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, local_files_only=True) 55 | # tokenizer.pad_token = tokenizer.eos_token 56 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 57 | 58 | max_secret_len = 32 59 | max_prefix_len = 128 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/few_shot_examples/gsm8k_few_shot_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "questions": ["Olivia has $23. She bought five bagels for $3 each. How much money does she have left?", 3 | "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 4 | "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. 
How many computers are now in the server room?", 5 | "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?", 6 | "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?", 7 | "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?", 8 | "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 9 | "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?"], 10 | "solutions": [" money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result", 11 | " golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result", 12 | " computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result", 13 | " toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result", 14 | " jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result", 15 | " leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result", 16 | " cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result", 17 | " trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result"] 18 | } -------------------------------------------------------------------------------- /human_eval/bigcode_eval/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from warnings import warn 3 | 4 | from datasets import load_dataset 5 | 6 | 7 | class Task(ABC): 8 | """A task represents an entire benchmark including its dataset, problems, 9 | answers, generation settings and evaluation methods. 10 | """ 11 | 12 | # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub 13 | DATASET_PATH: str = None 14 | 15 | # The name of a subset within `DATASET_PATH`. 16 | DATASET_NAME: str = None 17 | 18 | def __init__(self, stop_words=None, requires_execution=True): 19 | """ 20 | :param stop_words: list 21 | list of stop words if the generation uses a stopping criteria during generation 22 | :param requires_execution: bool 23 | wheter the task requires code execution during evaluation or not 24 | """ 25 | self.stop_words = stop_words 26 | self.requires_execution = requires_execution 27 | try: 28 | self.dataset = load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME) 29 | except Exception as e: 30 | warn( 31 | f"Loading the dataset failed with {str(e)}. 
This task will use a locally downloaded dataset, not from the HF hub." 32 | ) 33 | 34 | @abstractmethod 35 | def get_dataset(self): 36 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 37 | return [] 38 | 39 | def fewshot_examples(self): 40 | """Loads and returns the few-shot examples for the task if they exist.""" 41 | pass 42 | 43 | @abstractmethod 44 | def get_prompt(self, doc): 45 | """Builds the prompt for the LM to generate from. 46 | :param doc: dict[str: str] 47 | sample from the test dataset 48 | """ 49 | pass 50 | 51 | @abstractmethod 52 | def get_reference(self, doc): 53 | """Builds the reference solution for the doc. 54 | :param doc: dict[str: str] 55 | sample from the test dataset 56 | """ 57 | pass 58 | 59 | @abstractmethod 60 | def postprocess_generation(self, generation, idx): 61 | """Defines the postprocessing for a LM generation. 62 | :param generation: str 63 | code generation from LM 64 | :param idx: int 65 | index of doc in the dataset to which the generation belongs 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | def process_results(self, generations, references): 71 | """Takes the list of LM generations and evaluates them against ground truth references, 72 | returning the metric for the generations as in {"metric_name": result}. 73 | :param generations: list(list(str)) 74 | list of lists containing generations 75 | :param references: list(str) 76 | list of str containing refrences 77 | :return: dict[str: float] 78 | """ 79 | pass 80 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from concurrent.futures import ThreadPoolExecutor 5 | from pathlib import Path 6 | from threading import Lock 7 | from typing import Optional 8 | 9 | from .containerized_eval import eval_string_script 10 | 11 | # Get working directory 12 | WORKING_DIR = Path(__file__).parent.parent 13 | 14 | # program: str => Result 15 | CACHE = dict() 16 | CACHE_LOCK = Lock() 17 | 18 | 19 | def cache_get(program: str) -> Optional[dict]: 20 | if program in CACHE: 21 | result = CACHE[program] 22 | return result 23 | else: 24 | return None 25 | 26 | 27 | def cache_set(program: str, result: dict): 28 | if program in CACHE: 29 | print("Setting already-existing cache") 30 | CACHE[program] = result 31 | 32 | 33 | def cached_eval_script(problem, index) -> dict: 34 | # here prompt is already included in completions 35 | program = problem["completions"][index] + "\n" + problem["tests"] 36 | CACHE_LOCK.acquire(True) 37 | cached = cache_get(program) 38 | if cached is not None: 39 | CACHE_LOCK.release() 40 | return cached 41 | else: 42 | result_yaml = dict() 43 | cache_set(program, result_yaml) 44 | CACHE_LOCK.release() 45 | result_dict = eval_string_script(problem["language"], program) 46 | for k in result_dict.keys(): 47 | result_yaml[k] = result_dict[k] 48 | result_yaml["timestamp"] = int(time.time()) 49 | return result_yaml 50 | 51 | 52 | def get_test_results_json_path( 53 | output_dir: str, problem_json_path: str, input_dir: Path 54 | ) -> Path: 55 | suffixes = ".results.json" 56 | problem_name = problem_json_path[: -len(".json")] 57 | if input_dir: 58 | raise ValueError("input dir given") 59 | return Path(output_dir) / ( 60 | problem_json_path.relative_to(input_dir).parent / (problem_name + suffixes) 61 | ) 62 | return 
Path(output_dir) / (problem_name + suffixes) 63 | 64 | 65 | def evaluate_problem( 66 | output_dir: str, problem_json_path: str, max_workers: int, input_dir: Path = None 67 | ): 68 | with open(problem_json_path, "r") as f: 69 | problem = json.load(f) 70 | test_results_path = get_test_results_json_path( 71 | output_dir, problem_json_path, input_dir 72 | ) 73 | test_results_path.parent.mkdir(mode=0o755, parents=True, exist_ok=True) 74 | 75 | test_results = problem.copy() 76 | del test_results["completions"] 77 | test_results["results"] = [] 78 | 79 | num_problems = len(problem["completions"]) 80 | min_problem = len(test_results["results"]) 81 | 82 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 83 | for j in executor.map( 84 | lambda index: cached_eval_script(problem, index), 85 | range(min_problem, num_problems), 86 | ): 87 | test_results["results"].append(j) 88 | with open(test_results_path, "w") as f: 89 | f.write(json.dumps(test_results, indent=2)) 90 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/__init__.py: -------------------------------------------------------------------------------- 1 | import fcntl 2 | import os 3 | import signal 4 | import subprocess 5 | import time 6 | from typing import List 7 | 8 | MAX_BYTES_PER_READ = 1024 9 | SLEEP_BETWEEN_READS = 0.1 10 | 11 | 12 | class Result: 13 | timeout: int 14 | exit_code: int 15 | stdout: str 16 | stderr: str 17 | 18 | def __init__(self, timeout, exit_code, stdout, stderr): 19 | self.timeout = timeout 20 | self.exit_code = exit_code 21 | self.stdout = stdout 22 | self.stderr = stderr 23 | 24 | 25 | def set_nonblocking(reader): 26 | fd = reader.fileno() 27 | fl = fcntl.fcntl(fd, fcntl.F_GETFL) 28 | fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) 29 | 30 | 31 | def run( 32 | args: List[str], 33 | timeout_seconds: int = 15, 34 | max_output_size: int = 2048, 35 | env=None, 36 | ) -> Result: 37 | """ 38 | Runs the given program with arguments. After the timeout elapses, kills the process 39 | and all other processes in the process group. Captures at most max_output_size bytes 40 | of stdout and stderr each, and discards any output beyond that. 41 | """ 42 | p = subprocess.Popen( 43 | args, 44 | env=env, 45 | stdin=subprocess.DEVNULL, 46 | stdout=subprocess.PIPE, 47 | stderr=subprocess.PIPE, 48 | start_new_session=True, 49 | bufsize=MAX_BYTES_PER_READ, 50 | ) 51 | set_nonblocking(p.stdout) 52 | set_nonblocking(p.stderr) 53 | 54 | process_group_id = os.getpgid(p.pid) 55 | 56 | # We sleep for 0.1 seconds in each iteration. 57 | max_iterations = timeout_seconds * 10 58 | stdout_saved_bytes = [] 59 | stderr_saved_bytes = [] 60 | stdout_bytes_read = 0 61 | stderr_bytes_read = 0 62 | 63 | for _ in range(max_iterations): 64 | this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ) 65 | this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ) 66 | # this_stdout_read and this_stderr_read may be None if stdout or stderr 67 | # are closed. Without these checks, test_close_output fails. 
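# The pipes are drained on every iteration so the child cannot block on a full pipe
# buffer; chunks are appended only while fewer than max_output_size bytes have been
# saved for that stream, and any further output is read but discarded.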
68 | if this_stdout_read is not None and stdout_bytes_read < max_output_size: 69 | stdout_saved_bytes.append(this_stdout_read) 70 | stdout_bytes_read += len(this_stdout_read) 71 | if this_stderr_read is not None and stderr_bytes_read < max_output_size: 72 | stderr_saved_bytes.append(this_stderr_read) 73 | stderr_bytes_read += len(this_stderr_read) 74 | exit_code = p.poll() 75 | if exit_code is not None: 76 | break 77 | time.sleep(SLEEP_BETWEEN_READS) 78 | 79 | try: 80 | # Kills the process group. Without this line, test_fork_once fails. 81 | os.killpg(process_group_id, signal.SIGKILL) 82 | except ProcessLookupError: 83 | pass 84 | 85 | timeout = exit_code is None 86 | exit_code = exit_code if exit_code is not None else -1 87 | stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore") 88 | stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore") 89 | return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr) 90 | -------------------------------------------------------------------------------- /memorization_thresholds/sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import random 5 | from transformers import AutoTokenizer 6 | 7 | 8 | def remove_comments_from_string(code_str): 9 | lines = code_str.splitlines() 10 | 11 | # Skipping the opening blank line and comment line 12 | start = 0 13 | in_block_comment = False 14 | for i, line in enumerate(lines): 15 | stripped = line.strip() 16 | if in_block_comment: 17 | if stripped.endswith('"""') or stripped.endswith("'''"): 18 | in_block_comment = False 19 | continue 20 | if stripped.startswith('#') or not stripped: 21 | continue 22 | if stripped.startswith('"""') or stripped.startswith("'''"): 23 | in_block_comment = True 24 | continue 25 | start = i 26 | break 27 | 28 | return '\n'.join(lines[start:]) 29 | 30 | 31 | def main(): 32 | files = [] 33 | for language in ['Ruby', 'PHP', 'Rust', 'Lua']: 34 | with open(f'TopLists/{language}-top-repos.txt', 'r') as fr: 35 | for line in fr.readlines(): 36 | line = line.strip() 37 | temp1 = line.split('\t') 38 | star = temp1[0] 39 | github_link = temp1[1] 40 | temp2 = github_link.split('/') 41 | github_org, github_repo = temp2[-2], temp2[-1] 42 | data_dir = f'Code/{language}/{github_org}/{github_repo}' 43 | if not os.path.exists(data_dir): 44 | continue 45 | for file_name in os.listdir(data_dir): 46 | if os.path.isfile(os.path.join(data_dir, file_name)): 47 | files.append(os.path.join(data_dir, file_name)) 48 | print(f"Obtained {len(files)} deduplicated files from GitHub.") 49 | 50 | random.seed(42) 51 | random.shuffle(files) 52 | 53 | tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot") 54 | target_csv_file = f'../unlearning/data/github/unseen_data.csv' 55 | if not os.path.exists(target_csv_file): 56 | directory = os.path.dirname(target_csv_file) 57 | if not os.path.exists(directory): 58 | os.makedirs(directory) 59 | 60 | max_sample_num = 10000 61 | current_sample_num = 0 62 | with open(target_csv_file, 'w') as fw: 63 | writer = csv.writer(fw) 64 | writer.writerow(['doc_id', 'corpus', 'text']) 65 | for file_path in files: 66 | try: 67 | with open(file_path, 'r', encoding='utf-8') as data_fr: 68 | if current_sample_num >= max_sample_num: 69 | return 70 | data = data_fr.read().strip() 71 | data = remove_comments_from_string(data) 72 | length = len(tokenizer(data)['input_ids']) 73 | if length > 200 and length < 1000: 74 | corpus = file_path.replace('Code', 
'GitHub') 75 | writer.writerow([current_sample_num, corpus, data]) 76 | current_sample_num += 1 77 | except Exception: 78 | continue 79 | # print(current_sample_num) 80 | 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /sensitive_memorization/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datasets import load_from_disk 3 | 4 | ip_filter_func = lambda secret: (secret['value'][:4] == '127.' or secret['value'][:3] == '10.' or secret['value'][:8] == '192.168.' 5 | or secret['value'][:8] == '169.254.' or (secret['value'][:4] == '172.' and eval(secret['value'][4:6]) >= 16 and eval(secret['value'][4:6]) <= 31)) 6 | email_filter_func = lambda secret: ('example' in secret['value'] or 'test' in secret['value'] or 7 | 'user' in secret['value'] or 'aaa' in secret['value'] or 'bbb' in secret['value'] or 'ccc' in secret['value']) 8 | key_filter_func = lambda secret: (secret['value'] == 'ghp' or 'aaaaaaa' in secret['value'] or 'AAAAAAA' in secret['value'] or 9 | 'xxxxxxx' in secret['value'] or 'XXXXXXX' in secret['value'] or 'https' in secret['value'] or 'dummy' in secret['value'] or 10 | 'placeholder' in secret['value'] or 'changeme' in secret['value']) 11 | 12 | secret_filter_func = lambda secret: ((secret['tag'] == 'IP_ADDRESS' and ip_filter_func(secret)) or 13 | (secret['tag'] == 'EMAIL' and email_filter_func(secret)) or 14 | (secret['tag'] == 'KEY' and key_filter_func(secret))) 15 | 16 | 17 | def filter_secrets(example): 18 | if type(example['secrets']) != str: 19 | return False 20 | secrets = eval(example['secrets']) 21 | if example['number_secrets'] == 1: 22 | secret = secrets[0] 23 | # Check if the only secret is either a local IP or an email containing "example" 24 | if secret_filter_func(secret): 25 | # if secret['start'] < 512 or secret_filter_func(secret): 26 | return False # This will remove the example 27 | elif example['number_secrets'] > 1: 28 | # Filter out specific secrets 29 | filtered_secrets = [secret for secret in secrets 30 | if not secret_filter_func(secret)] 31 | # filtered_secrets = [secret for secret in secrets 32 | # if not (secret['start'] < 512 or secret_filter_func(secret))] 33 | if len(filtered_secrets) == 0: 34 | return False # This will remove the example 35 | return True 36 | 37 | 38 | def update_example(example): 39 | secrets = eval(example['secrets']) 40 | # Filter out specific secrets 41 | filtered_secrets = [secret for secret in secrets 42 | if not secret_filter_func(secret)] 43 | # filtered_secrets = [secret for secret in secrets 44 | # if not ((secret['start'] < 512 or secret_filter_func(secret)))] 45 | example['secrets'] = str(filtered_secrets) 46 | example['number_secrets'] = len(filtered_secrets) 47 | return example 48 | 49 | 50 | def main(): 51 | dataset_path = './codeparrot-clean-train-secrets-filtered' 52 | if os.path.exists(dataset_path): 53 | ds_pii = load_from_disk(dataset_path) 54 | else: 55 | ds_pii = load_from_disk('codeparrot-clean-train-secrets') 56 | ds_pii = ds_pii.filter(filter_secrets, num_proc=48) 57 | ds_pii = ds_pii.map(update_example, num_proc=48) 58 | ds_pii.save_to_disk(dataset_path) 59 | print(ds_pii) 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/module_test.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from pathlib import Path 3 | 4 | from safe_subprocess import run 5 | 6 | ROOT = Path(__file__).resolve().parent / "evil_programs" 7 | 8 | 9 | def assert_no_running_evil(): 10 | result = run(["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024) 11 | assert ( 12 | result.exit_code == 1 13 | ), f"There are still evil processes running: {result.stdout}" 14 | assert len(result.stderr) == 0 15 | assert len(result.stdout) == 0 16 | 17 | 18 | def test_fork_once(): 19 | # The program exits cleanly and immediately. But, it forks a child that runs 20 | # forever. 21 | result = run( 22 | ["python3", ROOT / "fork_once.py"], 23 | timeout_seconds=2, 24 | max_output_size=1024, 25 | ) 26 | assert result.exit_code == 0 27 | assert result.timeout == False 28 | assert len(result.stderr) == 0 29 | assert len(result.stdout) == 0 30 | assert_no_running_evil() 31 | 32 | 33 | def test_close_outputs(): 34 | # The program prints to stdout, closes its output, and then runs forever. 35 | result = run( 36 | ["python3", ROOT / "close_outputs.py"], 37 | timeout_seconds=2, 38 | max_output_size=1024, 39 | ) 40 | assert result.exit_code == -1 41 | assert result.timeout == True 42 | assert len(result.stderr) == 0 43 | assert result.stdout == "This is the end\n" 44 | assert_no_running_evil() 45 | 46 | 47 | def test_unbounded_output(): 48 | result = run( 49 | ["python3", ROOT / "unbounded_output.py"], 50 | timeout_seconds=3, 51 | max_output_size=1024, 52 | ) 53 | assert result.exit_code == -1 54 | assert result.timeout == True 55 | assert len(result.stderr) == 0 56 | assert len(result.stdout) == 1024 57 | assert_no_running_evil() 58 | 59 | 60 | def test_sleep_forever(): 61 | result = run( 62 | ["python3", ROOT / "sleep_forever.py"], 63 | timeout_seconds=2, 64 | max_output_size=1024, 65 | ) 66 | assert result.exit_code == -1 67 | assert result.timeout == True 68 | assert len(result.stderr) == 0 69 | assert len(result.stdout) == 0 70 | assert_no_running_evil() 71 | 72 | 73 | def test_fork_bomb(): 74 | result = run( 75 | ["python3", ROOT / "fork_bomb.py"], 76 | timeout_seconds=2, 77 | max_output_size=1024, 78 | ) 79 | assert result.exit_code == -1 80 | assert result.timeout == True 81 | assert len(result.stderr) == 0 82 | assert len(result.stdout) == 0 83 | # Unfortunately, this sleep seems to be necessary. My theories: 84 | # 1. os.killpg doesn't block until the whole process group is dead. 85 | # 2. pgrep can produce stale output 86 | time.sleep(2) 87 | assert_no_running_evil() 88 | 89 | 90 | def test_block_on_inputs(): 91 | # We run the subprocess with /dev/null as input. So, any program that tries 92 | # to read input will error. 93 | result = run( 94 | ["python3", ROOT / "block_on_inputs.py"], 95 | timeout_seconds=2, 96 | max_output_size=1024, 97 | ) 98 | assert result.exit_code == 1 99 | assert result.timeout == False 100 | assert len(result.stdout) == 0 101 | assert "EOF when reading a line" in result.stderr 102 | assert_no_running_evil() 103 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/containerized_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | NOTE: Nothing containerized about this any more. This is just a helper 3 | for problem_evaluator.py. 4 | """ 5 | 6 | import tempfile 7 | from pathlib import Path 8 | 9 | from . 
import (eval_cpp, eval_dlang, eval_java, eval_javascript, eval_julia, 10 | eval_lua, eval_php, eval_python, eval_r, eval_racket, eval_ruby, 11 | eval_rust, eval_swift, eval_ts, eval_go, eval_pl, eval_sh, eval_scala, eval_cs) 12 | 13 | EVALUATORS = { 14 | "rb": (eval_ruby.eval_script, ".rb"), 15 | "lua": (eval_lua.eval_script, ".lua"), 16 | "python": (eval_python.eval_script, ".py"), 17 | "py": (eval_python.eval_script, ".py"), 18 | "notypes.py": (eval_python.eval_script, ".py"), 19 | "julia": (eval_julia.eval_script, ".jl"), 20 | "java": (eval_java.eval_script, ".java"), 21 | "rust": (eval_rust.eval_script, ".rs"), 22 | "rs": (eval_rust.eval_script, ".rs"), 23 | "swift": (eval_swift.eval_script, ".swift"), 24 | "lua": (eval_lua.eval_script, ".lua"), 25 | "racket": (eval_racket.eval_script, ".rkt"), 26 | "rkt": (eval_racket.eval_script, ".rkt"), 27 | "javascript": (eval_javascript.eval_script, ".js"), 28 | "js": (eval_javascript.eval_script, ".js"), 29 | "cpp": (eval_cpp.eval_script, ".cpp"), 30 | "cs": (eval_cs.eval_script, ".cs"), 31 | "php": (eval_php.eval_script, ".php"), 32 | "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"), 33 | "d": (eval_dlang.eval_script, ".d"), 34 | "r": (eval_r.eval_script, ".r"), 35 | "humaneval_to_r.py": (eval_r.eval_script, ".r"), 36 | "jl": (eval_julia.eval_script, ".jl"), 37 | "ts": (eval_ts.eval_script, ".ts"), 38 | "go": (eval_go.eval_script, ".go"), 39 | "pl": (eval_pl.eval_script, ".pl"), 40 | "sh": (eval_sh.eval_script, ".sh"), 41 | "scala": (eval_scala.eval_script, ".scala"), 42 | } 43 | 44 | 45 | def eval_string_script(language, program): 46 | if language in EVALUATORS: 47 | (eval_script, file_ext) = EVALUATORS[language] 48 | else: 49 | eval_module = __import__( 50 | f"eval_{language}" if language != "go_test.go" else "eval_go" 51 | ) 52 | eval_script = eval_module.eval_script 53 | file_ext = f".{language}" if language != "go_test.go" else "_test.go" 54 | with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f: 55 | f.write(program.encode("utf-8")) 56 | f.flush() 57 | result = eval_script(Path(f.name)) 58 | # Only save the first 2K of output from the running program. Any futher 59 | # output is very likely an exceptionally long stack trace or a long 60 | # series of prints. 
61 | if type(result["stdout"]) == bytes: 62 | result["stdout"] = result["stdout"].decode("utf-8", errors="ignore") 63 | if result["stdout"] is None: 64 | result["stdout"] = "" 65 | if result["stderr"] is None: 66 | result["stderr"] = "" 67 | if type(result["stderr"]) == bytes: 68 | result["stderr"] = result["stderr"].decode("utf-8", errors="ignore") 69 | assert type(result["stdout"]) == str 70 | assert type(result["stderr"]) == str 71 | return { 72 | "program": program, 73 | "stdout": result["stdout"].replace("!!int", "")[:2048], 74 | "stderr": result["stderr"][:2048], 75 | "exit_code": result["exit_code"], 76 | "status": result["status"], 77 | } 78 | -------------------------------------------------------------------------------- /human_eval/tests/test_generation_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | 5 | from accelerate import Accelerator 6 | from accelerate.utils import write_basic_config 7 | from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed 8 | 9 | from bigcode_eval.arguments import EvalArguments 10 | from bigcode_eval.evaluator import Evaluator 11 | 12 | # TODO add more tasks 13 | 14 | # Tasks for generation test 15 | GEN_TASKS = ["humaneval", "mbpp"] 16 | # Tasks for evaluator tests 17 | EVAL_TASKS = ["humaneval", "mbpp", "pal-gsm8k-greedy"] 18 | TMPDIR = tempfile.mkdtemp() 19 | TEST_MODEL = "hf-internal-testing/tiny-random-gpt2" 20 | REF_EVAL_SCORES = { 21 | "humaneval": {"pass@1": 0.25}, 22 | "mbpp": {"pass@1": 0.25}, 23 | "pal-gsm8k-greedy": {"accuracy": 1.0, "num_failed_execution": 0}, 24 | } 25 | 26 | 27 | def update_args(args): 28 | args.model = "hf-internal-testing/tiny-random-gpt2" 29 | # the executed code for the tests is safe (see tests/data/*_eval_gens.json) 30 | args.allow_code_execution = True 31 | args.save_generations = False 32 | args.save_generations_path = "" 33 | args.save_references = False 34 | args.metric_output_path = TMPDIR 35 | args.load_generations_path = None 36 | args.generation_only = False 37 | args.check_references = False 38 | # postprocessing for HumanEval and MBPP makes generations 39 | # with dummy model not distinctive 40 | args.postprocess = False 41 | args.instruction_tokens = None 42 | 43 | args.limit = 2 44 | args.limit_start = 0 45 | args.batch_size = 1 46 | args.max_length_generation = 300 47 | args.do_sample = False 48 | args.top_p = 0 49 | args.n_samples = 1 50 | args.seed = 0 51 | args.prompt = None 52 | args.precision = None 53 | args.modeltype = None 54 | args.max_memory_per_gpu = None 55 | return args 56 | 57 | 58 | def setup(): 59 | model = AutoModelForCausalLM.from_pretrained(TEST_MODEL) 60 | tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL) 61 | tokenizer.pad_token = tokenizer.eos_token 62 | configPath = os.path.join(TMPDIR, "default_config.yml") 63 | write_basic_config(save_location=configPath) 64 | accelerator = Accelerator() 65 | return model, tokenizer, accelerator 66 | 67 | 68 | def load_generation_examples(task): 69 | # generations for testing the generation feature of dummy test model 70 | with open(f"tests/data/{task}_gen_gens.json") as fp: 71 | gens = json.load(fp) 72 | with open(f"tests/data/{task}_gen_refs.json") as fp: 73 | refs = json.load(fp) 74 | return gens, refs 75 | 76 | 77 | args = update_args(EvalArguments()) 78 | set_seed(args.seed) 79 | model, tokenizer, accelerator = setup() 80 | 81 | 82 | def test_generation(): 83 | args.generation_only = True 84 | evaluator = 
Evaluator(accelerator, model, tokenizer, args) 85 | for task in GEN_TASKS: 86 | print(f"testing task {task}") 87 | generations, references = evaluator.generate_text(task) 88 | true_gens, true_refs = load_generation_examples(task) 89 | assert generations == true_gens 90 | assert references == true_refs 91 | print("passed gen") 92 | 93 | 94 | def test_evaluation(): 95 | # TODO add scores for each task 96 | args.n_samples = 2 97 | for task in EVAL_TASKS: 98 | print(f"testing task {task}") 99 | # path to generation examples to evaluate 100 | args.load_generations_path = f"tests/data/{task}_eval_gens.json" 101 | evaluator = Evaluator(accelerator, None, None, args) 102 | results = evaluator.evaluate(task) 103 | assert results == REF_EVAL_SCORES[task] 104 | print("passed eval") 105 | -------------------------------------------------------------------------------- /human_eval/templates/new_task.py: -------------------------------------------------------------------------------- 1 | # This template file is adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/master/templates/new_task.py 2 | 3 | # TODO: Remove all TODO comments once the implementation is complete. 4 | """ 5 | TODO: Add the Paper Title on this line. 6 | TODO: Add the paper's PDF URL (preferably from arXiv) on this line. 7 | TODO: Write a Short Description of the task. 8 | Homepage: TODO: Add the URL to the task's Homepage here. 9 | """ 10 | from bigcode_eval.base import Task 11 | 12 | # TODO: Add the BibTeX citation for the task. 13 | _CITATION = """ 14 | """ 15 | 16 | 17 | # TODO: Replace `NewTask` with the name of your Task. 18 | class NewTask(Task): 19 | # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` 20 | # dataset as denoted in HuggingFace `datasets`. 21 | DATASET_PATH = "" 22 | # TODO: Add the `DATASET_NAME` string. This is the name of a subset within 23 | # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. 24 | DATASET_NAME = None 25 | 26 | def __init__(self): 27 | super().__init__( 28 | # TODO: Specify the list of stop words in `stop_words` for the code generation task \ 29 | # and if the evaluation requires executing the generated code in `requires_execution`. 30 | stop_words=[], 31 | requires_execution=False, 32 | ) 33 | 34 | def get_dataset(self): 35 | # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`) 36 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 37 | return [] 38 | 39 | def fewshot_examples(self): 40 | # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist 41 | """Loads and returns the few-shot examples for the task if they exist.""" 42 | pass 43 | 44 | def get_prompt(self, doc): 45 | # TODO: build the prompt for the language model from a sample `doc` from the dataset 46 | """ 47 | Builds the prompt for the LM to generate from. 48 | :param doc: dict[str: str] 49 | sample from the test dataset 50 | :return: str 51 | """ 52 | return "" 53 | 54 | def get_reference(self, doc): 55 | # TODO: get the reference solution from a sample `doc` from the dataset 56 | """ 57 | Builds the reference solution for the doc (sample from the test dataset). 58 | :param doc: dict[str: str] 59 | sample from the test dataset 60 | :return: str 61 | """ 62 | return "" 63 | 64 | def postprocess_generation(self, generation, idx): 65 | # TODO: define the postprocessing for the LM generation 66 | """ 67 | Defines the postprocessing for a LM generation. 
68 | :param generation: str 69 | code generation from LM 70 | :param idx: int (if needed) 71 | index of doc in the dataset to which the generation belongs 72 | :return: str 73 | """ 74 | return "" 75 | 76 | def process_results(self, generations, references): 77 | # TODO: define how the evaluation score is computed from list of \ 78 | # generations and reference solutions 79 | """ 80 | Takes the list of LM generations and evaluates them against ground truth references, 81 | returning the metric for the generations as in {"metric_name": result}. 82 | We encourage to directly load the metric from `evaluate` library to keep the code concise. 83 | :param generations: list(list(str)) 84 | list of lists containing generations 85 | :param references: list(str) 86 | list of str containing refrences 87 | :return: dict[str: float] 88 | """ 89 | return {} 90 | -------------------------------------------------------------------------------- /human_eval/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .vscode/ 163 | .trunk 164 | .DS_Store 165 | 166 | # Script outputs 167 | evaluation*.json 168 | generations*.json 169 | -------------------------------------------------------------------------------- /human_eval/finetuning/APPS/apps_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fine-Tune LM on APPS train split 3 | """ 4 | 5 | import argparse 6 | import os 7 | 8 | import torch 9 | from apps_dataset import APPSBaseDataset 10 | from datasets import load_dataset 11 | from transformers import (AutoModelForCausalLM, Trainer, TrainingArguments, 12 | logging, set_seed) 13 | 14 | 15 | def get_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--model_ckpt", type=str, default="codeparrot/codeparrot-small") 18 | parser.add_argument("--max_length", type=int, default=1024) 19 | parser.add_argument("--num_epochs", type=int, default=10) 20 | parser.add_argument("--max_steps", type=int, default=-1) 21 | parser.add_argument("--batch_size", type=int, default=8) 22 | parser.add_argument("--gradient_accumulation_steps", type=int, default=8) 23 | 24 | parser.add_argument("--learning_rate", type=float, default=5e-5) 25 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 26 | parser.add_argument("--num_warmup_steps", type=int, default=100) 27 | parser.add_argument("--weight_decay", type=float, default=0.05) 28 | 29 | parser.add_argument("--fp16", default=False, action="store_true") 30 | parser.add_argument("--seed", type=int, default=0) 31 | parser.add_argument("--output_dir", type=str, default="./checkpoints") 32 | parser.add_argument("--log_freq", default=1, type=int) 33 | parser.add_argument("--eval_freq", default=250, type=int) 34 | parser.add_argument("--save_freq", default=250, type=int) 35 | return parser.parse_args() 36 | 37 | 38 | def get_dataset(dataset, args): 39 | 40 | train_data = APPSBaseDataset( 41 | dataset=dataset, max_tokens=args.max_length, tokenizer_path=args.model_ckpt 42 | ) 43 | 44 | return train_data 45 | 46 | 47 | def run_training(args, train_data, val_data): 48 | 49 | model = 
AutoModelForCausalLM.from_pretrained(args.model_ckpt, use_auth_token=True) 50 | train_data.start_iteration = 0 51 | 52 | print(f"Starting main loop") 53 | 54 | training_args = TrainingArguments( 55 | output_dir=args.output_dir, 56 | dataloader_drop_last=True, 57 | evaluation_strategy="steps", 58 | num_train_epochs=args.num_epochs, 59 | max_steps=args.max_steps, 60 | eval_steps=args.eval_freq, 61 | save_steps=args.save_freq, 62 | logging_steps=args.log_freq, 63 | per_device_train_batch_size=args.batch_size, 64 | per_device_eval_batch_size=args.batch_size, 65 | learning_rate=args.learning_rate, 66 | lr_scheduler_type=args.lr_scheduler_type, 67 | warmup_steps=args.num_warmup_steps, 68 | gradient_accumulation_steps=args.gradient_accumulation_steps, 69 | weight_decay=args.weight_decay, 70 | fp16=args.fp16, 71 | run_name="apps-train", 72 | report_to="wandb", 73 | ) 74 | 75 | trainer = Trainer( 76 | model=model, 77 | args=training_args, 78 | train_dataset=train_data, 79 | eval_dataset=val_data, 80 | ) 81 | 82 | print("Training...") 83 | trainer.train() 84 | 85 | print("saving last checkpoint of the model") 86 | model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/")) 87 | 88 | 89 | def main(args): 90 | 91 | dataset = load_dataset("codeparrot/apps", split="train") 92 | dataset.shuffle(seed=args.seed) 93 | data = get_dataset(dataset, args) 94 | train_size = int(0.95 * len(data)) 95 | train_data, val_data = torch.utils.data.random_split( 96 | data, 97 | [train_size, len(data) - train_size], 98 | generator=torch.Generator().manual_seed(args.seed), 99 | ) 100 | print( 101 | f"size of training data {len(train_data)}\nsize of validation data {len(val_data)}" 102 | ) 103 | run_training(args, train_data, val_data) 104 | 105 | 106 | if __name__ == "__main__": 107 | 108 | args = get_args() 109 | set_seed(args.seed) 110 | os.makedirs(args.output_dir, exist_ok=True) 111 | 112 | logging.set_verbosity_error() 113 | 114 | main(args) 115 | -------------------------------------------------------------------------------- /human_eval/tests/data/pal-gsm8k-greedy_prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt":"Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\"\"\"\n money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result\n\n\n\n\n\nQ: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\"\"\"\n golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result\n\n\n\n\n\nQ: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. 
How many computers are now in the server room?\"\"\"\n computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result\n\n\n\n\n\nQ: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\"\"\"\n toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result\n\n\n\n\n\nQ: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\"\"\"\n jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result\n\n\n\n\n\nQ: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\"\"\"\n leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result\n\n\n\n\n\nQ: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\"\"\"\n cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result\n\n\n\n\n\nQ: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\"\"\"\n trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result\n\n\n\n\n\nQ: test\n\n# solution in Python:\n\n\n" 3 | } -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/concode.py: -------------------------------------------------------------------------------- 1 | """Mapping Language to Code in Programmatic Context (Concode) 2 | https://arxiv.org/abs/1808.09588 3 | 4 | CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation 5 | https://arxiv.org/abs/2102.04664 6 | 7 | Java code generation in CodeXGLUE text-to-code dataset (built from Concode dataset) 8 | Available at https://huggingface.co/datasets/code_x_glue_ct_code_to_text 9 | 2000 samples are available in the test set. 
10 | 11 | Here we use two-shot evaluation (the original paper evaluates finetuned models) 12 | """ 13 | import json 14 | 15 | from evaluate import load 16 | 17 | from bigcode_eval.base import Task 18 | 19 | _CITATION = """ 20 | @article{iyer2018mapping, 21 | title={Mapping language to code in programmatic context}, 22 | author={Iyer, Srinivasan and Konstas, Ioannis and Cheung, Alvin and Zettlemoyer, Luke}, 23 | journal={arXiv preprint arXiv:1808.09588}, 24 | year={2018} 25 | } 26 | """ 27 | 28 | 29 | class Concode(Task): 30 | """A task represents an entire benchmark including its dataset, problems, 31 | answers, generation settings and evaluation methods. 32 | """ 33 | 34 | DATASET_PATH = "code_x_glue_tc_text_to_code" 35 | 36 | def __init__(self): 37 | super().__init__( 38 | stop_words=["\n"], 39 | requires_execution=False, 40 | ) 41 | 42 | def get_dataset(self): 43 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 44 | # test split of the dataset doesn't have targets 45 | return self.dataset["validation"] 46 | 47 | def fewshot_examples(self): 48 | """Loads and returns the few-shot examples for the task if they exist.""" 49 | with open( 50 | "bigcode_eval/tasks/few_shot_examples/concode_few_shot_prompts.json", "r" 51 | ) as file: 52 | examples = json.load(file) 53 | return examples 54 | 55 | @staticmethod 56 | def two_shot_prompt(entry, text, examples): 57 | """Two shot prompt format as instructions & solutions""" 58 | prompt = f"\nInstruction:\n{examples['instruction1']}\ 59 | \nSolution:\n{examples['solution1']}\ 60 | \nInstruction:\n{examples['instruction2']}\ 61 | \nSolution:\n{examples['solution2']}\ 62 | \nInstruction:\n{text}\ 63 | \nSolution:\n" 64 | assert ( 65 | prompt.count("Solution:\n") == 3 66 | ), "Splitting operation in postprocess_generation is invalid" 67 | return entry + prompt 68 | 69 | def get_prompt(self, doc): 70 | """Builds the prompt for the LM to generate from.""" 71 | examples = self.fewshot_examples() 72 | text = doc["nl"].split("concode_field_sep")[0].strip() 73 | if text.endswith("."): 74 | text = text[:-1].strip() 75 | entry = "Answer the following instructions in a one line of Java code:\n" 76 | prompt = self.two_shot_prompt(entry, text, examples) 77 | return prompt 78 | 79 | def get_reference(self, doc): 80 | """Builds the reference solution for the doc (sample from the test dataset).""" 81 | return doc["code"] 82 | 83 | def postprocess_generation(self, generation, idx): 84 | """Defines the postprocessing for a LM generation. 85 | :param generation: str 86 | code generation from LM 87 | :param idx: int 88 | index of doc in the dataset to which the generation belongs 89 | (not used for this task) 90 | """ 91 | output = generation.split("Solution:\n", 3)[-1].strip() 92 | return output 93 | 94 | def process_results(self, generations, references): 95 | """Takes the list of LM generations and evaluates them against ground truth references, 96 | returning the metric for the generations. 
97 | :param generations: list(list(str)) 98 | list of lists containing generations 99 | :param references: list(str) 100 | list of str containing references 101 | """ 102 | bleu = load("bleu") 103 | gens = [gen[0] for gen in generations] 104 | results = bleu.compute( 105 | references=references, predictions=gens, max_order=4, smooth=True 106 | ) 107 | return results 108 | -------------------------------------------------------------------------------- /memorization_thresholds/gh_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import sys 3 | import time 4 | from requests.packages import urllib3 5 | 6 | # Insert GitHub API token here, in place of *TOKEN*. 7 | headers = {"Authorization": "token *TOKEN*"} 8 | 9 | # Constants & language argument. 10 | NUM_REPOS = 25_000 11 | MIN_STARS = 500 12 | CREATED = '2023-06-01' 13 | LANGUAGE = "python" if len(sys.argv) <= 1 else sys.argv[1] # Default to Python, if none passed. 14 | 15 | 16 | def main(): 17 | urllib3.disable_warnings() 18 | repositories = set() # Keep track of a set of repositories seen to avoid duplicate entries across pages. 19 | next_max_stars = 1_000_000_000 # Initialize to a very high value. 20 | with open(f'TopLists/{LANGUAGE}-top-repos.txt', 'a+') as f: 21 | while len(repositories) < NUM_REPOS: 22 | results = run_query(next_max_stars) # Get the next set of pages. 23 | if not results: 24 | break 25 | new_repositories = [repository for repository, _ in results] 26 | next_max_stars = min([stars for _, stars in results]) 27 | 28 | # If a query returns no new repositories, drop it. 29 | if len(repositories | set(new_repositories)) == len(repositories): 30 | break 31 | for repository, stars in sorted(results, key=lambda e: e[1], reverse=True): 32 | if repository not in repositories: 33 | repositories.add(repository) 34 | f.write(f'{stars}\t{repository}\n') 35 | f.flush() 36 | print(f'Collected {len(repositories):,} repositories so far; lowest number of stars: {next_max_stars:,}') 37 | 38 | 39 | def run_query(max_stars): 40 | end_cursor = None # Used to track pagination. 41 | repositories = set() 42 | 43 | while end_cursor != "": 44 | # Extracts non-fork, recently active repositories in the provided language, in groups of 100. 45 | # Leaves placeholders for maximum stars and page cursor. The former allows us to retrieve more than 1,000 repositories 46 | # by repeatedly lowering the bar. 47 | query = f""" 48 | {{ 49 | search(query: "language:{LANGUAGE} fork:false created:>{CREATED} sort:stars stars:<{max_stars}", type: REPOSITORY, first: 100 {', after: "' + end_cursor + '"' if end_cursor else ''}) {{ 50 | edges {{ 51 | node {{ 52 | ... on Repository {{ 53 | url 54 | isPrivate 55 | isDisabled 56 | isLocked 57 | stargazers {{ 58 | totalCount 59 | }} 60 | }} 61 | }} 62 | }} 63 | pageInfo {{ 64 | hasNextPage 65 | endCursor 66 | }} 67 | }} 68 | }} 69 | """ 70 | print(f' Retrieving next page; {len(repositories)} repositories in this batch so far.') 71 | # Attempt a query up to three times, pausing when a query limit is hit. 72 | attempts = 0 73 | success = False 74 | while not success and attempts < 3: 75 | request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers) 76 | content = request.json() 77 | if 'data' not in content or 'search' not in content['data']: 78 | # If this is simply a signal to pause querying, wait two minutes. 
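                    # Illustrative only -- the payload shape here is an assumption, not a captured
                    # response: a rate-limited reply is expected to look roughly like
                    #     {"message": "... please wait a few minutes before you try again ..."}
                    # which is why the branch below looks for the word 'wait' in the message
                    # rather than for a specific error code.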
79 | if 'message' in content and 'wait' in content['message']: 80 | attempts += 1 81 | time.sleep(120) 82 | # Otherwise, assume we've hit the end of the stream. 83 | else: 84 | break 85 | else: 86 | success = True 87 | if not success: 88 | break 89 | end_cursor = get_end_cursor(content) 90 | new_repositories, is_done = get_repositories(content) 91 | repositories.update(new_repositories) 92 | if len(repositories) > NUM_REPOS or is_done: 93 | break 94 | return repositories 95 | 96 | 97 | def get_end_cursor(content): 98 | page_info = content['data']['search']['pageInfo'] 99 | has_next_page = page_info['hasNextPage'] 100 | if has_next_page: 101 | return page_info['endCursor'] 102 | return "" 103 | 104 | 105 | def get_repositories(content): 106 | edges = content['data']['search']['edges'] 107 | repositories_with_stars = [] 108 | for edge in edges: 109 | if edge['node']['isPrivate'] is False and edge['node']['isDisabled'] is False and edge['node']['isLocked'] is False: 110 | repository = edge['node']['url'] 111 | star_count = edge['node']['stargazers']['totalCount'] 112 | if star_count < MIN_STARS: 113 | return repositories_with_stars, True 114 | repositories_with_stars.append((repository, star_count)) 115 | return repositories_with_stars, False 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/mbpp.py: -------------------------------------------------------------------------------- 1 | """Program Synthesis with Large Language Models 2 | https://arxiv.org/abs/2108.07732 3 | 4 | The benchmark consists of around 1,000 crowd-sourced Python programming problems, 5 | designed to be solvable by entry level programmers, covering programming fundamentals, 6 | standard library functionality, and so on. Each problem consists of a task description, 7 | code solution and 3 automated test cases. As described in the paper, a subset of the data 8 | has been hand-verified by the authors. 9 | 10 | Homepage:: https://github.com/google-research/google-research/tree/master/mbpp 11 | """ 12 | 13 | import re 14 | 15 | from evaluate import load 16 | 17 | from bigcode_eval.base import Task 18 | 19 | _CITATION = """ 20 | @article{austin2021program, 21 | title={Program Synthesis with Large Language Models}, 22 | author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others}, 23 | journal={arXiv preprint arXiv:2108.07732}, 24 | year={2021} 25 | } 26 | """ 27 | 28 | 29 | class MBPP(Task): 30 | """A task represents an entire benchmark including its dataset, problems, 31 | answers, generation settings and evaluation methods. 32 | """ 33 | 34 | DATASET_PATH = "mbpp" 35 | 36 | def __init__(self): 37 | super().__init__( 38 | stop_words=["\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/", "\n```"], 39 | requires_execution=True, 40 | ) 41 | 42 | def get_dataset(self): 43 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 44 | dataset = self.dataset["test"] 45 | # the wrong split of mbpp can be loaded with old datasets cache 46 | assert ( 47 | len(dataset) == 500 48 | ), "please ensure you have the latest version of MBPP dataset, try deleting its old cache" 49 | return dataset 50 | 51 | def get_prompt(self, doc): 52 | """Builds the prompt for the LM to generate from. 
53 | MBPP prompt is built following to InCoder (Fried et al.) approach 54 | prompt = docstring that includes one test 55 | """ 56 | description = doc["text"] 57 | test_example = doc["test_list"][0] 58 | prompt = f'"""\n{description}\n{test_example}\n"""\n' 59 | return prompt 60 | 61 | def get_reference(self, doc): 62 | """Builds the reference solution for the doc (sample from the test dataset).""" 63 | return "\n".join(doc["test_list"]) 64 | 65 | @staticmethod 66 | def _stop_at_stop_token(decoded_string, stop_tokens): 67 | """ 68 | Produces the prefix of decoded_string that ends at the first occurrence of 69 | a stop_token. 70 | WARNING: the decoded_string *must not* include the prompt, which may have stop tokens 71 | itself. 72 | """ 73 | min_stop_index = len(decoded_string) 74 | for stop_token in stop_tokens: 75 | stop_index = decoded_string.find(stop_token) 76 | if stop_index != -1 and stop_index < min_stop_index: 77 | min_stop_index = stop_index 78 | return decoded_string[:min_stop_index] 79 | 80 | def postprocess_generation(self, generation, idx): 81 | """Defines the postprocessing for a LM generation. 82 | :param generation: str 83 | code generation from LM 84 | :param idx: int 85 | index of doc in the dataset to which the generation belongs 86 | """ 87 | prompt = self.get_prompt(self.dataset["test"][idx]) 88 | generation = generation[len(prompt) :] 89 | return prompt + self._stop_at_stop_token(generation, self.stop_words) 90 | 91 | def process_results(self, generations, references): 92 | """Takes the list of LM generations and evaluates them against ground truth references, 93 | returning the metric for the generations. 94 | :param generations: list(list(str)) 95 | list of lists containing generations 96 | :param references: list(str) 97 | list of str containing refrences 98 | """ 99 | code_metric = load("code_eval") 100 | results, _ = code_metric.compute( 101 | references=references, 102 | predictions=generations, 103 | ) 104 | return results 105 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/conala.py: -------------------------------------------------------------------------------- 1 | """Learning to Mine Aligned Code and Natural Language Pairs from Stack Overflow 2 | https://arxiv.org/pdf/1805.08949.pdf 3 | 4 | Python Code generation with CoNaLa. It is a benchmark of code and natural language pairs, for the evaluation of code generation tasks. 5 | The dataset was crawled from Stack Overflow, automatically filtered, then curated by annotators, 6 | split into 2,379 training and 500 test examples. 7 | 8 | Homepage: https://conala-corpus.github.io/ 9 | Here we use two-shot evaluation (the original paper evaluates finetuned models) 10 | """ 11 | 12 | import json 13 | 14 | from evaluate import load 15 | 16 | from bigcode_eval.base import Task 17 | 18 | _CITATION = """ 19 | @inproceedings{yin2018learning, 20 | title={Learning to mine aligned code and natural language pairs from stack overflow}, 21 | author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham}, 22 | booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)}, 23 | pages={476--486}, 24 | year={2018}, 25 | organization={IEEE} 26 | } 27 | """ 28 | 29 | 30 | class Conala(Task): 31 | """A task represents an entire benchmark including its dataset, problems, 32 | answers, generation settings and evaluation methods. 
33 | """ 34 | 35 | DATASET_PATH = "neulab/conala" 36 | 37 | def __init__(self): 38 | super().__init__( 39 | stop_words=["\n"], 40 | requires_execution=False, 41 | ) 42 | 43 | def get_dataset(self): 44 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 45 | return self.dataset["test"] 46 | 47 | def fewshot_examples(self): 48 | """Loads and returns the few-shot examples for the task if they exist.""" 49 | with open( 50 | "bigcode_eval/tasks/few_shot_examples/conala_few_shot_prompts.json", "r" 51 | ) as file: 52 | examples = json.load(file) 53 | return examples 54 | 55 | @staticmethod 56 | def two_shot_prompt(entry, text, examples): 57 | """Two shot prompt format as instructions & solutions""" 58 | prompt = f"\nInstruction:\n{examples['instruction1']}\ 59 | \nSolution:\n{examples['solution1']}\ 60 | \nInstruction:\n{examples['instruction2']}\ 61 | \nSolution:\n{examples['solution2']}\ 62 | \nInstruction:\n{text}\ 63 | \nSolution:\n" 64 | assert ( 65 | prompt.count("Solution:\n") == 3 66 | ), "Splitting operation in postprocess_generation is invalid" 67 | return entry + prompt 68 | 69 | def get_prompt(self, doc): 70 | """Builds the prompt for the LM to generate from.""" 71 | examples = self.fewshot_examples() 72 | text_column = "rewritten_intent" if doc["rewritten_intent"] else "intent" 73 | text = doc[text_column].strip() 74 | entry = "Answer the following instructions in one line of Python code:\n" 75 | prompt = self.two_shot_prompt(entry, text, examples) 76 | return prompt 77 | 78 | def get_reference(self, doc): 79 | """Builds the reference solution for the doc (sample from the test dataset).""" 80 | return doc["snippet"] 81 | 82 | def postprocess_generation(self, generation, idx): 83 | """Defines the postprocessing for a LM generation. 84 | :param generation: str 85 | code generation from LM 86 | :param idx: int 87 | index of doc in the dataset to which the generation belongs 88 | (not used for this task) 89 | """ 90 | output = generation.split("Solution:\n", 3)[-1].strip() 91 | return output 92 | 93 | def process_results(self, generations, references): 94 | """Takes the list of LM generations and evaluates them against ground truth references, 95 | returning the metric for the generations. 
96 | :param generations: list(list(str)) 97 | list of lists containing generations 98 | :param references: list(str) 99 | list of str containing references 100 | """ 101 | bleu = load("bleu") 102 | gens = [gen[0] for gen in generations] 103 | results = bleu.compute( 104 | references=references, predictions=gens, max_order=4, smooth=True 105 | ) 106 | return results 107 | -------------------------------------------------------------------------------- /human_eval/finetuning/Code-to-text/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from datasets import load_dataset 4 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 5 | Trainer, TrainingArguments, set_seed) 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 12 | ) 13 | parser.add_argument("--language", type=str, default="Python") 14 | parser.add_argument("--max_length", type=int, default=1024) 15 | parser.add_argument("--num_epochs", type=int, default=5) 16 | parser.add_argument("--batch_size", type=int, default=6) 17 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 18 | parser.add_argument("--freeze", type=bool, default=True) 19 | parser.add_argument("--learning_rate", type=float, default=5e-4) 20 | parser.add_argument("--seed", type=int, default=0) 21 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 22 | parser.add_argument("--num_warmup_steps", type=int, default=10) 23 | parser.add_argument("--weight_decay", type=float, default=0.01) 24 | parser.add_argument("--output_dir", type=str, default="./results") 25 | parser.add_argument("--push_to_hub", type=bool, default=False) 26 | parser.add_argument("--model_hub_name", type=str, default="codeclone_model") 27 | return parser.parse_args() 28 | 29 | 30 | def main(): 31 | args = get_args() 32 | set_seed(args.seed) 33 | 34 | ds = load_dataset("code_x_glue_ct_code_to_text", args.language) 35 | 36 | print("Loading tokenizer and model") 37 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 38 | tokenizer.pad_token = tokenizer.eos_token 39 | model = AutoModelForSequenceClassification.from_pretrained( 40 | args.model_ckpt, num_labels=2 41 | ) 42 | model.config.pad_token_id = model.config.eos_token_id 43 | 44 | if args.freeze: 45 | for param in model.roberta.parameters(): 46 | param.requires_grad = False 47 | 48 | def tokenize(example): 49 | if args.language == "Python": 50 | # remove docstring from code 51 | chunks = example["code"].split('"""') 52 | code = chunks[0].strip() + chunks[2] 53 | else: 54 | code = example["code"] 55 | inputs = tokenizer( 56 | code, padding="max_length", truncation=True, max_length=args.max_length 57 | ) 58 | labels = tokenizer( 59 | example["docstring"], 60 | padding="max_length", 61 | truncation=True, 62 | max_length=args.max_length, 63 | ).input_ids 64 | labels_with_ignore_index = [] 65 | for labels_example in labels: 66 | labels_example = [label if label != 0 else -100 for label in labels_example] 67 | labels_with_ignore_index.append(labels_example) 68 | 69 | return { 70 | "input_ids": inputs["input_ids"], 71 | "attention_mask": inputs["attention_mask"], 72 | "label": labels_with_ignore_index, 73 | } 74 | 75 | tokenized_datasets = ds.map( 76 | tokenize, 77 | batched=True, 78 | remove_columns=ds["train"].column_names, 79 | ) 80 | 81 | training_args = TrainingArguments( 82 | 
output_dir=args.output_dir, 83 | learning_rate=args.learning_rate, 84 | lr_scheduler_type=args.lr_scheduler_type, 85 | evaluation_strategy="epoch", 86 | save_strategy="epoch", 87 | logging_strategy="epoch", 88 | per_device_train_batch_size=args.batch_size, 89 | per_device_eval_batch_size=args.batch_size, 90 | num_train_epochs=args.num_epochs, 91 | gradient_accumulation_steps=args.gradient_accumulation_steps, 92 | weight_decay=args.weight_decay, 93 | run_name=f"code-to-text-{args.language}", 94 | report_to="wandb", 95 | ) 96 | 97 | trainer = Trainer( 98 | model=model, 99 | args=training_args, 100 | train_dataset=tokenized_datasets["train"], 101 | eval_dataset=tokenized_datasets["validation"], 102 | tokenizer=tokenizer, 103 | ) 104 | 105 | print("Training...") 106 | trainer.train() 107 | 108 | # push the model to the Hugging Face hub 109 | if args.push_to_hub: 110 | model.push_to_hub(args.model_hub_name) 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | import os 4 | import warnings 5 | 6 | from bigcode_eval import tasks 7 | from bigcode_eval.generation import parallel_generations 8 | 9 | _WARNING = """ 10 | ################################################################################ 11 | !!!WARNING!!! 12 | ################################################################################ 13 | The "code_eval"/"apps_metric" you are about to use, execute untrusted 14 | model-generated code in Python. 15 | Although it is highly unlikely that model-generated code will do something 16 | overtly malicious in response to this test suite, model-generated code may act 17 | destructively due to a lack of model capability or alignment. 18 | Users are strongly encouraged to sandbox this evaluation suite so that it 19 | does not perform destructive actions on their host or network. For more 20 | information on how OpenAI sandboxes its code, see the paper "Evaluating Large 21 | Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). 22 | Once you have read this disclaimer and taken appropriate precautions, set the argument 23 | "allow_code_execution" to True. 
24 | ################################################################################\ 25 | """ 26 | 27 | 28 | class Evaluator: 29 | def __init__(self, accelerator, model, tokenizer, args): 30 | self.accelerator = accelerator 31 | self.model = model 32 | self.tokenizer = tokenizer 33 | self.args = args 34 | 35 | # setup arguments 36 | self.metric_output_path = args.metric_output_path 37 | 38 | # code evaluation permission 39 | self.allow_code_execution = args.allow_code_execution 40 | 41 | def generate_text(self, task_name): 42 | task = tasks.get_task(task_name, self.args) 43 | dataset = task.get_dataset() 44 | # if args.limit is None, use all samples 45 | n_tasks = self.args.limit if self.args.limit else len(dataset) 46 | references = [task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start+n_tasks)] 47 | 48 | if self.args.check_references: 49 | if "get_solution" in inspect.signature(task.get_reference).parameters: 50 | solutions = [[task.get_reference(dataset[i], get_solution=True)] for i in range(self.args.limit_start, self.args.limit_start+n_tasks)] 51 | else: 52 | solutions = [[ref] for ref in references] 53 | return solutions, references 54 | 55 | generations = parallel_generations( 56 | task, 57 | dataset, 58 | self.accelerator, 59 | self.model, 60 | self.tokenizer, 61 | n_tasks=n_tasks, 62 | args=self.args, 63 | ) 64 | if len(generations[0]) > self.args.n_samples: 65 | generations = [l[: self.args.n_samples] for l in generations] 66 | warnings.warn( 67 | f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}" 68 | ) 69 | return generations, references 70 | 71 | def evaluate(self, task_name): 72 | task = tasks.get_task(task_name, self.args) 73 | if task.requires_execution and not self.allow_code_execution: 74 | raise ValueError(_WARNING) 75 | 76 | generations, references = self.generate_text(task_name) 77 | 78 | if self.accelerator.is_main_process: 79 | if not self.args.load_generations_path: 80 | if self.args.save_generations: 81 | with open(self.args.save_generations_path, "w") as fp: 82 | json.dump(generations, fp) 83 | print( 84 | f"generations were saved at {self.args.save_generations_path}" 85 | ) 86 | if self.args.save_references: 87 | with open("references.json", "w") as fp: 88 | json.dump(references, fp) 89 | print("references were saved at references.json") 90 | 91 | # make sure tokenizer plays nice with multiprocessing 92 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 93 | if self.allow_code_execution and task.requires_execution: 94 | os.environ["HF_ALLOW_CODE_EVAL"] = "1" 95 | print("Evaluating generations...") 96 | results = task.process_results(generations, references) 97 | return results 98 | -------------------------------------------------------------------------------- /sensitive_memorization/generate_secret_mask.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from datasets import load_from_disk 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | 7 | 8 | def mask_example(example): 9 | content = example['content'].encode('utf-8', 'ignore').decode('utf-8') 10 | secrets = eval(example['secrets']) 11 | sorted_secrets = sorted(secrets, key=lambda secret: secret['start']) 12 | # print(sorted_secrets) 13 | 14 | encoding = tokenizer(content, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length', return_offsets_mapping=True) 
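    # A worked illustration of the span-overlap test applied a few lines below (numbers made up,
    # not taken from the dataset): a token whose offset_mapping span is (10, 14) is flagged for a
    # secret covering characters 12..20, since 10 <= 20 and 14 >= 12, while a token spanning (0, 5)
    # is not, since 5 < 12. OR-ing these per-secret checks together is what builds secret_mask.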
15 | content_token_ids = encoding.input_ids 16 | offset_mapping = encoding.offset_mapping 17 | 18 | secret_mask = torch.zeros_like(content_token_ids, dtype=torch.bool) 19 | for secret in sorted_secrets: 20 | secret_start = secret['start'] 21 | secret_end = secret['end'] 22 | offset_mapping_start_check = offset_mapping[..., 0] <= secret_end # !!! < or <= 23 | offset_mapping_end_check = offset_mapping[..., 1] >= secret_start # !!! > or >= 24 | secret_mask = secret_mask + (offset_mapping_start_check.int() * offset_mapping_end_check.int()).bool() 25 | secret_token_ids = content_token_ids[secret_mask] 26 | # print(secret_token_ids) 27 | # print(tokenizer.convert_ids_to_tokens(secret_token_ids)) 28 | if secret_token_ids.shape[0] == 0: 29 | example['keep_flag'] = False 30 | else: 31 | example['keep_flag'] = True 32 | 33 | example['content'] = content 34 | example['secrets'] = str(sorted_secrets) 35 | example['content_token_ids'] = content_token_ids 36 | example['offset_mapping'] = offset_mapping 37 | example['secret_mask'] = secret_mask 38 | 39 | return example 40 | 41 | 42 | def filter_example(example): 43 | content = example['content'] 44 | secrets = eval(example['secrets']) 45 | 46 | encoding = tokenizer(content, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length', return_offsets_mapping=True) 47 | content_token_ids = encoding.input_ids 48 | offset_mapping = encoding.offset_mapping 49 | if secrets[0]['start'] > offset_mapping[0, -1, -1]: 50 | return False 51 | else: 52 | return True 53 | 54 | 55 | def update_example(example): 56 | content = example['content'] 57 | secrets = eval(example['secrets']) 58 | 59 | encoding = tokenizer(content, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length', return_offsets_mapping=True) 60 | content_token_ids = encoding.input_ids 61 | offset_mapping = encoding.offset_mapping 62 | if offset_mapping[0, -1, -1] == 0: 63 | return example 64 | elif offset_mapping[0, -1, -1] >= secrets[-1]['start']: 65 | return example 66 | else: 67 | truncation_index = len(secrets) 68 | for index in range(len(secrets)): 69 | if secrets[index]['start'] > offset_mapping[0, -1, -1]: 70 | truncation_index = index 71 | break 72 | example['secrets'] = str(secrets[:truncation_index]) 73 | example['number_secrets'] = truncation_index 74 | return example 75 | 76 | 77 | def main(): 78 | dataset_path = f"./codeparrot-clean-train-secrets-masked-{args.model_name_or_path.split('/')[-1]}" 79 | if os.path.exists(dataset_path): 80 | ds_pii = load_from_disk(dataset_path) 81 | else: 82 | ds_pii = load_from_disk('codeparrot-clean-train-secrets-filtered') 83 | ds_pii = ds_pii.map(mask_example, num_proc=48) 84 | ds_pii = ds_pii.filter(lambda example: example['keep_flag'], batched=True, batch_size=1000, num_proc=48) 85 | ds_pii = ds_pii.filter(filter_example, num_proc=32) 86 | ds_pii = ds_pii.map(update_example, num_proc=32) 87 | ds_pii.save_to_disk(dataset_path) 88 | print(ds_pii) 89 | 90 | 91 | if __name__ == '__main__': 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--model_name_or_path", default="codeparrot/codeparrot", type=str, 94 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 95 | args = parser.parse_args() 96 | print(args) 97 | 98 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, local_files_only=True) 99 | # tokenizer.pad_token = tokenizer.eos_token 100 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 101 | 102 | # max_seq_len = 1024 103 | 
max_seq_len = 512 104 | 105 | main() 106 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/codexglue_text_to_text.py: -------------------------------------------------------------------------------- 1 | """ 2 | CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation 3 | https://arxiv.org/abs/2102.04664 4 | 5 | Text to text task from CodeXGlue (documentation translation) 6 | """ 7 | 8 | import json 9 | import os 10 | import re 11 | 12 | from evaluate import load 13 | 14 | from bigcode_eval.base import Task 15 | 16 | _CITATION = """ 17 | @article{CodeXGLUE, 18 | title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, 19 | year={2020},} 20 | """ 21 | 22 | SOURCE_LANG = { 23 | "da_en": "danish", 24 | "zh_en": "chinese", 25 | "no_en": "norwegian", 26 | "lv_en": "latvian", 27 | } 28 | 29 | 30 | def create_all_tasks(): 31 | """Creates a dictionary of tasks from a list of languages 32 | :return: {task_name: task} 33 | e.g. {codexglue_text_to_text-da_en: Task, codexglue_text_to_text-zh_en: Task} 34 | """ 35 | return { 36 | f"codexglue_text_to_text-{translation_task}": create_task(translation_task) 37 | for translation_task in SOURCE_LANG 38 | } 39 | 40 | 41 | def create_task(translation_task): 42 | class CodexglueTextToTextTask(CodexglueTextToText): 43 | def __init__(self): 44 | super().__init__(translation_task) 45 | 46 | return CodexglueTextToTextTask 47 | 48 | 49 | class CodexglueTextToText(Task): 50 | 51 | DATASET_PATH = "code_x_glue_tt_text_to_text" 52 | DATASET_NAME = None 53 | 54 | def __init__(self, translation_task): 55 | self.DATASET_NAME = translation_task 56 | stop_words = ["\n"] 57 | requires_execution = False 58 | super().__init__(stop_words, requires_execution) 59 | 60 | def get_dataset(self): 61 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 62 | return self.dataset["test"] 63 | 64 | def fewshot_examples(self): 65 | """Loads and returns the few-shot examples for the task if they exist.""" 66 | with open( 67 | "bigcode_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json", 68 | "r", 69 | ) as file: 70 | examples = json.load(file) 71 | return examples 72 | 73 | @staticmethod 74 | def two_shot_prompt(entry, text, examples, language): 75 | """Two shot prompt format as source & target language documentation""" 76 | prompt = f"\n{language.title()}:\n{examples['source1']}\ 77 | \nEnglish:\n{examples['target1']}\ 78 | \n{language.title()}:\n{examples['source2']}\ 79 | \nEnglish:\n{examples['target2']}\ 80 | \n{language.title()}:\n{text}\ 81 | \nEnglish:\n" 82 | return entry + prompt 83 | 84 | def get_prompt(self, doc): 85 | """Builds the prompt for the LM to generate from.""" 86 | language = SOURCE_LANG[self.DATASET_NAME] 87 | text = doc["source"] 88 | entry = f"Translate the following documentation from {language.title()} to English:\n" 89 | examples = self.fewshot_examples() 90 | examples = examples[language] 91 | prompt = self.two_shot_prompt(entry, text, examples, language) 92 | return prompt 93 | 94 | def get_reference(self, doc): 95 | """Builds the reference solution for the doc (sample from the test dataset).""" 96 | return doc["target"].strip() 97 | 98 | def postprocess_generation(self, generation, idx): 99 | """Defines the postprocessing for a LM generation. 
100 | :param generation: str 101 | code generation from LM 102 | :param idx: int 103 | index of doc in the dataset to which the generation belongs 104 | (not used for this task) 105 | """ 106 | output = generation.split("\nEnglish:\n", 3)[-1].strip() 107 | return output 108 | 109 | def process_results(self, generations, references): 110 | """Takes the list of LM generations and evaluates them against ground truth references, 111 | returning the metric for the generations. 112 | :param generations: list(list(str)) 113 | list of lists containing generations 114 | :param references: list(str) 115 | list of str containing references 116 | """ 117 | bleu = load("bleu") 118 | gens = [gen[0] for gen in generations] 119 | results = bleu.compute( 120 | references=references, predictions=gens, max_order=4, smooth=True 121 | ) 122 | return results 123 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/apps.py: -------------------------------------------------------------------------------- 1 | """Measuring Coding Challenge Competence With APPS 2 | https://arxiv.org/abs/2105.09938 3 | 4 | APPS is a benchmark for code generation with 10000 problems. With three difficulty levels: introductory, interview and competition. 5 | It can be used to evaluate the ability of language models to generate code from natural language specifications. 6 | 7 | Homepage: https://github.com/hendrycks/apps 8 | """ 9 | 10 | import json 11 | 12 | from evaluate import load 13 | 14 | from bigcode_eval.base import Task 15 | 16 | _CITATION = """ 17 | @article{hendrycksapps2021, 18 | title={Measuring Coding Challenge Competence With APPS}, 19 | author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, 20 | journal={NeurIPS}, 21 | year={2021} 22 | } 23 | """ 24 | 25 | 26 | LEVELS = ["introductory", "interview", "competition"] 27 | 28 | 29 | def create_all_tasks(): 30 | """Creates a dictionary of tasks from a list of levels 31 | :return: {task_name: task} 32 | e.g. {apps-interview: Task, apps-competitoon: Task} 33 | """ 34 | return {f"apps-{level}": create_task(level) for level in LEVELS} 35 | 36 | 37 | def create_task(level): 38 | class APPS(GeneralAPPS): 39 | def __init__(self): 40 | super().__init__(level) 41 | 42 | return APPS 43 | 44 | 45 | class GeneralAPPS(Task): 46 | """A task represents an entire benchmark including its dataset, problems, 47 | answers, generation settings and evaluation methods. 48 | """ 49 | 50 | DATASET_PATH = "codeparrot/apps" 51 | DATASET_NAME = None 52 | 53 | def __init__(self, level): 54 | self.DATASET_NAME = level 55 | super().__init__( 56 | stop_words=["\nQUESTION", "\n---", "\nANSWER"], 57 | requires_execution=True, 58 | ) 59 | 60 | def get_dataset(self): 61 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 62 | return self.dataset["test"] 63 | 64 | def get_prompt(self, doc): 65 | """Generate prompts for APPS 66 | Finetuning setup: prompt=question with some starter code and function name if they exist. 67 | We also specify the type of the prompt, i.e. whether it is call-based or standard input-based. 
68 | """ 69 | starter_code = None if len(doc["starter_code"]) == 0 else doc["starter_code"] 70 | try: 71 | input_outpout = json.loads(doc["input_output"]) 72 | fn_name = ( 73 | None if not input_outpout.get("fn_name") else input_outpout["fn_name"] 74 | ) 75 | except ValueError: 76 | fn_name = None 77 | prompt = "\nQUESTION:\n" 78 | prompt += doc["question"] 79 | if starter_code: 80 | prompt += starter_code 81 | if not fn_name: 82 | call_format = "\nUse Standard Input format" 83 | prompt += call_format 84 | else: 85 | call_format = "\nUse Call-Based format" 86 | prompt += call_format 87 | prompt += "\nANSWER:\n" 88 | return prompt 89 | 90 | def get_reference(self, doc): 91 | """Builds the reference solution for the doc (sample from the test dataset).""" 92 | return None 93 | 94 | def postprocess_generation(self, generation, idx): 95 | """Defines the postprocessing for a LM generation. 96 | :param generation: str 97 | code generation from LM 98 | :param idx: int 99 | index of doc in the dataset to which the generation belongs 100 | (not used for APPS) 101 | """ 102 | try: 103 | generation = generation.split("\nANSWER:", 1)[1] 104 | except IndexError: 105 | # happens when prompts were very long and got truncated 106 | pass 107 | return generation 108 | 109 | def process_results(self, generations, references): 110 | """Takes the list of LM generations and evaluates them against ground truth references, 111 | returning the metric for the generations. 112 | :param generations: list(list(str)) 113 | list of lists containing generations 114 | :param references: list(str) 115 | list of str containing refrences (not needed for APPS Task) 116 | """ 117 | code_metric = load("codeparrot/apps_metric") 118 | results = code_metric.compute( 119 | predictions=generations, k_list=[1, 10, 100], level=self.DATASET_NAME 120 | ) 121 | return results 122 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/pal_metric/pal_code_exec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from collections import Counter, defaultdict 4 | from concurrent.futures import ThreadPoolExecutor, as_completed 5 | 6 | from bigcode_eval.tasks.custom_metrics.pal_metric.python_executor import run_program 7 | 8 | # adapted from https://github.com/huggingface/evaluate/blob/main/metrics/code_eval/code_eval.py 9 | 10 | _WARNING = """ 11 | ################################################################################ 12 | !!!WARNING!!! 13 | ################################################################################ 14 | The "code_eval" metric executes untrusted model-generated code in Python. 15 | Although it is highly unlikely that model-generated code will do something 16 | overtly malicious in response to this test suite, model-generated code may act 17 | destructively due to a lack of model capability or alignment. 18 | Users are strongly encouraged to sandbox this evaluation suite so that it 19 | does not perform destructive actions on their host or network. For more 20 | information on how OpenAI sandboxes its code, see the paper "Evaluating Large 21 | Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). 22 | Once you have read this disclaimer and taken appropriate precautions, 23 | set the environment variable HF_ALLOW_CODE_EVAL="1". 
Within Python you can do this 24 | with: 25 | >>> import os 26 | >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1" 27 | ################################################################################\ 28 | """ 29 | 30 | 31 | def compute( 32 |     predictions, 33 |     references, 34 |     num_workers=4, 35 |     timeout=3.0, 36 |     majority_voting=False, 37 |     answer_symbol=None, 38 | ): 39 |     """ 40 |     Returns the scores 41 | 42 |     :param majority_voting: bool 43 |         Takes the majority voted answer to evaluate against the reference, defaults to False 44 | 45 |     :param answer_symbol: str 46 |         If specified, the result of execution is fetched from the program's global context, 47 |         the program is expected to have the variable named in `answer_symbol` available in its globals. 48 |         if not specified, the result is fetched from the stdout of the execution 49 |         defaults to None. 50 | 51 |     """ 52 | 53 |     if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1": 54 |         raise ValueError(_WARNING) 55 | 56 |     if os.name == "nt": 57 |         raise NotImplementedError("This metric is currently not supported on Windows.") 58 | 59 |     with ThreadPoolExecutor(max_workers=num_workers) as executor: 60 |         futures = [] 61 |         completion_id = Counter() 62 |         n_samples = 0 63 |         results = defaultdict(list) 64 | 65 |         for task_id, candidates in enumerate(predictions): 66 |             for candidate in candidates: 67 |                 args = (candidate, timeout, task_id, completion_id[task_id]) 68 |                 if answer_symbol: 69 |                     args += (answer_symbol,) 70 |                 future = executor.submit(run_program, *args) 71 |                 futures.append(future) 72 |                 completion_id[task_id] += 1 73 |                 n_samples += 1 74 | 75 |         for future in as_completed(futures): 76 |             result = future.result() 77 |             results[result["task_id"]].append((result["completion_id"], result)) 78 | 79 |     answers = [None] * len(results) 80 |     for result in results.values(): 81 |         result.sort() 82 |         task_id = result[0][1]["task_id"] 83 |         # filtering the failed generations to avoid influencing majority voting 84 |         eval_answers = [ 85 |             r[1]["result"] 86 |             for r in result 87 |             if isinstance(r[1]["result"], str) 88 |             and not r[1]["result"].startswith("failed:") 89 |         ] 90 |         # if all generations failed - default to empty str for scoring 91 |         eval_answers = [""] if len(eval_answers) == 0 else eval_answers 92 |         if majority_voting: 93 |             counter = Counter(eval_answers) 94 |             eval_answers = [counter.most_common()[0][0]] 95 | 96 |         if not majority_voting and len(eval_answers) > 1: 97 |             warnings.warn( 98 |                 f"Multiple generations found for a task without setting `majority_voting` to True, defaulting to the answer from the first generation" 99 |             ) 100 |         answers[task_id] = eval_answers[0] 101 | 102 |     scores = [] 103 |     # Number of code generations that failed execution.
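    # Illustrative values (assumed, not from a real run): the tolerance check below scores
    #     ans = "8.0", ref = "8"  ->  abs(8.0 - 8) < 1e-3  ->  score = 1
    # while a non-numeric answer such as "" raises ValueError and is counted in `errored`.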
104 | errored = 0 105 | for task_id, (ans, ref) in enumerate(zip(answers, references)): 106 | try: 107 | score = 1 if abs(float(ans) - float(ref)) < 1e-3 else 0 108 | except ValueError as e: 109 | errored += 1 110 | score = 0 111 | 112 | scores.append(score) 113 | 114 | return {"accuracy": sum(scores) / len(scores), "num_failed_execution": errored} 115 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeDefect/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from datasets import ClassLabel, load_dataset 6 | from evaluate import load 7 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 8 | DataCollatorWithPadding, Trainer, TrainerCallback, 9 | TrainingArguments, set_seed) 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 16 | ) 17 | parser.add_argument("--max_length", type=int, default=1024) 18 | parser.add_argument("--num_epochs", type=int, default=5) 19 | parser.add_argument("--batch_size", type=int, default=6) 20 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 21 | parser.add_argument("--freeze", type=bool, default=True) 22 | parser.add_argument("--learning_rate", type=float, default=5e-4) 23 | parser.add_argument("--seed", type=int, default=0) 24 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 25 | parser.add_argument("--num_warmup_steps", type=int, default=10) 26 | parser.add_argument("--weight_decay", type=float, default=0.01) 27 | parser.add_argument("--output_dir", type=str, default="./results") 28 | parser.add_argument("--push_to_hub", type=bool, default=False) 29 | parser.add_argument("--model_hub_name", type=str, default="codedefect_model") 30 | return parser.parse_args() 31 | 32 | 33 | metric = load("accuracy") 34 | 35 | 36 | def compute_metrics(eval_pred): 37 | predictions, labels = eval_pred 38 | predictions = np.argmax(predictions, axis=1) 39 | return metric.compute(predictions=predictions, references=labels) 40 | 41 | 42 | class CustomCallback(TrainerCallback): 43 | def __init__(self, trainer) -> None: 44 | super().__init__() 45 | self._trainer = trainer 46 | 47 | def on_epoch_end(self, args, state, control, **kwargs): 48 | if control.should_evaluate: 49 | control_copy = deepcopy(control) 50 | self._trainer.evaluate( 51 | eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" 52 | ) 53 | return control_copy 54 | 55 | 56 | def main(): 57 | args = get_args() 58 | set_seed(args.seed) 59 | 60 | ds = load_dataset("code_x_glue_cc_defect_detection") 61 | labels = ClassLabel(num_classes=2, names=[True, False]) 62 | ds = ds.cast_column("target", labels) 63 | ds = ds.rename_column("target", "label") 64 | 65 | print("Loading tokenizer and model") 66 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 67 | tokenizer.pad_token = tokenizer.eos_token 68 | model = AutoModelForSequenceClassification.from_pretrained( 69 | args.model_ckpt, num_labels=2 70 | ) 71 | model.config.pad_token_id = model.config.eos_token_id 72 | 73 | if args.freeze: 74 | for param in model.roberta.parameters(): 75 | param.requires_grad = False 76 | 77 | def tokenize(example): 78 | inputs = tokenizer(example["func"], truncation=True, max_length=args.max_length) 79 | return { 80 | "input_ids": inputs["input_ids"], 81 | 
"attention_mask": inputs["attention_mask"], 82 | "label": example["target"], 83 | } 84 | 85 | tokenized_datasets = ds.map( 86 | tokenize, 87 | batched=True, 88 | remove_columns=["id", "func", "project", "commit_id"], 89 | ) 90 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 91 | 92 | training_args = TrainingArguments( 93 | output_dir=args.output_dir, 94 | learning_rate=args.learning_rate, 95 | lr_scheduler_type=args.lr_scheduler_type, 96 | evaluation_strategy="epoch", 97 | save_strategy="epoch", 98 | logging_strategy="epoch", 99 | per_device_train_batch_size=args.batch_size, 100 | per_device_eval_batch_size=args.batch_size, 101 | num_train_epochs=args.num_epochs, 102 | gradient_accumulation_steps=args.gradient_accumulation_steps, 103 | weight_decay=args.weight_decay, 104 | metric_for_best_model="accuracy", 105 | run_name="code-defect-c", 106 | report_to="wandb", 107 | ) 108 | 109 | trainer = Trainer( 110 | model=model, 111 | args=training_args, 112 | train_dataset=tokenized_datasets["train"], 113 | eval_dataset=tokenized_datasets["validation"], 114 | tokenizer=tokenizer, 115 | data_collator=data_collator, 116 | compute_metrics=compute_metrics, 117 | ) 118 | 119 | print("Training...") 120 | trainer.add_callback(CustomCallback(trainer)) 121 | trainer.train() 122 | 123 | result = trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 124 | print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}") 125 | 126 | # push the model to the Hugging Face hub 127 | if args.push_to_hub: 128 | model.push_to_hub(args.model_hub_name) 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeClone/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from datasets import ClassLabel, load_dataset 6 | from evaluate import load 7 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 8 | DataCollatorWithPadding, Trainer, TrainerCallback, 9 | TrainingArguments, set_seed) 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 16 | ) 17 | parser.add_argument("--max_length", type=int, default=1024) 18 | parser.add_argument("--num_epochs", type=int, default=5) 19 | parser.add_argument("--batch_size", type=int, default=6) 20 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 21 | parser.add_argument("--freeze", type=bool, default=True) 22 | parser.add_argument("--learning_rate", type=float, default=5e-4) 23 | parser.add_argument("--seed", type=int, default=0) 24 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 25 | parser.add_argument("--num_warmup_steps", type=int, default=10) 26 | parser.add_argument("--weight_decay", type=float, default=0.01) 27 | parser.add_argument("--output_dir", type=str, default="./results") 28 | parser.add_argument("--push_to_hub", type=bool, default=False) 29 | parser.add_argument("--model_hub_name", type=str, default="codeclone_model") 30 | return parser.parse_args() 31 | 32 | 33 | metric = load("accuracy") 34 | 35 | 36 | def compute_metrics(eval_pred): 37 | predictions, labels = eval_pred 38 | predictions = np.argmax(predictions, axis=1) 39 | return metric.compute(predictions=predictions, references=labels) 40 | 41 | 42 | class 
CustomCallback(TrainerCallback): 43 | def __init__(self, trainer) -> None: 44 | super().__init__() 45 | self._trainer = trainer 46 | 47 | def on_epoch_end(self, args, state, control, **kwargs): 48 | if control.should_evaluate: 49 | control_copy = deepcopy(control) 50 | self._trainer.evaluate( 51 | eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" 52 | ) 53 | return control_copy 54 | 55 | 56 | def main(): 57 | args = get_args() 58 | set_seed(args.seed) 59 | 60 | ds = load_dataset("code_x_glue_cc_clone_detection_big_clone_bench") 61 | labels = ClassLabel(num_classes=2, names=[True, False]) 62 | ds = ds.cast_column("label", labels) 63 | 64 | print("Loading tokenizer and model") 65 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 66 | tokenizer.pad_token = tokenizer.eos_token 67 | model = AutoModelForSequenceClassification.from_pretrained( 68 | args.model_ckpt, num_labels=2 69 | ) 70 | model.config.pad_token_id = model.config.eos_token_id 71 | 72 | if args.freeze: 73 | for param in model.roberta.parameters(): 74 | param.requires_grad = False 75 | 76 | def tokenize(example): 77 | inputs = tokenizer( 78 | example["func1"], 79 | example["func2"], 80 | truncation=True, 81 | max_length=args.max_length, 82 | ) 83 | return { 84 | "input_ids": inputs["input_ids"], 85 | "attention_mask": inputs["attention_mask"], 86 | } 87 | 88 | tokenized_datasets = ds.map( 89 | tokenize, 90 | batched=True, 91 | remove_columns=["id", "id1", "id2", "func1", "func2"], 92 | ) 93 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 94 | 95 | training_args = TrainingArguments( 96 | output_dir=args.output_dir, 97 | learning_rate=args.learning_rate, 98 | lr_scheduler_type=args.lr_scheduler_type, 99 | evaluation_strategy="epoch", 100 | save_strategy="epoch", 101 | logging_strategy="epoch", 102 | per_device_train_batch_size=args.batch_size, 103 | per_device_eval_batch_size=args.batch_size, 104 | num_train_epochs=args.num_epochs, 105 | gradient_accumulation_steps=args.gradient_accumulation_steps, 106 | weight_decay=args.weight_decay, 107 | metric_for_best_model="accuracy", 108 | run_name="code-clone-java", 109 | report_to="wandb", 110 | ) 111 | 112 | trainer = Trainer( 113 | model=model, 114 | args=training_args, 115 | train_dataset=tokenized_datasets["train"], 116 | eval_dataset=tokenized_datasets["validation"], 117 | tokenizer=tokenizer, 118 | data_collator=data_collator, 119 | compute_metrics=compute_metrics, 120 | ) 121 | 122 | print("Training...") 123 | trainer.add_callback(CustomCallback(trainer)) 124 | trainer.train() 125 | 126 | result = trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 127 | print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}") 128 | 129 | # push the model to the Hugging Face hub 130 | if args.push_to_hub: 131 | model.push_to_hub(args.model_hub_name) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/python_bugs.py: -------------------------------------------------------------------------------- 1 | """Python Bugs 2 | https://proceedings.mlr.press/v162/he22a.html 3 | 4 | This dataset is taken from the preprossing done by CarperAI (https://carper.ai/diff-models-a-new-way-to-edit-code). 5 | It is uploaded here: https://huggingface.co/datasets/Muennighoff/python-bugs 6 | 7 | Make sure to run with sufficient context length (512 is not enough for e.g. CodeGen). 
8 | """ 9 | 10 | import re 11 | 12 | from evaluate import load 13 | from bigcode_eval.base import Task 14 | import tqdm 15 | 16 | _CITATION = """ 17 | @inproceedings{he2022distribution, 18 | title={On distribution shift in learning-based bug detectors}, 19 | author={He, Jingxuan and Beurer-Kellner, Luca and Vechev, Martin}, 20 | booktitle={International Conference on Machine Learning}, 21 | pages={8559--8580}, 22 | year={2022}, 23 | organization={PMLR} 24 | } 25 | """ 26 | 27 | MUTATE_TO_TASK_TO_PROMPT = { 28 | "prompt_carper": { 29 | "bin-op": "# Fixed binary operator", 30 | "var-misuse": "# Fixed incorrect variable name", 31 | }, 32 | "prompt_present": { 33 | "bin-op": "# Fix binary operator", 34 | "var-misuse": "# Fix incorrect variable name", 35 | }, 36 | # Same as prompt_carper, but other parts are still different 37 | "prompt": { 38 | "bin-op": "# Fixed binary operator", 39 | "var-misuse": "# Fixed incorrect variable name", 40 | }, 41 | "edit": { 42 | "bin-op": "Fix binary operator", 43 | "var-misuse": "Fix incorrect variable name", 44 | }, 45 | } 46 | 47 | def mutate_code(input_code, task, prompt="prompt"): 48 | """ 49 | Create template for code mutation. 50 | Args: 51 | input_code: code to be mutated 52 | task: task to be performed 53 | prompt: (Optional) 'edit' or 'prompt' 54 | Returns: 55 | template for code mutation 56 | """ 57 | instruction = MUTATE_TO_TASK_TO_PROMPT[prompt][task] 58 | if prompt == "prompt_carper": 59 | return f"# A buggy implementation\n#!/usr/bin/python3\n{input_code}\n{instruction}\ndef" 60 | if prompt == "prompt": 61 | return f"#!/usr/bin/python3\n# A buggy implementation\n{input_code}\n{instruction}\ndef" 62 | if prompt == "edit": 63 | return f"{input_code}{instruction}" 64 | else: 65 | raise ValueError(f"Unknown prompt: {prompt}") 66 | 67 | 68 | class PythonBugs(Task): 69 | 70 | DATASET_PATH = "Muennighoff/python-bugs" 71 | 72 | def __init__(self, prompt="prompt"): 73 | super().__init__( 74 | # Correct code always starts with `def ...` and is a single function, so stop everything else 75 | # Since a function always has a tab, stop when the first line does not have a tab 76 | stop_words=[ 77 | "\nclass", "\n#", "\ndef", "\nassert", '\n"', "\nprint", "\nif", 78 | # Special cases for edit 79 | "", "", "", "<|endoftext|>", 80 | ], 81 | requires_execution=True, 82 | ) 83 | self.max_length_multiplier = 2.25 # Allow 2.25 times the length of the prompt 84 | self.prompt = prompt 85 | 86 | def get_dataset(self): 87 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 88 | dataset = self.dataset["train"] 89 | return dataset 90 | 91 | def get_prompt(self, doc): 92 | """Builds the prompt for the LM to generate from.""" 93 | return mutate_code(doc["prompt_code"], doc["task"], self.prompt) 94 | 95 | def get_reference(self, doc): 96 | """Builds the reference solution for the doc (sample from the test dataset).""" 97 | return doc["correct_code"] 98 | 99 | def postprocess_generation(self, generation, idx): 100 | """Defines the postprocessing for a LM generation. 
101 | :param generation: str 102 | code generation from LM 103 | :param idx: int 104 | index of doc in the dataset to which the generation belongs 105 | """ 106 | doc = self.get_dataset()[idx] 107 | prompt = self.get_prompt(doc) 108 | correct_code = self.get_reference(doc) 109 | output = generation[len(prompt):] 110 | if self.prompt.startswith("prompt"): 111 | output = "def" + output # Add def which is in the prompt back to the output 112 | return output[:len(correct_code)] 113 | 114 | def process_results(self, generations, references): 115 | """Takes the list of LM generations and evaluates them against ground truth references, 116 | returning the metric for the generations. 117 | :param generations: list(list(str)) 118 | list of lists containing generations 119 | :param references: list(str) 120 | list of str containing refrences 121 | """ 122 | num_correct = 0 123 | print("Scoring generations...") 124 | for i, ref in tqdm.tqdm(enumerate(references), total=len(references)): 125 | for gen in generations[i]: 126 | num_correct += int(gen == ref) 127 | accuracy = num_correct / len(references) / len(generations[0]) 128 | return {"mean exact match": accuracy} 129 | -------------------------------------------------------------------------------- /unlearning/dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | from torch.utils.data import Dataset 4 | from datasets import load_from_disk 5 | 6 | 7 | class CodeDataset(Dataset): 8 | def __init__(self, tokenizer, dataset_name, type_path, input_length, output_length, args): 9 | self.args = args 10 | self.tokenizer = tokenizer 11 | self.input_length = input_length 12 | self.output_length = output_length 13 | self.dataset_name = dataset_name 14 | self.type_path = type_path 15 | 16 | self.dataset = pd.read_csv(dataset_name, lineterminator='\n') 17 | self.dataset.columns = self.dataset.columns.str.replace('\r', '') 18 | if self.type_path == 'train': 19 | batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.ngpu 20 | if len(self.dataset) != batch_size: 21 | raise Exception("Effective batch size should be the same as length of train set.") 22 | 23 | def convert_to_features(self, example_batch): 24 | doc_id = torch.tensor(example_batch['doc_id'], dtype=torch.int) 25 | input_, target_ = example_batch['text'], example_batch['text'] 26 | 27 | source = self.tokenizer(input_, max_length=self.input_length, padding='max_length', truncation=True, return_tensors="pt") 28 | targets = self.tokenizer(target_, max_length=self.output_length, add_special_tokens=False, padding='max_length', truncation=True, return_tensors="pt") 29 | 30 | return source, targets, doc_id 31 | 32 | def __getitem__(self, index): 33 | data = self.dataset.iloc[index] 34 | source, targets, doc_id = self.convert_to_features(data) 35 | 36 | source_ids = source["input_ids"].squeeze() 37 | target_ids = targets["input_ids"].squeeze() 38 | 39 | src_mask = source["attention_mask"].squeeze() 40 | target_mask = targets["attention_mask"].squeeze() 41 | 42 | return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask, "doc_id": doc_id} 43 | 44 | def __len__(self): 45 | return len(self.dataset) 46 | 47 | 48 | class CodeSecretDataset(Dataset): 49 | def __init__(self, tokenizer, dataset_name, type_path, args): 50 | self.args = args 51 | self.tokenizer = tokenizer 52 | self.dataset_name = dataset_name 53 | self.type_path = type_path 54 | 55 
| self.dataset = load_from_disk(dataset_name) 56 | self.dataset = self.dataset.add_column('doc_id', list(range(len(self.dataset)))) 57 | if self.type_path == 'train': 58 | batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.ngpu 59 | if len(self.dataset) != batch_size: 60 | raise Exception("Effective batch size should be the same as length of train set.") 61 | 62 | def __getitem__(self, index): 63 | data = self.dataset[index] 64 | input_, target_ = data['content'], data['content'] 65 | 66 | # special_tokens = self.tokenizer.special_tokens_map 67 | # print("Special tokens:", special_tokens) 68 | 69 | source = self.tokenizer(input_, max_length=512, padding='max_length', truncation=True, return_tensors="pt") 70 | targets = self.tokenizer(target_, max_length=512, add_special_tokens=False, padding='max_length', truncation=True, return_tensors="pt") 71 | 72 | source_ids = source["input_ids"].squeeze() 73 | target_ids = targets["input_ids"].squeeze() 74 | 75 | source_mask = source["attention_mask"].squeeze() 76 | target_mask = targets["attention_mask"].squeeze() 77 | 78 | # secret_spans = [] 79 | # prefix_spans = [] 80 | # current_span = [] 81 | # for i, (token, is_secret) in enumerate(zip(source_ids, torch.BoolTensor(data['secret_mask']).squeeze())): 82 | # if is_secret: 83 | # if not current_span: 84 | # prefix_spans.append(source_ids[max(0, i-1):i]) 85 | # current_span.append(token) 86 | # elif current_span: 87 | # secret_spans.append(current_span) 88 | # current_span = [] 89 | # if current_span: 90 | # secret_spans.append(current_span) 91 | # for i, (prefix_span, secret_span) in enumerate(zip(prefix_spans, secret_spans)): 92 | # print(f"Secret Span {i+1}:") 93 | # print(f" Prefix Token IDs: {prefix_span}") 94 | # print(f" Secret Token IDs: {torch.stack(secret_span)}") 95 | 96 | # secret_token_ids = torch.LongTensor(data['secret_token_ids']) 97 | # print(secret_token_ids) 98 | # secret_prefix_token_ids = torch.LongTensor(data['secret_prefix_token_ids']) 99 | # print(secret_prefix_token_ids) 100 | 101 | item = { 102 | 'source_ids': source_ids, 'source_mask': source_mask, 'target_ids': target_ids, 'target_mask': target_mask, 103 | 'secret_mask': torch.BoolTensor(data['secret_mask']).squeeze(), 104 | # 'secret_mean_MA': torch.tensor(data['secret_mean_MA']), 105 | 'doc_id': torch.tensor(data['doc_id']) 106 | } 107 | # print(item) 108 | return item 109 | 110 | def __len__(self): 111 | return len(self.dataset) 112 | -------------------------------------------------------------------------------- /memorization_thresholds/TopLists/Rust-top-repos.txt: -------------------------------------------------------------------------------- 1 | 16611 https://github.com/HigherOrderCO/Bend 2 | 14212 https://github.com/huggingface/candle 3 | 13004 https://github.com/astral-sh/uv 4 | 12248 https://github.com/biomejs/biome 5 | 11170 https://github.com/sxyazi/yazi 6 | 8723 https://github.com/eza-community/eza 7 | 7158 https://github.com/rolldown/rolldown 8 | 6795 https://github.com/LibNyanpasu/clash-nyanpasu 9 | 4792 https://github.com/microsoft/sudo 10 | 4678 https://github.com/paradedb/paradedb 11 | 4559 https://github.com/ynqa/jnv 12 | 4195 https://github.com/clockworklabs/SpacetimeDB 13 | 3688 https://github.com/Speykious/cve-rs 14 | 3587 https://github.com/loco-rs/loco 15 | 3132 https://github.com/AmrDeveloper/GQL 16 | 2979 https://github.com/mainmatter/100-exercises-to-learn-rust 17 | 2770 https://github.com/EricLBuehler/mistral.rs 18 | 2747 
https://github.com/getgrit/gritql 19 | 2561 https://github.com/YaLTeR/niri 20 | 2229 https://github.com/huggingface/text-embeddings-inference 21 | 2223 https://github.com/tembo-io/pgmq 22 | 2134 https://github.com/jsr-io/jsr 23 | 2061 https://github.com/gosub-browser/gosub-engine 24 | 2055 https://github.com/quarylabs/quary 25 | 2044 https://github.com/OwshenNetwork/owshen 26 | 1980 https://github.com/stacks-network/sbtc-developer-release 27 | 1883 https://github.com/rivet-gg/rivet 28 | 1840 https://github.com/microsoft/aici 29 | 1815 https://github.com/SilasMarvin/lsp-ai 30 | 1740 https://github.com/bionic-gpt/bionic-gpt 31 | 1696 https://github.com/Universal-Debloater-Alliance/universal-android-debloater-next-generation 32 | 1653 https://github.com/face-hh/webx 33 | 1635 https://github.com/altsem/gitu 34 | 1627 https://github.com/yuankunzhang/charming 35 | 1616 https://github.com/paritytech/polkadot-sdk 36 | 1440 https://github.com/memorysafety/river 37 | 1417 https://github.com/lapce/lapdev 38 | 1405 https://github.com/microsoft/windows-drivers-rs 39 | 1383 https://github.com/jafioti/luminal 40 | 1349 https://github.com/MatthiasGrandl/Loungy 41 | 1340 https://github.com/CADmium-Co/CADmium 42 | 1307 https://github.com/evilsocket/legba 43 | 1284 https://github.com/phodal/aigc 44 | 1244 https://github.com/FractalFir/rustc_codegen_clr 45 | 1229 https://github.com/ferrocene/ferrocene 46 | 1222 https://github.com/Julien-cpsn/ATAC 47 | 1218 https://github.com/ThousandBirdsInc/chidori 48 | 1207 https://github.com/cloudflare/foundations 49 | 1188 https://github.com/SeaQL/FireDBG.for.Rust 50 | 1055 https://github.com/shell-pool/shpool 51 | 1037 https://github.com/mfontanini/presenterm 52 | 1023 https://github.com/paradigmxyz/cryo 53 | 993 https://github.com/srush/llama2.rs 54 | 993 https://github.com/robertknight/ocrs 55 | 991 https://github.com/joaoviictorti/RustRedOps 56 | 988 https://github.com/orhun/daktilo 57 | 953 https://github.com/Ruddle/Fomos 58 | 939 https://github.com/Tencent/tquic 59 | 931 https://github.com/Whitecat18/Rust-for-Malware-Development 60 | 918 https://github.com/regolith-labs/ore-cli 61 | 913 https://github.com/spaceandtimelabs/sxt-proof-of-sql 62 | 911 https://github.com/aripiprazole/rinha-de-compiler 63 | 890 https://github.com/YiNNx/cmd-wrapped 64 | 867 https://github.com/mufeedvh/code2prompt 65 | 851 https://github.com/j-hc/zygisk-detach 66 | 850 https://github.com/Ragnt/AngryOxide 67 | 844 https://github.com/nvzqz/divan 68 | 836 https://github.com/samwho/spacer 69 | 815 https://github.com/redlib-org/redlib 70 | 815 https://github.com/Martichou/rquickshare 71 | 800 https://github.com/succinctlabs/sp1 72 | 787 https://github.com/LlamaEdge/LlamaEdge 73 | 773 https://github.com/FoxIO-LLC/ja4 74 | 759 https://github.com/helix-editor/nucleo 75 | 757 https://github.com/pnpm/pacquet 76 | 755 https://github.com/a2x/cs2-dumper 77 | 742 https://github.com/andyk/ht 78 | 741 https://github.com/get-convex/convex-backend 79 | 732 https://github.com/sunfishcode/eyra 80 | 730 https://github.com/every-day-things/citadel 81 | 729 https://github.com/Chleba/netscanner 82 | 718 https://github.com/moturus/motor-os 83 | 689 https://github.com/ogxd/gxhash 84 | 683 https://github.com/pipeless-ai/pipeless 85 | 675 https://github.com/tsukinaha/tsukimi 86 | 670 https://github.com/darthdeus/comfy 87 | 669 https://github.com/tembo-io/pg_vectorize 88 | 660 https://github.com/haileys/bark 89 | 645 https://github.com/apache/datafusion-comet 90 | 644 https://github.com/bitswired/rustgpt 91 | 638 
https://github.com/hcavarsan/kftray 92 | 633 https://github.com/prefix-dev/rip 93 | 615 https://github.com/wintermute-cell/ngrrram 94 | 606 https://github.com/Kobzol/cargo-wizard 95 | 582 https://github.com/nexus-xyz/nexus-zkvm 96 | 578 https://github.com/timescale/pgvectorscale 97 | 578 https://github.com/ThePrimeagen/htmx-lsp 98 | 573 https://github.com/zaghaghi/openapi-tui 99 | 556 https://github.com/streamdal/streamdal 100 | 550 https://github.com/junkdog/tachyonfx 101 | 543 https://github.com/xetdata/nfsserve 102 | 541 https://github.com/regolith-labs/ore 103 | 538 https://github.com/narrowlink/narrowlink 104 | 535 https://github.com/cncases/cases 105 | 534 https://github.com/facebook/dotslash 106 | 527 https://github.com/huggingface/llm-ls 107 | 523 https://github.com/ViporMiner/VIPORMiner 108 | 519 https://github.com/opensourcecheemsburgers/RustyTube 109 | 519 https://github.com/terhechte/Ebou 110 | 512 https://github.com/KipData/FnckSQL 111 | 506 https://github.com/ad-si/Rust-Flashcards 112 | 502 https://github.com/meteroid-oss/meteroid 113 | 501 https://github.com/ynqa/sig 114 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/humaneval.py: -------------------------------------------------------------------------------- 1 | """Evaluating Large Language Models Trained on Code 2 | https://arxiv.org/abs/2107.03374 3 | 4 | The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature, 5 | docstring, body, and several unit tests. 6 | They were handwritten to ensure not to be included in the training set of code generation models. 7 | 8 | Homepage: https://github.com/openai/human-eval 9 | """ 10 | 11 | import re 12 | 13 | from evaluate import load 14 | 15 | from bigcode_eval.base import Task 16 | 17 | _CITATION = """ 18 | @misc{chen2021evaluating, 19 | title={Evaluating Large Language Models Trained on Code}, 20 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba}, 21 | year={2021}, 22 | eprint={2107.03374}, 23 | archivePrefix={arXiv}, 24 | primaryClass={cs.LG} 25 | } 26 | """ 27 | 28 | 29 | def create_all_tasks(): 30 | """Creates a dictionary of tasks from a list of levels 31 | :return: {task_name: task} 32 | e.g. 
{multiple-py: Task, multiple-java: Task} 33 | """ 34 | return {"humaneval": create_task(True), "humaneval-unstripped": create_task(False)} 35 | 36 | 37 | def create_task(strip_prompt): 38 | class HumanEval(GeneralHumanEval): 39 | def __init__(self): 40 | super().__init__(strip_prompt) 41 | 42 | return HumanEval 43 | 44 | 45 | class GeneralHumanEval(Task): 46 | """A task represents an entire benchmark including its dataset, problems, 47 | answers, generation settings and evaluation methods. 48 | """ 49 | 50 | DATASET_PATH = "openai_humaneval" 51 | 52 | def __init__(self, strip_prompt): 53 | super().__init__( 54 | stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```"], 55 | requires_execution=True, 56 | ) 57 | self.strip_prompt = strip_prompt 58 | 59 | def get_dataset(self): 60 | """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" 61 | return self.dataset["test"] 62 | 63 | def get_prompt(self, doc): 64 | """Builds the prompt for the LM to generate from.""" 65 | if self.strip_prompt: 66 | return doc["prompt"].strip() 67 | else: 68 | return doc["prompt"] 69 | 70 | def get_reference(self, doc): 71 | """Builds the reference solution for the doc (sample from the test dataset).""" 72 | test_func = doc["test"] 73 | entry_point = f"check({doc['entry_point']})" 74 | return "\n" + test_func + "\n" + entry_point 75 | 76 | @staticmethod 77 | def _stop_at_stop_token(decoded_string, stop_tokens): 78 | """ 79 | Produces the prefix of decoded_string that ends at the first occurrence of 80 | a stop_token. 81 | WARNING: the decoded_string *must not* include the prompt, which may have stop tokens 82 | itself. 83 | """ 84 | min_stop_index = len(decoded_string) 85 | for stop_token in stop_tokens: 86 | stop_index = decoded_string.find(stop_token) 87 | if stop_index != -1 and stop_index < min_stop_index: 88 | min_stop_index = stop_index 89 | return decoded_string[:min_stop_index] 90 | 91 | def postprocess_generation(self, generation, idx): 92 | """Defines the postprocessing for a LM generation. 93 | :param generation: str 94 | code generation from LM 95 | :param idx: int 96 | index of doc in the dataset to which the generation belongs 97 | (not used for Humaneval-Task) 98 | """ 99 | prompt = self.get_prompt(self.dataset["test"][idx]) 100 | generation = generation[len(prompt) :] 101 | return prompt + self._stop_at_stop_token(generation, self.stop_words) 102 | 103 | def process_results(self, generations, references): 104 | """Takes the list of LM generations and evaluates them against ground truth references, 105 | returning the metric for the generations. 
106 | :param generations: list(list(str)) 107 | list of lists containing generations 108 | :param references: list(str) 109 | list of str containing refrences 110 | """ 111 | code_metric = load("code_eval") 112 | results, _ = code_metric.compute( 113 | references=references, 114 | predictions=generations, 115 | k=[1, 5, 10] 116 | ) 117 | return results 118 | -------------------------------------------------------------------------------- /unlearning_preparation/retained_data_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import feather 3 | import pandas as pd 4 | import random 5 | from tqdm import tqdm 6 | import torch 7 | from transformers import AutoTokenizer, AutoModelForCausalLM 8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 9 | smoothie = SmoothingFunction().method4 10 | 11 | 12 | # calculate BLEU-4 score 13 | def calc_bleu4(tokenizer, sample, generated): 14 | ref = tokenizer.decode(sample) 15 | hyp = tokenizer.decode(generated) 16 | return sentence_bleu([ref], hyp, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie) 17 | 18 | 19 | def memorization_extraction(args): 20 | device = torch.device("cuda:" + str(args.gpu_id) if torch.cuda.is_available() else "cpu") 21 | 22 | base_tokenizer = AutoTokenizer.from_pretrained( 23 | 'Salesforce/codegen-350M-multi', 24 | padding_side='left', 25 | # add_special_tokens=True 26 | ) 27 | 28 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, local_files_only=True) 29 | # tokenizer.pad_token = tokenizer.eos_token 30 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 31 | model = AutoModelForCausalLM.from_pretrained( 32 | args.model_name_or_path, 33 | # resid_pdrop=0, embd_pdrop=0, attn_pdrop=0, 34 | attention_dropout=0, 35 | # pad_token_id=tokenizer.eos_token_id, 36 | local_files_only=True 37 | ) 38 | model.resize_token_embeddings(len(tokenizer)) 39 | model.config.pad_token_id = tokenizer.pad_token_id 40 | model.config.vocab_size = len(tokenizer) 41 | if hasattr(model, 'model'): 42 | print("Model has model.model.embed_tokens.padding_idx") 43 | model.model.embed_tokens.padding_idx = tokenizer.pad_token_id 44 | else: 45 | print("Model has get_input_embeddings.padding_idx") 46 | model.get_input_embeddings().padding_idx = tokenizer.pad_token_id 47 | if hasattr(model, 'lm_head'): 48 | print("Model has lm_head.padding_idx") 49 | model.lm_head.padding_idx = tokenizer.pad_token_id 50 | 51 | embedding_layer = model.get_input_embeddings() 52 | print(f"padding_idx: {embedding_layer.padding_idx}") 53 | 54 | assert model.config.pad_token_id == tokenizer.pad_token_id 55 | assert embedding_layer.num_embeddings == len(tokenizer) 56 | 57 | if args.fp16: 58 | model.half() 59 | model.to(device) 60 | 61 | df = feather.read_dataframe('benchmark.feather') 62 | if base_tokenizer.vocab != tokenizer.vocab: 63 | print('Different tokenizers: Re-encoding samples...') 64 | df['sample'] = df['sample'].apply(lambda x: tokenizer.encode(base_tokenizer.decode(x, skip_special_tokens=True))) 65 | df = df[df['sample'].apply(len) >= 100].reset_index(drop=True) # drop samples that are too short 66 | else: # same tokenizer 67 | print('Same tokenizers: No need to re-encode samples...') 68 | df['prefix'] = df['sample'].apply(lambda x: x[:64]) 69 | df['suffix'] = df['sample'].apply(lambda x: x[64:128]) 70 | 71 | gen_suffix = [] 72 | # iterate with batch size 73 | with torch.no_grad(): 74 | for i in tqdm(range(0, len(df), args.batch_size)): 75 | batch = 
torch.tensor(df.iloc[i: i + args.batch_size].prefix.tolist()).to(device) 76 | # output = model.generate(batch, max_length=128)[..., 64:].tolist() 77 | output = model.generate(batch, max_new_tokens=64)[..., 64:].tolist() 78 | gen_suffix.extend(output) 79 | 80 | df['gen_suffix'] = gen_suffix 81 | df['bleu4'] = df.apply(lambda x: calc_bleu4(tokenizer, x['suffix'], x['gen_suffix']), axis=1) 82 | 83 | memorization_df = df[df['bleu4'] >= 0.95] 84 | memorization_df.rename(columns={'index': 'doc_id'}, inplace=True) 85 | memorization_df['text'] = memorization_df['sample'].apply(lambda x: tokenizer.decode(x, skip_special_tokens=True)) 86 | memorization_df['corpus'] = 'BigQuery' 87 | memorization_df = memorization_df[['doc_id', 'hash', 'copies', 'corpus', 'text', 'bleu4']] 88 | print(memorization_df) 89 | model_name = args.model_name_or_path.split('/')[-1] 90 | memorization_df.to_csv(f'{model_name}_memorization.csv', index=False, encoding='utf-8') 91 | 92 | random.seed(42) 93 | memorization_df_indexes = list(range(len(memorization_df))) 94 | random.shuffle(memorization_df_indexes) 95 | sampled_df = memorization_df.iloc[memorization_df_indexes[:args.k], :] 96 | sampled_df.to_csv(f'../unlearning/data/{model_name}_secret/{model_name}_retained_set_{args.k}.csv', index=False, encoding='utf-8') 97 | 98 | 99 | def main(): 100 | # Parsing Arguments 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument("--model_name_or_path", default="Salesforce/codegen-350M-mono", type=str, 103 | help="Model to train and evaluate, provide a repo name in Hugging Face hub or a local path.") 104 | parser.add_argument('--gpu_id', type=str, default="0", help="specify the GPU id") 105 | parser.add_argument('--batch_size', type=int, default=4, help="Batch size.") 106 | parser.add_argument("--fp16", action='store_true', 107 | help="Whether to use fp16 model precision.") 108 | parser.add_argument('--k', type=int, default=32, 109 | help="The number of forgotten samples.") 110 | args = parser.parse_args() 111 | 112 | memorization_extraction(args) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /human_eval/finetuning/CodeComplex/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from datasets import ClassLabel, DatasetDict, load_dataset 6 | from evaluate import load 7 | from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 8 | DataCollatorWithPadding, Trainer, TrainerCallback, 9 | TrainingArguments, set_seed) 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" 16 | ) 17 | parser.add_argument("--num_epochs", type=int, default=5) 18 | parser.add_argument("--batch_size", type=int, default=6) 19 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 20 | parser.add_argument("--freeze", type=bool, default=True) 21 | parser.add_argument("--learning_rate", type=float, default=5e-4) 22 | parser.add_argument("--seed", type=int, default=0) 23 | parser.add_argument("--lr_scheduler_type", type=str, default="cosine") 24 | parser.add_argument("--num_warmup_steps", type=int, default=10) 25 | parser.add_argument("--weight_decay", type=float, default=0.01) 26 | parser.add_argument("--output_dir", type=str, default="./results") 27 | parser.add_argument("--push_to_hub", 
type=bool, default=False) 28 | parser.add_argument("--model_hub_name", type=str, default="codecomplex_model") 29 | return parser.parse_args() 30 | 31 | 32 | metric = load("accuracy") 33 | 34 | 35 | def compute_metrics(eval_pred): 36 | predictions, labels = eval_pred 37 | predictions = np.argmax(predictions, axis=1) 38 | return metric.compute(predictions=predictions, references=labels) 39 | 40 | 41 | class CustomCallback(TrainerCallback): 42 | def __init__(self, trainer) -> None: 43 | super().__init__() 44 | self._trainer = trainer 45 | 46 | def on_epoch_end(self, args, state, control, **kwargs): 47 | if control.should_evaluate: 48 | control_copy = deepcopy(control) 49 | self._trainer.evaluate( 50 | eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" 51 | ) 52 | return control_copy 53 | 54 | 55 | def main(): 56 | args = get_args() 57 | set_seed(args.seed) 58 | 59 | dataset = load_dataset("codeparrot/codecomplex", split="train") 60 | train_test = dataset.train_test_split(test_size=0.2) 61 | test_validation = train_test["test"].train_test_split(test_size=0.5) 62 | train_test_validation = DatasetDict( 63 | { 64 | "train": train_test["train"], 65 | "test": test_validation["train"], 66 | "valid": test_validation["test"], 67 | } 68 | ) 69 | 70 | print("Loading tokenizer and model") 71 | tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) 72 | tokenizer.pad_token = tokenizer.eos_token 73 | model = AutoModelForSequenceClassification.from_pretrained( 74 | args.model_ckpt, num_labels=7 75 | ) 76 | model.config.pad_token_id = model.config.eos_token_id 77 | 78 | if args.freeze: 79 | for param in model.roberta.parameters(): 80 | param.requires_grad = False 81 | 82 | labels = ClassLabel( 83 | num_classes=7, names=list(set(train_test_validation["train"]["complexity"])) 84 | ) 85 | 86 | def tokenize(example): 87 | inputs = tokenizer(example["src"], truncation=True, max_length=1024) 88 | label = labels.str2int(example["complexity"]) 89 | return { 90 | "input_ids": inputs["input_ids"], 91 | "attention_mask": inputs["attention_mask"], 92 | "label": label, 93 | } 94 | 95 | tokenized_datasets = train_test_validation.map( 96 | tokenize, 97 | batched=True, 98 | remove_columns=train_test_validation["train"].column_names, 99 | ) 100 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 101 | 102 | training_args = TrainingArguments( 103 | output_dir=args.output_dir, 104 | learning_rate=args.learning_rate, 105 | lr_scheduler_type=args.lr_scheduler_type, 106 | evaluation_strategy="epoch", 107 | save_strategy="epoch", 108 | logging_strategy="epoch", 109 | per_device_train_batch_size=args.batch_size, 110 | per_device_eval_batch_size=args.batch_size, 111 | num_train_epochs=args.num_epochs, 112 | gradient_accumulation_steps=args.gradient_accumulation_steps, 113 | weight_decay=args.weight_decay, 114 | metric_for_best_model="accuracy", 115 | run_name="complexity-java", 116 | report_to="wandb", 117 | ) 118 | 119 | trainer = Trainer( 120 | model=model, 121 | args=training_args, 122 | train_dataset=tokenized_datasets["train"], 123 | eval_dataset=tokenized_datasets["valid"], 124 | tokenizer=tokenizer, 125 | data_collator=data_collator, 126 | compute_metrics=compute_metrics, 127 | ) 128 | 129 | print("Training...") 130 | trainer.add_callback(CustomCallback(trainer)) 131 | trainer.train() 132 | 133 | result = trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 134 | print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}") 135 | 136 | # push the model to the Hugging Face hub 
137 | if args.push_to_hub: 138 | model.push_to_hub(args.model_hub_name) 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /human_eval/bigcode_eval/tasks/custom_metrics/multiple_metrics/generic_eval.py: -------------------------------------------------------------------------------- 1 | # This is a helper script for evaluating benchmarks that have been translated to 2 | # different languages. 3 | # 4 | # To use this script, call eval_lang.py. 5 | # The --directory argument is required, and tells the script where the benchmarks are located. 6 | # The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated. 7 | # 8 | # The script will print the results on each benchmark, and also write to results/lang.csv. 9 | # When the script completes, it will print a summary. 10 | # 11 | # Examples 12 | # 13 | # To run the entire benchmark suite: 14 | # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ 15 | # 16 | # To run benchmarks 1, 2, and 3: 17 | # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3 18 | 19 | import argparse 20 | import sys 21 | from pathlib import Path 22 | from sys import exit as sysexit 23 | 24 | 25 | def list_files(directory, ext): 26 | files_unsorted = directory.glob(f"HumanEval_*{ext}") 27 | # assumption: base filenames are in the format of HumanEval_X_* 28 | # Where X is a valid number 29 | def key(s): 30 | return int(str(s.name).split("_")[1]) 31 | 32 | files_sorted = sorted(files_unsorted, key=(lambda s: key(s))) 33 | 34 | # assumption: there may be missing files, but no extra files 35 | # so we build files_array where the index corresponds to the file's number, 36 | # and a missing file is represented by None 37 | size = key(files_sorted[-1]) + 1 38 | files_array = [None] * size 39 | for f in files_sorted: 40 | k = key(f) 41 | files_array[k] = f 42 | 43 | return files_array 44 | 45 | 46 | def main(eval_script, language, extension): 47 | args = argparse.ArgumentParser() 48 | 49 | args.add_argument( 50 | "--directory", type=str, required=True, help="Directory to read benchmarks from" 51 | ) 52 | args.add_argument( 53 | "--files", 54 | type=int, 55 | nargs="*", 56 | default=[], 57 | help="Specify the benchmarks to evaluate by their number, e.g. 
--files 0 1 2", 58 | ) 59 | args = args.parse_args() 60 | 61 | directory = Path(args.directory).resolve() 62 | 63 | files_sorted = list_files(directory, extension) 64 | 65 | # the directory you specified does not contain the right language 66 | if len(files_sorted) == 0: 67 | print(f"The specified directory does not contain files of type {extension}") 68 | sysexit(1) 69 | 70 | files_index = [] 71 | if len(args.files) > 0: 72 | files_index = args.files 73 | else: 74 | files_index = range(len(files_sorted)) 75 | 76 | total = 0 77 | passed = 0 78 | syntax_error = 0 79 | 80 | results_file = Path( 81 | Path(__file__).parent, "..", "results", language.lower() + ".csv" 82 | ).resolve() 83 | 84 | with open(results_file, "w") as f: 85 | for i in files_index: 86 | filepath = files_sorted[i] 87 | if filepath is None: 88 | print("File {} does not exist!".format(i)) 89 | continue 90 | res = eval_script(filepath) 91 | output = f"{language},{filepath.stem},{res['status']}\n" 92 | f.write(output) 93 | print(output, end="") 94 | total += 1 95 | if res["status"] == "OK": 96 | passed += 1 97 | elif res["status"] == "SyntaxError": 98 | syntax_error += 1 99 | print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}") 100 | 101 | 102 | def main_check_stubs(check_script, language, extension): 103 | args = argparse.ArgumentParser() 104 | 105 | args.add_argument( 106 | "--directory", type=str, required=True, help="Directory to read benchmarks from" 107 | ) 108 | args.add_argument( 109 | "--files", 110 | type=int, 111 | nargs="*", 112 | default=[], 113 | help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2", 114 | ) 115 | args = args.parse_args() 116 | 117 | directory = Path(args.directory).resolve() 118 | 119 | files_sorted = list_files(directory, extension) 120 | 121 | # the directory you specified does not contain the right language 122 | if len(files_sorted) == 0: 123 | print(f"The specified directory does not contain files of type {extension}") 124 | sysexit(1) 125 | 126 | files_index = [] 127 | if len(args.files) > 0: 128 | files_index = args.files 129 | else: 130 | files_index = range(len(files_sorted)) 131 | 132 | total = 0 133 | passed = 0 134 | 135 | results_file = Path( 136 | Path(__file__).parent, "..", "check_results", language.lower() + ".csv" 137 | ).resolve() 138 | 139 | with open(results_file, "w") as f: 140 | for i in files_index: 141 | filepath = files_sorted[i] 142 | if filepath is None: 143 | print("File {} does not exist!".format(i)) 144 | continue 145 | res = check_script(filepath) 146 | output = f"{language},{filepath.stem},{res['status']}\n" 147 | f.write(output) 148 | print(output, end="") 149 | total += 1 150 | if res["status"] == "OK": 151 | passed += 1 152 | print(f"Total {total}, Passed {passed}") 153 | 154 | if total != passed: 155 | sys.exit(1) 156 | --------------------------------------------------------------------------------