├── assets
│   ├── repobench_dark.png
│   ├── repobench_logo.png
│   └── repobench_light.png
├── data
│   ├── utils.py
│   └── README.md
├── evaluation
│   └── metrics.py
├── .gitignore
├── requirements.txt
├── eval.py
├── run.py
├── README.md
└── LICENSE
/assets/repobench_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_dark.png
--------------------------------------------------------------------------------
/assets/repobench_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_logo.png
--------------------------------------------------------------------------------
/assets/repobench_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_light.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__*
2 | cache
3 | results
4 | *temp.html
5 | /data_v1
6 | archive_data/test
7 | archive_data/train
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 | transformers
3 | torch
4 | fuzzywuzzy
5 | tqdm
6 | fire
7 | codebleu
8 | python-Levenshtein
9 | tree-sitter-python
10 | tree-sitter-java
--------------------------------------------------------------------------------
/data/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def construct_prompt(
4 | data: dict,
5 | language: str = "python",
6 | tokenizer= None,
7 | max_token_nums: int = 15800
8 | ) -> str:
9 | """
10 | Construct the prompt for next line prediction.
11 |
12 | :param data: data point from the dataset
13 | :param language: the language of the code
14 | :param tokenizer: the tokenizer of the evaluation model
15 | :param max_token_nums: the maximum number of tokens constraint for the prompt
16 |
17 | :return: the constructed prompt
18 | """
19 |
20 | # comment symbol for different languages
21 | comment_symbol = "#" if language == "python" else "//"
22 |
23 | # construct the cross-file prompt and in-file prompt separately
24 | # cross-file prompt
25 | cross_file_prompt = f"{comment_symbol} Repo Name: {data['repo_name']}\n"
26 |
27 | for snippet in data['context']:
28 | cross_file_prompt += f"{comment_symbol} Path: {snippet['path']}\n{snippet['snippet']}" + "\n\n"
29 |
30 | # in-file prompt
31 | in_file_prompt = f"{comment_symbol} Path: {data['file_path']}\n{data['import_statement']}\n{data['cropped_code'].rstrip()}\n"
32 |
33 | # if we assign the tokenizer and the max_token_nums, we will truncate the cross-file prompt to meet the constraint
34 | if tokenizer is not None and max_token_nums is not None:
35 |
36 | cross_file_prompt_token_nums = len(tokenizer.encode(cross_file_prompt))
37 | in_file_prompt_token_nums = len(tokenizer.encode(in_file_prompt))
38 |
39 | exceed_token_nums = cross_file_prompt_token_nums + in_file_prompt_token_nums - max_token_nums
40 |
41 | if exceed_token_nums > 0:
42 | # split the cross-file prompt into lines
43 | cross_file_prompt_lines = cross_file_prompt.split("\n")
44 | # drop lines from end until the extra token number is less than 0
45 | for i in range(len(cross_file_prompt_lines)-1, -1, -1):
46 | exceed_token_nums -= len(tokenizer.encode(cross_file_prompt_lines[i]))
47 | if exceed_token_nums < 0:
48 | break
49 |
50 | # join the lines back
51 | cross_file_prompt = "\n".join(cross_file_prompt_lines[:i]) + "\n\n"
52 |
53 | # combine the cross-file prompt and in-file prompt
54 | prompt = cross_file_prompt + in_file_prompt
55 |
56 | # normalize some empty lines
57 | prompt = re.sub(r'\n{4,}', '\n\n', prompt)
58 |
59 | return prompt
60 |
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from evaluation.metrics import exact_match_score, edit_similarity_score, codebleu_score
4 | import fire
5 |
6 | def eval(
7 | path="results/deepseek-coder-1.3b-base-python",
8 | language="python" # to calculate codebleu, we need to specify the language
9 | ):
10 |
11 | total_data_points = 0
12 | total_em_model, total_es_model, total_cb_model = 0, 0, 0
13 |
14 | for level in ["cross_file_first", "cross_file_random", "in_file"]:
15 | filepath = os.path.join(path, f"{level}.jsonl")
16 | seen_indices = set() # Track seen indices for the current level
17 |
18 | # check if the file exists
19 | if not os.path.exists(filepath):
20 | print(f"Level: {level} not found for the model")
21 | continue
22 |
23 | with open(filepath, "r") as f:
24 |
25 | data = []
26 | for line in f:
27 | entry = json.loads(line.strip())
28 | idx = entry["idx"]
29 |
30 | # Skip duplicate indices based on the chosen policy (here, keeping the former)
31 | if idx not in seen_indices:
32 | seen_indices.add(idx)
33 | data.append(entry)
34 |
35 | data_points = len(data)
36 |
37 | if data_points == 0:
38 | continue
39 |
40 | ground_truth = [d["gt"] for d in data]
41 | generated = [d["pred"] for d in data]
42 |
43 | em_model = round(exact_match_score(ground_truth, generated) * 100, 2)
44 | es_model = round(edit_similarity_score(ground_truth, generated), 2)
45 | cb_model = round(codebleu_score(generated, ground_truth, language) * 100, 2)
46 |
47 | # accumulate the data points and the metrics
48 | total_data_points += data_points
49 | total_em_model += em_model * data_points
50 | total_es_model += es_model * data_points
51 | total_cb_model += cb_model * data_points
52 |
53 | print(f"Level: {level} with {data_points} data points")
54 | print(f"EM: {em_model}, ES: {es_model}, CB: {cb_model}")
55 | print("-" * 30)
56 |
57 | # calculate the weighted averages
58 | if total_data_points > 0:
59 | avg_em_model = round(total_em_model / total_data_points, 2)
60 | avg_es_model = round(total_es_model / total_data_points, 2)
61 | avg_cb_model = round(total_cb_model / total_data_points, 2)
62 |
63 | print("Weighted Averages:")
64 | print(f"EM: {avg_em_model}, ES: {avg_es_model}, CB: {avg_cb_model}\n")
65 |
66 | else:
67 | print("No data points were found for evaluation.")
68 |
69 | if __name__ == "__main__":
70 | fire.Fire(eval)
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
11 | # RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems
12 |
15 | **ICLR 2024**
16 |
21 | This directory hosts the datasets for subsequent versions of RepoBench. We are committed to updating RepoBench regularly, with updates scheduled **every 3 months**.
22 |
23 | ## 🌇 Overview
24 |
25 | - Our primary focus is on **next-line prediction** tasks to aid in code auto-completion. If your research requires retrieval data, please don't hesitate to reach out to us for collaboration.
26 | - Our datasets will be hosted on 🤗 HuggingFace, making them easily accessible for everyone.
27 | - Each data point within our datasets is categorized based on the prompt length (number of tokens), as determined by OpenAI's GPT-4 tokenizer via [tiktoken](https://github.com/openai/tiktoken). Here's a detailed table illustrating the levels we've defined (see the token-counting sketch after this list):
28 |
29 | | Level | Prompt Length (Number of Tokens) |
30 | |-------|------------------------|
31 | | 2k | 640 - 1,600 |
32 | | 4k | 1,600 - 3,600 |
33 | | 8k | 3,600 - 7,200 |
34 | | 12k | 7,200 - 10,800 |
35 | | 16k | 10,800 - 14,400 |
36 | | 24k | 14,400 - 21,600 |
37 | | 32k | 21,600 - 28,800 |
38 | | 64k | 28,800 - 57,600 |
39 | | 128k | 57,600 - 100,000 |
40 |
41 | - We provide the official implementation for constructing prompts [here](https://github.com/Leolty/repobench/blob/53c1c55ad9e6d97d2b60dd2c9548ed1cd463b6a5/data/utils.py#L3). Please note that this is not necessarily the optimal way to construct prompts: reordering, retrieval augmentation, or different cropping/construction techniques could lead to varying degrees of improvement. Ensure that evaluations are conducted fairly.
42 |
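For reference, here is a minimal sketch of how a prompt could be bucketed into these levels with tiktoken's GPT-4 tokenizer; the `level_of` helper and the hard-coded bounds are illustrative, not necessarily the exact code used to build the dataset:

```python
import tiktoken

# GPT-4 tokenizer via tiktoken, as used to measure prompt length
encoding = tiktoken.encoding_for_model("gpt-4")

# (level, lower bound, upper bound) taken from the table above
LEVELS = [
    ("2k", 640, 1_600), ("4k", 1_600, 3_600), ("8k", 3_600, 7_200),
    ("12k", 7_200, 10_800), ("16k", 10_800, 14_400), ("24k", 14_400, 21_600),
    ("32k", 21_600, 28_800), ("64k", 28_800, 57_600), ("128k", 57_600, 100_000),
]

def level_of(prompt: str):
    """Return the level whose token range contains the prompt, or None."""
    num_tokens = len(encoding.encode(prompt))
    for name, lower, upper in LEVELS:
        if lower <= num_tokens < upper:
            return name
    return None
```
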
43 | ## 📚 Versions
44 |
45 | ### RepoBench v1.1
46 |
47 | RepoBench v1.1 includes data collected from GitHub between **October 6, 2023** and **December 31, 2023**. To mitigate data leakage and memorization issues, we deduplicated the data against the Stack v2 (coming soon) based on file content.
48 |
49 | You can access RepoBench v1.1 at the following links:
50 | - For Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1)
51 | - For Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1)
52 |
53 | Or, you can load the data directly from the HuggingFace Hub using the following code:
54 |
55 | ```python
56 | from datasets import load_dataset
57 |
58 | # Load the Python dataset
59 | python_dataset = load_dataset("tianyang/repobench_python_v1.1")
60 |
61 | # Load the Java dataset
62 | java_dataset = load_dataset("tianyang/repobench_java_v1.1")
63 | ```
64 |
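To reproduce the official prompt construction on a loaded sample, you can call `construct_prompt` from `data/utils.py` directly. A minimal sketch, run from the repository root; the tokenizer, split name, and token budget below are illustrative defaults borrowed from `run.py`:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

from data.utils import construct_prompt

dataset = load_dataset("tianyang/repobench_python_v1.1")
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True
)

# assuming the three settings are exposed as dataset splits
sample = dataset["cross_file_first"][0]

prompt = construct_prompt(
    sample,
    language="python",
    tokenizer=tokenizer,
    max_token_nums=15800,  # cross-file context is truncated to fit this budget
)
print(prompt[:500])
```
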
65 | ### RepoBench v1.2
66 |
67 | *Coming soon...*
68 |
69 | ## 📝 Citation
70 |
71 | If you use RepoBench in your research, please cite the following paper:
72 |
73 | ```bibtex
74 | @misc{liu2023repobench,
75 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems},
76 | author={Tianyang Liu and Canwen Xu and Julian McAuley},
77 | year={2024},
78 | url={https://arxiv.org/abs/2306.03091},
79 | booktitle={International Conference on Learning Representations}
80 | }
81 | ```
82 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
11 | # RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems
12 |
15 | **ICLR 2024**
16 |
21 | ## 🔥 News
22 |
23 | - *Feb 5th, 2024*: **RepoBench v1.1** (with the newest code data) is now available on the 🤗 HuggingFace Hub. You can access the datasets for Python and Java using the following links:
24 | - For Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1)
25 | - For Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1)
26 | > **For more details of RepoBench v1.1, please refer to the [data directory](./data/README.md).**
27 |
28 | - *Jan 16th, 2024*: RepoBench is accepted to ICLR 2024! 🎉
29 |
30 |
31 | ## 🛠️ Installation
32 |
33 | ```bash
34 | git clone https://github.com/Leolty/repobench.git
35 | cd repobench
36 | ```
37 |
38 | > [!NOTE]
39 | > There is a `requirements.txt` file listing the dependencies needed to reproduce the results in the paper; you can install them with `pip install -r requirements.txt`. If you are only interested in the data, you can skip this step.
40 |
41 | ## ⚙️ Description of Settings
42 |
43 | As discussed in the paper, we have three settings for each task:
44 |
45 | - `cross_file_first`: Masks the line where a module from a different file is used for the first time.
46 | - `cross_file_random`: Masks a random line where a module from a different file is used (not the first usage).
47 | - `in_file`: Masks a random line that has no cross-file dependency.
48 |
49 |
50 | ## 📥 Load Data
51 |
52 | ```python
53 | from datasets import load_dataset
54 |
55 | dataset = load_dataset("tianyang/repobench_python_v1.1", ignore_verifications=True)
56 | ```
57 |
58 | For more details, visit the Hugging Face dataset pages:
59 | - Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1)
60 | - Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1)
61 |
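Each data point carries the repository context needed to build a prompt. A minimal sketch of inspecting the loaded data; the split names below assume the three settings are exposed as dataset splits, as in `run.py` and `eval.py`:

```python
# `dataset` as loaded above
for split in ["cross_file_first", "cross_file_random", "in_file"]:
    print(split, len(dataset[split]))

sample = dataset["cross_file_first"][0]
print(sample["repo_name"], sample["file_path"], sample["level"])
print(sample["cropped_code"][-300:])  # in-file context used in the prompt
print(sample["next_line"])            # the ground-truth next line
```
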
62 | ## 🚀 Running Experiments
63 |
64 | To run experiments on the RepoBench v1.1 dataset, we provide a very basic `run.py` script using the 🤗 Transformers library.
65 |
66 | Example usage:
67 |
68 | ```bash
69 | CUDA_VISIBLE_DEVICES=0 python run.py --model_name "deepseek-ai/deepseek-coder-1.3b-base" \
70 | --dataset_name "tianyang/repobench_python_v1.1" \
71 | --start_date "2023-12-01" \
72 | --end_date "2023-12-31" \
73 | --language "python" \
74 | --max_token_nums 15800 \
75 | --levels "2k" "4k" "8k" "12k" "16k" \
76 | --temperature 0.2 \
77 | --top_p 0.95 \
78 | --max_new_tokens 128 \
79 | --batch_size 1
80 | ```
81 |
82 | For a full list of available parameters, please refer to `run.py`; the script should be easy to customize for your own needs.
83 |
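Each generated sample is appended as one JSON object per line to `results/<model-name>-<language>/<setting>.jsonl`, with `idx`, `level`, `pred`, and `gt` fields (see the end of `run.py`). A minimal sketch of reading back the output of the command above:

```python
import json

with open("results/deepseek-coder-1.3b-base-python/cross_file_first.jsonl") as f:
    for line in f:
        record = json.loads(line)
        print(record["idx"], record["level"])
        print("pred:", record["pred"])
        print("gt:  ", record["gt"])
        break  # only show the first record
```
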
84 | ## 📊 Evaluation
85 |
86 | After generating completions, you can evaluate the results using the `eval.py` script. This script calculates various metrics including Exact Match (EM), Edit Similarity (ES), and CodeBLEU (CB) scores for each setting.
87 |
88 | To run the evaluation:
89 |
90 | ```bash
91 | python eval.py --path "results/deepseek-coder-1.3b-base-python" --language "python"
92 | ```
93 |
94 | The script will output scores for each level (`cross_file_first`, `cross_file_random`, `in_file`) as well as weighted averages across all levels.
95 |
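The underlying metric functions in `evaluation/metrics.py` can also be called directly on your own lists of predictions and ground truths (this requires the dependencies from `requirements.txt`); a minimal sketch with illustrative strings:

```python
from evaluation.metrics import exact_match_score, edit_similarity_score, codebleu_score

preds = ["return x + y", "print(x)"]
gts = ["return x + y", "print(y)"]

em = exact_match_score(preds, gts) * 100  # whitespace-tokenized exact match, scaled to 0-100
es = edit_similarity_score(preds, gts)    # average fuzz.ratio, already on a 0-100 scale
cb = codebleu_score(preds, gts, "python") * 100

print(f"EM: {em:.2f}, ES: {es:.2f}, CB: {cb:.2f}")
```
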
96 | ## 📝 Note
97 |
98 | This branch of the repository is specifically for RepoBench v1.1. For the results presented in our ICLR 2024 paper, which used the initial version of RepoBench, please refer to the [`archive/v0` branch](https://github.com/Leolty/repobench/tree/archive/v0) of this repository.
99 |
100 |
101 | ## 📝 Citation
102 |
103 | If you use RepoBench in your research, please consider citing us:
104 |
105 | ```bibtex
106 | @misc{liu2023repobench,
107 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems},
108 | author={Tianyang Liu and Canwen Xu and Julian McAuley},
109 | year={2024},
110 | url={https://arxiv.org/abs/2306.03091},
111 | booktitle={International Conference on Learning Representations}
112 | }
113 | ```
114 |
--------------------------------------------------------------------------------
/evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | from fuzzywuzzy import fuzz
2 | from codebleu import calc_codebleu
3 |
4 | def exact_match_score(predictions, ground_truths):
5 | """
6 | This function computes the average exact match score between the predicted codes and the ground truth codes.
7 | It returns a float value between 0 and 1 indicating the degree of exact match between the predicted codes
8 | and the ground truth codes, where a value of 1 means all the predicted codes exactly match their corresponding
9 | ground truth codes and a value of 0 means none of the predicted codes exactly match their corresponding
10 | ground truth codes.
11 |
12 | Args:
13 | predictions: list, predicted codes
14 | ground_truths: list, ground truth codes
15 |
16 | Returns:
17 | Float, the average exact match score between the predicted codes and the ground truth codes.
18 | """
19 | if len(predictions) != len(ground_truths):
20 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")
21 |
22 | exact_match = 0
23 | for pred, gt in zip(predictions, ground_truths):
24 | if pred.split() == gt.split():
25 | exact_match += 1
26 |
27 | return round(exact_match / len(predictions), 5)
28 |
29 |
30 |
31 | def edit_similarity_score(predictions, ground_truths):
32 | """
33 | This function computes the average edit similarity score between the predicted codes and the ground truth codes.
34 | It returns a float value between 0 and 1 indicating the degree of similarity between the predicted codes
35 | and the ground truth codes, where a value of 1 means all the predicted codes are identical to their corresponding
36 | ground truth codes and a value of 0 means none of the predicted codes are similar to their corresponding
37 | ground truth codes.
38 |
39 | Args:
40 | predictions: list, predicted codes
41 | ground_truths: list, ground truth codes
42 |
43 | Returns:
44 | Float, the average edit similarity score between the predicted codes and the ground truth codes.
45 | """
46 | if len(predictions) != len(ground_truths):
47 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")
48 |
49 | edit_sim = 0.0
50 | for pred, gt in zip(predictions, ground_truths):
51 | edit_sim += fuzz.ratio(pred, gt)
52 |
53 | return round(edit_sim / len(predictions), 5)
54 |
55 | def accuracy_at_k(prediction_list, golden_index_list, k):
56 | """
57 | This function computes the accuracy at k. It returns a float value between 0 and 1 indicating the
58 | accuracy at k, where a value of 1 means the correct code is retrieved at the top k positions and
59 | a value of 0 means the correct code is not retrieved at the top k positions.
60 |
61 | Args:
62 | prediction_list: list, a list of lists, where each list contains the indices of the retrieved codes.
63 | golden_index_list: list, a list of integers, where each integer is the index of the correct code.
64 | k: int, the number of retrieved codes.
65 |
66 | Returns:
67 | Float, the accuracy at k.
68 | """
69 |
70 | if len(golden_index_list) == 0:
71 | raise ValueError("The list of golden indices should not be empty.")
72 |
73 | assert len(golden_index_list) == len(prediction_list), \
74 | "The length of the golden indices list should be equal to the length of the prediction list, however, " \
75 | f"the length of the golden indices list is {len(golden_index_list)} and the length of the prediction list is {len(prediction_list)}."
76 |
77 |
78 | acc = 0
79 |
80 | for i in range(len(prediction_list)):
81 | golden_index = golden_index_list[i]
82 | index_list = prediction_list[i]
83 |
84 | if len(index_list) < k:
85 | raise ValueError("The number of retrieved codes should be greater than k.")
86 |
87 | top_k_indices = index_list[:k]
88 |
89 | if golden_index not in top_k_indices:
90 | continue
91 | else:
92 | acc += 1
93 |
94 | return round(acc / len(golden_index_list), 5)
95 |
96 | def codebleu_score(predictions, ground_truths, language, weight=[0.25, 0.25, 0.25, 0.25]):
97 |
98 | """
99 | This function computes the average codebleu score between the predicted codes and the ground truth codes.
100 | It returns a float value between 0 and 1 indicating the degree of similarity between the predicted codes
101 | and the ground truth codes, where a value of 1 means all the predicted codes are identical to their corresponding
102 | ground truth codes and a value of 0 means none of the predicted codes are similar to their corresponding
103 | ground truth codes.
104 |
105 | Args:
106 | predictions: list, predicted codes
107 | ground_truths: list, ground truth codes
108 | language: str, the programming language of the codes
109 | weight: list, the weights for each n-gram
110 |
111 | Returns:
112 | Float, the average codebleu score between the predicted codes and the ground truth codes.
113 | """
114 | if len(predictions) != len(ground_truths):
115 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")
116 |
117 | # remove \r for both pred and gt
118 | predictions = [pred.replace("\r", "") for pred in predictions]
119 | ground_truths = [gt.replace("\r", "") for gt in ground_truths]
120 |
121 | res_list = calc_codebleu(
122 | ground_truths,
123 | predictions,
124 | language,
125 | weight,
126 | tokenizer=None
127 | )
128 |
129 | return res_list['codebleu']
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import fire
3 | import json
4 | from tqdm import tqdm
5 | from datasets import load_dataset
6 | from transformers import AutoTokenizer, AutoModelForCausalLM
7 | from datasets import DatasetDict, Dataset
8 | import pandas as pd
9 | from data.utils import construct_prompt
10 |
11 | # get first line that is not a comment
12 | def get_first_line_not_comment(code:str, language:str="python"):
13 | """
14 | This function gets the first line of code that is not a comment.
15 |
16 | Args:
17 | code: Str, the code
18 |
19 | Returns:
20 | Str, the first line of code that is not a comment or the first line of code if there is no line that is not a comment
21 | """
22 |
23 | # check if the language is valid
24 | assert language in ["python", "java"], "language must be one of [python, java]"
25 |
26 |
27 | # first remove the \n at the beginning of the code
28 | code = code.lstrip('\n')
29 |
30 | lines = code.split('\n')
31 | in_multiline_comment = False
32 |
33 | if language == "python":
34 | for line in lines:
35 | # if the line is empty, then skip
36 | if not line.strip():
37 | continue
38 | # if the line is a start of a multiline comment, then set the in_multiline_comment to True and skip
39 | if not in_multiline_comment and (line.strip().startswith('"""') or line.strip().startswith("'''")):
40 | in_multiline_comment = True
41 | continue
42 | # if the line is the end of a multiline comment, then set the in_multiline_comment to False and skip
43 | if in_multiline_comment and (line.strip().endswith('"""') or line.strip().endswith("'''")):
44 | in_multiline_comment = False
45 | continue
46 | # if the line is in a multiline comment, then skip
47 | if in_multiline_comment:
48 | continue
49 | # if the line is a single line comment, then skip
50 | if line.strip().startswith('#'):
51 | continue
52 | # if the line is not a comment, then return the line
53 | return line
54 |
55 | elif language == "java":
56 | for line in lines:
57 | # if the line is empty, then skip
58 | if not line.strip():
59 | continue
60 | # if the line is a start of a multiline comment, then set the in_multiline_comment to True and skip
61 | if not in_multiline_comment and line.strip().startswith('/*'):
62 | in_multiline_comment = True
63 | continue
64 | # if the line is the end of a multiline comment, then set the in_multiline_comment to False and skip
65 | if in_multiline_comment and line.strip().endswith('*/'):
66 | in_multiline_comment = False
67 | continue
68 | # if the line is in a multiline comment, then skip
69 | if in_multiline_comment:
70 | continue
71 | # if the line is a single line comment, then skip
72 | if line.strip().startswith('//'):
73 | continue
74 | # if the line is not a comment, then return the line
75 | return line
76 |
77 |
78 | # if we cannot find a line that is not a comment, then return the first line
79 | return lines[0]
80 |
81 | def filter_dataset_by_date_range(dataset: DatasetDict, start_date: str, end_date: str) -> DatasetDict:
82 | """
83 | Filters a Huggingface dataset by a specific date range.
84 |
85 | Parameters:
86 | dataset (DatasetDict): The input dataset with subsets containing a 'created_at' column.
87 | start_date (str): The start date in the format 'YYYY-MM-DD'.
88 | end_date (str): The end date in the format 'YYYY-MM-DD'.
89 |
90 | Returns:
91 | DatasetDict: The filtered dataset.
92 | """
93 | start_date = pd.to_datetime(start_date).tz_localize('UTC')
94 | end_date = pd.to_datetime(end_date).tz_localize('UTC')
95 |
96 | filtered_dataset_dict = {}
97 |
98 | for subset_name in dataset.keys():
99 | df = pd.DataFrame(dataset[subset_name])
100 | df['created_at'] = pd.to_datetime(df['created_at'])
101 |
102 | # Filter the DataFrame
103 | mask = (df['created_at'] >= start_date) & (df['created_at'] <= end_date)
104 | filtered_df = df[mask]
105 |
106 | # Convert back to Huggingface Dataset
107 | filtered_dataset_dict[subset_name] = Dataset.from_pandas(filtered_df)
108 |
109 | return DatasetDict(filtered_dataset_dict)
110 |
111 | def filter_dataset_by_levels(dataset: DatasetDict, levels: list) -> DatasetDict:
112 | """
113 | Filters a Huggingface dataset by specific levels.
114 |
115 | Parameters:
116 | dataset (DatasetDict): The input dataset with subsets containing a 'level' column.
117 | levels (list): The list of levels to filter by.
118 |
119 | Returns:
120 | DatasetDict: The filtered dataset.
121 | """
122 | filtered_dataset_dict = {}
123 |
124 | for subset_name in dataset.keys():
125 | # Filter the subset directly using the 'filter' method
126 | filtered_subset = dataset[subset_name].filter(lambda example: example['level'] in levels)
127 | filtered_dataset_dict[subset_name] = filtered_subset
128 |
129 | return DatasetDict(filtered_dataset_dict)
130 |
131 | def main(
132 | model_name: str = "deepseek-ai/deepseek-coder-1.3b-base",
133 | dataset_name: str = "tianyang/repobench_python_v1.1",
134 | start_date: str = "2023-12-01", # YYYY-MM-DD
135 | end_date: str = "2023-12-31", # YYYY-MM-DD
136 | max_token_nums: int = 15800, # max token number for the prompt, adjust according to the model
137 | levels = ["2k", "4k", "8k", "12k", "16k"], # 24k, 32k, 64k and 128k are also available, but the number of them is limited
138 | language: str = "python",
139 | temperature: float = 0.2,
140 | top_p: float = 0.95,
141 | max_new_tokens: int = 128, # max number of tokens to generate
142 | batch_size: int = 1,
143 | res_dir: str = "./results"
144 | ):
145 |
146 | # Load the dataset
147 | dataset = load_dataset(dataset_name, ignore_verifications=True)
148 |
149 | # Filter the dataset by date range
150 | dataset = filter_dataset_by_date_range(dataset, start_date, end_date)
151 |
152 | # Filter the dataset by levels
153 | dataset = filter_dataset_by_levels(dataset, levels)
154 |
155 | # Load the model and tokenizer
156 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
157 | tokenizer.padding_side = "left"
158 | tokenizer.pad_token_id = tokenizer.eos_token_id
159 |
160 | model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda()
161 | model.generation_config.pad_token_id = tokenizer.pad_token_id
162 |
163 | # Create the save directory
164 | save_dir = f"{res_dir}/{model_name.split('/')[-1]}-{language}"
165 | os.makedirs(save_dir, exist_ok=True)
166 |
167 | for subset, data in dataset.items():
168 | for i in tqdm(range(0, len(data), batch_size), desc=f"Generating {subset}"):
169 | batch_data = [data[j] for j in range(i, min(i + batch_size, len(data)))]
170 | batch_prompts = [construct_prompt(d, tokenizer=tokenizer, max_token_nums=max_token_nums, language=language) for d in batch_data]
171 |
172 | batch_inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda")
173 | batch_outputs = model.generate(**batch_inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=True)
174 |
175 | for j, outputs in enumerate(batch_outputs):
176 | result = tokenizer.decode(outputs[batch_inputs["input_ids"][j].shape[-1]:], skip_special_tokens=True)
177 | result = get_first_line_not_comment(result, language=language)
178 |
179 | with open(f"{save_dir}/{subset}.jsonl", "a") as f_out:
180 | f_out.write(json.dumps({"idx": i + j, "level": batch_data[j]["level"], "pred": result, "gt": batch_data[j]["next_line"]}) + "\n")
181 |
182 | if __name__ == "__main__":
183 | fire.Fire(main)
184 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More_considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution 4.0 International Public License
58 |
59 | By exercising the Licensed Rights (defined below), You accept and agree
60 | to be bound by the terms and conditions of this Creative Commons
61 | Attribution 4.0 International Public License ("Public License"). To the
62 | extent this Public License may be interpreted as a contract, You are
63 | granted the Licensed Rights in consideration of Your acceptance of
64 | these terms and conditions, and the Licensor grants You such rights in
65 | consideration of benefits the Licensor receives from making the
66 | Licensed Material available under these terms and conditions.
67 |
68 |
69 | Section 1 -- Definitions.
70 |
71 | a. Adapted Material means material subject to Copyright and Similar
72 | Rights that is derived from or based upon the Licensed Material
73 | and in which the Licensed Material is translated, altered,
74 | arranged, transformed, or otherwise modified in a manner requiring
75 | permission under the Copyright and Similar Rights held by the
76 | Licensor. For purposes of this Public License, where the Licensed
77 | Material is a musical work, performance, or sound recording,
78 | Adapted Material is always produced where the Licensed Material is
79 | synched in timed relation with a moving image.
80 |
81 | b. Adapter's License means the license You apply to Your Copyright
82 | and Similar Rights in Your contributions to Adapted Material in
83 | accordance with the terms and conditions of this Public License.
84 |
85 | c. Copyright and Similar Rights means copyright and/or similar rights
86 | closely related to copyright including, without limitation,
87 | performance, broadcast, sound recording, and Sui Generis Database
88 | Rights, without regard to how the rights are labeled or
89 | categorized. For purposes of this Public License, the rights
90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
91 | Rights.
92 |
93 | d. Effective Technological Measures means those measures that, in the
94 | absence of proper authority, may not be circumvented under laws
95 | fulfilling obligations under Article 11 of the WIPO Copyright
96 | Treaty adopted on December 20, 1996, and/or similar international
97 | agreements.
98 |
99 | e. Exceptions and Limitations means fair use, fair dealing, and/or
100 | any other exception or limitation to Copyright and Similar Rights
101 | that applies to Your use of the Licensed Material.
102 |
103 | f. Licensed Material means the artistic or literary work, database,
104 | or other material to which the Licensor applied this Public
105 | License.
106 |
107 | g. Licensed Rights means the rights granted to You subject to the
108 | terms and conditions of this Public License, which are limited to
109 | all Copyright and Similar Rights that apply to Your use of the
110 | Licensed Material and that the Licensor has authority to license.
111 |
112 | h. Licensor means the individual(s) or entity(ies) granting rights
113 | under this Public License.
114 |
115 | i. Share means to provide material to the public by any means or
116 | process that requires permission under the Licensed Rights, such
117 | as reproduction, public display, public performance, distribution,
118 | dissemination, communication, or importation, and to make material
119 | available to the public including in ways that members of the
120 | public may access the material from a place and at a time
121 | individually chosen by them.
122 |
123 | j. Sui Generis Database Rights means rights other than copyright
124 | resulting from Directive 96/9/EC of the European Parliament and of
125 | the Council of 11 March 1996 on the legal protection of databases,
126 | as amended and/or succeeded, as well as other essentially
127 | equivalent rights anywhere in the world.
128 |
129 | k. You means the individual or entity exercising the Licensed Rights
130 | under this Public License. Your has a corresponding meaning.
131 |
132 |
133 | Section 2 -- Scope.
134 |
135 | a. License grant.
136 |
137 | 1. Subject to the terms and conditions of this Public License,
138 | the Licensor hereby grants You a worldwide, royalty-free,
139 | non-sublicensable, non-exclusive, irrevocable license to
140 | exercise the Licensed Rights in the Licensed Material to:
141 |
142 | a. reproduce and Share the Licensed Material, in whole or
143 | in part; and
144 |
145 | b. produce, reproduce, and Share Adapted Material.
146 |
147 | 2. Exceptions and Limitations. For the avoidance of doubt, where
148 | Exceptions and Limitations apply to Your use, this Public
149 | License does not apply, and You do not need to comply with
150 | its terms and conditions.
151 |
152 | 3. Term. The term of this Public License is specified in Section
153 | 6(a).
154 |
155 | 4. Media and formats; technical modifications allowed. The
156 | Licensor authorizes You to exercise the Licensed Rights in
157 | all media and formats whether now known or hereafter created,
158 | and to make technical modifications necessary to do so. The
159 | Licensor waives and/or agrees not to assert any right or
160 | authority to forbid You from making technical modifications
161 | necessary to exercise the Licensed Rights, including
162 | technical modifications necessary to circumvent Effective
163 | Technological Measures. For purposes of this Public License,
164 | simply making modifications authorized by this Section 2(a)
165 | (4) never produces Adapted Material.
166 |
167 | 5. Downstream recipients.
168 |
169 | a. Offer from the Licensor -- Licensed Material. Every
170 | recipient of the Licensed Material automatically
171 | receives an offer from the Licensor to exercise the
172 | Licensed Rights under the terms and conditions of this
173 | Public License.
174 |
175 | b. No downstream restrictions. You may not offer or impose
176 | any additional or different terms or conditions on, or
177 | apply any Effective Technological Measures to, the
178 | Licensed Material if doing so restricts exercise of the
179 | Licensed Rights by any recipient of the Licensed
180 | Material.
181 |
182 | 6. No endorsement. Nothing in this Public License constitutes or
183 | may be construed as permission to assert or imply that You
184 | are, or that Your use of the Licensed Material is, connected
185 | with, or sponsored, endorsed, or granted official status by,
186 | the Licensor or others designated to receive attribution as
187 | provided in Section 3(a)(1)(A)(i).
188 |
189 | b. Other rights.
190 |
191 | 1. Moral rights, such as the right of integrity, are not
192 | licensed under this Public License, nor are publicity,
193 | privacy, and/or other similar personality rights; however, to
194 | the extent possible, the Licensor waives and/or agrees not to
195 | assert any such rights held by the Licensor to the limited
196 | extent necessary to allow You to exercise the Licensed
197 | Rights, but not otherwise.
198 |
199 | 2. Patent and trademark rights are not licensed under this
200 | Public License.
201 |
202 | 3. To the extent possible, the Licensor waives any right to
203 | collect royalties from You for the exercise of the Licensed
204 | Rights, whether directly or through a collecting society
205 | under any voluntary or waivable statutory or compulsory
206 | licensing scheme. In all other cases the Licensor expressly
207 | reserves any right to collect such royalties.
208 |
209 |
210 | Section 3 -- License Conditions.
211 |
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 |
215 | a. Attribution.
216 |
217 | 1. If You Share the Licensed Material (including in modified
218 | form), You must:
219 |
220 | a. retain the following if it is supplied by the Licensor
221 | with the Licensed Material:
222 |
223 | i. identification of the creator(s) of the Licensed
224 | Material and any others designated to receive
225 | attribution, in any reasonable manner requested by
226 | the Licensor (including by pseudonym if
227 | designated);
228 |
229 | ii. a copyright notice;
230 |
231 | iii. a notice that refers to this Public License;
232 |
233 | iv. a notice that refers to the disclaimer of
234 | warranties;
235 |
236 | v. a URI or hyperlink to the Licensed Material to the
237 | extent reasonably practicable;
238 |
239 | b. indicate if You modified the Licensed Material and
240 | retain an indication of any previous modifications; and
241 |
242 | c. indicate the Licensed Material is licensed under this
243 | Public License, and include the text of, or the URI or
244 | hyperlink to, this Public License.
245 |
246 | 2. You may satisfy the conditions in Section 3(a)(1) in any
247 | reasonable manner based on the medium, means, and context in
248 | which You Share the Licensed Material. For example, it may be
249 | reasonable to satisfy the conditions by providing a URI or
250 | hyperlink to a resource that includes the required
251 | information.
252 |
253 | 3. If requested by the Licensor, You must remove any of the
254 | information required by Section 3(a)(1)(A) to the extent
255 | reasonably practicable.
256 |
257 | 4. If You Share Adapted Material You produce, the Adapter's
258 | License You apply must not prevent recipients of the Adapted
259 | Material from complying with this Public License.
260 |
261 |
262 | Section 4 -- Sui Generis Database Rights.
263 |
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 |
267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 | to extract, reuse, reproduce, and Share all or a substantial
269 | portion of the contents of the database;
270 |
271 | b. if You include all or a substantial portion of the database
272 | contents in a database in which You have Sui Generis Database
273 | Rights, then the database in which You have Sui Generis Database
274 | Rights (but not its individual contents) is Adapted Material; and
275 |
276 | c. You must comply with the conditions in Section 3(a) if You Share
277 | all or a substantial portion of the contents of the database.
278 |
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 |
283 |
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 |
286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 |
297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 |
307 | c. The disclaimer of warranties and limitation of liability provided
308 | above shall be interpreted in a manner that, to the extent
309 | possible, most closely approximates an absolute disclaimer and
310 | waiver of all liability.
311 |
312 |
313 | Section 6 -- Term and Termination.
314 |
315 | a. This Public License applies for the term of the Copyright and
316 | Similar Rights licensed here. However, if You fail to comply with
317 | this Public License, then Your rights under this Public License
318 | terminate automatically.
319 |
320 | b. Where Your right to use the Licensed Material has terminated under
321 | Section 6(a), it reinstates:
322 |
323 | 1. automatically as of the date the violation is cured, provided
324 | it is cured within 30 days of Your discovery of the
325 | violation; or
326 |
327 | 2. upon express reinstatement by the Licensor.
328 |
329 | For the avoidance of doubt, this Section 6(b) does not affect any
330 | right the Licensor may have to seek remedies for Your violations
331 | of this Public License.
332 |
333 | c. For the avoidance of doubt, the Licensor may also offer the
334 | Licensed Material under separate terms or conditions or stop
335 | distributing the Licensed Material at any time; however, doing so
336 | will not terminate this Public License.
337 |
338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 | License.
340 |
341 |
342 | Section 7 -- Other Terms and Conditions.
343 |
344 | a. The Licensor shall not be bound by any additional or different
345 | terms or conditions communicated by You unless expressly agreed.
346 |
347 | b. Any arrangements, understandings, or agreements regarding the
348 | Licensed Material not stated herein are separate from and
349 | independent of the terms and conditions of this Public License.
350 |
351 |
352 | Section 8 -- Interpretation.
353 |
354 | a. For the avoidance of doubt, this Public License does not, and
355 | shall not be interpreted to, reduce, limit, restrict, or impose
356 | conditions on any use of the Licensed Material that could lawfully
357 | be made without permission under this Public License.
358 |
359 | b. To the extent possible, if any provision of this Public License is
360 | deemed unenforceable, it shall be automatically reformed to the
361 | minimum extent necessary to make it enforceable. If the provision
362 | cannot be reformed, it shall be severed from this Public License
363 | without affecting the enforceability of the remaining terms and
364 | conditions.
365 |
366 | c. No term or condition of this Public License will be waived and no
367 | failure to comply consented to unless expressly agreed to by the
368 | Licensor.
369 |
370 | d. Nothing in this Public License constitutes or may be interpreted
371 | as a limitation upon, or waiver of, any privileges and immunities
372 | that apply to the Licensor or You, including from the legal
373 | processes of any jurisdiction or authority.
374 |
375 |
376 | =======================================================================
377 |
378 | Creative Commons is not a party to its public
379 | licenses. Notwithstanding, Creative Commons may elect to apply one of
380 | its public licenses to material it publishes and in those instances
381 | will be considered the “Licensor.” The text of the Creative Commons
382 | public licenses is dedicated to the public domain under the CC0 Public
383 | Domain Dedication. Except for the limited purpose of indicating that
384 | material is shared under a Creative Commons public license or as
385 | otherwise permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the
393 | public licenses.
394 |
395 | Creative Commons may be contacted at creativecommons.org.
396 |
--------------------------------------------------------------------------------