├── assets ├── repobench_dark.png ├── repobench_logo.png └── repobench_light.png ├── .gitignore ├── requirements.txt ├── data ├── utils.py └── README.md ├── eval.py ├── README.md ├── evaluation └── metrics.py ├── run.py └── LICENSE /assets/repobench_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_dark.png -------------------------------------------------------------------------------- /assets/repobench_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_logo.png -------------------------------------------------------------------------------- /assets/repobench_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_light.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | cache 3 | results 4 | *temp.html 5 | /data_v1 6 | archive_data/test 7 | archive_data/train -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | transformers 3 | torch 4 | fuzzywuzzy 5 | difflib 6 | tqdm 7 | fire 8 | codebleu 9 | python-Levenshtein 10 | tree-sitter-python 11 | tree-sitter-java -------------------------------------------------------------------------------- /data/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def construct_prompt( 4 | data: dict, 5 | language: str = "python", 6 | tokenizer= None, 7 | max_token_nums: int = 15800 8 | ) -> str: 9 | """ 10 | Construct the prompt for next line prediction. 
11 | 12 | :param data: data point from the dataset 13 | :param language: the language of the code 14 | :param tokenizer: the tokenizer of the evaluation model 15 | :param max_token_nums: the maximum number of tokens constraint for the prompt 16 | 17 | :return: the constructed prompt 18 | """ 19 | 20 | # comment symbol for different languages 21 | comment_symbol = "#" if language == "python" else "//" 22 | 23 | # construct the cross-file prompt and in-file prompt separately 24 | # cross-file prompt 25 | cross_file_prompt = f"{comment_symbol} Repo Name: {data['repo_name']}\n" 26 | 27 | for snippet in data['context']: 28 | cross_file_prompt += f"{comment_symbol} Path: {snippet['path']}\n{snippet['snippet']}" + "\n\n" 29 | 30 | # in-file prompt 31 | in_file_prompt = f"{comment_symbol} Path: {data['file_path']}\n{data['import_statement']}\n{data['cropped_code'].rstrip()}\n" 32 | 33 | # if we assign the tokenizer and the max_token_nums, we will truncate the cross-file prompt to meet the constraint 34 | if tokenizer is not None and max_token_nums is not None: 35 | 36 | cross_file_prompt_token_nums = len(tokenizer.encode(cross_file_prompt)) 37 | in_file_prompt_token_nums = len(tokenizer.encode(in_file_prompt)) 38 | 39 | exceed_token_nums = cross_file_prompt_token_nums + in_file_prompt_token_nums - max_token_nums 40 | 41 | if exceed_token_nums > 0: 42 | # split the cross-file prompt into lines 43 | cross_file_prompt_lines = cross_file_prompt.split("\n") 44 | # drop lines from end until the extra token number is less than 0 45 | for i in range(len(cross_file_prompt_lines)-1, -1, -1): 46 | exceed_token_nums -= len(tokenizer.encode(cross_file_prompt_lines[i])) 47 | if exceed_token_nums < 0: 48 | break 49 | 50 | # join the lines back 51 | cross_file_prompt = "\n".join(cross_file_prompt_lines[:i]) + "\n\n" 52 | 53 | # combine the cross-file prompt and in-file prompt 54 | prompt = cross_file_prompt + in_file_prompt 55 | 56 | # normalize some empty lines 57 | prompt = re.sub(r'\n{4,}', '\n\n', prompt) 58 | 59 | return prompt 60 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from evaluation.metrics import exact_match_score, edit_similarity_score, codebleu_score 4 | import fire 5 | 6 | def eval( 7 | path="results/deepseek-coder-1.3b-base-python", 8 | language="python" # to calculate codebleu, we need to specify the language 9 | ): 10 | 11 | total_data_points = 0 12 | total_em_model, total_es_model, total_cb_model = 0, 0, 0 13 | 14 | for level in ["cross_file_first", "cross_file_random", "in_file"]: 15 | filepath = os.path.join(path, f"{level}.jsonl") 16 | seen_indices = set() # Track seen indices for the current level 17 | 18 | # check if the file exists 19 | if not os.path.exists(filepath): 20 | print(f"Level: {level} not found for the model") 21 | continue 22 | 23 | with open(filepath, "r") as f: 24 | 25 | data = [] 26 | for line in f: 27 | entry = json.loads(line.strip()) 28 | idx = entry["idx"] 29 | 30 | # Skip duplicate indices based on the chosen policy (here, keeping the former) 31 | if idx not in seen_indices: 32 | seen_indices.add(idx) 33 | data.append(entry) 34 | 35 | data_points = len(data) 36 | 37 | if data_points == 0: 38 | continue 39 | 40 | ground_truth = [d["gt"] for d in data] 41 | generated = [d["pred"] for d in data] 42 | 43 | em_model = round(exact_match_score(ground_truth, generated) * 100, 2) 44 | es_model = 
round(edit_similarity_score(ground_truth, generated), 2) 45 | cb_model = round(codebleu_score(generated, ground_truth, language) * 100, 2) 46 | 47 | # accumulate the data points and the metrics 48 | total_data_points += data_points 49 | total_em_model += em_model * data_points 50 | total_es_model += es_model * data_points 51 | total_cb_model += cb_model * data_points 52 | 53 | print(f"Level: {level} with {data_points} data points") 54 | print(f"EM: {em_model}, ES: {es_model}, CB: {cb_model}") 55 | print("-" * 30) 56 | 57 | # calculate the weighted averages 58 | if total_data_points > 0: 59 | avg_em_model = round(total_em_model / total_data_points, 2) 60 | avg_es_model = round(total_es_model / total_data_points, 2) 61 | avg_cb_model = round(total_cb_model / total_data_points, 2) 62 | 63 | print("Weighted Averages:") 64 | print(f"EM: {avg_em_model}, ES: {avg_es_model}, CB: {avg_cb_model}\n") 65 | 66 | else: 67 | print("No data points were found for evaluation.") 68 | 69 | if __name__ == "__main__": 70 | fire.Fire(eval) -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 |
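A minimal usage sketch for the `construct_prompt` helper defined in `data/utils.py` above. The sample dictionary is fabricated purely for illustration; real data points with these fields (`repo_name`, `context`, `file_path`, `import_statement`, `cropped_code`) come from the RepoBench datasets introduced in the README that follows, and the tokenizer is simply the model used elsewhere in this repository.

```python
from transformers import AutoTokenizer

from data.utils import construct_prompt

# Fabricated data point with the fields construct_prompt reads;
# real samples come from the RepoBench v1.1 datasets on the HuggingFace Hub.
sample = {
    "repo_name": "demo_user/demo_repo",
    "context": [
        {"path": "lib/helpers.py", "snippet": "def add(a, b):\n    return a + b"}
    ],
    "file_path": "main.py",
    "import_statement": "from lib.helpers import add",
    "cropped_code": "x = 1\ny = 2",
}

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True
)

# Cross-file snippets are dropped line by line from the end if the combined
# prompt would exceed max_token_nums.
prompt = construct_prompt(
    sample, language="python", tokenizer=tokenizer, max_token_nums=15800
)
print(prompt)
```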

2 | 3 | repobench logo 4 | 5 | 6 | repobench logo 7 | 8 | 9 |

10 | 11 | RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems 12 | 13 |

14 | 15 | ICLR 2024 16 | 17 |

18 | 19 |
20 | 21 | This directory hosts the datasets for subsequent versions of RepoBench. We are committed to updating RepoBench regularly, with updates scheduled **every 3 months**. 22 | 23 | ## 🌇 Overview 24 | 25 | - Our primary focus is on **next-line prediction** tasks to aid in code auto-completion. If your research requires retrieval data, please don't hesitate to reach out to us for collaboration. 26 | - Our datasets will be hosted on 🤗 HuggingFace, making them easily accessible for everyone. 27 | - Each data point within our datasets is categorized based on the prompt length (number of tokens), which is determined by OpenAI's GPT-4 tokenizer using [tiktoken](https://github.com/openai/tiktoken). Here's a detailed table illustrating the levels we've defined: 28 | 29 | | Level | Prompt Length (Number of Tokens) | 30 | |-------|------------------------| 31 | | 2k | 640 - 1,600 | 32 | | 4k | 1,600 - 3,600 | 33 | | 8k | 3,600 - 7,200 | 34 | | 12k | 7,200 - 10,800 | 35 | | 16k | 10,800 - 14,400 | 36 | | 24k | 14,400 - 21,600 | 37 | | 32k | 21,600 - 28,800 | 38 | | 64k | 28,800 - 57,600 | 39 | | 128k | 57,600 - 100,000 | 40 | 41 | - We provide the official implementation for constructing prompts [here](https://github.com/Leolty/repobench/blob/53c1c55ad9e6d97d2b60dd2c9548ed1cd463b6a5/data/utils.py#L3). Please note that the methods provided are not necessarily the optimal way of constructing prompts. Reordering, retrieval augmentation, or employing different cropping/construction techniques could potentially lead to varying degrees of improvement. Ensure that the evaluations are conducted fairly. 42 | 43 | ## 📚 Versions 44 | 45 | ### RepoBench v1.1 46 | 47 | RepoBench v1.1 includes data collected from GitHub between **October 6, 2023**, and **December 31, 2023**. To mitigate data leakage and memorization issues, we conducted a deduplication process against the Stack v2 (coming soon) based on file content. 48 | 49 | You can access RepoBench v1.1 at the following links: 50 | - For Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1) 51 | - For Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1) 52 | 53 | Or, you can load the data directly from the HuggingFace Hub using the following code: 54 | 55 | ```python 56 | from datasets import load_dataset 57 | 58 | # Load the Python dataset 59 | python_dataset = load_dataset("tianyang/repobench_python_v1.1") 60 | 61 | # Load the Java dataset 62 | java_dataset = load_dataset("tianyang/repobench_java_v1.1") 63 | ``` 64 | 65 | ### RepoBench v1.2 66 | 67 | *Coming soon...* 68 | 69 | ## 📝 Citation 70 | 71 | If you use RepoBench in your research, please cite the following paper: 72 | 73 | ```bibtex 74 | @misc{liu2023repobench, 75 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems}, 76 | author={Tianyang Liu and Canwen Xu and Julian McAuley}, 77 | year={2024}, 78 | url={https://arxiv.org/abs/2306.03091}, 79 | booktitle={International Conference on Learning Representations} 80 | } 81 | ``` 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
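Building on the level table and the loading snippet above, here is a minimal sketch of keeping only the shorter prompt-length bands after loading. It assumes each sample in the v1.1 splits exposes a `level` column, which is how `run.py` in this repository filters the data.

```python
from datasets import load_dataset

dataset = load_dataset("tianyang/repobench_python_v1.1")

# Keep only the shorter prompt-length bands for a quick experiment;
# the 'level' values correspond to the table above (2k, 4k, ..., 128k).
wanted_levels = {"2k", "4k"}
short_subsets = {
    split: subset.filter(lambda example: example["level"] in wanted_levels)
    for split, subset in dataset.items()
}

print({split: len(subset) for split, subset in short_subsets.items()})
```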

2 | 3 | repobench logo 4 | 5 | 6 | repobench logo 7 | 8 | 9 |

10 | 11 | RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems 12 | 13 |

14 | 15 | ICLR 2024 16 | 17 |

18 | 19 |
20 | 21 | ## 🔥 News 22 | 23 | - *Feb 5th, 2024*: **RepoBench v1.1** (with newest code data) is now available on the 🤗 HuggingFace Hub. You can access the datasets for Python and Java using the following links: 24 | - For Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1) 25 | - For Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1) 26 | > **For more details of RepoBench v1.1, please refer to the [data directory](./data/README.md).** 27 | 28 | - *Jan 16th, 2024*: RepoBench is accepted to ICLR 2024! 🎉 29 | 30 | 31 | ## 🛠️ Installation 32 | 33 | ```bash 34 | git clone https://github.com/Leolty/repobench.git 35 | cd repobench 36 | ``` 37 | 38 | > [!NOTE] 39 | > There is a `requirements.txt` file, which contains dependencies for reproducing the results in the paper. If you are only interested in the data, you can skip the installation of dependencies. 40 | 41 | ## ⚙️ Description of Settings 42 | 43 | As discussed in the paper, we have three settings for each task: 44 | 45 | - `cross_file_first`: Masks the line where a module from a different file is used for the first time. 46 | - `cross_file_random`: Masks a random line where a module from a different file is used (not the first usage). 47 | - `in_file`: Masks a random line that has no cross-file dependency. 48 | 49 | 50 | ## 📥 Load Data 51 | 52 | ```python 53 | from datasets import load_dataset 54 | 55 | dataset = load_dataset("tianyang/repobench_python_v1.1", ignore_verifications=True) 56 | ``` 57 | 58 | For more details, visit the Hugging Face dataset pages: 59 | - Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1) 60 | - Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1) 61 | 62 | ## 🚀 Running Experiments 63 | 64 | To run experiments on the RepoBench v1.1 dataset, we provide a very basic `run.py` script using the 🤗 Transformers library. 65 | 66 | Example usage: 67 | 68 | ```bash 69 | CUDA_VISIBLE_DEVICES=0 python run.py --model_name "deepseek-ai/deepseek-coder-1.3b-base" \ 70 | --dataset_name "tianyang/repobench_python_v1.1" \ 71 | --start_date "2023-12-01" \ 72 | --end_date "2023-12-31" \ 73 | --language "python" \ 74 | --max_token_nums 15800 \ 75 | --levels "2k" "4k" "8k" "12k" "16k" \ 76 | --temperature 0.2 \ 77 | --top_p 0.95 \ 78 | --max_new_tokens 128 \ 79 | --batch_size 1 80 | ``` 81 | 82 | For a full list of available parameters, please refer to the `run.py` file. And it should be super easy to customize the script for your own needs. 83 | 84 | ## 📊 Evaluation 85 | 86 | After generating completions, you can evaluate the results using the `eval.py` script. This script calculates various metrics including Exact Match (EM), Edit Similarity (ES), and CodeBLEU (CB) scores for each setting. 87 | 88 | To run the evaluation: 89 | 90 | ```bash 91 | python eval.py --path "results/deepseek-coder-1.3b-base-python" --language "python" 92 | ``` 93 | 94 | The script will output scores for each level (`cross_file_first`, `cross_file_random`, `in_file`) as well as weighted averages across all levels. 95 | 96 | ## 📝 Note 97 | 98 | This branch of the repository is specifically for RepoBench v1.1. For the results presented in our ICLR 2024 paper, which used the initial version of RepoBench, please refer to the [`archive/v0` branch](https://github.com/Leolty/repobench/tree/archive/v0) of this repository. 
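To make the evaluation input concrete: `run.py` writes one `<setting>.jsonl` file per split into `results/<model>-<language>/`, with `idx`, `level`, `pred`, and `gt` fields on each line, and `eval.py` reads exactly that layout. The sketch below fabricates a tiny results directory and scores it programmatically; the directory name and predictions are made up for illustration, and the CodeBLEU score requires the `codebleu` and `tree-sitter-python` packages from `requirements.txt`.

```python
import json
import os

from eval import eval as run_eval

# Fabricated results in the layout run.py produces:
# results/<model>-<language>/<setting>.jsonl with idx/level/pred/gt per line.
save_dir = "results/demo-model-python"
os.makedirs(save_dir, exist_ok=True)

rows = [
    {"idx": 0, "level": "2k", "pred": "return x + 1", "gt": "return x + 1"},
    {"idx": 1, "level": "2k", "pred": "print(y)", "gt": "print(x)"},
]
with open(os.path.join(save_dir, "in_file.jsonl"), "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

# Prints EM/ES/CB for each setting that exists, then the weighted averages.
run_eval(path=save_dir, language="python")
```

Pointing `--path` at a real directory such as `results/deepseek-coder-1.3b-base-python` is equivalent to the CLI invocation shown above.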
99 | 100 | 101 | ## 📝 Citation 102 | 103 | If you use RepoBench in your research, please consider citing us: 104 | 105 | ```bibtex 106 | @misc{liu2023repobench, 107 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems}, 108 | author={Tianyang Liu and Canwen Xu and Julian McAuley}, 109 | year={2024}, 110 | url={https://arxiv.org/abs/2306.03091}, 111 | booktitle={International Conference on Learning Representations} 112 | } 113 | ``` 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | from fuzzywuzzy import fuzz 2 | from codebleu import calc_codebleu 3 | 4 | def exact_match_score(predictions, ground_truths): 5 | """ 6 | This function computes the average exact match score between the predicted codes and the ground truth codes. 7 | It returns a float value between 0 and 1 indicating the degree of exact match between the predicted codes 8 | and the ground truth codes, where a value of 1 means all the predicted codes exactly match their corresponding 9 | ground truth codes and a value of 0 means none of the predicted codes exactly match their corresponding 10 | ground truth codes. 11 | 12 | Args: 13 | predictions: list, predicted codes 14 | ground_truths: list, ground truth codes 15 | 16 | Returns: 17 | Float, the average exact match score between the predicted codes and the ground truth codes. 18 | """ 19 | if len(predictions) != len(ground_truths): 20 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.") 21 | 22 | exact_match = 0 23 | for pred, gt in zip(predictions, ground_truths): 24 | if pred.split() == gt.split(): 25 | exact_match += 1 26 | 27 | return round(exact_match / len(predictions), 5) 28 | 29 | 30 | 31 | def edit_similarity_score(predictions, ground_truths): 32 | """ 33 | This function computes the average edit similarity score between the predicted codes and the ground truth codes. 34 | It returns a float value between 0 and 1 indicating the degree of similarity between the predicted codes 35 | and the ground truth codes, where a value of 1 means all the predicted codes are identical to their corresponding 36 | ground truth codes and a value of 0 means none of the predicted codes are similar to their corresponding 37 | ground truth codes. 38 | 39 | Args: 40 | predictions: list, predicted codes 41 | ground_truths: list, ground truth codes 42 | 43 | Returns: 44 | Float, the average edit similarity score between the predicted codes and the ground truth codes. 45 | """ 46 | if len(predictions) != len(ground_truths): 47 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.") 48 | 49 | edit_sim = 0.0 50 | for pred, gt in zip(predictions, ground_truths): 51 | edit_sim += fuzz.ratio(pred, gt) 52 | 53 | return round(edit_sim / len(predictions), 5) 54 | 55 | def accuracy_at_k(prediction_list, golden_index_list, k): 56 | """ 57 | This function computes the accuracy at k. It returns a float value between 0 and 1 indicating the 58 | accuracy at k, where a value of 1 means the correct code is retrieved at the top k positions and 59 | a value of 0 means the correct code is not retrieved at the top k positions. 60 | 61 | Args: 62 | prediction_list: list, a list of lists, where each list contains the indices of the retrieved codes. 
63 | golden_index_list: list, a list of integers, where each integer is the index of the correct code. 64 | k: int, the number of retrieved codes. 65 | 66 | Returns: 67 | Float, the accuracy at k. 68 | """ 69 | 70 | if len(golden_index_list) == 0: 71 | raise ValueError("The list of golden indices should not be empty.") 72 | 73 | assert len(golden_index_list) == len(prediction_list), \ 74 | "The length of the golden indices list should be equal to the length of the prediction list, however, " \ 75 | f"the length of the golden indices list is {len(golden_index_list)} and the length of the prediction list is {len(prediction_list)}." 76 | 77 | 78 | acc = 0 79 | 80 | for i in range(len(prediction_list)): 81 | golden_index = golden_index_list[i] 82 | index_list = prediction_list[i] 83 | 84 | if len(index_list) < k: 85 | raise ValueError("The number of retrieved codes should be greater than k.") 86 | 87 | top_k_indices = index_list[:k] 88 | 89 | if golden_index not in top_k_indices: 90 | continue 91 | else: 92 | acc += 1 93 | 94 | return round(acc / len(golden_index_list), 5) 95 | 96 | def codebleu_score(predictions, ground_truths, language, weight=[0.25, 0.25, 0.25, 0.25]): 97 | 98 | """ 99 | This function computes the average codebleu score between the predicted codes and the ground truth codes. 100 | It returns a float value between 0 and 1 indicating the degree of similarity between the predicted codes 101 | and the ground truth codes, where a value of 1 means all the predicted codes are identical to their corresponding 102 | ground truth codes and a value of 0 means none of the predicted codes are similar to their corresponding 103 | ground truth codes. 104 | 105 | Args: 106 | predictions: list, predicted codes 107 | ground_truths: list, ground truth codes 108 | language: str, the programming language of the codes 109 | weight: list, the weights for each n-gram 110 | 111 | Returns: 112 | Float, the average codebleu score between the predicted codes and the ground truth codes. 113 | """ 114 | if len(predictions) != len(ground_truths): 115 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.") 116 | 117 | # remove \r for both pred and gt 118 | predictions = [pred.replace("\r", "") for pred in predictions] 119 | ground_truths = [gt.replace("\r", "") for gt in ground_truths] 120 | 121 | res_list = calc_codebleu( 122 | ground_truths, 123 | predictions, 124 | language, 125 | weight, 126 | tokenizer=None 127 | ) 128 | 129 | return res_list['codebleu'] -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fire 3 | import json 4 | from tqdm import tqdm 5 | from datasets import load_dataset 6 | from transformers import AutoTokenizer, AutoModelForCausalLM 7 | from datasets import DatasetDict, Dataset 8 | import pandas as pd 9 | from data.utils import construct_prompt 10 | 11 | # get first line that is not a comment 12 | def get_first_line_not_comment(code:str, language:str="python"): 13 | """ 14 | This function gets the first line of code that is not a comment. 
15 | 16 | Args: 17 | code: Str, the code 18 | 19 | Returns: 20 | Str, the first line of code that is not a comment or the first line of code if there is no line that is not a comment 21 | """ 22 | 23 | # check if the language is valid 24 | assert language in ["python", "java"], "language must be one of [python, java]" 25 | 26 | 27 | # first remove the \n at the beginning of the code 28 | code = code.lstrip('\n') 29 | 30 | lines = code.split('\n') 31 | in_multiline_comment = False 32 | 33 | if language == "python": 34 | for line in lines: 35 | # if the line is empty, then skip 36 | if not line.strip(): 37 | continue 38 | # if the line is a start of a multiline comment, then set the in_multiline_comment to True and skip 39 | if not in_multiline_comment and (line.strip().startswith('"""') or line.strip().startswith("'''")): 40 | in_multiline_comment = True 41 | continue 42 | # if the line is the end of a multiline comment, then set the in_multiline_comment to False and skip 43 | if in_multiline_comment and (line.strip().endswith('"""') or line.strip().endswith("'''")): 44 | in_multiline_comment = False 45 | continue 46 | # if the line is in a multiline comment, then skip 47 | if in_multiline_comment: 48 | continue 49 | # if the line is a single line comment, then skip 50 | if line.strip().startswith('#'): 51 | continue 52 | # if the line is not a comment, then return the line 53 | return line 54 | 55 | elif language == "java": 56 | for line in lines: 57 | # if the line is empty, then skip 58 | if not line.strip(): 59 | continue 60 | # if the line is a start of a multiline comment, then set the in_multiline_comment to True and skip 61 | if not in_multiline_comment and line.strip().startswith('/*'): 62 | in_multiline_comment = True 63 | continue 64 | # if the line is the end of a multiline comment, then set the in_multiline_comment to False and skip 65 | if in_multiline_comment and line.strip().endswith('*/'): 66 | in_multiline_comment = False 67 | continue 68 | # if the line is in a multiline comment, then skip 69 | if in_multiline_comment: 70 | continue 71 | # if the line is a single line comment, then skip 72 | if line.strip().startswith('//'): 73 | continue 74 | # if the line is not a comment, then return the line 75 | return line 76 | 77 | 78 | # if we cannot find a line that is not a comment, then return the first line 79 | return lines[0] 80 | 81 | def filter_dataset_by_date_range(dataset: DatasetDict, start_date: str, end_date: str) -> DatasetDict: 82 | """ 83 | Filters a Huggingface dataset by a specific date range. 84 | 85 | Parameters: 86 | dataset (DatasetDict): The input dataset with subsets containing a 'created_at' column. 87 | start_date (str): The start date in the format 'YYYY-MM-DD'. 88 | end_date (str): The end date in the format 'YYYY-MM-DD'. 89 | 90 | Returns: 91 | DatasetDict: The filtered dataset. 
92 | """ 93 | start_date = pd.to_datetime(start_date).tz_localize('UTC') 94 | end_date = pd.to_datetime(end_date).tz_localize('UTC') 95 | 96 | filtered_dataset_dict = {} 97 | 98 | for subset_name in dataset.keys(): 99 | df = pd.DataFrame(dataset[subset_name]) 100 | df['created_at'] = pd.to_datetime(df['created_at']) 101 | 102 | # Filter the DataFrame 103 | mask = (df['created_at'] >= start_date) & (df['created_at'] <= end_date) 104 | filtered_df = df[mask] 105 | 106 | # Convert back to Huggingface Dataset 107 | filtered_dataset_dict[subset_name] = Dataset.from_pandas(filtered_df) 108 | 109 | return DatasetDict(filtered_dataset_dict) 110 | 111 | def filter_dataset_by_levels(dataset: DatasetDict, levels: list) -> DatasetDict: 112 | """ 113 | Filters a Huggingface dataset by specific levels. 114 | 115 | Parameters: 116 | dataset (DatasetDict): The input dataset with subsets containing a 'level' column. 117 | levels (list): The list of levels to filter by. 118 | 119 | Returns: 120 | DatasetDict: The filtered dataset. 121 | """ 122 | filtered_dataset_dict = {} 123 | 124 | for subset_name in dataset.keys(): 125 | # Filter the subset directly using the 'filter' method 126 | filtered_subset = dataset[subset_name].filter(lambda example: example['level'] in levels) 127 | filtered_dataset_dict[subset_name] = filtered_subset 128 | 129 | return DatasetDict(filtered_dataset_dict) 130 | 131 | def main( 132 | model_name: str = "deepseek-ai/deepseek-coder-1.3b-base", 133 | dataset_name: str = "tianyang/repobench_python_v1.1", 134 | start_date: str = "2023-12-01", # YYYY-MM-DD 135 | end_date: str = "2023-12-31", # YYYY-MM-DD 136 | max_token_nums: int = 15800, # max token number for the prompt, adjust according to the model 137 | levels = ["2k", "4k", "8k", "12k", "16k"], # 24k, 32k, 64k and 128k are also available, but the number of them is limited 138 | language: str = "python", 139 | temperature: float = 0.2, 140 | top_p: float = 0.95, 141 | max_new_tokens: int = 128, # max number of tokens to generate 142 | batch_size: int = 1, 143 | res_dir: str = "./results" 144 | ): 145 | 146 | # Load the dataset 147 | dataset = load_dataset(dataset_name, ignore_verifications=True) 148 | 149 | # Filter the dataset by date range 150 | dataset = filter_dataset_by_date_range(dataset, start_date, end_date) 151 | 152 | # Filter the dataset by levels 153 | dataset = filter_dataset_by_levels(dataset, levels) 154 | 155 | # Load the model and tokenizer 156 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 157 | tokenizer.padding_side = "left" 158 | tokenizer.pad_token_id = tokenizer.eos_token_id 159 | 160 | model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda() 161 | model.generation_config.pad_token_id = tokenizer.pad_token_id 162 | 163 | # Create the save directory 164 | save_dir = f"{res_dir}/{model_name.split('/')[-1]}-{language}" 165 | os.makedirs(save_dir, exist_ok=True) 166 | 167 | for subset, data in dataset.items(): 168 | for i in tqdm(range(0, len(data), batch_size), desc=f"Generating {subset}"): 169 | batch_data = [data[j] for j in range(i, min(i + batch_size, len(data)))] 170 | batch_prompts = [construct_prompt(d, tokenizer=tokenizer, max_token_nums=max_token_nums, language=language) for d in batch_data] 171 | 172 | batch_inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda") 173 | batch_outputs = model.generate(**batch_inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=True) 174 | 175 | for 
j, outputs in enumerate(batch_outputs): 176 | result = tokenizer.decode(outputs[batch_inputs["input_ids"][j].shape[-1]:], skip_special_tokens=True) 177 | result = get_first_line_not_comment(result, language=language) 178 | 179 | with open(f"{save_dir}/{subset}.jsonl", "a") as f_out: 180 | f_out.write(json.dumps({"idx": i + j, "level": batch_data[j]["level"], "pred": result, "gt": batch_data[j]["next_line"]}) + "\n") 181 | 182 | if __name__ == "__main__": 183 | fire.Fire(main) 184 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 
50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. 
Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. 
Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. 
You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. 
Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | --------------------------------------------------------------------------------