├── assets
│   ├── repobench_dark.png
│   ├── repobench_logo.png
│   └── repobench_light.png
├── data
│   ├── utils.py
│   └── README.md
├── evaluation
│   └── metrics.py
├── .gitignore
├── requirements.txt
├── eval.py
├── run.py
├── README.md
└── LICENSE
/assets/repobench_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_dark.png
--------------------------------------------------------------------------------
/assets/repobench_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_logo.png
--------------------------------------------------------------------------------
/assets/repobench_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leolty/repobench/HEAD/assets/repobench_light.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__*
2 | cache
3 | results
4 | *temp.html
5 | /data_v1
6 | archive_data/test
7 | archive_data/train
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 | transformers
3 | torch
4 | fuzzywuzzy
5 | tqdm
6 | fire
7 | codebleu
8 | python-Levenshtein
9 | tree-sitter-python
10 | tree-sitter-java
--------------------------------------------------------------------------------
/data/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def construct_prompt(
4 | data: dict,
5 | language: str = "python",
6 | tokenizer= None,
7 | max_token_nums: int = 15800
8 | ) -> str:
9 | """
10 | Construct the prompt for next line prediction.
11 |
12 | :param data: data point from the dataset
13 | :param language: the language of the code
14 | :param tokenizer: the tokenizer of the evaluation model
15 | :param max_token_nums: the maximum number of tokens constraint for the prompt
16 |
17 | :return: the constructed prompt
18 | """
19 |
20 | # comment symbol for different languages
21 | comment_symbol = "#" if language == "python" else "//"
22 |
23 | # construct the cross-file prompt and in-file prompt separately
24 | # cross-file prompt
25 | cross_file_prompt = f"{comment_symbol} Repo Name: {data['repo_name']}\n"
26 |
27 | for snippet in data['context']:
28 | cross_file_prompt += f"{comment_symbol} Path: {snippet['path']}\n{snippet['snippet']}" + "\n\n"
29 |
30 | # in-file prompt
31 | in_file_prompt = f"{comment_symbol} Path: {data['file_path']}\n{data['import_statement']}\n{data['cropped_code'].rstrip()}\n"
32 |
33 | # if we assign the tokenizer and the max_token_nums, we will truncate the cross-file prompt to meet the constraint
34 | if tokenizer is not None and max_token_nums is not None:
35 |
36 | cross_file_prompt_token_nums = len(tokenizer.encode(cross_file_prompt))
37 | in_file_prompt_token_nums = len(tokenizer.encode(in_file_prompt))
38 |
39 | exceed_token_nums = cross_file_prompt_token_nums + in_file_prompt_token_nums - max_token_nums
40 |
41 | if exceed_token_nums > 0:
42 | # split the cross-file prompt into lines
43 | cross_file_prompt_lines = cross_file_prompt.split("\n")
44 | # drop lines from end until the extra token number is less than 0
45 | for i in range(len(cross_file_prompt_lines)-1, -1, -1):
46 | exceed_token_nums -= len(tokenizer.encode(cross_file_prompt_lines[i]))
47 | if exceed_token_nums < 0:
48 | break
49 |
50 | # join the lines back
51 | cross_file_prompt = "\n".join(cross_file_prompt_lines[:i]) + "\n\n"
52 |
53 | # combine the cross-file prompt and in-file prompt
54 | prompt = cross_file_prompt + in_file_prompt
55 |
56 | # normalize some empty lines
57 | prompt = re.sub(r'\n{4,}', '\n\n', prompt)
58 |
59 | return prompt
60 |
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from evaluation.metrics import exact_match_score, edit_similarity_score, codebleu_score
4 | import fire
5 |
6 | def eval(
7 | path="results/deepseek-coder-1.3b-base-python",
8 | language="python" # to calculate codebleu, we need to specify the language
9 | ):
10 |
11 | total_data_points = 0
12 | total_em_model, total_es_model, total_cb_model = 0, 0, 0
13 |
14 | for level in ["cross_file_first", "cross_file_random", "in_file"]:
15 | filepath = os.path.join(path, f"{level}.jsonl")
16 | seen_indices = set() # Track seen indices for the current level
17 |
18 | # check if the file exists
19 | if not os.path.exists(filepath):
20 | print(f"Level: {level} not found for the model")
21 | continue
22 |
23 | with open(filepath, "r") as f:
24 |
25 | data = []
26 | for line in f:
27 | entry = json.loads(line.strip())
28 | idx = entry["idx"]
29 |
30 | # Skip duplicate indices based on the chosen policy (here, keeping the former)
31 | if idx not in seen_indices:
32 | seen_indices.add(idx)
33 | data.append(entry)
34 |
35 | data_points = len(data)
36 |
37 | if data_points == 0:
38 | continue
39 |
40 | ground_truth = [d["gt"] for d in data]
41 | generated = [d["pred"] for d in data]
42 |
43 | em_model = round(exact_match_score(ground_truth, generated) * 100, 2)
44 | es_model = round(edit_similarity_score(ground_truth, generated), 2)
45 | cb_model = round(codebleu_score(generated, ground_truth, language) * 100, 2)
46 |
47 | # accumulate the data points and the metrics
48 | total_data_points += data_points
49 | total_em_model += em_model * data_points
50 | total_es_model += es_model * data_points
51 | total_cb_model += cb_model * data_points
52 |
53 | print(f"Level: {level} with {data_points} data points")
54 | print(f"EM: {em_model}, ES: {es_model}, CB: {cb_model}")
55 | print("-" * 30)
56 |
57 | # calculate the weighted averages
58 | if total_data_points > 0:
59 | avg_em_model = round(total_em_model / total_data_points, 2)
60 | avg_es_model = round(total_es_model / total_data_points, 2)
61 | avg_cb_model = round(total_cb_model / total_data_points, 2)
62 |
63 | print("Weighted Averages:")
64 | print(f"EM: {avg_em_model}, ES: {avg_es_model}, CB: {avg_cb_model}\n")
65 |
66 | else:
67 | print("No data points were found for evaluation.")
68 |
69 | if __name__ == "__main__":
70 | fire.Fire(eval)
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
11 | # RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems
12 |
15 | **ICLR 2024**
16 |
21 | This directory hosts the datasets for subsequent versions of RepoBench. We are committed to updating RepoBench regularly, with updates scheduled **every 3 months**.
22 |
23 | ## 🌇 Overview
24 |
25 | - Our primary focus is on **next-line prediction** tasks to aid in code auto-completion. If your research requires retrieval data, please don't hesitate to reach out to us for collaboration.
26 | - Our datasets will be hosted on 🤗 HuggingFace, making them easily accessible for everyone.
27 | - Each data point within our datasets is categorized based on the prompt length (number of tokens), as determined by OpenAI's GPT-4 tokenizer via [tiktoken](https://github.com/openai/tiktoken). Here's a detailed table illustrating the levels we've defined (see the token-counting sketch after this list):
28 |
29 | | Level | Prompt Length (Number of Tokens) |
30 | |-------|------------------------|
31 | | 2k | 640 - 1,600 |
32 | | 4k | 1,600 - 3,600 |
33 | | 8k | 3,600 - 7,200 |
34 | | 12k | 7,200 - 10,800 |
35 | | 16k | 10,800 - 14,400 |
36 | | 24k | 14,400 - 21,600 |
37 | | 32k | 21,600 - 28,800 |
38 | | 64k | 28,800 - 57,600 |
39 | | 128k | 57,600 - 100,000 |
40 |
41 | - We provide the official implementation for constructing prompts [here](https://github.com/Leolty/repobench/blob/53c1c55ad9e6d97d2b60dd2c9548ed1cd463b6a5/data/utils.py#L3). Please note that this is not necessarily the optimal way to construct prompts: reordering, retrieval augmentation, or different cropping/construction techniques could lead to varying degrees of improvement. Ensure that evaluations are conducted fairly.
42 |
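For reference, here is a minimal sketch of how a prompt could be bucketed into these levels with tiktoken's GPT-4 tokenizer; the `level_of` helper and the hard-coded bounds are illustrative, not necessarily the exact code used to build the dataset:

```python
import tiktoken

# GPT-4 tokenizer via tiktoken, as used to measure prompt length
encoding = tiktoken.encoding_for_model("gpt-4")

# (level, lower bound, upper bound) taken from the table above
LEVELS = [
    ("2k", 640, 1_600), ("4k", 1_600, 3_600), ("8k", 3_600, 7_200),
    ("12k", 7_200, 10_800), ("16k", 10_800, 14_400), ("24k", 14_400, 21_600),
    ("32k", 21_600, 28_800), ("64k", 28_800, 57_600), ("128k", 57_600, 100_000),
]

def level_of(prompt: str):
    """Return the level whose token range contains the prompt, or None."""
    num_tokens = len(encoding.encode(prompt))
    for name, lower, upper in LEVELS:
        if lower <= num_tokens < upper:
            return name
    return None
```
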
43 | ## 📚 Versions
44 |
45 | ### RepoBench v1.1
46 |
47 | RepoBench v1.1 includes data collected from GitHub between **October 6, 2023** and **December 31, 2023**. To mitigate data leakage and memorization issues, we deduplicated the data against the Stack v2 (coming soon) based on file content.
48 |
49 | You can access RepoBench v1.1 at the following links:
50 | - For Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1)
51 | - For Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1)
52 |
53 | Or, you can load the data directly from the HuggingFace Hub using the following code:
54 |
55 | ```python
56 | from datasets import load_dataset
57 |
58 | # Load the Python dataset
59 | python_dataset = load_dataset("tianyang/repobench_python_v1.1")
60 |
61 | # Load the Java dataset
62 | java_dataset = load_dataset("tianyang/repobench_java_v1.1")
63 | ```
64 |
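To reproduce the official prompt construction on a loaded sample, you can call `construct_prompt` from `data/utils.py` directly. A minimal sketch, run from the repository root; the tokenizer, split name, and token budget below are illustrative defaults borrowed from `run.py`:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

from data.utils import construct_prompt

dataset = load_dataset("tianyang/repobench_python_v1.1")
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True
)

# assuming the three settings are exposed as dataset splits
sample = dataset["cross_file_first"][0]

prompt = construct_prompt(
    sample,
    language="python",
    tokenizer=tokenizer,
    max_token_nums=15800,  # cross-file context is truncated to fit this budget
)
print(prompt[:500])
```
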
65 | ### RepoBench v1.2
66 |
67 | *Coming soon...*
68 |
69 | ## 📝 Citation
70 |
71 | If you use RepoBench in your research, please cite the following paper:
72 |
73 | ```bibtex
74 | @misc{liu2023repobench,
75 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems},
76 | author={Tianyang Liu and Canwen Xu and Julian McAuley},
77 | year={2024},
78 | url={https://arxiv.org/abs/2306.03091},
79 | booktitle={International Conference on Learning Representations}
80 | }
81 | ```
82 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
11 | # RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems
12 |
15 | **ICLR 2024**
16 |
21 | ## 🔥 News
22 |
23 | - *Feb 5th, 2024*: **RepoBench v1.1** (with the newest code data) is now available on the 🤗 HuggingFace Hub. You can access the datasets for Python and Java using the following links:
24 | - For Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1)
25 | - For Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1)
26 | > **For more details of RepoBench v1.1, please refer to the [data directory](./data/README.md).**
27 |
28 | - *Jan 16th, 2024*: RepoBench is accepted to ICLR 2024! 🎉
29 |
30 |
31 | ## 🛠️ Installation
32 |
33 | ```bash
34 | git clone https://github.com/Leolty/repobench.git
35 | cd repobench
36 | ```
37 |
38 | > [!NOTE]
39 | > There is a `requirements.txt` file listing the dependencies needed to reproduce the results in the paper; you can install them with `pip install -r requirements.txt`. If you are only interested in the data, you can skip this step.
40 |
41 | ## ⚙️ Description of Settings
42 |
43 | As discussed in the paper, we have three settings for each task:
44 |
45 | - `cross_file_first`: Masks the line where a module from a different file is used for the first time.
46 | - `cross_file_random`: Masks a random line where a module from a different file is used (not the first usage).
47 | - `in_file`: Masks a random line that has no cross-file dependency.
48 |
49 |
50 | ## 📥 Load Data
51 |
52 | ```python
53 | from datasets import load_dataset
54 |
55 | dataset = load_dataset("tianyang/repobench_python_v1.1", ignore_verifications=True)
56 | ```
57 |
58 | For more details, visit the Hugging Face dataset pages:
59 | - Python: [🤗 Repobench Python V1.1](https://huggingface.co/datasets/tianyang/repobench_python_v1.1)
60 | - Java: [🤗 Repobench Java V1.1](https://huggingface.co/datasets/tianyang/repobench_java_v1.1)
61 |
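Each data point carries the repository context needed to build a prompt. A minimal sketch of inspecting the loaded data; the split names below assume the three settings are exposed as dataset splits, as in `run.py` and `eval.py`:

```python
# `dataset` as loaded above
for split in ["cross_file_first", "cross_file_random", "in_file"]:
    print(split, len(dataset[split]))

sample = dataset["cross_file_first"][0]
print(sample["repo_name"], sample["file_path"], sample["level"])
print(sample["cropped_code"][-300:])  # in-file context used in the prompt
print(sample["next_line"])            # the ground-truth next line
```
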
62 | ## 🚀 Running Experiments
63 |
64 | To run experiments on the RepoBench v1.1 dataset, we provide a very basic `run.py` script using the 🤗 Transformers library.
65 |
66 | Example usage:
67 |
68 | ```bash
69 | CUDA_VISIBLE_DEVICES=0 python run.py --model_name "deepseek-ai/deepseek-coder-1.3b-base" \
70 | --dataset_name "tianyang/repobench_python_v1.1" \
71 | --start_date "2023-12-01" \
72 | --end_date "2023-12-31" \
73 | --language "python" \
74 | --max_token_nums 15800 \
75 | --levels "2k" "4k" "8k" "12k" "16k" \
76 | --temperature 0.2 \
77 | --top_p 0.95 \
78 | --max_new_tokens 128 \
79 | --batch_size 1
80 | ```
81 |
82 | For a full list of available parameters, please refer to `run.py`; the script should be easy to customize for your own needs.
83 |
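Each generated sample is appended as one JSON object per line to `results/<model-name>-<language>/<setting>.jsonl`, with `idx`, `level`, `pred`, and `gt` fields (see the end of `run.py`). A minimal sketch of reading back the output of the command above:

```python
import json

with open("results/deepseek-coder-1.3b-base-python/cross_file_first.jsonl") as f:
    for line in f:
        record = json.loads(line)
        print(record["idx"], record["level"])
        print("pred:", record["pred"])
        print("gt:  ", record["gt"])
        break  # only show the first record
```
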
84 | ## 📊 Evaluation
85 |
86 | After generating completions, you can evaluate the results using the `eval.py` script. This script calculates various metrics including Exact Match (EM), Edit Similarity (ES), and CodeBLEU (CB) scores for each setting.
87 |
88 | To run the evaluation:
89 |
90 | ```bash
91 | python eval.py --path "results/deepseek-coder-1.3b-base-python" --language "python"
92 | ```
93 |
94 | The script will output scores for each level (`cross_file_first`, `cross_file_random`, `in_file`) as well as weighted averages across all levels.
95 |
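The underlying metric functions in `evaluation/metrics.py` can also be called directly on your own lists of predictions and ground truths (this requires the dependencies from `requirements.txt`); a minimal sketch with illustrative strings:

```python
from evaluation.metrics import exact_match_score, edit_similarity_score, codebleu_score

preds = ["return x + y", "print(x)"]
gts = ["return x + y", "print(y)"]

em = exact_match_score(preds, gts) * 100  # whitespace-tokenized exact match, scaled to 0-100
es = edit_similarity_score(preds, gts)    # average fuzz.ratio, already on a 0-100 scale
cb = codebleu_score(preds, gts, "python") * 100

print(f"EM: {em:.2f}, ES: {es:.2f}, CB: {cb:.2f}")
```
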
96 | ## 📝 Note
97 |
98 | This branch of the repository is specifically for RepoBench v1.1. For the results presented in our ICLR 2024 paper, which used the initial version of RepoBench, please refer to the [`archive/v0` branch](https://github.com/Leolty/repobench/tree/archive/v0) of this repository.
99 |
100 |
101 | ## 📝 Citation
102 |
103 | If you use RepoBench in your research, please consider citing us:
104 |
105 | ```bibtex
106 | @misc{liu2023repobench,
107 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems},
108 | author={Tianyang Liu and Canwen Xu and Julian McAuley},
109 | year={2024},
110 | url={https://arxiv.org/abs/2306.03091},
111 | booktitle={International Conference on Learning Representations}
112 | }
113 | ```
114 |
--------------------------------------------------------------------------------
/evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | from fuzzywuzzy import fuzz
2 | from codebleu import calc_codebleu
3 |
4 | def exact_match_score(predictions, ground_truths):
5 | """
6 | This function computes the average exact match score between the predicted codes and the ground truth codes.
7 | It returns a float value between 0 and 1 indicating the degree of exact match between the predicted codes
8 | and the ground truth codes, where a value of 1 means all the predicted codes exactly match their corresponding
9 | ground truth codes and a value of 0 means none of the predicted codes exactly match their corresponding
10 | ground truth codes.
11 |
12 | Args:
13 | predictions: list, predicted codes
14 | ground_truths: list, ground truth codes
15 |
16 | Returns:
17 | Float, the average exact match score between the predicted codes and the ground truth codes.
18 | """
19 | if len(predictions) != len(ground_truths):
20 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")
21 |
22 | exact_match = 0
23 | for pred, gt in zip(predictions, ground_truths):
24 | if pred.split() == gt.split():
25 | exact_match += 1
26 |
27 | return round(exact_match / len(predictions), 5)
28 |
29 |
30 |
31 | def edit_similarity_score(predictions, ground_truths):
32 | """
33 | This function computes the average edit similarity score between the predicted codes and the ground truth codes.
34 | It returns a float value between 0 and 1 indicating the degree of similarity between the predicted codes
35 | and the ground truth codes, where a value of 1 means all the predicted codes are identical to their corresponding
36 | ground truth codes and a value of 0 means none of the predicted codes are similar to their corresponding
37 | ground truth codes.
38 |
39 | Args:
40 | predictions: list, predicted codes
41 | ground_truths: list, ground truth codes
42 |
43 | Returns:
44 | Float, the average edit similarity score between the predicted codes and the ground truth codes.
45 | """
46 | if len(predictions) != len(ground_truths):
47 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")
48 |
49 | edit_sim = 0.0
50 | for pred, gt in zip(predictions, ground_truths):
51 | edit_sim += fuzz.ratio(pred, gt)
52 |
53 | return round(edit_sim / len(predictions), 5)
54 |
55 | def accuracy_at_k(prediction_list, golden_index_list, k):
56 | """
57 | This function computes the accuracy at k. It returns a float value between 0 and 1 indicating the
58 | accuracy at k, where a value of 1 means the correct code is retrieved at the top k positions and
59 | a value of 0 means the correct code is not retrieved at the top k positions.
60 |
61 | Args:
62 | prediction_list: list, a list of lists, where each list contains the indices of the retrieved codes.
63 | golden_index_list: list, a list of integers, where each integer is the index of the correct code.
64 | k: int, the number of retrieved codes.
65 |
66 | Returns:
67 | Float, the accuracy at k.
68 | """
69 |
70 | if len(golden_index_list) == 0:
71 | raise ValueError("The list of golden indices should not be empty.")
72 |
73 | assert len(golden_index_list) == len(prediction_list), \
74 | "The length of the golden indices list should be equal to the length of the prediction list, however, " \
75 | f"the length of the golden indices list is {len(golden_index_list)} and the length of the prediction list is {len(prediction_list)}."
76 |
77 |
78 | acc = 0
79 |
80 | for i in range(len(prediction_list)):
81 | golden_index = golden_index_list[i]
82 | index_list = prediction_list[i]
83 |
84 | if len(index_list) < k:
85 | raise ValueError("The number of retrieved codes should be greater than k.")
86 |
87 | top_k_indices = index_list[:k]
88 |
89 | if golden_index not in top_k_indices:
90 | continue
91 | else:
92 | acc += 1
93 |
94 | return round(acc / len(golden_index_list), 5)
95 |
96 | def codebleu_score(predictions, ground_truths, language, weight=[0.25, 0.25, 0.25, 0.25]):
97 |
98 | """
99 | This function computes the average codebleu score between the predicted codes and the ground truth codes.
100 | It returns a float value between 0 and 1 indicating the degree of similarity between the predicted codes
101 | and the ground truth codes, where a value of 1 means all the predicted codes are identical to their corresponding
102 | ground truth codes and a value of 0 means none of the predicted codes are similar to their corresponding
103 | ground truth codes.
104 |
105 | Args:
106 | predictions: list, predicted codes
107 | ground_truths: list, ground truth codes
108 | language: str, the programming language of the codes
109 | weight: list, the weights for each n-gram
110 |
111 | Returns:
112 | Float, the average codebleu score between the predicted codes and the ground truth codes.
113 | """
114 | if len(predictions) != len(ground_truths):
115 | raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")
116 |
117 | # remove \r for both pred and gt
118 | predictions = [pred.replace("\r", "") for pred in predictions]
119 | ground_truths = [gt.replace("\r", "") for gt in ground_truths]
120 |
121 | res_list = calc_codebleu(
122 | ground_truths,
123 | predictions,
124 | language,
125 | weight,
126 | tokenizer=None
127 | )
128 |
129 | return res_list['codebleu']
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import fire
3 | import json
4 | from tqdm import tqdm
5 | from datasets import load_dataset
6 | from transformers import AutoTokenizer, AutoModelForCausalLM
7 | from datasets import DatasetDict, Dataset
8 | import pandas as pd
9 | from data.utils import construct_prompt
10 |
11 | # get first line that is not a comment
12 | def get_first_line_not_comment(code:str, language:str="python"):
13 | """
14 | This function gets the first line of code that is not a comment.
15 |
16 | Args:
17 | code: Str, the code
18 |
19 | Returns:
20 | Str, the first line of code that is not a comment or the first line of code if there is no line that is not a comment
21 | """
22 |
23 | # check if the language is valid
24 | assert language in ["python", "java"], "language must be one of [python, java]"
25 |
26 |
27 | # first remove the \n at the beginning of the code
28 | code = code.lstrip('\n')
29 |
30 | lines = code.split('\n')
31 | in_multiline_comment = False
32 |
33 | if language == "python":
34 | for line in lines:
35 | # if the line is empty, then skip
36 | if not line.strip():
37 | continue
38 | # if the line is a start of a multiline comment, then set the in_multiline_comment to True and skip
39 | if not in_multiline_comment and (line.strip().startswith('"""') or line.strip().startswith("'''")):
40 | in_multiline_comment = True
41 | continue
42 | # if the line is the end of a multiline comment, then set the in_multiline_comment to False and skip
43 | if in_multiline_comment and (line.strip().endswith('"""') or line.strip().endswith("'''")):
44 | in_multiline_comment = False
45 | continue
46 | # if the line is in a multiline comment, then skip
47 | if in_multiline_comment:
48 | continue
49 | # if the line is a single line comment, then skip
50 | if line.strip().startswith('#'):
51 | continue
52 | # if the line is not a comment, then return the line
53 | return line
54 |
55 | elif language == "java":
56 | for line in lines:
57 | # if the line is empty, then skip
58 | if not line.strip():
59 | continue
60 | # if the line is a start of a multiline comment, then set the in_multiline_comment to True and skip
61 | if not in_multiline_comment and line.strip().startswith('/*'):
62 | in_multiline_comment = True
63 | continue
64 | # if the line is the end of a multiline comment, then set the in_multiline_comment to False and skip
65 | if in_multiline_comment and line.strip().endswith('*/'):
66 | in_multiline_comment = False
67 | continue
68 | # if the line is in a multiline comment, then skip
69 | if in_multiline_comment:
70 | continue
71 | # if the line is a single line comment, then skip
72 | if line.strip().startswith('//'):
73 | continue
74 | # if the line is not a comment, then return the line
75 | return line
76 |
77 |
78 | # if we cannot find a line that is not a comment, then return the first line
79 | return lines[0]
80 |
81 | def filter_dataset_by_date_range(dataset: DatasetDict, start_date: str, end_date: str) -> DatasetDict:
82 | """
83 | Filters a Huggingface dataset by a specific date range.
84 |
85 | Parameters:
86 | dataset (DatasetDict): The input dataset with subsets containing a 'created_at' column.
87 | start_date (str): The start date in the format 'YYYY-MM-DD'.
88 | end_date (str): The end date in the format 'YYYY-MM-DD'.
89 |
90 | Returns:
91 | DatasetDict: The filtered dataset.
92 | """
93 | start_date = pd.to_datetime(start_date).tz_localize('UTC')
94 | end_date = pd.to_datetime(end_date).tz_localize('UTC')
95 |
96 | filtered_dataset_dict = {}
97 |
98 | for subset_name in dataset.keys():
99 | df = pd.DataFrame(dataset[subset_name])
100 | df['created_at'] = pd.to_datetime(df['created_at'])
101 |
102 | # Filter the DataFrame
103 | mask = (df['created_at'] >= start_date) & (df['created_at'] <= end_date)
104 | filtered_df = df[mask]
105 |
106 | # Convert back to Huggingface Dataset
107 | filtered_dataset_dict[subset_name] = Dataset.from_pandas(filtered_df)
108 |
109 | return DatasetDict(filtered_dataset_dict)
110 |
111 | def filter_dataset_by_levels(dataset: DatasetDict, levels: list) -> DatasetDict:
112 | """
113 | Filters a Huggingface dataset by specific levels.
114 |
115 | Parameters:
116 | dataset (DatasetDict): The input dataset with subsets containing a 'level' column.
117 | levels (list): The list of levels to filter by.
118 |
119 | Returns:
120 | DatasetDict: The filtered dataset.
121 | """
122 | filtered_dataset_dict = {}
123 |
124 | for subset_name in dataset.keys():
125 | # Filter the subset directly using the 'filter' method
126 | filtered_subset = dataset[subset_name].filter(lambda example: example['level'] in levels)
127 | filtered_dataset_dict[subset_name] = filtered_subset
128 |
129 | return DatasetDict(filtered_dataset_dict)
130 |
131 | def main(
132 | model_name: str = "deepseek-ai/deepseek-coder-1.3b-base",
133 | dataset_name: str = "tianyang/repobench_python_v1.1",
134 | start_date: str = "2023-12-01", # YYYY-MM-DD
135 | end_date: str = "2023-12-31", # YYYY-MM-DD
136 | max_token_nums: int = 15800, # max token number for the prompt, adjust according to the model
137 | levels = ["2k", "4k", "8k", "12k", "16k"], # 24k, 32k, 64k and 128k are also available, but the number of them is limited
138 | language: str = "python",
139 | temperature: float = 0.2,
140 | top_p: float = 0.95,
141 | max_new_tokens: int = 128, # max number of tokens to generate
142 | batch_size: int = 1,
143 | res_dir: str = "./results"
144 | ):
145 |
146 | # Load the dataset
147 | dataset = load_dataset(dataset_name, ignore_verifications=True)
148 |
149 | # Filter the dataset by date range
150 | dataset = filter_dataset_by_date_range(dataset, start_date, end_date)
151 |
152 | # Filter the dataset by levels
153 | dataset = filter_dataset_by_levels(dataset, levels)
154 |
155 | # Load the model and tokenizer
156 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
157 | tokenizer.padding_side = "left"
158 | tokenizer.pad_token_id = tokenizer.eos_token_id
159 |
160 | model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda()
161 | model.generation_config.pad_token_id = tokenizer.pad_token_id
162 |
163 | # Create the save directory
164 | save_dir = f"{res_dir}/{model_name.split('/')[-1]}-{language}"
165 | os.makedirs(save_dir, exist_ok=True)
166 |
167 | for subset, data in dataset.items():
168 | for i in tqdm(range(0, len(data), batch_size), desc=f"Generating {subset}"):
169 | batch_data = [data[j] for j in range(i, min(i + batch_size, len(data)))]
170 | batch_prompts = [construct_prompt(d, tokenizer=tokenizer, max_token_nums=max_token_nums, language=language) for d in batch_data]
171 |
172 | batch_inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda")
173 | batch_outputs = model.generate(**batch_inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=True)
174 |
175 | for j, outputs in enumerate(batch_outputs):
176 | result = tokenizer.decode(outputs[batch_inputs["input_ids"][j].shape[-1]:], skip_special_tokens=True)
177 | result = get_first_line_not_comment(result, language=language)
178 |
179 | with open(f"{save_dir}/{subset}.jsonl", "a") as f_out:
180 | f_out.write(json.dumps({"idx": i + j, "level": batch_data[j]["level"], "pred": result, "gt": batch_data[j]["next_line"]}) + "\n")
181 |
182 | if __name__ == "__main__":
183 | fire.Fire(main)
184 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More_considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution 4.0 International Public License
58 |
59 | By exercising the Licensed Rights (defined below), You accept and agree
60 | to be bound by the terms and conditions of this Creative Commons
61 | Attribution 4.0 International Public License ("Public License"). To the
62 | extent this Public License may be interpreted as a contract, You are
63 | granted the Licensed Rights in consideration of Your acceptance of
64 | these terms and conditions, and the Licensor grants You such rights in
65 | consideration of benefits the Licensor receives from making the
66 | Licensed Material available under these terms and conditions.
67 |
68 |
69 | Section 1 -- Definitions.
70 |
71 | a. Adapted Material means material subject to Copyright and Similar
72 | Rights that is derived from or based upon the Licensed Material
73 | and in which the Licensed Material is translated, altered,
74 | arranged, transformed, or otherwise modified in a manner requiring
75 | permission under the Copyright and Similar Rights held by the
76 | Licensor. For purposes of this Public License, where the Licensed
77 | Material is a musical work, performance, or sound recording,
78 | Adapted Material is always produced where the Licensed Material is
79 | synched in timed relation with a moving image.
80 |
81 | b. Adapter's License means the license You apply to Your Copyright
82 | and Similar Rights in Your contributions to Adapted Material in
83 | accordance with the terms and conditions of this Public License.
84 |
85 | c. Copyright and Similar Rights means copyright and/or similar rights
86 | closely related to copyright including, without limitation,
87 | performance, broadcast, sound recording, and Sui Generis Database
88 | Rights, without regard to how the rights are labeled or
89 | categorized. For purposes of this Public License, the rights
90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
91 | Rights.
92 |
93 | d. Effective Technological Measures means those measures that, in the
94 | absence of proper authority, may not be circumvented under laws
95 | fulfilling obligations under Article 11 of the WIPO Copyright
96 | Treaty adopted on December 20, 1996, and/or similar international
97 | agreements.
98 |
99 | e. Exceptions and Limitations means fair use, fair dealing, and/or
100 | any other exception or limitation to Copyright and Similar Rights
101 | that applies to Your use of the Licensed Material.
102 |
103 | f. Licensed Material means the artistic or literary work, database,
104 | or other material to which the Licensor applied this Public
105 | License.
106 |
107 | g. Licensed Rights means the rights granted to You subject to the
108 | terms and conditions of this Public License, which are limited to
109 | all Copyright and Similar Rights that apply to Your use of the
110 | Licensed Material and that the Licensor has authority to license.
111 |
112 | h. Licensor means the individual(s) or entity(ies) granting rights
113 | under this Public License.
114 |
115 | i. Share means to provide material to the public by any means or
116 | process that requires permission under the Licensed Rights, such
117 | as reproduction, public display, public performance, distribution,
118 | dissemination, communication, or importation, and to make material
119 | available to the public including in ways that members of the
120 | public may access the material from a place and at a time
121 | individually chosen by them.
122 |
123 | j. Sui Generis Database Rights means rights other than copyright
124 | resulting from Directive 96/9/EC of the European Parliament and of
125 | the Council of 11 March 1996 on the legal protection of databases,
126 | as amended and/or succeeded, as well as other essentially
127 | equivalent rights anywhere in the world.
128 |
129 | k. You means the individual or entity exercising the Licensed Rights
130 | under this Public License. Your has a corresponding meaning.
131 |
132 |
133 | Section 2 -- Scope.
134 |
135 | a. License grant.
136 |
137 | 1. Subject to the terms and conditions of this Public License,
138 | the Licensor hereby grants You a worldwide, royalty-free,
139 | non-sublicensable, non-exclusive, irrevocable license to
140 | exercise the Licensed Rights in the Licensed Material to:
141 |
142 | a. reproduce and Share the Licensed Material, in whole or
143 | in part; and
144 |
145 | b. produce, reproduce, and Share Adapted Material.
146 |
147 | 2. Exceptions and Limitations. For the avoidance of doubt, where
148 | Exceptions and Limitations apply to Your use, this Public
149 | License does not apply, and You do not need to comply with
150 | its terms and conditions.
151 |
152 | 3. Term. The term of this Public License is specified in Section
153 | 6(a).
154 |
155 | 4. Media and formats; technical modifications allowed. The
156 | Licensor authorizes You to exercise the Licensed Rights in
157 | all media and formats whether now known or hereafter created,
158 | and to make technical modifications necessary to do so. The
159 | Licensor waives and/or agrees not to assert any right or
160 | authority to forbid You from making technical modifications
161 | necessary to exercise the Licensed Rights, including
162 | technical modifications necessary to circumvent Effective
163 | Technological Measures. For purposes of this Public License,
164 | simply making modifications authorized by this Section 2(a)
165 | (4) never produces Adapted Material.
166 |
167 | 5. Downstream recipients.
168 |
169 | a. Offer from the Licensor -- Licensed Material. Every
170 | recipient of the Licensed Material automatically
171 | receives an offer from the Licensor to exercise the
172 | Licensed Rights under the terms and conditions of this
173 | Public License.
174 |
175 | b. No downstream restrictions. You may not offer or impose
176 | any additional or different terms or conditions on, or
177 | apply any Effective Technological Measures to, the
178 | Licensed Material if doing so restricts exercise of the
179 | Licensed Rights by any recipient of the Licensed
180 | Material.
181 |
182 | 6. No endorsement. Nothing in this Public License constitutes or
183 | may be construed as permission to assert or imply that You
184 | are, or that Your use of the Licensed Material is, connected
185 | with, or sponsored, endorsed, or granted official status by,
186 | the Licensor or others designated to receive attribution as
187 | provided in Section 3(a)(1)(A)(i).
188 |
189 | b. Other rights.
190 |
191 | 1. Moral rights, such as the right of integrity, are not
192 | licensed under this Public License, nor are publicity,
193 | privacy, and/or other similar personality rights; however, to
194 | the extent possible, the Licensor waives and/or agrees not to
195 | assert any such rights held by the Licensor to the limited
196 | extent necessary to allow You to exercise the Licensed
197 | Rights, but not otherwise.
198 |
199 | 2. Patent and trademark rights are not licensed under this
200 | Public License.
201 |
202 | 3. To the extent possible, the Licensor waives any right to
203 | collect royalties from You for the exercise of the Licensed
204 | Rights, whether directly or through a collecting society
205 | under any voluntary or waivable statutory or compulsory
206 | licensing scheme. In all other cases the Licensor expressly
207 | reserves any right to collect such royalties.
208 |
209 |
210 | Section 3 -- License Conditions.
211 |
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 |
215 | a. Attribution.
216 |
217 | 1. If You Share the Licensed Material (including in modified
218 | form), You must:
219 |
220 | a. retain the following if it is supplied by the Licensor
221 | with the Licensed Material:
222 |
223 | i. identification of the creator(s) of the Licensed
224 | Material and any others designated to receive
225 | attribution, in any reasonable manner requested by
226 | the Licensor (including by pseudonym if
227 | designated);
228 |
229 | ii. a copyright notice;
230 |
231 | iii. a notice that refers to this Public License;
232 |
233 | iv. a notice that refers to the disclaimer of
234 | warranties;
235 |
236 | v. a URI or hyperlink to the Licensed Material to the
237 | extent reasonably practicable;
238 |
239 | b. indicate if You modified the Licensed Material and
240 | retain an indication of any previous modifications; and
241 |
242 | c. indicate the Licensed Material is licensed under this
243 | Public License, and include the text of, or the URI or
244 | hyperlink to, this Public License.
245 |
246 | 2. You may satisfy the conditions in Section 3(a)(1) in any
247 | reasonable manner based on the medium, means, and context in
248 | which You Share the Licensed Material. For example, it may be
249 | reasonable to satisfy the conditions by providing a URI or
250 | hyperlink to a resource that includes the required
251 | information.
252 |
253 | 3. If requested by the Licensor, You must remove any of the
254 | information required by Section 3(a)(1)(A) to the extent
255 | reasonably practicable.
256 |
257 | 4. If You Share Adapted Material You produce, the Adapter's
258 | License You apply must not prevent recipients of the Adapted
259 | Material from complying with this Public License.
260 |
261 |
262 | Section 4 -- Sui Generis Database Rights.
263 |
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 |
267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 | to extract, reuse, reproduce, and Share all or a substantial
269 | portion of the contents of the database;
270 |
271 | b. if You include all or a substantial portion of the database
272 | contents in a database in which You have Sui Generis Database
273 | Rights, then the database in which You have Sui Generis Database
274 | Rights (but not its individual contents) is Adapted Material; and
275 |
276 | c. You must comply with the conditions in Section 3(a) if You Share
277 | all or a substantial portion of the contents of the database.
278 |
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 |
283 |
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 |
286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 |
297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 |
307 | c. The disclaimer of warranties and limitation of liability provided
308 | above shall be interpreted in a manner that, to the extent
309 | possible, most closely approximates an absolute disclaimer and
310 | waiver of all liability.
311 |
312 |
313 | Section 6 -- Term and Termination.
314 |
315 | a. This Public License applies for the term of the Copyright and
316 | Similar Rights licensed here. However, if You fail to comply with
317 | this Public License, then Your rights under this Public License
318 | terminate automatically.
319 |
320 | b. Where Your right to use the Licensed Material has terminated under
321 | Section 6(a), it reinstates:
322 |
323 | 1. automatically as of the date the violation is cured, provided
324 | it is cured within 30 days of Your discovery of the
325 | violation; or
326 |
327 | 2. upon express reinstatement by the Licensor.
328 |
329 | For the avoidance of doubt, this Section 6(b) does not affect any
330 | right the Licensor may have to seek remedies for Your violations
331 | of this Public License.
332 |
333 | c. For the avoidance of doubt, the Licensor may also offer the
334 | Licensed Material under separate terms or conditions or stop
335 | distributing the Licensed Material at any time; however, doing so
336 | will not terminate this Public License.
337 |
338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 | License.
340 |
341 |
342 | Section 7 -- Other Terms and Conditions.
343 |
344 | a. The Licensor shall not be bound by any additional or different
345 | terms or conditions communicated by You unless expressly agreed.
346 |
347 | b. Any arrangements, understandings, or agreements regarding the
348 | Licensed Material not stated herein are separate from and
349 | independent of the terms and conditions of this Public License.
350 |
351 |
352 | Section 8 -- Interpretation.
353 |
354 | a. For the avoidance of doubt, this Public License does not, and
355 | shall not be interpreted to, reduce, limit, restrict, or impose
356 | conditions on any use of the Licensed Material that could lawfully
357 | be made without permission under this Public License.
358 |
359 | b. To the extent possible, if any provision of this Public License is
360 | deemed unenforceable, it shall be automatically reformed to the
361 | minimum extent necessary to make it enforceable. If the provision
362 | cannot be reformed, it shall be severed from this Public License
363 | without affecting the enforceability of the remaining terms and
364 | conditions.
365 |
366 | c. No term or condition of this Public License will be waived and no
367 | failure to comply consented to unless expressly agreed to by the
368 | Licensor.
369 |
370 | d. Nothing in this Public License constitutes or may be interpreted
371 | as a limitation upon, or waiver of, any privileges and immunities
372 | that apply to the Licensor or You, including from the legal
373 | processes of any jurisdiction or authority.
374 |
375 |
376 | =======================================================================
377 |
378 | Creative Commons is not a party to its public
379 | licenses. Notwithstanding, Creative Commons may elect to apply one of
380 | its public licenses to material it publishes and in those instances
381 | will be considered the “Licensor.” The text of the Creative Commons
382 | public licenses is dedicated to the public domain under the CC0 Public
383 | Domain Dedication. Except for the limited purpose of indicating that
384 | material is shared under a Creative Commons public license or as
385 | otherwise permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the
393 | public licenses.
394 |
395 | Creative Commons may be contacted at creativecommons.org.
396 |
--------------------------------------------------------------------------------