├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CodeBERT ├── code2nl │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── codesearch │ ├── README.md │ ├── mrr.py │ ├── process_data.py │ ├── run_classifier.py │ └── utils.py ├── CodeExecutor ├── README.md ├── downstream │ ├── model_unixcoder.py │ ├── run.py │ └── run.sh ├── inference │ ├── dataset.py │ ├── metric.py │ ├── model.py │ ├── run.py │ └── run.sh └── pretrain │ ├── dataset.py │ ├── model.py │ ├── run.py │ └── run.sh ├── CodeReviewer ├── README.md └── code │ ├── bleu.py │ ├── configs.py │ ├── evaluator │ ├── CodeBLEU │ │ ├── bleu.py │ │ ├── calc_code_bleu.py │ │ ├── dataflow_match.py │ │ ├── keywords │ │ │ ├── c_sharp.txt │ │ │ └── java.txt │ │ ├── parser │ │ │ ├── DFG.py │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── build.sh │ │ │ └── utils.py │ │ ├── readme.txt │ │ ├── syntax_match.py │ │ ├── utils.py │ │ └── weighted_ngram_match.py │ ├── bleu.py │ ├── smooth_bleu.py │ └── stopwords.txt │ ├── models.py │ ├── run_finetune_cls.py │ ├── run_finetune_msg.py │ ├── run_finetune_ref.py │ ├── run_infer_msg.py │ ├── run_test_cls.py │ ├── run_test_msg.py │ ├── run_test_ref.py │ ├── sh │ ├── finetune-cls.sh │ ├── finetune-msg.sh │ ├── finetune-ref.sh │ ├── infer-json.sh │ ├── test-cls.sh │ ├── test-msg.sh │ ├── test-ref.sh │ └── test_nltk.sh │ ├── test_model.py │ └── utils.py ├── GraphCodeBERT ├── clonedetection │ ├── README.md │ ├── dataset.zip │ ├── evaluator │ │ ├── answers.txt │ │ ├── evaluator.py │ │ └── predictions.txt │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── codesearch │ ├── README.md │ ├── dataset.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── refinement │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py └── translation │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ ├── DFG.py │ ├── __init__.py │ ├── build.py │ ├── build.sh │ ├── my-languages.so │ └── utils.py │ └── run.py ├── LICENSE ├── LongCoder ├── README.md ├── longcoder.py ├── model.py ├── parser │ ├── DFG.py │ ├── __init__.py │ ├── build.py │ ├── build.sh │ ├── my-languages.so │ └── utils.py ├── run.py └── run.sh ├── NOTICE.md ├── README.md ├── SECURITY.md └── UniXcoder ├── README.md ├── downstream-tasks ├── clone-detection │ ├── BCB │ │ ├── README.md │ │ ├── model.py │ │ ├── run.py │ │ └── run.sh │ └── POJ-104 │ │ ├── README.md │ │ ├── dataset │ │ └── preprocess.py │ │ ├── model.py │ │ └── run.py ├── code-completion │ ├── README.md │ ├── dataset.zip │ ├── model.py │ └── run.py ├── code-generation │ ├── README.md │ ├── bleu.py │ ├── model.py │ ├── run.py │ └── run.sh ├── code-search │ ├── README.md │ ├── model.py │ └── run.py ├── code-summarization │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── zero-shot-search │ ├── README.md │ ├── dataset │ ├── java.jsonl │ ├── preprocess.py │ ├── python.jsonl │ └── ruby.jsonl │ ├── model.py │ └── run.py └── unixcoder.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | 
develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
-------------------------------------------------------------------------------- /CodeBERT/code2nl/README.md: --------------------------------------------------------------------------------
# Code Documentation Generation

This repo provides the code for reproducing the experiments on the [CodeSearchNet](https://arxiv.org/abs/1909.09436) dataset for the code documentation generation task in six programming languages.

**!News: We release a new pipeline for this task. The new pipeline needs only two P100 GPUs and less training time for Code Documentation Generation. Please refer to the [website](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text).**

## Dependency

- pip install torch==1.4.0
- pip install transformers==2.5.0
- pip install filelock

## Data Preprocess

We clean the CodeSearchNet dataset for this task with the following steps:

- Remove comments in the code.
- Remove examples whose code cannot be parsed into an abstract syntax tree.
- Remove examples whose documentation is shorter than 3 tokens or longer than 256 tokens.
- Remove examples whose documentation contains special tokens (e.g. `<img ...>` or `https:...`).
- Remove examples whose documentation is not written in English.

Data statistics for the cleaned dataset are shown in the table below. We release the cleaned dataset on this [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h).

| PL         | Training | Dev    | Test   |
| :--------- | :------: | :----: | :----: |
| Python     | 251,820  | 13,914 | 14,918 |
| PHP        | 241,241  | 12,982 | 14,014 |
| Go         | 167,288  | 7,325  | 8,122  |
| Java       | 164,923  | 5,183  | 10,955 |
| JavaScript | 58,025   | 3,885  | 3,291  |
| Ruby       | 24,927   | 1,400  | 1,261  |

## Data Download

You can download the dataset from the [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h), or use the following commands.

```shell
pip install gdown
mkdir data data/code2nl
cd data/code2nl
gdown https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h
unzip Cleaned_CodeSearchNet.zip
rm Cleaned_CodeSearchNet.zip
cd ../..
```

## Fine-Tune

We fine-tuned the model on 4*P40 GPUs.
```shell
cd code2nl

lang=php #programming language
lr=5e-5
batch_size=64
beam_size=10
source_length=256
target_length=128
data_dir=../data/code2nl/CodeSearchNet
output_dir=model/$lang
train_file=$data_dir/$lang/train.jsonl
dev_file=$data_dir/$lang/valid.jsonl
eval_steps=1000 #400 for ruby, 600 for javascript, 1000 for others
train_steps=50000 #20000 for ruby, 30000 for javascript, 50000 for others
pretrained_model=microsoft/codebert-base #Roberta: roberta-base

python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --train_steps $train_steps --eval_steps $eval_steps
```

## Inference and Evaluation

After fine-tuning, inference and evaluation are as follows:

```shell
lang=php #programming language
beam_size=10
batch_size=128
source_length=256
target_length=128
output_dir=model/$lang
data_dir=../data/code2nl/CodeSearchNet
dev_file=$data_dir/$lang/valid.jsonl
test_file=$data_dir/$lang/test.jsonl
test_model=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test

python run.py --do_test --model_type roberta --model_name_or_path microsoft/codebert-base --load_model_path $test_model --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size
```

The results on CodeSearchNet are shown in the table below:

| Model       | Ruby      | Javascript | Go        | Python    | Java      | PHP       | Overall   |
| ----------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
| Seq2Seq     | 9.64      | 10.21      | 13.98     | 15.93     | 15.09     | 21.08     | 14.32     |
| Transformer | 11.18     | 11.59      | 16.38     | 15.81     | 16.26     | 22.12     | 15.56     |
| RoBERTa     | 11.17     | 11.90      | 17.72     | 18.14     | 16.47     | 24.02     | 16.57     |
| CodeBERT    | **12.16** | **14.90**  | **18.07** | **19.06** | **17.65** | **25.16** | **17.83** |
-------------------------------------------------------------------------------- /CodeBERT/codesearch/README.md: --------------------------------------------------------------------------------
# Code Search

## Data Preprocess

Both the training and validation datasets are balanced between positive and negative samples; negative samples are constructed by randomly replacing either the NL (docstring) or the PL (code) half of a pair, in equal proportion.

We follow the official evaluation metric and calculate the Mean Reciprocal Rank (MRR) for each pair of test data (c, w) over a fixed set of 999 distractor codes.

You can use the following commands to download the preprocessed training and validation datasets and to preprocess the test dataset yourself. The preprocessed test dataset is very large, so only the preprocessing script is provided.

```shell
mkdir data data/codesearch
cd data/codesearch
gdown https://drive.google.com/uc?id=1xgSR34XO8xXZg4cZScDYj2eGerBE9iGo
unzip codesearch_data.zip
rm codesearch_data.zip
cd ../../codesearch
python process_data.py
cd ..
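# note: process_data.py (shown below) writes the test set as one file per
# batch of 1000 examples under ../data/codesearch/test/<lang>/, each line
# holding the fields joined by the <CODESPLIT> separator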
```

## Fine-Tune
We fine-tuned the model on 2*P100 GPUs.
```shell
cd codesearch

lang=php #fine-tuning a language-specific model for each programming language
pretrained_model=microsoft/codebert-base #Roberta: roberta-base

python run_classifier.py \
--model_type roberta \
--task_name codesearch \
--do_train \
--do_eval \
--eval_all_checkpoints \
--train_file train.txt \
--dev_file valid.txt \
--max_seq_length 200 \
--per_gpu_train_batch_size 32 \
--per_gpu_eval_batch_size 32 \
--learning_rate 1e-5 \
--num_train_epochs 8 \
--gradient_accumulation_steps 1 \
--overwrite_output_dir \
--data_dir ../data/codesearch/train_valid/$lang \
--output_dir ./models/$lang \
--model_name_or_path $pretrained_model
```
## Inference and Evaluation

Inference
```shell
lang=php #programming language
idx=0 #test batch idx

python run_classifier.py \
--model_type roberta \
--model_name_or_path microsoft/codebert-base \
--task_name codesearch \
--do_predict \
--output_dir ./models/$lang \
--data_dir ../data/codesearch/test/$lang \
--max_seq_length 200 \
--per_gpu_train_batch_size 32 \
--per_gpu_eval_batch_size 32 \
--learning_rate 1e-5 \
--num_train_epochs 8 \
--test_file batch_${idx}.txt \
--pred_model_dir ./models/$lang/checkpoint-best/ \
--test_result_dir ./results/$lang/${idx}_batch_result.txt
```

Evaluation
```shell
python mrr.py
```
-------------------------------------------------------------------------------- /CodeBERT/codesearch/mrr.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import numpy as np
from more_itertools import chunked
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_batch_size', type=int, default=1000)
    args = parser.parse_args()
    languages = ['ruby', 'go', 'php', 'python', 'java', 'javascript']
    MRR_dict = {}
    for language in languages:
        file_dir = './results/{}'.format(language)
        ranks = []
        num_batch = 0
        for file in sorted(os.listdir(file_dir)):
            print(os.path.join(file_dir, file))
            with open(os.path.join(file_dir, file), encoding='utf-8') as f:
                batched_data = chunked(f.readlines(), args.test_batch_size)
                for batch_idx, batch_data in enumerate(batched_data):
                    num_batch += 1
                    correct_score = float(batch_data[batch_idx].strip().split('<CODESPLIT>')[-1])
                    scores = np.array([float(data.strip().split('<CODESPLIT>')[-1]) for data in batch_data])
                    rank = np.sum(scores >= correct_score)
                    ranks.append(rank)

        mean_mrr = np.mean(1.0 / np.array(ranks))
        print("{} mrr: {}".format(language, mean_mrr))
        MRR_dict[language] = mean_mrr
    for key, val in MRR_dict.items():
        print("{} mrr: {}".format(key, val))


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /CodeBERT/codesearch/process_data.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
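# Builds the fixed-size MRR test batches described in the README: within each
# batch of 1000 examples, every docstring is paired with all 1000 code
# snippets (1 correct + 999 distractors), one <CODESPLIT>-joined pair per line.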
import gzip
import os
import json
import numpy as np
from more_itertools import chunked

DATA_DIR = '../data/codesearch'

def format_str(string):
    for char in ['\r\n', '\r', '\n']:
        string = string.replace(char, ' ')
    return string


def preprocess_test_data(language, test_batch_size=1000):
    path = os.path.join(DATA_DIR, '{}_test_0.jsonl.gz'.format(language))
    print(path)
    with gzip.open(path, 'r') as pf:
        data = pf.readlines()

    idxs = np.arange(len(data))
    data = np.array(data, dtype=object)

    np.random.seed(0)  # set random seed so that random things are reproducible
    np.random.shuffle(idxs)
    data = data[idxs]
    batched_data = chunked(data, test_batch_size)

    print("start processing")
    for batch_idx, batch_data in enumerate(batched_data):
        if len(batch_data) < test_batch_size:
            break  # the last batch is smaller than the others, exclude.
        examples = []
        for d_idx, d in enumerate(batch_data):
            line_a = json.loads(str(d, encoding='utf-8'))
            doc_token = ' '.join(line_a['docstring_tokens'])
            for dd in batch_data:
                line_b = json.loads(str(dd, encoding='utf-8'))
                code_token = ' '.join([format_str(token) for token in line_b['code_tokens']])

                example = (str(1), line_a['url'], line_b['url'], doc_token, code_token)
                example = '<CODESPLIT>'.join(example)
                examples.append(example)

        data_path = os.path.join(DATA_DIR, 'test/{}'.format(language))
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        file_path = os.path.join(data_path, 'batch_{}.txt'.format(batch_idx))
        print(file_path)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines('\n'.join(examples))

if __name__ == '__main__':
    languages = ['go', 'php', 'python', 'java', 'javascript', 'ruby']
    for lang in languages:
        preprocess_test_data(lang)
-------------------------------------------------------------------------------- /CodeExecutor/downstream/model_unixcoder.py: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
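# Bi-encoder used for code-to-code search: one shared encoder embeds either a
# code snippet or an NL query; token embeddings are mean-pooled over
# non-padding positions (pad token id 1, RoBERTa-style) and L2-normalized, so
# similarity between two inputs reduces to a dot product.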
3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None, cls=False): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs * code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(1)[:,None] 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs * nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | -------------------------------------------------------------------------------- /CodeExecutor/downstream/run.sh: -------------------------------------------------------------------------------- 1 | source_lang=python 2 | target_lang=python 3 | python run.py \ 4 | --model_name_or_path microsoft/unixcoder-base \ 5 | --query_data_file ../data/code_to_code_search_test.json \ 6 | --candidate_data_file ../data/code_to_code_search_test.json \ 7 | --trace_file ../saved_models/code_to_code_search/preds.txt \ 8 | --query_lang ${source_lang} \ 9 | --candidate_lang ${target_lang} \ 10 | --code_length 512 \ 11 | --eval_batch_size 256 12 | -------------------------------------------------------------------------------- /CodeExecutor/inference/dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.utils.data import Dataset 4 | import os 5 | import pickle 6 | import logging 7 | import json 8 | from tqdm import tqdm 9 | 10 | 11 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 12 | """Truncates a sequence pair in place to the maximum length.""" 13 | while True: 14 | total_length = len(tokens_a) + len(tokens_b) 15 | if total_length <= max_length: 16 | break 17 | if len(tokens_a) > len(tokens_b): 18 | tokens_a.pop() 19 | else: 20 | tokens_b.pop() 21 | 22 | def _truncate_seq_pair_two_length(tokens_a, tokens_b, max_length_a, max_length_b): 23 | """Truncates a sequence pair in place to the maximum length.""" 24 | while True: 25 | total_length = len(tokens_a) + len(tokens_b) 26 | if total_length <= max_length_a + max_length_b: 27 | break 28 | if len(tokens_b) > max_length_b: 29 | tokens_b.pop() 30 | else: # len(tokens_a) > max_length_a 31 | tokens_a.pop() 32 | 33 | class InputFeatures(object): 34 | """A single training/test features for a example.""" 35 | def __init__(self, 36 | code_tokens, 37 | trace_tokens 38 | 39 | ): 40 | self.code_tokens = code_tokens 41 | self.trace_tokens = trace_tokens 42 | 43 | def convert_examples_to_features(item): 44 | # parsing 45 | js,tokenizer=item 46 | code_tokens = js["code_tokens"] 47 | trace_tokens = js["trace_tokens"] 48 | code_tokens = tokenizer.tokenize(" ".join(code_tokens)) 49 | trace_tokens = tokenizer.tokenize(" ".join(trace_tokens)) 50 | 51 | return InputFeatures(code_tokens,trace_tokens) 52 | 53 | 54 | 55 | class TextDataset(Dataset): 56 | def __init__(self, tokenizer, args, filename, local_rank, world_size, logger, mode, prefix=""): 57 | self.args = args 58 | self.tokenizer = tokenizer 59 | 60 | if len(prefix) > 0: 61 | cached_features_file = os.path.join('{}'.format(args.data_cache_dir), prefix + "_word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl') 62 | else: 63 | 
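            # no dataset prefix: key the cache file only by world size, rank, block size, and split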
            cached_features_file = os.path.join('{}'.format(args.data_cache_dir), "word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl')
        if os.path.exists(cached_features_file):
            logger.warning("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle1:
                self.examples = pickle.load(handle1)
            if 'train' in mode and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(' '.join(map(str, example.code_tokens))))
                    logger.warning("trace_tokens: {}".format(' '.join(map(str, example.trace_tokens))))
        else:
            self.examples = []
            total_num = 0
            error_num = 0
            logger.info("Load and create features from dataset file at %s", filename)
            num_lines = sum(1 for line in open(filename,'r'))
            with open(filename,"r",encoding="utf8") as f:
                for i,line in enumerate(tqdm(f,total=num_lines)):
                    json_line = json.loads(line)
                    if len(json_line['code_tokens']) != 0:
                        total_num += 1
                        if (mode == "train" and total_num % world_size == local_rank) or (mode != "train" and local_rank in [-1, 0]):
                            js = {}
                            if len(prefix) > 0:
                                js["code_tokens"] = ["<"+prefix+">"]
                                js["code_tokens"].extend(json_line["code_tokens"])
                            else:
                                js["code_tokens"] = json_line["code_tokens"]
                            js["trace_tokens"] = json_line["trace_tokens"]
                            try:
                                features = convert_examples_to_features((js, tokenizer))
                                cur_index = len(self.examples)
                                self.examples.append(features)
                            except:
                                error_num += 1

            if mode == "train" and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(example.code_tokens))
                    logger.warning("trace_tokens: {}".format(example.trace_tokens))

            logger.warning("Num examples = %d: %d", local_rank,len(self.examples))
            logger.warning(f"Error num = {error_num}")
            # debug
            logger.warning("Saving features into cached file %s", cached_features_file)
            if not os.path.exists(args.data_cache_dir):
                os.makedirs(args.data_cache_dir)
            with open(cached_features_file, 'wb') as handle1:
                pickle.dump(self.examples, handle1, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        js = self.examples[item]

        # Encoder-Decoder for Trace Generation
        # UniXcoder-style layout: <s> <encoder-decoder> </s> source <mask0> </s>
        source_tokens = js.code_tokens[0:self.args.max_source_length-5]
        source_tokens = ["<s>","<encoder-decoder>","</s>"] + source_tokens + ["<mask0>"] + ["</s>"]
        source_ids = self.tokenizer.convert_tokens_to_ids(source_tokens)
        padding_length = self.args.max_source_length - len(source_ids)
        source_ids += [self.tokenizer.pad_token_id]*padding_length

        target_tokens = self.tokenizer.tokenize("None")  # generate: decoder is primed with a dummy "None" target
        target_tokens = ["<mask0>"] + target_tokens + [self.tokenizer.sep_token]
        target_ids = self.tokenizer.convert_tokens_to_ids(target_tokens)
        padding_length = self.args.max_target_length - len(target_ids)
        target_ids += [self.tokenizer.pad_token_id] * padding_length

        gold_tokens = js.trace_tokens[:self.args.max_target_length-2]
        gold_tokens = ["<mask0>"] + gold_tokens + [self.tokenizer.sep_token]
        gold_ids = self.tokenizer.convert_tokens_to_ids(gold_tokens)
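        # right-pad the gold trace so every tensor in the batch has length max_target_length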
        padding_length = self.args.max_target_length - len(gold_ids)
        gold_ids += [self.tokenizer.pad_token_id] * padding_length

        return (
            torch.tensor(source_ids),
            torch.tensor(target_ids),
            torch.tensor(gold_ids),
        )
-------------------------------------------------------------------------------- /CodeExecutor/inference/run.sh: --------------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES=0 python run.py \
    --prefix codenet \
    --output_dir ../saved_models/inference \
    --data_cache_dir ../saved_models/inference \
    --eval_data_path ../data/codenetmut_test.json \
    --model_name_or_path microsoft/codeexecutor \
    --block_size 1024 \
    --per_gpu_train_batch_size 8 \
    --per_gpu_eval_batch_size 16 \
    --gradient_accumulation_steps 8 \
    --learning_rate 1e-4 \
    --node_index 0 \
    --weight_decay 0.01 \
    --adam_epsilon 1e-6 \
    --max_grad_norm 1.0 \
    --max_steps 1000 \
    --warmup_steps 10000 \
    --save_steps 5000 \
    --seed 123456
-------------------------------------------------------------------------------- /CodeExecutor/pretrain/dataset.py: --------------------------------------------------------------------------------
import random
import torch
from torch.utils.data import Dataset
import os
import pickle
import logging
import json
from tqdm import tqdm


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

def _truncate_seq_pair_two_length(tokens_a, tokens_b, max_length_a, max_length_b):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length_a + max_length_b:
            break
        if len(tokens_b) > max_length_b:
            tokens_b.pop()
        else: # len(tokens_a) > max_length_a
            tokens_a.pop()

class InputFeatures(object):
    """A single training/test features for an example."""
    def __init__(self,
                 code_tokens,
                 trace_tokens
    ):
        self.code_tokens = code_tokens
        self.trace_tokens = trace_tokens

def convert_examples_to_features(item):
    # parsing
    js, tokenizer = item
    code_tokens = js["code_tokens"]
    trace_tokens = js["trace_tokens"]
    code_tokens = tokenizer.tokenize(" ".join(code_tokens))
    trace_tokens = tokenizer.tokenize(" ".join(trace_tokens))

    return InputFeatures(code_tokens, trace_tokens)



class TextDataset(Dataset):
    def __init__(self, tokenizer, args, filename, local_rank, world_size, logger, mode, prefix=""):
        self.args = args
        self.tokenizer = tokenizer
        if len(prefix) > 0:
            cached_features_file = os.path.join('{}'.format(args.data_cache_dir), prefix + "_word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl')
        else:
            cached_features_file = os.path.join('{}'.format(args.data_cache_dir), "word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl')
        if os.path.exists(cached_features_file):
            logger.warning("Loading features from cached file %s", cached_features_file)
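            # reuse the examples cached by a previous run instead of re-tokenizing the corpus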
            with open(cached_features_file, 'rb') as handle1:
                self.examples = pickle.load(handle1)
            if 'train' in mode and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(' '.join(map(str, example.code_tokens))))
                    logger.warning("trace_tokens: {}".format(' '.join(map(str, example.trace_tokens))))
        else:
            self.examples = []
            total_num = 0
            error_num = 0
            logger.info("Load and create features from dataset file at %s", filename)
            num_lines = sum(1 for line in open(filename,'r'))
            with open(filename,"r",encoding="utf8") as f:
                for i,line in enumerate(tqdm(f,total=num_lines)):
                    json_line = json.loads(line)
                    if len(json_line['code_tokens']) != 0:
                        total_num += 1
                        if (mode == "train" and total_num % world_size == local_rank) or (mode != "train" and local_rank in [-1, 0]):
                            js = {}
                            if len(prefix) > 0:
                                js["code_tokens"] = ["<"+prefix+">"]
                                js["code_tokens"].extend(json_line["code_tokens"])
                            else:
                                js["code_tokens"] = json_line["code_tokens"]
                            js["trace_tokens"] = json_line["trace_tokens"]
                            try:
                                features = convert_examples_to_features((js, tokenizer))
                                cur_index = len(self.examples)
                                self.examples.append(features)
                            except:
                                error_num += 1

            if mode == "train" and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(example.code_tokens))
                    logger.warning("trace_tokens: {}".format(example.trace_tokens))


            logger.warning("Num examples = %d: %d", local_rank,len(self.examples))
            logger.warning(f"Error num = {error_num}")
            # debug
            logger.warning("Saving features into cached file %s", cached_features_file)
            if not os.path.exists(args.data_cache_dir):
                os.makedirs(args.data_cache_dir)
            with open(cached_features_file, 'wb') as handle1:
                pickle.dump(self.examples, handle1, protocol=pickle.HIGHEST_PROTOCOL)




    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        js = self.examples[item]

        # Encoder-Decoder for Trace Generation
        # Assumed UniXcoder-style layout (special-token strings reconstructed):
        # <s> <encoder-decoder> </s> source <mask0> </s> | <mask0> target </s>
        source_tokens = js.code_tokens
        target_tokens = ["<mask0>"] + js.trace_tokens
        _truncate_seq_pair_two_length(source_tokens,target_tokens,self.args.block_size//4 - 1, self.args.block_size//2 + self.args.block_size//4 - 5)
        source_tokens = source_tokens + ["<mask0>"]
        text_tokens = ["<s>","<encoder-decoder>","</s>"] + source_tokens + ["</s>"] + target_tokens + ["</s>"]
        text_ids = self.tokenizer.convert_tokens_to_ids(text_tokens)
        dual_gen_ids = text_ids + [self.tokenizer.pad_token_id]*(self.args.block_size-len(text_ids))
        dual_gen_type_ids = [1] * len(["<s>","<encoder-decoder>","</s>"] + source_tokens + ["</s>"]) + [2] * len(target_tokens + ["</s>"]) + [0]*(self.args.block_size-len(text_ids))


        return (
            torch.tensor(dual_gen_ids),
            torch.tensor(dual_gen_type_ids),
        )
-------------------------------------------------------------------------------- /CodeExecutor/pretrain/model.py: --------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
import copy
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
import random


class Model(nn.Module):
    def __init__(self, encoder, config, tokenizer, args):
        super(Model, self).__init__()
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        self.args = args
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        self.qa_outputs = nn.Linear(config.hidden_size, config.hidden_size)
        self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
        self.register_buffer(
            "bias", torch.tril(torch.ones((args.block_size, args.block_size), dtype=torch.uint8)).view(1, args.block_size, args.block_size)
        )
        # down-weight trivial structural trace tokens (assumed markup set) so
        # the loss concentrates on the harder value predictions
        self.weights = torch.full([len(self.tokenizer)], 10.0).to(self.args.device)
        easy_ids = self.tokenizer.convert_tokens_to_ids(["<line>", "<state>", "</state>", ":"])
        for i in easy_ids:
            self.weights[i] = 1.0

    def forward(self, dual_gen_ids, dual_gen_type_ids):
        dual_loss, align_loss, contras_loss = 0, 0, 0

        # Encoder-Decoder for Cross-modal Generation
        source_ids = dual_gen_ids
        type_ids = dual_gen_type_ids
        attention_mask = self.bias
        attention_mask = attention_mask | (type_ids.eq(1)[:,:,None]*type_ids.eq(1)[:,None,:])
        outputs = self.encoder(source_ids, attention_mask=attention_mask)
        encoder_outputs = outputs.last_hidden_state[:,:-1]
        labels_mask = type_ids.eq(2)[:,1:]
        encoder_outputs = encoder_outputs.reshape(-1, encoder_outputs.size(-1))[labels_mask.reshape(-1)]
        prediction_scores = self.lm_head(encoder_outputs)
        lm_labels = source_ids[:,1:].reshape(-1)[labels_mask.reshape(-1)]

        loss_fct = CrossEntropyLoss(reduction='none')
        lm_loss = loss_fct(prediction_scores, lm_labels)
        lm_loss = self.weights[lm_labels] * lm_loss
        lm_loss = lm_loss.sum()/len(lm_labels)

        dual_loss = lm_loss.item()
        return lm_loss, dual_loss, align_loss, contras_loss
-------------------------------------------------------------------------------- /CodeExecutor/pretrain/run.sh: --------------------------------------------------------------------------------
PER_NODE_GPU=8
python -m torch.distributed.launch --nproc_per_node=${PER_NODE_GPU} run.py \
    --output_dir ../saved_models/pretrain_codeexecutor_stage_3 \
    --data_cache_dir ../saved_models/pretrain_codeexecutor_stage_3 \
    --train_data_path /drive/pretrain_codenetmut.json \
    --another_train_data_path /drive/pretrain_tutorial.json \
    --third_train_data_path /drive/single_line_hard_3_million.json \
    --eval_data_path ../data/codenetmut_test.json \
    --model_name_or_path ../saved_models/pretrain_codeexecutor_stage_2 \
    --block_size 1024 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 8 \
    --gradient_accumulation_steps 8 \
    --learning_rate 4e-4 \
    --node_index=0 \
    --gpu_per_node $PER_NODE_GPU \
    --weight_decay 0.01 \
    --adam_epsilon 1e-6 \
    --max_grad_norm 1.0 \
    --max_steps 1000000 \
    --warmup_steps 10000 \
    --save_steps 5000 \
    --seed 123
-------------------------------------------------------------------------------- /CodeReviewer/README.md: --------------------------------------------------------------------------------
# CodeReviewer

This repo provides the code for reproducing the experiments in [CodeReviewer: Pre-Training for Automating Code Review Activities](https://arxiv.org/abs/2203.09095). **CodeReviewer** is a model pre-trained with code change and code review data to support code review tasks.
The pre-trained checkpoint of CodeReviewer is available on [Huggingface](https://huggingface.co/microsoft/codereviewer).

Our dataset is available on [Zenodo](https://zenodo.org/record/6900648).

## 1. Dependency

- conda install nltk
- conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
- conda install transformers


## 2. Brief Introduction

CodeReviewer supports three related tasks: **Quality Estimation** (`cls` for short), **Comment Generation** (`msg` for short) and **Code Refinement** (`ref` for short).

Demo data:

``` python
{
    "old_file": "import torch",                                           # f1
    "diff_hunk": "@@ -1 +1,2 @@\n import torch\n +import torch.nn as nn", # f1->f2
    "comment": "I don't think we need to import torch.nn here.",          # requirements for f2->f3
    "target": "import torch"                                              # f3
}
```

* Quality Estimation: given the "old_file" and "diff_hunk", predict whether the code change is problematic and needs a review comment.

* Comment Generation: given the "old_file" and "diff_hunk", generate a review comment for the change. An expected comment is the "comment" field above.

* Code Refinement: given the "old_file", "diff_hunk", and "comment", revise the code according to the review comment. For the example above, since the comment indicates that *import torch.nn* is unnecessary, that line is simply deleted.

The model inputs are a code change (old file and diff hunk) and a review comment (optional, depending on the task). Input data is preprocessed in `utils.py: ReviewExample` and wrapped into {`utils.py: CommentClsDataset, SimpleGenDataset, RefineDataset`}.

## 3. Finetune/Inference

Before you start to run experiments with CodeReviewer, please download the [datasets](https://zenodo.org/record/6900648) first.
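As a quick smoke test, you can check that the checkpoint loads and generates (a minimal sketch; it assumes the hub checkpoint is compatible with the generic `transformers` T5 seq2seq classes, while the training scripts below wrap it in this repo's own `models.py`):

```python
# Minimal smoke test -- assumes the hub checkpoint loads via the generic
# T5 classes; the repo's models.py defines the class actually used for training.
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("microsoft/codereviewer")
model = T5ForConditionalGeneration.from_pretrained("microsoft/codereviewer")

diff = "@@ -1 +1,2 @@\n import torch\n +import torch.nn as nn"
inputs = tokenizer(diff, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

With the checkpoint and datasets in place, launch fine-tuning through the provided scripts: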
```bash
# prepare model checkpoint and datasets
cd code/sh
# adjust the arguments in the *sh* scripts
bash finetune-cls.sh
```

A demo bash script (finetune-cls.sh) is shown:
```bash
mnt_dir="/home/codereview"

# You may change the following block for multiple gpu training
MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST}
MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT}
RANK=0 && echo RANK: ${RANK}
PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU}
WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE}
NODES=1 && echo NODES: ${NODES}
NCCL_DEBUG=INFO

bash test_nltk.sh


# Change the arguments as required:
#   model_name_or_path, load_model_path: the path of the model to be finetuned
#   eval_file: the path of the evaluation data
#   output_dir: the directory to save the finetuned model (not used at infer/test time)
#   out_file: the path of the output file
#   train_file_name: can be a directory containing files named "train*.jsonl"

python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_cls.py \
    --train_epochs 30 \
    --model_name_or_path microsoft/codereviewer \
    --output_dir ../../save/cls \
    --train_filename ../../dataset/Diff_Quality_Estimation \
    --dev_filename ../../dataset/Diff_Quality_Estimation/cls-valid.jsonl \
    --max_source_length 512 \
    --max_target_length 128 \
    --train_batch_size 12 \
    --learning_rate 3e-4 \
    --gradient_accumulation_steps 3 \
    --mask_rate 0.15 \
    --save_steps 3600 \
    --log_steps 100 \
    --train_steps 120000 \
    --gpu_per_node=${PER_NODE_GPU} \
    --node_index=${RANK} \
    --seed 2233
```


## 4. File structure
```
.
├── bleu.py               # demo code for BLEU evaluation
├── configs.py
├── evaluator             # copied from CodeXGLUE for BLEU evaluation
├── models.py             # CodeReviewer model
├── run_finetune_xxx.py   # finetune script - xxx in {cls, msg, ref}
├── run_infer_msg.py      # inference script for the comment generation task
├── run_test_xxx.py       # test script - xxx in {cls, msg, ref}
├── sh/xx.sh              # bash scripts for running the finetune and test scripts with arguments
│   ├── finetune-xxx.sh
│   ├── infer-json.sh
│   ├── test-xxx.sh
│   ├── test_nltk.sh
└── utils.py              # utils for data preprocessing
```

# Reference
If you use this code or CodeReviewer, please consider citing us.
```
@article{li2022codereviewer,
  title={CodeReviewer: Pre-Training for Automating Code Review Activities},
  author={Li, Zhiyu and Lu, Shuai and Guo, Daya and Duan, Nan and Jannu, Shailesh and Jenks, Grant and Majumder, Deep and Green, Jared and Svyatkovskiy, Alexey and Fu, Shengyu and others},
  journal={arXiv preprint arXiv:2203.09095},
  year={2022}
}
```
121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /CodeReviewer/code/bleu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from evaluator.smooth_bleu import bleu_fromstr 4 | import nltk 5 | import re 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--path', type=str, required=True) 11 | args = parser.parse_args() 12 | ref = os.path.join(args.path, 'golds.txt') 13 | hyp = os.path.join(args.path, 'preds.txt') 14 | with open(ref, 'r') as f: 15 | refs = f.readlines() 16 | with open(hyp, 'r') as f: 17 | hyps = f.readlines() 18 | # refs = [ref.strip().lower() for ref in refs] 19 | # hyps = [hyp.strip().lower() for hyp in hyps] 20 | # bleu = bleu_fromstr(hyps, refs) 21 | # print(bleu) 22 | pred_nls, golds = hyps, refs 23 | for i in range(len(pred_nls)): 24 | chars = "(_)`." 25 | for c in chars: 26 | pred_nls[i] = pred_nls[i].replace(c, " " + c + " ") 27 | pred_nls[i] = " ".join(pred_nls[i].split()) 28 | golds[i] = golds[i].replace(c, " " + c + " ") 29 | golds[i] = " ".join(golds[i].split()) 30 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 31 | print(bleu) 32 | # stopwords = open("stopwords.txt").readlines() 33 | # stopwords = [stopword.strip() for stopword in stopwords] 34 | # refs = [" ".join([word for word in ref.lower().split() if word not in stopwords]) for ref in refs] 35 | # hyps = [" ".join([word for word in hyp.lower().split() if word not in stopwords]) for hyp in hyps] 36 | # bleu = bleu_fromstr(hyps, refs) 37 | # print(bleu) 38 | 39 | if __name__ == '__main__': 40 | main() 41 | # s = "Can we use `mset.mirrorInfo()` directly?" 42 | # chars = "(_)`." 43 | # for c in chars: 44 | # s = s.replace(c, " " + c + " ") 45 | # print(nltk.wordpunct_tokenize(s)) 46 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/calc_code_bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
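# CodeBLEU combines four component scores -- plain n-gram match (BLEU),
# keyword-weighted n-gram match, AST subtree match, and data-flow match --
# mixed with the alpha/beta/gamma/theta weights passed via --params.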
3 | # https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU 4 | 5 | # -*- coding:utf-8 -*- 6 | import argparse 7 | import os 8 | from evaluator.CodeBLEU import bleu, weighted_ngram_match, syntax_match, dataflow_match 9 | 10 | 11 | def get_codebleu(refs, hyp, lang, params='0.25,0.25,0.25,0.25'): 12 | if not isinstance(refs, list): 13 | refs = [refs] 14 | alpha, beta, gamma, theta = [float(x) for x in params.split(',')] 15 | 16 | # preprocess inputs 17 | pre_references = [[x.strip() for x in open(file, 'r', encoding='utf-8').readlines()] for file in refs] 18 | hypothesis = [x.strip() for x in open(hyp, 'r', encoding='utf-8').readlines()] 19 | 20 | for i in range(len(pre_references)): 21 | assert len(hypothesis) == len(pre_references[i]) 22 | 23 | references = [] 24 | for i in range(len(hypothesis)): 25 | ref_for_instance = [] 26 | for j in range(len(pre_references)): 27 | ref_for_instance.append(pre_references[j][i]) 28 | references.append(ref_for_instance) 29 | assert len(references) == len(pre_references) * len(hypothesis) 30 | 31 | # calculate ngram match (BLEU) 32 | tokenized_hyps = [x.split() for x in hypothesis] 33 | tokenized_refs = [[x.split() for x in reference] for reference in references] 34 | 35 | ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps) 36 | 37 | # calculate weighted ngram match 38 | root_dir = os.path.dirname(__file__) 39 | keywords = [x.strip() for x in open(root_dir + '/keywords/' + lang + '.txt', 'r', encoding='utf-8').readlines()] 40 | 41 | def make_weights(reference_tokens, key_word_list): 42 | return {token: 1 if token in key_word_list else 0.2 for token in reference_tokens} 43 | 44 | tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)] \ 45 | for reference_tokens in reference] for reference in tokenized_refs] 46 | 47 | weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps) 48 | 49 | # calculate syntax match 50 | syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang) 51 | 52 | # calculate dataflow match 53 | dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang) 54 | 55 | print('ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}'. 
\ 56 | format(ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score)) 57 | 58 | code_bleu_score = alpha * ngram_match_score \ 59 | + beta * weighted_ngram_match_score \ 60 | + gamma * syntax_match_score \ 61 | + theta * dataflow_match_score 62 | 63 | return code_bleu_score 64 | 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--refs', type=str, nargs='+', required=True, 69 | help='reference files') 70 | parser.add_argument('--hyp', type=str, required=True, 71 | help='hypothesis file') 72 | parser.add_argument('--lang', type=str, required=True, 73 | choices=['java', 'js', 'c_sharp', 'php', 'go', 'python', 'ruby'], 74 | help='programming language') 75 | parser.add_argument('--params', type=str, default='0.25,0.25,0.25,0.25', 76 | help='alpha, beta and gamma') 77 | 78 | args = parser.parse_args() 79 | code_bleu_score = get_codebleu(args.refs, args.hyp, args.lang, args.params) 80 | print('CodeBLEU score: ', code_bleu_score) 81 | 82 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/dataflow_match.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from evaluator.CodeBLEU.parser import DFG_python, DFG_java, DFG_ruby, DFG_go, DFG_php, DFG_javascript, DFG_csharp 5 | from evaluator.CodeBLEU.parser import (remove_comments_and_docstrings, 6 | tree_to_token_index, 7 | index_to_code_token, 8 | tree_to_variable_index) 9 | from tree_sitter import Language, Parser 10 | import os 11 | 12 | root_dir = os.path.dirname(__file__) 13 | 14 | dfg_function = { 15 | 'python': DFG_python, 16 | 'java': DFG_java, 17 | 'ruby': DFG_ruby, 18 | 'go': DFG_go, 19 | 'php': DFG_php, 20 | 'javascript': DFG_javascript, 21 | 'c_sharp': DFG_csharp, 22 | } 23 | 24 | 25 | def calc_dataflow_match(references, candidate, lang): 26 | return corpus_dataflow_match([references], [candidate], lang) 27 | 28 | 29 | def corpus_dataflow_match(references, candidates, lang): 30 | LANGUAGE = Language(root_dir + '/parser/my-languages.so', lang) 31 | parser = Parser() 32 | parser.set_language(LANGUAGE) 33 | parser = [parser, dfg_function[lang]] 34 | match_count = 0 35 | total_count = 0 36 | 37 | for i in range(len(candidates)): 38 | references_sample = references[i] 39 | candidate = candidates[i] 40 | for reference in references_sample: 41 | try: 42 | candidate = remove_comments_and_docstrings(candidate, 'java') 43 | except: 44 | pass 45 | try: 46 | reference = remove_comments_and_docstrings(reference, 'java') 47 | except: 48 | pass 49 | 50 | cand_dfg = get_data_flow(candidate, parser) 51 | ref_dfg = get_data_flow(reference, parser) 52 | 53 | normalized_cand_dfg = normalize_dataflow(cand_dfg) 54 | normalized_ref_dfg = normalize_dataflow(ref_dfg) 55 | 56 | if len(normalized_ref_dfg) > 0: 57 | total_count += len(normalized_ref_dfg) 58 | for dataflow in normalized_ref_dfg: 59 | if dataflow in normalized_cand_dfg: 60 | match_count += 1 61 | normalized_cand_dfg.remove(dataflow) 62 | if total_count == 0: 63 | print( 64 | "WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. 
Please consider ignoring this score.") 65 | return 0 66 | score = match_count / total_count 67 | return score 68 | 69 | 70 | def get_data_flow(code, parser): 71 | try: 72 | tree = parser[0].parse(bytes(code, 'utf8')) 73 | root_node = tree.root_node 74 | tokens_index = tree_to_token_index(root_node) 75 | code = code.split('\n') 76 | code_tokens = [index_to_code_token(x, code) for x in tokens_index] 77 | index_to_code = {} 78 | for idx, (index, code) in enumerate(zip(tokens_index, code_tokens)): 79 | index_to_code[index] = (idx, code) 80 | try: 81 | DFG, _ = parser[1](root_node, index_to_code, {}) 82 | except: 83 | DFG = [] 84 | DFG = sorted(DFG, key=lambda x: x[1]) 85 | indexs = set() 86 | for d in DFG: 87 | if len(d[-1]) != 0: 88 | indexs.add(d[1]) 89 | for x in d[-1]: 90 | indexs.add(x) 91 | new_DFG = [] 92 | for d in DFG: 93 | if d[1] in indexs: 94 | new_DFG.append(d) 95 | codes = code_tokens 96 | dfg = new_DFG 97 | except: 98 | codes = code.split() 99 | dfg = [] 100 | # merge nodes 101 | dic = {} 102 | for d in dfg: 103 | if d[1] not in dic: 104 | dic[d[1]] = d 105 | else: 106 | dic[d[1]] = (d[0], d[1], d[2], list(set(dic[d[1]][3] + d[3])), list(set(dic[d[1]][4] + d[4]))) 107 | DFG = [] 108 | for d in dic: 109 | DFG.append(dic[d]) 110 | dfg = DFG 111 | return dfg 112 | 113 | 114 | def normalize_dataflow_item(dataflow_item): 115 | var_name = dataflow_item[0] 116 | var_pos = dataflow_item[1] 117 | relationship = dataflow_item[2] 118 | par_vars_name_list = dataflow_item[3] 119 | par_vars_pos_list = dataflow_item[4] 120 | 121 | var_names = list(set(par_vars_name_list + [var_name])) 122 | norm_names = {} 123 | for i in range(len(var_names)): 124 | norm_names[var_names[i]] = 'var_' + str(i) 125 | 126 | norm_var_name = norm_names[var_name] 127 | relationship = dataflow_item[2] 128 | norm_par_vars_name_list = [norm_names[x] for x in par_vars_name_list] 129 | 130 | return (norm_var_name, relationship, norm_par_vars_name_list) 131 | 132 | 133 | def normalize_dataflow(dataflow): 134 | var_dict = {} 135 | i = 0 136 | normalized_dataflow = [] 137 | for item in dataflow: 138 | var_name = item[0] 139 | relationship = item[2] 140 | par_vars_name_list = item[3] 141 | for name in par_vars_name_list: 142 | if name not in var_dict: 143 | var_dict[name] = 'var_' + str(i) 144 | i += 1 145 | if var_name not in var_dict: 146 | var_dict[var_name] = 'var_' + str(i) 147 | i += 1 148 | normalized_dataflow.append((var_dict[var_name], relationship, [var_dict[x] for x in par_vars_name_list])) 149 | return normalized_dataflow 150 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/keywords/c_sharp.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | as 3 | base 4 | bool 5 | break 6 | byte 7 | case 8 | catch 9 | char 10 | checked 11 | class 12 | const 13 | continue 14 | decimal 15 | default 16 | delegate 17 | do 18 | double 19 | else 20 | enum 21 | event 22 | explicit 23 | extern 24 | false 25 | finally 26 | fixed 27 | float 28 | for 29 | foreach 30 | goto 31 | if 32 | implicit 33 | in 34 | int 35 | interface 36 | internal 37 | is 38 | lock 39 | long 40 | namespace 41 | new 42 | null 43 | object 44 | operator 45 | out 46 | override 47 | params 48 | private 49 | protected 50 | public 51 | readonly 52 | ref 53 | return 54 | sbyte 55 | sealed 56 | short 57 | sizeof 58 | stackalloc 59 | static 60 | string 61 | struct 62 | switch 63 | this 64 | throw 65 | true 66 | try 67 | typeof 68 | uint 69 | ulong 70 | 
unchecked 71 | unsafe 72 | ushort 73 | using 74 | virtual 75 | void 76 | volatile 77 | while 78 | add 79 | alias 80 | ascending 81 | async 82 | await 83 | by 84 | descending 85 | dynamic 86 | equals 87 | from 88 | get 89 | global 90 | group 91 | into 92 | join 93 | let 94 | nameof 95 | notnull 96 | on 97 | orderby 98 | partial 99 | remove 100 | select 101 | set 102 | unmanaged 103 | value 104 | var 105 | when 106 | where 107 | yield 108 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/keywords/java.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | assert 3 | boolean 4 | break 5 | byte 6 | case 7 | catch 8 | char 9 | class 10 | const 11 | continue 12 | default 13 | do 14 | double 15 | else 16 | enum 17 | extends 18 | final 19 | finally 20 | float 21 | for 22 | goto 23 | if 24 | implements 25 | import 26 | instanceof 27 | int 28 | interface 29 | long 30 | native 31 | new 32 | package 33 | private 34 | protected 35 | public 36 | return 37 | short 38 | static 39 | strictfp 40 | super 41 | switch 42 | synchronized 43 | this 44 | throw 45 | throws 46 | transient 47 | try 48 | void 49 | volatile 50 | while 51 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from .utils import (remove_comments_and_docstrings, 5 | tree_to_token_index, 6 | index_to_code_token, 7 | tree_to_variable_index) 8 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
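# Shared helpers for the CodeBLEU matchers: stripping comments and docstrings
# from source code, and converting between tree-sitter node positions and the
# corresponding source tokens.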
import re
from io import StringIO
import tokenize


def remove_comments_and_docstrings(source, lang):
    if lang in ['python']:
        """
        Returns 'source' minus comments and docstrings.
        """
        io_obj = StringIO(source)
        out = ""
        prev_toktype = tokenize.INDENT
        last_lineno = -1
        last_col = 0
        for tok in tokenize.generate_tokens(io_obj.readline):
            token_type = tok[0]
            token_string = tok[1]
            start_line, start_col = tok[2]
            end_line, end_col = tok[3]
            ltext = tok[4]
            if start_line > last_lineno:
                last_col = 0
            if start_col > last_col:
                out += (" " * (start_col - last_col))
            # Remove comments:
            if token_type == tokenize.COMMENT:
                pass
            # This series of conditionals removes docstrings:
            elif token_type == tokenize.STRING:
                if prev_toktype != tokenize.INDENT:
                    # This is likely a docstring; double-check we're not inside an operator:
                    if prev_toktype != tokenize.NEWLINE:
                        if start_col > 0:
                            out += token_string
            else:
                out += token_string
            prev_toktype = token_type
            last_col = end_col
            last_lineno = end_line
        temp = []
        for x in out.split('\n'):
            if x.strip() != "":
                temp.append(x)
        return '\n'.join(temp)
    elif lang in ['ruby']:
        return source
    else:
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " "  # note: a space and not an empty string
            else:
                return s

        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        temp = []
        for x in re.sub(pattern, replacer, source).split('\n'):
            if x.strip() != "":
                temp.append(x)
        return '\n'.join(temp)


def tree_to_token_index(root_node):
    if (len(root_node.children) == 0 or root_node.type in ['string_literal', 'string',
                                                           'character_literal']) and root_node.type != 'comment':
        return [(root_node.start_point, root_node.end_point)]
    else:
        code_tokens = []
        for child in root_node.children:
            code_tokens += tree_to_token_index(child)
        return code_tokens


def tree_to_variable_index(root_node, index_to_code):
    if (len(root_node.children) == 0 or root_node.type in ['string_literal', 'string',
                                                           'character_literal']) and root_node.type != 'comment':
        index = (root_node.start_point, root_node.end_point)
        _, code = index_to_code[index]
        if root_node.type != code:
            return [(root_node.start_point, root_node.end_point)]
        else:
            return []
    else:
        code_tokens = []
        for child in root_node.children:
            code_tokens += tree_to_variable_index(child, index_to_code)
        return code_tokens


def index_to_code_token(index, code):
    start_point = index[0]
    end_point = index[1]
    if start_point[0] == end_point[0]:
        s = code[start_point[0]][start_point[1]:end_point[1]]
    else:
        s = ""
        s += code[start_point[0]][start_point[1]:]
        for i in range(start_point[0] + 1, end_point[0]):
            s += code[i]
        s += code[end_point[0]][:end_point[1]]
    return s
-------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/readme.txt: --------------------------------------------------------------------------------
python calc_code_bleu.py --refs reference_files --hyp candidate_file --lang java ( or c_sharp ) --params 0.25,0.25,0.25,0.25 (default)
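A worked example of the weighted combination computed in calc_code_bleu.py (illustrative component scores, not real outputs):
with the default weights 0.25,0.25,0.25,0.25 and component scores 0.60 (ngram match), 0.55 (weighted ngram match), 0.80 (syntax match), 0.70 (dataflow match),
CodeBLEU = 0.25*0.60 + 0.25*0.55 + 0.25*0.80 + 0.25*0.70 = 0.6625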
-------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/syntax_match.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from evaluator.CodeBLEU.parser import DFG_python, DFG_java, DFG_ruby, DFG_go, DFG_php, DFG_javascript, DFG_csharp 5 | from evaluator.CodeBLEU.parser import (remove_comments_and_docstrings, 6 | tree_to_token_index, 7 | index_to_code_token, 8 | tree_to_variable_index) 9 | from tree_sitter import Language, Parser 10 | import os 11 | 12 | root_dir = os.path.dirname(__file__) 13 | dfg_function = { 14 | 'python': DFG_python, 15 | 'java': DFG_java, 16 | 'ruby': DFG_ruby, 17 | 'go': DFG_go, 18 | 'php': DFG_php, 19 | 'javascript': DFG_javascript, 20 | 'c_sharp': DFG_csharp, 21 | } 22 | 23 | 24 | def calc_syntax_match(references, candidate, lang): 25 | return corpus_syntax_match([references], [candidate], lang) 26 | 27 | 28 | def corpus_syntax_match(references, candidates, lang): 29 | JAVA_LANGUAGE = Language(root_dir + '/parser/my-languages.so', lang) 30 | parser = Parser() 31 | parser.set_language(JAVA_LANGUAGE) 32 | match_count = 0 33 | total_count = 0 34 | 35 | for i in range(len(candidates)): 36 | references_sample = references[i] 37 | candidate = candidates[i] 38 | for reference in references_sample: 39 | try: 40 | candidate = remove_comments_and_docstrings(candidate, 'java') 41 | except: 42 | pass 43 | try: 44 | reference = remove_comments_and_docstrings(reference, 'java') 45 | except: 46 | pass 47 | 48 | candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node 49 | 50 | reference_tree = parser.parse(bytes(reference, 'utf8')).root_node 51 | 52 | def get_all_sub_trees(root_node): 53 | node_stack = [] 54 | sub_tree_sexp_list = [] 55 | depth = 1 56 | node_stack.append([root_node, depth]) 57 | while len(node_stack) != 0: 58 | cur_node, cur_depth = node_stack.pop() 59 | sub_tree_sexp_list.append([cur_node.sexp(), cur_depth]) 60 | for child_node in cur_node.children: 61 | if len(child_node.children) != 0: 62 | depth = cur_depth + 1 63 | node_stack.append([child_node, depth]) 64 | return sub_tree_sexp_list 65 | 66 | cand_sexps = [x[0] for x in get_all_sub_trees(candidate_tree)] 67 | ref_sexps = get_all_sub_trees(reference_tree) 68 | 69 | # print(cand_sexps) 70 | # print(ref_sexps) 71 | 72 | for sub_tree, depth in ref_sexps: 73 | if sub_tree in cand_sexps: 74 | match_count += 1 75 | total_count += len(ref_sexps) 76 | 77 | score = match_count / total_count 78 | return score 79 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/utils.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Utility functions 2 | # 3 | # Copyright (C) 2001-2020 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from itertools import chain 9 | 10 | def pad_sequence( 11 | sequence, 12 | n, 13 | pad_left=False, 14 | pad_right=False, 15 | left_pad_symbol=None, 16 | right_pad_symbol=None, 17 | ): 18 | """ 19 | Returns a padded sequence of items before ngram extraction. 
20 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) 21 | ['', 1, 2, 3, 4, 5, ''] 22 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) 23 | ['', 1, 2, 3, 4, 5] 24 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) 25 | [1, 2, 3, 4, 5, ''] 26 | :param sequence: the source data to be padded 27 | :type sequence: sequence or iter 28 | :param n: the degree of the ngrams 29 | :type n: int 30 | :param pad_left: whether the ngrams should be left-padded 31 | :type pad_left: bool 32 | :param pad_right: whether the ngrams should be right-padded 33 | :type pad_right: bool 34 | :param left_pad_symbol: the symbol to use for left padding (default is None) 35 | :type left_pad_symbol: any 36 | :param right_pad_symbol: the symbol to use for right padding (default is None) 37 | :type right_pad_symbol: any 38 | :rtype: sequence or iter 39 | """ 40 | sequence = iter(sequence) 41 | if pad_left: 42 | sequence = chain((left_pad_symbol,) * (n - 1), sequence) 43 | if pad_right: 44 | sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) 45 | return sequence 46 | 47 | 48 | # add a flag to pad the sequence so we get peripheral ngrams? 49 | 50 | 51 | def ngrams( 52 | sequence, 53 | n, 54 | pad_left=False, 55 | pad_right=False, 56 | left_pad_symbol=None, 57 | right_pad_symbol=None, 58 | ): 59 | """ 60 | Return the ngrams generated from a sequence of items, as an iterator. 61 | For example: 62 | >>> from nltk.util import ngrams 63 | >>> list(ngrams([1,2,3,4,5], 3)) 64 | [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 65 | Wrap with list for a list version of this function. Set pad_left 66 | or pad_right to true in order to get additional ngrams: 67 | >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) 68 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] 69 | >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) 70 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, '')] 71 | >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) 72 | [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] 73 | >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) 74 | [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] 75 | :param sequence: the source data to be converted into ngrams 76 | :type sequence: sequence or iter 77 | :param n: the degree of the ngrams 78 | :type n: int 79 | :param pad_left: whether the ngrams should be left-padded 80 | :type pad_left: bool 81 | :param pad_right: whether the ngrams should be right-padded 82 | :type pad_right: bool 83 | :param left_pad_symbol: the symbol to use for left padding (default is None) 84 | :type left_pad_symbol: any 85 | :param right_pad_symbol: the symbol to use for right padding (default is None) 86 | :type right_pad_symbol: any 87 | :rtype: sequence or iter 88 | """ 89 | sequence = pad_sequence( 90 | sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol 91 | ) 92 | 93 | history = [] 94 | while n > 1: 95 | # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator 96 | try: 97 | next_item = next(sequence) 98 | except StopIteration: 99 | # no more data, terminate the generator 100 | return 101 | history.append(next_item) 102 | n -= 1 103 | for item in sequence: 104 | history.append(item) 105 | yield tuple(history) 106 | del history[0] -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/bleu.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams upto a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | methods. 35 | 36 | Returns: 37 | The Counter containing all n-grams upto max_order in segment 38 | with a count of how many times each n-gram occurred. 39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 
63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. / ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/stopwords.txt: -------------------------------------------------------------------------------- 1 | about 2 | above 3 | abroad 4 | according 5 | accordingly 6 | across 7 | actually 8 | adj 9 | after 10 | afterwards 11 | again 12 | against 13 | ago 14 | ahead 15 | ain't 16 | all 17 | almost 18 | alone 19 | along 20 | alongside 21 | already 22 | also 23 | although 24 | always 25 | am 26 | amid 27 | amidst 28 | among 29 | amongst 30 | an 31 | and 32 | any 33 | anybody 34 | anyhow 35 | anyone 36 | anything 37 | anyway 38 | anyways 39 | anywhere 40 | apart 41 | appear 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | a's 48 | aside 49 | associated 50 | at 51 | available 52 | away 53 | awfully 54 | back 55 | backward 56 | backwards 57 | be 58 | became 59 | because 60 | become 61 | becomes 62 | becoming 63 | been 64 | before 65 | beforehand 66 | begin 67 | behind 68 | being 69 | believe 70 | below 71 | 
best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | came 80 | can 81 | cannot 82 | cant 83 | can't 84 | caption 85 | cause 86 | causes 87 | certain 88 | certainly 89 | clearly 90 | c'mon 91 | co 92 | co. 93 | com 94 | come 95 | comes 96 | concerning 97 | consequently 98 | contain 99 | containing 100 | contains 101 | corresponding 102 | could 103 | couldn't 104 | course 105 | c's 106 | dare 107 | daren't 108 | definitely 109 | described 110 | despite 111 | did 112 | didn't 113 | different 114 | directly 115 | do 116 | does 117 | doesn't 118 | doing 119 | don't 120 | down 121 | downwards 122 | during 123 | each 124 | edu 125 | eg 126 | eight 127 | eighty 128 | either 129 | elsewhere 130 | end 131 | ending 132 | entirely 133 | et 134 | etc 135 | even 136 | ever 137 | evermore 138 | every 139 | everybody 140 | everyone 141 | everything 142 | everywhere 143 | ex 144 | exactly 145 | except 146 | fairly 147 | far 148 | farther 149 | few 150 | fewer 151 | fifth 152 | first 153 | five 154 | followed 155 | following 156 | follows 157 | for 158 | forever 159 | former 160 | formerly 161 | forth 162 | forward 163 | found 164 | four 165 | from 166 | further 167 | furthermore 168 | get 169 | gets 170 | getting 171 | given 172 | gives 173 | go 174 | goes 175 | going 176 | gone 177 | got 178 | gotten 179 | greetings 180 | had 181 | hadn't 182 | half 183 | happens 184 | hardly 185 | has 186 | hasn't 187 | have 188 | haven't 189 | having 190 | he 191 | he'd 192 | he'll 193 | hello 194 | help 195 | hence 196 | her 197 | here 198 | hereafter 199 | hereby 200 | herein 201 | here's 202 | hereupon 203 | hers 204 | herself 205 | he's 206 | hi 207 | him 208 | himself 209 | his 210 | hither 211 | hopefully 212 | how 213 | howbeit 214 | however 215 | hundred 216 | i'd 217 | ie 218 | if 219 | ignored 220 | i'll 221 | i'm 222 | immediate 223 | in 224 | inasmuch 225 | inc 226 | inc. 
227 | indeed 228 | indicate 229 | indicated 230 | indicates 231 | inner 232 | inside 233 | insofar 234 | into 235 | inward 236 | is 237 | isn't 238 | it 239 | it'd 240 | it'll 241 | its 242 | it's 243 | itself 244 | i've 245 | just 246 | k 247 | keep 248 | keeps 249 | kept 250 | know 251 | known 252 | knows 253 | last 254 | lately 255 | later 256 | latter 257 | latterly 258 | least 259 | less 260 | lest 261 | let 262 | let's 263 | like 264 | liked 265 | likely 266 | likewise 267 | little 268 | look 269 | looking 270 | looks 271 | low 272 | lower 273 | ltd 274 | made 275 | mainly 276 | make 277 | makes 278 | many 279 | may 280 | maybe 281 | mayn't 282 | me 283 | mean 284 | meantime 285 | meanwhile 286 | merely 287 | might 288 | mightn't 289 | mine 290 | minus 291 | moreover 292 | most 293 | mostly 294 | mr 295 | mrs 296 | much 297 | must 298 | mustn't 299 | my 300 | myself 301 | name 302 | namely 303 | nd 304 | near 305 | nearly 306 | needn't 307 | neither 308 | never 309 | neverf 310 | neverless 311 | nevertheless 312 | next 313 | nine 314 | ninety 315 | no 316 | nobody 317 | non 318 | none 319 | nonetheless 320 | noone 321 | no-one 322 | nor 323 | normally 324 | not 325 | nothing 326 | notwithstanding 327 | novel 328 | now 329 | nowhere 330 | obviously 331 | of 332 | off 333 | often 334 | oh 335 | ok 336 | okay 337 | old 338 | on 339 | once 340 | one 341 | ones 342 | one's 343 | only 344 | onto 345 | opposite 346 | or 347 | otherwise 348 | ought 349 | oughtn't 350 | our 351 | ours 352 | ourselves 353 | out 354 | outside 355 | over 356 | overall 357 | own 358 | particular 359 | particularly 360 | past 361 | per 362 | perhaps 363 | placed 364 | plus 365 | possible 366 | presumably 367 | probably 368 | que 369 | quite 370 | qv 371 | rather 372 | rd 373 | re 374 | really 375 | reasonably 376 | recent 377 | recently 378 | regarding 379 | regardless 380 | regards 381 | relatively 382 | respectively 383 | right 384 | round 385 | said 386 | same 387 | saw 388 | say 389 | saying 390 | says 391 | second 392 | secondly 393 | see 394 | seeing 395 | seem 396 | seemed 397 | seeming 398 | seems 399 | seen 400 | self 401 | selves 402 | sensible 403 | sent 404 | seven 405 | several 406 | shall 407 | shan't 408 | she 409 | she'd 410 | she'll 411 | she's 412 | should 413 | shouldn't 414 | since 415 | six 416 | so 417 | somebody 418 | someday 419 | somehow 420 | someone 421 | something 422 | sometime 423 | sometimes 424 | somewhat 425 | somewhere 426 | sorry 427 | specified 428 | specify 429 | specifying 430 | still 431 | sub 432 | such 433 | sup 434 | sure 435 | take 436 | taken 437 | taking 438 | tell 439 | tends 440 | th 441 | than 442 | that 443 | that'll 444 | thats 445 | that's 446 | that've 447 | the 448 | their 449 | theirs 450 | them 451 | themselves 452 | then 453 | thence 454 | there 455 | thereafter 456 | thereby 457 | there'd 458 | therefore 459 | therein 460 | there'll 461 | there're 462 | theres 463 | there's 464 | thereupon 465 | there've 466 | these 467 | they 468 | they'd 469 | they'll 470 | they're 471 | they've 472 | thing 473 | things 474 | think 475 | third 476 | thirty 477 | this 478 | thorough 479 | thoroughly 480 | those 481 | though 482 | three 483 | through 484 | throughout 485 | thru 486 | thus 487 | till 488 | to 489 | together 490 | too 491 | took 492 | toward 493 | towards 494 | tried 495 | tries 496 | truly 497 | try 498 | trying 499 | t's 500 | twice 501 | two 502 | un 503 | under 504 | underneath 505 | undoing 506 | unfortunately 507 | unless 508 | unlike 509 | unlikely 510 
| until 511 | unto 512 | up 513 | upon 514 | upwards 515 | us 516 | use 517 | used 518 | uses 519 | using 520 | usually 521 | v 522 | value 523 | various 524 | versus 525 | very 526 | via 527 | viz 528 | vs 529 | was 530 | wasn't 531 | way 532 | we 533 | we'd 534 | well 535 | we'll 536 | went 537 | were 538 | we're 539 | weren't 540 | we've 541 | what 542 | whatever 543 | what'll 544 | what's 545 | what've 546 | when 547 | whence 548 | whenever 549 | where 550 | whereafter 551 | whereas 552 | whereby 553 | wherein 554 | where's 555 | whereupon 556 | wherever 557 | whether 558 | which 559 | whichever 560 | while 561 | whilst 562 | whither 563 | who 564 | who'd 565 | whoever 566 | whole 567 | who'll 568 | whom 569 | whomever 570 | who's 571 | whose 572 | why 573 | will 574 | with 575 | within 576 | without 577 | wonder 578 | won't 579 | would 580 | wouldn't 581 | yes 582 | yet 583 | you 584 | you'd 585 | you'll 586 | your 587 | you're 588 | yours 589 | yourself 590 | yourselves 591 | you've 592 | zer 593 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_infer_msg.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import CommentGenDataset, SimpleGenDataset 19 | from evaluator.smooth_bleu import bleu_fromstr 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleGenDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = CommentGenDataset(tokenizer, pool, args, data_file) 38 | sampler = SequentialSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_bleu(args, eval_dataloader, model, tokenizer): 45 | logger.info(f" ***** Running bleu evaluation on {args.eval_file} *****") 46 | logger.info(" Batch size = %d", args.eval_batch_size) 47 | model.eval() 48 | if hasattr(model, "module"): 49 | model = model.module 50 | pred_ids, ex_ids = [], [] 51 | for step, examples in tqdm(enumerate(eval_dataloader, 1)): 52 | source_ids = torch.tensor( 53 | [ex.source_ids for ex in examples], dtype=torch.long 54 | ).to(args.local_rank) 55 | ids = [ex.example_id for ex in examples] 56 | source_mask = source_ids.ne(tokenizer.pad_id) 57 | preds = model.generate(source_ids, 58 | attention_mask=source_mask, 59 | use_cache=True, 60 | num_beams=args.beam_size, 61 | early_stopping=True, 62 | 
max_length=args.max_target_length) 63 | top_preds = list(preds.cpu().numpy()) 64 | pred_ids.extend(top_preds) 65 | if args.break_cnt > 0 and len(pred_ids) >= args.break_cnt: 66 | break 67 | # [2:] to remove the beginning '<s>' '<msg>' special tokens 68 | pred_nls = [tokenizer.decode(id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in pred_ids] 69 | valid_file = args.eval_file 70 | out_file = args.out_file 71 | outdics = [] 72 | golds = [] 73 | with open(valid_file, "r") as f: 74 | for line in f: 75 | outdics.append(json.loads(line)) 76 | golds.append(outdics[-1]["msg"]) 77 | outdics = outdics[:len(pred_nls)] 78 | golds = golds[:len(pred_nls)] 79 | with open(os.path.join(args.model_name_or_path, "preds.txt"), "w", encoding="utf-8") as f: 80 | for pred in pred_nls: 81 | f.write(pred.strip() + "\n") 82 | with open(os.path.join(args.model_name_or_path, "golds.txt"), "w", encoding="utf-8") as f: 83 | for gold in golds: 84 | f.write(gold.strip() + "\n") 85 | with open(out_file, "w", encoding="utf-8") as f: 86 | for i, outdic in enumerate(outdics): 87 | outdic["gen"] = pred_nls[i] 88 | f.write(json.dumps(outdic) + "\n") 89 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 90 | return bleu 91 | 92 | 93 | def main(args): 94 | dist.init_process_group(backend="nccl") 95 | local_rank = dist.get_rank() % args.gpu_per_node 96 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 97 | args.local_rank = local_rank 98 | args.world_size = dist.get_world_size() 99 | logger.warning("Process rank: %s, global rank: %s, world size: %s, bs: %s", 100 | args.local_rank, args.global_rank, \ 101 | torch.distributed.get_world_size(), \ 102 | args.eval_batch_size) 103 | torch.cuda.set_device(local_rank) 104 | 105 | set_seed(args) 106 | config, model, tokenizer = build_or_load_gen_model(args) 107 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 108 | pool = multiprocessing.Pool(args.cpu_count) 109 | data_file = args.eval_file 110 | set_seed(args) 111 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 112 | model.eval() 113 | bleu = eval_epoch_bleu(args, dataloader, model, tokenizer) 114 | logger.warning(f"BLEU: {bleu}") 115 | 116 | if __name__ == "__main__": 117 | parser = argparse.ArgumentParser() 118 | args = add_args(parser) 119 | args.cpu_count = multiprocessing.cpu_count() 120 | # remove long tokenization warning.
ref: https://github.com/huggingface/transformers/issues/991 121 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 122 | logger.info(args) 123 | main(args) 124 | logger.info("Test finished.") 125 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 126 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_test_cls.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import CommentClsDataset, SimpleClsDataset 19 | from sklearn.metrics import classification_report 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleClsDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = CommentClsDataset(tokenizer, pool, args, data_file) 38 | sampler = RandomSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_acc(args, eval_dataloader, model, tokenizer): 45 | # Start evaluating model 46 | logger.info(" " + "***** Running acc evaluation *****") 47 | logger.info(" Batch size = %d", args.eval_batch_size) 48 | 49 | model.eval() 50 | local_rank = 0 51 | pred, gold = [], [] 52 | with torch.no_grad(): 53 | for step, examples in enumerate(tqdm(eval_dataloader), 1): 54 | if step == 1: 55 | ex = examples[0] 56 | logger.info(f"batch size: {len(examples)}") 57 | logger.info(f"example source: {tokenizer.convert_ids_to_tokens(ex.source_ids)}") 58 | logger.info(f"example target: {ex.y}") 59 | source_ids = torch.tensor( 60 | [ex.source_ids for ex in examples], dtype=torch.long 61 | ).to(local_rank) 62 | source_mask = source_ids.ne(tokenizer.pad_id) 63 | logits = model( 64 | cls=True, 65 | input_ids=source_ids, 66 | labels=None, 67 | attention_mask=source_mask 68 | ) 69 | prediction = torch.argmax(logits, dim=-1).cpu().numpy() 70 | pred.extend(prediction) 71 | gold.extend([ex.y for ex in examples]) 72 | logger.info("\n" + classification_report(gold, pred, digits=4)) 73 | logger.info(f"Target positive percentage: {sum(gold) / len(gold)}") 74 | return 75 | 76 | 77 | def main(args): 78 | dist.init_process_group(backend="nccl") 79 | local_rank = dist.get_rank() % args.gpu_per_node 80 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 81 | args.local_rank = local_rank 82 | args.world_size = dist.get_world_size() 83 | logger.warning("Process 
rank: %s, global rank: %s, world size: %s, bs: %s", 84 | args.local_rank, args.global_rank, \ 85 | torch.distributed.get_world_size(), \ 86 | args.eval_batch_size) 87 | torch.cuda.set_device(local_rank) 88 | 89 | set_seed(args) 90 | config, model, tokenizer = build_or_load_gen_model(args) 91 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 92 | pool = multiprocessing.Pool(args.cpu_count) 93 | data_file = args.eval_file 94 | set_seed(args) 95 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 96 | model.eval() 97 | eval_epoch_acc(args, dataloader, model, tokenizer) 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser() 101 | args = add_args(parser) 102 | args.cpu_count = multiprocessing.cpu_count() 103 | # remove long tokenization warning. ref: https://github.com/huggingface/transformers/issues/991 104 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 105 | logger.info(args) 106 | main(args) 107 | logger.info("Test finished.") 108 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 109 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_test_msg.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import CommentGenDataset, SimpleGenDataset 19 | from evaluator.smooth_bleu import bleu_fromstr 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleGenDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = CommentGenDataset(tokenizer, pool, args, data_file) 38 | sampler = SequentialSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_bleu(args, eval_dataloader, model, tokenizer): 45 | logger.info(f" ***** Running bleu evaluation on {args.eval_file} *****") 46 | logger.info(" Batch size = %d", args.eval_batch_size) 47 | model.eval() 48 | if hasattr(model, "module"): 49 | model = model.module 50 | pred_ids, ex_ids = [], [] 51 | for step, examples in tqdm(enumerate(eval_dataloader, 1)): 52 | source_ids = torch.tensor( 53 | [ex.source_ids for ex in examples], dtype=torch.long 54 | ).to(args.local_rank) 55 | ids = [ex.example_id for ex in examples] 56 | source_mask =
source_ids.ne(tokenizer.pad_id) 57 | preds = model.generate(source_ids, 58 | attention_mask=source_mask, 59 | use_cache=True, 60 | num_beams=args.beam_size, 61 | early_stopping=True, 62 | max_length=args.max_target_length) 63 | top_preds = list(preds.cpu().numpy()) 64 | pred_ids.extend(top_preds) 65 | pred_nls = [tokenizer.decode(id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in pred_ids] 66 | valid_file = args.eval_file 67 | golds = [] 68 | with open(valid_file, "r") as f: 69 | for line in f: 70 | golds.append(json.loads(line)["msg"]) 71 | golds = golds[:len(pred_nls)] 72 | with open(os.path.join(args.model_name_or_path, "preds.txt"), "w", encoding="utf-8") as f: 73 | for pred in pred_nls: 74 | f.write(pred.strip() + "\n") 75 | with open(os.path.join(args.model_name_or_path, "golds.txt"), "w", encoding="utf-8") as f: 76 | for gold in golds: 77 | f.write(gold.strip() + "\n") 78 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 79 | logger.warning(f"WithStop BLEU: {bleu}") 80 | bleu = bleu_fromstr(pred_nls, golds, rmstop=True) 81 | return bleu 82 | 83 | 84 | def main(args): 85 | dist.init_process_group(backend="nccl") 86 | local_rank = dist.get_rank() % args.gpu_per_node 87 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 88 | args.local_rank = local_rank 89 | args.world_size = dist.get_world_size() 90 | logger.warning("Process rank: %s, global rank: %s, world size: %s, bs: %s", 91 | args.local_rank, args.global_rank, \ 92 | torch.distributed.get_world_size(), \ 93 | args.eval_batch_size) 94 | torch.cuda.set_device(local_rank) 95 | 96 | set_seed(args) 97 | config, model, tokenizer = build_or_load_gen_model(args) 98 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 99 | pool = multiprocessing.Pool(args.cpu_count) 100 | data_file = args.eval_file 101 | set_seed(args) 102 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 103 | model.eval() 104 | bleu = eval_epoch_bleu(args, dataloader, model, tokenizer) 105 | logger.warning(f"BLEU: {bleu}") 106 | 107 | if __name__ == "__main__": 108 | parser = argparse.ArgumentParser() 109 | args = add_args(parser) 110 | args.cpu_count = multiprocessing.cpu_count() 111 | # remove long tokenization warning.
ref: https://github.com/huggingface/transformers/issues/991 112 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 113 | logger.info(args) 114 | main(args) 115 | logger.info("Test finished.") 116 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 117 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_test_ref.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import RefineDataset, SimpleRefineDataset 19 | from evaluator.smooth_bleu import bleu_fromstr 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleRefineDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = RefineDataset(tokenizer, pool, args, data_file) 38 | sampler = SequentialSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_bleu(args, eval_dataloader, model, tokenizer): 45 | logger.info(f" ***** Running bleu evaluation on {args.eval_file} *****") 46 | logger.info(" Batch size = %d", args.eval_batch_size) 47 | model.eval() 48 | if hasattr(model, "module"): 49 | model = model.module 50 | pred_ids, ex_ids = [], [] 51 | for step, examples in tqdm(enumerate(eval_dataloader, 1)): 52 | source_ids = torch.tensor( 53 | [ex.source_ids for ex in examples], dtype=torch.long 54 | ).to(args.local_rank) 55 | source_mask = source_ids.ne(tokenizer.pad_id) 56 | preds = model.generate(source_ids, 57 | attention_mask=source_mask, 58 | use_cache=True, 59 | num_beams=args.beam_size, 60 | early_stopping=True, 61 | max_length=args.max_target_length) 62 | top_preds = list(preds.cpu().numpy()) 63 | pred_ids.extend(top_preds) 64 | pred_nls = [tokenizer.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in pred_ids] 65 | valid_file = args.eval_file 66 | golds = [] 67 | with open(valid_file, "r") as f: 68 | for line in f: 69 | golds.append(json.loads(line)["new"]) 70 | golds = golds[:len(pred_nls)] 71 | if args.raw_input: 72 | datasetClass = SimpleRefineDataset 73 | else: 74 | datasetClass = RefineDataset 75 | for i in range(len(golds)): 76 | pred_nls[i], golds[i] = datasetClass.process_pred_gold(pred_nls[i], golds[i]) 77 | with open(os.path.join(args.model_name_or_path, "preds.txt"), "w", encoding="utf-8") as f: 78 | for pred in pred_nls: 79 | 
f.write(pred.strip() + "\n") 80 | with open(os.path.join(args.model_name_or_path, "golds.txt"), "w", encoding="utf-8") as f: 81 | for gold in golds: 82 | f.write(gold.strip() + "\n") 83 | em = 0 84 | for pred, gold in zip(pred_nls, golds): 85 | if " ".join(pred.split()) == " ".join(gold.split()): 86 | em += 1 87 | em = em / len(golds) 88 | logger.warning(f"EM: {em}") 89 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 90 | return bleu 91 | 92 | 93 | def main(args): 94 | dist.init_process_group(backend="nccl") 95 | local_rank = dist.get_rank() % args.gpu_per_node 96 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 97 | args.local_rank = local_rank 98 | args.world_size = dist.get_world_size() 99 | logger.warning("Process rank: %s, global rank: %s, world size: %s, bs: %s", 100 | args.local_rank, args.global_rank, \ 101 | torch.distributed.get_world_size(), \ 102 | args.eval_batch_size) 103 | torch.cuda.set_device(local_rank) 104 | 105 | set_seed(args) 106 | config, model, tokenizer = build_or_load_gen_model(args) 107 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 108 | pool = multiprocessing.Pool(args.cpu_count) 109 | data_file = args.eval_file 110 | set_seed(args) 111 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 112 | model.eval() 113 | bleu = eval_epoch_bleu(args, dataloader, model, tokenizer) 114 | logger.warning(f"BLEU: {bleu}") 115 | 116 | if __name__ == "__main__": 117 | parser = argparse.ArgumentParser() 118 | args = add_args(parser) 119 | args.cpu_count = multiprocessing.cpu_count() 120 | # remove long tokenization warning. ref: https://github.com/huggingface/transformers/issues/991 121 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 122 | logger.info(args) 123 | main(args) 124 | logger.info("Test finished.") 125 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 126 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/finetune-cls.sh: -------------------------------------------------------------------------------- 1 | # batch size 12 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | # You may change the following block for multiple gpu training 6 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 7 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 8 | RANK=0 && echo RANK: ${RANK} 9 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 10 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 11 | NODES=1 && echo NODES: ${NODES} 12 | NCCL_DEBUG=INFO 13 | 14 | bash test_nltk.sh 15 | 16 | 17 | # Change the arguments as required: 18 | # model_name_or_path, load_model_path: the path of the model to be finetuned 19 | # eval_file: the path of the evaluation data 20 | # output_dir: the directory to save finetuned model (not used at infer/test time) 21 | # out_file: the path of the output file 22 | # train_filename: can be a directory containing files named with "train*.jsonl" 23 | 24 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_cls.py \ 25 | --train_epochs 30 \ 26 | --model_name_or_path microsoft/codereviewer \ 27 | --output_dir ../../save/cls \ 28 | --train_filename ../../dataset/Diff_Quality_Estimation \ 29 | --dev_filename
../../dataset/Diff_Quality_Estimation/cls-valid.jsonl \ 30 | --max_source_length 512 \ 31 | --max_target_length 128 \ 32 | --train_batch_size 12 \ 33 | --learning_rate 3e-4 \ 34 | --gradient_accumulation_steps 3 \ 35 | --mask_rate 0.15 \ 36 | --save_steps 3600 \ 37 | --log_steps 100 \ 38 | --train_steps 120000 \ 39 | --gpu_per_node=${PER_NODE_GPU} \ 40 | --node_index=${RANK} \ 41 | --seed 2233 42 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/finetune-msg.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | # You may change the following block for multiple gpu training 6 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 7 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 8 | RANK=0 && echo RANK: ${RANK} 9 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 10 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 11 | NODES=1 && echo NODES: ${NODES} 12 | NCCL_DEBUG=INFO 13 | 14 | bash test_nltk.sh 15 | 16 | 17 | # Change the arguments as required: 18 | # model_name_or_path, load_model_path: the path of the model to be finetuned 19 | # eval_file: the path of the evaluation data 20 | # output_dir: the directory to save finetuned model (not used at infer/test time) 21 | # out_file: the path of the output file 22 | # train_filename: can be a directory containing files named with "train*.jsonl" 23 | # raw_input: to select the preprocess method, set to True in this task 24 | 25 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_msg.py \ 26 | --train_epochs 30 \ 27 | --model_name_or_path microsoft/codereviewer \ 28 | --output_dir ../../save/gen \ 29 | --train_filename ../../dataset/gen-train.jsonl \ 30 | --dev_filename ../../dataset/gen-valid.jsonl \ 31 | --max_source_length 512 \ 32 | --max_target_length 128 \ 33 | --train_batch_size 6 \ 34 | --learning_rate 3e-4 \ 35 | --gradient_accumulation_steps 3 \ 36 | --mask_rate 0.15 \ 37 | --save_steps 1800 \ 38 | --log_steps 100 \ 39 | --train_steps 60000 \ 40 | --gpu_per_node=${PER_NODE_GPU} \ 41 | --node_index=${RANK} \ 42 | --seed 2233 \ 43 | --raw_input -------------------------------------------------------------------------------- /CodeReviewer/code/sh/finetune-ref.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | # You may change the following block for multiple gpu training 6 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 7 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 8 | RANK=0 && echo RANK: ${RANK} 9 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 10 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 11 | NODES=1 && echo NODES: ${NODES} 12 | NCCL_DEBUG=INFO 13 | 14 | bash test_nltk.sh 15 | 16 | 17 | # Change the arguments as required: 18 | # model_name_or_path, load_model_path: the path of the model to be finetuned 19 | # eval_file: the path of the evaluation data 20 | # output_dir: the directory to save finetuned model (not used at infer/test time) 21 | # out_file: the path of the output file 22 | # train_filename: can be a directory containing files named with "train*.jsonl" 23 | 24 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES}
--master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_ref.py \ 25 | --train_epochs 30 \ 26 | --model_name_or_path microsoft/codereviewer \ 27 | --output_dir ../../save/ref \ 28 | --train_filename ../../data/ref-train.jsonl \ 29 | --dev_filename ../../data/ref-valid.jsonl \ 30 | --max_source_length 200 \ 31 | --max_target_length 200 \ 32 | --train_batch_size 6 \ 33 | --learning_rate 3e-4 \ 34 | --gradient_accumulation_steps 3 \ 35 | --mask_rate 0.15 \ 36 | --save_steps 1800 \ 37 | --log_steps 100 \ 38 | --train_steps 60000 \ 39 | --gpu_per_node=${PER_NODE_GPU} \ 40 | --node_index=${RANK} \ 41 | --seed 2233 \ 42 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/infer-json.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 6 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 7 | RANK=0 && echo RANK: ${RANK} 8 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 9 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 10 | NODES=1 && echo NODES: ${NODES} 11 | NCCL_DEBUG=INFO 12 | 13 | # change break_cnt to truncate the number of examples (useful at debug time maybe) 14 | # --break_cnt -1 \ will keep the whole dataset 15 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_infer_msg.py \ 16 | --model_name_or_path microsoft/codereviewer \ 17 | --output_dir ../../save/gen \ 18 | --load_model_path ../../save/gen/checkpoint \ 19 | --output_dir empty \ 20 | --eval_file test.jsonl \ 21 | --out_file test_out.jsonl \ 22 | --max_source_length 512 \ 23 | --max_target_length 128 \ 24 | --eval_batch_size 12 \ 25 | --beam_size 10 \ 26 | --gpu_per_node=${PER_NODE_GPU} \ 27 | --node_index=${RANK} \ 28 | --seed 2233 \ 29 | --raw_input \ 30 | --break_cnt 20 31 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test-cls.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 6 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 7 | RANK=0 && echo RANK: ${RANK} 8 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 9 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 10 | NODES=1 && echo NODES: ${NODES} 11 | NCCL_DEBUG=INFO 12 | 13 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_test_cls.py \ 14 | --model_name_or_path microsoft/codereviewer \ 15 | --output_dir ../../save/gen \ 16 | --load_model_path ../../save/gen/checkpoint \ 17 | --output_dir empty \ 18 | --eval_file cls-test.jsonl \ 19 | --max_source_length 512 \ 20 | --max_target_length 128 \ 21 | --eval_batch_size 16 \ 22 | --mask_rate 0.15 \ 23 | --save_steps 4000 \ 24 | --log_steps 100 \ 25 | --train_steps 120000 \ 26 | --gpu_per_node=${PER_NODE_GPU} \ 27 | --node_index=${RANK} \ 28 | --seed 2233 -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test-msg.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 
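# Runs distributed BLEU evaluation of a fine-tuned review-comment generation checkpoint via run_test_msg.py.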
5 | 6 | # You may change the following block for multiple gpu training 7 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 8 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 9 | RANK=0 && echo RANK: ${RANK} 10 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 11 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 12 | NODES=1 && echo NODES: ${NODES} 13 | NCCL_DEBUG=INFO 14 | 15 | 16 | bash test_nltk.sh 17 | 18 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_test_msg.py \ 19 | --model_name_or_path microsoft/codereviewer \ 20 | --output_dir ../../save/gen \ 21 | --load_model_path ../../save/gen/checkpoint \ 22 | --output_dir empty \ 23 | --eval_file ref-test.jsonl \ 24 | --max_source_length 512 \ 25 | --max_target_length 128 \ 26 | --eval_batch_size 12 \ 27 | --mask_rate 0.15 \ 28 | --save_steps 1800 \ 29 | --beam_size 10 \ 30 | --log_steps 100 \ 31 | --train_steps 120000 \ 32 | --gpu_per_node=${PER_NODE_GPU} \ 33 | --node_index=${RANK} \ 34 | --seed 2233 \ 35 | --raw_input 36 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test-ref.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 6 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 7 | RANK=0 && echo RANK: ${RANK} 8 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 9 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 10 | NODES=1 && echo NODES: ${NODES} 11 | NCCL_DEBUG=INFO 12 | 13 | bash test_nltk.sh 14 | 15 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_test_ref.py \ 16 | --model_name_or_path microsoft/codereviewer \ 17 | --output_dir ../../save/gen \ 18 | --load_model_path ../../save/gen/checkpoint \ 19 | --output_dir empty \ 20 | --eval_file ref-test.jsonl \ 21 | --max_source_length 200 \ 22 | --max_target_length 200 \ 23 | --eval_batch_size 12 \ 24 | --mask_rate 0.15 \ 25 | --save_steps 1800 \ 26 | --beam_size 10 \ 27 | --log_steps 100 \ 28 | --train_steps 120000 \ 29 | --gpu_per_node=${PER_NODE_GPU} \ 30 | --node_index=${RANK} \ 31 | --seed 2233 \ 32 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test_nltk.sh: -------------------------------------------------------------------------------- 1 | echo -e "import nltk\nnltk.download('punkt')" > ttmp.py 2 | python ttmp.py 3 | rm ttmp.py -------------------------------------------------------------------------------- /CodeReviewer/code/test_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from configs import add_args 4 | from models import ReviewerModel, build_or_load_gen_model 5 | 6 | MAX_SOURCE_LENGTH=512 7 | 8 | def pad_assert(tokenizer, source_ids): 9 | source_ids = source_ids[:MAX_SOURCE_LENGTH - 2] 10 | source_ids = [tokenizer.bos_id] + source_ids + [tokenizer.eos_id] 11 | pad_len = MAX_SOURCE_LENGTH - len(source_ids) 12 | source_ids += [tokenizer.pad_id] * pad_len 13 | assert len(source_ids) == MAX_SOURCE_LENGTH, "Not equal length." 
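# The assert above guarantees every example is exactly MAX_SOURCE_LENGTH tokens (bos + truncated ids + eos + padding), so inputs can be batched without further padding.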
14 | return source_ids 15 | 16 | def encode_diff(tokenizer, diff): 17 | difflines = diff.split("\n")[1:] # remove start @@ 18 | difflines = [line for line in difflines if len(line.strip()) > 0] 19 | map_dic = {"-": 0, "+": 1, " ": 2} 20 | def f(s): 21 | if s in map_dic: 22 | return map_dic[s] 23 | else: 24 | return 2 25 | labels = [f(line[0]) for line in difflines] 26 | difflines = [line[1:].strip() for line in difflines] 27 | inputstr = "" 28 | for label, line in zip(labels, difflines): 29 | if label == 1: 30 | inputstr += "<add>" + line 31 | elif label == 0: 32 | inputstr += "<del>" + line 33 | else: 34 | inputstr += "<keep>" + line 35 | source_ids = tokenizer.encode(inputstr, max_length=MAX_SOURCE_LENGTH, truncation=True)[1:-1] 36 | source_ids = pad_assert(tokenizer, source_ids) 37 | return source_ids 38 | 39 | parser = argparse.ArgumentParser() 40 | args = add_args(parser) 41 | args.model_name_or_path = "microsoft/codereviewer" 42 | config, model, tokenizer = build_or_load_gen_model(args) 43 | model.to("cuda") 44 | model.eval() 45 | code_diff = """@@ -11,6 +11,8 @@\n \n invoiceDtoCopy.setState(InvoiceState.OPEN);\n _invoiceAggregateRepository.updateInvoiceState(invoiceCopy, InvoiceState.OPEN);\n+ _erpIntegrationService.createAndSendInvoiceEvent(invoiceCopy);\n+\n }\n }\n \n""" 46 | 47 | inputs = torch.tensor([encode_diff(tokenizer, code_diff)], dtype=torch.long).to("cuda") 48 | inputs_mask = inputs.ne(tokenizer.pad_id) 49 | preds = model.generate(inputs, 50 | attention_mask=inputs_mask, 51 | use_cache=True, 52 | num_beams=5, 53 | early_stopping=True, 54 | max_length=100, 55 | num_return_sequences=2 56 | ) 57 | preds = list(preds.cpu().numpy()) 58 | pred_nls = [tokenizer.decode(id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in preds] 59 | print(pred_nls[0]) 60 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection 2 | 3 | ## Task Definition 4 | 5 | Given two code snippets as input, the task is binary classification (0/1), where 1 denotes semantic equivalence and 0 otherwise. Models are evaluated by F1 score. 6 | 7 | ## Updates 8 | 9 | 2021-9-13: We have updated the evaluator script. Since this is binary classification, we use the binary F1 score instead of the "macro" F1 score. 10 | 11 | ## Dataset 12 | 13 | The dataset we use is [BigCloneBench](https://www.cs.usask.ca/faculty/croy/papers/2014/SvajlenkoICSME2014BigERA.pdf), filtered following the paper [Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree](https://arxiv.org/pdf/2002.08653.pdf). 14 | 15 | ### Data Format 16 | 17 | 1. dataset/data.jsonl is stored in jsonlines format. Each line in the uncompressed file represents one function. One row is illustrated below. 18 | 19 | - **func:** the function 20 | 21 | - **idx:** index of the example 22 | 23 | 2. train.txt/valid.txt/test.txt provide examples, stored in the following format: idx1 idx2 label 24 | 25 | ### Data Statistics 26 | 27 | Data statistics of the dataset are shown in the table below: 28 | 29 | | | #Examples | 30 | | ----- | :-------: | 31 | | Train | 901,028 | 32 | | Dev | 415,416 | 33 | | Test | 415,416 | 34 | 35 | You can get data using the following command.
36 | 37 | ``` 38 | unzip dataset.zip 39 | ``` 40 | 41 | ## Evaluator 42 | 43 | We provide a script that evaluates predictions for this task and reports the F1 score. 44 | 45 | ### Example 46 | 47 | ```bash 48 | python evaluator/evaluator.py -a evaluator/answers.txt -p evaluator/predictions.txt 49 | ``` 50 | 51 | {'Recall': 0.25, 'Precision': 0.5, 'F1': 0.3333333333333333} 52 | 53 | ### Input predictions 54 | 55 | A predictions file in TXT format, such as evaluator/predictions.txt. For example: 56 | 57 | ``` 58 | 13653451 21955002 0 59 | 1188160 8831513 1 60 | 1141235 14322332 0 61 | 16765164 17526811 1 62 | ``` 63 | 64 | ## Pipeline-GraphCodeBERT 65 | 66 | We also provide a pipeline that fine-tunes GraphCodeBERT on this task. 67 | ### Dependency 68 | 69 | - pip install torch 70 | - pip install transformers 71 | - pip install tree_sitter 72 | - pip install scikit-learn 73 | 74 | ### Tree-sitter (optional) 75 | 76 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following command: 77 | 78 | ```shell 79 | cd parser 80 | bash build.sh 81 | cd .. 82 | ``` 83 | 84 | ### Fine-tune 85 | 86 | We use 4 V100-16G GPUs for fine-tuning and 10% of the validation data for evaluation. 87 | 88 | 89 | ```shell 90 | mkdir saved_models 91 | python run.py \ 92 | --output_dir=saved_models \ 93 | --config_name=microsoft/graphcodebert-base \ 94 | --model_name_or_path=microsoft/graphcodebert-base \ 95 | --tokenizer_name=microsoft/graphcodebert-base \ 96 | --do_train \ 97 | --train_data_file=dataset/train.txt \ 98 | --eval_data_file=dataset/valid.txt \ 99 | --test_data_file=dataset/test.txt \ 100 | --epoch 1 \ 101 | --code_length 512 \ 102 | --data_flow_length 128 \ 103 | --train_batch_size 16 \ 104 | --eval_batch_size 32 \ 105 | --learning_rate 2e-5 \ 106 | --max_grad_norm 1.0 \ 107 | --evaluate_during_training \ 108 | --seed 123456 2>&1| tee saved_models/train.log 109 | ``` 110 | 111 | ### Inference 112 | 113 | We use the full test data for inference.
114 | 115 | ```shell 116 | python run.py \ 117 | --output_dir=saved_models \ 118 | --config_name=microsoft/graphcodebert-base \ 119 | --model_name_or_path=microsoft/graphcodebert-base \ 120 | --tokenizer_name=microsoft/graphcodebert-base \ 121 | --do_eval \ 122 | --do_test \ 123 | --train_data_file=dataset/train.txt \ 124 | --eval_data_file=dataset/valid.txt \ 125 | --test_data_file=dataset/test.txt \ 126 | --epoch 1 \ 127 | --code_length 512 \ 128 | --data_flow_length 128 \ 129 | --train_batch_size 16 \ 130 | --eval_batch_size 32 \ 131 | --learning_rate 2e-5 \ 132 | --max_grad_norm 1.0 \ 133 | --evaluate_during_training \ 134 | --seed 123456 2>&1| tee saved_models/test.log 135 | ``` 136 | 137 | ### Evaluation 138 | 139 | ```shell 140 | python evaluator/evaluator.py -a dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log 141 | ``` 142 | 143 | ## Result 144 | 145 | The results on the test set are shown below: 146 | 147 | | Method | Precision | Recall | F1 | 148 | | ------------- | :-------: | :-------: | :-------: | 149 | | Deckard | 0.93 | 0.02 | 0.03 | 150 | | RtvNN | 0.95 | 0.01 | 0.01 | 151 | | CDLH | 0.92 | 0.74 | 0.82 | 152 | | ASTNN | 0.92 | 0.94 | 0.93 | 153 | | FA-AST-GMN | **0.96** | 0.94 | 0.95 | 154 | | CodeBERT | 0.947 | 0.934 | 0.941 | 155 | | GraphCodeBERT | 0.948 | **0.952** | **0.950** | 156 | 157 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/clonedetection/dataset.zip -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/answers.txt: -------------------------------------------------------------------------------- 1 | 13653451 21955002 0 2 | 1188160 8831513 0 3 | 1141235 14322332 0 4 | 16765164 17526811 0 -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
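# Computes binary Precision/Recall/F1 over (idx1, idx2) pairs; every pair present in the answers file must also appear in the predictions file, each line formatted as "idx1 idx2 label" (see answers.txt above).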
3 | import logging
4 | import sys
5 | from sklearn.metrics import recall_score,precision_score,f1_score
6 | 
7 | def read_answers(filename):
8 |     answers={}
9 |     with open(filename) as f:
10 |         for line in f:
11 |             line=line.strip()
12 |             idx1,idx2,label=line.split()
13 |             answers[(idx1,idx2)]=int(label)
14 |     return answers
15 | 
16 | def read_predictions(filename):
17 |     predictions={}
18 |     with open(filename) as f:
19 |         for line in f:
20 |             line=line.strip()
21 |             idx1,idx2,label=line.split()
22 |             predictions[(idx1,idx2)]=int(label)
23 |     return predictions
24 | 
25 | def calculate_scores(answers,predictions):
26 |     y_trues,y_preds=[],[]
27 |     for key in answers:
28 |         if key not in predictions:
29 |             logging.error("Missing prediction for ({},{}) pair.".format(key[0],key[1]))
30 |             sys.exit()
31 |         y_trues.append(answers[key])
32 |         y_preds.append(predictions[key])
33 |     scores={}
34 |     scores['Recall']=recall_score(y_trues, y_preds)
35 |     scores['Precision']=precision_score(y_trues, y_preds)
36 |     scores['F1']=f1_score(y_trues, y_preds)
37 |     return scores
38 | 
39 | def main():
40 |     import argparse
41 |     parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for BigCloneBench dataset.')
42 |     parser.add_argument('--answers', '-a',help="filename of the labels, in txt format.")
43 |     parser.add_argument('--predictions', '-p',help="filename of the leaderboard predictions, in txt format.")
44 | 
45 | 
46 |     args = parser.parse_args()
47 |     answers=read_answers(args.answers)
48 |     predictions=read_predictions(args.predictions)
49 |     scores=calculate_scores(answers,predictions)
50 |     print(scores)
51 | 
52 | if __name__ == '__main__':
53 |     main()
54 | 
55 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/evaluator/predictions.txt:
--------------------------------------------------------------------------------
1 | 13653451 21955002 0
2 | 1188160 8831513 1
3 | 1141235 14322332 0
4 | 16765164 17526811 1
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch
4 | from torch.autograd import Variable
5 | import copy
6 | import torch.nn.functional as F
7 | from torch.nn import CrossEntropyLoss, MSELoss
8 | 
9 | class RobertaClassificationHead(nn.Module):
10 |     """Head for sentence-level classification tasks."""
11 | 
12 |     def __init__(self, config):
13 |         super().__init__()
14 |         self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
15 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
16 |         self.out_proj = nn.Linear(config.hidden_size, 2)
17 | 
18 |     def forward(self, features, **kwargs):
19 |         x = features[:, 0, :]  # take <s> token (equiv. 
to [CLS]) 20 | x = x.reshape(-1,x.size(-1)*2) 21 | x = self.dropout(x) 22 | x = self.dense(x) 23 | x = torch.tanh(x) 24 | x = self.dropout(x) 25 | x = self.out_proj(x) 26 | return x 27 | 28 | class Model(nn.Module): 29 | def __init__(self, encoder,config,tokenizer,args): 30 | super(Model, self).__init__() 31 | self.encoder = encoder 32 | self.config=config 33 | self.tokenizer=tokenizer 34 | self.classifier=RobertaClassificationHead(config) 35 | self.args=args 36 | 37 | 38 | def forward(self, inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels=None): 39 | bs,l=inputs_ids_1.size() 40 | inputs_ids=torch.cat((inputs_ids_1.unsqueeze(1),inputs_ids_2.unsqueeze(1)),1).view(bs*2,l) 41 | position_idx=torch.cat((position_idx_1.unsqueeze(1),position_idx_2.unsqueeze(1)),1).view(bs*2,l) 42 | attn_mask=torch.cat((attn_mask_1.unsqueeze(1),attn_mask_2.unsqueeze(1)),1).view(bs*2,l,l) 43 | 44 | #embedding 45 | nodes_mask=position_idx.eq(0) 46 | token_mask=position_idx.ge(2) 47 | inputs_embeddings=self.encoder.roberta.embeddings.word_embeddings(inputs_ids) 48 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 49 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 50 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 51 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 52 | 53 | outputs = self.encoder.roberta(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx,token_type_ids=position_idx.eq(-1).long())[0] 54 | logits=self.classifier(outputs) 55 | # shape: [batch_size, num_classes] 56 | prob=F.softmax(logits, dim=-1) 57 | if labels is not None: 58 | loss_fct = CrossEntropyLoss() 59 | loss = loss_fct(logits, labels) 60 | return loss,prob 61 | else: 62 | return prob 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
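# Note: run build.sh first -- it clones the tree-sitter grammar repositories
# listed below into this directory before invoking this script.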
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/clonedetection/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp=[]
40 |         for x in out.split('\n'):
41 |             if x.strip()!="":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " " # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp=[]
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip()!="":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 | 
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
65 |         return [(root_node.start_point,root_node.end_point)]
66 |     else:
67 |         code_tokens=[]
68 |         for child in root_node.children:
69 |             code_tokens+=tree_to_token_index(child)
70 |         return code_tokens
71 | 
72 | def tree_to_variable_index(root_node,index_to_code):
73 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
74 |         index=(root_node.start_point,root_node.end_point)
75 |         _,code=index_to_code[index]
76 |         if root_node.type!=code:
77 |             return [(root_node.start_point,root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens=[]
82 |         for child in root_node.children:
83 |             code_tokens+=tree_to_variable_index(child,index_to_code)
84 |         return code_tokens
85 | 
86 | def index_to_code_token(index,code):
87 |     start_point=index[0]
88 |     end_point=index[1]
89 |     if start_point[0]==end_point[0]:
90 |         s=code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s=""
93 |         s+=code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0]+1,end_point[0]):
95 |             s+=code[i]
96 |         s+=code[end_point[0]][:end_point[1]]
97 |     return s
98 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # Code Search
4 | 
5 | ## Data Preprocess
6 | 
7 | Different from the setting of [CodeSearchNet](https://arxiv.org/abs/1909.09436), the answer to each query is retrieved from the whole development and testing code corpus instead of from 1,000 candidate codes. Besides, we observe that some queries contain content unrelated to the code, such as a link "http://..." that refers to external resources. Therefore, we filter out the following kinds of examples to improve the quality of the dataset.
8 | 
9 | - Remove comments in the code.
10 | 
11 | - Remove examples whose code cannot be parsed into an abstract syntax tree.
12 | 
13 | - Remove examples whose documents contain fewer than 3 or more than 256 tokens (see the sketch after this list).
14 | 
15 | - Remove examples whose documents contain special tokens (e.g. `<img ...>` or `https:...`).
16 | 
17 | - Remove examples whose documents are not in English.
18 | 
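As an illustration of the token-count rule, a minimal filter sketch. Whitespace tokenization is an assumption here; the actual preprocessing may tokenize documents differently:

```python
def keep_document(doc: str) -> bool:
    # Keep only documents with 3 to 256 tokens (whitespace split as a stand-in).
    n_tokens = len(doc.split())
    return 3 <= n_tokens <= 256

print(keep_document("Returns the sum of two numbers."))  # True
print(keep_document("ok"))                               # False: fewer than 3 tokens
```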
19 | Data statistics of the cleaned dataset for code search are shown in the table below.
20 | 
21 | | PL         | Training | Dev    | Test   | Candidate codes |
22 | | :--------- | :------: | :----: | :----: | :-------------: |
23 | | Python     | 251,820  | 13,914 | 14,918 |     43,827      |
24 | | PHP        | 241,241  | 12,982 | 14,014 |     52,660      |
25 | | Go         | 167,288  | 7,325  | 8,122  |     28,120      |
26 | | Java       | 164,923  | 5,183  | 10,955 |     40,347      |
27 | | JavaScript | 58,025   | 3,885  | 3,291  |     13,981      |
28 | | Ruby       | 24,927   | 1,400  | 1,261  |      4,360      |
29 | 
30 | You can download and preprocess the data using the following commands.
31 | ```shell
32 | unzip dataset.zip
33 | cd dataset
34 | bash run.sh
35 | cd ..
36 | ```
37 | 
38 | ## Dependency
39 | 
40 | - pip install torch
41 | - pip install transformers
42 | - pip install tree_sitter
43 | 
44 | ### Tree-sitter (optional)
45 | 
46 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
47 | 
48 | ```shell
49 | cd parser
50 | bash build.sh
51 | cd ..
52 | ```
53 | 
54 | ## Fine-Tune
55 | 
56 | We fine-tuned the model on 2 V100-16G GPUs.
57 | ```shell
58 | lang=ruby
59 | mkdir -p ./saved_models/$lang
60 | python run.py \
61 |     --output_dir=./saved_models/$lang \
62 |     --config_name=microsoft/graphcodebert-base \
63 |     --model_name_or_path=microsoft/graphcodebert-base \
64 |     --tokenizer_name=microsoft/graphcodebert-base \
65 |     --lang=$lang \
66 |     --do_train \
67 |     --train_data_file=dataset/$lang/train.jsonl \
68 |     --eval_data_file=dataset/$lang/valid.jsonl \
69 |     --test_data_file=dataset/$lang/test.jsonl \
70 |     --codebase_file=dataset/$lang/codebase.jsonl \
71 |     --num_train_epochs 10 \
72 |     --code_length 256 \
73 |     --data_flow_length 64 \
74 |     --nl_length 128 \
75 |     --train_batch_size 32 \
76 |     --eval_batch_size 64 \
77 |     --learning_rate 2e-5 \
78 |     --seed 123456 2>&1| tee saved_models/$lang/train.log
79 | ```
80 | ## Inference and Evaluation
81 | 
82 | ```shell
83 | lang=ruby
84 | python run.py \
85 |     --output_dir=./saved_models/$lang \
86 |     --config_name=microsoft/graphcodebert-base \
87 |     --model_name_or_path=microsoft/graphcodebert-base \
88 |     --tokenizer_name=microsoft/graphcodebert-base \
89 |     --lang=$lang \
90 |     --do_eval \
91 |     --do_test \
92 |     --train_data_file=dataset/$lang/train.jsonl \
93 |     --eval_data_file=dataset/$lang/valid.jsonl \
94 |     --test_data_file=dataset/$lang/test.jsonl \
95 |     --codebase_file=dataset/$lang/codebase.jsonl \
96 |     --num_train_epochs 10 \
97 |     --code_length 256 \
98 |     --data_flow_length 64 \
99 |     --nl_length 128 \
100 |     --train_batch_size 32 \
101 |     --eval_batch_size 64 \
102 |     --learning_rate 2e-5 \
103 |     --seed 123456 2>&1| tee saved_models/$lang/test.log
104 | ```
105 | 
106 | ## Results
107 | 
108 | The results on the filtered dataset are shown in the table below:
109 | 
110 | | Model          |   Ruby    | JavaScript |    Go     |  Python   |   Java    |    PHP    |  Overall  |
111 | | -------------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
112 | | NBow           |   0.162   |   0.157    |   0.330   |   0.161   |   0.171   |   0.152   |   0.189   |
113 | | CNN            |   0.276   |   0.224    |   0.680   |   0.242   |   0.263   |   0.260   |   0.324   |
114 | | BiRNN          |   0.213   |   0.193    |   0.688   |   0.290   |   0.304   |   0.338   |   0.338   |
115 | | SelfAtt        |   0.275   |   0.287    |   0.723   |   0.398   |   0.404   |   0.426   |   0.419   |
116 | | RoBERTa        |   0.587   |   0.517    |   0.850   |   0.587   |   0.599   |   0.560   |   0.617   |
117 | | RoBERTa (code) |   0.628   |   0.562    |   0.859   |   0.610   |   0.620   |   0.579   |   0.643   |
118 | | CodeBERT       |   0.679   |   0.620    |   0.882   |   0.672   |   0.676   |   0.628   |   0.693   |
119 | | GraphCodeBERT  | **0.703** | **0.644**  | **0.897** | **0.692** | **0.691** | **0.649** | **0.713** |
120 | 
121 | 
122 | ## Model and Demo
123 | A pretrained model, an additional training script with dataset, and a demo of a fine-tuned CodeBERT model for the task of Code Search can be found here: https://drive.google.com/file/d/1ZO-xVIzGcNE6Gz9DEg2z5mIbBv4Ft1cK/view.
124 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/codesearch/dataset.zip
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | import torch.nn as nn
4 | import torch
5 | class Model(nn.Module):
6 |     def __init__(self, encoder):
7 |         super(Model, self).__init__()
8 |         self.encoder = encoder
9 | 
10 |     def forward(self, code_inputs=None, attn_mask=None,position_idx=None, nl_inputs=None):
11 |         if code_inputs is not None:
12 |             nodes_mask=position_idx.eq(0)
13 |             token_mask=position_idx.ge(2)
14 |             inputs_embeddings=self.encoder.embeddings.word_embeddings(code_inputs)
15 |             nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask
16 |             nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None]
17 |             avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings)
18 |             inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None]
19 |             return self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx)[1]
20 |         else:
21 |             return self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[1]
22 | 
23 | 
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import (remove_comments_and_docstrings,
2 |                    tree_to_token_index,
3 |                    index_to_code_token,
4 |                    tree_to_variable_index)
5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/build.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license. 
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/codesearch/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp=[]
40 |         for x in out.split('\n'):
41 |             if x.strip()!="":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " " # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp=[]
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip()!="":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 | 
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
65 |         return [(root_node.start_point,root_node.end_point)]
66 |     else:
67 |         code_tokens=[]
68 |         for child in root_node.children:
69 |             code_tokens+=tree_to_token_index(child)
70 |         return code_tokens
71 | 
72 | def tree_to_variable_index(root_node,index_to_code):
73 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
74 |         index=(root_node.start_point,root_node.end_point)
75 |         _,code=index_to_code[index]
76 |         if root_node.type!=code:
77 |             return [(root_node.start_point,root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens=[]
82 |         for child in root_node.children:
83 |             code_tokens+=tree_to_variable_index(child,index_to_code)
84 |         return code_tokens
85 | 
86 | def index_to_code_token(index,code):
87 |     start_point=index[0]
88 |     end_point=index[1]
89 |     if start_point[0]==end_point[0]:
90 |         s=code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s=""
93 |         s+=code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0]+1,end_point[0]):
95 |             s+=code[i]
96 |         s+=code[end_point[0]][:end_point[1]]
97 |     return s
98 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/README.md:
--------------------------------------------------------------------------------
1 | # Code Refinement
2 | 
3 | ## Task Definition
4 | 
5 | Code refinement aims to automatically fix bugs in code, which can help reduce the cost of bug-fixing for developers.
6 | In CodeXGLUE, given a piece of Java code with bugs, the task is to remove the bugs and output the refined code.
7 | Models are evaluated by BLEU scores and accuracy (exact match).
8 | 
9 | ## Dataset
10 | 
11 | We use the dataset released by this paper (https://arxiv.org/pdf/1812.08693.pdf). 
The source side is a Java function with bugs and the target side is the refined one.
12 | All the function and variable names are normalized. The dataset contains two subsets (i.e., small and medium) based on function length.
13 | 
14 | ### Data Format
15 | 
16 | The dataset is in the "data" folder. Each line of the files is a function. You can get the data using the following command:
17 | 
18 | ```
19 | unzip data.zip
20 | ```
21 | 
22 | ### Data Statistics
23 | 
24 | Data statistics of this dataset are shown in the table below:
25 | 
26 | | Split | #Examples (Small) | #Examples (Medium) |
27 | | ----- | :---------------: | :----------------: |
28 | | Train |      46,680       |       52,364       |
29 | | Valid |       5,835       |       6,545        |
30 | | Test  |       5,835       |       6,545        |
31 | 
32 | 
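After unzipping, a quick sketch to sanity-check that the buggy and fixed files pair up line by line (paths follow the small training split used in the commands below):

```python
# Line i of the .buggy file and line i of the .fixed file form one example pair.
with open("data/small/train.buggy-fixed.buggy") as fb, \
     open("data/small/train.buggy-fixed.fixed") as ff:
    pairs = list(zip(fb, ff))

print(len(pairs))        # expected: 46,680 for the small training split
print(pairs[0][0][:80])  # first buggy function, truncated
```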
33 | ## Pipeline-GraphCodeBERT
34 | 
35 | ### Dependency
36 | 
37 | - pip install torch
38 | - pip install transformers
39 | - pip install tree_sitter
40 | 
41 | ### Tree-sitter (optional)
42 | 
43 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
44 | 
45 | ```shell
46 | cd parser
47 | bash build.sh
48 | cd ..
49 | ```
50 | 
51 | ### Fine-tune
52 | We fine-tune on 4 V100-16G GPUs. Taking the "small" subset as an example:
53 | 
54 | ```shell
55 | scale=small
56 | lr=1e-4
57 | batch_size=32
58 | beam_size=10
59 | source_length=320
60 | target_length=256
61 | output_dir=saved_models/$scale/
62 | train_file=data/$scale/train.buggy-fixed.buggy,data/$scale/train.buggy-fixed.fixed
63 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed
64 | epochs=50
65 | pretrained_model=microsoft/graphcodebert-base
66 | 
67 | mkdir -p $output_dir
68 | python run.py \
    --do_train \
    --do_eval \
    --model_type roberta \
    --model_name_or_path $pretrained_model \
    --tokenizer_name microsoft/graphcodebert-base \
    --config_name microsoft/graphcodebert-base \
    --train_filename $train_file \
    --dev_filename $dev_file \
    --output_dir $output_dir \
    --max_source_length $source_length \
    --max_target_length $target_length \
    --beam_size $beam_size \
    --train_batch_size $batch_size \
    --eval_batch_size $batch_size \
    --learning_rate $lr \
    --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
69 | ```
70 | 
71 | ### Inference
72 | 
73 | We use the full test data for inference.
74 | 
75 | ```shell
76 | batch_size=64
77 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed
78 | test_file=data/$scale/test.buggy-fixed.buggy,data/$scale/test.buggy-fixed.fixed
79 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test
80 | 
81 | python run.py \
    --do_test \
    --model_type roberta \
    --model_name_or_path $pretrained_model \
    --tokenizer_name microsoft/graphcodebert-base \
    --config_name microsoft/graphcodebert-base \
    --load_model_path $load_model_path \
    --dev_filename $dev_file \
    --test_filename $test_file \
    --output_dir $output_dir \
    --max_source_length $source_length \
    --max_target_length $target_length \
    --beam_size $beam_size \
    --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
82 | ```
83 | 
84 | 
85 | 
86 | ## Result
87 | 
88 | The results on the test set are shown below:
89 | 
90 | Small:
91 | 
92 | | Method        |   BLEU    | Acc (100%) |
93 | | ------------- | :-------: | :--------: |
94 | | Naive copy    |   78.06   |    0.0     |
95 | | LSTM          |   76.76   |    10.0    |
96 | | Transformer   |   77.21   |    14.7    |
97 | | CodeBERT      |   77.42   |    16.4    |
98 | | GraphCodeBERT | **80.02** |  **17.3**  |
99 | 
100 | Medium:
101 | 
102 | | Method        |   BLEU    | Acc (100%) |
103 | | ------------- | :-------: | :--------: |
104 | | Naive copy    |   90.91   |    0.0     |
105 | | LSTM          |   72.08   |    2.5     |
106 | | Transformer   |   89.25   |    3.7     |
107 | | CodeBERT      |   91.07   |    5.16    |
108 | | GraphCodeBERT | **91.31** |  **9.1**   |
109 | 
110 | 
111 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Python implementation of BLEU and smooth-BLEU.
17 | 
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 | 
24 | import collections
25 | import math
26 | 
27 | 
28 | def _get_ngrams(segment, max_order):
29 |   """Extracts all n-grams up to a given maximum order from an input segment.
30 | 
31 |   Args:
32 |     segment: text segment from which n-grams will be extracted.
33 |     max_order: maximum length in tokens of the n-grams returned by this
34 |         method.
35 | 
36 |   Returns:
37 |     The Counter containing all n-grams up to max_order in segment
38 |     with a count of how many times each n-gram occurred. 
39 |   """
40 |   ngram_counts = collections.Counter()
41 |   for order in range(1, max_order + 1):
42 |     for i in range(0, len(segment) - order + 1):
43 |       ngram = tuple(segment[i:i+order])
44 |       ngram_counts[ngram] += 1
45 |   return ngram_counts
46 | 
47 | 
48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49 |                  smooth=False):
50 |   """Computes BLEU score of translated segments against one or more references.
51 | 
52 |   Args:
53 |     reference_corpus: list of lists of references for each translation. Each
54 |         reference should be tokenized into a list of tokens.
55 |     translation_corpus: list of translations to score. Each translation
56 |         should be tokenized into a list of tokens.
57 |     max_order: Maximum n-gram order to use when computing BLEU score.
58 |     smooth: Whether or not to apply Lin et al. 2004 smoothing.
59 | 
60 |   Returns:
61 |     6-tuple of the BLEU score, n-gram precisions, brevity penalty,
62 |     length ratio, translation length, and reference length.
63 |   """
64 |   matches_by_order = [0] * max_order
65 |   possible_matches_by_order = [0] * max_order
66 |   reference_length = 0
67 |   translation_length = 0
68 |   for (references, translation) in zip(reference_corpus,
69 |                                        translation_corpus):
70 |     reference_length += min(len(r) for r in references)
71 |     translation_length += len(translation)
72 | 
73 |     merged_ref_ngram_counts = collections.Counter()
74 |     for reference in references:
75 |       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76 |     translation_ngram_counts = _get_ngrams(translation, max_order)
77 |     overlap = translation_ngram_counts & merged_ref_ngram_counts
78 |     for ngram in overlap:
79 |       matches_by_order[len(ngram)-1] += overlap[ngram]
80 |     for order in range(1, max_order+1):
81 |       possible_matches = len(translation) - order + 1
82 |       if possible_matches > 0:
83 |         possible_matches_by_order[order-1] += possible_matches
84 | 
85 |   precisions = [0] * max_order
86 |   for i in range(0, max_order):
87 |     if smooth:
88 |       precisions[i] = ((matches_by_order[i] + 1.) /
89 |                        (possible_matches_by_order[i] + 1.))
90 |     else:
91 |       if possible_matches_by_order[i] > 0:
92 |         precisions[i] = (float(matches_by_order[i]) /
93 |                          possible_matches_by_order[i])
94 |       else:
95 |         precisions[i] = 0.0
96 | 
97 |   if min(precisions) > 0:
98 |     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99 |     geo_mean = math.exp(p_log_sum)
100 |   else:
101 |     geo_mean = 0
102 | 
103 |   ratio = float(translation_length) / reference_length
104 | 
105 |   if ratio > 1.0:
106 |     bp = 1.
107 |   else:
108 |     bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/refinement/data.zip -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/refinement/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp=[]
40 |         for x in out.split('\n'):
41 |             if x.strip()!="":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " " # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp=[]
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip()!="":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 | 
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
65 |         return [(root_node.start_point,root_node.end_point)]
66 |     else:
67 |         code_tokens=[]
68 |         for child in root_node.children:
69 |             code_tokens+=tree_to_token_index(child)
70 |         return code_tokens
71 | 
72 | def tree_to_variable_index(root_node,index_to_code):
73 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
74 |         index=(root_node.start_point,root_node.end_point)
75 |         _,code=index_to_code[index]
76 |         if root_node.type!=code:
77 |             return [(root_node.start_point,root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens=[]
82 |         for child in root_node.children:
83 |             code_tokens+=tree_to_variable_index(child,index_to_code)
84 |         return code_tokens
85 | 
86 | def index_to_code_token(index,code):
87 |     start_point=index[0]
88 |     end_point=index[1]
89 |     if start_point[0]==end_point[0]:
90 |         s=code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s=""
93 |         s+=code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0]+1,end_point[0]):
95 |             s+=code[i]
96 |         s+=code[end_point[0]][:end_point[1]]
97 |     return s
98 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/README.md:
--------------------------------------------------------------------------------
1 | # Code Translation
2 | 
3 | ## Task Definition
4 | 
5 | Code translation aims to migrate legacy software from one programming language to another.
6 | Given a piece of Java (C#) code, the task is to translate the code into its C# (Java) version.
7 | Models are evaluated by BLEU scores and accuracy (exact match). 
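For intuition, a minimal scoring sketch that reuses the `_bleu` helper from `bleu.py` in this directory. It assumes one function per line in both files; the file names are placeholders:

```python
from bleu import _bleu  # smoothed BLEU helper shipped in this directory

def exact_match(ref_file, hyp_file):
    # Percentage of hypotheses whose token sequence equals the reference exactly.
    with open(ref_file) as rf, open(hyp_file) as hf:
        refs = [line.strip().split() for line in rf]
        hyps = [line.strip().split() for line in hf]
    return 100.0 * sum(r == h for r, h in zip(refs, hyps)) / len(refs)

# Hypothetical file names:
# print(_bleu("test.java-cs.txt.cs", "predictions.txt"))
# print(exact_match("test.java-cs.txt.cs", "predictions.txt"))
```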
8 | 
9 | ## Dataset
10 | 
11 | The dataset is collected from several public repos, including Lucene (http://lucene.apache.org/), POI (http://poi.apache.org/), JGit (https://github.com/eclipse/jgit/) and Antlr (https://github.com/antlr/).
12 | 
13 | We collect both the Java and C# versions of the code and find the parallel functions. After removing duplicates and functions with an empty body, we split the whole dataset into training, validation and test sets.
14 | 
15 | ### Data Format
16 | 
17 | The dataset is in the "data" folder. Each line of the files is a function, and the suffix of the file indicates the programming language. You can get the data using the following command:
18 | 
19 | ```
20 | unzip data.zip
21 | ```
22 | 
23 | ### Data Statistics
24 | 
25 | Data statistics of the dataset are shown in the table below:
26 | 
27 | |       | #Examples |
28 | | ----- | :-------: |
29 | | Train |  10,300   |
30 | | Valid |    500    |
31 | | Test  |   1,000   |
32 | 
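After unzipping, a quick sketch to peek at one parallel pair (paths match the training files used in the commands below):

```python
# Line i of the .java file and line i of the .cs file are parallel functions.
with open("data/train.java-cs.txt.java") as fj, open("data/train.java-cs.txt.cs") as fc:
    java_fn, cs_fn = next(zip(fj, fc))

print(java_fn.strip()[:80])  # Java side, truncated
print(cs_fn.strip()[:80])    # C# side, truncated
```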
33 | ## Pipeline-GraphCodeBERT
34 | 
35 | ### Dependency
36 | 
37 | - pip install torch
38 | - pip install transformers
39 | - pip install tree_sitter
40 | 
41 | ### Tree-sitter (optional)
42 | 
43 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
44 | 
45 | ```shell
46 | cd parser
47 | bash build.sh
48 | cd ..
49 | ```
50 | 
51 | ### Fine-tune
52 | We fine-tune on 4 V100-16G GPUs. Taking Java to C# translation as an example:
53 | 
54 | ```shell
55 | source=java
56 | target=cs
57 | lr=1e-4
58 | batch_size=32
59 | beam_size=10
60 | source_length=320
61 | target_length=256
62 | output_dir=saved_models/$source-$target/
63 | train_file=data/train.java-cs.txt.$source,data/train.java-cs.txt.$target
64 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target
65 | epochs=100
66 | pretrained_model=microsoft/graphcodebert-base
67 | 
68 | mkdir -p $output_dir
69 | python run.py \
70 |     --do_train \
71 |     --do_eval \
72 |     --model_type roberta \
73 |     --source_lang $source \
74 |     --model_name_or_path $pretrained_model \
75 |     --tokenizer_name microsoft/graphcodebert-base \
76 |     --config_name microsoft/graphcodebert-base \
77 |     --train_filename $train_file \
78 |     --dev_filename $dev_file \
79 |     --output_dir $output_dir \
80 |     --max_source_length $source_length \
81 |     --max_target_length $target_length \
82 |     --beam_size $beam_size \
83 |     --train_batch_size $batch_size \
84 |     --eval_batch_size $batch_size \
85 |     --learning_rate $lr \
86 |     --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
87 | ```
88 | 
89 | ### Inference
90 | 
91 | We use the full test data for inference.
92 | 
93 | ```shell
94 | batch_size=64
95 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target
96 | test_file=data/test.java-cs.txt.$source,data/test.java-cs.txt.$target
97 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test
98 | 
99 | python run.py \
100 |     --do_test \
101 |     --model_type roberta \
102 |     --source_lang $source \
103 |     --model_name_or_path $pretrained_model \
104 |     --tokenizer_name microsoft/graphcodebert-base \
105 |     --config_name microsoft/graphcodebert-base \
106 |     --load_model_path $load_model_path \
107 |     --dev_filename $dev_file \
108 |     --test_filename $test_file \
109 |     --output_dir $output_dir \
110 |     --max_source_length $source_length \
111 |     --max_target_length $target_length \
112 |     --beam_size $beam_size \
113 |     --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
114 | ```
115 | 
116 | 
117 | 
118 | ## Result
119 | 
120 | The results on the test set are shown below:
121 | 
122 | Java to C#:
123 | 
124 | | Method         |   BLEU    | Acc (100%) |
125 | | -------------- | :-------: | :--------: |
126 | | Naive copy     |   18.54   |    0.0     |
127 | | PBSMT          |   43.53   |    12.5    |
128 | | Transformer    |   55.84   |    33.0    |
129 | | RoBERTa (code) |   77.46   |    56.1    |
130 | | CodeBERT       |   79.92   |    59.0    |
131 | | GraphCodeBERT  | **80.58** |  **59.4**  |
132 | 
133 | C# to Java:
134 | 
135 | | Method         |   BLEU    | Acc (100%) |
136 | | -------------- | :-------: | :--------: |
137 | | Naive copy     |   18.69   |    0.0     |
138 | | PBSMT          |   40.06   |    16.1    |
139 | | Transformer    |   50.47   |    37.9    |
140 | | RoBERTa (code) |   71.99   |    57.9    |
141 | | CodeBERT       |   72.14   |    58.0    |
142 | | GraphCodeBERT  | **72.64** |  **58.8**  |
143 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Python implementation of BLEU and smooth-BLEU.
17 | 
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 | 
24 | import collections
25 | import math
26 | 
27 | 
28 | def _get_ngrams(segment, max_order):
29 |   """Extracts all n-grams up to a given maximum order from an input segment.
30 | 
31 |   Args:
32 |     segment: text segment from which n-grams will be extracted.
33 |     max_order: maximum length in tokens of the n-grams returned by this
34 |         method.
35 | 
36 |   Returns:
37 |     The Counter containing all n-grams up to max_order in segment
38 |     with a count of how many times each n-gram occurred. 
39 |   """
40 |   ngram_counts = collections.Counter()
41 |   for order in range(1, max_order + 1):
42 |     for i in range(0, len(segment) - order + 1):
43 |       ngram = tuple(segment[i:i+order])
44 |       ngram_counts[ngram] += 1
45 |   return ngram_counts
46 | 
47 | 
48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49 |                  smooth=False):
50 |   """Computes BLEU score of translated segments against one or more references.
51 | 
52 |   Args:
53 |     reference_corpus: list of lists of references for each translation. Each
54 |         reference should be tokenized into a list of tokens.
55 |     translation_corpus: list of translations to score. Each translation
56 |         should be tokenized into a list of tokens.
57 |     max_order: Maximum n-gram order to use when computing BLEU score.
58 |     smooth: Whether or not to apply Lin et al. 2004 smoothing.
59 | 
60 |   Returns:
61 |     6-tuple of the BLEU score, n-gram precisions, brevity penalty,
62 |     length ratio, translation length, and reference length.
63 |   """
64 |   matches_by_order = [0] * max_order
65 |   possible_matches_by_order = [0] * max_order
66 |   reference_length = 0
67 |   translation_length = 0
68 |   for (references, translation) in zip(reference_corpus,
69 |                                        translation_corpus):
70 |     reference_length += min(len(r) for r in references)
71 |     translation_length += len(translation)
72 | 
73 |     merged_ref_ngram_counts = collections.Counter()
74 |     for reference in references:
75 |       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76 |     translation_ngram_counts = _get_ngrams(translation, max_order)
77 |     overlap = translation_ngram_counts & merged_ref_ngram_counts
78 |     for ngram in overlap:
79 |       matches_by_order[len(ngram)-1] += overlap[ngram]
80 |     for order in range(1, max_order+1):
81 |       possible_matches = len(translation) - order + 1
82 |       if possible_matches > 0:
83 |         possible_matches_by_order[order-1] += possible_matches
84 | 
85 |   precisions = [0] * max_order
86 |   for i in range(0, max_order):
87 |     if smooth:
88 |       precisions[i] = ((matches_by_order[i] + 1.) /
89 |                        (possible_matches_by_order[i] + 1.))
90 |     else:
91 |       if possible_matches_by_order[i] > 0:
92 |         precisions[i] = (float(matches_by_order[i]) /
93 |                          possible_matches_by_order[i])
94 |       else:
95 |         precisions[i] = 0.0
96 | 
97 |   if min(precisions) > 0:
98 |     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99 |     geo_mean = math.exp(p_log_sum)
100 |   else:
101 |     geo_mean = 0
102 | 
103 |   ratio = float(translation_length) / reference_length
104 | 
105 |   if ratio > 1.0:
106 |     bp = 1.
107 |   else:
108 |     bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /GraphCodeBERT/translation/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/translation/data.zip -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/translation/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 
2 | 
3 | MIT License
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/LongCoder/README.md:
--------------------------------------------------------------------------------
1 | # LongCoder
2 | 
3 | This repo provides the code for reproducing the experiments on the LCC datasets in [LongCoder: A Long-Range Pre-trained Language Model for Code Completion](https://arxiv.org/abs/2306.14893). LongCoder is a sparse and efficient pre-trained Transformer model for long code modeling.
4 | 
5 | ## 1. Dependency
6 | 
7 | - pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
8 | - pip install --upgrade transformers fuzzywuzzy tree_sitter datasets
9 | 
10 | ## 2. Dataset
11 | In this repo, the LCC dataset is downloaded automatically when running the fine-tuning script. If you want to download the LCC datasets yourself, you can find them at the following links:
12 | ```
13 | https://huggingface.co/datasets/microsoft/LCC_python
14 | https://huggingface.co/datasets/microsoft/LCC_java
15 | https://huggingface.co/datasets/microsoft/LCC_csharp
16 | ```
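To fetch a subset directly, a minimal sketch with the Hugging Face `datasets` library (the exact split names and fields are assumptions; inspect the returned object to confirm):

```python
from datasets import load_dataset

# Downloads the C# subset of LCC from the Hugging Face Hub.
lcc_csharp = load_dataset("microsoft/LCC_csharp")
print(lcc_csharp)  # shows the available splits and their fields
```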
17 | ## 3. Fine-Tune Setting
18 | Here we provide the fine-tuning settings for code completion on the LCC dataset in the C# programming language, whose results are reported in the paper.
19 | 
20 | Note that fine-tuning requires 8 V100-32G GPUs; you can adjust the batch size or source length based on your requirements.
21 | 
22 | ```shell
23 | lang=csharp #csharp, python, java
24 | lr=2e-4
25 | batch_size=16
26 | beam_size=5
27 | source_length=3968
28 | target_length=128
29 | global_length=64
30 | window_size=512
31 | epochs=10
32 | output_dir=saved_models/$lang
33 | mkdir -p $output_dir
34 | 
35 | python run.py \
36 |     --do_train \
37 |     --do_eval \
38 |     --lang $lang \
39 |     --output_dir $output_dir \
40 |     --model_name_or_path microsoft/longcoder-base \
41 |     --filename microsoft/LCC_$lang \
42 |     --max_source_length $source_length \
43 |     --max_target_length $target_length \
44 |     --max_global_length $global_length \
45 |     --window_size $window_size \
46 |     --beam_size $beam_size \
47 |     --train_batch_size $batch_size \
48 |     --eval_batch_size $batch_size \
49 |     --learning_rate $lr \
50 |     --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
51 | ```
52 | 
53 | ## 4. Evaluating LongCoder
54 | 
55 | ```shell
56 | lang=csharp #csharp, python, java
57 | batch_size=16
58 | beam_size=5
59 | source_length=3968
60 | target_length=128
61 | global_length=64
62 | window_size=512
63 | output_dir=saved_models/$lang
64 | reload_model=$output_dir/checkpoint-best-acc/model.bin
65 | 
66 | python run.py \
67 |     --do_test \
68 |     --lang $lang \
69 |     --load_model_path $reload_model \
70 |     --output_dir $output_dir \
71 |     --model_name_or_path microsoft/longcoder-base \
72 |     --filename microsoft/LCC_$lang \
73 |     --max_source_length $source_length \
74 |     --max_target_length $target_length \
75 |     --max_global_length $global_length \
76 |     --window_size $window_size \
77 |     --beam_size $beam_size \
78 |     --train_batch_size $batch_size \
79 |     --eval_batch_size $batch_size \
80 |     --num_train_epochs 10 2>&1| tee $output_dir/test.log
81 | ```
82 | 
83 | 
84 | # Reference
85 | If you use this code or LongCoder, please consider citing us.
86 | 
@article{longcoder,
87 |     title={LongCoder: A Long-Range Pre-trained Language Model for Code Completion},
88 |     author={Daya Guo and Canwen Xu and Nan Duan and Jian Yin and Julian McAuley},
89 |     journal={arXiv preprint arXiv:2306.14893},
90 |     year={2023}
91 | }
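As a quick check that the LCC data from Section 2 is reachable before launching training, here is a minimal loading sketch (dataset ids as above; the exact split names are an assumption worth verifying on the hub):

```python
from datasets import load_dataset

# One of: microsoft/LCC_python, microsoft/LCC_java, microsoft/LCC_csharp
ds = load_dataset("microsoft/LCC_csharp")
print(ds)  # prints the available splits and their row counts
```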
92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /LongCoder/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /LongCoder/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | 'tree-sitter-cpp', 20 | 'tree-sitter-c', 21 | ] 22 | ) 23 | 24 | -------------------------------------------------------------------------------- /LongCoder/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-c 2 | git clone https://github.com/tree-sitter/tree-sitter-cpp 3 | git clone https://github.com/tree-sitter/tree-sitter-typescript 4 | git clone https://github.com/tree-sitter/tree-sitter-go 5 | git clone https://github.com/tree-sitter/tree-sitter-javascript 6 | git clone https://github.com/tree-sitter/tree-sitter-python 7 | git clone https://github.com/tree-sitter/tree-sitter-ruby 8 | git clone https://github.com/tree-sitter/tree-sitter-php 9 | git clone https://github.com/tree-sitter/tree-sitter-java 10 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 11 | python build.py 12 | -------------------------------------------------------------------------------- /LongCoder/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/LongCoder/parser/my-languages.so -------------------------------------------------------------------------------- /LongCoder/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string' or root_node.type=='comment' or 'comment' in root_node.type): 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string' or root_node.type=='comment' or 'comment' in root_node.type): 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=" "+code[i] 96 | s+=" "+code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /LongCoder/run.sh: -------------------------------------------------------------------------------- 1 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 2 | pip install --upgrade scipy transformers tqdm fuzzywuzzy tree_sitter datasets 3 | 4 | lang=$1 #programming language 5 | lr=2e-4 6 | batch_size=16 7 | beam_size=5 8 | source_length=3968 9 | target_length=128 10 | global_length=64 11 | window_size=512 12 | output_dir=saved_models/$1 13 | epochs=10 14 | pretrained_model=microsoft/longcoder-base 15 | 16 | mkdir -p 
$output_dir 17 | 18 | python run.py \ 19 | --do_train \ 20 | --do_eval \ 21 | --lang $1 \ 22 | --output_dir $output_dir \ 23 | --model_name_or_path $pretrained_model \ 24 | --filename microsoft/LCC_$1 \ 25 | --max_source_length $source_length \ 26 | --max_target_length $target_length \ 27 | --max_global_length $global_length \ 28 | --window_size $window_size \ 29 | --beam_size $beam_size \ 30 | --train_batch_size $batch_size \ 31 | --eval_batch_size $batch_size \ 32 | --learning_rate $lr \ 33 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 34 | 35 | 36 | 37 | 38 | 39 | reload_model=$output_dir/checkpoint-best-acc/model.bin 40 | python run.py \ 41 | --do_test \ 42 | --lang $1 \ 43 | --load_model_path $reload_model \ 44 | --model_name_or_path $pretrained_model \ 45 | --filename microsoft/LCC_$1 \ 46 | --output_dir $output_dir \ 47 | --max_source_length $source_length \ 48 | --max_target_length $target_length \ 49 | --max_global_length $global_length \ 50 | --window_size $window_size \ 51 | --beam_size $beam_size \ 52 | --train_batch_size $batch_size \ 53 | --eval_batch_size $batch_size \ 54 | --learning_rate $lr \ 55 | --num_train_epochs $epochs 2>&1| tee $output_dir/test.log 56 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection (BigCloneBench) 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | mkdir dataset 7 | cd dataset 8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/data.jsonl 9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/test.txt 10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/train.txt 11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/valid.txt 12 | cd .. 13 | 14 | ``` 15 | 16 | ## Dependency 17 | 18 | - pip install torch 19 | - pip install transformers 20 | 21 | ## Fine-Tune 22 | 23 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper. 24 | 25 | ```shell 26 | # Training 27 | python run.py \ 28 | --output_dir saved_models \ 29 | --model_name_or_path microsoft/unixcoder-base \ 30 | --do_train \ 31 | --train_data_file dataset/train.txt \ 32 | --eval_data_file dataset/valid.txt \ 33 | --num_train_epochs 1 \ 34 | --block_size 512 \ 35 | --train_batch_size 16 \ 36 | --eval_batch_size 32 \ 37 | --learning_rate 5e-5 \ 38 | --max_grad_norm 1.0 \ 39 | --seed 123456 40 | 41 | # Evaluating 42 | python run.py \ 43 | --output_dir saved_models \ 44 | --model_name_or_path microsoft/unixcoder-base \ 45 | --do_test \ 46 | --test_data_file dataset/test.txt \ 47 | --num_train_epochs 1 \ 48 | --block_size 512 \ 49 | --train_batch_size 16 \ 50 | --eval_batch_size 32 \ 51 | --learning_rate 5e-5 \ 52 | --max_grad_norm 1.0 \ 53 | --seed 123456 54 | ``` 55 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | import torch 4 | import torch.nn as nn 5 | 6 | from torch.autograd import Variable 7 | import copy 8 | import torch.nn.functional as F 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | 11 | class RobertaClassificationHead(nn.Module): 12 | """Head for sentence-level classification tasks.""" 13 | 14 | def __init__(self, config): 15 | super().__init__() 16 | self.dense = nn.Linear(config.hidden_size*2, config.hidden_size) 17 | self.dropout = nn.Dropout(0.1) 18 | self.out_proj = nn.Linear(config.hidden_size, 2) 19 | 20 | def forward(self, x): 21 | x = x.reshape(-1,x.size(-1)*2) 22 | x = self.dropout(x) 23 | x = self.dense(x) 24 | x = torch.tanh(x) 25 | x = self.dropout(x) 26 | x = self.out_proj(x) 27 | return x 28 | 29 | class Model(nn.Module): 30 | def __init__(self, encoder,config,tokenizer,args): 31 | super(Model, self).__init__() 32 | self.encoder = encoder 33 | self.config = config 34 | self.tokenizer = tokenizer 35 | self.classifier = RobertaClassificationHead(config) 36 | self.args = args 37 | 38 | 39 | def forward(self, input_ids=None,labels=None): 40 | input_ids = input_ids.view(-1,self.args.block_size)  # (2*bs, block_size): each example is a pair of code snippets 41 | outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0] 42 | outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 43 | outputs = outputs.reshape(-1,2,outputs.size(-1)) 44 | outputs = torch.nn.functional.normalize(outputs, p=2, dim=-1) 45 | cos_sim = (outputs[:,0]*outputs[:,1]).sum(-1)  # cosine similarity between the two snippets of each pair 46 | 47 | if labels is not None: 48 | loss = ((cos_sim-labels.float())**2).mean()  # regress the similarity onto the 0/1 clone label 49 | return loss,cos_sim 50 | else: 51 | return cos_sim 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/run.sh: -------------------------------------------------------------------------------- 1 | model=../../../../pretrained-model/UniXcoder-base 2 | mkdir saved_models 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \ 4 | --output_dir=./saved_models \ 5 | --model_type=roberta \ 6 | --model_name_or_path=$model \ 7 | --do_train \ 8 | --train_data_file=../../dataset/train.txt \ 9 | --eval_data_file=../../dataset/valid.txt \ 10 | --test_data_file=../../dataset/test.txt \ 11 | --epoch 1 \ 12 | --block_size 512 \ 13 | --train_batch_size 16 \ 14 | --eval_batch_size 32 \ 15 | --learning_rate 5e-5 \ 16 | --max_grad_norm 1.0 \ 17 | --evaluate_during_training \ 18 | --seed 123456 2>&1| tee saved_models/train.log 19 | 20 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \ 21 | --output_dir=./saved_models \ 22 | --model_type=roberta \ 23 | --model_name_or_path=$model \ 24 | --do_eval \ 25 | --do_test \ 26 | --train_data_file=../../dataset/train.txt \ 27 | --eval_data_file=../../dataset/valid.txt \ 28 | --test_data_file=../../dataset/test.txt \ 29 | --epoch 1 \ 30 | --block_size 512 \ 31 | --train_batch_size 16 \ 32 | --eval_batch_size 32 \ 33 | --learning_rate 5e-5 \ 34 | --max_grad_norm 1.0 \ 35 | --evaluate_during_training \ 36 | --seed 123456 2>&1| tee saved_models/test.log 37 | 38 | python ../evaluator/evaluator.py -a ../../dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log 39 | 
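For a quick smoke test of the wrapper above, here is a minimal sketch of scoring one candidate clone pair. It assumes `microsoft/unixcoder-base` as the encoder, RoBERTa's pad id 1 (which the masking above relies on), and a toy `args` carrying `block_size`; the full pipeline lives in `run.py`.

```python
import torch
from types import SimpleNamespace
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Model is the wrapper defined in model.py above.
config = AutoConfig.from_pretrained("microsoft/unixcoder-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
encoder = AutoModel.from_pretrained("microsoft/unixcoder-base")
model = Model(encoder, config, tokenizer, SimpleNamespace(block_size=512))

def encode(code, block_size=512):
    # Truncate/pad one snippet to exactly block_size tokens (pad id is 1).
    ids = tokenizer.encode(code, truncation=True, max_length=block_size)
    return ids + [tokenizer.pad_token_id] * (block_size - len(ids))

# One example = the two snippets of a pair, concatenated along the sequence dim.
pair = torch.tensor([encode("int add(int a, int b) { return a + b; }")
                     + encode("int sum(int x, int y) { return x + y; }")])
with torch.no_grad():
    cos_sim = model(pair)
print(cos_sim.item())  # closer to 1 -> more likely a clone
```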
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection (POJ-104) 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | cd dataset 7 | pip install gdown 8 | gdown https://drive.google.com/uc?id=0B2i-vWnOu7MxVlJwQXN6eVNONUU 9 | tar -xvf programs.tar.gz 10 | python preprocess.py 11 | cd .. 12 | ``` 13 | 14 | ## Dependency 15 | 16 | - pip install torch 17 | - pip install transformers 18 | 19 | ## Fine-Tune 20 | 21 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper. 22 | 23 | ```shell 24 | # Training 25 | python run.py \ 26 | --output_dir saved_models \ 27 | --model_name_or_path microsoft/unixcoder-base \ 28 | --do_train \ 29 | --train_data_file dataset/train.jsonl \ 30 | --eval_data_file dataset/valid.jsonl \ 31 | --test_data_file dataset/test.jsonl \ 32 | --num_train_epochs 2 \ 33 | --block_size 400 \ 34 | --train_batch_size 8 \ 35 | --eval_batch_size 16 \ 36 | --learning_rate 2e-5 \ 37 | --max_grad_norm 1.0 \ 38 | --seed 123456 39 | 40 | # Evaluating 41 | python run.py \ 42 | --output_dir saved_models \ 43 | --model_name_or_path microsoft/unixcoder-base \ 44 | --do_eval \ 45 | --do_test \ 46 | --eval_data_file dataset/valid.jsonl \ 47 | --test_data_file dataset/test.jsonl \ 48 | --num_train_epochs 2 \ 49 | --block_size 400 \ 50 | --train_batch_size 8 \ 51 | --eval_batch_size 16 \ 52 | --learning_rate 2e-5 \ 53 | --max_grad_norm 1.0 \ 54 | --seed 123456 55 | ``` 56 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | def files(path): 7 | g = os.walk(path) 8 | file=[] 9 | for path,dir_list,file_list in g: 10 | for file_name in file_list: 11 | file.append(os.path.join(path, file_name)) 12 | return file 13 | 14 | cont=0 15 | with open("train.jsonl",'w') as f: 16 | for i in tqdm(range(1,65),total=64):  # problems 1-64 -> train 17 | items=files("ProgramData/{}".format(i)) 18 | for item in items: 19 | js={} 20 | js['label']=item.split('/')[1] 21 | js['index']=str(cont) 22 | js['code']=open(item,encoding='latin-1').read() 23 | f.write(json.dumps(js)+'\n') 24 | cont+=1 25 | 26 | with open("valid.jsonl",'w') as f: 27 | for i in tqdm(range(65,81),total=16):  # problems 65-80 -> valid 28 | items=files("ProgramData/{}".format(i)) 29 | for item in items: 30 | js={} 31 | js['label']=item.split('/')[1] 32 | js['index']=str(cont) 33 | js['code']=open(item,encoding='latin-1').read() 34 | f.write(json.dumps(js)+'\n') 35 | cont+=1 36 | 37 | with open("test.jsonl",'w') as f: 38 | for i in tqdm(range(81,105),total=24):  # problems 81-104 -> test (POJ-104 has 104 problem folders) 39 | items=files("ProgramData/{}".format(i)) 40 | for item in items: 41 | js={} 42 | js['label']=item.split('/')[1] 43 | js['index']=str(cont) 44 | js['code']=open(item,encoding='latin-1').read() 45 | f.write(json.dumps(js)+'\n') 46 | cont+=1 -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | import torch 4 | import torch.nn as nn 5 | 6 | from torch.autograd import Variable 7 | import copy 8 | import torch.nn.functional as F 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | 11 | 12 | 13 | class Model(nn.Module): 14 | def __init__(self, encoder,config,tokenizer,args): 15 | super(Model, self).__init__() 16 | self.encoder = encoder 17 | self.config=config 18 | self.tokenizer=tokenizer 19 | self.args=args 20 | 21 | 22 | def forward(self, input_ids=None,p_input_ids=None,n_input_ids=None,labels=None): 23 | bs,_ = input_ids.size() 24 | input_ids = torch.cat((input_ids,p_input_ids,n_input_ids),0)  # stack anchors, positives and negatives into one batch 25 | 26 | outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0] 27 | outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 28 | outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) 29 | outputs = outputs.split(bs,0)  # (anchor, positive, negative) vectors 30 | 31 | prob_1 = (outputs[0]*outputs[1]).sum(-1)*20  # anchor-positive similarity, scaled by temperature 20 32 | prob_2 = (outputs[0]*outputs[2]).sum(-1)*20  # anchor-negative similarity 33 | temp = torch.cat((outputs[0],outputs[1]),0) 34 | temp_labels = torch.cat((labels,labels),0) 35 | prob_3 = torch.mm(outputs[0],temp.t())*20  # in-batch similarities used as extra negatives 36 | mask = labels[:,None]==temp_labels[None,:] 37 | prob_3 = prob_3*(1-mask.float())-1e9*mask.float()  # mask out in-batch pairs from the same problem 38 | 39 | prob = torch.softmax(torch.cat((prob_1[:,None],prob_2[:,None],prob_3),-1),-1) 40 | loss = torch.log(prob[:,0]+1e-10)  # InfoNCE-style: maximize the probability of the positive 41 | loss = -loss.mean() 42 | return loss,outputs[0] 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/README.md: -------------------------------------------------------------------------------- 1 | # Code Completion 2 | 3 | ## Dependency 4 | 5 | - pip install torch 6 | - pip install transformers 7 | - pip install javalang 8 | 9 | ## Data Download 10 | 11 | ```bash 12 | unzip dataset.zip 13 | 14 | cd dataset/javaCorpus/ 15 | bash download.sh 16 | python preprocess.py --base_dir=token_completion --output_dir=./ 17 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/javaCorpus/line_completion/test.json 18 | 19 | cd ../py150 20 | bash download.sh 21 | python preprocess.py --base_dir=py150_files --output_dir=./ 22 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/py150/line_completion/test.json 23 | 24 | cd ../.. 25 | ``` 26 | 27 | 28 | 29 | ## Fine-Tune Setting 30 | 31 | Here we provide fine-tune settings for code completion, whose results are reported in the paper. 
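Before launching a full fine-tuning run, it can be useful to sanity-check the pre-trained checkpoint in zero-shot mode, as sketched below with the repo's top-level `unixcoder.py` wrapper (the wrapper API follows the main UniXcoder README; the context string and generation settings are illustrative assumptions):

```python
import torch
from unixcoder import UniXcoder  # UniXcoder/unixcoder.py in this repo

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base").to(device)

context = "def fibonacci(n):"
# Decoder-only mode: continue generating from the left context.
tokens_ids = model.tokenize([context], max_length=512, mode="<decoder-only>")
source_ids = torch.tensor(tokens_ids).to(device)
prediction_ids = model.generate(source_ids, decoder_only=True, beam_size=3, max_length=64)
predictions = model.decode(prediction_ids)
print(context + predictions[0][0])  # best beam for the first (only) input
```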
32 | 33 | #### JavaCorpus Dataset 34 | 35 | ```shell 36 | # Training 37 | python run.py \ 38 | --do_train \ 39 | --do_eval \ 40 | --lang java \ 41 | --model_name_or_path microsoft/unixcoder-base \ 42 | --train_filename dataset/javaCorpus/train.txt \ 43 | --dev_filename dataset/javaCorpus/dev.json \ 44 | --output_dir saved_models/javaCorpus \ 45 | --max_source_length 936 \ 46 | --max_target_length 64 \ 47 | --beam_size 5 \ 48 | --train_batch_size 32 \ 49 | --gradient_accumulation_steps 1 \ 50 | --eval_batch_size 32 \ 51 | --learning_rate 2e-5 \ 52 | --num_train_epochs 10 53 | 54 | # Output predictions of test set 55 | python run.py \ 56 | --do_test \ 57 | --lang java \ 58 | --model_name_or_path microsoft/unixcoder-base \ 59 | --load_model_path saved_models/javaCorpus/checkpoint-best-acc/pytorch_model.bin \ 60 | --test_filename dataset/javaCorpus/test.json \ 61 | --output_dir saved_models/javaCorpus \ 62 | --max_source_length 936 \ 63 | --max_target_length 64 \ 64 | --beam_size 5 \ 65 | --eval_batch_size 32 66 | ``` 67 | 68 | Prediction results on the test set are saved to ```saved_models/javaCorpus/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 69 | 70 | 71 | #### PY150 Dataset 72 | 73 | ```shell 74 | # Training 75 | python run.py \ 76 | --do_train \ 77 | --do_eval \ 78 | --lang python \ 79 | --model_name_or_path microsoft/unixcoder-base \ 80 | --train_filename dataset/py150/train.txt \ 81 | --dev_filename dataset/py150/dev.json \ 82 | --output_dir saved_models/py150 \ 83 | --max_source_length 936 \ 84 | --max_target_length 64 \ 85 | --beam_size 5 \ 86 | --train_batch_size 32 \ 87 | --gradient_accumulation_steps 1 \ 88 | --eval_batch_size 32 \ 89 | --learning_rate 2e-4 \ 90 | --num_train_epochs 10 91 | 92 | # Output predictions of test set 93 | python run.py \ 94 | --do_test \ 95 | --lang python \ 96 | --model_name_or_path microsoft/unixcoder-base \ 97 | --load_model_path saved_models/py150/checkpoint-best-acc/pytorch_model.bin \ 98 | --test_filename dataset/py150/test.json \ 99 | --output_dir saved_models/py150 \ 100 | --max_source_length 936 \ 101 | --max_target_length 64 \ 102 | --beam_size 5 \ 103 | --eval_batch_size 32 104 | ``` 105 | 106 | Prediction results on the test set are saved to ```saved_models/py150/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 107 | 108 | 109 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/UniXcoder/downstream-tasks/code-completion/dataset.zip -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/README.md: -------------------------------------------------------------------------------- 1 | # Code Generation 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | mkdir dataset 7 | cd dataset 8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/train.json 9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/dev.json 10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/test.json 11 | cd .. 
12 | ``` 13 | 14 | ## Dependency 15 | 16 | - pip install torch 17 | - pip install transformers 18 | 19 | ## Fine-Tune Setting 20 | 21 | Here we provide fine-tune settings for code generation, whose results are reported in the paper. 22 | 23 | ```shell 24 | # Training 25 | python run.py \ 26 | --do_train \ 27 | --do_eval \ 28 | --model_name_or_path microsoft/unixcoder-base \ 29 | --train_filename dataset/train.json \ 30 | --dev_filename dataset/dev.json \ 31 | --output_dir saved_models \ 32 | --max_source_length 350 \ 33 | --max_target_length 150 \ 34 | --beam_size 3 \ 35 | --train_batch_size 32 \ 36 | --eval_batch_size 32 \ 37 | --learning_rate 5e-5 \ 38 | --gradient_accumulation_steps 1 \ 39 | --num_train_epochs 30 40 | 41 | # Output results 42 | python run.py \ 43 | --do_test \ 44 | --model_name_or_path microsoft/unixcoder-base \ 45 | --test_filename dataset/test.json \ 46 | --output_dir saved_models \ 47 | --max_source_length 350 \ 48 | --max_target_length 150 \ 49 | --beam_size 3 \ 50 | --train_batch_size 32 \ 51 | --eval_batch_size 32 \ 52 | --learning_rate 5e-5 \ 53 | --gradient_accumulation_steps 1 \ 54 | --num_train_epochs 30 55 | ``` 56 | 57 | Prediction results on the test set are saved to ```saved_models/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 58 | 59 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams up to a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | methods. 35 | 36 | Returns: 37 | The Counter containing all n-grams up to max_order in segment 38 | with a count of how many times each n-gram occurred. 
39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2)
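As a tiny self-check of the smoothed BLEU above (the toy token lists are assumptions; `_bleu` applies the same computation to files of whitespace-tokenized lines):

```python
# assumes the bleu.py above is importable
from bleu import compute_bleu

refs = [[["the", "cat", "sat"]]]  # one list of references per translation
hyps = [["the", "cat", "sat"]]
bleu, *_ = compute_bleu(refs, hyps, max_order=4, smooth=True)
print(round(100 * bleu, 2))  # 100.0 for an exact match
```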
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/run.sh: -------------------------------------------------------------------------------- 1 | pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html > log.txt 2>&1 2 | pip install scikit-learn scipy transformers tqdm > log.txt 2>&1 3 | export CUDA_VISIBLE_DEVICES=15,12,13,14 4 | lang=java #programming language 5 | lr=5e-5 6 | batch_size=32 7 | accm_steps=1 8 | beam_size=3 9 | source_length=512 10 | target_length=150 11 | data_dir=../../dataset 12 | output_dir=saved_models/$lang 13 | train_file=$data_dir/train.json 14 | dev_file=$data_dir/dev.json 15 | epochs=30 16 | pretrained_model=../../../pretrained-model/UniXcoder-base/ 17 | 18 | mkdir -p $output_dir 19 | python run.py \ 20 | --do_train \ 21 | --do_eval \ 22 | --model_name_or_path $pretrained_model \ 23 | --train_filename $train_file \ 24 | --dev_filename $dev_file \ 25 | --tokenizer_name roberta-base \ 26 | --output_dir $output_dir \ 27 | --max_source_length $source_length \ 28 | --max_target_length $target_length \ 29 | --beam_size $beam_size \ 30 | --train_batch_size $batch_size \ 31 | --eval_batch_size $batch_size \ 32 | --learning_rate $lr \ 33 | --gradient_accumulation_steps $accm_steps \ 34 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 35 | 36 | 37 | batch_size=64 38 | dev_file=$data_dir/dev.json 39 | test_file=$data_dir/test.json 40 | test_model=$output_dir/checkpoint-best-score/pytorch_model.bin #checkpoint for test 41 | 42 | python run.py \ 43 | --do_test \ 44 | --model_name_or_path $pretrained_model \ 45 | --load_model_path $test_model \ 46 | --dev_filename $dev_file \ 47 | --test_filename $test_file \ 48 | --output_dir $output_dir \ 49 | --max_source_length $source_length \ 50 | --max_target_length $target_length \ 51 | --beam_size $beam_size \ 52 | --gradient_accumulation_steps $accm_steps \ 53 | --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log 54 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Code Search 4 | 5 | ## Data Download 6 | 7 | #### 1. AdvTest dataset 8 | 9 | ```bash 10 | mkdir dataset && cd dataset 11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/NL-code-search-Adv/dataset.zip 12 | unzip dataset.zip && rm -r dataset.zip && mv dataset AdvTest && cd AdvTest 13 | wget https://zenodo.org/record/7857872/files/python.zip 14 | unzip python.zip && python preprocess.py && rm -r python && rm -r *.pkl && rm python.zip 15 | cd ../.. 16 | ``` 17 | 18 | #### 2. CosQA dataset 19 | 20 | ```bash 21 | cd dataset 22 | mkdir cosqa && cd cosqa 23 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/code_idx_map.txt 24 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-dev-500.json 25 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-test-500.json 26 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-train-19604.json 27 | cd ../.. 28 | ``` 29 | 30 | #### 3. CSN dataset 31 | 32 | ```bash 33 | cd dataset 34 | wget https://github.com/microsoft/CodeBERT/raw/master/GraphCodeBERT/codesearch/dataset.zip 35 | unzip dataset.zip && rm -r dataset.zip && mv dataset CSN && cd CSN 36 | bash run.sh 37 | cd ../.. 38 | ``` 39 | 40 | 41 | 42 | ## Dependency 43 | 44 | - pip install torch 45 | - pip install transformers 46 | 47 | ## Zero-Shot Setting 48 | 49 | We first provide scripts for zero-shot code search. To rank candidates, we use the cosine similarity between the UniXcoder hidden states of the code and the natural language query. 50 | 51 | #### 1. AdvTest dataset 52 | 53 | ```bash 54 | python run.py \ 55 | --output_dir saved_models/AdvTest \ 56 | --model_name_or_path microsoft/unixcoder-base \ 57 | --do_zero_shot \ 58 | --do_test \ 59 | --test_data_file dataset/AdvTest/test.jsonl \ 60 | --codebase_file dataset/AdvTest/test.jsonl \ 61 | --num_train_epochs 2 \ 62 | --code_length 256 \ 63 | --nl_length 128 \ 64 | --train_batch_size 64 \ 65 | --eval_batch_size 64 \ 66 | --learning_rate 2e-5 \ 67 | --seed 123456 68 | ``` 69 | 70 | #### 2. CosQA dataset 71 | 72 | ```bash 73 | python run.py \ 74 | --output_dir saved_models/cosqa \ 75 | --model_name_or_path microsoft/unixcoder-base \ 76 | --do_zero_shot \ 77 | --do_test \ 78 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \ 79 | --codebase_file dataset/cosqa/code_idx_map.txt \ 80 | --num_train_epochs 10 \ 81 | --code_length 256 \ 82 | --nl_length 128 \ 83 | --train_batch_size 64 \ 84 | --eval_batch_size 64 \ 85 | --learning_rate 2e-5 \ 86 | --seed 123456 87 | ``` 88 | 89 | #### 3. CSN dataset 90 | 91 | ```bash 92 | lang=python 93 | python run.py \ 94 | --output_dir saved_models/CSN/$lang \ 95 | --model_name_or_path microsoft/unixcoder-base \ 96 | --do_zero_shot \ 97 | --do_test \ 98 | --test_data_file dataset/CSN/$lang/test.jsonl \ 99 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 100 | --num_train_epochs 10 \ 101 | --code_length 256 \ 102 | --nl_length 128 \ 103 | --train_batch_size 64 \ 104 | --eval_batch_size 64 \ 105 | --learning_rate 2e-5 \ 106 | --seed 123456 107 | ``` 108 | 109 | 110 | 111 | ## Fine-Tune Setting 112 | 113 | Here we provide fine-tune settings for code search, whose results are reported in the paper. 114 | 115 | #### 1. 
AdvTest dataset 116 | 117 | ```shell 118 | # Training 119 | python run.py \ 120 | --output_dir saved_models/AdvTest \ 121 | --model_name_or_path microsoft/unixcoder-base \ 122 | --do_train \ 123 | --train_data_file dataset/AdvTest/train.jsonl \ 124 | --eval_data_file dataset/AdvTest/valid.jsonl \ 125 | --codebase_file dataset/AdvTest/valid.jsonl \ 126 | --num_train_epochs 2 \ 127 | --code_length 256 \ 128 | --nl_length 128 \ 129 | --train_batch_size 64 \ 130 | --eval_batch_size 64 \ 131 | --learning_rate 2e-5 \ 132 | --seed 123456 133 | 134 | # Evaluating 135 | python run.py \ 136 | --output_dir saved_models/AdvTest \ 137 | --model_name_or_path microsoft/unixcoder-base \ 138 | --do_test \ 139 | --test_data_file dataset/AdvTest/test.jsonl \ 140 | --codebase_file dataset/AdvTest/test.jsonl \ 141 | --num_train_epochs 2 \ 142 | --code_length 256 \ 143 | --nl_length 128 \ 144 | --train_batch_size 64 \ 145 | --eval_batch_size 64 \ 146 | --learning_rate 2e-5 \ 147 | --seed 123456 148 | ``` 149 | #### 2. CosQA dataset 150 | 151 | ```bash 152 | # Training 153 | python run.py \ 154 | --output_dir saved_models/cosqa \ 155 | --model_name_or_path microsoft/unixcoder-base \ 156 | --do_train \ 157 | --train_data_file dataset/cosqa/cosqa-retrieval-train-19604.json \ 158 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \ 159 | --codebase_file dataset/cosqa/code_idx_map.txt \ 160 | --num_train_epochs 10 \ 161 | --code_length 256 \ 162 | --nl_length 128 \ 163 | --train_batch_size 64 \ 164 | --eval_batch_size 64 \ 165 | --learning_rate 2e-5 \ 166 | --seed 123456 167 | 168 | # Evaluating 169 | python run.py \ 170 | --output_dir saved_models/cosqa \ 171 | --model_name_or_path microsoft/unixcoder-base \ 172 | --do_eval \ 173 | --do_test \ 174 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \ 175 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \ 176 | --codebase_file dataset/cosqa/code_idx_map.txt \ 177 | --num_train_epochs 10 \ 178 | --code_length 256 \ 179 | --nl_length 128 \ 180 | --train_batch_size 64 \ 181 | --eval_batch_size 64 \ 182 | --learning_rate 2e-5 \ 183 | --seed 123456 184 | ``` 185 | 186 | #### 3. CSN dataset 187 | 188 | ```bash 189 | # Training 190 | lang=python 191 | python run.py \ 192 | --output_dir saved_models/CSN/$lang \ 193 | --model_name_or_path microsoft/unixcoder-base \ 194 | --do_train \ 195 | --train_data_file dataset/CSN/$lang/train.jsonl \ 196 | --eval_data_file dataset/CSN/$lang/valid.jsonl \ 197 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 198 | --num_train_epochs 10 \ 199 | --code_length 256 \ 200 | --nl_length 128 \ 201 | --train_batch_size 64 \ 202 | --eval_batch_size 64 \ 203 | --learning_rate 2e-5 \ 204 | --seed 123456 205 | 206 | # Evaluating 207 | python run.py \ 208 | --output_dir saved_models/CSN/$lang \ 209 | --model_name_or_path microsoft/unixcoder-base \ 210 | --do_eval \ 211 | --do_test \ 212 | --eval_data_file dataset/CSN/$lang/valid.jsonl \ 213 | --test_data_file dataset/CSN/$lang/test.jsonl \ 214 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 215 | --num_train_epochs 10 \ 216 | --code_length 256 \ 217 | --nl_length 128 \ 218 | --train_batch_size 64 \ 219 | --eval_batch_size 64 \ 220 | --learning_rate 2e-5 \ 221 | --seed 123456 222 | 223 | ``` 224 | 225 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs*code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(-1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1)  # unit-norm so dot products are cosine similarities 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs*nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(-1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-summarization/README.md: -------------------------------------------------------------------------------- 1 | # Code Summarization 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Text/code-to-text/dataset.zip 7 | unzip dataset.zip 8 | rm dataset.zip 9 | cd dataset 10 | wget https://zenodo.org/record/7857872/files/python.zip 11 | wget https://zenodo.org/record/7857872/files/java.zip 12 | wget https://zenodo.org/record/7857872/files/ruby.zip 13 | wget https://zenodo.org/record/7857872/files/javascript.zip 14 | wget https://zenodo.org/record/7857872/files/go.zip 15 | wget https://zenodo.org/record/7857872/files/php.zip 16 | 17 | unzip python.zip 18 | unzip java.zip 19 | unzip ruby.zip 20 | unzip javascript.zip 21 | unzip go.zip 22 | unzip php.zip 23 | rm *.zip 24 | rm *.pkl 25 | 26 | python preprocess.py 27 | rm -r */final 28 | cd .. 29 | ``` 30 | 31 | ## Dependency 32 | 33 | - pip install torch 34 | - pip install transformers 35 | 36 | ## Fine-Tune Setting 37 | 38 | Here we provide fine-tune settings for code summarization, whose results are reported in the paper. 39 | 40 | ```shell 41 | lang=python 42 | 43 | # Training 44 | python run.py \ 45 | --do_train \ 46 | --do_eval \ 47 | --model_name_or_path microsoft/unixcoder-base \ 48 | --train_filename dataset/$lang/train.jsonl \ 49 | --dev_filename dataset/$lang/valid.jsonl \ 50 | --output_dir saved_models/$lang \ 51 | --max_source_length 256 \ 52 | --max_target_length 128 \ 53 | --beam_size 10 \ 54 | --train_batch_size 48 \ 55 | --eval_batch_size 48 \ 56 | --learning_rate 5e-5 \ 57 | --gradient_accumulation_steps 2 \ 58 | --num_train_epochs 10 59 | 60 | # Evaluating 61 | python run.py \ 62 | --do_test \ 63 | --model_name_or_path microsoft/unixcoder-base \ 64 | --test_filename dataset/$lang/test.jsonl \ 65 | --output_dir saved_models/$lang \ 66 | --max_source_length 256 \ 67 | --max_target_length 128 \ 68 | --beam_size 10 \ 69 | --train_batch_size 48 \ 70 | --eval_batch_size 48 \ 71 | --learning_rate 5e-5 \ 72 | --gradient_accumulation_steps 2 \ 73 | --num_train_epochs 10 74 | ``` 75 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Zero-shot Code-to-Code Search 4 | 5 | Given a piece of source code as the query, the task aims to retrieve code with the same semantics from a collection of candidates in a zero-shot setting. 
We collect 11,744/15,594/23,530 functions from the [CodeNet](https://github.com/IBM/Project_CodeNet) corpus in Ruby/Python/Java. Each function solves one of 4,053 problems. 6 | 7 | 8 | 9 | ## Data Download 10 | 11 | ```bash 12 | cd dataset 13 | wget https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet.tar.gz 14 | tar -xvf Project_CodeNet.tar.gz 15 | python preprocess.py 16 | cd .. 17 | ``` 18 | 19 | 20 | 21 | ## Dependency 22 | 23 | - pip install torch 24 | - pip install transformers 25 | 26 | 27 | 28 | ## Zero-Shot Setting 29 | 30 | ```bash 31 | source_lang=ruby 32 | target_lang=python 33 | python run.py \ 34 | --model_name_or_path microsoft/unixcoder-base \ 35 | --query_data_file dataset/${source_lang}_with_func.jsonl \ 36 | --candidate_data_file dataset/${target_lang}_with_func.jsonl \ 37 | --query_lang ${source_lang} \ 38 | --candidate_lang ${target_lang} \ 39 | --code_length 512 \ 40 | --eval_batch_size 256 41 | ``` 42 | 43 | 44 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | for lang,suffix in [("Java",".java"),("Ruby",".rb"),("Python",".py")]: 4 | with open("{}.jsonl".format(lang.lower())) as f, open("{}_with_func.jsonl".format(lang.lower()),"w") as f1: 5 | for line in f: 6 | js = json.loads(line.strip()) 7 | problem_id = str(js["label"]) 8 | problem_id = "p" + "0" * (5-len(problem_id)) + problem_id 9 | language = lang 10 | submission_id = js["index"] 11 | func = open("Project_CodeNet/data/{}/{}/{}{}".format(problem_id,language,submission_id,suffix)).read() 12 | js["func"] = func 13 | f1.write(json.dumps(js)+"\n") 14 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None, cls=False): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs * code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1)  # unit-norm so dot products are cosine similarities 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs * nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | 22 | --------------------------------------------------------------------------------
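To make the retrieval step concrete, here is a small sketch of comparing the unit-norm vectors produced by the `Model` wrapper above (the model name and 512-token budget mirror the script; the toy snippets are assumptions):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Model is the wrapper defined in model.py above.
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
model = Model(AutoModel.from_pretrained("microsoft/unixcoder-base"))

def embed(codes):
    batch = tokenizer(codes, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        return model(code_inputs=batch["input_ids"])

query = embed(["def add(a, b):\n    return a + b"])
candidates = embed(["public int add(int a, int b) { return a + b; }",
                    "print('hello world')"])
scores = query @ candidates.T        # dot products of unit vectors = cosine similarities
print(scores.argmax(dim=-1).item())  # expected 0: the Java add() is nearest
```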