├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CodeBERT ├── code2nl │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── codesearch │ ├── README.md │ ├── mrr.py │ ├── process_data.py │ ├── run_classifier.py │ └── utils.py ├── CodeExecutor ├── README.md ├── downstream │ ├── model_unixcoder.py │ ├── run.py │ └── run.sh ├── inference │ ├── dataset.py │ ├── metric.py │ ├── model.py │ ├── run.py │ └── run.sh └── pretrain │ ├── dataset.py │ ├── model.py │ ├── run.py │ └── run.sh ├── CodeReviewer ├── README.md └── code │ ├── bleu.py │ ├── configs.py │ ├── evaluator │ ├── CodeBLEU │ │ ├── bleu.py │ │ ├── calc_code_bleu.py │ │ ├── dataflow_match.py │ │ ├── keywords │ │ │ ├── c_sharp.txt │ │ │ └── java.txt │ │ ├── parser │ │ │ ├── DFG.py │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── build.sh │ │ │ └── utils.py │ │ ├── readme.txt │ │ ├── syntax_match.py │ │ ├── utils.py │ │ └── weighted_ngram_match.py │ ├── bleu.py │ ├── smooth_bleu.py │ └── stopwords.txt │ ├── models.py │ ├── run_finetune_cls.py │ ├── run_finetune_msg.py │ ├── run_finetune_ref.py │ ├── run_infer_msg.py │ ├── run_test_cls.py │ ├── run_test_msg.py │ ├── run_test_ref.py │ ├── sh │ ├── finetune-cls.sh │ ├── finetune-msg.sh │ ├── finetune-ref.sh │ ├── infer-json.sh │ ├── test-cls.sh │ ├── test-msg.sh │ ├── test-ref.sh │ └── test_nltk.sh │ ├── test_model.py │ └── utils.py ├── GraphCodeBERT ├── clonedetection │ ├── README.md │ ├── dataset.zip │ ├── evaluator │ │ ├── answers.txt │ │ ├── evaluator.py │ │ └── predictions.txt │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── codesearch │ ├── README.md │ ├── dataset.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── refinement │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py └── translation │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ ├── DFG.py │ ├── __init__.py │ ├── build.py │ ├── build.sh │ ├── my-languages.so │ └── utils.py │ └── run.py ├── LICENSE ├── LongCoder ├── README.md ├── longcoder.py ├── model.py ├── parser │ ├── DFG.py │ ├── __init__.py │ ├── build.py │ ├── build.sh │ ├── my-languages.so │ └── utils.py ├── run.py └── run.sh ├── NOTICE.md ├── README.md ├── SECURITY.md └── UniXcoder ├── README.md ├── downstream-tasks ├── clone-detection │ ├── BCB │ │ ├── README.md │ │ ├── model.py │ │ ├── run.py │ │ └── run.sh │ └── POJ-104 │ │ ├── README.md │ │ ├── dataset │ │ └── preprocess.py │ │ ├── model.py │ │ └── run.py ├── code-completion │ ├── README.md │ ├── dataset.zip │ ├── model.py │ └── run.py ├── code-generation │ ├── README.md │ ├── bleu.py │ ├── model.py │ ├── run.py │ └── run.sh ├── code-search │ ├── README.md │ ├── model.py │ └── run.py ├── code-summarization │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── zero-shot-search │ ├── README.md │ ├── dataset │ ├── java.jsonl │ ├── preprocess.py │ ├── python.jsonl │ └── ruby.jsonl │ ├── model.py │ └── run.py └── unixcoder.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | 
develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
-------------------------------------------------------------------------------- /CodeBERT/code2nl/README.md: --------------------------------------------------------------------------------
# Code Documentation Generation

This repo provides the code for reproducing the experiments on the [CodeSearchNet](https://arxiv.org/abs/1909.09436) dataset for the code documentation generation task in six programming languages.

**!News: We release a new pipeline for this task. The new pipeline needs only two P100 GPUs and less training time for Code Documentation Generation. Please refer to the [website](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text).**

## Dependency

- pip install torch==1.4.0
- pip install transformers==2.5.0
- pip install filelock

## Data Preprocess

We clean the CodeSearchNet dataset for this task with the following steps:

- Remove comments in the code.
- Remove examples whose code cannot be parsed into an abstract syntax tree.
- Remove examples whose documentation is shorter than 3 tokens or longer than 256 tokens.
- Remove examples whose documentation contains special tokens (e.g. `<img ...>` or `https:...`).
- Remove examples whose documentation is not written in English.

Data statistics for the cleaned dataset are shown in the table below. We release the cleaned dataset on this [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h).

| PL         | Training | Dev    | Test   |
| :--------- | :------: | :----: | :----: |
| Python     | 251,820  | 13,914 | 14,918 |
| PHP        | 241,241  | 12,982 | 14,014 |
| Go         | 167,288  | 7,325  | 8,122  |
| Java       | 164,923  | 5,183  | 10,955 |
| JavaScript | 58,025   | 3,885  | 3,291  |
| Ruby       | 24,927   | 1,400  | 1,261  |

## Data Download

You can download the dataset from the [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h), or use the following commands.

```shell
pip install gdown
mkdir data data/code2nl
cd data/code2nl
gdown https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h
unzip Cleaned_CodeSearchNet.zip
rm Cleaned_CodeSearchNet.zip
cd ../..
```

## Fine-Tune

We fine-tuned the model on 4*P40 GPUs.
```shell
cd code2nl

lang=php #programming language
lr=5e-5
batch_size=64
beam_size=10
source_length=256
target_length=128
data_dir=../data/code2nl/CodeSearchNet
output_dir=model/$lang
train_file=$data_dir/$lang/train.jsonl
dev_file=$data_dir/$lang/valid.jsonl
eval_steps=1000 #400 for ruby, 600 for javascript, 1000 for others
train_steps=50000 #20000 for ruby, 30000 for javascript, 50000 for others
pretrained_model=microsoft/codebert-base #Roberta: roberta-base

python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --train_steps $train_steps --eval_steps $eval_steps
```

## Inference and Evaluation

After fine-tuning, inference and evaluation are as follows:

```shell
lang=php #programming language
beam_size=10
batch_size=128
source_length=256
target_length=128
output_dir=model/$lang
data_dir=../data/code2nl/CodeSearchNet
dev_file=$data_dir/$lang/valid.jsonl
test_file=$data_dir/$lang/test.jsonl
test_model=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test

python run.py --do_test --model_type roberta --model_name_or_path microsoft/codebert-base --load_model_path $test_model --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size
```

The results on CodeSearchNet are shown in the table below:

| Model       | Ruby      | Javascript | Go        | Python    | Java      | PHP       | Overall   |
| ----------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
| Seq2Seq     | 9.64      | 10.21      | 13.98     | 15.93     | 15.09     | 21.08     | 14.32     |
| Transformer | 11.18     | 11.59      | 16.38     | 15.81     | 16.26     | 22.12     | 15.56     |
| RoBERTa     | 11.17     | 11.90      | 17.72     | 18.14     | 16.47     | 24.02     | 16.57     |
| CodeBERT    | **12.16** | **14.90**  | **18.07** | **19.06** | **17.65** | **25.16** | **17.83** |
-------------------------------------------------------------------------------- /CodeBERT/codesearch/README.md: --------------------------------------------------------------------------------
# Code Search

## Data Preprocess

Both the training and validation datasets are balanced between positive and negative samples; negative samples are constructed by randomly replacing either the NL (docstring) or the PL (code) half of a pair, in equal proportion.

We follow the official evaluation metric and calculate the Mean Reciprocal Rank (MRR) for each pair of test data (c, w) over a fixed set of 999 distractor codes.

You can use the following commands to download the preprocessed training and validation datasets and to preprocess the test dataset yourself. The preprocessed test dataset is very large, so only the preprocessing script is provided.

```shell
mkdir data data/codesearch
cd data/codesearch
gdown https://drive.google.com/uc?id=1xgSR34XO8xXZg4cZScDYj2eGerBE9iGo
unzip codesearch_data.zip
rm codesearch_data.zip
cd ../../codesearch
python process_data.py
cd ..
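# note: process_data.py (shown below) writes the test set as one file per
# batch of 1000 examples under ../data/codesearch/test/<lang>/, each line
# holding the fields joined by the <CODESPLIT> separator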
```

## Fine-Tune
We fine-tuned the model on 2*P100 GPUs.
```shell
cd codesearch

lang=php #fine-tuning a language-specific model for each programming language
pretrained_model=microsoft/codebert-base #Roberta: roberta-base

python run_classifier.py \
--model_type roberta \
--task_name codesearch \
--do_train \
--do_eval \
--eval_all_checkpoints \
--train_file train.txt \
--dev_file valid.txt \
--max_seq_length 200 \
--per_gpu_train_batch_size 32 \
--per_gpu_eval_batch_size 32 \
--learning_rate 1e-5 \
--num_train_epochs 8 \
--gradient_accumulation_steps 1 \
--overwrite_output_dir \
--data_dir ../data/codesearch/train_valid/$lang \
--output_dir ./models/$lang \
--model_name_or_path $pretrained_model
```
## Inference and Evaluation

Inference
```shell
lang=php #programming language
idx=0 #test batch idx

python run_classifier.py \
--model_type roberta \
--model_name_or_path microsoft/codebert-base \
--task_name codesearch \
--do_predict \
--output_dir ./models/$lang \
--data_dir ../data/codesearch/test/$lang \
--max_seq_length 200 \
--per_gpu_train_batch_size 32 \
--per_gpu_eval_batch_size 32 \
--learning_rate 1e-5 \
--num_train_epochs 8 \
--test_file batch_${idx}.txt \
--pred_model_dir ./models/$lang/checkpoint-best/ \
--test_result_dir ./results/$lang/${idx}_batch_result.txt
```

Evaluation
```shell
python mrr.py
```
-------------------------------------------------------------------------------- /CodeBERT/codesearch/mrr.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import numpy as np
from more_itertools import chunked
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_batch_size', type=int, default=1000)
    args = parser.parse_args()
    languages = ['ruby', 'go', 'php', 'python', 'java', 'javascript']
    MRR_dict = {}
    for language in languages:
        file_dir = './results/{}'.format(language)
        ranks = []
        num_batch = 0
        for file in sorted(os.listdir(file_dir)):
            print(os.path.join(file_dir, file))
            with open(os.path.join(file_dir, file), encoding='utf-8') as f:
                batched_data = chunked(f.readlines(), args.test_batch_size)
                for batch_idx, batch_data in enumerate(batched_data):
                    num_batch += 1
                    correct_score = float(batch_data[batch_idx].strip().split('<CODESPLIT>')[-1])
                    scores = np.array([float(data.strip().split('<CODESPLIT>')[-1]) for data in batch_data])
                    rank = np.sum(scores >= correct_score)
                    ranks.append(rank)

        mean_mrr = np.mean(1.0 / np.array(ranks))
        print("{} mrr: {}".format(language, mean_mrr))
        MRR_dict[language] = mean_mrr
    for key, val in MRR_dict.items():
        print("{} mrr: {}".format(key, val))


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /CodeBERT/codesearch/process_data.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
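# Builds the fixed-size MRR test batches described in the README: within each
# batch of 1000 examples, every docstring is paired with all 1000 code
# snippets (1 correct + 999 distractors), one <CODESPLIT>-joined pair per line.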
import gzip
import os
import json
import numpy as np
from more_itertools import chunked

DATA_DIR = '../data/codesearch'

def format_str(string):
    for char in ['\r\n', '\r', '\n']:
        string = string.replace(char, ' ')
    return string


def preprocess_test_data(language, test_batch_size=1000):
    path = os.path.join(DATA_DIR, '{}_test_0.jsonl.gz'.format(language))
    print(path)
    with gzip.open(path, 'r') as pf:
        data = pf.readlines()

    idxs = np.arange(len(data))
    data = np.array(data, dtype=object)

    np.random.seed(0)  # set random seed so that random things are reproducible
    np.random.shuffle(idxs)
    data = data[idxs]
    batched_data = chunked(data, test_batch_size)

    print("start processing")
    for batch_idx, batch_data in enumerate(batched_data):
        if len(batch_data) < test_batch_size:
            break  # the last batch is smaller than the others, exclude.
        examples = []
        for d_idx, d in enumerate(batch_data):
            line_a = json.loads(str(d, encoding='utf-8'))
            doc_token = ' '.join(line_a['docstring_tokens'])
            for dd in batch_data:
                line_b = json.loads(str(dd, encoding='utf-8'))
                code_token = ' '.join([format_str(token) for token in line_b['code_tokens']])

                example = (str(1), line_a['url'], line_b['url'], doc_token, code_token)
                example = '<CODESPLIT>'.join(example)
                examples.append(example)

        data_path = os.path.join(DATA_DIR, 'test/{}'.format(language))
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        file_path = os.path.join(data_path, 'batch_{}.txt'.format(batch_idx))
        print(file_path)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines('\n'.join(examples))

if __name__ == '__main__':
    languages = ['go', 'php', 'python', 'java', 'javascript', 'ruby']
    for lang in languages:
        preprocess_test_data(lang)
-------------------------------------------------------------------------------- /CodeExecutor/downstream/model_unixcoder.py: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
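# Bi-encoder used for code-to-code search: one shared encoder embeds either a
# code snippet or an NL query; token embeddings are mean-pooled over
# non-padding positions (pad token id 1, RoBERTa-style) and L2-normalized, so
# similarity between two inputs reduces to a dot product.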
3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None, cls=False): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs * code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(1)[:,None] 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs * nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | -------------------------------------------------------------------------------- /CodeExecutor/downstream/run.sh: -------------------------------------------------------------------------------- 1 | source_lang=python 2 | target_lang=python 3 | python run.py \ 4 | --model_name_or_path microsoft/unixcoder-base \ 5 | --query_data_file ../data/code_to_code_search_test.json \ 6 | --candidate_data_file ../data/code_to_code_search_test.json \ 7 | --trace_file ../saved_models/code_to_code_search/preds.txt \ 8 | --query_lang ${source_lang} \ 9 | --candidate_lang ${target_lang} \ 10 | --code_length 512 \ 11 | --eval_batch_size 256 12 | -------------------------------------------------------------------------------- /CodeExecutor/inference/dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.utils.data import Dataset 4 | import os 5 | import pickle 6 | import logging 7 | import json 8 | from tqdm import tqdm 9 | 10 | 11 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 12 | """Truncates a sequence pair in place to the maximum length.""" 13 | while True: 14 | total_length = len(tokens_a) + len(tokens_b) 15 | if total_length <= max_length: 16 | break 17 | if len(tokens_a) > len(tokens_b): 18 | tokens_a.pop() 19 | else: 20 | tokens_b.pop() 21 | 22 | def _truncate_seq_pair_two_length(tokens_a, tokens_b, max_length_a, max_length_b): 23 | """Truncates a sequence pair in place to the maximum length.""" 24 | while True: 25 | total_length = len(tokens_a) + len(tokens_b) 26 | if total_length <= max_length_a + max_length_b: 27 | break 28 | if len(tokens_b) > max_length_b: 29 | tokens_b.pop() 30 | else: # len(tokens_a) > max_length_a 31 | tokens_a.pop() 32 | 33 | class InputFeatures(object): 34 | """A single training/test features for a example.""" 35 | def __init__(self, 36 | code_tokens, 37 | trace_tokens 38 | 39 | ): 40 | self.code_tokens = code_tokens 41 | self.trace_tokens = trace_tokens 42 | 43 | def convert_examples_to_features(item): 44 | # parsing 45 | js,tokenizer=item 46 | code_tokens = js["code_tokens"] 47 | trace_tokens = js["trace_tokens"] 48 | code_tokens = tokenizer.tokenize(" ".join(code_tokens)) 49 | trace_tokens = tokenizer.tokenize(" ".join(trace_tokens)) 50 | 51 | return InputFeatures(code_tokens,trace_tokens) 52 | 53 | 54 | 55 | class TextDataset(Dataset): 56 | def __init__(self, tokenizer, args, filename, local_rank, world_size, logger, mode, prefix=""): 57 | self.args = args 58 | self.tokenizer = tokenizer 59 | 60 | if len(prefix) > 0: 61 | cached_features_file = os.path.join('{}'.format(args.data_cache_dir), prefix + "_word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl') 62 | else: 63 | 
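            # no dataset prefix: key the cache file only by world size, rank, block size, and split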
            cached_features_file = os.path.join('{}'.format(args.data_cache_dir), "word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl')
        if os.path.exists(cached_features_file):
            logger.warning("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle1:
                self.examples = pickle.load(handle1)
            if 'train' in mode and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(' '.join(map(str, example.code_tokens))))
                    logger.warning("trace_tokens: {}".format(' '.join(map(str, example.trace_tokens))))
        else:
            self.examples = []
            total_num = 0
            error_num = 0
            logger.info("Load and create features from dataset file at %s", filename)
            num_lines = sum(1 for line in open(filename,'r'))
            with open(filename,"r",encoding="utf8") as f:
                for i,line in enumerate(tqdm(f,total=num_lines)):
                    json_line = json.loads(line)
                    if len(json_line['code_tokens']) != 0:
                        total_num += 1
                        if (mode == "train" and total_num % world_size == local_rank) or (mode != "train" and local_rank in [-1, 0]):
                            js = {}
                            if len(prefix) > 0:
                                js["code_tokens"] = ["<"+prefix+">"]
                                js["code_tokens"].extend(json_line["code_tokens"])
                            else:
                                js["code_tokens"] = json_line["code_tokens"]
                            js["trace_tokens"] = json_line["trace_tokens"]
                            try:
                                features = convert_examples_to_features((js, tokenizer))
                                cur_index = len(self.examples)
                                self.examples.append(features)
                            except:
                                error_num += 1

            if mode == "train" and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(example.code_tokens))
                    logger.warning("trace_tokens: {}".format(example.trace_tokens))

            logger.warning("Num examples = %d: %d", local_rank,len(self.examples))
            logger.warning(f"Error num = {error_num}")
            # debug
            logger.warning("Saving features into cached file %s", cached_features_file)
            if not os.path.exists(args.data_cache_dir):
                os.makedirs(args.data_cache_dir)
            with open(cached_features_file, 'wb') as handle1:
                pickle.dump(self.examples, handle1, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        js = self.examples[item]

        # Encoder-Decoder for Trace Generation
        # UniXcoder-style layout: <s> <encoder-decoder> </s> source <mask0> </s>
        source_tokens = js.code_tokens[0:self.args.max_source_length-5]
        source_tokens = ["<s>","<encoder-decoder>","</s>"] + source_tokens + ["<mask0>"] + ["</s>"]
        source_ids = self.tokenizer.convert_tokens_to_ids(source_tokens)
        padding_length = self.args.max_source_length - len(source_ids)
        source_ids += [self.tokenizer.pad_token_id]*padding_length

        target_tokens = self.tokenizer.tokenize("None")  # generate: decoder is primed with a dummy "None" target
        target_tokens = ["<mask0>"] + target_tokens + [self.tokenizer.sep_token]
        target_ids = self.tokenizer.convert_tokens_to_ids(target_tokens)
        padding_length = self.args.max_target_length - len(target_ids)
        target_ids += [self.tokenizer.pad_token_id] * padding_length

        gold_tokens = js.trace_tokens[:self.args.max_target_length-2]
        gold_tokens = ["<mask0>"] + gold_tokens + [self.tokenizer.sep_token]
        gold_ids = self.tokenizer.convert_tokens_to_ids(gold_tokens)
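        # right-pad the gold trace so every tensor in the batch has length max_target_length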
        padding_length = self.args.max_target_length - len(gold_ids)
        gold_ids += [self.tokenizer.pad_token_id] * padding_length

        return (
            torch.tensor(source_ids),
            torch.tensor(target_ids),
            torch.tensor(gold_ids),
        )
-------------------------------------------------------------------------------- /CodeExecutor/inference/run.sh: --------------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES=0 python run.py \
    --prefix codenet \
    --output_dir ../saved_models/inference \
    --data_cache_dir ../saved_models/inference \
    --eval_data_path ../data/codenetmut_test.json \
    --model_name_or_path microsoft/codeexecutor \
    --block_size 1024 \
    --per_gpu_train_batch_size 8 \
    --per_gpu_eval_batch_size 16 \
    --gradient_accumulation_steps 8 \
    --learning_rate 1e-4 \
    --node_index 0 \
    --weight_decay 0.01 \
    --adam_epsilon 1e-6 \
    --max_grad_norm 1.0 \
    --max_steps 1000 \
    --warmup_steps 10000 \
    --save_steps 5000 \
    --seed 123456
-------------------------------------------------------------------------------- /CodeExecutor/pretrain/dataset.py: --------------------------------------------------------------------------------
import random
import torch
from torch.utils.data import Dataset
import os
import pickle
import logging
import json
from tqdm import tqdm


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

def _truncate_seq_pair_two_length(tokens_a, tokens_b, max_length_a, max_length_b):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length_a + max_length_b:
            break
        if len(tokens_b) > max_length_b:
            tokens_b.pop()
        else: # len(tokens_a) > max_length_a
            tokens_a.pop()

class InputFeatures(object):
    """A single training/test features for an example."""
    def __init__(self,
                 code_tokens,
                 trace_tokens
    ):
        self.code_tokens = code_tokens
        self.trace_tokens = trace_tokens

def convert_examples_to_features(item):
    # parsing
    js, tokenizer = item
    code_tokens = js["code_tokens"]
    trace_tokens = js["trace_tokens"]
    code_tokens = tokenizer.tokenize(" ".join(code_tokens))
    trace_tokens = tokenizer.tokenize(" ".join(trace_tokens))

    return InputFeatures(code_tokens, trace_tokens)



class TextDataset(Dataset):
    def __init__(self, tokenizer, args, filename, local_rank, world_size, logger, mode, prefix=""):
        self.args = args
        self.tokenizer = tokenizer
        if len(prefix) > 0:
            cached_features_file = os.path.join('{}'.format(args.data_cache_dir), prefix + "_word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl')
        else:
            cached_features_file = os.path.join('{}'.format(args.data_cache_dir), "word_size_"+str(world_size)+"_rank_"+str(local_rank)+'_size_'+ str(args.block_size)+'_'+mode+'.pkl')
        if os.path.exists(cached_features_file):
            logger.warning("Loading features from cached file %s", cached_features_file)
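            # reuse the examples cached by a previous run instead of re-tokenizing the corpus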
            with open(cached_features_file, 'rb') as handle1:
                self.examples = pickle.load(handle1)
            if 'train' in mode and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(' '.join(map(str, example.code_tokens))))
                    logger.warning("trace_tokens: {}".format(' '.join(map(str, example.trace_tokens))))
        else:
            self.examples = []
            total_num = 0
            error_num = 0
            logger.info("Load and create features from dataset file at %s", filename)
            num_lines = sum(1 for line in open(filename,'r'))
            with open(filename,"r",encoding="utf8") as f:
                for i,line in enumerate(tqdm(f,total=num_lines)):
                    json_line = json.loads(line)
                    if len(json_line['code_tokens']) != 0:
                        total_num += 1
                        if (mode == "train" and total_num % world_size == local_rank) or (mode != "train" and local_rank in [-1, 0]):
                            js = {}
                            if len(prefix) > 0:
                                js["code_tokens"] = ["<"+prefix+">"]
                                js["code_tokens"].extend(json_line["code_tokens"])
                            else:
                                js["code_tokens"] = json_line["code_tokens"]
                            js["trace_tokens"] = json_line["trace_tokens"]
                            try:
                                features = convert_examples_to_features((js, tokenizer))
                                cur_index = len(self.examples)
                                self.examples.append(features)
                            except:
                                error_num += 1

            if mode == "train" and local_rank==0:
                for idx, example in enumerate(self.examples[:1]):
                    logger.warning("*** Example ***")
                    logger.warning("idx: %s",idx)
                    logger.warning("code_tokens: {}".format(example.code_tokens))
                    logger.warning("trace_tokens: {}".format(example.trace_tokens))


            logger.warning("Num examples = %d: %d", local_rank,len(self.examples))
            logger.warning(f"Error num = {error_num}")
            # debug
            logger.warning("Saving features into cached file %s", cached_features_file)
            if not os.path.exists(args.data_cache_dir):
                os.makedirs(args.data_cache_dir)
            with open(cached_features_file, 'wb') as handle1:
                pickle.dump(self.examples, handle1, protocol=pickle.HIGHEST_PROTOCOL)




    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        js = self.examples[item]

        # Encoder-Decoder for Trace Generation
        # Assumed UniXcoder-style layout (special-token strings reconstructed):
        # <s> <encoder-decoder> </s> source <mask0> </s> | <mask0> target </s>
        source_tokens = js.code_tokens
        target_tokens = ["<mask0>"] + js.trace_tokens
        _truncate_seq_pair_two_length(source_tokens,target_tokens,self.args.block_size//4 - 1, self.args.block_size//2 + self.args.block_size//4 - 5)
        source_tokens = source_tokens + ["<mask0>"]
        text_tokens = ["<s>","<encoder-decoder>","</s>"] + source_tokens + ["</s>"] + target_tokens + ["</s>"]
        text_ids = self.tokenizer.convert_tokens_to_ids(text_tokens)
        dual_gen_ids = text_ids + [self.tokenizer.pad_token_id]*(self.args.block_size-len(text_ids))
        dual_gen_type_ids = [1] * len(["<s>","<encoder-decoder>","</s>"] + source_tokens + ["</s>"]) + [2] * len(target_tokens + ["</s>"]) + [0]*(self.args.block_size-len(text_ids))


        return (
            torch.tensor(dual_gen_ids),
            torch.tensor(dual_gen_type_ids),
        )
-------------------------------------------------------------------------------- /CodeExecutor/pretrain/model.py: --------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
import copy
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
import random


class Model(nn.Module):
    def __init__(self, encoder, config, tokenizer, args):
        super(Model, self).__init__()
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        self.args = args
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        self.qa_outputs = nn.Linear(config.hidden_size, config.hidden_size)
        self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
        self.register_buffer(
            "bias", torch.tril(torch.ones((args.block_size, args.block_size), dtype=torch.uint8)).view(1, args.block_size, args.block_size)
        )
        # down-weight trivial structural trace tokens (assumed markup set) so
        # the loss concentrates on the harder value predictions
        self.weights = torch.full([len(self.tokenizer)], 10.0).to(self.args.device)
        easy_ids = self.tokenizer.convert_tokens_to_ids(["<line>", "<state>", "</state>", ":"])
        for i in easy_ids:
            self.weights[i] = 1.0

    def forward(self, dual_gen_ids, dual_gen_type_ids):
        dual_loss, align_loss, contras_loss = 0, 0, 0

        # Encoder-Decoder for Cross-modal Generation
        source_ids = dual_gen_ids
        type_ids = dual_gen_type_ids
        attention_mask = self.bias
        attention_mask = attention_mask | (type_ids.eq(1)[:,:,None]*type_ids.eq(1)[:,None,:])
        outputs = self.encoder(source_ids, attention_mask=attention_mask)
        encoder_outputs = outputs.last_hidden_state[:,:-1]
        labels_mask = type_ids.eq(2)[:,1:]
        encoder_outputs = encoder_outputs.reshape(-1, encoder_outputs.size(-1))[labels_mask.reshape(-1)]
        prediction_scores = self.lm_head(encoder_outputs)
        lm_labels = source_ids[:,1:].reshape(-1)[labels_mask.reshape(-1)]

        loss_fct = CrossEntropyLoss(reduction='none')
        lm_loss = loss_fct(prediction_scores, lm_labels)
        lm_loss = self.weights[lm_labels] * lm_loss
        lm_loss = lm_loss.sum()/len(lm_labels)

        dual_loss = lm_loss.item()
        return lm_loss, dual_loss, align_loss, contras_loss
-------------------------------------------------------------------------------- /CodeExecutor/pretrain/run.sh: --------------------------------------------------------------------------------
PER_NODE_GPU=8
python -m torch.distributed.launch --nproc_per_node=${PER_NODE_GPU} run.py \
    --output_dir ../saved_models/pretrain_codeexecutor_stage_3 \
    --data_cache_dir ../saved_models/pretrain_codeexecutor_stage_3 \
    --train_data_path /drive/pretrain_codenetmut.json \
    --another_train_data_path /drive/pretrain_tutorial.json \
    --third_train_data_path /drive/single_line_hard_3_million.json \
    --eval_data_path ../data/codenetmut_test.json \
    --model_name_or_path ../saved_models/pretrain_codeexecutor_stage_2 \
    --block_size 1024 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 8 \
    --gradient_accumulation_steps 8 \
    --learning_rate 4e-4 \
    --node_index=0 \
    --gpu_per_node $PER_NODE_GPU \
    --weight_decay 0.01 \
    --adam_epsilon 1e-6 \
    --max_grad_norm 1.0 \
    --max_steps 1000000 \
    --warmup_steps 10000 \
    --save_steps 5000 \
    --seed 123
-------------------------------------------------------------------------------- /CodeReviewer/README.md: --------------------------------------------------------------------------------
# CodeReviewer

This repo provides the code for reproducing the experiments in [CodeReviewer: Pre-Training for Automating Code Review Activities](https://arxiv.org/abs/2203.09095). **CodeReviewer** is a model pre-trained with code change and code review data to support code review tasks.
The pre-trained checkpoint of CodeReviewer is available on [Huggingface](https://huggingface.co/microsoft/codereviewer).

Our dataset is available on [Zenodo](https://zenodo.org/record/6900648).

## 1. Dependency

- conda install nltk
- conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
- conda install transformers


## 2. Brief Introduction

CodeReviewer supports three related tasks: **Quality Estimation** (`cls` for short), **Comment Generation** (`msg` for short) and **Code Refinement** (`ref` for short).

Demo data:

``` python
{
    "old_file": "import torch",                                           # f1
    "diff_hunk": "@@ -1 +1,2 @@\n import torch\n +import torch.nn as nn", # f1->f2
    "comment": "I don't think we need to import torch.nn here.",          # requirements for f2->f3
    "target": "import torch"                                              # f3
}
```

* Quality Estimation: given the "old_file" and "diff_hunk", predict whether the code change is problematic and needs a review comment.

* Comment Generation: given the "old_file" and "diff_hunk", generate a review comment for the change. An expected comment is the "comment" field above.

* Code Refinement: given the "old_file", "diff_hunk", and "comment", revise the code according to the review comment. For the example above, since the comment indicates that *import torch.nn* is unnecessary, that line is simply deleted.

The model inputs are a code change (old file and diff hunk) and a review comment (optional, depending on the task). Input data is preprocessed in `utils.py: ReviewExample` and wrapped into {`utils.py: CommentClsDataset, SimpleGenDataset, RefineDataset`}.

## 3. Finetune/Inference

Before you start to run experiments with CodeReviewer, please download the [datasets](https://zenodo.org/record/6900648) first.
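As a quick smoke test, you can check that the checkpoint loads and generates (a minimal sketch; it assumes the hub checkpoint is compatible with the generic `transformers` T5 seq2seq classes, while the training scripts below wrap it in this repo's own `models.py`):

```python
# Minimal smoke test -- assumes the hub checkpoint loads via the generic
# T5 classes; the repo's models.py defines the class actually used for training.
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("microsoft/codereviewer")
model = T5ForConditionalGeneration.from_pretrained("microsoft/codereviewer")

diff = "@@ -1 +1,2 @@\n import torch\n +import torch.nn as nn"
inputs = tokenizer(diff, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

With the checkpoint and datasets in place, launch fine-tuning through the provided scripts: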
```bash
# prepare model checkpoint and datasets
cd code/sh
# adjust the arguments in the *sh* scripts
bash finetune-cls.sh
```

A demo bash script (finetune-cls.sh) is shown:
```bash
mnt_dir="/home/codereview"

# You may change the following block for multiple gpu training
MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST}
MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT}
RANK=0 && echo RANK: ${RANK}
PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU}
WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE}
NODES=1 && echo NODES: ${NODES}
NCCL_DEBUG=INFO

bash test_nltk.sh


# Change the arguments as required:
#   model_name_or_path, load_model_path: the path of the model to be finetuned
#   eval_file: the path of the evaluation data
#   output_dir: the directory to save the finetuned model (not used at infer/test time)
#   out_file: the path of the output file
#   train_file_name: can be a directory containing files named "train*.jsonl"

python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_cls.py \
    --train_epochs 30 \
    --model_name_or_path microsoft/codereviewer \
    --output_dir ../../save/cls \
    --train_filename ../../dataset/Diff_Quality_Estimation \
    --dev_filename ../../dataset/Diff_Quality_Estimation/cls-valid.jsonl \
    --max_source_length 512 \
    --max_target_length 128 \
    --train_batch_size 12 \
    --learning_rate 3e-4 \
    --gradient_accumulation_steps 3 \
    --mask_rate 0.15 \
    --save_steps 3600 \
    --log_steps 100 \
    --train_steps 120000 \
    --gpu_per_node=${PER_NODE_GPU} \
    --node_index=${RANK} \
    --seed 2233
```


## 4. File structure
```
.
├── bleu.py               # demo code for BLEU evaluation
├── configs.py
├── evaluator             # copied from CodeXGLUE for BLEU evaluation
├── models.py             # CodeReviewer model
├── run_finetune_xxx.py   # finetune script - xxx in {cls, msg, ref}
├── run_infer_msg.py      # inference script for the comment generation task
├── run_test_xxx.py       # test script - xxx in {cls, msg, ref}
├── sh/xx.sh              # bash scripts for running the finetune and test scripts with arguments
│   ├── finetune-xxx.sh
│   ├── infer-json.sh
│   ├── test-xxx.sh
│   ├── test_nltk.sh
└── utils.py              # utils for data preprocessing
```

# Reference
If you use this code or CodeReviewer, please consider citing us.
```
@article{li2022codereviewer,
  title={CodeReviewer: Pre-Training for Automating Code Review Activities},
  author={Li, Zhiyu and Lu, Shuai and Guo, Daya and Duan, Nan and Jannu, Shailesh and Jenks, Grant and Majumder, Deep and Green, Jared and Svyatkovskiy, Alexey and Fu, Shengyu and others},
  journal={arXiv preprint arXiv:2203.09095},
  year={2022}
}
```
121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /CodeReviewer/code/bleu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from evaluator.smooth_bleu import bleu_fromstr 4 | import nltk 5 | import re 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--path', type=str, required=True) 11 | args = parser.parse_args() 12 | ref = os.path.join(args.path, 'golds.txt') 13 | hyp = os.path.join(args.path, 'preds.txt') 14 | with open(ref, 'r') as f: 15 | refs = f.readlines() 16 | with open(hyp, 'r') as f: 17 | hyps = f.readlines() 18 | # refs = [ref.strip().lower() for ref in refs] 19 | # hyps = [hyp.strip().lower() for hyp in hyps] 20 | # bleu = bleu_fromstr(hyps, refs) 21 | # print(bleu) 22 | pred_nls, golds = hyps, refs 23 | for i in range(len(pred_nls)): 24 | chars = "(_)`." 25 | for c in chars: 26 | pred_nls[i] = pred_nls[i].replace(c, " " + c + " ") 27 | pred_nls[i] = " ".join(pred_nls[i].split()) 28 | golds[i] = golds[i].replace(c, " " + c + " ") 29 | golds[i] = " ".join(golds[i].split()) 30 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 31 | print(bleu) 32 | # stopwords = open("stopwords.txt").readlines() 33 | # stopwords = [stopword.strip() for stopword in stopwords] 34 | # refs = [" ".join([word for word in ref.lower().split() if word not in stopwords]) for ref in refs] 35 | # hyps = [" ".join([word for word in hyp.lower().split() if word not in stopwords]) for hyp in hyps] 36 | # bleu = bleu_fromstr(hyps, refs) 37 | # print(bleu) 38 | 39 | if __name__ == '__main__': 40 | main() 41 | # s = "Can we use `mset.mirrorInfo()` directly?" 42 | # chars = "(_)`." 43 | # for c in chars: 44 | # s = s.replace(c, " " + c + " ") 45 | # print(nltk.wordpunct_tokenize(s)) 46 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/calc_code_bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
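# CodeBLEU combines four component scores -- plain n-gram match (BLEU),
# keyword-weighted n-gram match, AST subtree match, and data-flow match --
# mixed with the alpha/beta/gamma/theta weights passed via --params.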
3 | # https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU 4 | 5 | # -*- coding:utf-8 -*- 6 | import argparse 7 | import os 8 | from evaluator.CodeBLEU import bleu, weighted_ngram_match, syntax_match, dataflow_match 9 | 10 | 11 | def get_codebleu(refs, hyp, lang, params='0.25,0.25,0.25,0.25'): 12 | if not isinstance(refs, list): 13 | refs = [refs] 14 | alpha, beta, gamma, theta = [float(x) for x in params.split(',')] 15 | 16 | # preprocess inputs 17 | pre_references = [[x.strip() for x in open(file, 'r', encoding='utf-8').readlines()] for file in refs] 18 | hypothesis = [x.strip() for x in open(hyp, 'r', encoding='utf-8').readlines()] 19 | 20 | for i in range(len(pre_references)): 21 | assert len(hypothesis) == len(pre_references[i]) 22 | 23 | references = [] 24 | for i in range(len(hypothesis)): 25 | ref_for_instance = [] 26 | for j in range(len(pre_references)): 27 | ref_for_instance.append(pre_references[j][i]) 28 | references.append(ref_for_instance) 29 | assert len(references) == len(pre_references) * len(hypothesis) 30 | 31 | # calculate ngram match (BLEU) 32 | tokenized_hyps = [x.split() for x in hypothesis] 33 | tokenized_refs = [[x.split() for x in reference] for reference in references] 34 | 35 | ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps) 36 | 37 | # calculate weighted ngram match 38 | root_dir = os.path.dirname(__file__) 39 | keywords = [x.strip() for x in open(root_dir + '/keywords/' + lang + '.txt', 'r', encoding='utf-8').readlines()] 40 | 41 | def make_weights(reference_tokens, key_word_list): 42 | return {token: 1 if token in key_word_list else 0.2 for token in reference_tokens} 43 | 44 | tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)] \ 45 | for reference_tokens in reference] for reference in tokenized_refs] 46 | 47 | weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps) 48 | 49 | # calculate syntax match 50 | syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang) 51 | 52 | # calculate dataflow match 53 | dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang) 54 | 55 | print('ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}'. 
\ 56 | format(ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score)) 57 | 58 | code_bleu_score = alpha * ngram_match_score \ 59 | + beta * weighted_ngram_match_score \ 60 | + gamma * syntax_match_score \ 61 | + theta * dataflow_match_score 62 | 63 | return code_bleu_score 64 | 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--refs', type=str, nargs='+', required=True, 69 | help='reference files') 70 | parser.add_argument('--hyp', type=str, required=True, 71 | help='hypothesis file') 72 | parser.add_argument('--lang', type=str, required=True, 73 | choices=['java', 'js', 'c_sharp', 'php', 'go', 'python', 'ruby'], 74 | help='programming language') 75 | parser.add_argument('--params', type=str, default='0.25,0.25,0.25,0.25', 76 | help='alpha, beta and gamma') 77 | 78 | args = parser.parse_args() 79 | code_bleu_score = get_codebleu(args.refs, args.hyp, args.lang, args.params) 80 | print('CodeBLEU score: ', code_bleu_score) 81 | 82 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/dataflow_match.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from evaluator.CodeBLEU.parser import DFG_python, DFG_java, DFG_ruby, DFG_go, DFG_php, DFG_javascript, DFG_csharp 5 | from evaluator.CodeBLEU.parser import (remove_comments_and_docstrings, 6 | tree_to_token_index, 7 | index_to_code_token, 8 | tree_to_variable_index) 9 | from tree_sitter import Language, Parser 10 | import os 11 | 12 | root_dir = os.path.dirname(__file__) 13 | 14 | dfg_function = { 15 | 'python': DFG_python, 16 | 'java': DFG_java, 17 | 'ruby': DFG_ruby, 18 | 'go': DFG_go, 19 | 'php': DFG_php, 20 | 'javascript': DFG_javascript, 21 | 'c_sharp': DFG_csharp, 22 | } 23 | 24 | 25 | def calc_dataflow_match(references, candidate, lang): 26 | return corpus_dataflow_match([references], [candidate], lang) 27 | 28 | 29 | def corpus_dataflow_match(references, candidates, lang): 30 | LANGUAGE = Language(root_dir + '/parser/my-languages.so', lang) 31 | parser = Parser() 32 | parser.set_language(LANGUAGE) 33 | parser = [parser, dfg_function[lang]] 34 | match_count = 0 35 | total_count = 0 36 | 37 | for i in range(len(candidates)): 38 | references_sample = references[i] 39 | candidate = candidates[i] 40 | for reference in references_sample: 41 | try: 42 | candidate = remove_comments_and_docstrings(candidate, 'java') 43 | except: 44 | pass 45 | try: 46 | reference = remove_comments_and_docstrings(reference, 'java') 47 | except: 48 | pass 49 | 50 | cand_dfg = get_data_flow(candidate, parser) 51 | ref_dfg = get_data_flow(reference, parser) 52 | 53 | normalized_cand_dfg = normalize_dataflow(cand_dfg) 54 | normalized_ref_dfg = normalize_dataflow(ref_dfg) 55 | 56 | if len(normalized_ref_dfg) > 0: 57 | total_count += len(normalized_ref_dfg) 58 | for dataflow in normalized_ref_dfg: 59 | if dataflow in normalized_cand_dfg: 60 | match_count += 1 61 | normalized_cand_dfg.remove(dataflow) 62 | if total_count == 0: 63 | print( 64 | "WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. 
Please consider ignoring this score.") 65 | return 0 66 | score = match_count / total_count 67 | return score 68 | 69 | 70 | def get_data_flow(code, parser): 71 | try: 72 | tree = parser[0].parse(bytes(code, 'utf8')) 73 | root_node = tree.root_node 74 | tokens_index = tree_to_token_index(root_node) 75 | code = code.split('\n') 76 | code_tokens = [index_to_code_token(x, code) for x in tokens_index] 77 | index_to_code = {} 78 | for idx, (index, code) in enumerate(zip(tokens_index, code_tokens)): 79 | index_to_code[index] = (idx, code) 80 | try: 81 | DFG, _ = parser[1](root_node, index_to_code, {}) 82 | except: 83 | DFG = [] 84 | DFG = sorted(DFG, key=lambda x: x[1]) 85 | indexs = set() 86 | for d in DFG: 87 | if len(d[-1]) != 0: 88 | indexs.add(d[1]) 89 | for x in d[-1]: 90 | indexs.add(x) 91 | new_DFG = [] 92 | for d in DFG: 93 | if d[1] in indexs: 94 | new_DFG.append(d) 95 | codes = code_tokens 96 | dfg = new_DFG 97 | except: 98 | codes = code.split() 99 | dfg = [] 100 | # merge nodes 101 | dic = {} 102 | for d in dfg: 103 | if d[1] not in dic: 104 | dic[d[1]] = d 105 | else: 106 | dic[d[1]] = (d[0], d[1], d[2], list(set(dic[d[1]][3] + d[3])), list(set(dic[d[1]][4] + d[4]))) 107 | DFG = [] 108 | for d in dic: 109 | DFG.append(dic[d]) 110 | dfg = DFG 111 | return dfg 112 | 113 | 114 | def normalize_dataflow_item(dataflow_item): 115 | var_name = dataflow_item[0] 116 | var_pos = dataflow_item[1] 117 | relationship = dataflow_item[2] 118 | par_vars_name_list = dataflow_item[3] 119 | par_vars_pos_list = dataflow_item[4] 120 | 121 | var_names = list(set(par_vars_name_list + [var_name])) 122 | norm_names = {} 123 | for i in range(len(var_names)): 124 | norm_names[var_names[i]] = 'var_' + str(i) 125 | 126 | norm_var_name = norm_names[var_name] 127 | relationship = dataflow_item[2] 128 | norm_par_vars_name_list = [norm_names[x] for x in par_vars_name_list] 129 | 130 | return (norm_var_name, relationship, norm_par_vars_name_list) 131 | 132 | 133 | def normalize_dataflow(dataflow): 134 | var_dict = {} 135 | i = 0 136 | normalized_dataflow = [] 137 | for item in dataflow: 138 | var_name = item[0] 139 | relationship = item[2] 140 | par_vars_name_list = item[3] 141 | for name in par_vars_name_list: 142 | if name not in var_dict: 143 | var_dict[name] = 'var_' + str(i) 144 | i += 1 145 | if var_name not in var_dict: 146 | var_dict[var_name] = 'var_' + str(i) 147 | i += 1 148 | normalized_dataflow.append((var_dict[var_name], relationship, [var_dict[x] for x in par_vars_name_list])) 149 | return normalized_dataflow 150 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/keywords/c_sharp.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | as 3 | base 4 | bool 5 | break 6 | byte 7 | case 8 | catch 9 | char 10 | checked 11 | class 12 | const 13 | continue 14 | decimal 15 | default 16 | delegate 17 | do 18 | double 19 | else 20 | enum 21 | event 22 | explicit 23 | extern 24 | false 25 | finally 26 | fixed 27 | float 28 | for 29 | foreach 30 | goto 31 | if 32 | implicit 33 | in 34 | int 35 | interface 36 | internal 37 | is 38 | lock 39 | long 40 | namespace 41 | new 42 | null 43 | object 44 | operator 45 | out 46 | override 47 | params 48 | private 49 | protected 50 | public 51 | readonly 52 | ref 53 | return 54 | sbyte 55 | sealed 56 | short 57 | sizeof 58 | stackalloc 59 | static 60 | string 61 | struct 62 | switch 63 | this 64 | throw 65 | true 66 | try 67 | typeof 68 | uint 69 | ulong 70 | 
unchecked 71 | unsafe 72 | ushort 73 | using 74 | virtual 75 | void 76 | volatile 77 | while 78 | add 79 | alias 80 | ascending 81 | async 82 | await 83 | by 84 | descending 85 | dynamic 86 | equals 87 | from 88 | get 89 | global 90 | group 91 | into 92 | join 93 | let 94 | nameof 95 | notnull 96 | on 97 | orderby 98 | partial 99 | remove 100 | select 101 | set 102 | unmanaged 103 | value 104 | var 105 | when 106 | where 107 | yield 108 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/keywords/java.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | assert 3 | boolean 4 | break 5 | byte 6 | case 7 | catch 8 | char 9 | class 10 | const 11 | continue 12 | default 13 | do 14 | double 15 | else 16 | enum 17 | extends 18 | final 19 | finally 20 | float 21 | for 22 | goto 23 | if 24 | implements 25 | import 26 | instanceof 27 | int 28 | interface 29 | long 30 | native 31 | new 32 | package 33 | private 34 | protected 35 | public 36 | return 37 | short 38 | static 39 | strictfp 40 | super 41 | switch 42 | synchronized 43 | this 44 | throw 45 | throws 46 | transient 47 | try 48 | void 49 | volatile 50 | while 51 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from .utils import (remove_comments_and_docstrings, 5 | tree_to_token_index, 6 | index_to_code_token, 7 | tree_to_variable_index) 8 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/parser/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
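# Shared helpers for the CodeBLEU matchers: stripping comments and docstrings
# from source code, and converting between tree-sitter node positions and the
# corresponding source tokens.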
import re
from io import StringIO
import tokenize


def remove_comments_and_docstrings(source, lang):
    if lang in ['python']:
        """
        Returns 'source' minus comments and docstrings.
        """
        io_obj = StringIO(source)
        out = ""
        prev_toktype = tokenize.INDENT
        last_lineno = -1
        last_col = 0
        for tok in tokenize.generate_tokens(io_obj.readline):
            token_type = tok[0]
            token_string = tok[1]
            start_line, start_col = tok[2]
            end_line, end_col = tok[3]
            ltext = tok[4]
            if start_line > last_lineno:
                last_col = 0
            if start_col > last_col:
                out += (" " * (start_col - last_col))
            # Remove comments:
            if token_type == tokenize.COMMENT:
                pass
            # This series of conditionals removes docstrings:
            elif token_type == tokenize.STRING:
                if prev_toktype != tokenize.INDENT:
                    # This is likely a docstring; double-check we're not inside an operator:
                    if prev_toktype != tokenize.NEWLINE:
                        if start_col > 0:
                            out += token_string
            else:
                out += token_string
            prev_toktype = token_type
            last_col = end_col
            last_lineno = end_line
        temp = []
        for x in out.split('\n'):
            if x.strip() != "":
                temp.append(x)
        return '\n'.join(temp)
    elif lang in ['ruby']:
        return source
    else:
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " "  # note: a space and not an empty string
            else:
                return s

        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        temp = []
        for x in re.sub(pattern, replacer, source).split('\n'):
            if x.strip() != "":
                temp.append(x)
        return '\n'.join(temp)


def tree_to_token_index(root_node):
    if (len(root_node.children) == 0 or root_node.type in ['string_literal', 'string',
                                                           'character_literal']) and root_node.type != 'comment':
        return [(root_node.start_point, root_node.end_point)]
    else:
        code_tokens = []
        for child in root_node.children:
            code_tokens += tree_to_token_index(child)
        return code_tokens


def tree_to_variable_index(root_node, index_to_code):
    if (len(root_node.children) == 0 or root_node.type in ['string_literal', 'string',
                                                           'character_literal']) and root_node.type != 'comment':
        index = (root_node.start_point, root_node.end_point)
        _, code = index_to_code[index]
        if root_node.type != code:
            return [(root_node.start_point, root_node.end_point)]
        else:
            return []
    else:
        code_tokens = []
        for child in root_node.children:
            code_tokens += tree_to_variable_index(child, index_to_code)
        return code_tokens


def index_to_code_token(index, code):
    start_point = index[0]
    end_point = index[1]
    if start_point[0] == end_point[0]:
        s = code[start_point[0]][start_point[1]:end_point[1]]
    else:
        s = ""
        s += code[start_point[0]][start_point[1]:]
        for i in range(start_point[0] + 1, end_point[0]):
            s += code[i]
        s += code[end_point[0]][:end_point[1]]
    return s
-------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/readme.txt: --------------------------------------------------------------------------------
python calc_code_bleu.py --refs reference_files --hyp candidate_file --lang java ( or c_sharp ) --params 0.25,0.25,0.25,0.25 (default)
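A worked example of the weighted combination computed in calc_code_bleu.py (illustrative component scores, not real outputs):
with the default weights 0.25,0.25,0.25,0.25 and component scores 0.60 (ngram match), 0.55 (weighted ngram match), 0.80 (syntax match), 0.70 (dataflow match),
CodeBLEU = 0.25*0.60 + 0.25*0.55 + 0.25*0.80 + 0.25*0.70 = 0.6625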
-------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/syntax_match.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from evaluator.CodeBLEU.parser import DFG_python, DFG_java, DFG_ruby, DFG_go, DFG_php, DFG_javascript, DFG_csharp 5 | from evaluator.CodeBLEU.parser import (remove_comments_and_docstrings, 6 | tree_to_token_index, 7 | index_to_code_token, 8 | tree_to_variable_index) 9 | from tree_sitter import Language, Parser 10 | import os 11 | 12 | root_dir = os.path.dirname(__file__) 13 | dfg_function = { 14 | 'python': DFG_python, 15 | 'java': DFG_java, 16 | 'ruby': DFG_ruby, 17 | 'go': DFG_go, 18 | 'php': DFG_php, 19 | 'javascript': DFG_javascript, 20 | 'c_sharp': DFG_csharp, 21 | } 22 | 23 | 24 | def calc_syntax_match(references, candidate, lang): 25 | return corpus_syntax_match([references], [candidate], lang) 26 | 27 | 28 | def corpus_syntax_match(references, candidates, lang): 29 | JAVA_LANGUAGE = Language(root_dir + '/parser/my-languages.so', lang) 30 | parser = Parser() 31 | parser.set_language(JAVA_LANGUAGE) 32 | match_count = 0 33 | total_count = 0 34 | 35 | for i in range(len(candidates)): 36 | references_sample = references[i] 37 | candidate = candidates[i] 38 | for reference in references_sample: 39 | try: 40 | candidate = remove_comments_and_docstrings(candidate, 'java') 41 | except: 42 | pass 43 | try: 44 | reference = remove_comments_and_docstrings(reference, 'java') 45 | except: 46 | pass 47 | 48 | candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node 49 | 50 | reference_tree = parser.parse(bytes(reference, 'utf8')).root_node 51 | 52 | def get_all_sub_trees(root_node): 53 | node_stack = [] 54 | sub_tree_sexp_list = [] 55 | depth = 1 56 | node_stack.append([root_node, depth]) 57 | while len(node_stack) != 0: 58 | cur_node, cur_depth = node_stack.pop() 59 | sub_tree_sexp_list.append([cur_node.sexp(), cur_depth]) 60 | for child_node in cur_node.children: 61 | if len(child_node.children) != 0: 62 | depth = cur_depth + 1 63 | node_stack.append([child_node, depth]) 64 | return sub_tree_sexp_list 65 | 66 | cand_sexps = [x[0] for x in get_all_sub_trees(candidate_tree)] 67 | ref_sexps = get_all_sub_trees(reference_tree) 68 | 69 | # print(cand_sexps) 70 | # print(ref_sexps) 71 | 72 | for sub_tree, depth in ref_sexps: 73 | if sub_tree in cand_sexps: 74 | match_count += 1 75 | total_count += len(ref_sexps) 76 | 77 | score = match_count / total_count 78 | return score 79 | -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/CodeBLEU/utils.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Utility functions 2 | # 3 | # Copyright (C) 2001-2020 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from itertools import chain 9 | 10 | def pad_sequence( 11 | sequence, 12 | n, 13 | pad_left=False, 14 | pad_right=False, 15 | left_pad_symbol=None, 16 | right_pad_symbol=None, 17 | ): 18 | """ 19 | Returns a padded sequence of items before ngram extraction. 
20 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) 21 | ['', 1, 2, 3, 4, 5, ''] 22 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) 23 | ['', 1, 2, 3, 4, 5] 24 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) 25 | [1, 2, 3, 4, 5, ''] 26 | :param sequence: the source data to be padded 27 | :type sequence: sequence or iter 28 | :param n: the degree of the ngrams 29 | :type n: int 30 | :param pad_left: whether the ngrams should be left-padded 31 | :type pad_left: bool 32 | :param pad_right: whether the ngrams should be right-padded 33 | :type pad_right: bool 34 | :param left_pad_symbol: the symbol to use for left padding (default is None) 35 | :type left_pad_symbol: any 36 | :param right_pad_symbol: the symbol to use for right padding (default is None) 37 | :type right_pad_symbol: any 38 | :rtype: sequence or iter 39 | """ 40 | sequence = iter(sequence) 41 | if pad_left: 42 | sequence = chain((left_pad_symbol,) * (n - 1), sequence) 43 | if pad_right: 44 | sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) 45 | return sequence 46 | 47 | 48 | # add a flag to pad the sequence so we get peripheral ngrams? 49 | 50 | 51 | def ngrams( 52 | sequence, 53 | n, 54 | pad_left=False, 55 | pad_right=False, 56 | left_pad_symbol=None, 57 | right_pad_symbol=None, 58 | ): 59 | """ 60 | Return the ngrams generated from a sequence of items, as an iterator. 61 | For example: 62 | >>> from nltk.util import ngrams 63 | >>> list(ngrams([1,2,3,4,5], 3)) 64 | [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 65 | Wrap with list for a list version of this function. Set pad_left 66 | or pad_right to true in order to get additional ngrams: 67 | >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) 68 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] 69 | >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) 70 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, '')] 71 | >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) 72 | [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] 73 | >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) 74 | [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] 75 | :param sequence: the source data to be converted into ngrams 76 | :type sequence: sequence or iter 77 | :param n: the degree of the ngrams 78 | :type n: int 79 | :param pad_left: whether the ngrams should be left-padded 80 | :type pad_left: bool 81 | :param pad_right: whether the ngrams should be right-padded 82 | :type pad_right: bool 83 | :param left_pad_symbol: the symbol to use for left padding (default is None) 84 | :type left_pad_symbol: any 85 | :param right_pad_symbol: the symbol to use for right padding (default is None) 86 | :type right_pad_symbol: any 87 | :rtype: sequence or iter 88 | """ 89 | sequence = pad_sequence( 90 | sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol 91 | ) 92 | 93 | history = [] 94 | while n > 1: 95 | # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator 96 | try: 97 | next_item = next(sequence) 98 | except StopIteration: 99 | # no more data, terminate the generator 100 | return 101 | history.append(next_item) 102 | n -= 1 103 | for item in sequence: 104 | history.append(item) 105 | yield tuple(history) 106 | del history[0] -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/bleu.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams upto a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | methods. 35 | 36 | Returns: 37 | The Counter containing all n-grams upto max_order in segment 38 | with a count of how many times each n-gram occurred. 39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 
63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. / ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /CodeReviewer/code/evaluator/stopwords.txt: -------------------------------------------------------------------------------- 1 | about 2 | above 3 | abroad 4 | according 5 | accordingly 6 | across 7 | actually 8 | adj 9 | after 10 | afterwards 11 | again 12 | against 13 | ago 14 | ahead 15 | ain't 16 | all 17 | almost 18 | alone 19 | along 20 | alongside 21 | already 22 | also 23 | although 24 | always 25 | am 26 | amid 27 | amidst 28 | among 29 | amongst 30 | an 31 | and 32 | any 33 | anybody 34 | anyhow 35 | anyone 36 | anything 37 | anyway 38 | anyways 39 | anywhere 40 | apart 41 | appear 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | a's 48 | aside 49 | associated 50 | at 51 | available 52 | away 53 | awfully 54 | back 55 | backward 56 | backwards 57 | be 58 | became 59 | because 60 | become 61 | becomes 62 | becoming 63 | been 64 | before 65 | beforehand 66 | begin 67 | behind 68 | being 69 | believe 70 | below 71 | 
best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | came 80 | can 81 | cannot 82 | cant 83 | can't 84 | caption 85 | cause 86 | causes 87 | certain 88 | certainly 89 | clearly 90 | c'mon 91 | co 92 | co. 93 | com 94 | come 95 | comes 96 | concerning 97 | consequently 98 | contain 99 | containing 100 | contains 101 | corresponding 102 | could 103 | couldn't 104 | course 105 | c's 106 | dare 107 | daren't 108 | definitely 109 | described 110 | despite 111 | did 112 | didn't 113 | different 114 | directly 115 | do 116 | does 117 | doesn't 118 | doing 119 | don't 120 | down 121 | downwards 122 | during 123 | each 124 | edu 125 | eg 126 | eight 127 | eighty 128 | either 129 | elsewhere 130 | end 131 | ending 132 | entirely 133 | et 134 | etc 135 | even 136 | ever 137 | evermore 138 | every 139 | everybody 140 | everyone 141 | everything 142 | everywhere 143 | ex 144 | exactly 145 | except 146 | fairly 147 | far 148 | farther 149 | few 150 | fewer 151 | fifth 152 | first 153 | five 154 | followed 155 | following 156 | follows 157 | for 158 | forever 159 | former 160 | formerly 161 | forth 162 | forward 163 | found 164 | four 165 | from 166 | further 167 | furthermore 168 | get 169 | gets 170 | getting 171 | given 172 | gives 173 | go 174 | goes 175 | going 176 | gone 177 | got 178 | gotten 179 | greetings 180 | had 181 | hadn't 182 | half 183 | happens 184 | hardly 185 | has 186 | hasn't 187 | have 188 | haven't 189 | having 190 | he 191 | he'd 192 | he'll 193 | hello 194 | help 195 | hence 196 | her 197 | here 198 | hereafter 199 | hereby 200 | herein 201 | here's 202 | hereupon 203 | hers 204 | herself 205 | he's 206 | hi 207 | him 208 | himself 209 | his 210 | hither 211 | hopefully 212 | how 213 | howbeit 214 | however 215 | hundred 216 | i'd 217 | ie 218 | if 219 | ignored 220 | i'll 221 | i'm 222 | immediate 223 | in 224 | inasmuch 225 | inc 226 | inc. 
227 | indeed 228 | indicate 229 | indicated 230 | indicates 231 | inner 232 | inside 233 | insofar 234 | into 235 | inward 236 | is 237 | isn't 238 | it 239 | it'd 240 | it'll 241 | its 242 | it's 243 | itself 244 | i've 245 | just 246 | k 247 | keep 248 | keeps 249 | kept 250 | know 251 | known 252 | knows 253 | last 254 | lately 255 | later 256 | latter 257 | latterly 258 | least 259 | less 260 | lest 261 | let 262 | let's 263 | like 264 | liked 265 | likely 266 | likewise 267 | little 268 | look 269 | looking 270 | looks 271 | low 272 | lower 273 | ltd 274 | made 275 | mainly 276 | make 277 | makes 278 | many 279 | may 280 | maybe 281 | mayn't 282 | me 283 | mean 284 | meantime 285 | meanwhile 286 | merely 287 | might 288 | mightn't 289 | mine 290 | minus 291 | moreover 292 | most 293 | mostly 294 | mr 295 | mrs 296 | much 297 | must 298 | mustn't 299 | my 300 | myself 301 | name 302 | namely 303 | nd 304 | near 305 | nearly 306 | needn't 307 | neither 308 | never 309 | neverf 310 | neverless 311 | nevertheless 312 | next 313 | nine 314 | ninety 315 | no 316 | nobody 317 | non 318 | none 319 | nonetheless 320 | noone 321 | no-one 322 | nor 323 | normally 324 | not 325 | nothing 326 | notwithstanding 327 | novel 328 | now 329 | nowhere 330 | obviously 331 | of 332 | off 333 | often 334 | oh 335 | ok 336 | okay 337 | old 338 | on 339 | once 340 | one 341 | ones 342 | one's 343 | only 344 | onto 345 | opposite 346 | or 347 | otherwise 348 | ought 349 | oughtn't 350 | our 351 | ours 352 | ourselves 353 | out 354 | outside 355 | over 356 | overall 357 | own 358 | particular 359 | particularly 360 | past 361 | per 362 | perhaps 363 | placed 364 | plus 365 | possible 366 | presumably 367 | probably 368 | que 369 | quite 370 | qv 371 | rather 372 | rd 373 | re 374 | really 375 | reasonably 376 | recent 377 | recently 378 | regarding 379 | regardless 380 | regards 381 | relatively 382 | respectively 383 | right 384 | round 385 | said 386 | same 387 | saw 388 | say 389 | saying 390 | says 391 | second 392 | secondly 393 | see 394 | seeing 395 | seem 396 | seemed 397 | seeming 398 | seems 399 | seen 400 | self 401 | selves 402 | sensible 403 | sent 404 | seven 405 | several 406 | shall 407 | shan't 408 | she 409 | she'd 410 | she'll 411 | she's 412 | should 413 | shouldn't 414 | since 415 | six 416 | so 417 | somebody 418 | someday 419 | somehow 420 | someone 421 | something 422 | sometime 423 | sometimes 424 | somewhat 425 | somewhere 426 | sorry 427 | specified 428 | specify 429 | specifying 430 | still 431 | sub 432 | such 433 | sup 434 | sure 435 | take 436 | taken 437 | taking 438 | tell 439 | tends 440 | th 441 | than 442 | that 443 | that'll 444 | thats 445 | that's 446 | that've 447 | the 448 | their 449 | theirs 450 | them 451 | themselves 452 | then 453 | thence 454 | there 455 | thereafter 456 | thereby 457 | there'd 458 | therefore 459 | therein 460 | there'll 461 | there're 462 | theres 463 | there's 464 | thereupon 465 | there've 466 | these 467 | they 468 | they'd 469 | they'll 470 | they're 471 | they've 472 | thing 473 | things 474 | think 475 | third 476 | thirty 477 | this 478 | thorough 479 | thoroughly 480 | those 481 | though 482 | three 483 | through 484 | throughout 485 | thru 486 | thus 487 | till 488 | to 489 | together 490 | too 491 | took 492 | toward 493 | towards 494 | tried 495 | tries 496 | truly 497 | try 498 | trying 499 | t's 500 | twice 501 | two 502 | un 503 | under 504 | underneath 505 | undoing 506 | unfortunately 507 | unless 508 | unlike 509 | unlikely 510 
| until 511 | unto 512 | up 513 | upon 514 | upwards 515 | us 516 | use 517 | used 518 | uses 519 | using 520 | usually 521 | v 522 | value 523 | various 524 | versus 525 | very 526 | via 527 | viz 528 | vs 529 | was 530 | wasn't 531 | way 532 | we 533 | we'd 534 | well 535 | we'll 536 | went 537 | were 538 | we're 539 | weren't 540 | we've 541 | what 542 | whatever 543 | what'll 544 | what's 545 | what've 546 | when 547 | whence 548 | whenever 549 | where 550 | whereafter 551 | whereas 552 | whereby 553 | wherein 554 | where's 555 | whereupon 556 | wherever 557 | whether 558 | which 559 | whichever 560 | while 561 | whilst 562 | whither 563 | who 564 | who'd 565 | whoever 566 | whole 567 | who'll 568 | whom 569 | whomever 570 | who's 571 | whose 572 | why 573 | will 574 | with 575 | within 576 | without 577 | wonder 578 | won't 579 | would 580 | wouldn't 581 | yes 582 | yet 583 | you 584 | you'd 585 | you'll 586 | your 587 | you're 588 | yours 589 | yourself 590 | yourselves 591 | you've 592 | zer 593 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_infer_msg.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import CommentGenDataset, SimpleGenDataset 19 | from evaluator.smooth_bleu import bleu_fromstr 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleGenDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = CommentGenDataset(tokenizer, pool, args, data_file) 38 | sampler = SequentialSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_bleu(args, eval_dataloader, model, tokenizer): 45 | logger.info(f" ***** Running bleu evaluation on {args.eval_file} *****") 46 | logger.info(" Batch size = %d", args.eval_batch_size) 47 | model.eval() 48 | if hasattr(model, "module"): 49 | model = model.module 50 | pred_ids, ex_ids = [], [] 51 | for step, examples in tqdm(enumerate(eval_dataloader, 1)): 52 | source_ids = torch.tensor( 53 | [ex.source_ids for ex in examples], dtype=torch.long 54 | ).to(args.local_rank) 55 | ids = [ex.example_id for ex in examples] 56 | source_mask = source_ids.ne(tokenizer.pad_id) 57 | preds = model.generate(source_ids, 58 | attention_mask=source_mask, 59 | use_cache=True, 60 | num_beams=args.beam_size, 61 | early_stopping=True, 62 | 
max_length=args.max_target_length) 63 | top_preds = list(preds.cpu().numpy()) 64 | pred_ids.extend(top_preds) 65 | if args.break_cnt > 0 and len(pred_ids) >= args.break_cnt: 66 | break 67 | # [2:] to remove the beginning '<s>' '<msg>' special tokens 68 | pred_nls = [tokenizer.decode(id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in pred_ids] 69 | valid_file = args.eval_file 70 | out_file = args.out_file 71 | outdics = [] 72 | golds = [] 73 | with open(valid_file, "r") as f: 74 | for line in f: 75 | outdics.append(json.loads(line)) 76 | golds.append(outdics[-1]["msg"]) 77 | outdics = outdics[:len(pred_nls)] 78 | golds = golds[:len(pred_nls)] 79 | with open(os.path.join(args.model_name_or_path, "preds.txt"), "w", encoding="utf-8") as f: 80 | for pred in pred_nls: 81 | f.write(pred.strip() + "\n") 82 | with open(os.path.join(args.model_name_or_path, "golds.txt"), "w", encoding="utf-8") as f: 83 | for gold in golds: 84 | f.write(gold.strip() + "\n") 85 | with open(out_file, "w", encoding="utf-8") as f: 86 | for i, outdic in enumerate(outdics): 87 | outdic["gen"] = pred_nls[i] 88 | f.write(json.dumps(outdic) + "\n") 89 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 90 | return bleu 91 | 92 | 93 | def main(args): 94 | dist.init_process_group(backend="nccl") 95 | local_rank = dist.get_rank() % args.gpu_per_node 96 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 97 | args.local_rank = local_rank 98 | args.world_size = dist.get_world_size() 99 | logger.warning("Process rank: %s, global rank: %s, world size: %s, bs: %s", 100 | args.local_rank, args.global_rank, \ 101 | torch.distributed.get_world_size(), \ 102 | args.eval_batch_size) 103 | torch.cuda.set_device(local_rank) 104 | 105 | set_seed(args) 106 | config, model, tokenizer = build_or_load_gen_model(args) 107 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 108 | pool = multiprocessing.Pool(args.cpu_count) 109 | data_file = args.eval_file 110 | set_seed(args) 111 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 112 | model.eval() 113 | bleu = eval_epoch_bleu(args, dataloader, model, tokenizer) 114 | logger.warning(f"BLEU: {bleu}") 115 | 116 | if __name__ == "__main__": 117 | parser = argparse.ArgumentParser() 118 | args = add_args(parser) 119 | args.cpu_count = multiprocessing.cpu_count() 120 | # remove long tokenization warning.
ref: https://github.com/huggingface/transformers/issues/991 121 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 122 | logger.info(args) 123 | main(args) 124 | logger.info("Test finished.") 125 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 126 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_test_cls.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import CommentClsDataset, SimpleClsDataset 19 | from sklearn.metrics import classification_report 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleClsDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = CommentClsDataset(tokenizer, pool, args, data_file) 38 | sampler = RandomSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_acc(args, eval_dataloader, model, tokenizer): 45 | # Start evaluating model 46 | logger.info(" " + "***** Running acc evaluation *****") 47 | logger.info(" Batch size = %d", args.eval_batch_size) 48 | 49 | model.eval() 50 | local_rank = 0 51 | pred, gold = [], [] 52 | with torch.no_grad(): 53 | for step, examples in enumerate(tqdm(eval_dataloader), 1): 54 | if step == 1: 55 | ex = examples[0] 56 | logger.info(f"batch size: {len(examples)}") 57 | logger.info(f"example source: {tokenizer.convert_ids_to_tokens(ex.source_ids)}") 58 | logger.info(f"example target: {ex.y}") 59 | source_ids = torch.tensor( 60 | [ex.source_ids for ex in examples], dtype=torch.long 61 | ).to(local_rank) 62 | source_mask = source_ids.ne(tokenizer.pad_id) 63 | logits = model( 64 | cls=True, 65 | input_ids=source_ids, 66 | labels=None, 67 | attention_mask=source_mask 68 | ) 69 | prediction = torch.argmax(logits, dim=-1).cpu().numpy() 70 | pred.extend(prediction) 71 | gold.extend([ex.y for ex in examples]) 72 | logger.info("\n" + classification_report(gold, pred, digits=4)) 73 | logger.info(f"Target positive percentage: {sum(gold) / len(gold)}") 74 | return 75 | 76 | 77 | def main(args): 78 | dist.init_process_group(backend="nccl") 79 | local_rank = dist.get_rank() % args.gpu_per_node 80 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 81 | args.local_rank = local_rank 82 | args.world_size = dist.get_world_size() 83 | logger.warning("Process 
rank: %s, global rank: %s, world size: %s, bs: %s", 84 | args.local_rank, args.global_rank, \ 85 | torch.distributed.get_world_size(), \ 86 | args.eval_batch_size) 87 | torch.cuda.set_device(local_rank) 88 | 89 | set_seed(args) 90 | config, model, tokenizer = build_or_load_gen_model(args) 91 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 92 | pool = multiprocessing.Pool(args.cpu_count) 93 | data_file = args.eval_file 94 | set_seed(args) 95 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 96 | model.eval() 97 | eval_epoch_acc(args, dataloader, model, tokenizer) 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser() 101 | args = add_args(parser) 102 | args.cpu_count = multiprocessing.cpu_count() 103 | # remove long tokenization warning. ref: https://github.com/huggingface/transformers/issues/991 104 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 105 | logger.info(args) 106 | main(args) 107 | logger.info("Test finished.") 108 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 109 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_test_msg.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import CommentGenDataset, SimpleGenDataset 19 | from evaluator.smooth_bleu import bleu_fromstr 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleGenDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = CommentGenDataset(tokenizer, pool, args, data_file) 38 | sampler = SequentialSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_bleu(args, eval_dataloader, model, tokenizer): 45 | logger.info(f" ***** Running bleu evaluation on {args.eval_file} *****") 46 | logger.info(" Batch size = %d", args.eval_batch_size) 47 | model.eval() 48 | if hasattr(model, "module"): 49 | model = model.module 50 | pred_ids, ex_ids = [], [] 51 | for step, examples in tqdm(enumerate(eval_dataloader, 1)): 52 | source_ids = torch.tensor( 53 | [ex.source_ids for ex in examples], dtype=torch.long 54 | ).to(args.local_rank) 55 | ids = [ex.example_id for ex in examples] 56 | source_mask =
source_ids.ne(tokenizer.pad_id) 57 | preds = model.generate(source_ids, 58 | attention_mask=source_mask, 59 | use_cache=True, 60 | num_beams=args.beam_size, 61 | early_stopping=True, 62 | max_length=args.max_target_length) 63 | top_preds = list(preds.cpu().numpy()) 64 | pred_ids.extend(top_preds) 65 | pred_nls = [tokenizer.decode(id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in pred_ids] 66 | valid_file = args.eval_file 67 | golds = [] 68 | with open(valid_file, "r") as f: 69 | for line in f: 70 | golds.append(json.loads(line)["msg"]) 71 | golds = golds[:len(pred_nls)] 72 | with open(os.path.join(args.model_name_or_path, "preds.txt"), "w", encoding="utf-8") as f: 73 | for pred in pred_nls: 74 | f.write(pred.strip() + "\n") 75 | with open(os.path.join(args.model_name_or_path, "golds.txt"), "w", encoding="utf-8") as f: 76 | for gold in golds: 77 | f.write(gold.strip() + "\n") 78 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 79 | logger.warning(f"WithStop BLEU: {bleu}") 80 | bleu = bleu_fromstr(pred_nls, golds, rmstop=True) 81 | return bleu 82 | 83 | 84 | def main(args): 85 | dist.init_process_group(backend="nccl") 86 | local_rank = dist.get_rank() % args.gpu_per_node 87 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 88 | args.local_rank = local_rank 89 | args.world_size = dist.get_world_size() 90 | logger.warning("Process rank: %s, global rank: %s, world size: %s, bs: %s", 91 | args.local_rank, args.global_rank, \ 92 | torch.distributed.get_world_size(), \ 93 | args.eval_batch_size) 94 | torch.cuda.set_device(local_rank) 95 | 96 | set_seed(args) 97 | config, model, tokenizer = build_or_load_gen_model(args) 98 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 99 | pool = multiprocessing.Pool(args.cpu_count) 100 | data_file = args.eval_file 101 | set_seed(args) 102 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 103 | model.eval() 104 | bleu = eval_epoch_bleu(args, dataloader, model, tokenizer) 105 | logger.warning(f"BLEU: {bleu}") 106 | 107 | if __name__ == "__main__": 108 | parser = argparse.ArgumentParser() 109 | args = add_args(parser) 110 | args.cpu_count = multiprocessing.cpu_count() 111 | # remove long tokenization warning.
ref: https://github.com/huggingface/transformers/issues/991 112 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 113 | logger.info(args) 114 | main(args) 115 | logger.info("Test finished.") 116 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 117 | -------------------------------------------------------------------------------- /CodeReviewer/code/run_test_ref.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import torch 3 | import logging 4 | import argparse 5 | import random 6 | import numpy as np 7 | from tqdm import tqdm 8 | import multiprocessing 9 | import time 10 | from itertools import cycle 11 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from transformers import AdamW, get_linear_schedule_with_warmup 14 | from models import build_or_load_gen_model 15 | from configs import add_args, set_seed, set_dist 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | import torch.distributed as dist 18 | from utils import RefineDataset, SimpleRefineDataset 19 | from evaluator.smooth_bleu import bleu_fromstr 20 | 21 | 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def get_loader(data_file, args, tokenizer, pool): 31 | def fn(features): 32 | return features 33 | logger.info(f"Start data file {data_file}.") 34 | if args.raw_input: 35 | dataset = SimpleRefineDataset(tokenizer, pool, args, data_file) 36 | else: 37 | dataset = RefineDataset(tokenizer, pool, args, data_file) 38 | sampler = SequentialSampler(dataset) 39 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=args.cpu_count, collate_fn=fn) 40 | logger.info(f"Finish data files {data_file}.") 41 | return dataset, sampler, dataloader 42 | 43 | 44 | def eval_epoch_bleu(args, eval_dataloader, model, tokenizer): 45 | logger.info(f" ***** Running bleu evaluation on {args.eval_file} *****") 46 | logger.info(" Batch size = %d", args.eval_batch_size) 47 | model.eval() 48 | if hasattr(model, "module"): 49 | model = model.module 50 | pred_ids, ex_ids = [], [] 51 | for step, examples in tqdm(enumerate(eval_dataloader, 1)): 52 | source_ids = torch.tensor( 53 | [ex.source_ids for ex in examples], dtype=torch.long 54 | ).to(args.local_rank) 55 | source_mask = source_ids.ne(tokenizer.pad_id) 56 | preds = model.generate(source_ids, 57 | attention_mask=source_mask, 58 | use_cache=True, 59 | num_beams=args.beam_size, 60 | early_stopping=True, 61 | max_length=args.max_target_length) 62 | top_preds = list(preds.cpu().numpy()) 63 | pred_ids.extend(top_preds) 64 | pred_nls = [tokenizer.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in pred_ids] 65 | valid_file = args.eval_file 66 | golds = [] 67 | with open(valid_file, "r") as f: 68 | for line in f: 69 | golds.append(json.loads(line)["new"]) 70 | golds = golds[:len(pred_nls)] 71 | if args.raw_input: 72 | datasetClass = SimpleRefineDataset 73 | else: 74 | datasetClass = RefineDataset 75 | for i in range(len(golds)): 76 | pred_nls[i], golds[i] = datasetClass.process_pred_gold(pred_nls[i], golds[i]) 77 | with open(os.path.join(args.model_name_or_path, "preds.txt"), "w", encoding="utf-8") as f: 78 | for pred in pred_nls: 79 | 
f.write(pred.strip() + "\n") 80 | with open(os.path.join(args.model_name_or_path, "golds.txt"), "w", encoding="utf-8") as f: 81 | for gold in golds: 82 | f.write(gold.strip() + "\n") 83 | em = 0 84 | for pred, gold in zip(pred_nls, golds): 85 | if " ".join(pred.split()) == " ".join(gold.split()): 86 | em += 1 87 | em = em / len(golds) 88 | logger.warning(f"EM: {em}") 89 | bleu = bleu_fromstr(pred_nls, golds, rmstop=False) 90 | return bleu 91 | 92 | 93 | def main(args): 94 | dist.init_process_group(backend="nccl") 95 | local_rank = dist.get_rank() % args.gpu_per_node 96 | args.global_rank = local_rank + args.node_index * args.gpu_per_node 97 | args.local_rank = local_rank 98 | args.world_size = dist.get_world_size() 99 | logger.warning("Process rank: %s, global rank: %s, world size: %s, bs: %s", 100 | args.local_rank, args.global_rank, \ 101 | torch.distributed.get_world_size(), \ 102 | args.eval_batch_size) 103 | torch.cuda.set_device(local_rank) 104 | 105 | set_seed(args) 106 | config, model, tokenizer = build_or_load_gen_model(args) 107 | model = DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) 108 | pool = multiprocessing.Pool(args.cpu_count) 109 | data_file = args.eval_file 110 | set_seed(args) 111 | _, _, dataloader = get_loader(data_file, args, tokenizer, pool) # WARNING: this is an iterator, to save memory 112 | model.eval() 113 | bleu = eval_epoch_bleu(args, dataloader, model, tokenizer) 114 | logger.warning(f"BLEU: {bleu}") 115 | 116 | if __name__ == "__main__": 117 | parser = argparse.ArgumentParser() 118 | args = add_args(parser) 119 | args.cpu_count = multiprocessing.cpu_count() 120 | # remove long tokenization warning. ref: https://github.com/huggingface/transformers/issues/991 121 | logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) 122 | logger.info(args) 123 | main(args) 124 | logger.info("Test finished.") 125 | # torch.multiprocessing.spawn(main, args=(args,), nprocs=torch.cuda.device_count()) 126 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/finetune-cls.sh: -------------------------------------------------------------------------------- 1 | # batch size 12 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | # You may change the following block for multiple gpu training 6 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 7 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 8 | RANK=0 && echo RANK: ${RANK} 9 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 10 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 11 | NODES=1 && echo NODES: ${NODES} 12 | NCCL_DEBUG=INFO 13 | 14 | bash test_nltk.sh 15 | 16 | 17 | # Change the arguments as required: 18 | # model_name_or_path, load_model_path: the path of the model to be finetuned 19 | # eval_file: the path of the evaluation data 20 | # output_dir: the directory to save finetuned model (not used at infer/test time) 21 | # out_file: the path of the output file 22 | # train_filename: can be a directory containing files named with "train*.jsonl" 23 | 24 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_cls.py \ 25 | --train_epochs 30 \ 26 | --model_name_or_path microsoft/codereviewer \ 27 | --output_dir ../../save/cls \ 28 | --train_filename ../../dataset/Diff_Quality_Estimation \ 29 | --dev_filename
../../dataset/Diff_Quality_Estimation/cls-valid.jsonl \ 30 | --max_source_length 512 \ 31 | --max_target_length 128 \ 32 | --train_batch_size 12 \ 33 | --learning_rate 3e-4 \ 34 | --gradient_accumulation_steps 3 \ 35 | --mask_rate 0.15 \ 36 | --save_steps 3600 \ 37 | --log_steps 100 \ 38 | --train_steps 120000 \ 39 | --gpu_per_node=${PER_NODE_GPU} \ 40 | --node_index=${RANK} \ 41 | --seed 2233 42 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/finetune-msg.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | # You may change the following block for multiple gpu training 6 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 7 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 8 | RANK=0 && echo RANK: ${RANK} 9 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 10 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 11 | NODES=1 && echo NODES: ${NODES} 12 | NCCL_DEBUG=INFO 13 | 14 | bash test_nltk.sh 15 | 16 | 17 | # Change the arguments as required: 18 | # model_name_or_path, load_model_path: the path of the model to be finetuned 19 | # eval_file: the path of the evaluation data 20 | # output_dir: the directory to save finetuned model (not used at infer/test time) 21 | # out_file: the path of the output file 22 | # train_filename: can be a directory containing files named with "train*.jsonl" 23 | # raw_input: to select the preprocess method, set to True in this task 24 | 25 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_msg.py \ 26 | --train_epochs 30 \ 27 | --model_name_or_path microsoft/codereviewer \ 28 | --output_dir ../../save/gen \ 29 | --train_filename ../../dataset/gen-train.jsonl \ 30 | --dev_filename ../../dataset/gen-valid.jsonl \ 31 | --max_source_length 512 \ 32 | --max_target_length 128 \ 33 | --train_batch_size 6 \ 34 | --learning_rate 3e-4 \ 35 | --gradient_accumulation_steps 3 \ 36 | --mask_rate 0.15 \ 37 | --save_steps 1800 \ 38 | --log_steps 100 \ 39 | --train_steps 60000 \ 40 | --gpu_per_node=${PER_NODE_GPU} \ 41 | --node_index=${RANK} \ 42 | --seed 2233 \ 43 | --raw_input -------------------------------------------------------------------------------- /CodeReviewer/code/sh/finetune-ref.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | # You may change the following block for multiple gpu training 6 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 7 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 8 | RANK=0 && echo RANK: ${RANK} 9 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 10 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 11 | NODES=1 && echo NODES: ${NODES} 12 | NCCL_DEBUG=INFO 13 | 14 | bash test_nltk.sh 15 | 16 | 17 | # Change the arguments as required: 18 | # model_name_or_path, load_model_path: the path of the model to be finetuned 19 | # eval_file: the path of the evaluation data 20 | # output_dir: the directory to save finetuned model (not used at infer/test time) 21 | # out_file: the path of the output file 22 | # train_filename: can be a directory containing files named with "train*.jsonl" 23 | 24 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES}
--master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_finetune_ref.py \ 25 | --train_epochs 30 \ 26 | --model_name_or_path microsoft/codereviewer \ 27 | --output_dir ../../save/ref \ 28 | --train_filename ../../data/ref-train.jsonl \ 29 | --dev_filename ../../data/ref-valid.jsonl \ 30 | --max_source_length 200 \ 31 | --max_target_length 200 \ 32 | --train_batch_size 6 \ 33 | --learning_rate 3e-4 \ 34 | --gradient_accumulation_steps 3 \ 35 | --mask_rate 0.15 \ 36 | --save_steps 1800 \ 37 | --log_steps 100 \ 38 | --train_steps 60000 \ 39 | --gpu_per_node=${PER_NODE_GPU} \ 40 | --node_index=${RANK} \ 41 | --seed 2233 \ 42 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/infer-json.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 6 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 7 | RANK=0 && echo RANK: ${RANK} 8 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 9 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 10 | NODES=1 && echo NODES: ${NODES} 11 | NCCL_DEBUG=INFO 12 | 13 | # change break_cnt to truncate the number of examples (useful at debug time maybe) 14 | # --break_cnt -1 \ will keep the whole dataset 15 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_infer_msg.py \ 16 | --model_name_or_path microsoft/codereviewer \ 17 | --output_dir ../../save/gen \ 18 | --load_model_path ../../save/gen/checkpoint \ 19 | --output_dir empty \ 20 | --eval_file test.jsonl \ 21 | --out_file test_out.jsonl \ 22 | --max_source_length 512 \ 23 | --max_target_length 128 \ 24 | --eval_batch_size 12 \ 25 | --beam_size 10 \ 26 | --gpu_per_node=${PER_NODE_GPU} \ 27 | --node_index=${RANK} \ 28 | --seed 2233 \ 29 | --raw_input \ 30 | --break_cnt 20 31 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test-cls.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 6 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 7 | RANK=0 && echo RANK: ${RANK} 8 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 9 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 10 | NODES=1 && echo NODES: ${NODES} 11 | NCCL_DEBUG=INFO 12 | 13 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_test_cls.py \ 14 | --model_name_or_path microsoft/codereviewer \ 15 | --output_dir ../../save/gen \ 16 | --load_model_path ../../save/gen/checkpoint \ 17 | --output_dir empty \ 18 | --eval_file cls-test.jsonl \ 19 | --max_source_length 512 \ 20 | --max_target_length 128 \ 21 | --eval_batch_size 16 \ 22 | --mask_rate 0.15 \ 23 | --save_steps 4000 \ 24 | --log_steps 100 \ 25 | --train_steps 120000 \ 26 | --gpu_per_node=${PER_NODE_GPU} \ 27 | --node_index=${RANK} \ 28 | --seed 2233 -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test-msg.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 
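# Runs distributed BLEU evaluation of a fine-tuned review-comment generation checkpoint via run_test_msg.py.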
5 | 6 | # You may change the following block for multiple gpu training 7 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 8 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 9 | RANK=0 && echo RANK: ${RANK} 10 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 11 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 12 | NODES=1 && echo NODES: ${NODES} 13 | NCCL_DEBUG=INFO 14 | 15 | 16 | bash test_nltk.sh 17 | 18 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_test_msg.py \ 19 | --model_name_or_path microsoft/codereviewer \ 20 | --output_dir ../../save/gen \ 21 | --load_model_path ../../save/gen/checkpoint \ 22 | --output_dir empty \ 23 | --eval_file ref-test.jsonl \ 24 | --max_source_length 512 \ 25 | --max_target_length 128 \ 26 | --eval_batch_size 12 \ 27 | --mask_rate 0.15 \ 28 | --save_steps 1800 \ 29 | --beam_size 10 \ 30 | --log_steps 100 \ 31 | --train_steps 120000 \ 32 | --gpu_per_node=${PER_NODE_GPU} \ 33 | --node_index=${RANK} \ 34 | --seed 2233 \ 35 | --raw_input 36 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test-ref.sh: -------------------------------------------------------------------------------- 1 | # batch size 6 for 16 GB GPU 2 | 3 | mnt_dir="/home/codereview" 4 | 5 | MASTER_HOST=localhost && echo MASTER_HOST: ${MASTER_HOST} 6 | MASTER_PORT=23333 && echo MASTER_PORT: ${MASTER_PORT} 7 | RANK=0 && echo RANK: ${RANK} 8 | PER_NODE_GPU=1 && echo PER_NODE_GPU: ${PER_NODE_GPU} 9 | WORLD_SIZE=1 && echo WORLD_SIZE: ${WORLD_SIZE} 10 | NODES=1 && echo NODES: ${NODES} 11 | NCCL_DEBUG=INFO 12 | 13 | bash test_nltk.sh 14 | 15 | python -m torch.distributed.launch --nproc_per_node ${PER_NODE_GPU} --node_rank=${RANK} --nnodes=${NODES} --master_addr=${MASTER_HOST} --master_port=${MASTER_PORT} ../run_test_ref.py \ 16 | --model_name_or_path microsoft/codereviewer \ 17 | --output_dir ../../save/gen \ 18 | --load_model_path ../../save/gen/checkpoint \ 19 | --output_dir empty \ 20 | --eval_file ref-test.jsonl \ 21 | --max_source_length 200 \ 22 | --max_target_length 200 \ 23 | --eval_batch_size 12 \ 24 | --mask_rate 0.15 \ 25 | --save_steps 1800 \ 26 | --beam_size 10 \ 27 | --log_steps 100 \ 28 | --train_steps 120000 \ 29 | --gpu_per_node=${PER_NODE_GPU} \ 30 | --node_index=${RANK} \ 31 | --seed 2233 \ 32 | -------------------------------------------------------------------------------- /CodeReviewer/code/sh/test_nltk.sh: -------------------------------------------------------------------------------- 1 | echo -e "import nltk\nnltk.download('punkt')" > ttmp.py 2 | python ttmp.py 3 | rm ttmp.py -------------------------------------------------------------------------------- /CodeReviewer/code/test_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from configs import add_args 4 | from models import ReviewerModel, build_or_load_gen_model 5 | 6 | MAX_SOURCE_LENGTH=512 7 | 8 | def pad_assert(tokenizer, source_ids): 9 | source_ids = source_ids[:MAX_SOURCE_LENGTH - 2] 10 | source_ids = [tokenizer.bos_id] + source_ids + [tokenizer.eos_id] 11 | pad_len = MAX_SOURCE_LENGTH - len(source_ids) 12 | source_ids += [tokenizer.pad_id] * pad_len 13 | assert len(source_ids) == MAX_SOURCE_LENGTH, "Not equal length." 
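# The assert above guarantees every example is exactly MAX_SOURCE_LENGTH tokens (bos + truncated ids + eos + padding), so inputs can be batched without further padding.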
14 | return source_ids 15 | 16 | def encode_diff(tokenizer, diff): 17 | difflines = diff.split("\n")[1:] # remove start @@ 18 | difflines = [line for line in difflines if len(line.strip()) > 0] 19 | map_dic = {"-": 0, "+": 1, " ": 2} 20 | def f(s): 21 | if s in map_dic: 22 | return map_dic[s] 23 | else: 24 | return 2 25 | labels = [f(line[0]) for line in difflines] 26 | difflines = [line[1:].strip() for line in difflines] 27 | inputstr = "" 28 | for label, line in zip(labels, difflines): 29 | if label == 1: 30 | inputstr += "<add>" + line 31 | elif label == 0: 32 | inputstr += "<del>" + line 33 | else: 34 | inputstr += "<keep>" + line 35 | source_ids = tokenizer.encode(inputstr, max_length=MAX_SOURCE_LENGTH, truncation=True)[1:-1] 36 | source_ids = pad_assert(tokenizer, source_ids) 37 | return source_ids 38 | 39 | parser = argparse.ArgumentParser() 40 | args = add_args(parser) 41 | args.model_name_or_path = "microsoft/codereviewer" 42 | config, model, tokenizer = build_or_load_gen_model(args) 43 | model.to("cuda") 44 | model.eval() 45 | code_diff = """@@ -11,6 +11,8 @@\n \n invoiceDtoCopy.setState(InvoiceState.OPEN);\n _invoiceAggregateRepository.updateInvoiceState(invoiceCopy, InvoiceState.OPEN);\n+ _erpIntegrationService.createAndSendInvoiceEvent(invoiceCopy);\n+\n }\n }\n \n""" 46 | 47 | inputs = torch.tensor([encode_diff(tokenizer, code_diff)], dtype=torch.long).to("cuda") 48 | inputs_mask = inputs.ne(tokenizer.pad_id) 49 | preds = model.generate(inputs, 50 | attention_mask=inputs_mask, 51 | use_cache=True, 52 | num_beams=5, 53 | early_stopping=True, 54 | max_length=100, 55 | num_return_sequences=2 56 | ) 57 | preds = list(preds.cpu().numpy()) 58 | pred_nls = [tokenizer.decode(id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in preds] 59 | print(pred_nls[0]) 60 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection 2 | 3 | ## Task Definition 4 | 5 | Given two code snippets as input, the task is binary classification (0/1), where 1 denotes semantic equivalence and 0 otherwise. Models are evaluated by F1 score. 6 | 7 | ## Updates 8 | 9 | 2021-9-13: We have updated the evaluator script. Since this is binary classification, we use the binary F1 score instead of the "macro" F1 score. 10 | 11 | ## Dataset 12 | 13 | The dataset we use is [BigCloneBench](https://www.cs.usask.ca/faculty/croy/papers/2014/SvajlenkoICSME2014BigERA.pdf), filtered following the paper [Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree](https://arxiv.org/pdf/2002.08653.pdf). 14 | 15 | ### Data Format 16 | 17 | 1. dataset/data.jsonl is stored in jsonlines format. Each line in the uncompressed file represents one function. One row is illustrated below. 18 | 19 | - **func:** the function 20 | 21 | - **idx:** index of the example 22 | 23 | 2. train.txt/valid.txt/test.txt provide examples, stored in the following format: idx1 idx2 label 24 | 25 | ### Data Statistics 26 | 27 | Data statistics of the dataset are shown in the table below: 28 | 29 | | | #Examples | 30 | | ----- | :-------: | 31 | | Train | 901,028 | 32 | | Dev | 415,416 | 33 | | Test | 415,416 | 34 | 35 | You can get data using the following command.
36 | 37 | ``` 38 | unzip dataset.zip 39 | ``` 40 | 41 | ## Evaluator 42 | 43 | We provide a script that evaluates predictions for this task and reports the F1 score. 44 | 45 | ### Example 46 | 47 | ```bash 48 | python evaluator/evaluator.py -a evaluator/answers.txt -p evaluator/predictions.txt 49 | ``` 50 | 51 | {'Recall': 0.25, 'Precision': 0.5, 'F1': 0.3333333333333333} 52 | 53 | ### Input predictions 54 | 55 | A predictions file in TXT format, such as evaluator/predictions.txt. For example: 56 | 57 | ``` 58 | 13653451 21955002 0 59 | 1188160 8831513 1 60 | 1141235 14322332 0 61 | 16765164 17526811 1 62 | ``` 63 | 64 | ## Pipeline-GraphCodeBERT 65 | 66 | We also provide a pipeline that fine-tunes GraphCodeBERT on this task. 67 | ### Dependency 68 | 69 | - pip install torch 70 | - pip install transformers 71 | - pip install tree_sitter 72 | - pip install scikit-learn 73 | 74 | ### Tree-sitter (optional) 75 | 76 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following command: 77 | 78 | ```shell 79 | cd parser 80 | bash build.sh 81 | cd .. 82 | ``` 83 | 84 | ### Fine-tune 85 | 86 | We use 4 V100-16G GPUs for fine-tuning and 10% of the validation data for evaluation. 87 | 88 | 89 | ```shell 90 | mkdir saved_models 91 | python run.py \ 92 | --output_dir=saved_models \ 93 | --config_name=microsoft/graphcodebert-base \ 94 | --model_name_or_path=microsoft/graphcodebert-base \ 95 | --tokenizer_name=microsoft/graphcodebert-base \ 96 | --do_train \ 97 | --train_data_file=dataset/train.txt \ 98 | --eval_data_file=dataset/valid.txt \ 99 | --test_data_file=dataset/test.txt \ 100 | --epoch 1 \ 101 | --code_length 512 \ 102 | --data_flow_length 128 \ 103 | --train_batch_size 16 \ 104 | --eval_batch_size 32 \ 105 | --learning_rate 2e-5 \ 106 | --max_grad_norm 1.0 \ 107 | --evaluate_during_training \ 108 | --seed 123456 2>&1| tee saved_models/train.log 109 | ``` 110 | 111 | ### Inference 112 | 113 | We use the full test data for inference.
114 | 115 | ```shell 116 | python run.py \ 117 | --output_dir=saved_models \ 118 | --config_name=microsoft/graphcodebert-base \ 119 | --model_name_or_path=microsoft/graphcodebert-base \ 120 | --tokenizer_name=microsoft/graphcodebert-base \ 121 | --do_eval \ 122 | --do_test \ 123 | --train_data_file=dataset/train.txt \ 124 | --eval_data_file=dataset/valid.txt \ 125 | --test_data_file=dataset/test.txt \ 126 | --epoch 1 \ 127 | --code_length 512 \ 128 | --data_flow_length 128 \ 129 | --train_batch_size 16 \ 130 | --eval_batch_size 32 \ 131 | --learning_rate 2e-5 \ 132 | --max_grad_norm 1.0 \ 133 | --evaluate_during_training \ 134 | --seed 123456 2>&1| tee saved_models/test.log 135 | ``` 136 | 137 | ### Evaluation 138 | 139 | ```shell 140 | python evaluator/evaluator.py -a dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log 141 | ``` 142 | 143 | ## Result 144 | 145 | The results on the test set are shown below: 146 | 147 | | Method | Precision | Recall | F1 | 148 | | ------------- | :-------: | :-------: | :-------: | 149 | | Deckard | 0.93 | 0.02 | 0.03 | 150 | | RtvNN | 0.95 | 0.01 | 0.01 | 151 | | CDLH | 0.92 | 0.74 | 0.82 | 152 | | ASTNN | 0.92 | 0.94 | 0.93 | 153 | | FA-AST-GMN | **0.96** | 0.94 | 0.95 | 154 | | CodeBERT | 0.947 | 0.934 | 0.941 | 155 | | GraphCodeBERT | 0.948 | **0.952** | **0.950** | 156 | 157 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/clonedetection/dataset.zip -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/answers.txt: -------------------------------------------------------------------------------- 1 | 13653451 21955002 0 2 | 1188160 8831513 0 3 | 1141235 14322332 0 4 | 16765164 17526811 0 -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
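# Computes binary Precision/Recall/F1 over (idx1, idx2) pairs; every pair present in the answers file must also appear in the predictions file, each line formatted as "idx1 idx2 label" (see answers.txt above).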
3 | import logging
4 | import sys
5 | from sklearn.metrics import recall_score,precision_score,f1_score
6 | 
7 | def read_answers(filename):
8 |     answers={}
9 |     with open(filename) as f:
10 |         for line in f:
11 |             line=line.strip()
12 |             idx1,idx2,label=line.split()
13 |             answers[(idx1,idx2)]=int(label)
14 |     return answers
15 | 
16 | def read_predictions(filename):
17 |     predictions={}
18 |     with open(filename) as f:
19 |         for line in f:
20 |             line=line.strip()
21 |             idx1,idx2,label=line.split()
22 |             predictions[(idx1,idx2)]=int(label)
23 |     return predictions
24 | 
25 | def calculate_scores(answers,predictions):
26 |     y_trues,y_preds=[],[]
27 |     for key in answers:
28 |         if key not in predictions:
29 |             logging.error("Missing prediction for ({},{}) pair.".format(key[0],key[1]))
30 |             sys.exit()
31 |         y_trues.append(answers[key])
32 |         y_preds.append(predictions[key])
33 |     scores={}
34 |     scores['Recall']=recall_score(y_trues, y_preds)
35 |     scores['Precision']=precision_score(y_trues, y_preds)
36 |     scores['F1']=f1_score(y_trues, y_preds)
37 |     return scores
38 | 
39 | def main():
40 |     import argparse
41 |     parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for BigCloneBench dataset.')
42 |     parser.add_argument('--answers', '-a',help="filename of the labels, in txt format.")
43 |     parser.add_argument('--predictions', '-p',help="filename of the leaderboard predictions, in txt format.")
44 | 
45 | 
46 |     args = parser.parse_args()
47 |     answers=read_answers(args.answers)
48 |     predictions=read_predictions(args.predictions)
49 |     scores=calculate_scores(answers,predictions)
50 |     print(scores)
51 | 
52 | if __name__ == '__main__':
53 |     main()
54 | 
55 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/evaluator/predictions.txt:
--------------------------------------------------------------------------------
1 | 13653451 21955002 0
2 | 1188160 8831513 1
3 | 1141235 14322332 0
4 | 16765164 17526811 1
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch
4 | from torch.autograd import Variable
5 | import copy
6 | import torch.nn.functional as F
7 | from torch.nn import CrossEntropyLoss, MSELoss
8 | 
9 | class RobertaClassificationHead(nn.Module):
10 |     """Head for sentence-level classification tasks."""
11 | 
12 |     def __init__(self, config):
13 |         super().__init__()
14 |         self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
15 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
16 |         self.out_proj = nn.Linear(config.hidden_size, 2)
17 | 
18 |     def forward(self, features, **kwargs):
19 |         x = features[:, 0, :]  # take <s> token (equiv. 
to [CLS]) 20 | x = x.reshape(-1,x.size(-1)*2) 21 | x = self.dropout(x) 22 | x = self.dense(x) 23 | x = torch.tanh(x) 24 | x = self.dropout(x) 25 | x = self.out_proj(x) 26 | return x 27 | 28 | class Model(nn.Module): 29 | def __init__(self, encoder,config,tokenizer,args): 30 | super(Model, self).__init__() 31 | self.encoder = encoder 32 | self.config=config 33 | self.tokenizer=tokenizer 34 | self.classifier=RobertaClassificationHead(config) 35 | self.args=args 36 | 37 | 38 | def forward(self, inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels=None): 39 | bs,l=inputs_ids_1.size() 40 | inputs_ids=torch.cat((inputs_ids_1.unsqueeze(1),inputs_ids_2.unsqueeze(1)),1).view(bs*2,l) 41 | position_idx=torch.cat((position_idx_1.unsqueeze(1),position_idx_2.unsqueeze(1)),1).view(bs*2,l) 42 | attn_mask=torch.cat((attn_mask_1.unsqueeze(1),attn_mask_2.unsqueeze(1)),1).view(bs*2,l,l) 43 | 44 | #embedding 45 | nodes_mask=position_idx.eq(0) 46 | token_mask=position_idx.ge(2) 47 | inputs_embeddings=self.encoder.roberta.embeddings.word_embeddings(inputs_ids) 48 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 49 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 50 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 51 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 52 | 53 | outputs = self.encoder.roberta(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx,token_type_ids=position_idx.eq(-1).long())[0] 54 | logits=self.classifier(outputs) 55 | # shape: [batch_size, num_classes] 56 | prob=F.softmax(logits, dim=-1) 57 | if labels is not None: 58 | loss_fct = CrossEntropyLoss() 59 | loss = loss_fct(logits, labels) 60 | return loss,prob 61 | else: 62 | return prob 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
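# Note: run build.sh first -- it clones the tree-sitter grammar repositories
# listed below into this directory before invoking this script.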
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/clonedetection/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp=[]
40 |         for x in out.split('\n'):
41 |             if x.strip()!="":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " " # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp=[]
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip()!="":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 | 
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
65 |         return [(root_node.start_point,root_node.end_point)]
66 |     else:
67 |         code_tokens=[]
68 |         for child in root_node.children:
69 |             code_tokens+=tree_to_token_index(child)
70 |         return code_tokens
71 | 
72 | def tree_to_variable_index(root_node,index_to_code):
73 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
74 |         index=(root_node.start_point,root_node.end_point)
75 |         _,code=index_to_code[index]
76 |         if root_node.type!=code:
77 |             return [(root_node.start_point,root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens=[]
82 |         for child in root_node.children:
83 |             code_tokens+=tree_to_variable_index(child,index_to_code)
84 |         return code_tokens
85 | 
86 | def index_to_code_token(index,code):
87 |     start_point=index[0]
88 |     end_point=index[1]
89 |     if start_point[0]==end_point[0]:
90 |         s=code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s=""
93 |         s+=code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0]+1,end_point[0]):
95 |             s+=code[i]
96 |         s+=code[end_point[0]][:end_point[1]]
97 |     return s
98 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # Code Search
4 | 
5 | ## Data Preprocess
6 | 
7 | Different from the setting of [CodeSearchNet](https://arxiv.org/abs/1909.09436), the answer to each query is retrieved from the whole development and testing code corpus instead of from 1,000 candidate codes. Besides, we observe that some queries contain content unrelated to the code, such as a link "http://..." that refers to external resources. Therefore, we filter out the following kinds of examples to improve the quality of the dataset.
8 | 
9 | - Remove comments in the code.
10 | 
11 | - Remove examples whose code cannot be parsed into an abstract syntax tree.
12 | 
13 | - Remove examples whose documents contain fewer than 3 or more than 256 tokens (see the sketch after this list).
14 | 
15 | - Remove examples whose documents contain special tokens (e.g. `<img ...>` or `https:...`).
16 | 
17 | - Remove examples whose documents are not in English.
18 | 
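As an illustration of the token-count rule, a minimal filter sketch. Whitespace tokenization is an assumption here; the actual preprocessing may tokenize documents differently:

```python
def keep_document(doc: str) -> bool:
    # Keep only documents with 3 to 256 tokens (whitespace split as a stand-in).
    n_tokens = len(doc.split())
    return 3 <= n_tokens <= 256

print(keep_document("Returns the sum of two numbers."))  # True
print(keep_document("ok"))                               # False: fewer than 3 tokens
```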
19 | Data statistics of the cleaned dataset for code search are shown in the table below.
20 | 
21 | | PL         | Training | Dev    | Test   | Candidate codes |
22 | | :--------- | :------: | :----: | :----: | :-------------: |
23 | | Python     | 251,820  | 13,914 | 14,918 |     43,827      |
24 | | PHP        | 241,241  | 12,982 | 14,014 |     52,660      |
25 | | Go         | 167,288  | 7,325  | 8,122  |     28,120      |
26 | | Java       | 164,923  | 5,183  | 10,955 |     40,347      |
27 | | JavaScript | 58,025   | 3,885  | 3,291  |     13,981      |
28 | | Ruby       | 24,927   | 1,400  | 1,261  |      4,360      |
29 | 
30 | You can download and preprocess the data using the following commands.
31 | ```shell
32 | unzip dataset.zip
33 | cd dataset
34 | bash run.sh
35 | cd ..
36 | ```
37 | 
38 | ## Dependency
39 | 
40 | - pip install torch
41 | - pip install transformers
42 | - pip install tree_sitter
43 | 
44 | ### Tree-sitter (optional)
45 | 
46 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
47 | 
48 | ```shell
49 | cd parser
50 | bash build.sh
51 | cd ..
52 | ```
53 | 
54 | ## Fine-Tune
55 | 
56 | We fine-tuned the model on 2 V100-16G GPUs.
57 | ```shell
58 | lang=ruby
59 | mkdir -p ./saved_models/$lang
60 | python run.py \
61 |     --output_dir=./saved_models/$lang \
62 |     --config_name=microsoft/graphcodebert-base \
63 |     --model_name_or_path=microsoft/graphcodebert-base \
64 |     --tokenizer_name=microsoft/graphcodebert-base \
65 |     --lang=$lang \
66 |     --do_train \
67 |     --train_data_file=dataset/$lang/train.jsonl \
68 |     --eval_data_file=dataset/$lang/valid.jsonl \
69 |     --test_data_file=dataset/$lang/test.jsonl \
70 |     --codebase_file=dataset/$lang/codebase.jsonl \
71 |     --num_train_epochs 10 \
72 |     --code_length 256 \
73 |     --data_flow_length 64 \
74 |     --nl_length 128 \
75 |     --train_batch_size 32 \
76 |     --eval_batch_size 64 \
77 |     --learning_rate 2e-5 \
78 |     --seed 123456 2>&1| tee saved_models/$lang/train.log
79 | ```
80 | ## Inference and Evaluation
81 | 
82 | ```shell
83 | lang=ruby
84 | python run.py \
85 |     --output_dir=./saved_models/$lang \
86 |     --config_name=microsoft/graphcodebert-base \
87 |     --model_name_or_path=microsoft/graphcodebert-base \
88 |     --tokenizer_name=microsoft/graphcodebert-base \
89 |     --lang=$lang \
90 |     --do_eval \
91 |     --do_test \
92 |     --train_data_file=dataset/$lang/train.jsonl \
93 |     --eval_data_file=dataset/$lang/valid.jsonl \
94 |     --test_data_file=dataset/$lang/test.jsonl \
95 |     --codebase_file=dataset/$lang/codebase.jsonl \
96 |     --num_train_epochs 10 \
97 |     --code_length 256 \
98 |     --data_flow_length 64 \
99 |     --nl_length 128 \
100 |     --train_batch_size 32 \
101 |     --eval_batch_size 64 \
102 |     --learning_rate 2e-5 \
103 |     --seed 123456 2>&1| tee saved_models/$lang/test.log
104 | ```
105 | 
106 | ## Results
107 | 
108 | The results on the filtered dataset are shown in the table below:
109 | 
110 | | Model          |   Ruby    | JavaScript |    Go     |  Python   |   Java    |    PHP    |  Overall  |
111 | | -------------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
112 | | NBow           |   0.162   |   0.157    |   0.330   |   0.161   |   0.171   |   0.152   |   0.189   |
113 | | CNN            |   0.276   |   0.224    |   0.680   |   0.242   |   0.263   |   0.260   |   0.324   |
114 | | BiRNN          |   0.213   |   0.193    |   0.688   |   0.290   |   0.304   |   0.338   |   0.338   |
115 | | SelfAtt        |   0.275   |   0.287    |   0.723   |   0.398   |   0.404   |   0.426   |   0.419   |
116 | | RoBERTa        |   0.587   |   0.517    |   0.850   |   0.587   |   0.599   |   0.560   |   0.617   |
117 | | RoBERTa (code) |   0.628   |   0.562    |   0.859   |   0.610   |   0.620   |   0.579   |   0.643   |
118 | | CodeBERT       |   0.679   |   0.620    |   0.882   |   0.672   |   0.676   |   0.628   |   0.693   |
119 | | GraphCodeBERT  | **0.703** | **0.644**  | **0.897** | **0.692** | **0.691** | **0.649** | **0.713** |
120 | 
121 | 
122 | ## Model and Demo
123 | A pretrained model, an additional training script with dataset, and a demo of a fine-tuned CodeBERT model for the task of Code Search can be found here: https://drive.google.com/file/d/1ZO-xVIzGcNE6Gz9DEg2z5mIbBv4Ft1cK/view.
124 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/codesearch/dataset.zip
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | import torch.nn as nn
4 | import torch
5 | class Model(nn.Module):
6 |     def __init__(self, encoder):
7 |         super(Model, self).__init__()
8 |         self.encoder = encoder
9 | 
10 |     def forward(self, code_inputs=None, attn_mask=None,position_idx=None, nl_inputs=None):
11 |         if code_inputs is not None:
12 |             nodes_mask=position_idx.eq(0)
13 |             token_mask=position_idx.ge(2)
14 |             inputs_embeddings=self.encoder.embeddings.word_embeddings(code_inputs)
15 |             nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask
16 |             nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None]
17 |             avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings)
18 |             inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None]
19 |             return self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx)[1]
20 |         else:
21 |             return self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[1]
22 | 
23 | 
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import (remove_comments_and_docstrings,
2 |                    tree_to_token_index,
3 |                    index_to_code_token,
4 |                    tree_to_variable_index)
5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/build.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license. 
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/codesearch/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/codesearch/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp=[]
40 |         for x in out.split('\n'):
41 |             if x.strip()!="":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " " # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp=[]
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip()!="":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 | 
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
65 |         return [(root_node.start_point,root_node.end_point)]
66 |     else:
67 |         code_tokens=[]
68 |         for child in root_node.children:
69 |             code_tokens+=tree_to_token_index(child)
70 |         return code_tokens
71 | 
72 | def tree_to_variable_index(root_node,index_to_code):
73 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
74 |         index=(root_node.start_point,root_node.end_point)
75 |         _,code=index_to_code[index]
76 |         if root_node.type!=code:
77 |             return [(root_node.start_point,root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens=[]
82 |         for child in root_node.children:
83 |             code_tokens+=tree_to_variable_index(child,index_to_code)
84 |         return code_tokens
85 | 
86 | def index_to_code_token(index,code):
87 |     start_point=index[0]
88 |     end_point=index[1]
89 |     if start_point[0]==end_point[0]:
90 |         s=code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s=""
93 |         s+=code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0]+1,end_point[0]):
95 |             s+=code[i]
96 |         s+=code[end_point[0]][:end_point[1]]
97 |     return s
98 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/README.md:
--------------------------------------------------------------------------------
1 | # Code Refinement
2 | 
3 | ## Task Definition
4 | 
5 | Code refinement aims to automatically fix bugs in code, which can help reduce the cost of bug-fixing for developers.
6 | In CodeXGLUE, given a piece of Java code with bugs, the task is to remove the bugs and output the refined code.
7 | Models are evaluated by BLEU scores and accuracy (exact match).
8 | 
9 | ## Dataset
10 | 
11 | We use the dataset released by this paper (https://arxiv.org/pdf/1812.08693.pdf). 
The source side is a Java function with bugs and the target side is the refined one.
12 | All the function and variable names are normalized. The dataset contains two subsets (i.e., small and medium) based on function length.
13 | 
14 | ### Data Format
15 | 
16 | The dataset is in the "data" folder. Each line of the files is a function. You can get the data using the following command:
17 | 
18 | ```
19 | unzip data.zip
20 | ```
21 | 
22 | ### Data Statistics
23 | 
24 | Data statistics of this dataset are shown in the table below:
25 | 
26 | | Split | #Examples (Small) | #Examples (Medium) |
27 | | ----- | :---------------: | :----------------: |
28 | | Train |      46,680       |       52,364       |
29 | | Valid |       5,835       |       6,545        |
30 | | Test  |       5,835       |       6,545        |
31 | 
32 | 
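After unzipping, a quick sketch to sanity-check that the buggy and fixed files pair up line by line (paths follow the small training split used in the commands below):

```python
# Line i of the .buggy file and line i of the .fixed file form one example pair.
with open("data/small/train.buggy-fixed.buggy") as fb, \
     open("data/small/train.buggy-fixed.fixed") as ff:
    pairs = list(zip(fb, ff))

print(len(pairs))        # expected: 46,680 for the small training split
print(pairs[0][0][:80])  # first buggy function, truncated
```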
33 | ## Pipeline-GraphCodeBERT
34 | 
35 | ### Dependency
36 | 
37 | - pip install torch
38 | - pip install transformers
39 | - pip install tree_sitter
40 | 
41 | ### Tree-sitter (optional)
42 | 
43 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
44 | 
45 | ```shell
46 | cd parser
47 | bash build.sh
48 | cd ..
49 | ```
50 | 
51 | ### Fine-tune
52 | We fine-tune on 4 V100-16G GPUs. Taking the "small" subset as an example:
53 | 
54 | ```shell
55 | scale=small
56 | lr=1e-4
57 | batch_size=32
58 | beam_size=10
59 | source_length=320
60 | target_length=256
61 | output_dir=saved_models/$scale/
62 | train_file=data/$scale/train.buggy-fixed.buggy,data/$scale/train.buggy-fixed.fixed
63 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed
64 | epochs=50
65 | pretrained_model=microsoft/graphcodebert-base
66 | 
67 | mkdir -p $output_dir
68 | python run.py \
    --do_train \
    --do_eval \
    --model_type roberta \
    --model_name_or_path $pretrained_model \
    --tokenizer_name microsoft/graphcodebert-base \
    --config_name microsoft/graphcodebert-base \
    --train_filename $train_file \
    --dev_filename $dev_file \
    --output_dir $output_dir \
    --max_source_length $source_length \
    --max_target_length $target_length \
    --beam_size $beam_size \
    --train_batch_size $batch_size \
    --eval_batch_size $batch_size \
    --learning_rate $lr \
    --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
69 | ```
70 | 
71 | ### Inference
72 | 
73 | We use the full test data for inference.
74 | 
75 | ```shell
76 | batch_size=64
77 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed
78 | test_file=data/$scale/test.buggy-fixed.buggy,data/$scale/test.buggy-fixed.fixed
79 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test
80 | 
81 | python run.py \
    --do_test \
    --model_type roberta \
    --model_name_or_path $pretrained_model \
    --tokenizer_name microsoft/graphcodebert-base \
    --config_name microsoft/graphcodebert-base \
    --load_model_path $load_model_path \
    --dev_filename $dev_file \
    --test_filename $test_file \
    --output_dir $output_dir \
    --max_source_length $source_length \
    --max_target_length $target_length \
    --beam_size $beam_size \
    --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
82 | ```
83 | 
84 | 
85 | 
86 | ## Result
87 | 
88 | The results on the test set are shown below:
89 | 
90 | Small:
91 | 
92 | | Method        |   BLEU    | Acc (100%) |
93 | | ------------- | :-------: | :--------: |
94 | | Naive copy    |   78.06   |    0.0     |
95 | | LSTM          |   76.76   |    10.0    |
96 | | Transformer   |   77.21   |    14.7    |
97 | | CodeBERT      |   77.42   |    16.4    |
98 | | GraphCodeBERT | **80.02** |  **17.3**  |
99 | 
100 | Medium:
101 | 
102 | | Method        |   BLEU    | Acc (100%) |
103 | | ------------- | :-------: | :--------: |
104 | | Naive copy    |   90.91   |    0.0     |
105 | | LSTM          |   72.08   |    2.5     |
106 | | Transformer   |   89.25   |    3.7     |
107 | | CodeBERT      |   91.07   |    5.16    |
108 | | GraphCodeBERT | **91.31** |  **9.1**   |
109 | 
110 | 
111 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Python implementation of BLEU and smooth-BLEU.
17 | 
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 | 
24 | import collections
25 | import math
26 | 
27 | 
28 | def _get_ngrams(segment, max_order):
29 |   """Extracts all n-grams up to a given maximum order from an input segment.
30 | 
31 |   Args:
32 |     segment: text segment from which n-grams will be extracted.
33 |     max_order: maximum length in tokens of the n-grams returned by this
34 |         method.
35 | 
36 |   Returns:
37 |     The Counter containing all n-grams up to max_order in segment
38 |     with a count of how many times each n-gram occurred. 
39 |   """
40 |   ngram_counts = collections.Counter()
41 |   for order in range(1, max_order + 1):
42 |     for i in range(0, len(segment) - order + 1):
43 |       ngram = tuple(segment[i:i+order])
44 |       ngram_counts[ngram] += 1
45 |   return ngram_counts
46 | 
47 | 
48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49 |                  smooth=False):
50 |   """Computes BLEU score of translated segments against one or more references.
51 | 
52 |   Args:
53 |     reference_corpus: list of lists of references for each translation. Each
54 |         reference should be tokenized into a list of tokens.
55 |     translation_corpus: list of translations to score. Each translation
56 |         should be tokenized into a list of tokens.
57 |     max_order: Maximum n-gram order to use when computing BLEU score.
58 |     smooth: Whether or not to apply Lin et al. 2004 smoothing.
59 | 
60 |   Returns:
61 |     6-tuple of the BLEU score, n-gram precisions, brevity penalty,
62 |     length ratio, translation length, and reference length.
63 |   """
64 |   matches_by_order = [0] * max_order
65 |   possible_matches_by_order = [0] * max_order
66 |   reference_length = 0
67 |   translation_length = 0
68 |   for (references, translation) in zip(reference_corpus,
69 |                                        translation_corpus):
70 |     reference_length += min(len(r) for r in references)
71 |     translation_length += len(translation)
72 | 
73 |     merged_ref_ngram_counts = collections.Counter()
74 |     for reference in references:
75 |       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76 |     translation_ngram_counts = _get_ngrams(translation, max_order)
77 |     overlap = translation_ngram_counts & merged_ref_ngram_counts
78 |     for ngram in overlap:
79 |       matches_by_order[len(ngram)-1] += overlap[ngram]
80 |     for order in range(1, max_order+1):
81 |       possible_matches = len(translation) - order + 1
82 |       if possible_matches > 0:
83 |         possible_matches_by_order[order-1] += possible_matches
84 | 
85 |   precisions = [0] * max_order
86 |   for i in range(0, max_order):
87 |     if smooth:
88 |       precisions[i] = ((matches_by_order[i] + 1.) /
89 |                        (possible_matches_by_order[i] + 1.))
90 |     else:
91 |       if possible_matches_by_order[i] > 0:
92 |         precisions[i] = (float(matches_by_order[i]) /
93 |                          possible_matches_by_order[i])
94 |       else:
95 |         precisions[i] = 0.0
96 | 
97 |   if min(precisions) > 0:
98 |     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99 |     geo_mean = math.exp(p_log_sum)
100 |   else:
101 |     geo_mean = 0
102 | 
103 |   ratio = float(translation_length) / reference_length
104 | 
105 |   if ratio > 1.0:
106 |     bp = 1.
107 |   else:
108 |     bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/refinement/data.zip -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/refinement/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp=[]
40 |         for x in out.split('\n'):
41 |             if x.strip()!="":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " " # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp=[]
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip()!="":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 | 
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
65 |         return [(root_node.start_point,root_node.end_point)]
66 |     else:
67 |         code_tokens=[]
68 |         for child in root_node.children:
69 |             code_tokens+=tree_to_token_index(child)
70 |         return code_tokens
71 | 
72 | def tree_to_variable_index(root_node,index_to_code):
73 |     if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment':
74 |         index=(root_node.start_point,root_node.end_point)
75 |         _,code=index_to_code[index]
76 |         if root_node.type!=code:
77 |             return [(root_node.start_point,root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens=[]
82 |         for child in root_node.children:
83 |             code_tokens+=tree_to_variable_index(child,index_to_code)
84 |         return code_tokens
85 | 
86 | def index_to_code_token(index,code):
87 |     start_point=index[0]
88 |     end_point=index[1]
89 |     if start_point[0]==end_point[0]:
90 |         s=code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s=""
93 |         s+=code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0]+1,end_point[0]):
95 |             s+=code[i]
96 |         s+=code[end_point[0]][:end_point[1]]
97 |     return s
98 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/README.md:
--------------------------------------------------------------------------------
1 | # Code Translation
2 | 
3 | ## Task Definition
4 | 
5 | Code translation aims to migrate legacy software from one programming language to another.
6 | Given a piece of Java (C#) code, the task is to translate the code into its C# (Java) version.
7 | Models are evaluated by BLEU scores and accuracy (exact match). 
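For intuition, a minimal scoring sketch that reuses the `_bleu` helper from `bleu.py` in this directory. It assumes one function per line in both files; the file names are placeholders:

```python
from bleu import _bleu  # smoothed BLEU helper shipped in this directory

def exact_match(ref_file, hyp_file):
    # Percentage of hypotheses whose token sequence equals the reference exactly.
    with open(ref_file) as rf, open(hyp_file) as hf:
        refs = [line.strip().split() for line in rf]
        hyps = [line.strip().split() for line in hf]
    return 100.0 * sum(r == h for r, h in zip(refs, hyps)) / len(refs)

# Hypothetical file names:
# print(_bleu("test.java-cs.txt.cs", "predictions.txt"))
# print(exact_match("test.java-cs.txt.cs", "predictions.txt"))
```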
8 | 
9 | ## Dataset
10 | 
11 | The dataset is collected from several public repos, including Lucene (http://lucene.apache.org/), POI (http://poi.apache.org/), JGit (https://github.com/eclipse/jgit/) and Antlr (https://github.com/antlr/).
12 | 
13 | We collect both the Java and C# versions of the code and find the parallel functions. After removing duplicates and functions with an empty body, we split the whole dataset into training, validation and test sets.
14 | 
15 | ### Data Format
16 | 
17 | The dataset is in the "data" folder. Each line of the files is a function, and the suffix of the file indicates the programming language. You can get the data using the following command:
18 | 
19 | ```
20 | unzip data.zip
21 | ```
22 | 
23 | ### Data Statistics
24 | 
25 | Data statistics of the dataset are shown in the table below:
26 | 
27 | |       | #Examples |
28 | | ----- | :-------: |
29 | | Train |  10,300   |
30 | | Valid |    500    |
31 | | Test  |   1,000   |
32 | 
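After unzipping, a quick sketch to peek at one parallel pair (paths match the training files used in the commands below):

```python
# Line i of the .java file and line i of the .cs file are parallel functions.
with open("data/train.java-cs.txt.java") as fj, open("data/train.java-cs.txt.cs") as fc:
    java_fn, cs_fn = next(zip(fj, fc))

print(java_fn.strip()[:80])  # Java side, truncated
print(cs_fn.strip()[:80])    # C# side, truncated
```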
33 | ## Pipeline-GraphCodeBERT
34 | 
35 | ### Dependency
36 | 
37 | - pip install torch
38 | - pip install transformers
39 | - pip install tree_sitter
40 | 
41 | ### Tree-sitter (optional)
42 | 
43 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
44 | 
45 | ```shell
46 | cd parser
47 | bash build.sh
48 | cd ..
49 | ```
50 | 
51 | ### Fine-tune
52 | We fine-tune on 4 V100-16G GPUs. Taking Java to C# translation as an example:
53 | 
54 | ```shell
55 | source=java
56 | target=cs
57 | lr=1e-4
58 | batch_size=32
59 | beam_size=10
60 | source_length=320
61 | target_length=256
62 | output_dir=saved_models/$source-$target/
63 | train_file=data/train.java-cs.txt.$source,data/train.java-cs.txt.$target
64 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target
65 | epochs=100
66 | pretrained_model=microsoft/graphcodebert-base
67 | 
68 | mkdir -p $output_dir
69 | python run.py \
70 |     --do_train \
71 |     --do_eval \
72 |     --model_type roberta \
73 |     --source_lang $source \
74 |     --model_name_or_path $pretrained_model \
75 |     --tokenizer_name microsoft/graphcodebert-base \
76 |     --config_name microsoft/graphcodebert-base \
77 |     --train_filename $train_file \
78 |     --dev_filename $dev_file \
79 |     --output_dir $output_dir \
80 |     --max_source_length $source_length \
81 |     --max_target_length $target_length \
82 |     --beam_size $beam_size \
83 |     --train_batch_size $batch_size \
84 |     --eval_batch_size $batch_size \
85 |     --learning_rate $lr \
86 |     --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
87 | ```
88 | 
89 | ### Inference
90 | 
91 | We use the full test data for inference.
92 | 
93 | ```shell
94 | batch_size=64
95 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target
96 | test_file=data/test.java-cs.txt.$source,data/test.java-cs.txt.$target
97 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test
98 | 
99 | python run.py \
100 |     --do_test \
101 |     --model_type roberta \
102 |     --source_lang $source \
103 |     --model_name_or_path $pretrained_model \
104 |     --tokenizer_name microsoft/graphcodebert-base \
105 |     --config_name microsoft/graphcodebert-base \
106 |     --load_model_path $load_model_path \
107 |     --dev_filename $dev_file \
108 |     --test_filename $test_file \
109 |     --output_dir $output_dir \
110 |     --max_source_length $source_length \
111 |     --max_target_length $target_length \
112 |     --beam_size $beam_size \
113 |     --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
114 | ```
115 | 
116 | 
117 | 
118 | ## Result
119 | 
120 | The results on the test set are shown below:
121 | 
122 | Java to C#:
123 | 
124 | | Method         |   BLEU    | Acc (100%) |
125 | | -------------- | :-------: | :--------: |
126 | | Naive copy     |   18.54   |    0.0     |
127 | | PBSMT          |   43.53   |    12.5    |
128 | | Transformer    |   55.84   |    33.0    |
129 | | RoBERTa (code) |   77.46   |    56.1    |
130 | | CodeBERT       |   79.92   |    59.0    |
131 | | GraphCodeBERT  | **80.58** |  **59.4**  |
132 | 
133 | C# to Java:
134 | 
135 | | Method         |   BLEU    | Acc (100%) |
136 | | -------------- | :-------: | :--------: |
137 | | Naive copy     |   18.69   |    0.0     |
138 | | PBSMT          |   40.06   |    16.1    |
139 | | Transformer    |   50.47   |    37.9    |
140 | | RoBERTa (code) |   71.99   |    57.9    |
141 | | CodeBERT       |   72.14   |    58.0    |
142 | | GraphCodeBERT  | **72.64** |  **58.8**  |
143 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Python implementation of BLEU and smooth-BLEU.
17 | 
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 | 
24 | import collections
25 | import math
26 | 
27 | 
28 | def _get_ngrams(segment, max_order):
29 |   """Extracts all n-grams up to a given maximum order from an input segment.
30 | 
31 |   Args:
32 |     segment: text segment from which n-grams will be extracted.
33 |     max_order: maximum length in tokens of the n-grams returned by this
34 |         method.
35 | 
36 |   Returns:
37 |     The Counter containing all n-grams up to max_order in segment
38 |     with a count of how many times each n-gram occurred. 
39 |   """
40 |   ngram_counts = collections.Counter()
41 |   for order in range(1, max_order + 1):
42 |     for i in range(0, len(segment) - order + 1):
43 |       ngram = tuple(segment[i:i+order])
44 |       ngram_counts[ngram] += 1
45 |   return ngram_counts
46 | 
47 | 
48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49 |                  smooth=False):
50 |   """Computes BLEU score of translated segments against one or more references.
51 | 
52 |   Args:
53 |     reference_corpus: list of lists of references for each translation. Each
54 |         reference should be tokenized into a list of tokens.
55 |     translation_corpus: list of translations to score. Each translation
56 |         should be tokenized into a list of tokens.
57 |     max_order: Maximum n-gram order to use when computing BLEU score.
58 |     smooth: Whether or not to apply Lin et al. 2004 smoothing.
59 | 
60 |   Returns:
61 |     6-tuple of the BLEU score, n-gram precisions, brevity penalty,
62 |     length ratio, translation length, and reference length.
63 |   """
64 |   matches_by_order = [0] * max_order
65 |   possible_matches_by_order = [0] * max_order
66 |   reference_length = 0
67 |   translation_length = 0
68 |   for (references, translation) in zip(reference_corpus,
69 |                                        translation_corpus):
70 |     reference_length += min(len(r) for r in references)
71 |     translation_length += len(translation)
72 | 
73 |     merged_ref_ngram_counts = collections.Counter()
74 |     for reference in references:
75 |       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76 |     translation_ngram_counts = _get_ngrams(translation, max_order)
77 |     overlap = translation_ngram_counts & merged_ref_ngram_counts
78 |     for ngram in overlap:
79 |       matches_by_order[len(ngram)-1] += overlap[ngram]
80 |     for order in range(1, max_order+1):
81 |       possible_matches = len(translation) - order + 1
82 |       if possible_matches > 0:
83 |         possible_matches_by_order[order-1] += possible_matches
84 | 
85 |   precisions = [0] * max_order
86 |   for i in range(0, max_order):
87 |     if smooth:
88 |       precisions[i] = ((matches_by_order[i] + 1.) /
89 |                        (possible_matches_by_order[i] + 1.))
90 |     else:
91 |       if possible_matches_by_order[i] > 0:
92 |         precisions[i] = (float(matches_by_order[i]) /
93 |                          possible_matches_by_order[i])
94 |       else:
95 |         precisions[i] = 0.0
96 | 
97 |   if min(precisions) > 0:
98 |     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99 |     geo_mean = math.exp(p_log_sum)
100 |   else:
101 |     geo_mean = 0
102 | 
103 |   ratio = float(translation_length) / reference_length
104 | 
105 |   if ratio > 1.0:
106 |     bp = 1.
107 |   else:
108 |     bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /GraphCodeBERT/translation/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/translation/data.zip -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 
4 | from tree_sitter import Language, Parser
5 | 
6 | Language.build_library(
7 |     # Store the library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 | 
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 | 
22 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/GraphCodeBERT/translation/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source,lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 
2 | 
3 | MIT License
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/LongCoder/README.md:
--------------------------------------------------------------------------------
1 | # LongCoder
2 | 
3 | This repo provides the code for reproducing the experiments on the LCC datasets in [LongCoder: A Long-Range Pre-trained Language Model for Code Completion](https://arxiv.org/abs/2306.14893). LongCoder is a sparse and efficient pre-trained Transformer model for long code modeling.
4 | 
5 | ## 1. Dependency
6 | 
7 | - pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
8 | - pip install --upgrade transformers fuzzywuzzy tree_sitter datasets
9 | 
10 | ## 2. Dataset
11 | In this repo, the LCC dataset is downloaded automatically when running the fine-tuning script. If you want to download the LCC datasets yourself, you can find them at the following links:
12 | ```
13 | https://huggingface.co/datasets/microsoft/LCC_python
14 | https://huggingface.co/datasets/microsoft/LCC_java
15 | https://huggingface.co/datasets/microsoft/LCC_csharp
16 | ```
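To fetch a subset directly, a minimal sketch with the Hugging Face `datasets` library (the exact split names and fields are assumptions; inspect the returned object to confirm):

```python
from datasets import load_dataset

# Downloads the C# subset of LCC from the Hugging Face Hub.
lcc_csharp = load_dataset("microsoft/LCC_csharp")
print(lcc_csharp)  # shows the available splits and their fields
```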
17 | ## 3. Fine-Tune Setting
18 | Here we provide the fine-tuning settings for code completion on the LCC dataset in the C# programming language, whose results are reported in the paper.
19 | 
20 | Note that fine-tuning requires 8 V100-32G GPUs; you can adjust the batch size or source length based on your requirements.
21 | 
22 | ```shell
23 | lang=csharp #csharp, python, java
24 | lr=2e-4
25 | batch_size=16
26 | beam_size=5
27 | source_length=3968
28 | target_length=128
29 | global_length=64
30 | window_size=512
31 | epochs=10
32 | output_dir=saved_models/$lang
33 | mkdir -p $output_dir
34 | 
35 | python run.py \
36 |     --do_train \
37 |     --do_eval \
38 |     --lang $lang \
39 |     --output_dir $output_dir \
40 |     --model_name_or_path microsoft/longcoder-base \
41 |     --filename microsoft/LCC_$lang \
42 |     --max_source_length $source_length \
43 |     --max_target_length $target_length \
44 |     --max_global_length $global_length \
45 |     --window_size $window_size \
46 |     --beam_size $beam_size \
47 |     --train_batch_size $batch_size \
48 |     --eval_batch_size $batch_size \
49 |     --learning_rate $lr \
50 |     --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
51 | ```
52 | 
53 | ## 4. Evaluating LongCoder
54 | 
55 | ```shell
56 | lang=csharp #csharp, python, java
57 | batch_size=16
58 | beam_size=5
59 | source_length=3968
60 | target_length=128
61 | global_length=64
62 | window_size=512
63 | output_dir=saved_models/$lang
64 | reload_model=$output_dir/checkpoint-best-acc/model.bin
65 | 
66 | python run.py \
67 |     --do_test \
68 |     --lang $lang \
69 |     --load_model_path $reload_model \
70 |     --output_dir $output_dir \
71 |     --model_name_or_path microsoft/longcoder-base \
72 |     --filename microsoft/LCC_$lang \
73 |     --max_source_length $source_length \
74 |     --max_target_length $target_length \
75 |     --max_global_length $global_length \
76 |     --window_size $window_size \
77 |     --beam_size $beam_size \
78 |     --train_batch_size $batch_size \
79 |     --eval_batch_size $batch_size \
80 |     --num_train_epochs 10 2>&1| tee $output_dir/test.log
81 | ```
82 | 
83 | 
84 | # Reference
85 | If you use this code or LongCoder, please consider citing us.
86 | 
@article{longcoder,
87 |     title={LongCoder: A Long-Range Pre-trained Language Model for Code Completion},
88 |     author={Daya Guo and Canwen Xu and Nan Duan and Jian Yin and Julian McAuley},
89 |     journal={arXiv preprint arXiv:2306.14893},
90 |     year={2023}
91 | }
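As a quick check that the LCC data from Section 2 is reachable before launching training, here is a minimal loading sketch (dataset ids as above; the exact split names are an assumption worth verifying on the hub):

```python
from datasets import load_dataset

# One of: microsoft/LCC_python, microsoft/LCC_java, microsoft/LCC_csharp
ds = load_dataset("microsoft/LCC_csharp")
print(ds)  # prints the available splits and their row counts
```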
92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /LongCoder/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /LongCoder/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | 'tree-sitter-cpp', 20 | 'tree-sitter-c', 21 | ] 22 | ) 23 | 24 | -------------------------------------------------------------------------------- /LongCoder/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-c 2 | git clone https://github.com/tree-sitter/tree-sitter-cpp 3 | git clone https://github.com/tree-sitter/tree-sitter-typescript 4 | git clone https://github.com/tree-sitter/tree-sitter-go 5 | git clone https://github.com/tree-sitter/tree-sitter-javascript 6 | git clone https://github.com/tree-sitter/tree-sitter-python 7 | git clone https://github.com/tree-sitter/tree-sitter-ruby 8 | git clone https://github.com/tree-sitter/tree-sitter-php 9 | git clone https://github.com/tree-sitter/tree-sitter-java 10 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 11 | python build.py 12 | -------------------------------------------------------------------------------- /LongCoder/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/LongCoder/parser/my-languages.so -------------------------------------------------------------------------------- /LongCoder/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string' or root_node.type=='comment' or 'comment' in root_node.type): 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string' or root_node.type=='comment' or 'comment' in root_node.type): 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=" "+code[i] 96 | s+=" "+code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /LongCoder/run.sh: -------------------------------------------------------------------------------- 1 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 2 | pip install --upgrade scipy transformers tqdm fuzzywuzzy tree_sitter datasets 3 | 4 | lang=$1 #programming language 5 | lr=2e-4 6 | batch_size=16 7 | beam_size=5 8 | source_length=3968 9 | target_length=128 10 | global_length=64 11 | window_size=512 12 | output_dir=saved_models/$1 13 | epochs=10 14 | pretrained_model=microsoft/longcoder-base 15 | 16 | mkdir -p 
$output_dir 17 | 18 | python run.py \ 19 | --do_train \ 20 | --do_eval \ 21 | --lang $1 \ 22 | --output_dir $output_dir \ 23 | --model_name_or_path $pretrained_model \ 24 | --filename microsoft/LCC_$1 \ 25 | --max_source_length $source_length \ 26 | --max_target_length $target_length \ 27 | --max_global_length $global_length \ 28 | --window_size $window_size \ 29 | --beam_size $beam_size \ 30 | --train_batch_size $batch_size \ 31 | --eval_batch_size $batch_size \ 32 | --learning_rate $lr \ 33 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 34 | 35 | 36 | 37 | 38 | 39 | reload_model=$output_dir/checkpoint-best-acc/model.bin 40 | python run.py \ 41 | --do_test \ 42 | --lang $1 \ 43 | --load_model_path $reload_model \ 44 | --model_name_or_path $pretrained_model \ 45 | --filename microsoft/LCC_$1 \ 46 | --output_dir $output_dir \ 47 | --max_source_length $source_length \ 48 | --max_target_length $target_length \ 49 | --max_global_length $global_length \ 50 | --window_size $window_size \ 51 | --beam_size $beam_size \ 52 | --train_batch_size $batch_size \ 53 | --eval_batch_size $batch_size \ 54 | --learning_rate $lr \ 55 | --num_train_epochs $epochs 2>&1| tee $output_dir/test.log 56 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection (BigCloneBench) 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | mkdir dataset 7 | cd dataset 8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/data.jsonl 9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/test.txt 10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/train.txt 11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/valid.txt 12 | cd .. 13 | 14 | ``` 15 | 16 | ## Dependency 17 | 18 | - pip install torch 19 | - pip install transformers 20 | 21 | ## Fine-Tune 22 | 23 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper. 24 | 25 | ```shell 26 | # Training 27 | python run.py \ 28 | --output_dir saved_models \ 29 | --model_name_or_path microsoft/unixcoder-base \ 30 | --do_train \ 31 | --train_data_file dataset/train.txt \ 32 | --eval_data_file dataset/valid.txt \ 33 | --num_train_epochs 1 \ 34 | --block_size 512 \ 35 | --train_batch_size 16 \ 36 | --eval_batch_size 32 \ 37 | --learning_rate 5e-5 \ 38 | --max_grad_norm 1.0 \ 39 | --seed 123456 40 | 41 | # Evaluating 42 | python run.py \ 43 | --output_dir saved_models \ 44 | --model_name_or_path microsoft/unixcoder-base \ 45 | --do_test \ 46 | --test_data_file dataset/test.txt \ 47 | --num_train_epochs 1 \ 48 | --block_size 512 \ 49 | --train_batch_size 16 \ 50 | --eval_batch_size 32 \ 51 | --learning_rate 5e-5 \ 52 | --max_grad_norm 1.0 \ 53 | --seed 123456 54 | ``` 55 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | import torch 4 | import torch.nn as nn 5 | 6 | from torch.autograd import Variable 7 | import copy 8 | import torch.nn.functional as F 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | 11 | class RobertaClassificationHead(nn.Module): 12 | """Head for sentence-level classification tasks.""" 13 | 14 | def __init__(self, config): 15 | super().__init__() 16 | self.dense = nn.Linear(config.hidden_size*2, config.hidden_size) 17 | self.dropout = nn.Dropout(0.1) 18 | self.out_proj = nn.Linear(config.hidden_size, 2) 19 | 20 | def forward(self, x): 21 | x = x.reshape(-1,x.size(-1)*2) 22 | x = self.dropout(x) 23 | x = self.dense(x) 24 | x = torch.tanh(x) 25 | x = self.dropout(x) 26 | x = self.out_proj(x) 27 | return x 28 | 29 | class Model(nn.Module): 30 | def __init__(self, encoder,config,tokenizer,args): 31 | super(Model, self).__init__() 32 | self.encoder = encoder 33 | self.config = config 34 | self.tokenizer = tokenizer 35 | self.classifier = RobertaClassificationHead(config) 36 | self.args = args 37 | 38 | 39 | def forward(self, input_ids=None,labels=None): 40 | input_ids = input_ids.view(-1,self.args.block_size)  # (2*bs, block_size): each example is a pair of code snippets 41 | outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0] 42 | outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 43 | outputs = outputs.reshape(-1,2,outputs.size(-1)) 44 | outputs = torch.nn.functional.normalize(outputs, p=2, dim=-1) 45 | cos_sim = (outputs[:,0]*outputs[:,1]).sum(-1)  # cosine similarity between the two snippets of each pair 46 | 47 | if labels is not None: 48 | loss = ((cos_sim-labels.float())**2).mean()  # regress the similarity onto the 0/1 clone label 49 | return loss,cos_sim 50 | else: 51 | return cos_sim 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/run.sh: -------------------------------------------------------------------------------- 1 | model=../../../../pretrained-model/UniXcoder-base 2 | mkdir saved_models 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \ 4 | --output_dir=./saved_models \ 5 | --model_type=roberta \ 6 | --model_name_or_path=$model \ 7 | --do_train \ 8 | --train_data_file=../../dataset/train.txt \ 9 | --eval_data_file=../../dataset/valid.txt \ 10 | --test_data_file=../../dataset/test.txt \ 11 | --epoch 1 \ 12 | --block_size 512 \ 13 | --train_batch_size 16 \ 14 | --eval_batch_size 32 \ 15 | --learning_rate 5e-5 \ 16 | --max_grad_norm 1.0 \ 17 | --evaluate_during_training \ 18 | --seed 123456 2>&1| tee saved_models/train.log 19 | 20 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \ 21 | --output_dir=./saved_models \ 22 | --model_type=roberta \ 23 | --model_name_or_path=$model \ 24 | --do_eval \ 25 | --do_test \ 26 | --train_data_file=../../dataset/train.txt \ 27 | --eval_data_file=../../dataset/valid.txt \ 28 | --test_data_file=../../dataset/test.txt \ 29 | --epoch 1 \ 30 | --block_size 512 \ 31 | --train_batch_size 16 \ 32 | --eval_batch_size 32 \ 33 | --learning_rate 5e-5 \ 34 | --max_grad_norm 1.0 \ 35 | --evaluate_during_training \ 36 | --seed 123456 2>&1| tee saved_models/test.log 37 | 38 | python ../evaluator/evaluator.py -a ../../dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log 39 | 
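For a quick smoke test of the wrapper above, here is a minimal sketch of scoring one candidate clone pair. It assumes `microsoft/unixcoder-base` as the encoder, RoBERTa's pad id 1 (which the masking above relies on), and a toy `args` carrying `block_size`; the full pipeline lives in `run.py`.

```python
import torch
from types import SimpleNamespace
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Model is the wrapper defined in model.py above.
config = AutoConfig.from_pretrained("microsoft/unixcoder-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
encoder = AutoModel.from_pretrained("microsoft/unixcoder-base")
model = Model(encoder, config, tokenizer, SimpleNamespace(block_size=512))

def encode(code, block_size=512):
    # Truncate/pad one snippet to exactly block_size tokens (pad id is 1).
    ids = tokenizer.encode(code, truncation=True, max_length=block_size)
    return ids + [tokenizer.pad_token_id] * (block_size - len(ids))

# One example = the two snippets of a pair, concatenated along the sequence dim.
pair = torch.tensor([encode("int add(int a, int b) { return a + b; }")
                     + encode("int sum(int x, int y) { return x + y; }")])
with torch.no_grad():
    cos_sim = model(pair)
print(cos_sim.item())  # closer to 1 -> more likely a clone
```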
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection (POJ-104) 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | cd dataset 7 | pip install gdown 8 | gdown https://drive.google.com/uc?id=0B2i-vWnOu7MxVlJwQXN6eVNONUU 9 | tar -xvf programs.tar.gz 10 | python preprocess.py 11 | cd .. 12 | ``` 13 | 14 | ## Dependency 15 | 16 | - pip install torch 17 | - pip install transformers 18 | 19 | ## Fine-Tune 20 | 21 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper. 22 | 23 | ```shell 24 | # Training 25 | python run.py \ 26 | --output_dir saved_models \ 27 | --model_name_or_path microsoft/unixcoder-base \ 28 | --do_train \ 29 | --train_data_file dataset/train.jsonl \ 30 | --eval_data_file dataset/valid.jsonl \ 31 | --test_data_file dataset/test.jsonl \ 32 | --num_train_epochs 2 \ 33 | --block_size 400 \ 34 | --train_batch_size 8 \ 35 | --eval_batch_size 16 \ 36 | --learning_rate 2e-5 \ 37 | --max_grad_norm 1.0 \ 38 | --seed 123456 39 | 40 | # Evaluating 41 | python run.py \ 42 | --output_dir saved_models \ 43 | --model_name_or_path microsoft/unixcoder-base \ 44 | --do_eval \ 45 | --do_test \ 46 | --eval_data_file dataset/valid.jsonl \ 47 | --test_data_file dataset/test.jsonl \ 48 | --num_train_epochs 2 \ 49 | --block_size 400 \ 50 | --train_batch_size 8 \ 51 | --eval_batch_size 16 \ 52 | --learning_rate 2e-5 \ 53 | --max_grad_norm 1.0 \ 54 | --seed 123456 55 | ``` 56 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | def files(path): 7 | g = os.walk(path) 8 | file=[] 9 | for path,dir_list,file_list in g: 10 | for file_name in file_list: 11 | file.append(os.path.join(path, file_name)) 12 | return file 13 | 14 | cont=0 15 | with open("train.jsonl",'w') as f: 16 | for i in tqdm(range(1,65),total=64):  # problems 1-64 -> train 17 | items=files("ProgramData/{}".format(i)) 18 | for item in items: 19 | js={} 20 | js['label']=item.split('/')[1] 21 | js['index']=str(cont) 22 | js['code']=open(item,encoding='latin-1').read() 23 | f.write(json.dumps(js)+'\n') 24 | cont+=1 25 | 26 | with open("valid.jsonl",'w') as f: 27 | for i in tqdm(range(65,81),total=16):  # problems 65-80 -> valid 28 | items=files("ProgramData/{}".format(i)) 29 | for item in items: 30 | js={} 31 | js['label']=item.split('/')[1] 32 | js['index']=str(cont) 33 | js['code']=open(item,encoding='latin-1').read() 34 | f.write(json.dumps(js)+'\n') 35 | cont+=1 36 | 37 | with open("test.jsonl",'w') as f: 38 | for i in tqdm(range(81,105),total=24):  # problems 81-104 -> test (POJ-104 has 104 problem folders) 39 | items=files("ProgramData/{}".format(i)) 40 | for item in items: 41 | js={} 42 | js['label']=item.split('/')[1] 43 | js['index']=str(cont) 44 | js['code']=open(item,encoding='latin-1').read() 45 | f.write(json.dumps(js)+'\n') 46 | cont+=1 -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | import torch 4 | import torch.nn as nn 5 | 6 | from torch.autograd import Variable 7 | import copy 8 | import torch.nn.functional as F 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | 11 | 12 | 13 | class Model(nn.Module): 14 | def __init__(self, encoder,config,tokenizer,args): 15 | super(Model, self).__init__() 16 | self.encoder = encoder 17 | self.config=config 18 | self.tokenizer=tokenizer 19 | self.args=args 20 | 21 | 22 | def forward(self, input_ids=None,p_input_ids=None,n_input_ids=None,labels=None): 23 | bs,_ = input_ids.size() 24 | input_ids = torch.cat((input_ids,p_input_ids,n_input_ids),0)  # stack anchors, positives and negatives into one batch 25 | 26 | outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0] 27 | outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 28 | outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) 29 | outputs = outputs.split(bs,0)  # (anchor, positive, negative) vectors 30 | 31 | prob_1 = (outputs[0]*outputs[1]).sum(-1)*20  # anchor-positive similarity, scaled by temperature 20 32 | prob_2 = (outputs[0]*outputs[2]).sum(-1)*20  # anchor-negative similarity 33 | temp = torch.cat((outputs[0],outputs[1]),0) 34 | temp_labels = torch.cat((labels,labels),0) 35 | prob_3 = torch.mm(outputs[0],temp.t())*20  # in-batch similarities used as extra negatives 36 | mask = labels[:,None]==temp_labels[None,:] 37 | prob_3 = prob_3*(1-mask.float())-1e9*mask.float()  # mask out in-batch pairs from the same problem 38 | 39 | prob = torch.softmax(torch.cat((prob_1[:,None],prob_2[:,None],prob_3),-1),-1) 40 | loss = torch.log(prob[:,0]+1e-10)  # InfoNCE-style: maximize the probability of the positive 41 | loss = -loss.mean() 42 | return loss,outputs[0] 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/README.md: -------------------------------------------------------------------------------- 1 | # Code Completion 2 | 3 | ## Dependency 4 | 5 | - pip install torch 6 | - pip install transformers 7 | - pip install javalang 8 | 9 | ## Data Download 10 | 11 | ```bash 12 | unzip dataset.zip 13 | 14 | cd dataset/javaCorpus/ 15 | bash download.sh 16 | python preprocess.py --base_dir=token_completion --output_dir=./ 17 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/javaCorpus/line_completion/test.json 18 | 19 | cd ../py150 20 | bash download.sh 21 | python preprocess.py --base_dir=py150_files --output_dir=./ 22 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/py150/line_completion/test.json 23 | 24 | cd ../.. 25 | ``` 26 | 27 | 28 | 29 | ## Fine-Tune Setting 30 | 31 | Here we provide fine-tune settings for code completion, whose results are reported in the paper. 
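Before launching a full fine-tuning run, it can be useful to sanity-check the pre-trained checkpoint in zero-shot mode, as sketched below with the repo's top-level `unixcoder.py` wrapper (the wrapper API follows the main UniXcoder README; the context string and generation settings are illustrative assumptions):

```python
import torch
from unixcoder import UniXcoder  # UniXcoder/unixcoder.py in this repo

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base").to(device)

context = "def fibonacci(n):"
# Decoder-only mode: continue generating from the left context.
tokens_ids = model.tokenize([context], max_length=512, mode="<decoder-only>")
source_ids = torch.tensor(tokens_ids).to(device)
prediction_ids = model.generate(source_ids, decoder_only=True, beam_size=3, max_length=64)
predictions = model.decode(prediction_ids)
print(context + predictions[0][0])  # best beam for the first (only) input
```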
32 | 33 | #### JavaCorpus Dataset 34 | 35 | ```shell 36 | # Training 37 | python run.py \ 38 | --do_train \ 39 | --do_eval \ 40 | --lang java \ 41 | --model_name_or_path microsoft/unixcoder-base \ 42 | --train_filename dataset/javaCorpus/train.txt \ 43 | --dev_filename dataset/javaCorpus/dev.json \ 44 | --output_dir saved_models/javaCorpus \ 45 | --max_source_length 936 \ 46 | --max_target_length 64 \ 47 | --beam_size 5 \ 48 | --train_batch_size 32 \ 49 | --gradient_accumulation_steps 1 \ 50 | --eval_batch_size 32 \ 51 | --learning_rate 2e-5 \ 52 | --num_train_epochs 10 53 | 54 | # Output predictions of test set 55 | python run.py \ 56 | --do_test \ 57 | --lang java \ 58 | --model_name_or_path microsoft/unixcoder-base \ 59 | --load_model_path saved_models/javaCorpus/checkpoint-best-acc/pytorch_model.bin \ 60 | --test_filename dataset/javaCorpus/test.json \ 61 | --output_dir saved_models/javaCorpus \ 62 | --max_source_length 936 \ 63 | --max_target_length 64 \ 64 | --beam_size 5 \ 65 | --eval_batch_size 32 66 | ``` 67 | 68 | Prediction results on the test set are saved to ```saved_models/javaCorpus/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 69 | 70 | 71 | #### PY150 Dataset 72 | 73 | ```shell 74 | # Training 75 | python run.py \ 76 | --do_train \ 77 | --do_eval \ 78 | --lang python \ 79 | --model_name_or_path microsoft/unixcoder-base \ 80 | --train_filename dataset/py150/train.txt \ 81 | --dev_filename dataset/py150/dev.json \ 82 | --output_dir saved_models/py150 \ 83 | --max_source_length 936 \ 84 | --max_target_length 64 \ 85 | --beam_size 5 \ 86 | --train_batch_size 32 \ 87 | --gradient_accumulation_steps 1 \ 88 | --eval_batch_size 32 \ 89 | --learning_rate 2e-4 \ 90 | --num_train_epochs 10 91 | 92 | # Output predictions of test set 93 | python run.py \ 94 | --do_test \ 95 | --lang python \ 96 | --model_name_or_path microsoft/unixcoder-base \ 97 | --load_model_path saved_models/py150/checkpoint-best-acc/pytorch_model.bin \ 98 | --test_filename dataset/py150/test.json \ 99 | --output_dir saved_models/py150 \ 100 | --max_source_length 936 \ 101 | --max_target_length 64 \ 102 | --beam_size 5 \ 103 | --eval_batch_size 32 104 | ``` 105 | 106 | Prediction results on the test set are saved to ```saved_models/py150/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 107 | 108 | 109 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CodeBERT/c0de43d3aaf38e89290f1efb771f8de845e7a489/UniXcoder/downstream-tasks/code-completion/dataset.zip -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/README.md: -------------------------------------------------------------------------------- 1 | # Code Generation 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | mkdir dataset 7 | cd dataset 8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/train.json 9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/dev.json 10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/test.json 11 | cd .. 
12 | ``` 13 | 14 | ## Dependency 15 | 16 | - pip install torch 17 | - pip install transformers 18 | 19 | ## Fine-Tune Setting 20 | 21 | Here we provide fine-tune settings for code generation, whose results are reported in the paper. 22 | 23 | ```shell 24 | # Training 25 | python run.py \ 26 | --do_train \ 27 | --do_eval \ 28 | --model_name_or_path microsoft/unixcoder-base \ 29 | --train_filename dataset/train.json \ 30 | --dev_filename dataset/dev.json \ 31 | --output_dir saved_models \ 32 | --max_source_length 350 \ 33 | --max_target_length 150 \ 34 | --beam_size 3 \ 35 | --train_batch_size 32 \ 36 | --eval_batch_size 32 \ 37 | --learning_rate 5e-5 \ 38 | --gradient_accumulation_steps 1 \ 39 | --num_train_epochs 30 40 | 41 | # Output results 42 | python run.py \ 43 | --do_test \ 44 | --model_name_or_path microsoft/unixcoder-base \ 45 | --test_filename dataset/test.json \ 46 | --output_dir saved_models \ 47 | --max_source_length 350 \ 48 | --max_target_length 150 \ 49 | --beam_size 3 \ 50 | --train_batch_size 32 \ 51 | --eval_batch_size 32 \ 52 | --learning_rate 5e-5 \ 53 | --gradient_accumulation_steps 1 \ 54 | --num_train_epochs 30 55 | ``` 56 | 57 | Prediction results on the test set are saved to ```saved_models/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 58 | 59 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams up to a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | methods. 35 | 36 | Returns: 37 | The Counter containing all n-grams up to max_order in segment 38 | with a count of how many times each n-gram occurred. 
39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2)
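As a tiny self-check of the smoothed BLEU above (the toy token lists are assumptions; `_bleu` applies the same computation to files of whitespace-tokenized lines):

```python
# assumes the bleu.py above is importable
from bleu import compute_bleu

refs = [[["the", "cat", "sat"]]]  # one list of references per translation
hyps = [["the", "cat", "sat"]]
bleu, *_ = compute_bleu(refs, hyps, max_order=4, smooth=True)
print(round(100 * bleu, 2))  # 100.0 for an exact match
```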
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/run.sh: -------------------------------------------------------------------------------- 1 | pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html > log.txt 2>&1 2 | pip install scikit-learn scipy transformers tqdm > log.txt 2>&1 3 | export CUDA_VISIBLE_DEVICES=15,12,13,14 4 | lang=java #programming language 5 | lr=5e-5 6 | batch_size=32 7 | accm_steps=1 8 | beam_size=3 9 | source_length=512 10 | target_length=150 11 | data_dir=../../dataset 12 | output_dir=saved_models/$lang 13 | train_file=$data_dir/train.json 14 | dev_file=$data_dir/dev.json 15 | epochs=30 16 | pretrained_model=../../../pretrained-model/UniXcoder-base/ 17 | 18 | mkdir -p $output_dir 19 | python run.py \ 20 | --do_train \ 21 | --do_eval \ 22 | --model_name_or_path $pretrained_model \ 23 | --train_filename $train_file \ 24 | --dev_filename $dev_file \ 25 | --tokenizer_name roberta-base \ 26 | --output_dir $output_dir \ 27 | --max_source_length $source_length \ 28 | --max_target_length $target_length \ 29 | --beam_size $beam_size \ 30 | --train_batch_size $batch_size \ 31 | --eval_batch_size $batch_size \ 32 | --learning_rate $lr \ 33 | --gradient_accumulation_steps $accm_steps \ 34 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 35 | 36 | 37 | batch_size=64 38 | dev_file=$data_dir/dev.json 39 | test_file=$data_dir/test.json 40 | test_model=$output_dir/checkpoint-best-score/pytorch_model.bin #checkpoint for test 41 | 42 | python run.py \ 43 | --do_test \ 44 | --model_name_or_path $pretrained_model \ 45 | --load_model_path $test_model \ 46 | --dev_filename $dev_file \ 47 | --test_filename $test_file \ 48 | --output_dir $output_dir \ 49 | --max_source_length $source_length \ 50 | --max_target_length $target_length \ 51 | --beam_size $beam_size \ 52 | --gradient_accumulation_steps $accm_steps \ 53 | --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log 54 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Code Search 4 | 5 | ## Data Download 6 | 7 | #### 1. AdvTest dataset 8 | 9 | ```bash 10 | mkdir dataset && cd dataset 11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/NL-code-search-Adv/dataset.zip 12 | unzip dataset.zip && rm -r dataset.zip && mv dataset AdvTest && cd AdvTest 13 | wget https://zenodo.org/record/7857872/files/python.zip 14 | unzip python.zip && python preprocess.py && rm -r python && rm -r *.pkl && rm python.zip 15 | cd ../.. 16 | ``` 17 | 18 | #### 2. CosQA dataset 19 | 20 | ```bash 21 | cd dataset 22 | mkdir cosqa && cd cosqa 23 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/code_idx_map.txt 24 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-dev-500.json 25 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-test-500.json 26 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-train-19604.json 27 | cd ../.. 28 | ``` 29 | 30 | #### 3. CSN dataset 31 | 32 | ```bash 33 | cd dataset 34 | wget https://github.com/microsoft/CodeBERT/raw/master/GraphCodeBERT/codesearch/dataset.zip 35 | unzip dataset.zip && rm -r dataset.zip && mv dataset CSN && cd CSN 36 | bash run.sh 37 | cd ../.. 38 | ``` 39 | 40 | 41 | 42 | ## Dependency 43 | 44 | - pip install torch 45 | - pip install transformers 46 | 47 | ## Zero-Shot Setting 48 | 49 | We first provide scripts for zero-shot code search. To rank candidates, we use the cosine similarity between the UniXcoder hidden states of the code and the natural language query. 50 | 51 | #### 1. AdvTest dataset 52 | 53 | ```bash 54 | python run.py \ 55 | --output_dir saved_models/AdvTest \ 56 | --model_name_or_path microsoft/unixcoder-base \ 57 | --do_zero_shot \ 58 | --do_test \ 59 | --test_data_file dataset/AdvTest/test.jsonl \ 60 | --codebase_file dataset/AdvTest/test.jsonl \ 61 | --num_train_epochs 2 \ 62 | --code_length 256 \ 63 | --nl_length 128 \ 64 | --train_batch_size 64 \ 65 | --eval_batch_size 64 \ 66 | --learning_rate 2e-5 \ 67 | --seed 123456 68 | ``` 69 | 70 | #### 2. CosQA dataset 71 | 72 | ```bash 73 | python run.py \ 74 | --output_dir saved_models/cosqa \ 75 | --model_name_or_path microsoft/unixcoder-base \ 76 | --do_zero_shot \ 77 | --do_test \ 78 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \ 79 | --codebase_file dataset/cosqa/code_idx_map.txt \ 80 | --num_train_epochs 10 \ 81 | --code_length 256 \ 82 | --nl_length 128 \ 83 | --train_batch_size 64 \ 84 | --eval_batch_size 64 \ 85 | --learning_rate 2e-5 \ 86 | --seed 123456 87 | ``` 88 | 89 | #### 3. CSN dataset 90 | 91 | ```bash 92 | lang=python 93 | python run.py \ 94 | --output_dir saved_models/CSN/$lang \ 95 | --model_name_or_path microsoft/unixcoder-base \ 96 | --do_zero_shot \ 97 | --do_test \ 98 | --test_data_file dataset/CSN/$lang/test.jsonl \ 99 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 100 | --num_train_epochs 10 \ 101 | --code_length 256 \ 102 | --nl_length 128 \ 103 | --train_batch_size 64 \ 104 | --eval_batch_size 64 \ 105 | --learning_rate 2e-5 \ 106 | --seed 123456 107 | ``` 108 | 109 | 110 | 111 | ## Fine-Tune Setting 112 | 113 | Here we provide fine-tune settings for code search, whose results are reported in the paper. 114 | 115 | #### 1. 
AdvTest dataset 116 | 117 | ```shell 118 | # Training 119 | python run.py \ 120 | --output_dir saved_models/AdvTest \ 121 | --model_name_or_path microsoft/unixcoder-base \ 122 | --do_train \ 123 | --train_data_file dataset/AdvTest/train.jsonl \ 124 | --eval_data_file dataset/AdvTest/valid.jsonl \ 125 | --codebase_file dataset/AdvTest/valid.jsonl \ 126 | --num_train_epochs 2 \ 127 | --code_length 256 \ 128 | --nl_length 128 \ 129 | --train_batch_size 64 \ 130 | --eval_batch_size 64 \ 131 | --learning_rate 2e-5 \ 132 | --seed 123456 133 | 134 | # Evaluating 135 | python run.py \ 136 | --output_dir saved_models/AdvTest \ 137 | --model_name_or_path microsoft/unixcoder-base \ 138 | --do_test \ 139 | --test_data_file dataset/AdvTest/test.jsonl \ 140 | --codebase_file dataset/AdvTest/test.jsonl \ 141 | --num_train_epochs 2 \ 142 | --code_length 256 \ 143 | --nl_length 128 \ 144 | --train_batch_size 64 \ 145 | --eval_batch_size 64 \ 146 | --learning_rate 2e-5 \ 147 | --seed 123456 148 | ``` 149 | #### 2. CosQA dataset 150 | 151 | ```bash 152 | # Training 153 | python run.py \ 154 | --output_dir saved_models/cosqa \ 155 | --model_name_or_path microsoft/unixcoder-base \ 156 | --do_train \ 157 | --train_data_file dataset/cosqa/cosqa-retrieval-train-19604.json \ 158 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \ 159 | --codebase_file dataset/cosqa/code_idx_map.txt \ 160 | --num_train_epochs 10 \ 161 | --code_length 256 \ 162 | --nl_length 128 \ 163 | --train_batch_size 64 \ 164 | --eval_batch_size 64 \ 165 | --learning_rate 2e-5 \ 166 | --seed 123456 167 | 168 | # Evaluating 169 | python run.py \ 170 | --output_dir saved_models/cosqa \ 171 | --model_name_or_path microsoft/unixcoder-base \ 172 | --do_eval \ 173 | --do_test \ 174 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \ 175 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \ 176 | --codebase_file dataset/cosqa/code_idx_map.txt \ 177 | --num_train_epochs 10 \ 178 | --code_length 256 \ 179 | --nl_length 128 \ 180 | --train_batch_size 64 \ 181 | --eval_batch_size 64 \ 182 | --learning_rate 2e-5 \ 183 | --seed 123456 184 | ``` 185 | 186 | #### 3. CSN dataset 187 | 188 | ```bash 189 | # Training 190 | lang=python 191 | python run.py \ 192 | --output_dir saved_models/CSN/$lang \ 193 | --model_name_or_path microsoft/unixcoder-base \ 194 | --do_train \ 195 | --train_data_file dataset/CSN/$lang/train.jsonl \ 196 | --eval_data_file dataset/CSN/$lang/valid.jsonl \ 197 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 198 | --num_train_epochs 10 \ 199 | --code_length 256 \ 200 | --nl_length 128 \ 201 | --train_batch_size 64 \ 202 | --eval_batch_size 64 \ 203 | --learning_rate 2e-5 \ 204 | --seed 123456 205 | 206 | # Evaluating 207 | python run.py \ 208 | --output_dir saved_models/CSN/$lang \ 209 | --model_name_or_path microsoft/unixcoder-base \ 210 | --do_eval \ 211 | --do_test \ 212 | --eval_data_file dataset/CSN/$lang/valid.jsonl \ 213 | --test_data_file dataset/CSN/$lang/test.jsonl \ 214 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 215 | --num_train_epochs 10 \ 216 | --code_length 256 \ 217 | --nl_length 128 \ 218 | --train_batch_size 64 \ 219 | --eval_batch_size 64 \ 220 | --learning_rate 2e-5 \ 221 | --seed 123456 222 | 223 | ``` 224 | 225 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs*code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(-1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1)  # unit-norm so dot products are cosine similarities 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs*nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(-1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-summarization/README.md: -------------------------------------------------------------------------------- 1 | # Code Summarization 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Text/code-to-text/dataset.zip 7 | unzip dataset.zip 8 | rm dataset.zip 9 | cd dataset 10 | wget https://zenodo.org/record/7857872/files/python.zip 11 | wget https://zenodo.org/record/7857872/files/java.zip 12 | wget https://zenodo.org/record/7857872/files/ruby.zip 13 | wget https://zenodo.org/record/7857872/files/javascript.zip 14 | wget https://zenodo.org/record/7857872/files/go.zip 15 | wget https://zenodo.org/record/7857872/files/php.zip 16 | 17 | unzip python.zip 18 | unzip java.zip 19 | unzip ruby.zip 20 | unzip javascript.zip 21 | unzip go.zip 22 | unzip php.zip 23 | rm *.zip 24 | rm *.pkl 25 | 26 | python preprocess.py 27 | rm -r */final 28 | cd .. 29 | ``` 30 | 31 | ## Dependency 32 | 33 | - pip install torch 34 | - pip install transformers 35 | 36 | ## Fine-Tune Setting 37 | 38 | Here we provide fine-tune settings for code summarization, whose results are reported in the paper. 39 | 40 | ```shell 41 | lang=python 42 | 43 | # Training 44 | python run.py \ 45 | --do_train \ 46 | --do_eval \ 47 | --model_name_or_path microsoft/unixcoder-base \ 48 | --train_filename dataset/$lang/train.jsonl \ 49 | --dev_filename dataset/$lang/valid.jsonl \ 50 | --output_dir saved_models/$lang \ 51 | --max_source_length 256 \ 52 | --max_target_length 128 \ 53 | --beam_size 10 \ 54 | --train_batch_size 48 \ 55 | --eval_batch_size 48 \ 56 | --learning_rate 5e-5 \ 57 | --gradient_accumulation_steps 2 \ 58 | --num_train_epochs 10 59 | 60 | # Evaluating 61 | python run.py \ 62 | --do_test \ 63 | --model_name_or_path microsoft/unixcoder-base \ 64 | --test_filename dataset/$lang/test.jsonl \ 65 | --output_dir saved_models/$lang \ 66 | --max_source_length 256 \ 67 | --max_target_length 128 \ 68 | --beam_size 10 \ 69 | --train_batch_size 48 \ 70 | --eval_batch_size 48 \ 71 | --learning_rate 5e-5 \ 72 | --gradient_accumulation_steps 2 \ 73 | --num_train_epochs 10 74 | ``` 75 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Zero-shot Code-to-Code Search 4 | 5 | Given a piece of source code as the query, the task aims to retrieve code with the same semantics from a collection of candidates in a zero-shot setting. 
We collect 11,744/15,594/23,530 functions from the [CodeNet](https://github.com/IBM/Project_CodeNet) corpus in Ruby/Python/Java. Each function solves one of 4,053 problems. 6 | 7 | 8 | 9 | ## Data Download 10 | 11 | ```bash 12 | cd dataset 13 | wget https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet.tar.gz 14 | tar -xvf Project_CodeNet.tar.gz 15 | python preprocess.py 16 | cd .. 17 | ``` 18 | 19 | 20 | 21 | ## Dependency 22 | 23 | - pip install torch 24 | - pip install transformers 25 | 26 | 27 | 28 | ## Zero-Shot Setting 29 | 30 | ```bash 31 | source_lang=ruby 32 | target_lang=python 33 | python run.py \ 34 | --model_name_or_path microsoft/unixcoder-base \ 35 | --query_data_file dataset/${source_lang}_with_func.jsonl \ 36 | --candidate_data_file dataset/${target_lang}_with_func.jsonl \ 37 | --query_lang ${source_lang} \ 38 | --candidate_lang ${target_lang} \ 39 | --code_length 512 \ 40 | --eval_batch_size 256 41 | ``` 42 | 43 | 44 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | for lang,suffix in [("Java",".java"),("Ruby",".rb"),("Python",".py")]: 4 | with open("{}.jsonl".format(lang.lower())) as f, open("{}_with_func.jsonl".format(lang.lower()),"w") as f1: 5 | for line in f: 6 | js = json.loads(line.strip()) 7 | problem_id = str(js["label"]) 8 | problem_id = "p" + "0" * (5-len(problem_id)) + problem_id 9 | language = lang 10 | submission_id = js["index"] 11 | func = open("Project_CodeNet/data/{}/{}/{}{}".format(problem_id,language,submission_id,suffix)).read() 12 | js["func"] = func 13 | f1.write(json.dumps(js)+"\n") 14 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None, cls=False): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs * code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens (pad id is 1) 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1)  # unit-norm so dot products are cosine similarities 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs * nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | 22 | --------------------------------------------------------------------------------
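To make the retrieval step concrete, here is a small sketch of comparing the unit-norm vectors produced by the `Model` wrapper above (the model name and 512-token budget mirror the script; the toy snippets are assumptions):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Model is the wrapper defined in model.py above.
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
model = Model(AutoModel.from_pretrained("microsoft/unixcoder-base"))

def embed(codes):
    batch = tokenizer(codes, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        return model(code_inputs=batch["input_ids"])

query = embed(["def add(a, b):\n    return a + b"])
candidates = embed(["public int add(int a, int b) { return a + b; }",
                    "print('hello world')"])
scores = query @ candidates.T        # dot products of unit vectors = cosine similarities
print(scores.argmax(dim=-1).item())  # expected 0: the Java add() is nearest
```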