├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CodeBERT ├── code2nl │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── codesearch │ ├── README.md │ ├── mrr.py │ ├── process_data.py │ ├── run_classifier.py │ └── utils.py ├── GraphCodeBERT ├── clonedetection │ ├── README.md │ ├── dataset.zip │ ├── evaluator │ │ ├── answers.txt │ │ ├── evaluator.py │ │ └── predictions.txt │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── codesearch │ ├── README.md │ ├── dataset.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── refinement │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py └── translation │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ ├── DFG.py │ ├── __init__.py │ ├── build.py │ ├── build.sh │ ├── my-languages.so │ └── utils.py │ └── run.py ├── LICENSE ├── NOTICE.md ├── README.md └── SECURITY.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /CodeBERT/code2nl/README.md: -------------------------------------------------------------------------------- 1 | # Code Documentation Generation 2 | 3 | This repo provides the code for reproducing the experiments on [CodeSearchNet](https://arxiv.org/abs/1909.09436) dataset for code document generation tasks in six programming languages. 4 | 5 | **!News: We release a new pipeline for this task. The new pipeline only needs 2 p100 GPUs and less training time for Code Documentation Generation. Please refer to the [website](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text).** 6 | 7 | ## Dependency 8 | 9 | - pip install torch==1.4.0 10 | - pip install transformers==2.5.0 11 | - pip install filelock 12 | 13 | ## Data Preprocess 14 | 15 | We clean CodeSearchNet dataset for this task by following steps: 16 | 17 | - Remove comments in the code 18 | - Remove examples that codes cannot be parsed into an abstract syntax tree. 
19 | - Remove examples that #tokens of documents is < 3 or >256 20 | - Remove examples that documents contain special tokens (e.g. or https:...) 21 | - Remove examples that documents are not English. 22 | 23 | Data statistic about the cleaned dataset for code document generation is shown in this Table. We release the cleaned dataset in this [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h). 24 | 25 | | PL | Training | Dev | Test | 26 | | :--------- | :------: | :----: | :----: | 27 | | Python | 251,820 | 13,914 | 14,918 | 28 | | PHP | 241,241 | 12,982 | 14,014 | 29 | | Go | 167,288 | 7,325 | 8,122 | 30 | | Java | 164,923 | 5,183 | 10,955 | 31 | | JavaScript | 58,025 | 3,885 | 3,291 | 32 | | Ruby | 24,927 | 1,400 | 1,261 | 33 | 34 | 35 | 36 | ## Data Download 37 | 38 | You can download dataset from the [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h). Or use the following command. 39 | 40 | ```shell 41 | pip install gdown 42 | mkdir data data/code2nl 43 | cd data/code2nl 44 | gdown https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h 45 | unzip Cleaned_CodeSearchNet.zip 46 | rm Cleaned_CodeSearchNet.zip 47 | cd ../.. 48 | ``` 49 | 50 | 51 | 52 | ## Fine-Tune 53 | 54 | We fine-tuned the model on 4*P40 GPUs. 55 | 56 | ```shell 57 | cd code2nl 58 | 59 | lang=php #programming language 60 | lr=5e-5 61 | batch_size=64 62 | beam_size=10 63 | source_length=256 64 | target_length=128 65 | data_dir=../data/code2nl/CodeSearchNet 66 | output_dir=model/$lang 67 | train_file=$data_dir/$lang/train.jsonl 68 | dev_file=$data_dir/$lang/valid.jsonl 69 | eval_steps=1000 #400 for ruby, 600 for javascript, 1000 for others 70 | train_steps=50000 #20000 for ruby, 30000 for javascript, 50000 for others 71 | pretrained_model=microsoft/codebert-base #Roberta: roberta-base 72 | 73 | python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --train_steps $train_steps --eval_steps $eval_steps 74 | ``` 75 | 76 | 77 | 78 | ## Inference and Evaluation 79 | 80 | After fine-tuning, inference and evaluation are as follows: 81 | 82 | ```shell 83 | lang=php #programming language 84 | beam_size=10 85 | batch_size=128 86 | source_length=256 87 | target_length=128 88 | output_dir=model/$lang 89 | data_dir=../data/code2nl/CodeSearchNet 90 | dev_file=$data_dir/$lang/valid.jsonl 91 | test_file=$data_dir/$lang/test.jsonl 92 | test_model=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test 93 | 94 | python run.py --do_test --model_type roberta --model_name_or_path microsoft/codebert-base --load_model_path $test_model --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size 95 | ``` 96 | 97 | The results on CodeSearchNet are shown in this Table: 98 | 99 | | Model | Ruby | Javascript | Go | Python | Java | PHP | Overall | 100 | | ----------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: | 101 | | Seq2Seq | 9.64 | 10.21 | 13.98 | 15.93 | 15.09 | 21.08 | 14.32 | 102 | | Transformer | 11.18 | 11.59 | 16.38 | 15.81 | 16.26 | 22.12 | 15.56 | 103 | | RoBERTa | 11.17 | 11.90 | 17.72 | 18.14 | 
16.47 | 24.02 | 16.57 | 104 | | CodeBERT | **12.16** | **14.90** | **18.07** | **19.06** | **17.65** | **25.16** | **17.83** | 105 | 106 | 107 | -------------------------------------------------------------------------------- /CodeBERT/code2nl/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ''' 4 | This script was adapted from the original version by hieuhoang1972 which is part of MOSES. 5 | ''' 6 | 7 | # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $ 8 | 9 | '''Provides: 10 | 11 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 12 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 13 | score_cooked(alltest, n=4): Score a list of cooked test sentences. 14 | 15 | score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids. 16 | 17 | The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible. 18 | ''' 19 | 20 | import sys, math, re, xml.sax.saxutils 21 | import subprocess 22 | import os 23 | 24 | # Added to bypass NIST-style pre-processing of hyp and ref files -- wade 25 | nonorm = 0 26 | 27 | preserve_case = False 28 | eff_ref_len = "shortest" 29 | 30 | normalize1 = [ 31 | ('', ''), # strip "skipped" tags 32 | (r'-\n', ''), # strip end-of-line hyphenation and join lines 33 | (r'\n', ' '), # join lines 34 | # (r'(\d)\s+(?=\d)', r'\1'), # join digits 35 | ] 36 | normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1] 37 | 38 | normalize2 = [ 39 | (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing 40 | (r'([^0-9])([\.,])',r'\1 \2 '), # tokenize period and comma unless preceded by a digit 41 | (r'([\.,])([^0-9])',r' \1 \2'), # tokenize period and comma unless followed by a digit 42 | (r'([0-9])(-)',r'\1 \2 ') # tokenize dash when preceded by a digit 43 | ] 44 | normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2] 45 | 46 | def normalize(s): 47 | '''Normalize and tokenize text. 
This is lifted from NIST mteval-v11a.pl.''' 48 | # Added to bypass NIST-style pre-processing of hyp and ref files -- wade 49 | if (nonorm): 50 | return s.split() 51 | if type(s) is not str: 52 | s = " ".join(s) 53 | # language-independent part: 54 | for (pattern, replace) in normalize1: 55 | s = re.sub(pattern, replace, s) 56 | s = xml.sax.saxutils.unescape(s, {'"':'"'}) 57 | # language-dependent part (assuming Western languages): 58 | s = " %s " % s 59 | if not preserve_case: 60 | s = s.lower() # this might not be identical to the original 61 | for (pattern, replace) in normalize2: 62 | s = re.sub(pattern, replace, s) 63 | return s.split() 64 | 65 | def count_ngrams(words, n=4): 66 | counts = {} 67 | for k in range(1,n+1): 68 | for i in range(len(words)-k+1): 69 | ngram = tuple(words[i:i+k]) 70 | counts[ngram] = counts.get(ngram, 0)+1 71 | return counts 72 | 73 | def cook_refs(refs, n=4): 74 | '''Takes a list of reference sentences for a single segment 75 | and returns an object that encapsulates everything that BLEU 76 | needs to know about them.''' 77 | 78 | refs = [normalize(ref) for ref in refs] 79 | maxcounts = {} 80 | for ref in refs: 81 | counts = count_ngrams(ref, n) 82 | for (ngram,count) in counts.items(): 83 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 84 | return ([len(ref) for ref in refs], maxcounts) 85 | 86 | def cook_test(test, item, n=4): 87 | '''Takes a test sentence and returns an object that 88 | encapsulates everything that BLEU needs to know about it.''' 89 | (reflens, refmaxcounts)=item 90 | test = normalize(test) 91 | result = {} 92 | result["testlen"] = len(test) 93 | 94 | # Calculate effective reference sentence length. 95 | 96 | if eff_ref_len == "shortest": 97 | result["reflen"] = min(reflens) 98 | elif eff_ref_len == "average": 99 | result["reflen"] = float(sum(reflens))/len(reflens) 100 | elif eff_ref_len == "closest": 101 | min_diff = None 102 | for reflen in reflens: 103 | if min_diff is None or abs(reflen-len(test)) < min_diff: 104 | min_diff = abs(reflen-len(test)) 105 | result['reflen'] = reflen 106 | 107 | result["guess"] = [max(len(test)-k+1,0) for k in range(1,n+1)] 108 | 109 | result['correct'] = [0]*n 110 | counts = count_ngrams(test, n) 111 | for (ngram, count) in counts.items(): 112 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 113 | 114 | return result 115 | 116 | def score_cooked(allcomps, n=4, ground=0, smooth=1): 117 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 118 | for comps in allcomps: 119 | for key in ['testlen','reflen']: 120 | totalcomps[key] += comps[key] 121 | for key in ['guess','correct']: 122 | for k in range(n): 123 | totalcomps[key][k] += comps[key][k] 124 | logbleu = 0.0 125 | all_bleus = [] 126 | for k in range(n): 127 | correct = totalcomps['correct'][k] 128 | guess = totalcomps['guess'][k] 129 | addsmooth = 0 130 | if smooth == 1 and k > 0: 131 | addsmooth = 1 132 | logbleu += math.log(correct + addsmooth + sys.float_info.min)-math.log(guess + addsmooth+ sys.float_info.min) 133 | if guess == 0: 134 | all_bleus.append(-10000000) 135 | else: 136 | all_bleus.append(math.log(correct + sys.float_info.min)-math.log( guess )) 137 | 138 | logbleu /= float(n) 139 | all_bleus.insert(0, logbleu) 140 | 141 | brevPenalty = min(0,1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1)) 142 | for i in range(len(all_bleus)): 143 | if i ==0: 144 | all_bleus[i] += brevPenalty 145 | all_bleus[i] = math.exp(all_bleus[i]) 146 | return all_bleus 147 | 148 | def bleu(refs, 
candidate, ground=0, smooth=1): 149 | refs = cook_refs(refs) 150 | test = cook_test(candidate, refs) 151 | return score_cooked([test], ground=ground, smooth=smooth) 152 | 153 | def splitPuncts(line): 154 | return ' '.join(re.findall(r"[\w]+|[^\s\w]", line)) 155 | 156 | def computeMaps(predictions, goldfile): 157 | predictionMap = {} 158 | goldMap = {} 159 | gf = open(goldfile, 'r') 160 | 161 | for row in predictions: 162 | cols = row.strip().split('\t') 163 | if len(cols) == 1: 164 | (rid, pred) = (cols[0], '') 165 | else: 166 | (rid, pred) = (cols[0], cols[1]) 167 | predictionMap[rid] = [splitPuncts(pred.strip().lower())] 168 | 169 | for row in gf: 170 | (rid, pred) = row.split('\t') 171 | if rid in predictionMap: # Only insert if the id exists for the method 172 | if rid not in goldMap: 173 | goldMap[rid] = [] 174 | goldMap[rid].append(splitPuncts(pred.strip().lower())) 175 | 176 | sys.stderr.write('Total: ' + str(len(goldMap)) + '\n') 177 | return (goldMap, predictionMap) 178 | 179 | 180 | #m1 is the reference map 181 | #m2 is the prediction map 182 | def bleuFromMaps(m1, m2): 183 | score = [0] * 5 184 | num = 0.0 185 | 186 | for key in m1: 187 | if key in m2: 188 | bl = bleu(m1[key], m2[key][0]) 189 | score = [ score[i] + bl[i] for i in range(0, len(bl))] 190 | num += 1 191 | return [s * 100.0 / num for s in score] 192 | 193 | if __name__ == '__main__': 194 | reference_file = sys.argv[1] 195 | predictions = [] 196 | for row in sys.stdin: 197 | predictions.append(row) 198 | (goldMap, predictionMap) = computeMaps(predictions, reference_file) 199 | print (bleuFromMaps(goldMap, predictionMap)[0]) 200 | 201 | -------------------------------------------------------------------------------- /CodeBERT/code2nl/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Build Seqence-to-Sequence. 12 | 13 | Parameters: 14 | 15 | * `encoder`- encoder of seq2seq model. e.g. roberta 16 | * `decoder`- decoder of seq2seq model. e.g. transformer 17 | * `config`- configuration of encoder model. 18 | * `beam_size`- beam size for beam search. 19 | * `max_length`- max length of target for beam search. 20 | * `sos_id`- start of symbol ids in target for beam search. 21 | * `eos_id`- end of symbol ids in target for beam search. 
22 | """ 23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None): 24 | super(Seq2Seq, self).__init__() 25 | self.encoder = encoder 26 | self.decoder=decoder 27 | self.config=config 28 | self.register_buffer("bias", torch.tril(torch.ones(2048, 2048))) 29 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 31 | self.lsm = nn.LogSoftmax(dim=-1) 32 | self.tie_weights() 33 | 34 | self.beam_size=beam_size 35 | self.max_length=max_length 36 | self.sos_id=sos_id 37 | self.eos_id=eos_id 38 | 39 | def _tie_or_clone_weights(self, first_module, second_module): 40 | """ Tie or clone module weights depending of weither we are using TorchScript or not 41 | """ 42 | if self.config.torchscript: 43 | first_module.weight = nn.Parameter(second_module.weight.clone()) 44 | else: 45 | first_module.weight = second_module.weight 46 | 47 | def tie_weights(self): 48 | """ Make sure we are sharing the input and output embeddings. 49 | Export to TorchScript can't handle parameter sharing so we are cloning them instead. 50 | """ 51 | self._tie_or_clone_weights(self.lm_head, 52 | self.encoder.embeddings.word_embeddings) 53 | 54 | def forward(self, source_ids=None,source_mask=None,target_ids=None,target_mask=None,args=None): 55 | outputs = self.encoder(source_ids, attention_mask=source_mask) 56 | encoder_output = outputs[0].permute([1,0,2]).contiguous() 57 | if target_ids is not None: 58 | attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]]) 59 | tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() 60 | out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool()) 61 | hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous() 62 | lm_logits = self.lm_head(hidden_states) 63 | # Shift so that tokens < n predict n 64 | active_loss = target_mask[..., 1:].ne(0).view(-1) == 1 65 | shift_logits = lm_logits[..., :-1, :].contiguous() 66 | shift_labels = target_ids[..., 1:].contiguous() 67 | # Flatten the tokens 68 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 69 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 70 | shift_labels.view(-1)[active_loss]) 71 | 72 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 73 | return outputs 74 | else: 75 | #Predict 76 | preds=[] 77 | zero=torch.cuda.LongTensor(1).fill_(0) 78 | for i in range(source_ids.shape[0]): 79 | context=encoder_output[:,i:i+1] 80 | context_mask=source_mask[i:i+1,:] 81 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 82 | input_ids=beam.getCurrentState() 83 | context=context.repeat(1, self.beam_size,1) 84 | context_mask=context_mask.repeat(self.beam_size,1) 85 | for _ in range(self.max_length): 86 | if beam.done(): 87 | break 88 | attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]]) 89 | tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous() 90 | out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool()) 91 | out = torch.tanh(self.dense(out)) 92 | hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:] 93 | out = self.lsm(self.lm_head(hidden_states)).data 94 | beam.advance(out) 95 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 96 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 97 | hyp= beam.getHyp(beam.getFinal()) 98 | 
pred=beam.buildTargetTokens(hyp)[:self.beam_size] 99 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 100 | preds.append(torch.cat(pred,0).unsqueeze(0)) 101 | 102 | preds=torch.cat(preds,0) 103 | return preds 104 | 105 | 106 | 107 | class Beam(object): 108 | def __init__(self, size,sos,eos): 109 | self.size = size 110 | self.tt = torch.cuda 111 | # The score for each translation on the beam. 112 | self.scores = self.tt.FloatTensor(size).zero_() 113 | # The backpointers at each time-step. 114 | self.prevKs = [] 115 | # The outputs at each time-step. 116 | self.nextYs = [self.tt.LongTensor(size) 117 | .fill_(0)] 118 | self.nextYs[0][0] = sos 119 | # Has EOS topped the beam yet. 120 | self._eos = eos 121 | self.eosTop = False 122 | # Time and k pair for finished. 123 | self.finished = [] 124 | 125 | def getCurrentState(self): 126 | "Get the outputs for the current timestep." 127 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 128 | return batch 129 | 130 | def getCurrentOrigin(self): 131 | "Get the backpointers for the current timestep." 132 | return self.prevKs[-1] 133 | 134 | def advance(self, wordLk): 135 | """ 136 | Given prob over words for every last beam `wordLk` and attention 137 | `attnOut`: Compute and update the beam search. 138 | 139 | Parameters: 140 | 141 | * `wordLk`- probs of advancing from the last step (K x words) 142 | * `attnOut`- attention at the last step 143 | 144 | Returns: True if beam search is complete. 145 | """ 146 | numWords = wordLk.size(1) 147 | 148 | # Sum the previous scores. 149 | if len(self.prevKs) > 0: 150 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 151 | 152 | # Don't let EOS have children. 153 | for i in range(self.nextYs[-1].size(0)): 154 | if self.nextYs[-1][i] == self._eos: 155 | beamLk[i] = -1e20 156 | else: 157 | beamLk = wordLk[0] 158 | flatBeamLk = beamLk.view(-1) 159 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 160 | 161 | self.scores = bestScores 162 | 163 | # bestScoresId is flattened beam x word array, so calculate which 164 | # word and beam each score came from 165 | prevK = bestScoresId // numWords 166 | self.prevKs.append(prevK) 167 | self.nextYs.append((bestScoresId - prevK * numWords)) 168 | 169 | 170 | for i in range(self.nextYs[-1].size(0)): 171 | if self.nextYs[-1][i] == self._eos: 172 | s = self.scores[i] 173 | self.finished.append((s, len(self.nextYs) - 1, i)) 174 | 175 | # End condition is when top-of-beam is EOS and no global score. 176 | if self.nextYs[-1][0] == self._eos: 177 | self.eosTop = True 178 | 179 | def done(self): 180 | return self.eosTop and len(self.finished) >=self.size 181 | 182 | def getFinal(self): 183 | if len(self.finished) == 0: 184 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 185 | self.finished.sort(key=lambda a: -a[0]) 186 | if len(self.finished) != self.size: 187 | unfinished=[] 188 | for i in range(self.nextYs[-1].size(0)): 189 | if self.nextYs[-1][i] != self._eos: 190 | s = self.scores[i] 191 | unfinished.append((s, len(self.nextYs) - 1, i)) 192 | unfinished.sort(key=lambda a: -a[0]) 193 | self.finished+=unfinished[:self.size-len(self.finished)] 194 | return self.finished[:self.size] 195 | 196 | def getHyp(self, beam_res): 197 | """ 198 | Walk back to construct the full hypothesis. 
199 | """ 200 | hyps=[] 201 | for _,timestep, k in beam_res: 202 | hyp = [] 203 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 204 | hyp.append(self.nextYs[j+1][k]) 205 | k = self.prevKs[j][k] 206 | hyps.append(hyp[::-1]) 207 | return hyps 208 | 209 | def buildTargetTokens(self, preds): 210 | sentence=[] 211 | for pred in preds: 212 | tokens = [] 213 | for tok in pred: 214 | if tok==self._eos: 215 | break 216 | tokens.append(tok) 217 | sentence.append(tokens) 218 | return sentence 219 | 220 | -------------------------------------------------------------------------------- /CodeBERT/code2nl/run.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | import os 24 | import sys 25 | import bleu 26 | import pickle 27 | import torch 28 | import json 29 | import random 30 | import logging 31 | import argparse 32 | import numpy as np 33 | from io import open 34 | from itertools import cycle 35 | import torch.nn as nn 36 | from model import Seq2Seq 37 | from tqdm import tqdm, trange 38 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset 39 | from torch.utils.data.distributed import DistributedSampler 40 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, 41 | RobertaConfig, RobertaModel, RobertaTokenizer) 42 | MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)} 43 | 44 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 45 | datefmt = '%m/%d/%Y %H:%M:%S', 46 | level = logging.INFO) 47 | logger = logging.getLogger(__name__) 48 | 49 | class Example(object): 50 | """A single training/test example.""" 51 | def __init__(self, 52 | idx, 53 | source, 54 | target, 55 | ): 56 | self.idx = idx 57 | self.source = source 58 | self.target = target 59 | 60 | def read_examples(filename): 61 | """Read examples from filename.""" 62 | examples=[] 63 | with open(filename,encoding="utf-8") as f: 64 | for idx, line in enumerate(f): 65 | line=line.strip() 66 | js=json.loads(line) 67 | if 'idx' not in js: 68 | js['idx']=idx 69 | code=' '.join(js['code_tokens']).replace('\n',' ') 70 | code=' '.join(code.strip().split()) 71 | nl=' '.join(js['docstring_tokens']).replace('\n','') 72 | nl=' '.join(nl.strip().split()) 73 | examples.append( 74 | Example( 75 | idx = idx, 76 | source=code, 77 | target = nl, 78 | ) 79 | ) 80 | return examples 81 | 82 | 83 | class 
InputFeatures(object): 84 | """A single training/test features for a example.""" 85 | def __init__(self, 86 | example_id, 87 | source_ids, 88 | target_ids, 89 | source_mask, 90 | target_mask, 91 | 92 | ): 93 | self.example_id = example_id 94 | self.source_ids = source_ids 95 | self.target_ids = target_ids 96 | self.source_mask = source_mask 97 | self.target_mask = target_mask 98 | 99 | 100 | 101 | def convert_examples_to_features(examples, tokenizer, args,stage=None): 102 | features = [] 103 | for example_index, example in enumerate(examples): 104 | #source 105 | source_tokens = tokenizer.tokenize(example.source)[:args.max_source_length-2] 106 | source_tokens =[tokenizer.cls_token]+source_tokens+[tokenizer.sep_token] 107 | source_ids = tokenizer.convert_tokens_to_ids(source_tokens) 108 | source_mask = [1] * (len(source_tokens)) 109 | padding_length = args.max_source_length - len(source_ids) 110 | source_ids+=[tokenizer.pad_token_id]*padding_length 111 | source_mask+=[0]*padding_length 112 | 113 | #target 114 | if stage=="test": 115 | target_tokens = tokenizer.tokenize("None") 116 | else: 117 | target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2] 118 | target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token] 119 | target_ids = tokenizer.convert_tokens_to_ids(target_tokens) 120 | target_mask = [1] *len(target_ids) 121 | padding_length = args.max_target_length - len(target_ids) 122 | target_ids+=[tokenizer.pad_token_id]*padding_length 123 | target_mask+=[0]*padding_length 124 | 125 | if example_index < 5: 126 | if stage=='train': 127 | logger.info("*** Example ***") 128 | logger.info("idx: {}".format(example.idx)) 129 | 130 | logger.info("source_tokens: {}".format([x.replace('\u0120','_') for x in source_tokens])) 131 | logger.info("source_ids: {}".format(' '.join(map(str, source_ids)))) 132 | logger.info("source_mask: {}".format(' '.join(map(str, source_mask)))) 133 | 134 | logger.info("target_tokens: {}".format([x.replace('\u0120','_') for x in target_tokens])) 135 | logger.info("target_ids: {}".format(' '.join(map(str, target_ids)))) 136 | logger.info("target_mask: {}".format(' '.join(map(str, target_mask)))) 137 | 138 | features.append( 139 | InputFeatures( 140 | example_index, 141 | source_ids, 142 | target_ids, 143 | source_mask, 144 | target_mask, 145 | ) 146 | ) 147 | return features 148 | 149 | 150 | 151 | def set_seed(args): 152 | """set random seed.""" 153 | random.seed(args.seed) 154 | np.random.seed(args.seed) 155 | torch.manual_seed(args.seed) 156 | if args.n_gpu > 0: 157 | torch.cuda.manual_seed_all(args.seed) 158 | 159 | def main(): 160 | parser = argparse.ArgumentParser() 161 | 162 | ## Required parameters 163 | parser.add_argument("--model_type", default=None, type=str, required=True, 164 | help="Model type: e.g. roberta") 165 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 166 | help="Path to pre-trained model: e.g. roberta-base" ) 167 | parser.add_argument("--output_dir", default=None, type=str, required=True, 168 | help="The output directory where the model predictions and checkpoints will be written.") 169 | parser.add_argument("--load_model_path", default=None, type=str, 170 | help="Path to trained model: Should contain the .bin files" ) 171 | ## Other parameters 172 | parser.add_argument("--train_filename", default=None, type=str, 173 | help="The train filename. 
Should contain the .jsonl files for this task.") 174 | parser.add_argument("--dev_filename", default=None, type=str, 175 | help="The dev filename. Should contain the .jsonl files for this task.") 176 | parser.add_argument("--test_filename", default=None, type=str, 177 | help="The test filename. Should contain the .jsonl files for this task.") 178 | 179 | parser.add_argument("--config_name", default="", type=str, 180 | help="Pretrained config name or path if not the same as model_name") 181 | parser.add_argument("--tokenizer_name", default="", type=str, 182 | help="Pretrained tokenizer name or path if not the same as model_name") 183 | parser.add_argument("--max_source_length", default=64, type=int, 184 | help="The maximum total source sequence length after tokenization. Sequences longer " 185 | "than this will be truncated, sequences shorter will be padded.") 186 | parser.add_argument("--max_target_length", default=32, type=int, 187 | help="The maximum total target sequence length after tokenization. Sequences longer " 188 | "than this will be truncated, sequences shorter will be padded.") 189 | 190 | parser.add_argument("--do_train", action='store_true', 191 | help="Whether to run training.") 192 | parser.add_argument("--do_eval", action='store_true', 193 | help="Whether to run eval on the dev set.") 194 | parser.add_argument("--do_test", action='store_true', 195 | help="Whether to run eval on the dev set.") 196 | parser.add_argument("--do_lower_case", action='store_true', 197 | help="Set this flag if you are using an uncased model.") 198 | parser.add_argument("--no_cuda", action='store_true', 199 | help="Avoid using CUDA when available") 200 | 201 | parser.add_argument("--train_batch_size", default=8, type=int, 202 | help="Batch size per GPU/CPU for training.") 203 | parser.add_argument("--eval_batch_size", default=8, type=int, 204 | help="Batch size per GPU/CPU for evaluation.") 205 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 206 | help="Number of updates steps to accumulate before performing a backward/update pass.") 207 | parser.add_argument("--learning_rate", default=5e-5, type=float, 208 | help="The initial learning rate for Adam.") 209 | parser.add_argument("--beam_size", default=10, type=int, 210 | help="beam size for beam search") 211 | parser.add_argument("--weight_decay", default=0.0, type=float, 212 | help="Weight deay if we apply some.") 213 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 214 | help="Epsilon for Adam optimizer.") 215 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 216 | help="Max gradient norm.") 217 | parser.add_argument("--num_train_epochs", default=3.0, type=float, 218 | help="Total number of training epochs to perform.") 219 | parser.add_argument("--max_steps", default=-1, type=int, 220 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 221 | parser.add_argument("--eval_steps", default=-1, type=int, 222 | help="") 223 | parser.add_argument("--train_steps", default=-1, type=int, 224 | help="") 225 | parser.add_argument("--warmup_steps", default=0, type=int, 226 | help="Linear warmup over warmup_steps.") 227 | parser.add_argument("--local_rank", type=int, default=-1, 228 | help="For distributed training: local_rank") 229 | parser.add_argument('--seed', type=int, default=42, 230 | help="random seed for initialization") 231 | # print arguments 232 | args = parser.parse_args() 233 | logger.info(args) 234 | 235 | # Setup CUDA, GPU & distributed training 236 | if args.local_rank == -1 or args.no_cuda: 237 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 238 | args.n_gpu = torch.cuda.device_count() 239 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 240 | torch.cuda.set_device(args.local_rank) 241 | device = torch.device("cuda", args.local_rank) 242 | torch.distributed.init_process_group(backend='nccl') 243 | args.n_gpu = 1 244 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s", 245 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1)) 246 | args.device = device 247 | # Set seed 248 | set_seed(args) 249 | # make dir if output_dir not exist 250 | if os.path.exists(args.output_dir) is False: 251 | os.makedirs(args.output_dir) 252 | 253 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 254 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) 255 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,do_lower_case=args.do_lower_case) 256 | 257 | #budild model 258 | encoder = model_class.from_pretrained(args.model_name_or_path,config=config) 259 | decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads) 260 | decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) 261 | model=Seq2Seq(encoder=encoder,decoder=decoder,config=config, 262 | beam_size=args.beam_size,max_length=args.max_target_length, 263 | sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id) 264 | if args.load_model_path is not None: 265 | logger.info("reload model from {}".format(args.load_model_path)) 266 | model.load_state_dict(torch.load(args.load_model_path)) 267 | 268 | model.to(device) 269 | if args.local_rank != -1: 270 | # Distributed training 271 | try: 272 | from apex.parallel import DistributedDataParallel as DDP 273 | except ImportError: 274 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 275 | 276 | model = DDP(model) 277 | elif args.n_gpu > 1: 278 | # multi-gpu training 279 | model = torch.nn.DataParallel(model) 280 | 281 | 282 | 283 | 284 | if args.do_train: 285 | # Prepare training data loader 286 | train_examples = read_examples(args.train_filename) 287 | train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train') 288 | all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long) 289 | all_source_mask = torch.tensor([f.source_mask for f in train_features], dtype=torch.long) 290 | all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long) 291 | all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long) 
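        # Each of the four tensors above has one row per training example:
        # ids are right-padded with pad_token_id up to max_source_length /
        # max_target_length and the masks are padded with 0 (see
        # convert_examples_to_features), so zipping them into a TensorDataset
        # keeps every batch index-aligned across ids and masks.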
292 | train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask) 293 | 294 | if args.local_rank == -1: 295 | train_sampler = RandomSampler(train_data) 296 | else: 297 | train_sampler = DistributedSampler(train_data) 298 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps) 299 | 300 | num_train_optimization_steps = args.train_steps 301 | 302 | # Prepare optimizer and schedule (linear warmup and decay) 303 | no_decay = ['bias', 'LayerNorm.weight'] 304 | optimizer_grouped_parameters = [ 305 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 306 | 'weight_decay': args.weight_decay}, 307 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 308 | ] 309 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 310 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, 311 | num_training_steps=num_train_optimization_steps) 312 | 313 | 314 | #Start training 315 | logger.info("***** Running training *****") 316 | logger.info(" Num examples = %d", len(train_examples)) 317 | logger.info(" Batch size = %d", args.train_batch_size) 318 | logger.info(" Num epoch = %d", num_train_optimization_steps*args.train_batch_size//len(train_examples)) 319 | 320 | 321 | model.train() 322 | dev_dataset={} 323 | nb_tr_examples, nb_tr_steps,tr_loss,global_step,best_bleu,best_loss = 0, 0,0,0,0,1e6 324 | bar = tqdm(range(num_train_optimization_steps),total=num_train_optimization_steps) 325 | train_dataloader=cycle(train_dataloader) 326 | eval_flag = True 327 | for step in bar: 328 | batch = next(train_dataloader) 329 | batch = tuple(t.to(device) for t in batch) 330 | source_ids,source_mask,target_ids,target_mask = batch 331 | loss,_,_ = model(source_ids=source_ids,source_mask=source_mask,target_ids=target_ids,target_mask=target_mask) 332 | 333 | if args.n_gpu > 1: 334 | loss = loss.mean() # mean() to average on multi-gpu. 
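            # Gradient accumulation: the DataLoader above was created with
            # batch_size = train_batch_size // gradient_accumulation_steps, the
            # loss is scaled by 1 / gradient_accumulation_steps below, and
            # optimizer.step() only fires every gradient_accumulation_steps
            # micro-batches, so each parameter update still corresponds to an
            # effective batch of train_batch_size examples.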
335 | if args.gradient_accumulation_steps > 1: 336 | loss = loss / args.gradient_accumulation_steps 337 | tr_loss += loss.item() 338 | train_loss=round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4) 339 | bar.set_description("loss {}".format(train_loss)) 340 | nb_tr_examples += source_ids.size(0) 341 | nb_tr_steps += 1 342 | loss.backward() 343 | 344 | if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: 345 | #Update parameters 346 | optimizer.step() 347 | optimizer.zero_grad() 348 | scheduler.step() 349 | global_step += 1 350 | eval_flag = True 351 | 352 | if args.do_eval and ((global_step + 1) %args.eval_steps == 0) and eval_flag: 353 | #Eval model with dev dataset 354 | tr_loss = 0 355 | nb_tr_examples, nb_tr_steps = 0, 0 356 | eval_flag=False 357 | if 'dev_loss' in dev_dataset: 358 | eval_examples,eval_data=dev_dataset['dev_loss'] 359 | else: 360 | eval_examples = read_examples(args.dev_filename) 361 | eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev') 362 | all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long) 363 | all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long) 364 | all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long) 365 | all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long) 366 | eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask) 367 | dev_dataset['dev_loss']=eval_examples,eval_data 368 | eval_sampler = SequentialSampler(eval_data) 369 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 370 | 371 | logger.info("\n***** Running evaluation *****") 372 | logger.info(" Num examples = %d", len(eval_examples)) 373 | logger.info(" Batch size = %d", args.eval_batch_size) 374 | 375 | #Start Evaling model 376 | model.eval() 377 | eval_loss,tokens_num = 0,0 378 | for batch in eval_dataloader: 379 | batch = tuple(t.to(device) for t in batch) 380 | source_ids,source_mask,target_ids,target_mask = batch 381 | 382 | with torch.no_grad(): 383 | _,loss,num = model(source_ids=source_ids,source_mask=source_mask, 384 | target_ids=target_ids,target_mask=target_mask) 385 | eval_loss += loss.sum().item() 386 | tokens_num += num.sum().item() 387 | #Pring loss of dev dataset 388 | model.train() 389 | eval_loss = eval_loss / tokens_num 390 | result = {'eval_ppl': round(np.exp(eval_loss),5), 391 | 'global_step': global_step+1, 392 | 'train_loss': round(train_loss,5)} 393 | for key in sorted(result.keys()): 394 | logger.info(" %s = %s", key, str(result[key])) 395 | logger.info(" "+"*"*20) 396 | 397 | #save last checkpoint 398 | last_output_dir = os.path.join(args.output_dir, 'checkpoint-last') 399 | if not os.path.exists(last_output_dir): 400 | os.makedirs(last_output_dir) 401 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 402 | output_model_file = os.path.join(last_output_dir, "pytorch_model.bin") 403 | torch.save(model_to_save.state_dict(), output_model_file) 404 | if eval_lossbest_bleu: 461 | logger.info(" Best bleu:%s",dev_bleu) 462 | logger.info(" "+"*"*20) 463 | best_bleu=dev_bleu 464 | # Save best checkpoint for best bleu 465 | output_dir = os.path.join(args.output_dir, 'checkpoint-best-bleu') 466 | if not os.path.exists(output_dir): 467 | os.makedirs(output_dir) 468 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 
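                    # DataParallel/DistributedDataParallel wrap the network and
                    # prefix every parameter name with "module."; saving the
                    # unwrapped .module keeps the state_dict loadable into a bare
                    # Seq2Seq model (as --load_model_path does above).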
469 | output_model_file = os.path.join(output_dir, "pytorch_model.bin") 470 | torch.save(model_to_save.state_dict(), output_model_file) 471 | 472 | if args.do_test: 473 | files=[] 474 | if args.dev_filename is not None: 475 | files.append(args.dev_filename) 476 | if args.test_filename is not None: 477 | files.append(args.test_filename) 478 | for idx,file in enumerate(files): 479 | logger.info("Test file: {}".format(file)) 480 | eval_examples = read_examples(file) 481 | eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test') 482 | all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long) 483 | all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long) 484 | eval_data = TensorDataset(all_source_ids,all_source_mask) 485 | 486 | # Calculate bleu 487 | eval_sampler = SequentialSampler(eval_data) 488 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 489 | 490 | model.eval() 491 | p=[] 492 | for batch in tqdm(eval_dataloader,total=len(eval_dataloader)): 493 | batch = tuple(t.to(device) for t in batch) 494 | source_ids,source_mask= batch 495 | with torch.no_grad(): 496 | preds = model(source_ids=source_ids,source_mask=source_mask) 497 | for pred in preds: 498 | t=pred[0].cpu().numpy() 499 | t=list(t) 500 | if 0 in t: 501 | t=t[:t.index(0)] 502 | text = tokenizer.decode(t,clean_up_tokenization_spaces=False) 503 | p.append(text) 504 | model.train() 505 | predictions=[] 506 | with open(os.path.join(args.output_dir,"test_{}.output".format(str(idx))),'w') as f, open(os.path.join(args.output_dir,"test_{}.gold".format(str(idx))),'w') as f1: 507 | for ref,gold in zip(p,eval_examples): 508 | predictions.append(str(gold.idx)+'\t'+ref) 509 | f.write(str(gold.idx)+'\t'+ref+'\n') 510 | f1.write(str(gold.idx)+'\t'+gold.target+'\n') 511 | 512 | (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "test_{}.gold".format(idx))) 513 | dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0],2) 514 | logger.info(" %s = %s "%("bleu-4",str(dev_bleu))) 515 | logger.info(" "+"*"*20) 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | if __name__ == "__main__": 524 | main() 525 | 526 | 527 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/README.md: -------------------------------------------------------------------------------- 1 | # Code Search 2 | 3 | ## Data Preprocess 4 | 5 | Both training and validation datasets are created in a way that positive and negative samples are balanced. Negative samples consist of balanced number of instances with randomly replaced NL and PL. 6 | 7 | We follow the official evaluation metric to calculate the Mean Reciprocal Rank (MRR) for each pair of test data (c, w) over a fixed set of 999 distractor codes. 8 | 9 | You can use the following command to download the preprocessed training and validation dataset and preprocess the test dataset by yourself. The preprocessed testing dataset is very large, so only the preprocessing script is provided. 10 | 11 | ```shell 12 | mkdir data data/codesearch 13 | cd data/codesearch 14 | gdown https://drive.google.com/uc?id=1xgSR34XO8xXZg4cZScDYj2eGerBE9iGo 15 | unzip codesearch_data.zip 16 | rm codesearch_data.zip 17 | cd ../../codesearch 18 | python process_data.py 19 | cd .. 20 | ``` 21 | 22 | ## Fine-Tune 23 | We fine-tuned the model on 2*P100 GPUs. 
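The command below fine-tunes CodeBERT as a binary classifier over (NL, PL) pairs; at inference time the positive-class score of each pair is what `mrr.py` ranks against the 999 distractors in every test batch.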
24 | ```shell 25 | cd codesearch 26 | 27 | lang=php #fine-tuning a language-specific model for each programming language 28 | pretrained_model=microsoft/codebert-base #Roberta: roberta-base 29 | 30 | python run_classifier.py \ 31 | --model_type roberta \ 32 | --task_name codesearch \ 33 | --do_train \ 34 | --do_eval \ 35 | --eval_all_checkpoints \ 36 | --train_file train.txt \ 37 | --dev_file valid.txt \ 38 | --max_seq_length 200 \ 39 | --per_gpu_train_batch_size 32 \ 40 | --per_gpu_eval_batch_size 32 \ 41 | --learning_rate 1e-5 \ 42 | --num_train_epochs 8 \ 43 | --gradient_accumulation_steps 1 \ 44 | --overwrite_output_dir \ 45 | --data_dir ../data/codesearch/train_valid/$lang \ 46 | --output_dir ./models/$lang \ 47 | --model_name_or_path $pretrained_model 48 | ``` 49 | ## Inference and Evaluation 50 | 51 | Inference 52 | ```shell 53 | lang=php #programming language 54 | idx=0 #test batch idx 55 | 56 | python run_classifier.py \ 57 | --model_type roberta \ 58 | --model_name_or_path microsoft/codebert-base \ 59 | --task_name codesearch \ 60 | --do_predict \ 61 | --output_dir ./models/$lang \ 62 | --data_dir ../data/codesearch/test/$lang \ 63 | --max_seq_length 200 \ 64 | --per_gpu_train_batch_size 32 \ 65 | --per_gpu_eval_batch_size 32 \ 66 | --learning_rate 1e-5 \ 67 | --num_train_epochs 8 \ 68 | --test_file batch_${idx}.txt \ 69 | --pred_model_dir ./models/$lang/checkpoint-best/ \ 70 | --test_result_dir ./results/$lang/${idx}_batch_result.txt 71 | ``` 72 | 73 | Evaluation 74 | ```shell 75 | python mrr.py 76 | ``` 77 | 78 | 79 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/mrr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 4 | 5 | import os 6 | import numpy as np 7 | from more_itertools import chunked 8 | import argparse 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--test_batch_size', type=int, default=1000) 14 | args = parser.parse_args() 15 | languages = ['ruby', 'go', 'php', 'python', 'java', 'javascript'] 16 | MRR_dict = {} 17 | for language in languages: 18 | file_dir = './results/{}'.format(language) 19 | ranks = [] 20 | num_batch = 0 21 | for file in sorted(os.listdir(file_dir)): 22 | print(os.path.join(file_dir, file)) 23 | with open(os.path.join(file_dir, file), encoding='utf-8') as f: 24 | batched_data = chunked(f.readlines(), args.test_batch_size) 25 | for batch_idx, batch_data in enumerate(batched_data): 26 | num_batch += 1 27 | correct_score = float(batch_data[batch_idx].strip().split('')[-1]) 28 | scores = np.array([float(data.strip().split('')[-1]) for data in batch_data]) 29 | rank = np.sum(scores >= correct_score) 30 | ranks.append(rank) 31 | 32 | mean_mrr = np.mean(1.0 / np.array(ranks)) 33 | print("{} mrr: {}".format(language, mean_mrr)) 34 | MRR_dict[language] = mean_mrr 35 | for key, val in MRR_dict.items(): 36 | print("{} mrr: {}".format(key, val)) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/process_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 
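# Test-set construction: every shuffled batch of test_batch_size (1000)
# functions is expanded into 1000 x 1000 (docstring, code) pairs, i.e. each
# natural-language query is paired with its true code plus 999 distractors
# drawn from the same batch. mrr.py later recovers the correct pair as the
# diagonal entry of each scored 1000-line chunk and ranks its score against
# the other 999.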
4 | 5 | import gzip 6 | import os 7 | import json 8 | import numpy as np 9 | from more_itertools import chunked 10 | 11 | DATA_DIR='../data/codesearch' 12 | 13 | def format_str(string): 14 | for char in ['\r\n', '\r', '\n']: 15 | string = string.replace(char, ' ') 16 | return string 17 | 18 | 19 | def preprocess_test_data(language, test_batch_size=1000): 20 | path = os.path.join(DATA_DIR, '{}_test_0.jsonl.gz'.format(language)) 21 | print(path) 22 | with gzip.open(path, 'r') as pf: 23 | data = pf.readlines() 24 | 25 | idxs = np.arange(len(data)) 26 | data = np.array(data, dtype=np.object) 27 | 28 | np.random.seed(0) # set random seed so that random things are reproducible 29 | np.random.shuffle(idxs) 30 | data = data[idxs] 31 | batched_data = chunked(data, test_batch_size) 32 | 33 | print("start processing") 34 | for batch_idx, batch_data in enumerate(batched_data): 35 | if len(batch_data) < test_batch_size: 36 | break # the last batch is smaller than the others, exclude. 37 | examples = [] 38 | for d_idx, d in enumerate(batch_data): 39 | line_a = json.loads(str(d, encoding='utf-8')) 40 | doc_token = ' '.join(line_a['docstring_tokens']) 41 | for dd in batch_data: 42 | line_b = json.loads(str(dd, encoding='utf-8')) 43 | code_token = ' '.join([format_str(token) for token in line_b['code_tokens']]) 44 | 45 | example = (str(1), line_a['url'], line_b['url'], doc_token, code_token) 46 | example = ''.join(example) 47 | examples.append(example) 48 | 49 | data_path = os.path.join(DATA_DIR, 'test/{}'.format(language)) 50 | if not os.path.exists(data_path): 51 | os.makedirs(data_path) 52 | file_path = os.path.join(data_path, 'batch_{}.txt'.format(batch_idx)) 53 | print(file_path) 54 | with open(file_path, 'w', encoding='utf-8') as f: 55 | f.writelines('\n'.join(examples)) 56 | 57 | if __name__ == '__main__': 58 | languages = ['go', 'php', 'python', 'java', 'javascript', 'ruby'] 59 | for lang in languages: 60 | preprocess_test_data(lang) 61 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT classification fine-tuning: utilities to work with GLUE tasks """ 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import csv 21 | import logging 22 | import os 23 | import sys 24 | from io import open 25 | from sklearn.metrics import f1_score 26 | 27 | csv.field_size_limit(sys.maxsize) 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | class InputExample(object): 32 | """A single training/test example for simple sequence classification.""" 33 | 34 | def __init__(self, guid, text_a, text_b=None, label=None): 35 | """Constructs a InputExample. 
36 | 37 | Args: 38 | guid: Unique id for the example. 39 | text_a: string. The untokenized text of the first sequence. For single 40 | sequence tasks, only this sequence must be specified. 41 | text_b: (Optional) string. The untokenized text of the second sequence. 42 | Only must be specified for sequence pair tasks. 43 | label: (Optional) string. The label of the example. This should be 44 | specified for train and dev examples, but not for test examples. 45 | """ 46 | self.guid = guid 47 | self.text_a = text_a 48 | self.text_b = text_b 49 | self.label = label 50 | 51 | 52 | class InputFeatures(object): 53 | """A single set of features of data.""" 54 | 55 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 56 | self.input_ids = input_ids 57 | self.input_mask = input_mask 58 | self.segment_ids = segment_ids 59 | self.label_id = label_id 60 | 61 | 62 | class DataProcessor(object): 63 | """Base class for data converters for sequence classification data sets.""" 64 | 65 | def get_train_examples(self, data_dir): 66 | """Gets a collection of `InputExample`s for the train set.""" 67 | raise NotImplementedError() 68 | 69 | def get_dev_examples(self, data_dir): 70 | """Gets a collection of `InputExample`s for the dev set.""" 71 | raise NotImplementedError() 72 | 73 | def get_labels(self): 74 | """Gets the list of labels for this data set.""" 75 | raise NotImplementedError() 76 | 77 | @classmethod 78 | def _read_tsv(cls, input_file, quotechar=None): 79 | """Reads a tab separated value file.""" 80 | with open(input_file, "r", encoding='utf-8') as f: 81 | lines = [] 82 | for line in f.readlines(): 83 | line = line.strip().split('') 84 | if len(line) != 5: 85 | continue 86 | lines.append(line) 87 | return lines 88 | 89 | 90 | class CodesearchProcessor(DataProcessor): 91 | """Processor for the MRPC data set (GLUE version).""" 92 | 93 | def get_train_examples(self, data_dir, train_file): 94 | """See base class.""" 95 | logger.info("LOOKING AT {}".format(os.path.join(data_dir, train_file))) 96 | return self._create_examples( 97 | self._read_tsv(os.path.join(data_dir, train_file)), "train") 98 | 99 | def get_dev_examples(self, data_dir, dev_file): 100 | """See base class.""" 101 | logger.info("LOOKING AT {}".format(os.path.join(data_dir, dev_file))) 102 | return self._create_examples( 103 | self._read_tsv(os.path.join(data_dir, dev_file)), "dev") 104 | 105 | def get_test_examples(self, data_dir, test_file): 106 | """See base class.""" 107 | logger.info("LOOKING AT {}".format(os.path.join(data_dir, test_file))) 108 | return self._create_examples( 109 | self._read_tsv(os.path.join(data_dir, test_file)), "test") 110 | 111 | def get_labels(self): 112 | """See base class.""" 113 | return ["0", "1"] 114 | 115 | def _create_examples(self, lines, set_type): 116 | """Creates examples for the training and dev sets.""" 117 | examples = [] 118 | for (i, line) in enumerate(lines): 119 | guid = "%s-%s" % (set_type, i) 120 | text_a = line[3] 121 | text_b = line[4] 122 | if (set_type == 'test'): 123 | label = self.get_labels()[0] 124 | else: 125 | label = line[0] 126 | examples.append( 127 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 128 | if (set_type == 'test'): 129 | return examples, lines 130 | else: 131 | return examples 132 | 133 | 134 | def convert_examples_to_features(examples, label_list, max_seq_length, 135 | tokenizer, output_mode, 136 | cls_token_at_end=False, pad_on_left=False, 137 | cls_token='[CLS]', sep_token='[SEP]', pad_token=0, 138 | sequence_a_segment_id=0, 
139 |                                  cls_token_segment_id=1, pad_token_segment_id=0,
140 |                                  mask_padding_with_zero=True):
141 |     """ Loads a data file into a list of `InputBatch`s
142 |         `cls_token_at_end` defines the location of the CLS token:
143 |             - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
144 |             - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
145 |         `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
146 |     """
147 | 
148 |     label_map = {label: i for i, label in enumerate(label_list)}
149 | 
150 |     features = []
151 |     for (ex_index, example) in enumerate(examples):
152 |         if ex_index % 10000 == 0:
153 |             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
154 | 
155 |         tokens_a = tokenizer.tokenize(example.text_a)[:50]  # NL queries are truncated to their first 50 sub-tokens
156 | 
157 |         tokens_b = None
158 |         if example.text_b:
159 |             tokens_b = tokenizer.tokenize(example.text_b)
160 |             # Modifies `tokens_a` and `tokens_b` in place so that the total
161 |             # length is less than the specified length.
162 |             # Account for [CLS], [SEP], [SEP] with "- 3"
163 |             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
164 |         else:
165 |             # Account for [CLS] and [SEP] with "- 2"
166 |             if len(tokens_a) > max_seq_length - 2:
167 |                 tokens_a = tokens_a[:(max_seq_length - 2)]
168 | 
169 |         # The convention in BERT is:
170 |         # (a) For sequence pairs:
171 |         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
172 |         #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
173 |         # (b) For single sequences:
174 |         #  tokens:   [CLS] the dog is hairy . [SEP]
175 |         #  type_ids:   0   0   0   0  0     0   0
176 |         #
177 |         # Where "type_ids" are used to indicate whether this is the first
178 |         # sequence or the second sequence. The embedding vectors for `type=0` and
179 |         # `type=1` were learned during pre-training and are added to the wordpiece
180 |         # embedding vector (and position vector). This is not *strictly* necessary
181 |         # since the [SEP] token unambiguously separates the sequences, but it makes
182 |         # it easier for the model to learn the concept of sequences.
183 |         #
184 |         # For classification tasks, the first vector (corresponding to [CLS]) is
185 |         # used as the "sentence vector". Note that this only makes sense because
186 |         # the entire model is fine-tuned.
187 |         tokens = tokens_a + [sep_token]
188 |         segment_ids = [sequence_a_segment_id] * len(tokens)
189 | 
190 |         if tokens_b:
191 |             tokens += tokens_b + [sep_token]
192 |             segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
193 | 
194 |         if cls_token_at_end:
195 |             tokens = tokens + [cls_token]
196 |             segment_ids = segment_ids + [cls_token_segment_id]
197 |         else:
198 |             tokens = [cls_token] + tokens
199 |             segment_ids = [cls_token_segment_id] + segment_ids
200 | 
201 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
202 | 
203 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
204 |         # tokens are attended to.
205 |         input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
206 | 
207 |         # Zero-pad up to the sequence length.
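        # Worked example of the padding arithmetic below: with max_seq_length=8
        # and six ids for [CLS] d1 d2 [SEP] c1 [SEP], padding_length is 2, so two
        # pad_token ids and two mask zeros are appended on the right when
        # pad_on_left is False (and prepended when it is True).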
208 | padding_length = max_seq_length - len(input_ids) 209 | if pad_on_left: 210 | input_ids = ([pad_token] * padding_length) + input_ids 211 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 212 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 213 | else: 214 | input_ids = input_ids + ([pad_token] * padding_length) 215 | input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 216 | segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) 217 | 218 | assert len(input_ids) == max_seq_length 219 | assert len(input_mask) == max_seq_length 220 | assert len(segment_ids) == max_seq_length 221 | 222 | if output_mode == "classification": 223 | label_id = label_map[example.label] 224 | elif output_mode == "regression": 225 | label_id = float(example.label) 226 | else: 227 | raise KeyError(output_mode) 228 | 229 | if ex_index < 5: 230 | logger.info("*** Example ***") 231 | logger.info("guid: %s" % (example.guid)) 232 | logger.info("tokens: %s" % " ".join( 233 | [str(x) for x in tokens])) 234 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 235 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 236 | logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 237 | logger.info("label: %s (id = %d)" % (example.label, label_id)) 238 | 239 | features.append( 240 | InputFeatures(input_ids=input_ids, 241 | input_mask=input_mask, 242 | segment_ids=segment_ids, 243 | label_id=label_id)) 244 | return features 245 | 246 | 247 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 248 | """Truncates a sequence pair in place to the maximum length.""" 249 | 250 | # This is a simple heuristic which will always truncate the longer sequence 251 | # one token at a time. This makes more sense than truncating an equal percent 252 | # of tokens from each, since if one sequence is very short then each token 253 | # that's truncated likely contains more information than a longer sequence. 254 | while True: 255 | total_length = len(tokens_a) + len(tokens_b) 256 | if total_length <= max_length: 257 | break 258 | if len(tokens_a) > len(tokens_b): 259 | tokens_a.pop() 260 | else: 261 | tokens_b.pop() 262 | 263 | 264 | def simple_accuracy(preds, labels): 265 | return (preds == labels).mean() 266 | 267 | 268 | def acc_and_f1(preds, labels): 269 | acc = simple_accuracy(preds, labels) 270 | f1 = f1_score(y_true=labels, y_pred=preds) 271 | return { 272 | "acc": acc, 273 | "f1": f1, 274 | "acc_and_f1": (acc + f1) / 2, 275 | } 276 | 277 | 278 | def compute_metrics(task_name, preds, labels): 279 | assert len(preds) == len(labels) 280 | if task_name == "codesearch": 281 | return acc_and_f1(preds, labels) 282 | else: 283 | raise KeyError(task_name) 284 | 285 | 286 | processors = { 287 | "codesearch": CodesearchProcessor, 288 | } 289 | 290 | output_modes = { 291 | "codesearch": "classification", 292 | } 293 | 294 | GLUE_TASKS_NUM_LABELS = { 295 | "codesearch": 2, 296 | } 297 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection 2 | 3 | ## Task Definition 4 | 5 | Given two codes as the input, the task is to do binary classification (0/1), where 1 stands for semantic equivalence and 0 for others. Models are evaluated by F1 score. 
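For intuition, a positive pair (label 1) is two functions that implement the same behavior with different code, while a negative pair (label 0) merely looks similar. A toy illustration follows (the benchmark used below consists of Java methods; Python is used here only for brevity):

```python
# Label 1: semantically equivalent, syntactically different.
def sum_values(xs):
    total = 0
    for x in xs:
        total += x
    return total

def sum_values_builtin(xs):
    return sum(xs)

# Paired with either function above, this one would be labeled 0:
# similar shape, different behavior.
def max_value(xs):
    return max(xs)
```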
6 | 
7 | ## Dataset
8 | 
9 | The dataset we use is [BigCloneBench](https://www.cs.usask.ca/faculty/croy/papers/2014/SvajlenkoICSME2014BigERA.pdf), filtered following the paper [Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree](https://arxiv.org/pdf/2002.08653.pdf).
10 | 
11 | ### Data Format
12 | 
13 | 1. dataset/data.jsonl is stored in jsonlines format. Each line in the uncompressed file represents one function, with the following fields:
14 | 
15 |    - **func:** the function
16 | 
17 |    - **idx:** index of the example
18 | 
19 | 2. train.txt/valid.txt/test.txt provide examples, one per line, as tab-separated triples: idx1 idx2 label
20 | 
21 | ### Data Statistics
22 | 
23 | Data statistics of the dataset are shown in the table below:
24 | 
25 | |       | #Examples |
26 | | ----- | :-------: |
27 | | Train |  901,028  |
28 | | Dev   |  415,416  |
29 | | Test  |  415,416  |
30 | 
31 | You can obtain the data with the following command:
32 | 
33 | ```
34 | unzip dataset.zip
35 | ```
36 | 
37 | ## Evaluator
38 | 
39 | We provide a script that evaluates predictions for this task and reports the F1 score.
40 | 
41 | ### Example
42 | 
43 | ```bash
44 | python evaluator/evaluator.py -a evaluator/answers.txt -p evaluator/predictions.txt
45 | ```
46 | 
47 | {'Recall': 0.25, 'Precision': 0.5, 'F1': 0.3333333333333333}
48 | 
49 | ### Input predictions
50 | 
51 | A predictions file in TXT format, such as evaluator/predictions.txt, with one whitespace-separated "idx1 idx2 label" triple per line. For example:
52 | 
53 | ```
54 | 13653451 21955002 0
55 | 1188160 8831513 1
56 | 1141235 14322332 0
57 | 16765164 17526811 1
58 | ```
59 | 
60 | ## Pipeline-GraphCodeBERT
61 | 
62 | We also provide a pipeline that fine-tunes GraphCodeBERT on this task.
63 | ### Dependency
64 | 
65 | - pip install torch
66 | - pip install transformers
67 | - pip install tree_sitter
68 | - pip install sklearn
69 | 
70 | ### Tree-sitter (optional)
71 | 
72 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
73 | 
74 | ```shell
75 | cd parser
76 | bash build.sh
77 | cd ..
78 | ```
79 | 
80 | ### Fine-tune
81 | 
82 | We use 4 V100-16G GPUs for fine-tuning and 10% of the validation data for evaluation.
83 | 
84 | 
85 | ```shell
86 | mkdir saved_models
87 | python run.py \
88 |     --output_dir=saved_models \
89 |     --config_name=microsoft/graphcodebert-base \
90 |     --model_name_or_path=microsoft/graphcodebert-base \
91 |     --tokenizer_name=microsoft/graphcodebert-base \
92 |     --do_train \
93 |     --train_data_file=dataset/train.txt \
94 |     --eval_data_file=dataset/valid.txt \
95 |     --test_data_file=dataset/test.txt \
96 |     --epoch 1 \
97 |     --code_length 512 \
98 |     --data_flow_length 128 \
99 |     --train_batch_size 16 \
100 |     --eval_batch_size 32 \
101 |     --learning_rate 2e-5 \
102 |     --max_grad_norm 1.0 \
103 |     --evaluate_during_training \
104 |     --seed 123456 2>&1| tee saved_models/train.log
105 | ```
106 | 
107 | ### Inference
108 | 
109 | We use the full test data for inference.
110 | 
111 | ```shell
112 | python run.py \
113 |     --output_dir=saved_models \
114 |     --config_name=microsoft/graphcodebert-base \
115 |     --model_name_or_path=microsoft/graphcodebert-base \
116 |     --tokenizer_name=microsoft/graphcodebert-base \
117 |     --do_eval \
118 |     --do_test \
119 |     --train_data_file=dataset/train.txt \
120 |     --eval_data_file=dataset/valid.txt \
121 |     --test_data_file=dataset/test.txt \
122 |     --epoch 1 \
123 |     --code_length 512 \
124 |     --data_flow_length 128 \
125 |     --train_batch_size 16 \
126 |     --eval_batch_size 32 \
127 |     --learning_rate 2e-5 \
128 |     --max_grad_norm 1.0 \
129 |     --evaluate_during_training \
130 |     --seed 123456 2>&1| tee saved_models/test.log
131 | ```
132 | 
133 | ### Evaluation
134 | 
135 | ```shell
136 | python evaluator/evaluator.py -a dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log
137 | ```
138 | 
139 | ## Results
140 | 
141 | The results on the test set are shown below:
142 | 
143 | | Method        | Precision | Recall    | F1        |
144 | | ------------- | :-------: | :-------: | :-------: |
145 | | Deckard       | 0.93      | 0.02      | 0.03      |
146 | | RtvNN         | 0.95      | 0.01      | 0.01      |
147 | | CDLH          | 0.92      | 0.74      | 0.82      |
148 | | ASTNN         | 0.92      | 0.94      | 0.93      |
149 | | FA-AST-GMN    | 0.96      | 0.94      | 0.95      |
150 | | CodeBERT      | 0.964     | 0.966     | 0.965     |
151 | | GraphCodeBERT | **0.973** | **0.968** | **0.971** |
152 | 
153 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/clonedetection/dataset.zip
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/evaluator/answers.txt:
--------------------------------------------------------------------------------
1 | 13653451 21955002 0
2 | 1188160 8831513 0
3 | 1141235 14322332 0
4 | 16765164 17526811 0
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/evaluator/evaluator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
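# Reads whitespace-separated "idx1 idx2 label" triples from the answer and
# prediction files and reports macro-averaged Recall, Precision and F1 over
# the pairs that appear in the answer file.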
3 | import logging
4 | import sys
5 | from sklearn.metrics import recall_score,precision_score,f1_score
6 | 
7 | def read_answers(filename):
8 |     answers={}
9 |     with open(filename) as f:
10 |         for line in f:
11 |             line=line.strip()
12 |             idx1,idx2,label=line.split()
13 |             answers[(idx1,idx2)]=label
14 |     return answers
15 | 
16 | def read_predictions(filename):
17 |     predictions={}
18 |     with open(filename) as f:
19 |         for line in f:
20 |             line=line.strip()
21 |             idx1,idx2,label=line.split()
22 |             if 'txt' in line:
23 |                 idx1=idx1.split('/')[-1][:-4]
24 |                 idx2=idx2.split('/')[-1][:-4]
25 |             predictions[(idx1,idx2)]=label
26 |     return predictions
27 | 
28 | def calculate_scores(answers,predictions):
29 |     y_trues,y_preds=[],[]
30 |     for key in answers:
31 |         if key not in predictions:
32 |             logging.error("Missing prediction for ({},{}) pair.".format(key[0],key[1]))
33 |             sys.exit()
34 |         y_trues.append(answers[key])
35 |         y_preds.append(predictions[key])
36 |     scores={}
37 |     scores['Recall']=recall_score(y_trues, y_preds, average='macro')
38 |     scores['Precision']=precision_score(y_trues, y_preds, average='macro')
39 |     scores['F1']=f1_score(y_trues, y_preds, average='macro')
40 |     return scores
41 | 
42 | def main():
43 |     import argparse
44 |     parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for BigCloneBench dataset.')
45 |     parser.add_argument('--answers', '-a',help="filename of the labels, in txt format.")
46 |     parser.add_argument('--predictions', '-p',help="filename of the leaderboard predictions, in txt format.")
47 | 
48 | 
49 |     args = parser.parse_args()
50 |     answers=read_answers(args.answers)
51 |     predictions=read_predictions(args.predictions)
52 |     scores=calculate_scores(answers,predictions)
53 |     print(scores)
54 | 
55 | if __name__ == '__main__':
56 |     main()
57 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/evaluator/predictions.txt:
--------------------------------------------------------------------------------
1 | 13653451 21955002 0
2 | 1188160 8831513 1
3 | 1141235 14322332 0
4 | 16765164 17526811 1
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import copy
5 | import torch.nn.functional as F
6 | from torch.nn import CrossEntropyLoss, MSELoss
7 | 
8 | class RobertaClassificationHead(nn.Module):
9 |     """Head for sentence-level classification tasks."""
10 | 
11 |     def __init__(self, config):
12 |         super().__init__()
13 |         self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
14 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
15 |         self.out_proj = nn.Linear(config.hidden_size, 2)
16 | 
17 |     def forward(self, features, **kwargs):
18 |         x = features[:, 0, :]  # take the <s> token (equivalent to [CLS] in BERT)
19 |         x = x.reshape(-1, x.size(-1)*2)  # concatenate the two functions' <s> vectors
20 |         x = self.dropout(x)
21 |         x = self.dense(x)
22 |         x = torch.tanh(x)
23 |         x = self.dropout(x)
24 |         x = self.out_proj(x)
25 |         return x
26 | 
27 | class Model(nn.Module):
28 |     def __init__(self, encoder,config,tokenizer,args):
29 |         super(Model, self).__init__()
30 |         self.encoder = encoder
31 |         self.config=config
32 |         self.tokenizer=tokenizer
33 |         self.classifier=RobertaClassificationHead(config)
34 |         self.args=args
35 | 
36 | 
37 |     def forward(self, inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels=None):
38 |         bs,l=inputs_ids_1.size()
39 |         inputs_ids=torch.cat((inputs_ids_1.unsqueeze(1),inputs_ids_2.unsqueeze(1)),1).view(bs*2,l)
40 |         position_idx=torch.cat((position_idx_1.unsqueeze(1),position_idx_2.unsqueeze(1)),1).view(bs*2,l)
41 |         attn_mask=torch.cat((attn_mask_1.unsqueeze(1),attn_mask_2.unsqueeze(1)),1).view(bs*2,l,l)
42 | 
43 |         #embedding
44 |         nodes_mask=position_idx.eq(0)
45 |         token_mask=position_idx.ge(2)
46 |         inputs_embeddings=self.encoder.roberta.embeddings.word_embeddings(inputs_ids)
47 |         nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask
48 |         nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None]
49 |         avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings)
50 |         inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None]
51 | 
52 |         outputs = self.encoder.roberta(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx)[0]
53 |         logits=self.classifier(outputs)
54 |         prob=F.softmax(logits, dim=-1)  # softmax over the two classes
55 |         if labels is not None:
56 |             loss_fct = CrossEntropyLoss()
57 |             loss = loss_fct(logits, labels)
58 |             return loss,prob
59 |         else:
60 |             return prob
61 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import (remove_comments_and_docstrings,
2 |                     tree_to_token_index,
3 |                     index_to_code_token,
4 |                     tree_to_variable_index)
5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
--------------------------------------------------------------------------------
/GraphCodeBERT/clonedetection/parser/build.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
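# Compiles the tree-sitter grammars cloned by build.sh into a single shared
# library (my-languages.so) that the Python tree_sitter bindings load at runtime.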
3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/clonedetection/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/run.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19 | using a masked language modeling (MLM) loss.
20 | """
21 | 
22 | from __future__ import absolute_import, division, print_function
23 | 
24 | import argparse
25 | import glob
26 | import logging
27 | import os
28 | import pickle
29 | import random
30 | import re
31 | import shutil
32 | import json
33 | import numpy as np
34 | import torch
35 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
36 | from torch.utils.data.distributed import DistributedSampler
37 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
38 |                           RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
39 | from tqdm import tqdm, trange
40 | import multiprocessing
41 | from model import Model
42 | 
43 | cpu_cont = 16
44 | logger = logging.getLogger(__name__)
45 | 
46 | from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript
47 | from parser import (remove_comments_and_docstrings,
48 |                     tree_to_token_index,
49 |                     index_to_code_token,
50 |                     tree_to_variable_index)
51 | from tree_sitter import Language, Parser
52 | dfg_function={
53 |     'python':DFG_python,
54 |     'java':DFG_java,
55 |     'ruby':DFG_ruby,
56 |     'go':DFG_go,
57 |     'php':DFG_php,
58 |     'javascript':DFG_javascript
59 | }
60 | 
61 | #load parsers
62 | parsers={}
63 | for lang in dfg_function:
64 |     LANGUAGE = Language('parser/my-languages.so', lang)
65 |     parser = Parser()
66 |     parser.set_language(LANGUAGE)
67 |     parser = [parser,dfg_function[lang]]
68 |     parsers[lang]= parser
69 | 
70 | 
71 | #remove comments, tokenize code and extract dataflow
72 | def extract_dataflow(code, parser,lang):
73 |     #remove comments
74 |     try:
75 |         code=remove_comments_and_docstrings(code,lang)
76 |     except:
77 |         pass
78 |     #obtain dataflow
79 |     if lang=="php":
80 |         code="<?php"+code+"?>"  # tree-sitter's PHP grammar needs the <?php ... ?> wrapper
81 |     try:
82 |         tree = parser[0].parse(bytes(code,'utf8'))
83 |         root_node = tree.root_node
84 |         tokens_index=tree_to_token_index(root_node)
85 |         code=code.split('\n')
86 |         code_tokens=[index_to_code_token(x,code) for x in tokens_index]
87 |         index_to_code={}
88 |         for idx,(index,code) in enumerate(zip(tokens_index,code_tokens)):
89 |             index_to_code[index]=(idx,code)
90 |         try:
91 |             DFG,_=parser[1](root_node,index_to_code,{})
92 |         except:
93 |             DFG=[]
94 |         DFG=sorted(DFG,key=lambda x:x[1])
95 |         indexs=set()
96 |         for d in DFG:
97 |             if len(d[-1])!=0:
98 |                 indexs.add(d[1])
99 |             for x in d[-1]:
100 |                 indexs.add(x)
101 |         new_DFG=[]
102 |         for d in DFG:
103 |             if d[1] in indexs:
104 |                 new_DFG.append(d)
105 |         dfg=new_DFG
106 |     except:
107 |         dfg=[]
108 |     return code_tokens,dfg
109 | 
110 | class InputFeatures(object):
111 |     """A single training/test features for an example."""
112 |     def __init__(self,
113 |                  input_tokens_1,
114 |                  input_ids_1,
115 |                  position_idx_1,
116 |                  dfg_to_code_1,
117 |                  dfg_to_dfg_1,
118 |                  input_tokens_2,
119 |                  input_ids_2,
120 |                  position_idx_2,
121 | 
dfg_to_code_2, 122 | dfg_to_dfg_2, 123 | label, 124 | url1, 125 | url2 126 | 127 | ): 128 | #The first code function 129 | self.input_tokens_1 = input_tokens_1 130 | self.input_ids_1 = input_ids_1 131 | self.position_idx_1=position_idx_1 132 | self.dfg_to_code_1=dfg_to_code_1 133 | self.dfg_to_dfg_1=dfg_to_dfg_1 134 | 135 | #The second code function 136 | self.input_tokens_2 = input_tokens_2 137 | self.input_ids_2 = input_ids_2 138 | self.position_idx_2=position_idx_2 139 | self.dfg_to_code_2=dfg_to_code_2 140 | self.dfg_to_dfg_2=dfg_to_dfg_2 141 | 142 | #label 143 | self.label=label 144 | self.url1=url1 145 | self.url2=url2 146 | 147 | 148 | def convert_examples_to_features(item): 149 | #source 150 | url1,url2,label,tokenizer, args,cache,url_to_code=item 151 | parser=parsers['java'] 152 | 153 | for url in [url1,url2]: 154 | if url not in cache: 155 | func=url_to_code[url] 156 | 157 | #extract data flow 158 | code_tokens,dfg=extract_dataflow(func,parser,'java') 159 | code_tokens=[tokenizer.tokenize('@ '+x)[1:] if idx!=0 else tokenizer.tokenize(x) for idx,x in enumerate(code_tokens)] 160 | ori2cur_pos={} 161 | ori2cur_pos[-1]=(0,0) 162 | for i in range(len(code_tokens)): 163 | ori2cur_pos[i]=(ori2cur_pos[i-1][1],ori2cur_pos[i-1][1]+len(code_tokens[i])) 164 | code_tokens=[y for x in code_tokens for y in x] 165 | 166 | #truncating 167 | code_tokens=code_tokens[:args.code_length+args.data_flow_length-3-min(len(dfg),args.data_flow_length)][:512-3] 168 | source_tokens =[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token] 169 | source_ids = tokenizer.convert_tokens_to_ids(source_tokens) 170 | position_idx = [i+tokenizer.pad_token_id + 1 for i in range(len(source_tokens))] 171 | dfg=dfg[:args.code_length+args.data_flow_length-len(source_tokens)] 172 | source_tokens+=[x[0] for x in dfg] 173 | position_idx+=[0 for x in dfg] 174 | source_ids+=[tokenizer.unk_token_id for x in dfg] 175 | padding_length=args.code_length+args.data_flow_length-len(source_ids) 176 | position_idx+=[tokenizer.pad_token_id]*padding_length 177 | source_ids+=[tokenizer.pad_token_id]*padding_length 178 | 179 | #reindex 180 | reverse_index={} 181 | for idx,x in enumerate(dfg): 182 | reverse_index[x[1]]=idx 183 | for idx,x in enumerate(dfg): 184 | dfg[idx]=x[:-1]+([reverse_index[i] for i in x[-1] if i in reverse_index],) 185 | dfg_to_dfg=[x[-1] for x in dfg] 186 | dfg_to_code=[ori2cur_pos[x[1]] for x in dfg] 187 | length=len([tokenizer.cls_token]) 188 | dfg_to_code=[(x[0]+length,x[1]+length) for x in dfg_to_code] 189 | cache[url]=source_tokens,source_ids,position_idx,dfg_to_code,dfg_to_dfg 190 | 191 | 192 | source_tokens_1,source_ids_1,position_idx_1,dfg_to_code_1,dfg_to_dfg_1=cache[url1] 193 | source_tokens_2,source_ids_2,position_idx_2,dfg_to_code_2,dfg_to_dfg_2=cache[url2] 194 | return InputFeatures(source_tokens_1,source_ids_1,position_idx_1,dfg_to_code_1,dfg_to_dfg_1, 195 | source_tokens_2,source_ids_2,position_idx_2,dfg_to_code_2,dfg_to_dfg_2, 196 | label,url1,url2) 197 | 198 | class TextDataset(Dataset): 199 | def __init__(self, tokenizer, args, file_path='train'): 200 | self.examples = [] 201 | self.args=args 202 | index_filename=file_path 203 | 204 | #load index 205 | logger.info("Creating features from index file at %s ", index_filename) 206 | url_to_code={} 207 | with open('/'.join(index_filename.split('/')[:-1])+'/data.jsonl') as f: 208 | for line in f: 209 | line=line.strip() 210 | js=json.loads(line) 211 | url_to_code[js['idx']]=js['func'] 212 | 213 | #load code function according to index 214 | data=[] 215 | 
cache={}  # memoize features per function idx: each function appears in many pairs but is converted only once
216 | 
217 |         with open(index_filename) as f:
218 |             for line in f:
219 |                 line=line.strip()
220 |                 url1,url2,label=line.split('\t')
221 |                 if url1 not in url_to_code or url2 not in url_to_code:
222 |                     continue
223 |                 if label=='0':
224 |                     label=0
225 |                 else:
226 |                     label=1
227 |                 data.append((url1,url2,label,tokenizer, args,cache,url_to_code))
228 | 
229 |         #only use 10% valid data to keep best model
230 |         if 'valid' in file_path:
231 |             data=random.sample(data,int(len(data)*0.1))
232 | 
233 |         #convert example to input features
234 |         self.examples=[convert_examples_to_features(x) for x in tqdm(data,total=len(data))]
235 | 
236 |         if 'train' in file_path:
237 |             for idx, example in enumerate(self.examples[:3]):
238 |                 logger.info("*** Example ***")
239 |                 logger.info("idx: {}".format(idx))
240 |                 logger.info("label: {}".format(example.label))
241 |                 logger.info("input_tokens_1: {}".format([x.replace('\u0120','_') for x in example.input_tokens_1]))
242 |                 logger.info("input_ids_1: {}".format(' '.join(map(str, example.input_ids_1))))
243 |                 logger.info("position_idx_1: {}".format(example.position_idx_1))
244 |                 logger.info("dfg_to_code_1: {}".format(' '.join(map(str, example.dfg_to_code_1))))
245 |                 logger.info("dfg_to_dfg_1: {}".format(' '.join(map(str, example.dfg_to_dfg_1))))
246 | 
247 |                 logger.info("input_tokens_2: {}".format([x.replace('\u0120','_') for x in example.input_tokens_2]))
248 |                 logger.info("input_ids_2: {}".format(' '.join(map(str, example.input_ids_2))))
249 |                 logger.info("position_idx_2: {}".format(example.position_idx_2))
250 |                 logger.info("dfg_to_code_2: {}".format(' '.join(map(str, example.dfg_to_code_2))))
251 |                 logger.info("dfg_to_dfg_2: {}".format(' '.join(map(str, example.dfg_to_dfg_2))))
252 | 
253 | 
254 |     def __len__(self):
255 |         return len(self.examples)
256 | 
257 |     def __getitem__(self, item):
258 |         #calculate graph-guided masked function
259 |         attn_mask_1= np.zeros((self.args.code_length+self.args.data_flow_length,
260 |                                self.args.code_length+self.args.data_flow_length),dtype=bool)
261 |         #calculate begin index of node and max length of input
262 |         node_index=sum([i>1 for i in self.examples[item].position_idx_1])
263 |         max_length=sum([i!=1 for i in self.examples[item].position_idx_1])
264 |         #sequence can attend to sequence
265 |         attn_mask_1[:node_index,:node_index]=True
266 |         #special tokens attend to all tokens
267 |         for idx,i in enumerate(self.examples[item].input_ids_1):
268 |             if i in [0,2]:
269 |                 attn_mask_1[idx,:max_length]=True
270 |         #nodes attend to the code tokens they are identified from
271 |         for idx,(a,b) in enumerate(self.examples[item].dfg_to_code_1):
272 |             if a<node_index and b<node_index:
273 |                 attn_mask_1[idx+node_index,a:b]=True
274 |                 attn_mask_1[a:b,idx+node_index]=True
275 |         #nodes attend to adjacent nodes
276 |         for idx,nodes in enumerate(self.examples[item].dfg_to_dfg_1):
277 |             for a in nodes:
278 |                 if a+node_index<len(self.examples[item].position_idx_1):
279 |                     attn_mask_1[idx+node_index,a+node_index]=True
280 | 
281 |         #calculate graph-guided masked function
282 |         attn_mask_2= np.zeros((self.args.code_length+self.args.data_flow_length,
283 |                                self.args.code_length+self.args.data_flow_length),dtype=bool)
284 |         #calculate begin index of node and max length of input
285 |         node_index=sum([i>1 for i in self.examples[item].position_idx_2])
286 |         max_length=sum([i!=1 for i in self.examples[item].position_idx_2])
287 |         #sequence can attend to sequence
288 |         attn_mask_2[:node_index,:node_index]=True
289 |         #special tokens attend to all tokens
290 |         for idx,i in enumerate(self.examples[item].input_ids_2):
291 |             if i in [0,2]:
292 |                 attn_mask_2[idx,:max_length]=True
293 |         #nodes attend to the code tokens they are identified from
294 |         for idx,(a,b) in enumerate(self.examples[item].dfg_to_code_2):
295 |             if a<node_index and b<node_index:
296 |                 attn_mask_2[idx+node_index,a:b]=True
297 |                 attn_mask_2[a:b,idx+node_index]=True
298 |         #nodes attend to adjacent nodes
299 |         for idx,nodes in enumerate(self.examples[item].dfg_to_dfg_2):
300 |             for a in nodes:
301 |                 if a+node_index<len(self.examples[item].position_idx_2):
302 |                     attn_mask_2[idx+node_index,a+node_index]=True
303 | 
304 |         return (torch.tensor(self.examples[item].input_ids_1),
305 |                 torch.tensor(self.examples[item].position_idx_1),
306 |                 torch.tensor(attn_mask_1),
307 |                 torch.tensor(self.examples[item].input_ids_2),
308 |                 torch.tensor(self.examples[item].position_idx_2),
309 |                 torch.tensor(attn_mask_2),
310 |                 torch.tensor(self.examples[item].label))
311 | 
312 | 
313 | def set_seed(args):
314 |     random.seed(args.seed)
315 |     np.random.seed(args.seed)
316 |     torch.manual_seed(args.seed)
317 |     if args.n_gpu > 0:
318 |         torch.cuda.manual_seed_all(args.seed)
319 | 
320 | 
321 | def train(args, train_dataset, model, tokenizer):
322 |     """ Train the model """
323 | 
324 |     #build dataloader
325 |     train_sampler = RandomSampler(train_dataset)
326 |     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,num_workers=4)
327 | 
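    # Sanity check on the schedule derived below (assuming the README's
    # clone-detection settings: 901,028 training pairs, batch size 16, 1 epoch):
    # len(train_dataloader) is ~56,315 batches, so max_steps ~= 56,315,
    # save_steps ~= 5,631 (roughly ten evaluations per epoch) and
    # warmup_steps ~= 11,263 (20% of all optimization steps).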
args.max_steps=args.epochs*len( train_dataloader) 329 | args.save_steps=len( train_dataloader)//10 330 | args.warmup_steps=args.max_steps//5 331 | model.to(args.device) 332 | 333 | # Prepare optimizer and schedule (linear warmup and decay) 334 | no_decay = ['bias', 'LayerNorm.weight'] 335 | optimizer_grouped_parameters = [ 336 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 337 | 'weight_decay': args.weight_decay}, 338 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 339 | ] 340 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 341 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, 342 | num_training_steps=args.max_steps) 343 | 344 | # multi-gpu training 345 | if args.n_gpu > 1: 346 | model = torch.nn.DataParallel(model) 347 | 348 | # Train! 349 | logger.info("***** Running training *****") 350 | logger.info(" Num examples = %d", len(train_dataset)) 351 | logger.info(" Num Epochs = %d", args.epochs) 352 | logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size//args.n_gpu) 353 | logger.info(" Total train batch size = %d",args.train_batch_size*args.gradient_accumulation_steps) 354 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 355 | logger.info(" Total optimization steps = %d", args.max_steps) 356 | 357 | global_step=0 358 | tr_loss, logging_loss,avg_loss,tr_nb,tr_num,train_loss = 0.0, 0.0,0.0,0,0,0 359 | best_f1=0 360 | 361 | model.zero_grad() 362 | 363 | for idx in range(args.epochs): 364 | bar = tqdm(train_dataloader,total=len(train_dataloader)) 365 | tr_num=0 366 | train_loss=0 367 | for step, batch in enumerate(bar): 368 | (inputs_ids_1,position_idx_1,attn_mask_1, 369 | inputs_ids_2,position_idx_2,attn_mask_2, 370 | labels)=[x.to(args.device) for x in batch] 371 | model.train() 372 | loss,logits = model(inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels) 373 | 374 | if args.n_gpu > 1: 375 | loss = loss.mean() 376 | 377 | if args.gradient_accumulation_steps > 1: 378 | loss = loss / args.gradient_accumulation_steps 379 | 380 | loss.backward() 381 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 382 | 383 | tr_loss += loss.item() 384 | tr_num+=1 385 | train_loss+=loss.item() 386 | if avg_loss==0: 387 | avg_loss=tr_loss 388 | 389 | avg_loss=round(train_loss/tr_num,5) 390 | bar.set_description("epoch {} loss {}".format(idx,avg_loss)) 391 | 392 | if (step + 1) % args.gradient_accumulation_steps == 0: 393 | optimizer.step() 394 | optimizer.zero_grad() 395 | scheduler.step() 396 | global_step += 1 397 | output_flag=True 398 | avg_loss=round(np.exp((tr_loss - logging_loss) /(global_step- tr_nb)),4) 399 | 400 | if global_step % args.save_steps == 0: 401 | results = evaluate(args, model, tokenizer, eval_when_training=True) 402 | 403 | # Save model checkpoint 404 | if results['eval_f1']>best_f1: 405 | best_f1=results['eval_f1'] 406 | logger.info(" "+"*"*20) 407 | logger.info(" Best f1:%s",round(best_f1,4)) 408 | logger.info(" "+"*"*20) 409 | 410 | checkpoint_prefix = 'checkpoint-best-f1' 411 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 412 | if not os.path.exists(output_dir): 413 | os.makedirs(output_dir) 414 | model_to_save = model.module if hasattr(model,'module') else model 415 | output_dir = os.path.join(output_dir, '{}'.format('model.bin')) 416 | 
torch.save(model_to_save.state_dict(), output_dir) 417 | logger.info("Saving model checkpoint to %s", output_dir) 418 | 419 | def evaluate(args, model, tokenizer, eval_when_training=False): 420 | #build dataloader 421 | eval_dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file) 422 | eval_sampler = SequentialSampler(eval_dataset) 423 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,batch_size=args.eval_batch_size,num_workers=4) 424 | 425 | # multi-gpu evaluate 426 | if args.n_gpu > 1 and eval_when_training is False: 427 | model = torch.nn.DataParallel(model) 428 | 429 | # Eval! 430 | logger.info("***** Running evaluation *****") 431 | logger.info(" Num examples = %d", len(eval_dataset)) 432 | logger.info(" Batch size = %d", args.eval_batch_size) 433 | 434 | eval_loss = 0.0 435 | nb_eval_steps = 0 436 | model.eval() 437 | logits=[] 438 | y_trues=[] 439 | for batch in eval_dataloader: 440 | (inputs_ids_1,position_idx_1,attn_mask_1, 441 | inputs_ids_2,position_idx_2,attn_mask_2, 442 | labels)=[x.to(args.device) for x in batch] 443 | with torch.no_grad(): 444 | lm_loss,logit = model(inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels) 445 | eval_loss += lm_loss.mean().item() 446 | logits.append(logit.cpu().numpy()) 447 | y_trues.append(labels.cpu().numpy()) 448 | nb_eval_steps += 1 449 | 450 | #calculate scores 451 | logits=np.concatenate(logits,0) 452 | y_trues=np.concatenate(y_trues,0) 453 | best_threshold=0.5 454 | best_f1=0 455 | 456 | y_preds=logits[:,1]>best_threshold 457 | from sklearn.metrics import recall_score 458 | recall=recall_score(y_trues, y_preds, average='macro') 459 | from sklearn.metrics import precision_score 460 | precision=precision_score(y_trues, y_preds, average='macro') 461 | from sklearn.metrics import f1_score 462 | f1=f1_score(y_trues, y_preds, average='macro') 463 | result = { 464 | "eval_recall": float(recall), 465 | "eval_precision": float(precision), 466 | "eval_f1": float(f1), 467 | "eval_threshold":best_threshold, 468 | 469 | } 470 | 471 | logger.info("***** Eval results *****") 472 | for key in sorted(result.keys()): 473 | logger.info(" %s = %s", key, str(round(result[key],4))) 474 | 475 | return result 476 | 477 | def test(args, model, tokenizer, best_threshold=0): 478 | #build dataloader 479 | eval_dataset = TextDataset(tokenizer, args, file_path=args.test_data_file) 480 | eval_sampler = SequentialSampler(eval_dataset) 481 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,num_workers=4) 482 | 483 | # multi-gpu evaluate 484 | if args.n_gpu > 1: 485 | model = torch.nn.DataParallel(model) 486 | 487 | # Eval! 
488 | logger.info("***** Running Test *****") 489 | logger.info(" Num examples = %d", len(eval_dataset)) 490 | logger.info(" Batch size = %d", args.eval_batch_size) 491 | eval_loss = 0.0 492 | nb_eval_steps = 0 493 | model.eval() 494 | logits=[] 495 | y_trues=[] 496 | for batch in eval_dataloader: 497 | (inputs_ids_1,position_idx_1,attn_mask_1, 498 | inputs_ids_2,position_idx_2,attn_mask_2, 499 | labels)=[x.to(args.device) for x in batch] 500 | with torch.no_grad(): 501 | lm_loss,logit = model(inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels) 502 | eval_loss += lm_loss.mean().item() 503 | logits.append(logit.cpu().numpy()) 504 | y_trues.append(labels.cpu().numpy()) 505 | nb_eval_steps += 1 506 | 507 | #output result 508 | logits=np.concatenate(logits,0) 509 | y_preds=logits[:,1]>best_threshold 510 | with open(os.path.join(args.output_dir,"predictions.txt"),'w') as f: 511 | for example,pred in zip(eval_dataset.examples,y_preds): 512 | if pred: 513 | f.write(example.url1+'\t'+example.url2+'\t'+'1'+'\n') 514 | else: 515 | f.write(example.url1+'\t'+example.url2+'\t'+'0'+'\n') 516 | 517 | def main(): 518 | parser = argparse.ArgumentParser() 519 | 520 | ## Required parameters 521 | parser.add_argument("--train_data_file", default=None, type=str, required=True, 522 | help="The input training data file (a text file).") 523 | parser.add_argument("--output_dir", default=None, type=str, required=True, 524 | help="The output directory where the model predictions and checkpoints will be written.") 525 | 526 | ## Other parameters 527 | parser.add_argument("--eval_data_file", default=None, type=str, 528 | help="An optional input evaluation data file to evaluate the perplexity on (a text file).") 529 | parser.add_argument("--test_data_file", default=None, type=str, 530 | help="An optional input evaluation data file to evaluate the perplexity on (a text file).") 531 | 532 | parser.add_argument("--model_name_or_path", default=None, type=str, 533 | help="The model checkpoint for weights initialization.") 534 | 535 | parser.add_argument("--config_name", default="", type=str, 536 | help="Optional pretrained config name or path if not the same as model_name_or_path") 537 | parser.add_argument("--tokenizer_name", default="", type=str, 538 | help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") 539 | 540 | parser.add_argument("--code_length", default=256, type=int, 541 | help="Optional Code input sequence length after tokenization.") 542 | parser.add_argument("--data_flow_length", default=64, type=int, 543 | help="Optional Data Flow input sequence length after tokenization.") 544 | parser.add_argument("--do_train", action='store_true', 545 | help="Whether to run training.") 546 | parser.add_argument("--do_eval", action='store_true', 547 | help="Whether to run eval on the dev set.") 548 | parser.add_argument("--do_test", action='store_true', 549 | help="Whether to run eval on the dev set.") 550 | parser.add_argument("--evaluate_during_training", action='store_true', 551 | help="Run evaluation during training at each logging step.") 552 | 553 | parser.add_argument("--train_batch_size", default=4, type=int, 554 | help="Batch size per GPU/CPU for training.") 555 | parser.add_argument("--eval_batch_size", default=4, type=int, 556 | help="Batch size per GPU/CPU for evaluation.") 557 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 558 | help="Number of updates steps to accumulate before performing a backward/update 
pass.") 559 | parser.add_argument("--learning_rate", default=5e-5, type=float, 560 | help="The initial learning rate for Adam.") 561 | parser.add_argument("--weight_decay", default=0.0, type=float, 562 | help="Weight deay if we apply some.") 563 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 564 | help="Epsilon for Adam optimizer.") 565 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 566 | help="Max gradient norm.") 567 | parser.add_argument("--max_steps", default=-1, type=int, 568 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 569 | parser.add_argument("--warmup_steps", default=0, type=int, 570 | help="Linear warmup over warmup_steps.") 571 | 572 | parser.add_argument('--seed', type=int, default=42, 573 | help="random seed for initialization") 574 | parser.add_argument('--epochs', type=int, default=1, 575 | help="training epochs") 576 | 577 | args = parser.parse_args() 578 | 579 | # Setup CUDA, GPU 580 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 581 | args.n_gpu = torch.cuda.device_count() 582 | 583 | args.device = device 584 | 585 | # Setup logging 586 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO) 587 | logger.warning("device: %s, n_gpu: %s",device, args.n_gpu,) 588 | 589 | 590 | # Set seed 591 | set_seed(args) 592 | config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) 593 | config.num_labels=1 594 | tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name) 595 | model = RobertaForSequenceClassification.from_pretrained(args.model_name_or_path,config=config) 596 | 597 | model=Model(model,config,tokenizer,args) 598 | logger.info("Training/evaluation parameters %s", args) 599 | # Training 600 | if args.do_train: 601 | train_dataset = TextDataset(tokenizer, args, file_path=args.train_data_file) 602 | train(args, train_dataset, model, tokenizer) 603 | 604 | # Evaluation 605 | results = {} 606 | if args.do_eval: 607 | checkpoint_prefix = 'checkpoint-best-f1/model.bin' 608 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 609 | model.load_state_dict(torch.load(output_dir)) 610 | model.to(args.device) 611 | result=evaluate(args, model, tokenizer) 612 | 613 | if args.do_test: 614 | checkpoint_prefix = 'checkpoint-best-f1/model.bin' 615 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 616 | model.load_state_dict(torch.load(output_dir)) 617 | model.to(args.device) 618 | test(args, model, tokenizer,best_threshold=0.5) 619 | 620 | return results 621 | 622 | 623 | if __name__ == "__main__": 624 | main() 625 | 626 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Code Search 4 | 5 | ## Data Preprocess 6 | 7 | Different from the setting of [CodeSearchNet](husain2019codesearchnet), the answer of each query is retrieved from the whole development and testing code corpus instead of 1,000 candidate codes. Besides, we observe that some queries contain content unrelated to the code, such as a link ``http://..." that refers to external resources. Therefore, we filter following examples to improve the quality of the dataset. 
8 | 
9 | - Remove comments in the code
10 | 
11 | - Remove examples whose code cannot be parsed into an abstract syntax tree.
12 | 
13 | - Remove examples whose documents have fewer than 3 or more than 256 tokens.
14 | 
15 | - Remove examples whose documents contain special tokens (e.g. `<img ...>` or https:...).
16 | 
17 | - Remove examples whose documents are not in English.
18 | 
19 | Data statistics of the cleaned dataset for code search are shown in the table below.
20 | 
21 | | PL         | Training | Dev    | Test   | Candidate Codes |
22 | | :--------- | :------: | :----: | :----: | :-------------: |
23 | | Python     | 251,820  | 13,914 | 14,918 |     43,827      |
24 | | PHP        | 241,241  | 12,982 | 14,014 |     52,660      |
25 | | Go         | 167,288  | 7,325  | 8,122  |     28,120      |
26 | | Java       | 164,923  | 5,183  | 10,955 |     40,347      |
27 | | JavaScript | 58,025   | 3,885  | 3,291  |     13,981      |
28 | | Ruby       | 24,927   | 1,400  | 1,261  |     4,360       |
29 | 
30 | You can download and preprocess the data with the following commands.
31 | ```shell
32 | unzip dataset.zip
33 | cd dataset
34 | bash run.sh
35 | cd ..
36 | ```
37 | 
38 | ## Dependency
39 | 
40 | - pip install torch
41 | - pip install transformers
42 | - pip install tree_sitter
43 | 
44 | ### Tree-sitter (optional)
45 | 
46 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands:
47 | 
48 | ```shell
49 | cd parser
50 | bash build.sh
51 | cd ..
52 | ```
53 | 
54 | ## Fine-Tune
55 | 
56 | We fine-tuned the model on 2 V100-16G GPUs.
57 | ```shell
58 | lang=ruby
59 | mkdir -p ./saved_models/$lang
60 | python run.py \
61 |     --output_dir=./saved_models/$lang \
62 |     --config_name=microsoft/graphcodebert-base \
63 |     --model_name_or_path=microsoft/graphcodebert-base \
64 |     --tokenizer_name=microsoft/graphcodebert-base \
65 |     --lang=$lang \
66 |     --do_train \
67 |     --train_data_file=dataset/$lang/train.jsonl \
68 |     --eval_data_file=dataset/$lang/valid.jsonl \
69 |     --test_data_file=dataset/$lang/test.jsonl \
70 |     --codebase_file=dataset/$lang/codebase.jsonl \
71 |     --num_train_epochs 10 \
72 |     --code_length 256 \
73 |     --data_flow_length 64 \
74 |     --nl_length 128 \
75 |     --train_batch_size 32 \
76 |     --eval_batch_size 64 \
77 |     --learning_rate 2e-5 \
78 |     --seed 123456 2>&1| tee saved_models/$lang/train.log
79 | ```
80 | ## Inference and Evaluation
81 | 
82 | ```shell
83 | lang=ruby
84 | python run.py \
85 |     --output_dir=./saved_models/$lang \
86 |     --config_name=microsoft/graphcodebert-base \
87 |     --model_name_or_path=microsoft/graphcodebert-base \
88 |     --tokenizer_name=microsoft/graphcodebert-base \
89 |     --lang=$lang \
90 |     --do_eval \
91 |     --do_test \
92 |     --train_data_file=dataset/$lang/train.jsonl \
93 |     --eval_data_file=dataset/$lang/valid.jsonl \
94 |     --test_data_file=dataset/$lang/test.jsonl \
95 |     --codebase_file=dataset/$lang/codebase.jsonl \
96 |     --num_train_epochs 10 \
97 |     --code_length 256 \
98 |     --data_flow_length 64 \
99 |     --nl_length 128 \
100 |     --train_batch_size 32 \
101 |     --eval_batch_size 64 \
102 |     --learning_rate 2e-5 \
103 |     --seed 123456 2>&1| tee saved_models/$lang/test.log
104 | ```
105 | 
106 | ## Results
107 | 
108 | The results on the filtered dataset are shown in the table below:
109 | 
110 | | Model          | Ruby      | Javascript | Go        | Python    | Java      | PHP       | Overall   |
111 | | -------------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
112 | | NBow           | 0.162     | 0.157      | 0.330     | 0.161     | 0.171     | 0.152     | 0.189     |
113 | | CNN            | 0.276     | 0.224      | 0.680     | 0.242     | 0.263     | 0.260     | 0.324     |
114 | | BiRNN          | 0.213     | 0.193      | 0.688     | 0.290     | 0.304     | 0.338     | 0.338     |
115 | | 
SelfAtt | 0.275 | 0.287 | 0.723 | 0.398 | 0.404 | 0.426 | 0.419 | 116 | | RoBERTa | 0.587 | 0.517 | 0.850 | 0.587 | 0.599 | 0.560 | 0.617 | 117 | | RoBERTa (code) | 0.628 | 0.562 | 0.859 | 0.610 | 0.620 | 0.579 | 0.643 | 118 | | CodeBERT | 0.679 | 0.620 | 0.882 | 0.672 | 0.676 | 0.628 | 0.693 | 119 | | GraphCodeBERT | **0.703** | **0.644** | **0.897** | **0.692** | **0.691** | **0.649** | **0.713** | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/codesearch/dataset.zip -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, attn_mask=None,position_idx=None, nl_inputs=None): 11 | if code_inputs is not None: 12 | nodes_mask=position_idx.eq(0) 13 | token_mask=position_idx.ge(2) 14 | inputs_embeddings=self.encoder.embeddings.word_embeddings(code_inputs) 15 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 16 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 17 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 18 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 19 | return self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx)[1] 20 | else: 21 | return self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[1] 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/codesearch/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/run.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Fine-tuning GraphCodeBERT (a RoBERTa-style encoder) for natural language code search.
18 | A query and a code snippet (augmented with data-flow nodes) are encoded into vectors, and the model
19 | is trained with a cross-entropy loss over in-batch negatives (each query scored against all codes in the batch).
20 | """
21 | 
22 | import argparse
23 | import logging
24 | import os
25 | import pickle
26 | import random
27 | import torch
28 | import json
29 | import numpy as np
30 | from model import Model
31 | from torch.nn import CrossEntropyLoss, MSELoss
32 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
33 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
34 | RobertaConfig, RobertaModel, RobertaTokenizer)
35 | 
36 | logger = logging.getLogger(__name__)
37 | 
38 | from tqdm import tqdm, trange
39 | import multiprocessing
40 | cpu_cont = 16
41 | 
42 | from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript
43 | from parser import (remove_comments_and_docstrings,
44 | tree_to_token_index,
45 | index_to_code_token,
46 | tree_to_variable_index)
47 | from tree_sitter import Language, Parser
48 | dfg_function={
49 | 'python':DFG_python,
50 | 'java':DFG_java,
51 | 'ruby':DFG_ruby,
52 | 'go':DFG_go,
53 | 'php':DFG_php,
54 | 'javascript':DFG_javascript
55 | }
56 | 
57 | #load parsers
58 | parsers={}
59 | for lang in dfg_function:
60 | LANGUAGE = Language('parser/my-languages.so', lang)
61 | parser = Parser()
62 | parser.set_language(LANGUAGE)
63 | parser = [parser,dfg_function[lang]]
64 | parsers[lang]= parser
65 | 
66 | 
67 | #remove comments, tokenize code and extract dataflow
68 | def extract_dataflow(code, parser,lang):
69 | #remove comments
70 | try:
71 | code=remove_comments_and_docstrings(code,lang)
72 | except:
73 | pass
74 | #obtain dataflow
75 | if lang=="php":
76 | code="<?php "+code+"?>" #tree-sitter's PHP grammar only parses code inside the opening tag
77 | try:
78 | tree = parser[0].parse(bytes(code,'utf8'))
79 | root_node = tree.root_node
80 | tokens_index=tree_to_token_index(root_node)
81 | code=code.split('\n')
82 | code_tokens=[index_to_code_token(x,code) for x in tokens_index]
83 | index_to_code={}
84 | for idx,(index,code) in enumerate(zip(tokens_index,code_tokens)):
85 | index_to_code[index]=(idx,code)
86 | try:
87 | DFG,_=parser[1](root_node,index_to_code,{})
88 | except:
89 | DFG=[]
90 | DFG=sorted(DFG,key=lambda x:x[1])
91 | indexs=set()
92 | for d in DFG:
93 | if len(d[-1])!=0:
94 | indexs.add(d[1])
95 | for x in d[-1]:
96 | indexs.add(x)
97 | new_DFG=[]
98 | for d in DFG:
99 | if d[1] in indexs:
100 | new_DFG.append(d)
101 | dfg=new_DFG
102 | except:
103 | dfg=[]
104 | return code_tokens,dfg
105 | 
106 | class InputFeatures(object):
107 | """A single training/test features for an example."""
108 | def __init__(self,
109 | code_tokens,
110 | code_ids,
111 | position_idx,
112 | dfg_to_code,
113 | dfg_to_dfg,
114 | nl_tokens,
115 | nl_ids,
116 | url,
117 | 
118 | ):
119 | self.code_tokens = code_tokens
120 | self.code_ids = code_ids
121 | self.position_idx=position_idx
122 | self.dfg_to_code=dfg_to_code
123 | self.dfg_to_dfg=dfg_to_dfg
124 | self.nl_tokens
= nl_tokens 125 | self.nl_ids = nl_ids 126 | self.url=url 127 | 128 | 129 | def convert_examples_to_features(item): 130 | js,tokenizer,args=item 131 | #code 132 | parser=parsers[args.lang] 133 | #extract data flow 134 | code_tokens,dfg=extract_dataflow(js['original_string'],parser,args.lang) 135 | code_tokens=[tokenizer.tokenize('@ '+x)[1:] if idx!=0 else tokenizer.tokenize(x) for idx,x in enumerate(code_tokens)] 136 | ori2cur_pos={} 137 | ori2cur_pos[-1]=(0,0) 138 | for i in range(len(code_tokens)): 139 | ori2cur_pos[i]=(ori2cur_pos[i-1][1],ori2cur_pos[i-1][1]+len(code_tokens[i])) 140 | code_tokens=[y for x in code_tokens for y in x] 141 | #truncating 142 | code_tokens=code_tokens[:args.code_length+args.data_flow_length-2-min(len(dfg),args.data_flow_length)] 143 | code_tokens =[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token] 144 | code_ids = tokenizer.convert_tokens_to_ids(code_tokens) 145 | position_idx = [i+tokenizer.pad_token_id + 1 for i in range(len(code_tokens))] 146 | dfg=dfg[:args.code_length+args.data_flow_length-len(code_tokens)] 147 | code_tokens+=[x[0] for x in dfg] 148 | position_idx+=[0 for x in dfg] 149 | code_ids+=[tokenizer.unk_token_id for x in dfg] 150 | padding_length=args.code_length+args.data_flow_length-len(code_ids) 151 | position_idx+=[tokenizer.pad_token_id]*padding_length 152 | code_ids+=[tokenizer.pad_token_id]*padding_length 153 | #reindex 154 | reverse_index={} 155 | for idx,x in enumerate(dfg): 156 | reverse_index[x[1]]=idx 157 | for idx,x in enumerate(dfg): 158 | dfg[idx]=x[:-1]+([reverse_index[i] for i in x[-1] if i in reverse_index],) 159 | dfg_to_dfg=[x[-1] for x in dfg] 160 | dfg_to_code=[ori2cur_pos[x[1]] for x in dfg] 161 | length=len([tokenizer.cls_token]) 162 | dfg_to_code=[(x[0]+length,x[1]+length) for x in dfg_to_code] 163 | #nl 164 | nl=' '.join(js['docstring_tokens']) 165 | nl_tokens=tokenizer.tokenize(nl)[:args.nl_length-2] 166 | nl_tokens =[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token] 167 | nl_ids = tokenizer.convert_tokens_to_ids(nl_tokens) 168 | padding_length = args.nl_length - len(nl_ids) 169 | nl_ids+=[tokenizer.pad_token_id]*padding_length 170 | 171 | return InputFeatures(code_tokens,code_ids,position_idx,dfg_to_code,dfg_to_dfg,nl_tokens,nl_ids,js['url']) 172 | 173 | class TextDataset(Dataset): 174 | def __init__(self, tokenizer, args, file_path=None,pool=None): 175 | self.args=args 176 | prefix=file_path.split('/')[-1][:-6] 177 | cache_file=args.output_dir+'/'+prefix+'.pkl' 178 | if os.path.exists(cache_file): 179 | self.examples=pickle.load(open(cache_file,'rb')) 180 | else: 181 | self.examples = [] 182 | data=[] 183 | with open(file_path) as f: 184 | for line in f: 185 | line=line.strip() 186 | js=json.loads(line) 187 | data.append((js,tokenizer,args)) 188 | self.examples=pool.map(convert_examples_to_features, tqdm(data,total=len(data))) 189 | pickle.dump(self.examples,open(cache_file,'wb')) 190 | 191 | if 'train' in file_path: 192 | for idx, example in enumerate(self.examples[:3]): 193 | logger.info("*** Example ***") 194 | logger.info("idx: {}".format(idx)) 195 | logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens])) 196 | logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids)))) 197 | logger.info("position_idx: {}".format(example.position_idx)) 198 | logger.info("dfg_to_code: {}".format(' '.join(map(str, example.dfg_to_code)))) 199 | logger.info("dfg_to_dfg: {}".format(' '.join(map(str, example.dfg_to_dfg)))) 200 | logger.info("nl_tokens: 
{}".format([x.replace('\u0120','_') for x in example.nl_tokens])) 201 | logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids)))) 202 | 203 | def __len__(self): 204 | return len(self.examples) 205 | 206 | def __getitem__(self, item): 207 | #calculate graph-guided masked function 208 | attn_mask=np.zeros((self.args.code_length+self.args.data_flow_length, 209 | self.args.code_length+self.args.data_flow_length),dtype=np.bool) 210 | #calculate begin index of node and max length of input 211 | node_index=sum([i>1 for i in self.examples[item].position_idx]) 212 | max_length=sum([i!=1 for i in self.examples[item].position_idx]) 213 | #sequence can attend to sequence 214 | attn_mask[:node_index,:node_index]=True 215 | #special tokens attend to all tokens 216 | for idx,i in enumerate(self.examples[item].code_ids): 217 | if i in [0,2]: 218 | attn_mask[idx,:max_length]=True 219 | #nodes attend to code tokens that are identified from 220 | for idx,(a,b) in enumerate(self.examples[item].dfg_to_code): 221 | if a 1: 258 | model = torch.nn.DataParallel(model) 259 | 260 | # Train! 261 | logger.info("***** Running training *****") 262 | logger.info(" Num examples = %d", len(train_dataset)) 263 | logger.info(" Num Epochs = %d", args.num_train_epochs) 264 | logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size//args.n_gpu) 265 | logger.info(" Total train batch size = %d", args.train_batch_size) 266 | logger.info(" Total optimization steps = %d", len(train_dataloader)*args.num_train_epochs) 267 | 268 | # model.resize_token_embeddings(len(tokenizer)) 269 | model.zero_grad() 270 | 271 | model.train() 272 | tr_num,tr_loss,best_mrr=0,0,0 273 | for idx in range(args.num_train_epochs): 274 | for step,batch in enumerate(train_dataloader): 275 | #get inputs 276 | code_inputs = batch[0].to(args.device) 277 | attn_mask = batch[1].to(args.device) 278 | position_idx = batch[2].to(args.device) 279 | nl_inputs = batch[3].to(args.device) 280 | #get code and nl vectors 281 | code_vec = model(code_inputs=code_inputs,attn_mask=attn_mask,position_idx=position_idx) 282 | nl_vec = model(nl_inputs=nl_inputs) 283 | 284 | #calculate scores and loss 285 | scores=torch.einsum("ab,cb->ac",nl_vec,code_vec) 286 | loss_fct = CrossEntropyLoss() 287 | loss = loss_fct(scores, torch.arange(code_inputs.size(0), device=scores.device)) 288 | 289 | #report loss 290 | tr_loss += loss.item() 291 | tr_num+=1 292 | if (step+1)% 100==0: 293 | logger.info("epoch {} step {} loss {}".format(idx,step+1,round(tr_loss/tr_num,5))) 294 | tr_loss=0 295 | tr_num=0 296 | 297 | #backward 298 | loss.backward() 299 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 300 | optimizer.step() 301 | optimizer.zero_grad() 302 | scheduler.step() 303 | 304 | #evaluate 305 | results = evaluate(args, model, tokenizer,args.eval_data_file, pool, eval_when_training=True) 306 | for key, value in results.items(): 307 | logger.info(" %s = %s", key, round(value,4)) 308 | 309 | #save best model 310 | if results['eval_mrr']>best_mrr: 311 | best_mrr=results['eval_mrr'] 312 | logger.info(" "+"*"*20) 313 | logger.info(" Best mrr:%s",round(best_mrr,4)) 314 | logger.info(" "+"*"*20) 315 | 316 | checkpoint_prefix = 'checkpoint-best-mrr' 317 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 318 | if not os.path.exists(output_dir): 319 | os.makedirs(output_dir) 320 | model_to_save = model.module if hasattr(model,'module') else model 321 | output_dir = os.path.join(output_dir, '{}'.format('model.bin')) 322 | 
torch.save(model_to_save.state_dict(), output_dir)
323 | logger.info("Saving model checkpoint to %s", output_dir)
324 | 
325 | 
326 | def evaluate(args, model, tokenizer,file_name,pool, eval_when_training=False):
327 | query_dataset = TextDataset(tokenizer, args, file_name, pool)
328 | query_sampler = SequentialSampler(query_dataset)
329 | query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=args.eval_batch_size,num_workers=4)
330 | 
331 | code_dataset = TextDataset(tokenizer, args, args.codebase_file, pool)
332 | code_sampler = SequentialSampler(code_dataset)
333 | code_dataloader = DataLoader(code_dataset, sampler=code_sampler, batch_size=args.eval_batch_size,num_workers=4)
334 | 
335 | # multi-gpu evaluate
336 | if args.n_gpu > 1 and eval_when_training is False:
337 | model = torch.nn.DataParallel(model)
338 | 
339 | # Eval!
340 | logger.info("***** Running evaluation *****")
341 | logger.info(" Num queries = %d", len(query_dataset))
342 | logger.info(" Num codes = %d", len(code_dataset))
343 | logger.info(" Batch size = %d", args.eval_batch_size)
344 | 
345 | 
346 | model.eval()
347 | code_vecs=[]
348 | nl_vecs=[]
349 | for batch in query_dataloader:
350 | nl_inputs = batch[3].to(args.device)
351 | with torch.no_grad():
352 | nl_vec = model(nl_inputs=nl_inputs)
353 | nl_vecs.append(nl_vec.cpu().numpy())
354 | 
355 | for batch in code_dataloader:
356 | code_inputs = batch[0].to(args.device)
357 | attn_mask = batch[1].to(args.device)
358 | position_idx =batch[2].to(args.device)
359 | with torch.no_grad():
360 | code_vec= model(code_inputs=code_inputs, attn_mask=attn_mask,position_idx=position_idx)
361 | code_vecs.append(code_vec.cpu().numpy())
362 | model.train()
363 | code_vecs=np.concatenate(code_vecs,0)
364 | nl_vecs=np.concatenate(nl_vecs,0)
365 | 
366 | scores=np.matmul(nl_vecs,code_vecs.T)
367 | 
368 | sort_ids=np.argsort(scores, axis=-1, kind='quicksort', order=None)[:,::-1]
369 | 
370 | nl_urls=[]
371 | code_urls=[]
372 | for example in query_dataset.examples:
373 | nl_urls.append(example.url)
374 | 
375 | for example in code_dataset.examples:
376 | code_urls.append(example.url)
377 | 
378 | ranks=[]
379 | for url, sort_id in zip(nl_urls,sort_ids):
380 | rank=0
381 | find=False
382 | for idx in sort_id[:1000]:
383 | if find is False:
384 | rank+=1
385 | if code_urls[idx]==url:
386 | find=True
387 | if find:
388 | ranks.append(1/rank)
389 | else:
390 | ranks.append(0)
391 | 
392 | result = {
393 | "eval_mrr":float(np.mean(ranks))
394 | }
395 | 
396 | return result
397 | 
398 | 
399 | 
400 | def main():
401 | parser = argparse.ArgumentParser()
402 | 
403 | ## Required parameters
404 | parser.add_argument("--train_data_file", default=None, type=str, required=True,
405 | help="The input training data file (a jsonl file).")
406 | parser.add_argument("--output_dir", default=None, type=str, required=True,
407 | help="The output directory where the model predictions and checkpoints will be written.")
408 | parser.add_argument("--eval_data_file", default=None, type=str,
409 | help="An optional input evaluation data file to evaluate the MRR (a jsonl file).")
410 | parser.add_argument("--test_data_file", default=None, type=str,
411 | help="An optional input test data file to test the MRR (a jsonl file).")
412 | parser.add_argument("--codebase_file", default=None, type=str,
413 | help="An optional input file with the code candidates to retrieve from (a jsonl file).")
414 | 
415 | parser.add_argument("--lang", default=None, type=str,
416 | help="Programming language of the dataset.")
417 | 
418 | 
parser.add_argument("--model_name_or_path", default=None, type=str, 419 | help="The model checkpoint for weights initialization.") 420 | parser.add_argument("--config_name", default="", type=str, 421 | help="Optional pretrained config name or path if not the same as model_name_or_path") 422 | parser.add_argument("--tokenizer_name", default="", type=str, 423 | help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") 424 | 425 | parser.add_argument("--nl_length", default=128, type=int, 426 | help="Optional NL input sequence length after tokenization.") 427 | parser.add_argument("--code_length", default=256, type=int, 428 | help="Optional Code input sequence length after tokenization.") 429 | parser.add_argument("--data_flow_length", default=64, type=int, 430 | help="Optional Data Flow input sequence length after tokenization.") 431 | 432 | parser.add_argument("--do_train", action='store_true', 433 | help="Whether to run training.") 434 | parser.add_argument("--do_eval", action='store_true', 435 | help="Whether to run eval on the dev set.") 436 | parser.add_argument("--do_test", action='store_true', 437 | help="Whether to run eval on the test set.") 438 | 439 | 440 | parser.add_argument("--train_batch_size", default=4, type=int, 441 | help="Batch size for training.") 442 | parser.add_argument("--eval_batch_size", default=4, type=int, 443 | help="Batch size for evaluation.") 444 | parser.add_argument("--learning_rate", default=5e-5, type=float, 445 | help="The initial learning rate for Adam.") 446 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 447 | help="Max gradient norm.") 448 | parser.add_argument("--num_train_epochs", default=1, type=int, 449 | help="Total number of training epochs to perform.") 450 | 451 | parser.add_argument('--seed', type=int, default=42, 452 | help="random seed for initialization") 453 | 454 | pool = multiprocessing.Pool(cpu_cont) 455 | 456 | #print arguments 457 | args = parser.parse_args() 458 | 459 | #set log 460 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 461 | datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO ) 462 | #set device 463 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 464 | args.n_gpu = torch.cuda.device_count() 465 | args.device = device 466 | logger.info("device: %s, n_gpu: %s",device, args.n_gpu) 467 | 468 | # Set seed 469 | set_seed(args.seed) 470 | 471 | #build model 472 | config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) 473 | tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name) 474 | model = RobertaModel.from_pretrained(args.model_name_or_path) 475 | model=Model(model) 476 | logger.info("Training/evaluation parameters %s", args) 477 | model.to(args.device) 478 | 479 | # Training 480 | if args.do_train: 481 | train(args, model, tokenizer, pool) 482 | 483 | # Evaluation 484 | results = {} 485 | if args.do_eval: 486 | checkpoint_prefix = 'checkpoint-best-mrr/model.bin' 487 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 488 | model.load_state_dict(torch.load(output_dir),strict=False) 489 | model.to(args.device) 490 | result=evaluate(args, model, tokenizer,args.eval_data_file, pool) 491 | logger.info("***** Eval results *****") 492 | for key in sorted(result.keys()): 493 | logger.info(" %s = %s", key, str(round(result[key],4))) 494 | 495 | if args.do_test: 496 | checkpoint_prefix = 'checkpoint-best-mrr/model.bin' 497 | output_dir = 
os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
498 | model.load_state_dict(torch.load(output_dir),strict=False)
499 | model.to(args.device)
500 | result=evaluate(args, model, tokenizer,args.test_data_file, pool)
501 | logger.info("***** Test results *****")
502 | for key in sorted(result.keys()):
503 | logger.info(" %s = %s", key, str(round(result[key],4)))
504 | 
505 | return results
506 | 
507 | 
508 | if __name__ == "__main__":
509 | main()
510 | 
511 | 
512 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/README.md:
--------------------------------------------------------------------------------
1 | # Code Refinement
2 | 
3 | ## Task Definition
4 | 
5 | Code refinement aims to automatically fix bugs in code, which can help reduce the cost of bug-fixing for developers.
6 | In CodeXGLUE, given a piece of Java code with bugs, the task is to remove the bugs and output the refined code.
7 | Models are evaluated by BLEU score and accuracy (exact match).
8 | 
9 | ## Dataset
10 | 
11 | We use the dataset released by this paper (https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one.
12 | All function and variable names are normalized. The dataset contains two subsets (i.e., small and medium) based on function length.
13 | 
14 | ### Data Format
15 | 
16 | The dataset is in the "data" folder. Each line of the files is a function. You can get the data using the following command:
17 | 
18 | ```
19 | unzip data.zip
20 | ```
21 | 
22 | ### Data Statistics
23 | 
24 | Data statistics of this dataset are shown in the table below:
25 | 
26 | | | #Examples | #Examples |
27 | | ------- | :-------: | :-------: |
28 | | | Small | Medium |
29 | | Train | 46,680 | 52,364 |
30 | | Valid | 5,835 | 6,545 |
31 | | Test | 5,835 | 6,545 |
32 | 
33 | ## Pipeline-GraphCodeBERT
34 | 
35 | ### Dependency
36 | 
37 | - pip install torch
38 | - pip install transformers
39 | - pip install tree_sitter
40 | 
41 | ### Tree-sitter (optional)
42 | 
43 | If the pre-built file "parser/my-languages.so" doesn't work for you, rebuild it with the following commands:
44 | 
45 | ```shell
46 | cd parser
47 | bash build.sh
48 | cd ..
49 | ```
50 | 
51 | ### Fine-tune
52 | We use 4 V100-16G GPUs to fine-tune. Taking the "small" subset as an example:
53 | 
54 | ```shell
55 | scale=small
56 | lr=1e-4
57 | batch_size=32
58 | beam_size=10
59 | source_length=320
60 | target_length=256
61 | output_dir=saved_models/$scale/
62 | train_file=data/$scale/train.buggy-fixed.buggy,data/$scale/train.buggy-fixed.fixed
63 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed
64 | epochs=50
65 | pretrained_model=microsoft/graphcodebert-base
66 | 
67 | mkdir -p $output_dir
68 | python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --tokenizer_name microsoft/graphcodebert-base --config_name microsoft/graphcodebert-base --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
69 | ```
70 | 
71 | ### Inference
72 | 
73 | We use the full test data for inference.
74 | 
75 | ```shell
76 | batch_size=64
77 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed
78 | test_file=data/$scale/test.buggy-fixed.buggy,data/$scale/test.buggy-fixed.fixed
79 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test
80 | 
81 | python run.py --do_test --model_type roberta --model_name_or_path $pretrained_model --tokenizer_name microsoft/graphcodebert-base --config_name microsoft/graphcodebert-base --load_model_path $load_model_path --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
82 | ```
83 | 
84 | 
85 | 
86 | ## Result
87 | 
88 | The results on the test set are shown below:
89 | 
90 | Small:
91 | 
92 | | Method | BLEU | Acc (100%) |
93 | | ------------- | :-------: | :--------: |
94 | | Naive copy | 78.06 | 0.0 |
95 | | LSTM | 76.76 | 10.0 |
96 | | Transformer | 77.21 | 14.7 |
97 | | CodeBERT | 77.42 | 16.4 |
98 | | GraphCodeBERT | **80.02** | **17.3** |
99 | 
100 | Medium:
101 | 
102 | | Method | BLEU | Acc (100%) |
103 | | ------------- | :-------: | :--------: |
104 | | Naive copy | 90.91 | 0.0 |
105 | | LSTM | 72.08 | 2.5 |
106 | | Transformer | 89.25 | 3.7 |
107 | | CodeBERT | 91.07 | 5.16 |
108 | | GraphCodeBERT | **91.31** | **9.1** |
109 | 
110 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Python implementation of BLEU and smooth-BLEU.
17 | 
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 | 
24 | import collections
25 | import math
26 | 
27 | 
28 | def _get_ngrams(segment, max_order):
29 | """Extracts all n-grams up to a given maximum order from an input segment.
30 | 
31 | Args:
32 | segment: text segment from which n-grams will be extracted.
33 | max_order: maximum length in tokens of the n-grams returned by this
34 | method.
35 | 
36 | Returns:
37 | The Counter containing all n-grams up to max_order in segment
38 | with a count of how many times each n-gram occurred.
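For example, _get_ngrams(['a', 'b', 'a'], max_order=2) returns
Counter({('a',): 2, ('b',): 1, ('a', 'b'): 1, ('b', 'a'): 1}).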
39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
/ ratio)
109 | 
110 | bleu = geo_mean * bp
111 | 
112 | return (bleu, precisions, bp, ratio, translation_length, reference_length)
113 | 
114 | 
115 | def _bleu(ref_file, trans_file, subword_option=None):
116 | max_order = 4
117 | smooth = True
118 | ref_files = [ref_file]
119 | reference_text = []
120 | for reference_filename in ref_files:
121 | with open(reference_filename) as fh:
122 | reference_text.append(fh.readlines())
123 | per_segment_references = []
124 | for references in zip(*reference_text):
125 | reference_list = []
126 | for reference in references:
127 | reference_list.append(reference.strip().split())
128 | per_segment_references.append(reference_list)
129 | translations = []
130 | with open(trans_file) as fh:
131 | for line in fh:
132 | translations.append(line.strip().split())
133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134 | return round(100 * bleu_score,2)
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/refinement/data.zip
--------------------------------------------------------------------------------
/GraphCodeBERT/refinement/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch
7 | from torch.autograd import Variable
8 | import copy
9 | class Seq2Seq(nn.Module):
10 | """
11 | Build a Sequence-to-Sequence model.
12 | 
13 | Parameters:
14 | 
15 | * `encoder`- encoder of seq2seq model. e.g. roberta
16 | * `decoder`- decoder of seq2seq model. e.g. transformer
17 | * `config`- configuration of encoder model.
18 | * `beam_size`- beam size for beam search.
19 | * `max_length`- max length of target for beam search.
20 | * `sos_id`- start-of-sequence symbol id in target for beam search.
21 | * `eos_id`- end-of-sequence symbol id in target for beam search.
22 | """
23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None):
24 | super(Seq2Seq, self).__init__()
25 | self.encoder = encoder
26 | self.decoder=decoder
27 | self.config=config
28 | self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))
29 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
31 | self.lsm = nn.LogSoftmax(dim=-1)
32 | self.tie_weights()
33 | 
34 | self.beam_size=beam_size
35 | self.max_length=max_length
36 | self.sos_id=sos_id
37 | self.eos_id=eos_id
38 | 
39 | def _tie_or_clone_weights(self, first_module, second_module):
40 | """ Tie or clone module weights depending on whether we are using TorchScript or not
41 | """
42 | if self.config.torchscript:
43 | first_module.weight = nn.Parameter(second_module.weight.clone())
44 | else:
45 | first_module.weight = second_module.weight
46 | 
47 | def tie_weights(self):
48 | """ Make sure we are sharing the input and output embeddings.
49 | Export to TorchScript can't handle parameter sharing so we are cloning them instead.
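As a result, lm_head's output projection reuses (or, under TorchScript, copies)
the encoder's word-embedding matrix.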
50 | """ 51 | self._tie_or_clone_weights(self.lm_head, 52 | self.encoder.embeddings.word_embeddings) 53 | 54 | def forward(self, source_ids,source_mask,position_idx,attn_mask,target_ids=None,target_mask=None,args=None): 55 | #embedding 56 | nodes_mask=position_idx.eq(0) 57 | token_mask=position_idx.ge(2) 58 | inputs_embeddings=self.encoder.embeddings.word_embeddings(source_ids) 59 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 60 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 61 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 62 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 63 | 64 | outputs = self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx) 65 | encoder_output = outputs[0].permute([1,0,2]).contiguous() 66 | #source_mask=token_mask.float() 67 | if target_ids is not None: 68 | attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]]) 69 | tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() 70 | out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool()) 71 | hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous() 72 | lm_logits = self.lm_head(hidden_states) 73 | # Shift so that tokens < n predict n 74 | active_loss = target_mask[..., 1:].ne(0).view(-1) == 1 75 | shift_logits = lm_logits[..., :-1, :].contiguous() 76 | shift_labels = target_ids[..., 1:].contiguous() 77 | # Flatten the tokens 78 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 79 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 80 | shift_labels.view(-1)[active_loss]) 81 | 82 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 83 | return outputs 84 | else: 85 | #Predict 86 | preds=[] 87 | zero=torch.cuda.LongTensor(1).fill_(0) 88 | for i in range(source_ids.shape[0]): 89 | context=encoder_output[:,i:i+1] 90 | context_mask=source_mask[i:i+1,:] 91 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 92 | input_ids=beam.getCurrentState() 93 | context=context.repeat(1, self.beam_size,1) 94 | context_mask=context_mask.repeat(self.beam_size,1) 95 | for j in range(self.max_length): 96 | if beam.done(): 97 | break 98 | attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]]) 99 | tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous() 100 | out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool()) 101 | out = torch.tanh(self.dense(out)) 102 | hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:] 103 | out = self.lsm(self.lm_head(hidden_states)).data 104 | beam.advance(out) 105 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 106 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 107 | hyp= beam.getHyp(beam.getFinal()) 108 | pred=beam.buildTargetTokens(hyp)[:self.beam_size] 109 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 110 | preds.append(torch.cat(pred,0).unsqueeze(0)) 111 | 112 | preds=torch.cat(preds,0) 113 | return preds 114 | 115 | 116 | 117 | class Beam(object): 118 | def __init__(self, size,sos,eos): 119 | self.size = size 120 | self.tt = torch.cuda 121 | # The score for each translation on the beam. 
122 | self.scores = self.tt.FloatTensor(size).zero_() 123 | # The backpointers at each time-step. 124 | self.prevKs = [] 125 | # The outputs at each time-step. 126 | self.nextYs = [self.tt.LongTensor(size) 127 | .fill_(0)] 128 | self.nextYs[0][0] = sos 129 | # Has EOS topped the beam yet. 130 | self._eos = eos 131 | self.eosTop = False 132 | # Time and k pair for finished. 133 | self.finished = [] 134 | 135 | def getCurrentState(self): 136 | "Get the outputs for the current timestep." 137 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 138 | return batch 139 | 140 | def getCurrentOrigin(self): 141 | "Get the backpointers for the current timestep." 142 | return self.prevKs[-1] 143 | 144 | def advance(self, wordLk): 145 | """ 146 | Given prob over words for every last beam `wordLk` and attention 147 | `attnOut`: Compute and update the beam search. 148 | 149 | Parameters: 150 | 151 | * `wordLk`- probs of advancing from the last step (K x words) 152 | * `attnOut`- attention at the last step 153 | 154 | Returns: True if beam search is complete. 155 | """ 156 | numWords = wordLk.size(1) 157 | 158 | # Sum the previous scores. 159 | if len(self.prevKs) > 0: 160 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 161 | 162 | # Don't let EOS have children. 163 | for i in range(self.nextYs[-1].size(0)): 164 | if self.nextYs[-1][i] == self._eos: 165 | beamLk[i] = -1e20 166 | else: 167 | beamLk = wordLk[0] 168 | flatBeamLk = beamLk.view(-1) 169 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 170 | 171 | self.scores = bestScores 172 | 173 | # bestScoresId is flattened beam x word array, so calculate which 174 | # word and beam each score came from 175 | prevK = bestScoresId // numWords 176 | self.prevKs.append(prevK) 177 | self.nextYs.append((bestScoresId - prevK * numWords)) 178 | 179 | 180 | for i in range(self.nextYs[-1].size(0)): 181 | if self.nextYs[-1][i] == self._eos: 182 | s = self.scores[i] 183 | self.finished.append((s, len(self.nextYs) - 1, i)) 184 | 185 | # End condition is when top-of-beam is EOS and no global score. 186 | if self.nextYs[-1][0] == self._eos: 187 | self.eosTop = True 188 | 189 | def done(self): 190 | return self.eosTop and len(self.finished) >=self.size 191 | 192 | def getFinal(self): 193 | if len(self.finished) == 0: 194 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 195 | self.finished.sort(key=lambda a: -a[0]) 196 | if len(self.finished) != self.size: 197 | unfinished=[] 198 | for i in range(self.nextYs[-1].size(0)): 199 | if self.nextYs[-1][i] != self._eos: 200 | s = self.scores[i] 201 | unfinished.append((s, len(self.nextYs) - 1, i)) 202 | unfinished.sort(key=lambda a: -a[0]) 203 | self.finished+=unfinished[:self.size-len(self.finished)] 204 | return self.finished[:self.size] 205 | 206 | def getHyp(self, beam_res): 207 | """ 208 | Walk back to construct the full hypothesis. 
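Starting from the recorded (timestep, k) of a finished candidate, follow the
prevKs backpointers towards the start and reverse the collected tokens.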
209 | """ 210 | hyps=[] 211 | for _,timestep, k in beam_res: 212 | hyp = [] 213 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 214 | hyp.append(self.nextYs[j+1][k]) 215 | k = self.prevKs[j][k] 216 | hyps.append(hyp[::-1]) 217 | return hyps 218 | 219 | def buildTargetTokens(self, preds): 220 | sentence=[] 221 | for pred in preds: 222 | tokens = [] 223 | for tok in pred: 224 | if tok==self._eos: 225 | break 226 | tokens.append(tok) 227 | sentence.append(tokens) 228 | return sentence 229 | 230 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/refinement/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /GraphCodeBERT/translation/README.md: -------------------------------------------------------------------------------- 1 | # Code Translation 2 | 3 | ## Task Definition 4 | 5 | Code translation aims to migrate legacy software from one programming language in a platform toanother. 6 | Given a piece of Java (C#) code, the task is to translate the code into C# (Java) version. 7 | Models are evaluated by BLEU scores and accuracy (exactly match). 
8 | 
9 | ## Dataset
10 | 
11 | The dataset is collected from several public repos, including Lucene (http://lucene.apache.org/), POI (http://poi.apache.org/), JGit (https://github.com/eclipse/jgit/) and Antlr (https://github.com/antlr/).
12 | 
13 | We collect both the Java and C# versions of the code and find the parallel functions. After removing duplicates and functions with an empty body, we split the whole dataset into training, validation and test sets.
14 | 
15 | ### Data Format
16 | 
17 | The dataset is in the "data" folder. Each line of the files is a function, and the suffix of the file indicates the programming language. You can get the data using the following command:
18 | 
19 | ```
20 | unzip data.zip
21 | ```
22 | 
23 | ### Data Statistics
24 | 
25 | Data statistics of the dataset are shown in the table below:
26 | 
27 | | | #Examples |
28 | | ------- | :-------: |
29 | | Train | 10,300 |
30 | | Valid | 500 |
31 | | Test | 1,000 |
32 | 
33 | ## Pipeline-GraphCodeBERT
34 | 
35 | ### Dependency
36 | 
37 | - pip install torch
38 | - pip install transformers
39 | - pip install tree_sitter
40 | 
41 | ### Tree-sitter (optional)
42 | 
43 | If the pre-built file "parser/my-languages.so" doesn't work for you, rebuild it with the following commands:
44 | 
45 | ```shell
46 | cd parser
47 | bash build.sh
48 | cd ..
49 | ```
50 | 
51 | ### Fine-tune
52 | We use 4 V100-16G GPUs to fine-tune. Taking Java-to-C# translation as an example:
53 | 
54 | ```shell
55 | source=java
56 | target=cs
57 | lr=1e-4
58 | batch_size=32
59 | beam_size=10
60 | source_length=320
61 | target_length=256
62 | output_dir=saved_models/$source-$target/
63 | train_file=data/train.java-cs.txt.$source,data/train.java-cs.txt.$target
64 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target
65 | epochs=100
66 | pretrained_model=microsoft/graphcodebert-base
67 | 
68 | mkdir -p $output_dir
69 | python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --tokenizer_name microsoft/graphcodebert-base --config_name microsoft/graphcodebert-base --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
70 | ```
71 | 
72 | ### Inference
73 | 
74 | We use the full test data for inference.
75 | 
76 | ```shell
77 | batch_size=64
78 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target
79 | test_file=data/test.java-cs.txt.$source,data/test.java-cs.txt.$target
80 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test
81 | 
82 | python run.py --do_test --model_type roberta --model_name_or_path $pretrained_model --tokenizer_name microsoft/graphcodebert-base --config_name microsoft/graphcodebert-base --load_model_path $load_model_path --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
83 | ```
84 | 
85 | 
86 | 
87 | ## Result
88 | 
89 | The results on the test set are shown below:
90 | 
91 | Java to C#:
92 | 
93 | | Method | BLEU | Acc (100%) |
94 | | ---------- | :--------: | :-------: |
95 | | Naive copy | 18.54 | 0.0 |
96 | | PBSMT | 43.53 | 12.5 |
97 | | Transformer | 55.84 | 33.0 |
98 | | RoBERTa (code) | 77.46 | 56.1 |
99 | | CodeBERT | 79.92 | 59.0 |
100 | | GraphCodeBERT | **80.58** | **59.4** |
101 | 
102 | C# to Java:
103 | 
104 | | Method | BLEU | Acc (100%) |
105 | | -------------- | :-------: | :--------: |
106 | | Naive copy | 18.69 | 0.0 |
107 | | PBSMT | 40.06 | 16.1 |
108 | | Transformer | 50.47 | 37.9 |
109 | | RoBERTa (code) | 71.99 | 57.9 |
110 | | CodeBERT | 72.14 | 58.0 |
111 | | GraphCodeBERT | **72.64** | **58.8** |
112 | 
113 | 
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Python implementation of BLEU and smooth-BLEU.
17 | 
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 | 
24 | import collections
25 | import math
26 | 
27 | 
28 | def _get_ngrams(segment, max_order):
29 | """Extracts all n-grams up to a given maximum order from an input segment.
30 | 
31 | Args:
32 | segment: text segment from which n-grams will be extracted.
33 | max_order: maximum length in tokens of the n-grams returned by this
34 | method.
35 | 
36 | Returns:
37 | The Counter containing all n-grams up to max_order in segment
38 | with a count of how many times each n-gram occurred.
39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
/ ratio)
109 | 
110 | bleu = geo_mean * bp
111 | 
112 | return (bleu, precisions, bp, ratio, translation_length, reference_length)
113 | 
114 | 
115 | def _bleu(ref_file, trans_file, subword_option=None):
116 | max_order = 4
117 | smooth = True
118 | ref_files = [ref_file]
119 | reference_text = []
120 | for reference_filename in ref_files:
121 | with open(reference_filename) as fh:
122 | reference_text.append(fh.readlines())
123 | per_segment_references = []
124 | for references in zip(*reference_text):
125 | reference_list = []
126 | for reference in references:
127 | reference_list.append(reference.strip().split())
128 | per_segment_references.append(reference_list)
129 | translations = []
130 | with open(trans_file) as fh:
131 | for line in fh:
132 | translations.append(line.strip().split())
133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134 | return round(100 * bleu_score,2)
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/translation/data.zip
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch
7 | from torch.autograd import Variable
8 | import copy
9 | class Seq2Seq(nn.Module):
10 | """
11 | Build a Sequence-to-Sequence model.
12 | 
13 | Parameters:
14 | 
15 | * `encoder`- encoder of seq2seq model. e.g. roberta
16 | * `decoder`- decoder of seq2seq model. e.g. transformer
17 | * `config`- configuration of encoder model.
18 | * `beam_size`- beam size for beam search.
19 | * `max_length`- max length of target for beam search.
20 | * `sos_id`- start-of-sequence symbol id in target for beam search.
21 | * `eos_id`- end-of-sequence symbol id in target for beam search.
22 | """
23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None):
24 | super(Seq2Seq, self).__init__()
25 | self.encoder = encoder
26 | self.decoder=decoder
27 | self.config=config
28 | self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))
29 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
31 | self.lsm = nn.LogSoftmax(dim=-1)
32 | self.tie_weights()
33 | 
34 | self.beam_size=beam_size
35 | self.max_length=max_length
36 | self.sos_id=sos_id
37 | self.eos_id=eos_id
38 | 
39 | def _tie_or_clone_weights(self, first_module, second_module):
40 | """ Tie or clone module weights depending on whether we are using TorchScript or not
41 | """
42 | if self.config.torchscript:
43 | first_module.weight = nn.Parameter(second_module.weight.clone())
44 | else:
45 | first_module.weight = second_module.weight
46 | 
47 | def tie_weights(self):
48 | """ Make sure we are sharing the input and output embeddings.
49 | Export to TorchScript can't handle parameter sharing so we are cloning them instead.
50 | """ 51 | self._tie_or_clone_weights(self.lm_head, 52 | self.encoder.embeddings.word_embeddings) 53 | 54 | def forward(self, source_ids,source_mask,position_idx,attn_mask,target_ids=None,target_mask=None,args=None): 55 | #embedding 56 | nodes_mask=position_idx.eq(0) 57 | token_mask=position_idx.ge(2) 58 | inputs_embeddings=self.encoder.embeddings.word_embeddings(source_ids) 59 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 60 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 61 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 62 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 63 | 64 | outputs = self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx) 65 | encoder_output = outputs[0].permute([1,0,2]).contiguous() 66 | #source_mask=token_mask.float() 67 | if target_ids is not None: 68 | attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]]) 69 | tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() 70 | out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool()) 71 | hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous() 72 | lm_logits = self.lm_head(hidden_states) 73 | # Shift so that tokens < n predict n 74 | active_loss = target_mask[..., 1:].ne(0).view(-1) == 1 75 | shift_logits = lm_logits[..., :-1, :].contiguous() 76 | shift_labels = target_ids[..., 1:].contiguous() 77 | # Flatten the tokens 78 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 79 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 80 | shift_labels.view(-1)[active_loss]) 81 | 82 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 83 | return outputs 84 | else: 85 | #Predict 86 | preds=[] 87 | zero=torch.cuda.LongTensor(1).fill_(0) 88 | for i in range(source_ids.shape[0]): 89 | context=encoder_output[:,i:i+1] 90 | context_mask=source_mask[i:i+1,:] 91 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 92 | input_ids=beam.getCurrentState() 93 | context=context.repeat(1, self.beam_size,1) 94 | context_mask=context_mask.repeat(self.beam_size,1) 95 | for _ in range(self.max_length): 96 | if beam.done(): 97 | break 98 | attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]]) 99 | tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous() 100 | out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool()) 101 | out = torch.tanh(self.dense(out)) 102 | hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:] 103 | out = self.lsm(self.lm_head(hidden_states)).data 104 | beam.advance(out) 105 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 106 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 107 | hyp= beam.getHyp(beam.getFinal()) 108 | pred=beam.buildTargetTokens(hyp)[:self.beam_size] 109 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 110 | preds.append(torch.cat(pred,0).unsqueeze(0)) 111 | 112 | preds=torch.cat(preds,0) 113 | return preds 114 | 115 | 116 | 117 | class Beam(object): 118 | def __init__(self, size,sos,eos): 119 | self.size = size 120 | self.tt = torch.cuda 121 | # The score for each translation on the beam. 
122 |         self.scores = self.tt.FloatTensor(size).zero_()
123 |         # The backpointers at each time-step.
124 |         self.prevKs = []
125 |         # The outputs at each time-step.
126 |         self.nextYs = [self.tt.LongTensor(size)
127 |                        .fill_(0)]
128 |         self.nextYs[0][0] = sos
129 |         # Has EOS topped the beam yet.
130 |         self._eos = eos
131 |         self.eosTop = False
132 |         # Time and k pair for finished.
133 |         self.finished = []
134 |
135 |     def getCurrentState(self):
136 |         "Get the outputs for the current timestep."
137 |         batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
138 |         return batch
139 |
140 |     def getCurrentOrigin(self):
141 |         "Get the backpointers for the current timestep."
142 |         return self.prevKs[-1]
143 |
144 |     def advance(self, wordLk):
145 |         """
146 |         Given the word log-probabilities for every hypothesis on the beam
147 |         (`wordLk`), compute and update the beam search state.
148 |
149 |         Parameters:
150 |
151 |         * `wordLk`- log-probs of advancing from the last step (K x words)
152 |
153 |         Advances the beam in place; use `done()` to check whether the
154 |         search is complete.
155 |         """
156 |         numWords = wordLk.size(1)
157 |
158 |         # Sum the previous scores.
159 |         if len(self.prevKs) > 0:
160 |             beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
161 |
162 |             # Don't let EOS have children.
163 |             for i in range(self.nextYs[-1].size(0)):
164 |                 if self.nextYs[-1][i] == self._eos:
165 |                     beamLk[i] = -1e20
166 |         else:
167 |             beamLk = wordLk[0]
168 |         flatBeamLk = beamLk.view(-1)
169 |         bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
170 |
171 |         self.scores = bestScores
172 |
173 |         # bestScoresId is a flattened beam x word array, so calculate which
174 |         # word and beam each score came from
175 |         prevK = bestScoresId // numWords
176 |         self.prevKs.append(prevK)
177 |         self.nextYs.append((bestScoresId - prevK * numWords))
178 |
179 |
180 |         for i in range(self.nextYs[-1].size(0)):
181 |             if self.nextYs[-1][i] == self._eos:
182 |                 s = self.scores[i]
183 |                 self.finished.append((s, len(self.nextYs) - 1, i))
184 |
185 |         # End condition is when top-of-beam is EOS and no global score.
186 |         if self.nextYs[-1][0] == self._eos:
187 |             self.eosTop = True
188 |
189 |     def done(self):
190 |         return self.eosTop and len(self.finished) >= self.size
191 |
192 |     def getFinal(self):
193 |         if len(self.finished) == 0:
194 |             self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
195 |         self.finished.sort(key=lambda a: -a[0])
196 |         if len(self.finished) != self.size:
197 |             unfinished = []
198 |             for i in range(self.nextYs[-1].size(0)):
199 |                 if self.nextYs[-1][i] != self._eos:
200 |                     s = self.scores[i]
201 |                     unfinished.append((s, len(self.nextYs) - 1, i))
202 |             unfinished.sort(key=lambda a: -a[0])
203 |             self.finished += unfinished[:self.size - len(self.finished)]
204 |         return self.finished[:self.size]
205 |
206 |     def getHyp(self, beam_res):
207 |         """
208 |         Walk back to construct the full hypothesis.
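        For each finished (score, timestep, k) triple, follow the backpointers in
        `prevKs` from that timestep down to step 0, collect the token emitted at
        each step, and return the sequences reversed into front-to-back order.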
209 |         """
210 |         hyps = []
211 |         for _, timestep, k in beam_res:
212 |             hyp = []
213 |             for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
214 |                 hyp.append(self.nextYs[j + 1][k])
215 |                 k = self.prevKs[j][k]
216 |             hyps.append(hyp[::-1])
217 |         return hyps
218 |
219 |     def buildTargetTokens(self, preds):
220 |         sentences = []
221 |         for pred in preds:
222 |             tokens = []
223 |             for tok in pred:
224 |                 if tok == self._eos:
225 |                     break
226 |                 tokens.append(tok)
227 |             sentences.append(tokens)
228 |         return sentences
229 |
230 |
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import (remove_comments_and_docstrings,
2 |                     tree_to_token_index,
3 |                     index_to_code_token,
4 |                     tree_to_variable_index)
5 | from .DFG import DFG_python, DFG_java, DFG_ruby, DFG_go, DFG_php, DFG_javascript, DFG_csharp
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/build.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | from tree_sitter import Language, Parser
5 |
6 | Language.build_library(
7 |     # Store the compiled library as `my-languages.so` in the current directory
8 |     'my-languages.so',
9 |
10 |     # Include one or more languages
11 |     [
12 |         'tree-sitter-go',
13 |         'tree-sitter-javascript',
14 |         'tree-sitter-python',
15 |         'tree-sitter-php',
16 |         'tree-sitter-java',
17 |         'tree-sitter-ruby',
18 |         'tree-sitter-c-sharp',
19 |     ]
20 | )
21 |
22 |
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/build.sh:
--------------------------------------------------------------------------------
1 | git clone https://github.com/tree-sitter/tree-sitter-go
2 | git clone https://github.com/tree-sitter/tree-sitter-javascript
3 | git clone https://github.com/tree-sitter/tree-sitter-python
4 | git clone https://github.com/tree-sitter/tree-sitter-ruby
5 | git clone https://github.com/tree-sitter/tree-sitter-php
6 | git clone https://github.com/tree-sitter/tree-sitter-java
7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8 | python build.py
9 |
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/my-languages.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sxjscience/CodeBERT/e20547d53e4e6b7d97c2394470d2f6ef922e88ad/GraphCodeBERT/translation/parser/my-languages.so
--------------------------------------------------------------------------------
/GraphCodeBERT/translation/parser/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from io import StringIO
3 | import tokenize
4 | def remove_comments_and_docstrings(source, lang):
5 |     if lang in ['python']:
6 |         """
7 |         Returns 'source' minus comments and docstrings.
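        Python source is re-tokenized with the standard `tokenize` module so that
        comment and docstring tokens can be dropped individually; other languages
        fall through to the regex pass below, and Ruby is returned unchanged.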
8 |         """
9 |         io_obj = StringIO(source)
10 |         out = ""
11 |         prev_toktype = tokenize.INDENT
12 |         last_lineno = -1
13 |         last_col = 0
14 |         for tok in tokenize.generate_tokens(io_obj.readline):
15 |             token_type = tok[0]
16 |             token_string = tok[1]
17 |             start_line, start_col = tok[2]
18 |             end_line, end_col = tok[3]
19 |             ltext = tok[4]
20 |             if start_line > last_lineno:
21 |                 last_col = 0
22 |             if start_col > last_col:
23 |                 out += (" " * (start_col - last_col))
24 |             # Remove comments:
25 |             if token_type == tokenize.COMMENT:
26 |                 pass
27 |             # This series of conditionals removes docstrings:
28 |             elif token_type == tokenize.STRING:
29 |                 if prev_toktype != tokenize.INDENT:
30 |                     # This is likely a docstring; double-check we're not inside an operator:
31 |                     if prev_toktype != tokenize.NEWLINE:
32 |                         if start_col > 0:
33 |                             out += token_string
34 |             else:
35 |                 out += token_string
36 |             prev_toktype = token_type
37 |             last_col = end_col
38 |             last_lineno = end_line
39 |         temp = []
40 |         for x in out.split('\n'):
41 |             if x.strip() != "":
42 |                 temp.append(x)
43 |         return '\n'.join(temp)
44 |     elif lang in ['ruby']:
45 |         return source
46 |     else:
47 |         def replacer(match):
48 |             s = match.group(0)
49 |             if s.startswith('/'):
50 |                 return " "  # note: a space and not an empty string
51 |             else:
52 |                 return s
53 |         pattern = re.compile(
54 |             r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
55 |             re.DOTALL | re.MULTILINE
56 |         )
57 |         temp = []
58 |         for x in re.sub(pattern, replacer, source).split('\n'):
59 |             if x.strip() != "":
60 |                 temp.append(x)
61 |         return '\n'.join(temp)
62 |
63 | def tree_to_token_index(root_node):
64 |     if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment':
65 |         return [(root_node.start_point, root_node.end_point)]
66 |     else:
67 |         code_tokens = []
68 |         for child in root_node.children:
69 |             code_tokens += tree_to_token_index(child)
70 |         return code_tokens
71 |
72 | def tree_to_variable_index(root_node, index_to_code):
73 |     if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment':
74 |         index = (root_node.start_point, root_node.end_point)
75 |         _, code = index_to_code[index]
76 |         if root_node.type != code:
77 |             return [(root_node.start_point, root_node.end_point)]
78 |         else:
79 |             return []
80 |     else:
81 |         code_tokens = []
82 |         for child in root_node.children:
83 |             code_tokens += tree_to_variable_index(child, index_to_code)
84 |         return code_tokens
85 |
86 | def index_to_code_token(index, code):
87 |     start_point = index[0]
88 |     end_point = index[1]
89 |     if start_point[0] == end_point[0]:
90 |         s = code[start_point[0]][start_point[1]:end_point[1]]
91 |     else:
92 |         s = ""
93 |         s += code[start_point[0]][start_point[1]:]
94 |         for i in range(start_point[0] + 1, end_point[0]):
95 |             s += code[i]
96 |         s += code[end_point[0]][:end_point[1]]
97 |     return s
98 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) Microsoft Corporation.
2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NOTICE.md: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | 3 | Do Not Translate or Localize 4 | 5 | This software incorporates material from third parties. Microsoft makes certain open source code available at http://3rdpartysource.microsoft.com, or you may send a check or money order for US $5.00, including the product name, the open source component name, and version number, to: 6 | 7 | Source Code Compliance Team Microsoft Corporation One Microsoft Way Redmond, WA 98052 USA 8 | 9 | Notwithstanding any other terms, you may reverse engineer this software to the extent required to debug changes to any libraries licensed under the GNU Lesser General Public License. 10 | 11 | =============================================================================== 12 | 13 | Component. 14 | 15 | huggingface/transformers 16 | 17 | Open Source License/Copyright Notice. 18 | 19 | ``` 20 | Apache License 21 | Version 2.0, January 2004 22 | http://www.apache.org/licenses/ 23 | 24 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 25 | 26 | 1. Definitions. 27 | 28 | "License" shall mean the terms and conditions for use, reproduction, 29 | and distribution as defined by Sections 1 through 9 of this document. 30 | 31 | "Licensor" shall mean the copyright owner or entity authorized by 32 | the copyright owner that is granting the License. 33 | 34 | "Legal Entity" shall mean the union of the acting entity and all 35 | other entities that control, are controlled by, or are under common 36 | control with that entity. For the purposes of this definition, 37 | "control" means (i) the power, direct or indirect, to cause the 38 | direction or management of such entity, whether by contract or 39 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 40 | outstanding shares, or (iii) beneficial ownership of such entity. 41 | 42 | "You" (or "Your") shall mean an individual or Legal Entity 43 | exercising permissions granted by this License. 44 | 45 | "Source" form shall mean the preferred form for making modifications, 46 | including but not limited to software source code, documentation 47 | source, and configuration files. 
48 | 49 | "Object" form shall mean any form resulting from mechanical 50 | transformation or translation of a Source form, including but 51 | not limited to compiled object code, generated documentation, 52 | and conversions to other media types. 53 | 54 | "Work" shall mean the work of authorship, whether in Source or 55 | Object form, made available under the License, as indicated by a 56 | copyright notice that is included in or attached to the work 57 | (an example is provided in the Appendix below). 58 | 59 | "Derivative Works" shall mean any work, whether in Source or Object 60 | form, that is based on (or derived from) the Work and for which the 61 | editorial revisions, annotations, elaborations, or other modifications 62 | represent, as a whole, an original work of authorship. For the purposes 63 | of this License, Derivative Works shall not include works that remain 64 | separable from, or merely link (or bind by name) to the interfaces of, 65 | the Work and Derivative Works thereof. 66 | 67 | "Contribution" shall mean any work of authorship, including 68 | the original version of the Work and any modifications or additions 69 | to that Work or Derivative Works thereof, that is intentionally 70 | submitted to Licensor for inclusion in the Work by the copyright owner 71 | or by an individual or Legal Entity authorized to submit on behalf of 72 | the copyright owner. For the purposes of this definition, "submitted" 73 | means any form of electronic, verbal, or written communication sent 74 | to the Licensor or its representatives, including but not limited to 75 | communication on electronic mailing lists, source code control systems, 76 | and issue tracking systems that are managed by, or on behalf of, the 77 | Licensor for the purpose of discussing and improving the Work, but 78 | excluding communication that is conspicuously marked or otherwise 79 | designated in writing by the copyright owner as "Not a Contribution." 80 | 81 | "Contributor" shall mean Licensor and any individual or Legal Entity 82 | on behalf of whom a Contribution has been received by Licensor and 83 | subsequently incorporated within the Work. 84 | 85 | 2. Grant of Copyright License. Subject to the terms and conditions of 86 | this License, each Contributor hereby grants to You a perpetual, 87 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 88 | copyright license to reproduce, prepare Derivative Works of, 89 | publicly display, publicly perform, sublicense, and distribute the 90 | Work and such Derivative Works in Source or Object form. 91 | 92 | 3. Grant of Patent License. Subject to the terms and conditions of 93 | this License, each Contributor hereby grants to You a perpetual, 94 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 95 | (except as stated in this section) patent license to make, have made, 96 | use, offer to sell, sell, import, and otherwise transfer the Work, 97 | where such license applies only to those patent claims licensable 98 | by such Contributor that are necessarily infringed by their 99 | Contribution(s) alone or by combination of their Contribution(s) 100 | with the Work to which such Contribution(s) was submitted. 
If You 101 | institute patent litigation against any entity (including a 102 | cross-claim or counterclaim in a lawsuit) alleging that the Work 103 | or a Contribution incorporated within the Work constitutes direct 104 | or contributory patent infringement, then any patent licenses 105 | granted to You under this License for that Work shall terminate 106 | as of the date such litigation is filed. 107 | 108 | 4. Redistribution. You may reproduce and distribute copies of the 109 | Work or Derivative Works thereof in any medium, with or without 110 | modifications, and in Source or Object form, provided that You 111 | meet the following conditions: 112 | 113 | (a) You must give any other recipients of the Work or 114 | Derivative Works a copy of this License; and 115 | 116 | (b) You must cause any modified files to carry prominent notices 117 | stating that You changed the files; and 118 | 119 | (c) You must retain, in the Source form of any Derivative Works 120 | that You distribute, all copyright, patent, trademark, and 121 | attribution notices from the Source form of the Work, 122 | excluding those notices that do not pertain to any part of 123 | the Derivative Works; and 124 | 125 | (d) If the Work includes a "NOTICE" text file as part of its 126 | distribution, then any Derivative Works that You distribute must 127 | include a readable copy of the attribution notices contained 128 | within such NOTICE file, excluding those notices that do not 129 | pertain to any part of the Derivative Works, in at least one 130 | of the following places: within a NOTICE text file distributed 131 | as part of the Derivative Works; within the Source form or 132 | documentation, if provided along with the Derivative Works; or, 133 | within a display generated by the Derivative Works, if and 134 | wherever such third-party notices normally appear. The contents 135 | of the NOTICE file are for informational purposes only and 136 | do not modify the License. You may add Your own attribution 137 | notices within Derivative Works that You distribute, alongside 138 | or as an addendum to the NOTICE text from the Work, provided 139 | that such additional attribution notices cannot be construed 140 | as modifying the License. 141 | 142 | You may add Your own copyright statement to Your modifications and 143 | may provide additional or different license terms and conditions 144 | for use, reproduction, or distribution of Your modifications, or 145 | for any such Derivative Works as a whole, provided Your use, 146 | reproduction, and distribution of the Work otherwise complies with 147 | the conditions stated in this License. 148 | 149 | 5. Submission of Contributions. Unless You explicitly state otherwise, 150 | any Contribution intentionally submitted for inclusion in the Work 151 | by You to the Licensor shall be under the terms and conditions of 152 | this License, without any additional terms or conditions. 153 | Notwithstanding the above, nothing herein shall supersede or modify 154 | the terms of any separate license agreement you may have executed 155 | with Licensor regarding such Contributions. 156 | 157 | 6. Trademarks. This License does not grant permission to use the trade 158 | names, trademarks, service marks, or product names of the Licensor, 159 | except as required for reasonable and customary use in describing the 160 | origin of the Work and reproducing the content of the NOTICE file. 161 | 162 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 163 | agreed to in writing, Licensor provides the Work (and each 164 | Contributor provides its Contributions) on an "AS IS" BASIS, 165 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 166 | implied, including, without limitation, any warranties or conditions 167 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 168 | PARTICULAR PURPOSE. You are solely responsible for determining the 169 | appropriateness of using or redistributing the Work and assume any 170 | risks associated with Your exercise of permissions under this License. 171 | 172 | 8. Limitation of Liability. In no event and under no legal theory, 173 | whether in tort (including negligence), contract, or otherwise, 174 | unless required by applicable law (such as deliberate and grossly 175 | negligent acts) or agreed to in writing, shall any Contributor be 176 | liable to You for damages, including any direct, indirect, special, 177 | incidental, or consequential damages of any character arising as a 178 | result of this License or out of the use or inability to use the 179 | Work (including but not limited to damages for loss of goodwill, 180 | work stoppage, computer failure or malfunction, or any and all 181 | other commercial damages or losses), even if such Contributor 182 | has been advised of the possibility of such damages. 183 | 184 | 9. Accepting Warranty or Additional Liability. While redistributing 185 | the Work or Derivative Works thereof, You may choose to offer, 186 | and charge a fee for, acceptance of support, warranty, indemnity, 187 | or other liability obligations and/or rights consistent with this 188 | License. However, in accepting such obligations, You may act only 189 | on Your own behalf and on Your sole responsibility, not on behalf 190 | of any other Contributor, and only if You agree to indemnify, 191 | defend, and hold each Contributor harmless for any liability 192 | incurred by, or claims asserted against, such Contributor by reason 193 | of your accepting any such warranty or additional liability. 194 | 195 | END OF TERMS AND CONDITIONS 196 | 197 | APPENDIX: How to apply the Apache License to your work. 198 | 199 | To apply the Apache License to your work, attach the following 200 | boilerplate notice, with the fields enclosed by brackets "[]" 201 | replaced with your own identifying information. (Don't include 202 | the brackets!) The text should be enclosed in the appropriate 203 | comment syntax for the file format. We also recommend that a 204 | file or class name and description of purpose be included on the 205 | same "printed page" as the copyright notice for easier 206 | identification within third-party archives. 207 | 208 | Copyright [yyyy] [name of copyright owner] 209 | 210 | Licensed under the Apache License, Version 2.0 (the "License"); 211 | you may not use this file except in compliance with the License. 212 | You may obtain a copy of the License at 213 | 214 | http://www.apache.org/licenses/LICENSE-2.0 215 | 216 | Unless required by applicable law or agreed to in writing, software 217 | distributed under the License is distributed on an "AS IS" BASIS, 218 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 219 | See the License for the specific language governing permissions and 220 | limitations under the License. 
221 | ```
222 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CodeBERT
2 | This repo provides the code for reproducing the experiments in [CodeBERT: A Pre-Trained Model for Programming and Natural Languages](https://arxiv.org/pdf/2002.08155.pdf). CodeBERT is a pre-trained model for programming and natural languages, a multi-programming-lingual model pre-trained on NL-PL pairs in 6 programming languages (Python, Java, JavaScript, PHP, Ruby, Go).
3 |
4 | ### Dependency
5 |
6 | - pip install torch
7 | - pip install transformers
8 |
9 | ### Quick Tour
10 | We use the huggingface/transformers framework to train the model. You can use our model just like the pre-trained RoBERTa base model. We now give an example of how to load the model.
11 | ```python
12 | import torch
13 | from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
14 |
15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16 | tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
17 | model = RobertaModel.from_pretrained("microsoft/codebert-base")
18 | model.to(device)
19 | ```
20 |
21 | ### NL-PL Embeddings
22 |
23 | Here, we give an example of obtaining embeddings from CodeBERT.
24 |
25 | ```python
26 | >>> from transformers import AutoTokenizer, AutoModel
27 | >>> import torch
28 | >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
29 | >>> model = AutoModel.from_pretrained("microsoft/codebert-base")
30 | >>> nl_tokens=tokenizer.tokenize("return maximum value")
31 | ['return', 'Ġmaximum', 'Ġvalue']
32 | >>> code_tokens=tokenizer.tokenize("def max(a,b): if a>b: return a else return b")
33 | ['def', 'Ġmax', '(', 'a', ',', 'b', '):', 'Ġif', 'Ġa', '>', 'b', ':', 'Ġreturn', 'Ġa', 'Ġelse', 'Ġreturn', 'Ġb']
34 | >>> tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
35 | ['<s>', 'return', 'Ġmaximum', 'Ġvalue', '</s>', 'def', 'Ġmax', '(', 'a', ',', 'b', '):', 'Ġif', 'Ġa', '>', 'b', ':', 'Ġreturn', 'Ġa', 'Ġelse', 'Ġreturn', 'Ġb', '</s>']
36 | >>> tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
37 | [0, 30921, 4532, 923, 2, 9232, 19220, 1640, 102, 6, 428, 3256, 114, 10, 15698, 428, 35, 671, 10, 1493, 671, 741, 2]
38 | >>> context_embeddings=model(torch.tensor(tokens_ids)[None,:])[0]
39 | torch.Size([1, 23, 768])
40 | tensor([[-0.1423, 0.3766, 0.0443, ..., -0.2513, -0.3099, 0.3183],
41 |         [-0.5739, 0.1333, 0.2314, ..., -0.1240, -0.1219, 0.2033],
42 |         [-0.1579, 0.1335, 0.0291, ..., 0.2340, -0.8801, 0.6216],
43 |         ...,
44 |         [-0.4042, 0.2284, 0.5241, ..., -0.2046, -0.2419, 0.7031],
45 |         [-0.3894, 0.4603, 0.4797, ..., -0.3335, -0.6049, 0.4730],
46 |         [-0.1433, 0.3785, 0.0450, ..., -0.2527, -0.3121, 0.3207]],
47 |        grad_fn=)
48 | ```
49 |
50 |
51 | ### Probing
52 |
53 | As stated in the paper, CodeBERT is not suitable for the mask prediction task, while CodeBERT (MLM) is.
54 |
55 |
56 | We give an example of how to use CodeBERT (MLM) for the mask prediction task.
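Note that the input string must contain the tokenizer's `<mask>` token; the fill-mask pipeline predicts candidates for exactly that position.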
57 | ```python
58 | from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
59 |
60 | model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
61 | tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
62 |
63 | CODE = "if (x is not None) <mask> (x>1)"
64 | fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
65 |
66 | outputs = fill_mask(CODE)
67 | print(outputs)
68 |
69 | ```
70 | Results
71 | ```python
72 | 'and', 'or', 'if', 'then', 'AND'
73 | ```
74 | The detailed outputs are as follows:
75 | ```python
76 | {'sequence': '<s> if (x is not None) and (x>1)</s>', 'score': 0.6049249172210693, 'token': 8}
77 | {'sequence': '<s> if (x is not None) or (x>1)</s>', 'score': 0.30680200457572937, 'token': 50}
78 | {'sequence': '<s> if (x is not None) if (x>1)</s>', 'score': 0.02133703976869583, 'token': 114}
79 | {'sequence': '<s> if (x is not None) then (x>1)</s>', 'score': 0.018607674166560173, 'token': 172}
80 | {'sequence': '<s> if (x is not None) AND (x>1)</s>', 'score': 0.007619690150022507, 'token': 4248}
81 | ```
82 |
83 | ### Downstream Tasks
84 |
85 | For the Code Search and Code Documentation Generation tasks, please refer to the [CodeBERT](https://github.com/guoday/CodeBERT/tree/master/CodeBERT) folder.
86 |
87 |
88 |
89 | # GraphCodeBERT
90 |
91 | This repo also provides the code for reproducing the experiments in [GraphCodeBERT: Pre-training Code Representations with Data Flow](https://openreview.net/pdf?id=jLoC4ez43PZ). GraphCodeBERT is a pre-trained model for programming languages that considers the inherent structure of code, i.e. data flow. It is a multi-programming-lingual model pre-trained on NL-PL pairs in 6 programming languages (Python, Java, JavaScript, PHP, Ruby, Go).
92 |
93 | For downstream tasks like code search, clone detection, code refinement, and code translation, please refer to the [GraphCodeBERT](https://github.com/guoday/CodeBERT/tree/master/GraphCodeBERT) folder.
94 |
95 | ## Contact
96 |
97 | Feel free to contact Daya Guo (guody5@mail2.sysu.edu.cn) and Duyu Tang (dutang@microsoft.com) if you have any further questions.
98 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 |
41 |
--------------------------------------------------------------------------------