├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CodeBERT ├── code2nl │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── codesearch │ ├── README.md │ ├── mrr.py │ ├── process_data.py │ ├── run_classifier.py │ └── utils.py ├── GraphCodeBERT ├── clonedetection │ ├── README.md │ ├── dataset.zip │ ├── evaluator │ │ ├── answers.txt │ │ ├── evaluator.py │ │ └── predictions.txt │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── codesearch │ ├── README.md │ ├── dataset.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py ├── refinement │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ │ ├── DFG.py │ │ ├── __init__.py │ │ ├── build.py │ │ ├── build.sh │ │ ├── my-languages.so │ │ └── utils.py │ └── run.py └── translation │ ├── README.md │ ├── bleu.py │ ├── data.zip │ ├── model.py │ ├── parser │ ├── DFG.py │ ├── __init__.py │ ├── build.py │ ├── build.sh │ ├── my-languages.so │ └── utils.py │ └── run.py ├── LICENSE ├── NOTICE.md ├── README.md ├── SECURITY.md └── UniXcoder ├── README.md ├── downstream-tasks ├── clone-detection │ ├── BCB │ │ ├── README.md │ │ ├── model.py │ │ ├── run.py │ │ └── run.sh │ └── POJ-104 │ │ ├── README.md │ │ ├── dataset │ │ └── preprocess.py │ │ ├── model.py │ │ └── run.py ├── code-completion │ ├── README.md │ ├── dataset.zip │ ├── model.py │ └── run.py ├── code-generation │ ├── README.md │ ├── bleu.py │ ├── model.py │ ├── run.py │ └── run.sh ├── code-search │ ├── README.md │ ├── model.py │ └── run.py ├── code-summarization │ ├── README.md │ ├── bleu.py │ ├── model.py │ └── run.py └── zero-shot-search │ ├── README.md │ ├── dataset │ ├── java.jsonl │ ├── python.jsonl │ └── ruby.jsonl │ ├── model.py │ └── run.py └── unixcoder.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /CodeBERT/code2nl/README.md: -------------------------------------------------------------------------------- 1 | # Code Documentation Generation 2 | 3 | This repo provides the code for reproducing the experiments on the [CodeSearchNet](https://arxiv.org/abs/1909.09436) dataset for the code documentation generation task in six programming languages. 4 | 5 | **!News: We have released a new pipeline for this task. The new pipeline needs only 2 P100 GPUs and less training time for Code Documentation Generation. Please refer to the [website](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text).** 6 | 7 | ## Dependency 8 | 9 | - pip install torch==1.4.0 10 | - pip install transformers==2.5.0 11 | - pip install filelock 12 | 13 | ## Data Preprocess 14 | 15 | We clean the CodeSearchNet dataset for this task with the following steps: 16 | 17 | - Remove comments in the code 18 | - Remove examples whose code cannot be parsed into an abstract syntax tree. 19 | - Remove examples whose documents are shorter than 3 tokens or longer than 256 tokens. 20 | - Remove examples whose documents contain special tokens (e.g. `<img ...>` or `https:...`) 21 | - Remove examples whose documents are not in English. 22 | 23 | Data statistics of the cleaned dataset for code documentation generation are shown in the table below. We release the cleaned dataset on this [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h). 24 | 25 | | PL | Training | Dev | Test | 26 | | :--------- | :------: | :----: | :----: | 27 | | Python | 251,820 | 13,914 | 14,918 | 28 | | PHP | 241,241 | 12,982 | 14,014 | 29 | | Go | 167,288 | 7,325 | 8,122 | 30 | | Java | 164,923 | 5,183 | 10,955 | 31 | | JavaScript | 58,025 | 3,885 | 3,291 | 32 | | Ruby | 24,927 | 1,400 | 1,261 | 33 | 34 | 35 | 36 | ## Data Download 37 | 38 | You can download the dataset from the [website](https://drive.google.com/open?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h), or use the following command. 39 | 40 | ```shell 41 | pip install gdown 42 | mkdir data data/code2nl 43 | cd data/code2nl 44 | gdown https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h 45 | unzip Cleaned_CodeSearchNet.zip 46 | rm Cleaned_CodeSearchNet.zip 47 | cd ../.. 48 | ``` 49 | 50 | 51 | 52 | ## Fine-Tune 53 | 54 | We fine-tuned the model on 4*P40 GPUs.
55 | 56 | ```shell 57 | cd code2nl 58 | 59 | lang=php #programming language 60 | lr=5e-5 61 | batch_size=64 62 | beam_size=10 63 | source_length=256 64 | target_length=128 65 | data_dir=../data/code2nl/CodeSearchNet 66 | output_dir=model/$lang 67 | train_file=$data_dir/$lang/train.jsonl 68 | dev_file=$data_dir/$lang/valid.jsonl 69 | eval_steps=1000 #400 for ruby, 600 for javascript, 1000 for others 70 | train_steps=50000 #20000 for ruby, 30000 for javascript, 50000 for others 71 | pretrained_model=microsoft/codebert-base #Roberta: roberta-base 72 | 73 | python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --train_steps $train_steps --eval_steps $eval_steps 74 | ``` 75 | 76 | 77 | 78 | ## Inference and Evaluation 79 | 80 | After fine-tuning, inference and evaluation are as follows: 81 | 82 | ```shell 83 | lang=php #programming language 84 | beam_size=10 85 | batch_size=128 86 | source_length=256 87 | target_length=128 88 | output_dir=model/$lang 89 | data_dir=../data/code2nl/CodeSearchNet 90 | dev_file=$data_dir/$lang/valid.jsonl 91 | test_file=$data_dir/$lang/test.jsonl 92 | test_model=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test 93 | 94 | python run.py --do_test --model_type roberta --model_name_or_path microsoft/codebert-base --load_model_path $test_model --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size 95 | ``` 96 | 97 | The results on CodeSearchNet are shown in this Table: 98 | 99 | | Model | Ruby | Javascript | Go | Python | Java | PHP | Overall | 100 | | ----------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: | 101 | | Seq2Seq | 9.64 | 10.21 | 13.98 | 15.93 | 15.09 | 21.08 | 14.32 | 102 | | Transformer | 11.18 | 11.59 | 16.38 | 15.81 | 16.26 | 22.12 | 15.56 | 103 | | RoBERTa | 11.17 | 11.90 | 17.72 | 18.14 | 16.47 | 24.02 | 16.57 | 104 | | CodeBERT | **12.16** | **14.90** | **18.07** | **19.06** | **17.65** | **25.16** | **17.83** | 105 | 106 | 107 | -------------------------------------------------------------------------------- /CodeBERT/code2nl/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ''' 4 | This script was adapted from the original version by hieuhoang1972 which is part of MOSES. 5 | ''' 6 | 7 | # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $ 8 | 9 | '''Provides: 10 | 11 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 12 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 13 | score_cooked(alltest, n=4): Score a list of cooked test sentences. 14 | 15 | score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids. 16 | 17 | The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible. 
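A minimal usage sketch for a single segment (hypothetical strings; the first
element returned by score_cooked() is the overall BLEU in [0, 1]):

    cooked_refs = cook_refs(["the cat sat on the mat"])
    cooked_hyp = cook_test("a cat sat on the mat", cooked_refs)
    print(score_cooked([cooked_hyp])[0])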
18 | ''' 19 | 20 | import sys, math, re, xml.sax.saxutils 21 | import subprocess 22 | import os 23 | 24 | # Added to bypass NIST-style pre-processing of hyp and ref files -- wade 25 | nonorm = 0 26 | 27 | preserve_case = False 28 | eff_ref_len = "shortest" 29 | 30 | normalize1 = [ 31 |     ('<skipped>', ''),         # strip "skipped" tags 32 |     (r'-\n', ''),              # strip end-of-line hyphenation and join lines 33 |     (r'\n', ' '),              # join lines 34 | #    (r'(\d)\s+(?=\d)', r'\1'), # join digits 35 | ] 36 | normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1] 37 | 38 | normalize2 = [ 39 |     (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing 40 |     (r'([^0-9])([\.,])',r'\1 \2 '),              # tokenize period and comma unless preceded by a digit 41 |     (r'([\.,])([^0-9])',r' \1 \2'),              # tokenize period and comma unless followed by a digit 42 |     (r'([0-9])(-)',r'\1 \2 ')                    # tokenize dash when preceded by a digit 43 | ] 44 | normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2] 45 | 46 | def normalize(s): 47 |     '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.''' 48 |     # Added to bypass NIST-style pre-processing of hyp and ref files -- wade 49 |     if (nonorm): 50 |         return s.split() 51 |     if type(s) is not str: 52 |         s = " ".join(s) 53 |     # language-independent part: 54 |     for (pattern, replace) in normalize1: 55 |         s = re.sub(pattern, replace, s) 56 |     s = xml.sax.saxutils.unescape(s, {'&quot;':'"'}) 57 |     # language-dependent part (assuming Western languages): 58 |     s = " %s " % s 59 |     if not preserve_case: 60 |         s = s.lower()         # this might not be identical to the original 61 |     for (pattern, replace) in normalize2: 62 |         s = re.sub(pattern, replace, s) 63 |     return s.split() 64 | 65 | def count_ngrams(words, n=4): 66 |     counts = {} 67 |     for k in range(1,n+1): 68 |         for i in range(len(words)-k+1): 69 |             ngram = tuple(words[i:i+k]) 70 |             counts[ngram] = counts.get(ngram, 0)+1 71 |     return counts 72 | 73 | def cook_refs(refs, n=4): 74 |     '''Takes a list of reference sentences for a single segment 75 |     and returns an object that encapsulates everything that BLEU 76 |     needs to know about them.''' 77 |      78 |     refs = [normalize(ref) for ref in refs] 79 |     maxcounts = {} 80 |     for ref in refs: 81 |         counts = count_ngrams(ref, n) 82 |         for (ngram,count) in counts.items(): 83 |             maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 84 |     return ([len(ref) for ref in refs], maxcounts) 85 | 86 | def cook_test(test, item, n=4): 87 |     '''Takes a test sentence and returns an object that 88 |     encapsulates everything that BLEU needs to know about it.''' 89 |     (reflens, refmaxcounts)=item 90 |     test = normalize(test) 91 |     result = {} 92 |     result["testlen"] = len(test) 93 | 94 |     # Calculate effective reference sentence length.
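    # ("shortest" scores against the shortest reference, "average" against the
    # mean reference length, and "closest" against the reference whose length
    # is nearest to the candidate; this module sets eff_ref_len = "shortest" above.)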
95 | 96 | if eff_ref_len == "shortest": 97 | result["reflen"] = min(reflens) 98 | elif eff_ref_len == "average": 99 | result["reflen"] = float(sum(reflens))/len(reflens) 100 | elif eff_ref_len == "closest": 101 | min_diff = None 102 | for reflen in reflens: 103 | if min_diff is None or abs(reflen-len(test)) < min_diff: 104 | min_diff = abs(reflen-len(test)) 105 | result['reflen'] = reflen 106 | 107 | result["guess"] = [max(len(test)-k+1,0) for k in range(1,n+1)] 108 | 109 | result['correct'] = [0]*n 110 | counts = count_ngrams(test, n) 111 | for (ngram, count) in counts.items(): 112 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 113 | 114 | return result 115 | 116 | def score_cooked(allcomps, n=4, ground=0, smooth=1): 117 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 118 | for comps in allcomps: 119 | for key in ['testlen','reflen']: 120 | totalcomps[key] += comps[key] 121 | for key in ['guess','correct']: 122 | for k in range(n): 123 | totalcomps[key][k] += comps[key][k] 124 | logbleu = 0.0 125 | all_bleus = [] 126 | for k in range(n): 127 | correct = totalcomps['correct'][k] 128 | guess = totalcomps['guess'][k] 129 | addsmooth = 0 130 | if smooth == 1 and k > 0: 131 | addsmooth = 1 132 | logbleu += math.log(correct + addsmooth + sys.float_info.min)-math.log(guess + addsmooth+ sys.float_info.min) 133 | if guess == 0: 134 | all_bleus.append(-10000000) 135 | else: 136 | all_bleus.append(math.log(correct + sys.float_info.min)-math.log( guess )) 137 | 138 | logbleu /= float(n) 139 | all_bleus.insert(0, logbleu) 140 | 141 | brevPenalty = min(0,1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1)) 142 | for i in range(len(all_bleus)): 143 | if i ==0: 144 | all_bleus[i] += brevPenalty 145 | all_bleus[i] = math.exp(all_bleus[i]) 146 | return all_bleus 147 | 148 | def bleu(refs, candidate, ground=0, smooth=1): 149 | refs = cook_refs(refs) 150 | test = cook_test(candidate, refs) 151 | return score_cooked([test], ground=ground, smooth=smooth) 152 | 153 | def splitPuncts(line): 154 | return ' '.join(re.findall(r"[\w]+|[^\s\w]", line)) 155 | 156 | def computeMaps(predictions, goldfile): 157 | predictionMap = {} 158 | goldMap = {} 159 | gf = open(goldfile, 'r') 160 | 161 | for row in predictions: 162 | cols = row.strip().split('\t') 163 | if len(cols) == 1: 164 | (rid, pred) = (cols[0], '') 165 | else: 166 | (rid, pred) = (cols[0], cols[1]) 167 | predictionMap[rid] = [splitPuncts(pred.strip().lower())] 168 | 169 | for row in gf: 170 | (rid, pred) = row.split('\t') 171 | if rid in predictionMap: # Only insert if the id exists for the method 172 | if rid not in goldMap: 173 | goldMap[rid] = [] 174 | goldMap[rid].append(splitPuncts(pred.strip().lower())) 175 | 176 | sys.stderr.write('Total: ' + str(len(goldMap)) + '\n') 177 | return (goldMap, predictionMap) 178 | 179 | 180 | #m1 is the reference map 181 | #m2 is the prediction map 182 | def bleuFromMaps(m1, m2): 183 | score = [0] * 5 184 | num = 0.0 185 | 186 | for key in m1: 187 | if key in m2: 188 | bl = bleu(m1[key], m2[key][0]) 189 | score = [ score[i] + bl[i] for i in range(0, len(bl))] 190 | num += 1 191 | return [s * 100.0 / num for s in score] 192 | 193 | if __name__ == '__main__': 194 | reference_file = sys.argv[1] 195 | predictions = [] 196 | for row in sys.stdin: 197 | predictions.append(row) 198 | (goldMap, predictionMap) = computeMaps(predictions, reference_file) 199 | print (bleuFromMaps(goldMap, predictionMap)[0]) 200 | 201 | 
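# Example invocation (illustrative file names): the gold file is passed as
# argv[1] and predictions are streamed on stdin, both as tab-separated
# "<id>\t<sentence>" lines:
#   python bleu.py gold.txt < predictions.txt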
-------------------------------------------------------------------------------- /CodeBERT/code2nl/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | class Seq2Seq(nn.Module): 10 |     """ 11 |         Build Sequence-to-Sequence. 12 |          13 |         Parameters: 14 | 15 |         * `encoder`- encoder of seq2seq model. e.g. roberta 16 |         * `decoder`- decoder of seq2seq model. e.g. transformer 17 |         * `config`- configuration of encoder model. 18 |         * `beam_size`- beam size for beam search. 19 |         * `max_length`- max length of target for beam search. 20 |         * `sos_id`- start-of-sequence symbol id in target for beam search. 21 |         * `eos_id`- end-of-sequence symbol id in target for beam search. 22 |     """ 23 |     def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None): 24 |         super(Seq2Seq, self).__init__() 25 |         self.encoder = encoder 26 |         self.decoder=decoder 27 |         self.config=config 28 |         self.register_buffer("bias", torch.tril(torch.ones(2048, 2048))) 29 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size) 30 |         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 31 |         self.lsm = nn.LogSoftmax(dim=-1) 32 |         self.tie_weights() 33 |          34 |         self.beam_size=beam_size 35 |         self.max_length=max_length 36 |         self.sos_id=sos_id 37 |         self.eos_id=eos_id 38 |          39 |     def _tie_or_clone_weights(self, first_module, second_module): 40 |         """ Tie or clone module weights depending on whether we are using TorchScript or not 41 |         """ 42 |         if self.config.torchscript: 43 |             first_module.weight = nn.Parameter(second_module.weight.clone()) 44 |         else: 45 |             first_module.weight = second_module.weight 46 |                   47 |     def tie_weights(self): 48 |         """ Make sure we are sharing the input and output embeddings. 49 |             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
50 | """ 51 | self._tie_or_clone_weights(self.lm_head, 52 | self.encoder.embeddings.word_embeddings) 53 | 54 | def forward(self, source_ids=None,source_mask=None,target_ids=None,target_mask=None,args=None): 55 | outputs = self.encoder(source_ids, attention_mask=source_mask) 56 | encoder_output = outputs[0].permute([1,0,2]).contiguous() 57 | if target_ids is not None: 58 | attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]]) 59 | tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() 60 | out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool()) 61 | hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous() 62 | lm_logits = self.lm_head(hidden_states) 63 | # Shift so that tokens < n predict n 64 | active_loss = target_mask[..., 1:].ne(0).view(-1) == 1 65 | shift_logits = lm_logits[..., :-1, :].contiguous() 66 | shift_labels = target_ids[..., 1:].contiguous() 67 | # Flatten the tokens 68 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 69 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 70 | shift_labels.view(-1)[active_loss]) 71 | 72 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 73 | return outputs 74 | else: 75 | #Predict 76 | preds=[] 77 | zero=torch.cuda.LongTensor(1).fill_(0) 78 | for i in range(source_ids.shape[0]): 79 | context=encoder_output[:,i:i+1] 80 | context_mask=source_mask[i:i+1,:] 81 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 82 | input_ids=beam.getCurrentState() 83 | context=context.repeat(1, self.beam_size,1) 84 | context_mask=context_mask.repeat(self.beam_size,1) 85 | for _ in range(self.max_length): 86 | if beam.done(): 87 | break 88 | attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]]) 89 | tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous() 90 | out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool()) 91 | out = torch.tanh(self.dense(out)) 92 | hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:] 93 | out = self.lsm(self.lm_head(hidden_states)).data 94 | beam.advance(out) 95 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 96 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 97 | hyp= beam.getHyp(beam.getFinal()) 98 | pred=beam.buildTargetTokens(hyp)[:self.beam_size] 99 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 100 | preds.append(torch.cat(pred,0).unsqueeze(0)) 101 | 102 | preds=torch.cat(preds,0) 103 | return preds 104 | 105 | 106 | 107 | class Beam(object): 108 | def __init__(self, size,sos,eos): 109 | self.size = size 110 | self.tt = torch.cuda 111 | # The score for each translation on the beam. 112 | self.scores = self.tt.FloatTensor(size).zero_() 113 | # The backpointers at each time-step. 114 | self.prevKs = [] 115 | # The outputs at each time-step. 116 | self.nextYs = [self.tt.LongTensor(size) 117 | .fill_(0)] 118 | self.nextYs[0][0] = sos 119 | # Has EOS topped the beam yet. 120 | self._eos = eos 121 | self.eosTop = False 122 | # Time and k pair for finished. 123 | self.finished = [] 124 | 125 | def getCurrentState(self): 126 | "Get the outputs for the current timestep." 127 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 128 | return batch 129 | 130 | def getCurrentOrigin(self): 131 | "Get the backpointers for the current timestep." 
132 | return self.prevKs[-1] 133 | 134 | def advance(self, wordLk): 135 | """ 136 | Given prob over words for every last beam `wordLk` and attention 137 | `attnOut`: Compute and update the beam search. 138 | 139 | Parameters: 140 | 141 | * `wordLk`- probs of advancing from the last step (K x words) 142 | * `attnOut`- attention at the last step 143 | 144 | Returns: True if beam search is complete. 145 | """ 146 | numWords = wordLk.size(1) 147 | 148 | # Sum the previous scores. 149 | if len(self.prevKs) > 0: 150 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 151 | 152 | # Don't let EOS have children. 153 | for i in range(self.nextYs[-1].size(0)): 154 | if self.nextYs[-1][i] == self._eos: 155 | beamLk[i] = -1e20 156 | else: 157 | beamLk = wordLk[0] 158 | flatBeamLk = beamLk.view(-1) 159 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 160 | 161 | self.scores = bestScores 162 | 163 | # bestScoresId is flattened beam x word array, so calculate which 164 | # word and beam each score came from 165 | prevK = bestScoresId // numWords 166 | self.prevKs.append(prevK) 167 | self.nextYs.append((bestScoresId - prevK * numWords)) 168 | 169 | 170 | for i in range(self.nextYs[-1].size(0)): 171 | if self.nextYs[-1][i] == self._eos: 172 | s = self.scores[i] 173 | self.finished.append((s, len(self.nextYs) - 1, i)) 174 | 175 | # End condition is when top-of-beam is EOS and no global score. 176 | if self.nextYs[-1][0] == self._eos: 177 | self.eosTop = True 178 | 179 | def done(self): 180 | return self.eosTop and len(self.finished) >=self.size 181 | 182 | def getFinal(self): 183 | if len(self.finished) == 0: 184 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 185 | self.finished.sort(key=lambda a: -a[0]) 186 | if len(self.finished) != self.size: 187 | unfinished=[] 188 | for i in range(self.nextYs[-1].size(0)): 189 | if self.nextYs[-1][i] != self._eos: 190 | s = self.scores[i] 191 | unfinished.append((s, len(self.nextYs) - 1, i)) 192 | unfinished.sort(key=lambda a: -a[0]) 193 | self.finished+=unfinished[:self.size-len(self.finished)] 194 | return self.finished[:self.size] 195 | 196 | def getHyp(self, beam_res): 197 | """ 198 | Walk back to construct the full hypothesis. 199 | """ 200 | hyps=[] 201 | for _,timestep, k in beam_res: 202 | hyp = [] 203 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 204 | hyp.append(self.nextYs[j+1][k]) 205 | k = self.prevKs[j][k] 206 | hyps.append(hyp[::-1]) 207 | return hyps 208 | 209 | def buildTargetTokens(self, preds): 210 | sentence=[] 211 | for pred in preds: 212 | tokens = [] 213 | for tok in pred: 214 | if tok==self._eos: 215 | break 216 | tokens.append(tok) 217 | sentence.append(tokens) 218 | return sentence 219 | 220 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/README.md: -------------------------------------------------------------------------------- 1 | # Code Search 2 | 3 | ## Data Preprocess 4 | 5 | Both training and validation datasets are created in a way that positive and negative samples are balanced. Negative samples consist of balanced number of instances with randomly replaced NL and PL. 6 | 7 | We follow the official evaluation metric to calculate the Mean Reciprocal Rank (MRR) for each pair of test data (c, w) over a fixed set of 999 distractor codes. 8 | 9 | You can use the following command to download the preprocessed training and validation dataset and preprocess the test dataset by yourself. 
The preprocessed testing dataset is very large, so only the preprocessing script is provided. 10 | 11 | ```shell 12 | mkdir data data/codesearch 13 | cd data/codesearch 14 | gdown https://drive.google.com/uc?id=1xgSR34XO8xXZg4cZScDYj2eGerBE9iGo 15 | unzip codesearch_data.zip 16 | rm codesearch_data.zip 17 | cd ../../codesearch 18 | python process_data.py 19 | cd .. 20 | ``` 21 | 22 | ## Fine-Tune 23 | We fine-tuned the model on 2*P100 GPUs. 24 | ```shell 25 | cd codesearch 26 | 27 | lang=php #fine-tuning a language-specific model for each programming language 28 | pretrained_model=microsoft/codebert-base #Roberta: roberta-base 29 | 30 | python run_classifier.py \ 31 | --model_type roberta \ 32 | --task_name codesearch \ 33 | --do_train \ 34 | --do_eval \ 35 | --eval_all_checkpoints \ 36 | --train_file train.txt \ 37 | --dev_file valid.txt \ 38 | --max_seq_length 200 \ 39 | --per_gpu_train_batch_size 32 \ 40 | --per_gpu_eval_batch_size 32 \ 41 | --learning_rate 1e-5 \ 42 | --num_train_epochs 8 \ 43 | --gradient_accumulation_steps 1 \ 44 | --overwrite_output_dir \ 45 | --data_dir ../data/codesearch/train_valid/$lang \ 46 | --output_dir ./models/$lang \ 47 | --model_name_or_path $pretrained_model 48 | ``` 49 | ## Inference and Evaluation 50 | 51 | Inference 52 | ```shell 53 | lang=php #programming language 54 | idx=0 #test batch idx 55 | 56 | python run_classifier.py \ 57 | --model_type roberta \ 58 | --model_name_or_path microsoft/codebert-base \ 59 | --task_name codesearch \ 60 | --do_predict \ 61 | --output_dir ./models/$lang \ 62 | --data_dir ../data/codesearch/test/$lang \ 63 | --max_seq_length 200 \ 64 | --per_gpu_train_batch_size 32 \ 65 | --per_gpu_eval_batch_size 32 \ 66 | --learning_rate 1e-5 \ 67 | --num_train_epochs 8 \ 68 | --test_file batch_${idx}.txt \ 69 | --pred_model_dir ./models/$lang/checkpoint-best/ \ 70 | --test_result_dir ./results/$lang/${idx}_batch_result.txt 71 | ``` 72 | 73 | Evaluation 74 | ```shell 75 | python mrr.py 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/mrr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 
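# Sketch of the metric computed below: scores arrive in batches of 1000
# candidates per query (see --test_batch_size); the correct snippet of batch i
# sits at line i of that batch, its rank is the number of candidates scoring
# at least as high, and MRR = mean(1 / rank) over all queries.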
4 | 5 | import os 6 | import numpy as np 7 | from more_itertools import chunked 8 | import argparse 9 | 10 | 11 | def main(): 12 |     parser = argparse.ArgumentParser() 13 |     parser.add_argument('--test_batch_size', type=int, default=1000) 14 |     args = parser.parse_args() 15 |     languages = ['ruby', 'go', 'php', 'python', 'java', 'javascript'] 16 |     MRR_dict = {} 17 |     for language in languages: 18 |         file_dir = './results/{}'.format(language) 19 |         ranks = [] 20 |         num_batch = 0 21 |         for file in sorted(os.listdir(file_dir)): 22 |             print(os.path.join(file_dir, file)) 23 |             with open(os.path.join(file_dir, file), encoding='utf-8') as f: 24 |                 batched_data = chunked(f.readlines(), args.test_batch_size) 25 |                 for batch_idx, batch_data in enumerate(batched_data): 26 |                     num_batch += 1 27 |                     correct_score = float(batch_data[batch_idx].strip().split('<CODESPLIT>')[-1]) 28 |                     scores = np.array([float(data.strip().split('<CODESPLIT>')[-1]) for data in batch_data]) 29 |                     rank = np.sum(scores >= correct_score) 30 |                     ranks.append(rank) 31 | 32 |         mean_mrr = np.mean(1.0 / np.array(ranks)) 33 |         print("{} mrr: {}".format(language, mean_mrr)) 34 |         MRR_dict[language] = mean_mrr 35 |     for key, val in MRR_dict.items(): 36 |         print("{} mrr: {}".format(key, val)) 37 | 38 | 39 | if __name__ == "__main__": 40 |     main() 41 | -------------------------------------------------------------------------------- /CodeBERT/codesearch/process_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 4 | 5 | import gzip 6 | import os 7 | import json 8 | import numpy as np 9 | from more_itertools import chunked 10 | 11 | DATA_DIR='../data/codesearch' 12 | 13 | def format_str(string): 14 |     for char in ['\r\n', '\r', '\n']: 15 |         string = string.replace(char, ' ') 16 |     return string 17 | 18 | 19 | def preprocess_test_data(language, test_batch_size=1000): 20 |     path = os.path.join(DATA_DIR, '{}_test_0.jsonl.gz'.format(language)) 21 |     print(path) 22 |     with gzip.open(path, 'r') as pf: 23 |         data = pf.readlines() 24 | 25 |     idxs = np.arange(len(data)) 26 |     data = np.array(data, dtype=np.object) 27 | 28 |     np.random.seed(0)  # set random seed so that random things are reproducible 29 |     np.random.shuffle(idxs) 30 |     data = data[idxs] 31 |     batched_data = chunked(data, test_batch_size) 32 | 33 |     print("start processing") 34 |     for batch_idx, batch_data in enumerate(batched_data): 35 |         if len(batch_data) < test_batch_size: 36 |             break  # the last batch is smaller than the others, exclude.
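        # Pair each docstring with every code snippet in its batch: the
        # same-index pair is the true match, and the other 999 snippets act
        # as distractors when mrr.py later ranks the classifier scores.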
37 |         examples = [] 38 |         for d_idx, d in enumerate(batch_data): 39 |             line_a = json.loads(str(d, encoding='utf-8')) 40 |             doc_token = ' '.join(line_a['docstring_tokens']) 41 |             for dd in batch_data: 42 |                 line_b = json.loads(str(dd, encoding='utf-8')) 43 |                 code_token = ' '.join([format_str(token) for token in line_b['code_tokens']]) 44 | 45 |                 example = (str(1), line_a['url'], line_b['url'], doc_token, code_token) 46 |                 example = '<CODESPLIT>'.join(example) 47 |                 examples.append(example) 48 | 49 |         data_path = os.path.join(DATA_DIR, 'test/{}'.format(language)) 50 |         if not os.path.exists(data_path): 51 |             os.makedirs(data_path) 52 |         file_path = os.path.join(data_path, 'batch_{}.txt'.format(batch_idx)) 53 |         print(file_path) 54 |         with open(file_path, 'w', encoding='utf-8') as f: 55 |             f.writelines('\n'.join(examples)) 56 | 57 | if __name__ == '__main__': 58 |     languages = ['go', 'php', 'python', 'java', 'javascript', 'ruby'] 59 |     for lang in languages: 60 |         preprocess_test_data(lang) 61 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection 2 | 3 | ## Task Definition 4 | 5 | Given two code snippets as input, the task is binary classification (0/1), where 1 stands for semantic equivalence and 0 otherwise. Models are evaluated by F1 score. 6 | 7 | ## Updates 8 | 9 | 2021-9-13: We have updated the evaluator script. Since this is binary classification, we use the binary F1 score instead of the "macro" F1 score. 10 | 11 | ## Dataset 12 | 13 | The dataset we use is [BigCloneBench](https://www.cs.usask.ca/faculty/croy/papers/2014/SvajlenkoICSME2014BigERA.pdf), filtered following the paper [Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree](https://arxiv.org/pdf/2002.08653.pdf). 14 | 15 | ### Data Format 16 | 17 | 1. dataset/data.jsonl is stored in jsonlines format. Each line in the uncompressed file represents one function. One row is illustrated below. 18 | 19 |    - **func:** the function 20 | 21 |    - **idx:** index of the example 22 | 23 | 2. train.txt/valid.txt/test.txt provide examples, stored in the following format: idx1 idx2 label 24 | 25 | ### Data Statistics 26 | 27 | Data statistics of the dataset are shown in the table below: 28 | 29 | |       | #Examples | 30 | | ----- | :-------: | 31 | | Train |  901,028  | 32 | | Dev   |  415,416  | 33 | | Test  |  415,416  | 34 | 35 | You can get the data using the following command. 36 | 37 | ``` 38 | unzip dataset.zip 39 | ``` 40 | 41 | ## Evaluator 42 | 43 | We provide a script to evaluate predictions for this task, which reports the F1 score. 44 | 45 | ### Example 46 | 47 | ```bash 48 | python evaluator/evaluator.py -a evaluator/answers.txt -p evaluator/predictions.txt 49 | ``` 50 | 51 | {'Recall': 0.25, 'Precision': 0.5, 'F1': 0.3333333333333333} 52 | 53 | ### Input predictions 54 | 55 | A predictions file in TXT format, such as evaluator/predictions.txt. For example: 56 | 57 | ``` 58 | 13653451	21955002	0 59 | 1188160	8831513	1 60 | 1141235	14322332	0 61 | 16765164	17526811	1 62 | ``` 63 | 64 | ## Pipeline-GraphCodeBERT 65 | 66 | We also provide a pipeline that fine-tunes GraphCodeBERT on this task.
67 | ### Dependency 68 | 69 | - pip install torch 70 | - pip install transformers 71 | - pip install tree_sitter 72 | - pip install scikit-learn 73 | 74 | ### Tree-sitter (optional) 75 | 76 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following command: 77 | 78 | ```shell 79 | cd parser 80 | bash build.sh 81 | cd .. 82 | ``` 83 | 84 | ### Fine-tune 85 | 86 | We use 4*V100-16G GPUs to fine-tune and 10% of the valid data to evaluate. 87 | 88 | 89 | ```shell 90 | mkdir saved_models 91 | python run.py \ 92 |     --output_dir=saved_models \ 93 |     --config_name=microsoft/graphcodebert-base \ 94 |     --model_name_or_path=microsoft/graphcodebert-base \ 95 |     --tokenizer_name=microsoft/graphcodebert-base \ 96 |     --do_train \ 97 |     --train_data_file=dataset/train.txt \ 98 |     --eval_data_file=dataset/valid.txt \ 99 |     --test_data_file=dataset/test.txt \ 100 |     --epoch 1 \ 101 |     --code_length 512 \ 102 |     --data_flow_length 128 \ 103 |     --train_batch_size 16 \ 104 |     --eval_batch_size 32 \ 105 |     --learning_rate 2e-5 \ 106 |     --max_grad_norm 1.0 \ 107 |     --evaluate_during_training \ 108 |     --seed 123456 2>&1| tee saved_models/train.log 109 | ``` 110 | 111 | ### Inference 112 | 113 | We use full test data for inference. 114 | 115 | ```shell 116 | python run.py \ 117 |     --output_dir=saved_models \ 118 |     --config_name=microsoft/graphcodebert-base \ 119 |     --model_name_or_path=microsoft/graphcodebert-base \ 120 |     --tokenizer_name=microsoft/graphcodebert-base \ 121 |     --do_eval \ 122 |     --do_test \ 123 |     --train_data_file=dataset/train.txt \ 124 |     --eval_data_file=dataset/valid.txt \ 125 |     --test_data_file=dataset/test.txt \ 126 |     --epoch 1 \ 127 |     --code_length 512 \ 128 |     --data_flow_length 128 \ 129 |     --train_batch_size 16 \ 130 |     --eval_batch_size 32 \ 131 |     --learning_rate 2e-5 \ 132 |     --max_grad_norm 1.0 \ 133 |     --evaluate_during_training \ 134 |     --seed 123456 2>&1| tee saved_models/test.log 135 | ``` 136 | 137 | ### Evaluation 138 | 139 | ```shell 140 | python evaluator/evaluator.py -a dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log 141 | ``` 142 | 143 | ## Result 144 | 145 | The results on the test set are shown below: 146 | 147 | | Method        | Precision | Recall    | F1        | 148 | | ------------- | :-------: | :-------: | :-------: | 149 | | Deckard       | 0.93      | 0.02      | 0.03      | 150 | | RtvNN         | 0.95      | 0.01      | 0.01      | 151 | | CDLH          | 0.92      | 0.74      | 0.82      | 152 | | ASTNN         | 0.92      | 0.94      | 0.93      | 153 | | FA-AST-GMN    | **0.96**  | 0.94      | 0.95      | 154 | | CodeBERT      | 0.947     | 0.934     | 0.941     | 155 | | GraphCodeBERT | 0.948     | **0.952** | **0.950** | 156 | 157 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/clonedetection/dataset.zip -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/answers.txt: -------------------------------------------------------------------------------- 1 | 13653451	21955002	0 2 | 1188160	8831513	0 3 | 1141235	14322332	0 4 | 16765164	17526811	0 -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
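# Both input files are expected to contain one whitespace-separated
# "idx1 idx2 label" triple per line, as in answers.txt and predictions.txt
# in this directory.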
3 | import logging 4 | import sys 5 | from sklearn.metrics import recall_score,precision_score,f1_score 6 | 7 | def read_answers(filename): 8 |     answers={} 9 |     with open(filename) as f: 10 |         for line in f: 11 |             line=line.strip() 12 |             idx1,idx2,label=line.split() 13 |             answers[(idx1,idx2)]=int(label) 14 |     return answers 15 | 16 | def read_predictions(filename): 17 |     predictions={} 18 |     with open(filename) as f: 19 |         for line in f: 20 |             line=line.strip() 21 |             idx1,idx2,label=line.split() 22 |             predictions[(idx1,idx2)]=int(label) 23 |     return predictions 24 | 25 | def calculate_scores(answers,predictions): 26 |     y_trues,y_preds=[],[] 27 |     for key in answers: 28 |         if key not in predictions: 29 |             logging.error("Missing prediction for ({},{}) pair.".format(key[0],key[1])) 30 |             sys.exit() 31 |         y_trues.append(answers[key]) 32 |         y_preds.append(predictions[key]) 33 |     scores={} 34 |     scores['Recall']=recall_score(y_trues, y_preds) 35 |     scores['Precision']=precision_score(y_trues, y_preds) 36 |     scores['F1']=f1_score(y_trues, y_preds) 37 |     return scores 38 | 39 | def main(): 40 |     import argparse 41 |     parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for BigCloneBench dataset.') 42 |     parser.add_argument('--answers', '-a',help="filename of the labels, in txt format.") 43 |     parser.add_argument('--predictions', '-p',help="filename of the leaderboard predictions, in txt format.") 44 | 45 | 46 |     args = parser.parse_args() 47 |     answers=read_answers(args.answers) 48 |     predictions=read_predictions(args.predictions) 49 |     scores=calculate_scores(answers,predictions) 50 |     print(scores) 51 | 52 | if __name__ == '__main__': 53 |     main() 54 | 55 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/evaluator/predictions.txt: -------------------------------------------------------------------------------- 1 | 13653451	21955002	0 2 | 1188160	8831513	1 3 | 1141235	14322332	0 4 | 16765164	17526811	1 -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch 4 | from torch.autograd import Variable 5 | import copy 6 | import torch.nn.functional as F 7 | from torch.nn import CrossEntropyLoss, MSELoss 8 | 9 | class RobertaClassificationHead(nn.Module): 10 |     """Head for sentence-level classification tasks.""" 11 | 12 |     def __init__(self, config): 13 |         super().__init__() 14 |         self.dense = nn.Linear(config.hidden_size*2, config.hidden_size) 15 |         self.dropout = nn.Dropout(config.hidden_dropout_prob) 16 |         self.out_proj = nn.Linear(config.hidden_size, 2) 17 | 18 |     def forward(self, features, **kwargs): 19 |         x = features[:, 0, :]  # take <s> token (equiv.
to [CLS]) 20 | x = x.reshape(-1,x.size(-1)*2) 21 | x = self.dropout(x) 22 | x = self.dense(x) 23 | x = torch.tanh(x) 24 | x = self.dropout(x) 25 | x = self.out_proj(x) 26 | return x 27 | 28 | class Model(nn.Module): 29 | def __init__(self, encoder,config,tokenizer,args): 30 | super(Model, self).__init__() 31 | self.encoder = encoder 32 | self.config=config 33 | self.tokenizer=tokenizer 34 | self.classifier=RobertaClassificationHead(config) 35 | self.args=args 36 | 37 | 38 | def forward(self, inputs_ids_1,position_idx_1,attn_mask_1,inputs_ids_2,position_idx_2,attn_mask_2,labels=None): 39 | bs,l=inputs_ids_1.size() 40 | inputs_ids=torch.cat((inputs_ids_1.unsqueeze(1),inputs_ids_2.unsqueeze(1)),1).view(bs*2,l) 41 | position_idx=torch.cat((position_idx_1.unsqueeze(1),position_idx_2.unsqueeze(1)),1).view(bs*2,l) 42 | attn_mask=torch.cat((attn_mask_1.unsqueeze(1),attn_mask_2.unsqueeze(1)),1).view(bs*2,l,l) 43 | 44 | #embedding 45 | nodes_mask=position_idx.eq(0) 46 | token_mask=position_idx.ge(2) 47 | inputs_embeddings=self.encoder.roberta.embeddings.word_embeddings(inputs_ids) 48 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 49 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 50 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 51 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 52 | 53 | outputs = self.encoder.roberta(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx,token_type_ids=position_idx.eq(-1).long())[0] 54 | logits=self.classifier(outputs) 55 | # shape: [batch_size, num_classes] 56 | prob=F.softmax(logits, dim=-1) 57 | if labels is not None: 58 | loss_fct = CrossEntropyLoss() 59 | loss = loss_fct(logits, labels) 60 | return loss,prob 61 | else: 62 | return prob 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
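# Assumes the tree-sitter grammar repositories listed below have already been
# cloned into this directory; build.sh does that before invoking this script.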
3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/clonedetection/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/clonedetection/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
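        (Rough sketch of the behaviour: '#' line comments are dropped,
        module- and function-level docstrings are removed, and blank lines
        are squeezed out of the result.)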
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Code Search 4 | 5 | ## Data Preprocess 6 | 7 | Different from the setting of [CodeSearchNet](husain2019codesearchnet), the answer of each query is retrieved from the whole development and testing code corpus instead of 1,000 candidate codes. Besides, we observe that some queries contain content unrelated to the code, such as a link ``http://..." that refers to external resources. Therefore, we filter following examples to improve the quality of the dataset. 
8 | 9 | - Remove comments in the code 10 | 11 | - Remove examples whose code cannot be parsed into an abstract syntax tree. 12 | 13 | - Remove examples whose documents are shorter than 3 tokens or longer than 256 tokens. 14 | 15 | - Remove examples whose documents contain special tokens (e.g. `<img ...>` or `https:...`) 16 | 17 | - Remove examples whose documents are not in English. 18 | 19 | Data statistics of the cleaned dataset are shown in the table below. 20 | 21 | | PL         | Training | Dev    | Test   | Candidate codes | 22 | | :--------- | :------: | :----: | :----: | :-------------: | 23 | | Python     | 251,820  | 13,914 | 14,918 | 43,827          | 24 | | PHP        | 241,241  | 12,982 | 14,014 | 52,660          | 25 | | Go         | 167,288  | 7,325  | 8,122  | 28,120          | 26 | | Java       | 164,923  | 5,183  | 10,955 | 40,347          | 27 | | JavaScript | 58,025   | 3,885  | 3,291  | 13,981          | 28 | | Ruby       | 24,927   | 1,400  | 1,261  | 4,360           | 29 | 30 | You can download and preprocess the data using the following command. 31 | ```shell 32 | unzip dataset.zip 33 | cd dataset 34 | bash run.sh 35 | cd .. 36 | ``` 37 | 38 | ## Dependency 39 | 40 | - pip install torch 41 | - pip install transformers 42 | - pip install tree_sitter 43 | 44 | ### Tree-sitter (optional) 45 | 46 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following command: 47 | 48 | ```shell 49 | cd parser 50 | bash build.sh 51 | cd .. 52 | ``` 53 | 54 | ## Fine-Tune 55 | 56 | We fine-tuned the model on 2*V100-16G GPUs. 57 | ```shell 58 | lang=ruby 59 | mkdir -p ./saved_models/$lang 60 | python run.py \ 61 |     --output_dir=./saved_models/$lang \ 62 |     --config_name=microsoft/graphcodebert-base \ 63 |     --model_name_or_path=microsoft/graphcodebert-base \ 64 |     --tokenizer_name=microsoft/graphcodebert-base \ 65 |     --lang=$lang \ 66 |     --do_train \ 67 |     --train_data_file=dataset/$lang/train.jsonl \ 68 |     --eval_data_file=dataset/$lang/valid.jsonl \ 69 |     --test_data_file=dataset/$lang/test.jsonl \ 70 |     --codebase_file=dataset/$lang/codebase.jsonl \ 71 |     --num_train_epochs 10 \ 72 |     --code_length 256 \ 73 |     --data_flow_length 64 \ 74 |     --nl_length 128 \ 75 |     --train_batch_size 32 \ 76 |     --eval_batch_size 64 \ 77 |     --learning_rate 2e-5 \ 78 |     --seed 123456 2>&1| tee saved_models/$lang/train.log 79 | ``` 80 | ## Inference and Evaluation 81 | 82 | ```shell 83 | lang=ruby 84 | python run.py \ 85 |     --output_dir=./saved_models/$lang \ 86 |     --config_name=microsoft/graphcodebert-base \ 87 |     --model_name_or_path=microsoft/graphcodebert-base \ 88 |     --tokenizer_name=microsoft/graphcodebert-base \ 89 |     --lang=$lang \ 90 |     --do_eval \ 91 |     --do_test \ 92 |     --train_data_file=dataset/$lang/train.jsonl \ 93 |     --eval_data_file=dataset/$lang/valid.jsonl \ 94 |     --test_data_file=dataset/$lang/test.jsonl \ 95 |     --codebase_file=dataset/$lang/codebase.jsonl \ 96 |     --num_train_epochs 10 \ 97 |     --code_length 256 \ 98 |     --data_flow_length 64 \ 99 |     --nl_length 128 \ 100 |     --train_batch_size 32 \ 101 |     --eval_batch_size 64 \ 102 |     --learning_rate 2e-5 \ 103 |     --seed 123456 2>&1| tee saved_models/$lang/test.log 104 | ``` 105 | 106 | ## Results 107 | 108 | The results on the filtered dataset are shown in the table below: 109 | 110 | | Model          | Ruby      | Javascript | Go        | Python    | Java      | PHP       | Overall   | 111 | | -------------- | :-------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: | 112 | | NBow           | 0.162     | 0.157      | 0.330     | 0.161     | 0.171     | 0.152     | 0.189     | 113 | | CNN            | 0.276     | 0.224      | 0.680     | 0.242     | 0.263     | 0.260     | 0.324     | 114 | | BiRNN          | 0.213     | 0.193      | 0.688     | 0.290     | 0.304     | 0.338     | 0.338     | 115 | | 
SelfAtt | 0.275 | 0.287 | 0.723 | 0.398 | 0.404 | 0.426 | 0.419 | 116 | | RoBERTa | 0.587 | 0.517 | 0.850 | 0.587 | 0.599 | 0.560 | 0.617 | 117 | | RoBERTa (code) | 0.628 | 0.562 | 0.859 | 0.610 | 0.620 | 0.579 | 0.643 | 118 | | CodeBERT | 0.679 | 0.620 | 0.882 | 0.672 | 0.676 | 0.628 | 0.693 | 119 | | GraphCodeBERT | **0.703** | **0.644** | **0.897** | **0.692** | **0.691** | **0.649** | **0.713** | 120 | 121 | 122 | ## Model and Demo 123 | A pretrained model, additional training script with dataset, and demo of a finetuned CodeBERT model for the task of Code Search can be found here: https://drive.google.com/file/d/1ZO-xVIzGcNE6Gz9DEg2z5mIbBv4Ft1cK/view. 124 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/codesearch/dataset.zip -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, attn_mask=None,position_idx=None, nl_inputs=None): 11 | if code_inputs is not None: 12 | nodes_mask=position_idx.eq(0) 13 | token_mask=position_idx.ge(2) 14 | inputs_embeddings=self.encoder.embeddings.word_embeddings(code_inputs) 15 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 16 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 17 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 18 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 19 | return self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx)[1] 20 | else: 21 | return self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[1] 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/codesearch/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/codesearch/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/README.md: -------------------------------------------------------------------------------- 1 | # Code Refinement 2 | 3 | ## Task Definition 4 | 5 | Code refinement aims to automatically fix bugs in the code, which can contribute to reducing the cost of bug-fixes for developers. 6 | In CodeXGLUE, given a piece of Java code with bugs, the task is to remove the bugs to output the refined code. 7 | Models are evaluated by BLEU scores and accuracy (exactly match). 8 | 9 | ## Dataset 10 | 11 | We use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). 
The source side is a Java function with bugs and the target side is the refined one. 12 | All the function and variable names are normalized. Their dataset contains two subsets (i.e., small and medium) based on the function length. 13 | 14 | ### Data Format 15 | 16 | The dataset is in the "data" folder. Each line of the files is a function. You can get the data using the following command: 17 | 18 | ``` 19 | unzip data.zip 20 | ``` 21 | 22 | ### Data Statistics 23 | 24 | Data statistics of this dataset are shown in the table below: 25 | 26 | | | #Examples | #Examples | 27 | | ------- | :-------: | :-------: | 28 | | | Small | Medium | 29 | | Train | 46,680 | 52,364 | 30 | | Valid | 5,835 | 6,545 | 31 | | Test | 5,835 | 6,545 | 32 | 33 | ## Pipeline-GraphCodeBERT 34 | 35 | ### Dependency 36 | 37 | - pip install torch 38 | - pip install transformers 39 | - pip install tree_sitter 40 | 41 | ### Tree-sitter (optional) 42 | 43 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands: 44 | 45 | ```shell 46 | cd parser 47 | bash build.sh 48 | cd .. 49 | ``` 50 | 51 | ### Fine-tune 52 | We use 4*V100-16G to fine-tune. Taking the "small" subset as an example: 53 | 54 | ```shell 55 | scale=small 56 | lr=1e-4 57 | batch_size=32 58 | beam_size=10 59 | source_length=320 60 | target_length=256 61 | output_dir=saved_models/$scale/ 62 | train_file=data/$scale/train.buggy-fixed.buggy,data/$scale/train.buggy-fixed.fixed 63 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed 64 | epochs=50 65 | pretrained_model=microsoft/graphcodebert-base 66 | 67 | mkdir -p $output_dir 68 | python run.py --do_train --do_eval --model_type roberta --model_name_or_path $pretrained_model --tokenizer_name microsoft/graphcodebert-base --config_name microsoft/graphcodebert-base --train_filename $train_file --dev_filename $dev_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --train_batch_size $batch_size --eval_batch_size $batch_size --learning_rate $lr --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 69 | ``` 70 | 71 | ### Inference 72 | 73 | We use the full test data for inference. 
74 | 75 | ```shell 76 | batch_size=64 77 | dev_file=data/$scale/valid.buggy-fixed.buggy,data/$scale/valid.buggy-fixed.fixed 78 | test_file=data/$scale/test.buggy-fixed.buggy,data/$scale/test.buggy-fixed.fixed 79 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test 80 | 81 | python run.py --do_test --model_type roberta --model_name_or_path $pretrained_model --tokenizer_name microsoft/graphcodebert-base --config_name microsoft/graphcodebert-base --load_model_path $load_model_path --dev_filename $dev_file --test_filename $test_file --output_dir $output_dir --max_source_length $source_length --max_target_length $target_length --beam_size $beam_size --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log 82 | ``` 83 | 84 | 85 | 86 | ## Result 87 | 88 | The results on the test set are shown below: 89 | 90 | Small: 91 | 92 | | Method | BLEU | Acc (100%) | 93 | | ------------- | :-------: | :--------: | 94 | | Naive copy | 78.06 | 0.0 | 95 | | LSTM | 76.76 | 10.0 | 96 | | Transformer | 77.21 | 14.7 | 97 | | CodeBERT | 77.42 | 16.4 | 98 | | GraphCodeBERT | **80.02** | **17.3** | 99 | 100 | Medium: 101 | 102 | | Method | BLEU | Acc (100%) | 103 | | ------------- | :-------: | :--------: | 104 | | Naive copy | 90.91 | 0.0 | 105 | | LSTM | 72.08 | 2.5 | 106 | | Transformer | 89.25 | 3.7 | 107 | | CodeBERT | 91.07 | 5.16 | 108 | | GraphCodeBERT | **91.31** | **9.1** | 109 | 110 | 111 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams up to a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | method. 35 | 36 | Returns: 37 | The Counter containing all n-grams up to max_order in segment 38 | with a count of how many times each n-gram occurred. 
39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/refinement/data.zip -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Build Sequence-to-Sequence. 12 | 13 | Parameters: 14 | 15 | * `encoder`- encoder of seq2seq model. e.g. roberta 16 | * `decoder`- decoder of seq2seq model. e.g. transformer 17 | * `config`- configuration of encoder model. 18 | * `beam_size`- beam size for beam search. 19 | * `max_length`- max length of target for beam search. 20 | * `sos_id`- start-of-sequence symbol id in target for beam search. 21 | * `eos_id`- end-of-sequence symbol id in target for beam search. 22 | """ 23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None): 24 | super(Seq2Seq, self).__init__() 25 | self.encoder = encoder 26 | self.decoder=decoder 27 | self.config=config 28 | self.register_buffer("bias", torch.tril(torch.ones(2048, 2048))) 29 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 31 | self.lsm = nn.LogSoftmax(dim=-1) 32 | self.tie_weights() 33 | 34 | self.beam_size=beam_size 35 | self.max_length=max_length 36 | self.sos_id=sos_id 37 | self.eos_id=eos_id 38 | 39 | def _tie_or_clone_weights(self, first_module, second_module): 40 | """ Tie or clone module weights depending on whether we are using TorchScript or not 41 | """ 42 | if self.config.torchscript: 43 | first_module.weight = nn.Parameter(second_module.weight.clone()) 44 | else: 45 | first_module.weight = second_module.weight 46 | 47 | def tie_weights(self): 48 | """ Make sure we are sharing the input and output embeddings. 49 | Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
50 | """ 51 | self._tie_or_clone_weights(self.lm_head, 52 | self.encoder.embeddings.word_embeddings) 53 | 54 | def forward(self, source_ids,source_mask,position_idx,attn_mask,target_ids=None,target_mask=None,args=None): 55 | #embedding 56 | nodes_mask=position_idx.eq(0) 57 | token_mask=position_idx.ge(2) 58 | inputs_embeddings=self.encoder.embeddings.word_embeddings(source_ids) 59 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 60 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 61 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 62 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 63 | 64 | outputs = self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx) 65 | encoder_output = outputs[0].permute([1,0,2]).contiguous() 66 | #source_mask=token_mask.float() 67 | if target_ids is not None: 68 | attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]]) 69 | tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() 70 | out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool()) 71 | hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous() 72 | lm_logits = self.lm_head(hidden_states) 73 | # Shift so that tokens < n predict n 74 | active_loss = target_mask[..., 1:].ne(0).view(-1) == 1 75 | shift_logits = lm_logits[..., :-1, :].contiguous() 76 | shift_labels = target_ids[..., 1:].contiguous() 77 | # Flatten the tokens 78 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 79 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 80 | shift_labels.view(-1)[active_loss]) 81 | 82 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 83 | return outputs 84 | else: 85 | #Predict 86 | preds=[] 87 | zero=torch.cuda.LongTensor(1).fill_(0) 88 | for i in range(source_ids.shape[0]): 89 | context=encoder_output[:,i:i+1] 90 | context_mask=source_mask[i:i+1,:] 91 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 92 | input_ids=beam.getCurrentState() 93 | context=context.repeat(1, self.beam_size,1) 94 | context_mask=context_mask.repeat(self.beam_size,1) 95 | for _ in range(self.max_length): 96 | if beam.done(): 97 | break 98 | attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]]) 99 | tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous() 100 | out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool()) 101 | out = torch.tanh(self.dense(out)) 102 | hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:] 103 | out = self.lsm(self.lm_head(hidden_states)).data 104 | beam.advance(out) 105 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 106 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 107 | hyp= beam.getHyp(beam.getFinal()) 108 | pred=beam.buildTargetTokens(hyp)[:self.beam_size] 109 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 110 | preds.append(torch.cat(pred,0).unsqueeze(0)) 111 | 112 | preds=torch.cat(preds,0) 113 | return preds 114 | 115 | 116 | 117 | class Beam(object): 118 | def __init__(self, size,sos,eos): 119 | self.size = size 120 | self.tt = torch.cuda 121 | # The score for each translation on the beam. 
122 | self.scores = self.tt.FloatTensor(size).zero_() 123 | # The backpointers at each time-step. 124 | self.prevKs = [] 125 | # The outputs at each time-step. 126 | self.nextYs = [self.tt.LongTensor(size) 127 | .fill_(0)] 128 | self.nextYs[0][0] = sos 129 | # Has EOS topped the beam yet. 130 | self._eos = eos 131 | self.eosTop = False 132 | # Time and k pair for finished. 133 | self.finished = [] 134 | 135 | def getCurrentState(self): 136 | "Get the outputs for the current timestep." 137 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 138 | return batch 139 | 140 | def getCurrentOrigin(self): 141 | "Get the backpointers for the current timestep." 142 | return self.prevKs[-1] 143 | 144 | def advance(self, wordLk): 145 | """ 146 | Given prob over words for every last beam `wordLk` and attention 147 | `attnOut`: Compute and update the beam search. 148 | 149 | Parameters: 150 | 151 | * `wordLk`- probs of advancing from the last step (K x words) 152 | * `attnOut`- attention at the last step 153 | 154 | Returns: True if beam search is complete. 155 | """ 156 | numWords = wordLk.size(1) 157 | 158 | # Sum the previous scores. 159 | if len(self.prevKs) > 0: 160 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 161 | 162 | # Don't let EOS have children. 163 | for i in range(self.nextYs[-1].size(0)): 164 | if self.nextYs[-1][i] == self._eos: 165 | beamLk[i] = -1e20 166 | else: 167 | beamLk = wordLk[0] 168 | flatBeamLk = beamLk.view(-1) 169 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 170 | 171 | self.scores = bestScores 172 | 173 | # bestScoresId is flattened beam x word array, so calculate which 174 | # word and beam each score came from 175 | prevK = bestScoresId // numWords 176 | self.prevKs.append(prevK) 177 | self.nextYs.append((bestScoresId - prevK * numWords)) 178 | 179 | 180 | for i in range(self.nextYs[-1].size(0)): 181 | if self.nextYs[-1][i] == self._eos: 182 | s = self.scores[i] 183 | self.finished.append((s, len(self.nextYs) - 1, i)) 184 | 185 | # End condition is when top-of-beam is EOS and no global score. 186 | if self.nextYs[-1][0] == self._eos: 187 | self.eosTop = True 188 | 189 | def done(self): 190 | return self.eosTop and len(self.finished) >=self.size 191 | 192 | def getFinal(self): 193 | if len(self.finished) == 0: 194 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 195 | self.finished.sort(key=lambda a: -a[0]) 196 | if len(self.finished) != self.size: 197 | unfinished=[] 198 | for i in range(self.nextYs[-1].size(0)): 199 | if self.nextYs[-1][i] != self._eos: 200 | s = self.scores[i] 201 | unfinished.append((s, len(self.nextYs) - 1, i)) 202 | unfinished.sort(key=lambda a: -a[0]) 203 | self.finished+=unfinished[:self.size-len(self.finished)] 204 | return self.finished[:self.size] 205 | 206 | def getHyp(self, beam_res): 207 | """ 208 | Walk back to construct the full hypothesis. 
209 | """ 210 | hyps=[] 211 | for _,timestep, k in beam_res: 212 | hyp = [] 213 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 214 | hyp.append(self.nextYs[j+1][k]) 215 | k = self.prevKs[j][k] 216 | hyps.append(hyp[::-1]) 217 | return hyps 218 | 219 | def buildTargetTokens(self, preds): 220 | sentence=[] 221 | for pred in preds: 222 | tokens = [] 223 | for tok in pred: 224 | if tok==self._eos: 225 | break 226 | tokens.append(tok) 227 | sentence.append(tokens) 228 | return sentence 229 | 230 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/refinement/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/refinement/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /GraphCodeBERT/translation/README.md: -------------------------------------------------------------------------------- 1 | # Code Translation 2 | 3 | ## Task Definition 4 | 5 | Code translation aims to migrate legacy software from one programming language in a platform toanother. 6 | Given a piece of Java (C#) code, the task is to translate the code into C# (Java) version. 7 | Models are evaluated by BLEU scores and accuracy (exactly match). 
8 | 9 | ## Dataset 10 | 11 | The dataset is collected from several public repos, including Lucene (http://lucene.apache.org/), POI (http://poi.apache.org/), JGit (https://github.com/eclipse/jgit/) and Antlr (https://github.com/antlr/). 12 | 13 | We collect both the Java and C# versions of the code and find the parallel functions. After removing duplicates and functions with an empty body, we split the whole dataset into training, validation and test sets. 14 | 15 | ### Data Format 16 | 17 | The dataset is in the "data" folder. Each line of the files is a function, and the suffix of the file indicates the programming language. You can get the data using the following command: 18 | 19 | ``` 20 | unzip data.zip 21 | ``` 22 | 23 | ### Data Statistics 24 | 25 | Data statistics of the dataset are shown in the table below: 26 | 27 | | | #Examples | 28 | | ----- | :-------: | 29 | | Train | 10,300 | 30 | | Valid | 500 | 31 | | Test | 1,000 | 32 | 33 | ## Pipeline-GraphCodeBERT 34 | 35 | ### Dependency 36 | 37 | - pip install torch 38 | - pip install transformers 39 | - pip install tree_sitter 40 | 41 | ### Tree-sitter (optional) 42 | 43 | If the built file "parser/my-languages.so" doesn't work for you, please rebuild it with the following commands: 44 | 45 | ```shell 46 | cd parser 47 | bash build.sh 48 | cd .. 49 | ``` 50 | 51 | ### Fine-tune 52 | We use 4*V100-16G to fine-tune. Taking Java to C# translation as an example: 53 | 54 | ```shell 55 | source=java 56 | target=cs 57 | lr=1e-4 58 | batch_size=32 59 | beam_size=10 60 | source_length=320 61 | target_length=256 62 | output_dir=saved_models/$source-$target/ 63 | train_file=data/train.java-cs.txt.$source,data/train.java-cs.txt.$target 64 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target 65 | epochs=100 66 | pretrained_model=microsoft/graphcodebert-base 67 | 68 | mkdir -p $output_dir 69 | python run.py \ 70 | --do_train \ 71 | --do_eval \ 72 | --model_type roberta \ 73 | --source_lang $source \ 74 | --model_name_or_path $pretrained_model \ 75 | --tokenizer_name microsoft/graphcodebert-base \ 76 | --config_name microsoft/graphcodebert-base \ 77 | --train_filename $train_file \ 78 | --dev_filename $dev_file \ 79 | --output_dir $output_dir \ 80 | --max_source_length $source_length \ 81 | --max_target_length $target_length \ 82 | --beam_size $beam_size \ 83 | --train_batch_size $batch_size \ 84 | --eval_batch_size $batch_size \ 85 | --learning_rate $lr \ 86 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 87 | ``` 88 | 89 | ### Inference 90 | 91 | We use the full test data for inference. 
92 | 93 | ```shell 94 | batch_size=64 95 | dev_file=data/valid.java-cs.txt.$source,data/valid.java-cs.txt.$target 96 | test_file=data/test.java-cs.txt.$source,data/test.java-cs.txt.$target 97 | load_model_path=$output_dir/checkpoint-best-bleu/pytorch_model.bin #checkpoint for test 98 | 99 | python run.py \ 100 | --do_test \ 101 | --model_type roberta \ 102 | --source_lang $source \ 103 | --model_name_or_path $pretrained_model \ 104 | --tokenizer_name microsoft/graphcodebert-base \ 105 | --config_name microsoft/graphcodebert-base \ 106 | --load_model_path $load_model_path \ 107 | --dev_filename $dev_file \ 108 | --test_filename $test_file \ 109 | --output_dir $output_dir \ 110 | --max_source_length $source_length \ 111 | --max_target_length $target_length \ 112 | --beam_size $beam_size \ 113 | --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log 114 | ``` 115 | 116 | 117 | 118 | ## Result 119 | 120 | The results on the test set are shown below: 121 | 122 | Java to C#: 123 | 124 | | Method | BLEU | Acc (100%) | 125 | | -------------- | :-------: | :--------: | 126 | | Naive copy | 18.54 | 0.0 | 127 | | PBSMT | 43.53 | 12.5 | 128 | | Transformer | 55.84 | 33.0 | 129 | | RoBERTa (code) | 77.46 | 56.1 | 130 | | CodeBERT | 79.92 | 59.0 | 131 | | GraphCodeBERT | **80.58** | **59.4** | 132 | 133 | C# to Java: 134 | 135 | | Method | BLEU | Acc (100%) | 136 | | -------------- | :-------: | :--------: | 137 | | Naive copy | 18.69 | 0.0 | 138 | | PBSMT | 40.06 | 16.1 | 139 | | Transformer | 50.47 | 37.9 | 140 | | RoBERTa (code) | 71.99 | 57.9 | 141 | | CodeBERT | 72.14 | 58.0 | 142 | | GraphCodeBERT | **72.64** | **58.8** | 143 | -------------------------------------------------------------------------------- /GraphCodeBERT/translation/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams up to a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | method. 35 | 36 | Returns: 37 | The Counter containing all n-grams up to max_order in segment 38 | with a count of how many times each n-gram occurred. 
39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /GraphCodeBERT/translation/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/translation/data.zip -------------------------------------------------------------------------------- /GraphCodeBERT/translation/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Build Sequence-to-Sequence. 12 | 13 | Parameters: 14 | 15 | * `encoder`- encoder of seq2seq model. e.g. roberta 16 | * `decoder`- decoder of seq2seq model. e.g. transformer 17 | * `config`- configuration of encoder model. 18 | * `beam_size`- beam size for beam search. 19 | * `max_length`- max length of target for beam search. 20 | * `sos_id`- start-of-sequence symbol id in target for beam search. 21 | * `eos_id`- end-of-sequence symbol id in target for beam search. 22 | """ 23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None): 24 | super(Seq2Seq, self).__init__() 25 | self.encoder = encoder 26 | self.decoder=decoder 27 | self.config=config 28 | self.register_buffer("bias", torch.tril(torch.ones(2048, 2048))) 29 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 31 | self.lsm = nn.LogSoftmax(dim=-1) 32 | self.tie_weights() 33 | 34 | self.beam_size=beam_size 35 | self.max_length=max_length 36 | self.sos_id=sos_id 37 | self.eos_id=eos_id 38 | 39 | def _tie_or_clone_weights(self, first_module, second_module): 40 | """ Tie or clone module weights depending on whether we are using TorchScript or not 41 | """ 42 | if self.config.torchscript: 43 | first_module.weight = nn.Parameter(second_module.weight.clone()) 44 | else: 45 | first_module.weight = second_module.weight 46 | 47 | def tie_weights(self): 48 | """ Make sure we are sharing the input and output embeddings. 49 | Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
50 | """ 51 | self._tie_or_clone_weights(self.lm_head, 52 | self.encoder.embeddings.word_embeddings) 53 | 54 | def forward(self, source_ids,source_mask,position_idx,attn_mask,target_ids=None,target_mask=None,args=None): 55 | #embedding 56 | nodes_mask=position_idx.eq(0) 57 | token_mask=position_idx.ge(2) 58 | inputs_embeddings=self.encoder.embeddings.word_embeddings(source_ids) 59 | nodes_to_token_mask=nodes_mask[:,:,None]&token_mask[:,None,:]&attn_mask 60 | nodes_to_token_mask=nodes_to_token_mask/(nodes_to_token_mask.sum(-1)+1e-10)[:,:,None] 61 | avg_embeddings=torch.einsum("abc,acd->abd",nodes_to_token_mask,inputs_embeddings) 62 | inputs_embeddings=inputs_embeddings*(~nodes_mask)[:,:,None]+avg_embeddings*nodes_mask[:,:,None] 63 | 64 | outputs = self.encoder(inputs_embeds=inputs_embeddings,attention_mask=attn_mask,position_ids=position_idx) 65 | encoder_output = outputs[0].permute([1,0,2]).contiguous() 66 | #source_mask=token_mask.float() 67 | if target_ids is not None: 68 | attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]]) 69 | tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() 70 | out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool()) 71 | hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous() 72 | lm_logits = self.lm_head(hidden_states) 73 | # Shift so that tokens < n predict n 74 | active_loss = target_mask[..., 1:].ne(0).view(-1) == 1 75 | shift_logits = lm_logits[..., :-1, :].contiguous() 76 | shift_labels = target_ids[..., 1:].contiguous() 77 | # Flatten the tokens 78 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 79 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 80 | shift_labels.view(-1)[active_loss]) 81 | 82 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 83 | return outputs 84 | else: 85 | #Predict 86 | preds=[] 87 | zero=torch.cuda.LongTensor(1).fill_(0) 88 | for i in range(source_ids.shape[0]): 89 | context=encoder_output[:,i:i+1] 90 | context_mask=source_mask[i:i+1,:] 91 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 92 | input_ids=beam.getCurrentState() 93 | context=context.repeat(1, self.beam_size,1) 94 | context_mask=context_mask.repeat(self.beam_size,1) 95 | for _ in range(self.max_length): 96 | if beam.done(): 97 | break 98 | attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]]) 99 | tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous() 100 | out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool()) 101 | out = torch.tanh(self.dense(out)) 102 | hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:] 103 | out = self.lsm(self.lm_head(hidden_states)).data 104 | beam.advance(out) 105 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 106 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 107 | hyp= beam.getHyp(beam.getFinal()) 108 | pred=beam.buildTargetTokens(hyp)[:self.beam_size] 109 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 110 | preds.append(torch.cat(pred,0).unsqueeze(0)) 111 | 112 | preds=torch.cat(preds,0) 113 | return preds 114 | 115 | 116 | 117 | class Beam(object): 118 | def __init__(self, size,sos,eos): 119 | self.size = size 120 | self.tt = torch.cuda 121 | # The score for each translation on the beam. 
122 | self.scores = self.tt.FloatTensor(size).zero_() 123 | # The backpointers at each time-step. 124 | self.prevKs = [] 125 | # The outputs at each time-step. 126 | self.nextYs = [self.tt.LongTensor(size) 127 | .fill_(0)] 128 | self.nextYs[0][0] = sos 129 | # Has EOS topped the beam yet. 130 | self._eos = eos 131 | self.eosTop = False 132 | # Time and k pair for finished. 133 | self.finished = [] 134 | 135 | def getCurrentState(self): 136 | "Get the outputs for the current timestep." 137 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 138 | return batch 139 | 140 | def getCurrentOrigin(self): 141 | "Get the backpointers for the current timestep." 142 | return self.prevKs[-1] 143 | 144 | def advance(self, wordLk): 145 | """ 146 | Given prob over words for every last beam `wordLk` and attention 147 | `attnOut`: Compute and update the beam search. 148 | 149 | Parameters: 150 | 151 | * `wordLk`- probs of advancing from the last step (K x words) 152 | * `attnOut`- attention at the last step 153 | 154 | Returns: True if beam search is complete. 155 | """ 156 | numWords = wordLk.size(1) 157 | 158 | # Sum the previous scores. 159 | if len(self.prevKs) > 0: 160 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 161 | 162 | # Don't let EOS have children. 163 | for i in range(self.nextYs[-1].size(0)): 164 | if self.nextYs[-1][i] == self._eos: 165 | beamLk[i] = -1e20 166 | else: 167 | beamLk = wordLk[0] 168 | flatBeamLk = beamLk.view(-1) 169 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 170 | 171 | self.scores = bestScores 172 | 173 | # bestScoresId is flattened beam x word array, so calculate which 174 | # word and beam each score came from 175 | prevK = bestScoresId // numWords 176 | self.prevKs.append(prevK) 177 | self.nextYs.append((bestScoresId - prevK * numWords)) 178 | 179 | 180 | for i in range(self.nextYs[-1].size(0)): 181 | if self.nextYs[-1][i] == self._eos: 182 | s = self.scores[i] 183 | self.finished.append((s, len(self.nextYs) - 1, i)) 184 | 185 | # End condition is when top-of-beam is EOS and no global score. 186 | if self.nextYs[-1][0] == self._eos: 187 | self.eosTop = True 188 | 189 | def done(self): 190 | return self.eosTop and len(self.finished) >=self.size 191 | 192 | def getFinal(self): 193 | if len(self.finished) == 0: 194 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 195 | self.finished.sort(key=lambda a: -a[0]) 196 | if len(self.finished) != self.size: 197 | unfinished=[] 198 | for i in range(self.nextYs[-1].size(0)): 199 | if self.nextYs[-1][i] != self._eos: 200 | s = self.scores[i] 201 | unfinished.append((s, len(self.nextYs) - 1, i)) 202 | unfinished.sort(key=lambda a: -a[0]) 203 | self.finished+=unfinished[:self.size-len(self.finished)] 204 | return self.finished[:self.size] 205 | 206 | def getHyp(self, beam_res): 207 | """ 208 | Walk back to construct the full hypothesis. 
209 | """ 210 | hyps=[] 211 | for _,timestep, k in beam_res: 212 | hyp = [] 213 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 214 | hyp.append(self.nextYs[j+1][k]) 215 | k = self.prevKs[j][k] 216 | hyps.append(hyp[::-1]) 217 | return hyps 218 | 219 | def buildTargetTokens(self, preds): 220 | sentence=[] 221 | for pred in preds: 222 | tokens = [] 223 | for tok in pred: 224 | if tok==self._eos: 225 | break 226 | tokens.append(tok) 227 | sentence.append(tokens) 228 | return sentence 229 | 230 | -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | index_to_code_token, 4 | tree_to_variable_index) 5 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/GraphCodeBERT/translation/parser/my-languages.so -------------------------------------------------------------------------------- /GraphCodeBERT/translation/parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | return '\n'.join(temp) 44 | elif lang in ['ruby']: 45 | return source 46 | else: 47 | def replacer(match): 48 | s = match.group(0) 49 | if s.startswith('/'): 50 | return " " # note: a space and not an empty string 51 | else: 52 | return s 53 | pattern = re.compile( 54 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 55 | re.DOTALL | re.MULTILINE 56 | ) 57 | temp=[] 58 | for x in re.sub(pattern, replacer, source).split('\n'): 59 | if x.strip()!="": 60 | temp.append(x) 61 | return '\n'.join(temp) 62 | 63 | def tree_to_token_index(root_node): 64 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 65 | return [(root_node.start_point,root_node.end_point)] 66 | else: 67 | code_tokens=[] 68 | for child in root_node.children: 69 | code_tokens+=tree_to_token_index(child) 70 | return code_tokens 71 | 72 | def tree_to_variable_index(root_node,index_to_code): 73 | if (len(root_node.children)==0 or root_node.type=='string') and root_node.type!='comment': 74 | index=(root_node.start_point,root_node.end_point) 75 | _,code=index_to_code[index] 76 | if root_node.type!=code: 77 | return [(root_node.start_point,root_node.end_point)] 78 | else: 79 | return [] 80 | else: 81 | code_tokens=[] 82 | for child in root_node.children: 83 | code_tokens+=tree_to_variable_index(child,index_to_code) 84 | return code_tokens 85 | 86 | def index_to_code_token(index,code): 87 | start_point=index[0] 88 | end_point=index[1] 89 | if start_point[0]==end_point[0]: 90 | s=code[start_point[0]][start_point[1]:end_point[1]] 91 | else: 92 | s="" 93 | s+=code[start_point[0]][start_point[1]:] 94 | for i in range(start_point[0]+1,end_point[0]): 95 | s+=code[i] 96 | s+=code[end_point[0]][:end_point[1]] 97 | return s 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 
2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeBERT 2 | This repo provides the code for reproducing the experiments in [CodeBERT: A Pre-Trained Model for Programming and Natural Languages](https://arxiv.org/pdf/2002.08155.pdf). CodeBERT is a pre-trained model for programming languages: a multi-programming-lingual model pre-trained on NL-PL pairs in six programming languages (Python, Java, JavaScript, PHP, Ruby, Go). 3 | 4 | ### Dependency 5 | 6 | - pip install torch 7 | - pip install transformers 8 | 9 | ### Quick Tour 10 | We use the huggingface/transformers framework to train the model. You can use our model just like the pre-trained RoBERTa base model. Here is an example of how to load the model. 11 | ```python 12 | import torch 13 | from transformers import RobertaTokenizer, RobertaConfig, RobertaModel 14 | 15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base") 17 | model = RobertaModel.from_pretrained("microsoft/codebert-base") 18 | model.to(device) 19 | ``` 20 | 21 | ### NL-PL Embeddings 22 | 23 | Here, we give an example of obtaining NL-PL embeddings from CodeBERT. 
24 | 25 | ```python 26 | >>> from transformers import AutoTokenizer, AutoModel 27 | >>> import torch 28 | >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base") 29 | >>> model = AutoModel.from_pretrained("microsoft/codebert-base") 30 | >>> nl_tokens=tokenizer.tokenize("return maximum value") 31 | ['return', 'Ġmaximum', 'Ġvalue'] 32 | >>> code_tokens=tokenizer.tokenize("def max(a,b): if a>b: return a else return b") 33 | ['def', 'Ġmax', '(', 'a', ',', 'b', '):', 'Ġif', 'Ġa', '>', 'b', ':', 'Ġreturn', 'Ġa', 'Ġelse', 'Ġreturn', 'Ġb'] 34 | >>> tokens=[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]+code_tokens+[tokenizer.sep_token] 35 | ['<s>', 'return', 'Ġmaximum', 'Ġvalue', '</s>', 'def', 'Ġmax', '(', 'a', ',', 'b', '):', 'Ġif', 'Ġa', '>', 'b', ':', 'Ġreturn', 'Ġa', 'Ġelse', 'Ġreturn', 'Ġb', '</s>'] 36 | >>> tokens_ids=tokenizer.convert_tokens_to_ids(tokens) 37 | [0, 30921, 4532, 923, 2, 9232, 19220, 1640, 102, 6, 428, 3256, 114, 10, 15698, 428, 35, 671, 10, 1493, 671, 741, 2] 38 | >>> context_embeddings=model(torch.tensor(tokens_ids)[None,:])[0] 39 | torch.Size([1, 23, 768]) 40 | tensor([[-0.1423, 0.3766, 0.0443, ..., -0.2513, -0.3099, 0.3183], 41 | [-0.5739, 0.1333, 0.2314, ..., -0.1240, -0.1219, 0.2033], 42 | [-0.1579, 0.1335, 0.0291, ..., 0.2340, -0.8801, 0.6216], 43 | ..., 44 | [-0.4042, 0.2284, 0.5241, ..., -0.2046, -0.2419, 0.7031], 45 | [-0.3894, 0.4603, 0.4797, ..., -0.3335, -0.6049, 0.4730], 46 | [-0.1433, 0.3785, 0.0450, ..., -0.2527, -0.3121, 0.3207]], 47 | grad_fn=<SelectBackward>) 48 | ``` 49 | 50 | 51 | ### Probing 52 | 53 | As stated in the paper, CodeBERT is not suitable for the mask prediction task, while CodeBERT (MLM) is. 54 | 55 | 56 | We give an example of how to use CodeBERT (MLM) for the mask prediction task. 57 | ```python 58 | from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline 59 | 60 | model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm") 61 | tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") 62 | 63 | CODE = "if (x is not None) <mask> (x>1)" 64 | fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer) 65 | 66 | outputs = fill_mask(CODE) 67 | print(outputs) 68 | 69 | ``` 70 | Results 71 | ```python 72 | 'and', 'or', 'if', 'then', 'AND' 73 | ``` 74 | The detailed outputs are as follows: 75 | ```python 76 | {'sequence': ' if (x is not None) and (x>1)', 'score': 0.6049249172210693, 'token': 8} 77 | {'sequence': ' if (x is not None) or (x>1)', 'score': 0.30680200457572937, 'token': 50} 78 | {'sequence': ' if (x is not None) if (x>1)', 'score': 0.02133703976869583, 'token': 114} 79 | {'sequence': ' if (x is not None) then (x>1)', 'score': 0.018607674166560173, 'token': 172} 80 | {'sequence': ' if (x is not None) AND (x>1)', 'score': 0.007619690150022507, 'token': 4248} 81 | ``` 82 | 83 | ### Downstream Tasks 84 | 85 | For Code Search and Code Documentation Generation tasks, please refer to the [CodeBERT](https://github.com/guoday/CodeBERT/tree/master/CodeBERT) folder. 86 | 87 | 88 | 89 | # GraphCodeBERT 90 | 91 | This repo also provides the code for reproducing the experiments in [GraphCodeBERT: Pre-training Code Representations with Data Flow](https://openreview.net/pdf?id=jLoC4ez43PZ). GraphCodeBERT is a pre-trained model for programming languages that considers the inherent structure of code, i.e. data flow. It is a multi-programming-lingual model pre-trained on NL-PL pairs in six programming languages (Python, Java, JavaScript, PHP, Ruby, Go). 
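Since GraphCodeBERT shares the RoBERTa architecture, the checkpoint can be loaded with the same recipe as CodeBERT above. A minimal sketch (`microsoft/graphcodebert-base` is the checkpoint name used by the fine-tuning scripts in this repo; note that the full GraphCodeBERT models in the downstream tasks additionally feed in data-flow positions and masks, which the task-specific `run.py` scripts construct):

```python
import torch
from transformers import RobertaTokenizer, RobertaModel

# Load the GraphCodeBERT checkpoint with the standard RoBERTa classes,
# as the downstream run.py scripts in this repo do.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")
model.to(device)
```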
92 | 93 | For downstream tasks like code search, clone detection, code refinement, and code translation, please refer to the [GraphCodeBERT](https://github.com/guoday/CodeBERT/tree/master/GraphCodeBERT) folder. 94 | 95 | # UniXcoder 96 | 97 | This repo also provides the code for reproducing the experiments in [UniXcoder: Unified Cross-Modal Pre-training for Code Representation](https://arxiv.org/pdf/2203.03850.pdf). UniXcoder is a unified cross-modal pre-trained model for programming languages that supports both code-related understanding and generation tasks. 98 | 99 | Please refer to the [UniXcoder](https://github.com/guoday/CodeBERT/tree/master/UniXcoder) folder for tutorials and downstream tasks. 100 | 101 | ## Contact 102 | 103 | Feel free to contact Daya Guo (guody5@mail2.sysu.edu.cn), Duyu Tang (dutang@microsoft.com), Shuai Lu (v-shuailu@microsoft.com) and Nan Duan (nanduan@microsoft.com) if you have any further questions. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly.
30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /UniXcoder/README.md: -------------------------------------------------------------------------------- 1 | # UniXcoder 2 | 3 | This repo provides the code for reproducing the experiments in [UniXcoder: Unified Cross-Modal Pre-training for Code Representation](https://arxiv.org/pdf/2203.03850.pdf). UniXcoder is a unified cross-modal pre-trained model for programming languages that supports both code-related understanding and generation tasks. 4 | 5 | Here, we provide three types of UniXcoder: 6 | 7 | [unixcoder-base-unimodal](https://huggingface.co/microsoft/unixcoder-base-unimodal): Pre-trained on the C4 and CodeSearchNet datasets (without NL). 8 | 9 | [unixcoder-base](https://huggingface.co/microsoft/unixcoder-base): Continued pre-training of ```unixcoder-base-unimodal``` on NL-PL pairs from the CodeSearchNet dataset. The model supports six languages: **java, ruby, python, php, javascript, and go**. This is the model reported in the paper. 10 | 11 | [unixcoder-base-nine](https://huggingface.co/microsoft/unixcoder-base-nine): Continued pre-training of ```unixcoder-base-unimodal``` on NL-PL pairs from the CodeSearchNet dataset plus an additional 1.5M NL-PL pairs in the C, C++ and C# programming languages. The model supports nine languages: **java, ruby, python, php, javascript, go, c, c++ and c#**. 12 | 13 | ## 1. Dependency 14 | 15 | - pip install torch 16 | - pip install transformers 17 | 18 | ## 2. Quick Tour 19 | We implement a ```UniXcoder``` class; you can follow the code below to load the model. 20 | ```python 21 | import torch 22 | from unixcoder import UniXcoder 23 | 24 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 25 | model = UniXcoder("microsoft/unixcoder-base") 26 | model.to(device) 27 | ``` 28 | 29 | In the following, we give zero-shot examples for several tasks under different modes, including **code search (encoder-only)**, **code completion (decoder-only)**, **function name prediction (encoder-decoder)**, **API recommendation (encoder-decoder)**, and **code summarization (encoder-decoder)**. 30 | 31 | ## 3. Encoder-only Mode 32 | 33 | For encoder-only mode, we give an example of **code search**. 34 | 35 | ### 1) Code and NL Embeddings 36 | 37 | Here, we give an example to obtain a code fragment embedding from UniXcoder.
38 | 39 | ```python 40 | # Encode maximum function 41 | func = "def f(a,b): if a>b: return a else return b" 42 | tokens_ids = model.tokenize([func],max_length=512,mode="<encoder-only>") 43 | source_ids = torch.tensor(tokens_ids).to(device) 44 | tokens_embeddings,max_func_embedding = model(source_ids) 45 | 46 | # Encode minimum function 47 | func = "def f(a,b): if a<b: return a else return b" 48 | tokens_ids = model.tokenize([func],max_length=512,mode="<encoder-only>") 49 | source_ids = torch.tensor(tokens_ids).to(device) 50 | tokens_embeddings,min_func_embedding = model(source_ids) 51 | 52 | # Encode NL 53 | nl = "return maximum value" 54 | tokens_ids = model.tokenize([nl],max_length=512,mode="<encoder-only>") 55 | source_ids = torch.tensor(tokens_ids).to(device) 56 | tokens_embeddings,nl_embedding = model(source_ids) 57 | 58 | print(max_func_embedding.shape) 59 | print(max_func_embedding) 60 | ``` 61 | 62 | ```python 63 | torch.Size([1, 768]) 64 | tensor([[ 8.6533e-01, -1.9796e+00, -8.6849e-01, 4.2652e-01, -5.3696e-01, 65 | -1.5521e-01, 5.3770e-01, 3.4199e-01, 3.6305e-01, -3.9391e-01, 66 | -1.1816e+00, 2.6010e+00, -7.7133e-01, 1.8441e+00, 2.3645e+00, 67 | ..., 68 | -2.9188e+00, 1.2555e+00, -1.9953e+00, -1.9795e+00, 1.7279e+00, 69 | 6.4590e-01, -5.2769e-02, 2.4965e-01, 2.3962e-02, 5.9996e-02, 70 | 2.5659e+00, 3.6533e+00, 2.0301e+00]], device='cuda:0', 71 | grad_fn=<DivBackward0>) 72 | ``` 73 | 74 | ### 2) Similarity between code and NL 75 | 76 | Now, we calculate the cosine similarity between the NL query and the two functions. Although the two functions differ only in one operator (```<``` vs. ```>```), UniXcoder can distinguish them. 77 | 78 | ```python 79 | # Normalize embedding 80 | norm_max_func_embedding = torch.nn.functional.normalize(max_func_embedding, p=2, dim=1) 81 | norm_min_func_embedding = torch.nn.functional.normalize(min_func_embedding, p=2, dim=1) 82 | norm_nl_embedding = torch.nn.functional.normalize(nl_embedding, p=2, dim=1) 83 | 84 | max_func_nl_similarity = torch.einsum("ac,bc->ab",norm_max_func_embedding,norm_nl_embedding) 85 | min_func_nl_similarity = torch.einsum("ac,bc->ab",norm_min_func_embedding,norm_nl_embedding) 86 | 87 | print(max_func_nl_similarity) 88 | print(min_func_nl_similarity) 89 | ``` 90 | 91 | ```python 92 | tensor([[0.3002]], device='cuda:0', grad_fn=<MmBackward>) 93 | tensor([[0.1881]], device='cuda:0', grad_fn=<MmBackward>) 94 | ``` 95 | 96 | ## 4. Decoder-only Mode 97 | 98 | For decoder-only mode, we give an example of **code completion**. 99 | 100 | ```python 101 | context = """ 102 | def f(data,file_path): 103 | # write json data into file_path in python language 104 | """ 105 | tokens_ids = model.tokenize([context],max_length=512,mode="<decoder-only>") 106 | source_ids = torch.tensor(tokens_ids).to(device) 107 | prediction_ids = model.generate(source_ids, decoder_only=True, beam_size=3, max_length=128) 108 | predictions = model.decode(prediction_ids) 109 | print(context+predictions[0][0]) 110 | ``` 111 | 112 | ```python 113 | def f(data,file_path): 114 | # write json data into file_path in python language 115 | data = json.dumps(data) 116 | with open(file_path, 'w') as f: 117 | f.write(data) 118 | ``` 119 | 120 | ## 5. Encoder-Decoder Mode 121 | 122 | For encoder-decoder mode, we give three examples: **function name prediction**, **API recommendation**, and **code summarization**. All three follow the same skeleton, sketched below.
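The skeleton is extracted from the three examples that follow: mark the span to generate with ```<mask0>```, tokenize with mode ```<encoder-decoder>```, and decode the beam-search output. Only the ```context``` string changes from task to task:

```python
# context contains exactly one <mask0> span to be generated
tokens_ids = model.tokenize([context], max_length=512, mode="<encoder-decoder>")
source_ids = torch.tensor(tokens_ids).to(device)
prediction_ids = model.generate(source_ids, decoder_only=False, beam_size=3, max_length=128)
predictions = model.decode(prediction_ids)  # beam_size candidates per input
candidates = [x.replace("<mask0>", "").strip() for x in predictions[0]]
```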
123 | 124 | ### 1) **Function Name Prediction** 125 | 126 | ```python 127 | context = """ 128 | def <mask0>(data,file_path): 129 | data = json.dumps(data) 130 | with open(file_path, 'w') as f: 131 | f.write(data) 132 | """ 133 | tokens_ids = model.tokenize([context],max_length=512,mode="<encoder-decoder>") 134 | source_ids = torch.tensor(tokens_ids).to(device) 135 | prediction_ids = model.generate(source_ids, decoder_only=False, beam_size=3, max_length=128) 136 | predictions = model.decode(prediction_ids) 137 | print([x.replace("<mask0>","").strip() for x in predictions[0]]) 138 | ``` 139 | 140 | ```python 141 | ['write_json', 'write_file', 'to_json'] 142 | ``` 143 | 144 | ### 2) API Recommendation 145 | 146 | ```python 147 | context = """ 148 | def write_json(data,file_path): 149 | data = <mask0>(data) 150 | with open(file_path, 'w') as f: 151 | f.write(data) 152 | """ 153 | tokens_ids = model.tokenize([context],max_length=512,mode="<encoder-decoder>") 154 | source_ids = torch.tensor(tokens_ids).to(device) 155 | prediction_ids = model.generate(source_ids, decoder_only=False, beam_size=3, max_length=128) 156 | predictions = model.decode(prediction_ids) 157 | print([x.replace("<mask0>","").strip() for x in predictions[0]]) 158 | ``` 159 | 160 | ```python 161 | ['json.dumps', 'json.loads', 'str'] 162 | ``` 163 | 164 | ### 3) Code Summarization 165 | 166 | ```python 167 | context = """ 168 | # <mask0> 169 | def write_json(data,file_path): 170 | data = json.dumps(data) 171 | with open(file_path, 'w') as f: 172 | f.write(data) 173 | """ 174 | tokens_ids = model.tokenize([context],max_length=512,mode="<encoder-decoder>") 175 | source_ids = torch.tensor(tokens_ids).to(device) 176 | prediction_ids = model.generate(source_ids, decoder_only=False, beam_size=3, max_length=128) 177 | predictions = model.decode(prediction_ids) 178 | print([x.replace("<mask0>","").strip() for x in predictions[0]]) 179 | ``` 180 | 181 | ```python 182 | ['Write JSON to file', 'Write json to file', 'Write a json file'] 183 | ``` 184 | 185 | ## 6. Fine-tuning 186 | 187 | For downstream tasks reported in the paper, please refer to the [downstream-tasks](https://github.com/guoday/CodeBERT/tree/master/UniXcoder/downstream-tasks) folder. 188 | 189 | 190 | 191 | # Reference 192 | If you use this code or CodeXGLUE, please consider citing us. 193 | 194 |
<pre><code>@article{guo2022unixcoder,
195 |   title={UniXcoder: Unified Cross-Modal Pre-training for Code Representation},
196 |   author={Guo, Daya and Lu, Shuai and Duan, Nan and Wang, Yanlin and Zhou, Ming and Yin, Jian},
197 |   journal={arXiv preprint arXiv:2203.03850},
198 |   year={2022}
199 | }</code></pre>
200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection (BigCloneBench) 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | mkdir dataset 7 | cd dataset 8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/data.jsonl 9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/test.txt 10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/train.txt 11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/valid.txt 12 | cd .. 13 | 14 | ``` 15 | 16 | ## Dependency 17 | 18 | - pip install torch 19 | - pip install transformers 20 | 21 | ## Fine-Tune 22 | 23 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper. 24 | 25 | ```shell 26 | # Training 27 | python run.py \ 28 | --output_dir saved_models \ 29 | --model_name_or_path microsoft/unixcoder-base \ 30 | --do_train \ 31 | --train_data_file dataset/train.txt \ 32 | --eval_data_file dataset/valid.txt \ 33 | --num_train_epochs 1 \ 34 | --block_size 512 \ 35 | --train_batch_size 16 \ 36 | --eval_batch_size 32 \ 37 | --learning_rate 5e-5 \ 38 | --max_grad_norm 1.0 \ 39 | --seed 123456 40 | 41 | # Evaluating 42 | python run.py \ 43 | --output_dir saved_models \ 44 | --model_name_or_path microsoft/unixcoder-base \ 45 | --do_test \ 46 | --test_data_file dataset/test.txt \ 47 | --num_train_epochs 1 \ 48 | --block_size 512 \ 49 | --train_batch_size 16 \ 50 | --eval_batch_size 32 \ 51 | --learning_rate 5e-5 \ 52 | --max_grad_norm 1.0 \ 53 | --seed 123456 54 | ``` 55 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
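# The Model below scores a candidate clone pair: both code fragments are
# encoded, non-padding token states are mean-pooled (the pad token id is 1
# in the RoBERTa vocabulary), the two embeddings are L2-normalized, and
# their cosine similarity is regressed toward the 0/1 clone label with an
# MSE loss.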
3 | import torch 4 | import torch.nn as nn 5 | import torch 6 | from torch.autograd import Variable 7 | import copy 8 | import torch.nn.functional as F 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | 11 | class RobertaClassificationHead(nn.Module): 12 | """Head for sentence-level classification tasks.""" 13 | 14 | def __init__(self, config): 15 | super().__init__() 16 | self.dense = nn.Linear(config.hidden_size*2, config.hidden_size) 17 | self.dropout = nn.Dropout(0.1) 18 | self.out_proj = nn.Linear(config.hidden_size, 2) 19 | 20 | def forward(self, x): 21 | x = x.reshape(-1,x.size(-1)*2) 22 | x = self.dropout(x) 23 | x = self.dense(x) 24 | x = torch.tanh(x) 25 | x = self.dropout(x) 26 | x = self.out_proj(x) 27 | return x 28 | 29 | class Model(nn.Module): 30 | def __init__(self, encoder,config,tokenizer,args): 31 | super(Model, self).__init__() 32 | self.encoder = encoder 33 | self.config = config 34 | self.tokenizer = tokenizer 35 | self.classifier = RobertaClassificationHead(config) 36 | self.args = args 37 | 38 | 39 | def forward(self, input_ids=None,labels=None): 40 | input_ids = input_ids.view(-1,self.args.block_size) 41 | outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0] 42 | outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None] 43 | outputs = outputs.reshape(-1,2,outputs.size(-1)) 44 | outputs = torch.nn.functional.normalize(outputs, p=2, dim=-1) 45 | cos_sim = (outputs[:,0]*outputs[:,1]).sum(-1) 46 | 47 | if labels is not None: 48 | loss = ((cos_sim-labels.float())**2).mean() 49 | return loss,cos_sim 50 | else: 51 | return cos_sim 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/BCB/run.sh: -------------------------------------------------------------------------------- 1 | model=../../../../pretrained-model/UniXcoder-base 2 | mkdir saved_models 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \ 4 | --output_dir=./saved_models \ 5 | --model_type=roberta \ 6 | --model_name_or_path=$model \ 7 | --do_train \ 8 | --train_data_file=../../dataset/train.txt \ 9 | --eval_data_file=../../dataset/valid.txt \ 10 | --test_data_file=../../dataset/test.txt \ 11 | --epoch 1 \ 12 | --block_size 512 \ 13 | --train_batch_size 16 \ 14 | --eval_batch_size 32 \ 15 | --learning_rate 5e-5 \ 16 | --max_grad_norm 1.0 \ 17 | --evaluate_during_training \ 18 | --seed 123456 2>&1| tee saved_models/train.log 19 | 20 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \ 21 | --output_dir=./saved_models \ 22 | --model_type=roberta \ 23 | --model_name_or_path=$model \ 24 | --do_eval \ 25 | --do_test \ 26 | --train_data_file=../../dataset/train.txt \ 27 | --eval_data_file=../../dataset/valid.txt \ 28 | --test_data_file=../../dataset/test.txt \ 29 | --epoch 1 \ 30 | --block_size 512 \ 31 | --train_batch_size 16 \ 32 | --eval_batch_size 32 \ 33 | --learning_rate 5e-5 \ 34 | --max_grad_norm 1.0 \ 35 | --evaluate_during_training \ 36 | --seed 123456 2>&1| tee saved_models/test.log 37 | 38 | python ../evaluator/evaluator.py -a ../../dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log 39 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/README.md: -------------------------------------------------------------------------------- 1 | # Clone Detection (POJ-104) 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | cd dataset 7 | pip install gdown 8 | gdown 
https://drive.google.com/uc?id=0B2i-vWnOu7MxVlJwQXN6eVNONUU 9 | tar -xvf programs.tar.gz 10 | python preprocess.py 11 | cd .. 12 | ``` 13 | 14 | ## Dependency 15 | 16 | - pip install torch 17 | - pip install transformers 18 | 19 | ## Fine-Tune 20 | 21 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper. 22 | 23 | ```shell 24 | # Training 25 | python run.py \ 26 | --output_dir saved_models \ 27 | --model_name_or_path microsoft/unixcoder-base \ 28 | --do_train \ 29 | --train_data_file dataset/train.jsonl \ 30 | --eval_data_file dataset/valid.jsonl \ 31 | --test_data_file dataset/test.jsonl \ 32 | --num_train_epochs 2 \ 33 | --block_size 400 \ 34 | --train_batch_size 8 \ 35 | --eval_batch_size 16 \ 36 | --learning_rate 2e-5 \ 37 | --max_grad_norm 1.0 \ 38 | --seed 123456 39 | 40 | # Evaluating 41 | python run.py \ 42 | --output_dir saved_models \ 43 | --model_name_or_path microsoft/unixcoder-base \ 44 | --do_eval \ 45 | --do_test \ 46 | --eval_data_file dataset/valid.jsonl \ 47 | --test_data_file dataset/test.jsonl \ 48 | --num_train_epochs 2 \ 49 | --block_size 400 \ 50 | --train_batch_size 8 \ 51 | --eval_batch_size 16 \ 52 | --learning_rate 2e-5 \ 53 | --max_grad_norm 1.0 \ 54 | --seed 123456 55 | ``` 56 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | def files(path): 7 | g = os.walk(path) 8 | file=[] 9 | for path,dir_list,file_list in g: 10 | for file_name in file_list: 11 | file.append(os.path.join(path, file_name)) 12 | return file 13 | 14 | cont=0 15 | with open("train.jsonl",'w') as f: 16 | for i in tqdm(range(1,65),total=64): 17 | items=files("ProgramData/{}".format(i)) 18 | for item in items: 19 | js={} 20 | js['label']=item.split('/')[1] 21 | js['index']=str(cont) 22 | js['code']=open(item,encoding='latin-1').read() 23 | f.write(json.dumps(js)+'\n') 24 | cont+=1 25 | 26 | with open("valid.jsonl",'w') as f: 27 | for i in tqdm(range(65,81),total=16): 28 | items=files("ProgramData/{}".format(i)) 29 | for item in items: 30 | js={} 31 | js['label']=item.split('/')[1] 32 | js['index']=str(cont) 33 | js['code']=open(item,encoding='latin-1').read() 34 | f.write(json.dumps(js)+'\n') 35 | cont+=1 36 | 37 | with open("test.jsonl",'w') as f: 38 | for i in tqdm(range(81,105),total=24): 39 | items=files("ProgramData/{}".format(i)) 40 | for item in items: 41 | js={} 42 | js['label']=item.split('/')[1] 43 | js['index']=str(cont) 44 | js['code']=open(item,encoding='latin-1').read() 45 | f.write(json.dumps(js)+'\n') 46 | cont+=1 -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/clone-detection/POJ-104/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License.
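# The Model below is trained with an InfoNCE-style contrastive loss: each
# anchor program is encoded together with a positive (same problem) and a
# negative example, embeddings are mean-pooled over non-padding tokens
# (pad token id 1) and L2-normalized, and cosine similarities scaled by 20
# (i.e. a temperature of 0.05) are contrasted against in-batch examples
# whose labels differ from the anchor.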
3 | import torch 4 | import torch.nn as nn 5 | import torch 6 | from torch.autograd import Variable 7 | import copy 8 | import torch.nn.functional as F 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | 11 | 12 | 13 | class Model(nn.Module): 14 | def __init__(self, encoder,config,tokenizer,args): 15 | super(Model, self).__init__() 16 | self.encoder = encoder 17 | self.config=config 18 | self.tokenizer=tokenizer 19 | self.args=args 20 | 21 | 22 | def forward(self, input_ids=None,p_input_ids=None,n_input_ids=None,labels=None): 23 | bs,_ = input_ids.size() 24 | input_ids = torch.cat((input_ids,p_input_ids,n_input_ids),0) 25 | 26 | outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0] 27 | outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None] 28 | outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) 29 | outputs = outputs.split(bs,0) 30 | 31 | prob_1 = (outputs[0]*outputs[1]).sum(-1)*20 32 | prob_2 = (outputs[0]*outputs[2]).sum(-1)*20 33 | temp = torch.cat((outputs[0],outputs[1]),0) 34 | temp_labels = torch.cat((labels,labels),0) 35 | prob_3 = torch.mm(outputs[0],temp.t())*20 36 | mask = labels[:,None]==temp_labels[None,:] 37 | prob_3 = prob_3*(1-mask.float())-1e9*mask.float() 38 | 39 | prob = torch.softmax(torch.cat((prob_1[:,None],prob_2[:,None],prob_3),-1),-1) 40 | loss = torch.log(prob[:,0]+1e-10) 41 | loss = -loss.mean() 42 | return loss,outputs[0] 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/README.md: -------------------------------------------------------------------------------- 1 | # Code Completion 2 | 3 | ## Dependency 4 | 5 | - pip install torch 6 | - pip install transformers 7 | - pip install javalang 8 | 9 | ## Data Download 10 | 11 | ```bash 12 | unzip dataset.zip 13 | 14 | cd dataset/javaCorpus/ 15 | bash download.sh 16 | python preprocess.py --base_dir=token_completion --output_dir=./ 17 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/javaCorpus/line_completion/test.json 18 | 19 | cd ../py150 20 | bash download.sh 21 | python preprocess.py --base_dir=py150_files --output_dir=./ 22 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/py150/line_completion/test.json 23 | 24 | cd ../.. 25 | ``` 26 | 27 | 28 | 29 | ## Fine-Tune Setting 30 | 31 | Here we provide fine-tune settings for code completion, whose results are reported in the paper. 
32 | 33 | #### JavaCorpus Dataset 34 | 35 | ```shell 36 | # Training 37 | python run.py \ 38 | --do_train \ 39 | --do_eval \ 40 | --lang java \ 41 | --model_name_or_path microsoft/unixcoder-base \ 42 | --train_filename dataset/javaCorpus/train.txt \ 43 | --dev_filename dataset/javaCorpus/dev.json \ 44 | --output_dir saved_models/javaCorpus \ 45 | --max_source_length 936 \ 46 | --max_target_length 64 \ 47 | --beam_size 5 \ 48 | --train_batch_size 32 \ 49 | --gradient_accumulation_steps 1 \ 50 | --eval_batch_size 32 \ 51 | --learning_rate 2e-5 \ 52 | --num_train_epochs 10 53 | 54 | # Output predictions of test set 55 | python run.py \ 56 | --do_test \ 57 | --lang java \ 58 | --model_name_or_path microsoft/unixcoder-base \ 59 | --load_model_path saved_models/javaCorpus/checkpoint-best-acc/pytorch_model.bin \ 60 | --test_filename dataset/javaCorpus/test.json \ 61 | --output_dir saved_models/javaCorpus \ 62 | --max_source_length 936 \ 63 | --max_target_length 64 \ 64 | --beam_size 5 \ 65 | --eval_batch_size 32 66 | ``` 67 | 68 | Prediction results on the test set are saved in ```saved_models/javaCorpus/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 69 | 70 | 71 | #### PY150 Dataset 72 | 73 | ```shell 74 | # Training 75 | python run.py \ 76 | --do_train \ 77 | --do_eval \ 78 | --lang python \ 79 | --model_name_or_path microsoft/unixcoder-base \ 80 | --train_filename dataset/py150/train.txt \ 81 | --dev_filename dataset/py150/dev.json \ 82 | --output_dir saved_models/py150 \ 83 | --max_source_length 936 \ 84 | --max_target_length 64 \ 85 | --beam_size 5 \ 86 | --train_batch_size 32 \ 87 | --gradient_accumulation_steps 1 \ 88 | --eval_batch_size 32 \ 89 | --learning_rate 2e-4 \ 90 | --num_train_epochs 10 91 | 92 | # Output predictions of test set 93 | python run.py \ 94 | --do_test \ 95 | --lang python \ 96 | --model_name_or_path microsoft/unixcoder-base \ 97 | --load_model_path saved_models/py150/checkpoint-best-acc/pytorch_model.bin \ 98 | --test_filename dataset/py150/test.json \ 99 | --output_dir saved_models/py150 \ 100 | --max_source_length 936 \ 101 | --max_target_length 64 \ 102 | --beam_size 5 \ 103 | --eval_batch_size 32 104 | ``` 105 | 106 | Prediction results on the test set are saved in ```saved_models/py150/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 107 | 108 | 109 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/UniXcoder/downstream-tasks/code-completion/dataset.zip -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-completion/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Build Sequence-to-Sequence. 12 | 13 | Parameters: 14 | 15 | * `encoder`- encoder of seq2seq model. e.g. roberta 16 | * `decoder`- decoder of seq2seq model. e.g. transformer 17 | * `config`- configuration of encoder model. 18 | * `beam_size`- beam size for beam search.
19 | * `max_length`- max length of target for beam search. 20 | * `sos_id`- start of symbol ids in target for beam search. 21 | * `eos_id`- end of symbol ids in target for beam search. 22 | """ 23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None): 24 | super(Seq2Seq, self).__init__() 25 | self.encoder = encoder 26 | self.decoder=decoder 27 | self.config=config 28 | self.register_buffer( 29 | "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024) 30 | ) 31 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 32 | self.lm_head.weight=self.encoder.embeddings.word_embeddings.weight 33 | self.lsm = nn.LogSoftmax(dim=-1) 34 | 35 | self.beam_size=beam_size 36 | self.max_length=max_length 37 | self.sos_id=sos_id 38 | self.eos_id=eos_id 39 | 40 | 41 | def tie_weights(self): 42 | """ Make sure we are sharing the input and output embeddings. 43 | Export to TorchScript can't handle parameter sharing so we are cloning them instead. 44 | """ 45 | self._tie_or_clone_weights(self.lm_head, 46 | self.encoder.embeddings.word_embeddings) 47 | 48 | def forward(self, source_ids,train=False): 49 | max_length = source_ids.ne(1).sum(-1).max() 50 | source_ids = source_ids[:,:max_length] 51 | if train: 52 | length = source_ids.size(-1) 53 | out = self.decoder(source_ids,attention_mask=self.bias[:,:length,:length]).last_hidden_state 54 | lm_logits = self.lm_head(out) 55 | # Shift so that tokens < n predict n 56 | active_loss = source_ids[..., 1:].ne(1).view(-1) 57 | shift_logits = lm_logits[..., :-1, :].contiguous() 58 | shift_labels = source_ids[..., 1:].contiguous() 59 | # Flatten the tokens 60 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 61 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 62 | shift_labels.view(-1)[active_loss]) 63 | 64 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 65 | return outputs 66 | else: 67 | #Predict 68 | preds=[] 69 | zero=torch.cuda.LongTensor(1).fill_(0) 70 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy()) 71 | length = source_ids.size(-1) 72 | encoder_output = self.decoder(source_ids,attention_mask=self.bias[:,:length,:length]) 73 | for i in range(source_ids.shape[0]): 74 | context=[[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y] 75 | for y in encoder_output.past_key_values] 76 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 77 | input_ids=beam.getCurrentState() 78 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1) 79 | out = encoder_output.last_hidden_state[i:i+1,:source_len[i]].repeat(self.beam_size,1,1) 80 | for _ in range(self.max_length): 81 | if beam.done(): 82 | break 83 | if _ == 0: 84 | hidden_states=out[:,-1,:] 85 | out = self.lsm(self.lm_head(hidden_states)).data 86 | beam.advance(out) 87 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 88 | input_ids=beam.getCurrentState() 89 | else: 90 | length = context_ids.size(-1)+input_ids.size(-1) 91 | out = self.decoder(input_ids,attention_mask=self.bias[:,context_ids.size(-1):length,:length], 92 | past_key_values=context).last_hidden_state 93 | hidden_states=out[:,-1,:] 94 | out = self.lsm(self.lm_head(hidden_states)).data 95 | beam.advance(out) 96 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 97 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1) 98 | hyp= beam.getHyp(beam.getFinal()) 99 | pred=beam.buildTargetTokens(hyp)[:self.beam_size] 100 | 
pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 101 | preds.append(torch.cat(pred,0).unsqueeze(0)) 102 | 103 | preds=torch.cat(preds,0) 104 | 105 | return preds 106 | 107 | 108 | 109 | class Beam(object): 110 | def __init__(self, size, sos, eos): 111 | self.size = size 112 | self.tt = torch.cuda 113 | # The score for each translation on the beam. 114 | self.scores = self.tt.FloatTensor(size).zero_() 115 | # The backpointers at each time-step. 116 | self.prevKs = [] 117 | # The outputs at each time-step. 118 | self.nextYs = [self.tt.LongTensor(size) 119 | .fill_(0)] 120 | self.nextYs[0][:] = sos 121 | # Has EOS topped the beam yet. 122 | self._eos = eos 123 | self.eosTop = False 124 | # Time and k pair for finished. 125 | self.finished = [] 126 | 127 | def getCurrentState(self): 128 | "Get the outputs for the current timestep." 129 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 130 | return batch 131 | 132 | def getCurrentOrigin(self): 133 | "Get the backpointers for the current timestep." 134 | return self.prevKs[-1] 135 | 136 | def advance(self, wordLk): 137 | """ 138 | Given prob over words for every last beam `wordLk` and attention 139 | `attnOut`: Compute and update the beam search. 140 | 141 | Parameters: 142 | 143 | * `wordLk`- probs of advancing from the last step (K x words) 144 | * `attnOut`- attention at the last step 145 | 146 | Returns: True if beam search is complete. 147 | """ 148 | numWords = wordLk.size(1) 149 | 150 | # Sum the previous scores. 151 | if len(self.prevKs) > 0: 152 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 153 | 154 | # Don't let EOS have children. 155 | for i in range(self.nextYs[-1].size(0)): 156 | if self.nextYs[-1][i] in self._eos: 157 | beamLk[i] = -1e20 158 | else: 159 | beamLk = wordLk[0] 160 | flatBeamLk = beamLk.view(-1) 161 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 162 | 163 | self.scores = bestScores 164 | 165 | # bestScoresId is flattened beam x word array, so calculate which 166 | # word and beam each score came from 167 | prevK = bestScoresId // numWords 168 | self.prevKs.append(prevK) 169 | self.nextYs.append((bestScoresId - prevK * numWords)) 170 | 171 | 172 | for i in range(self.nextYs[-1].size(0)): 173 | if self.nextYs[-1][i] in self._eos: 174 | s = self.scores[i] 175 | self.finished.append((s, len(self.nextYs) - 1, i)) 176 | 177 | # End condition is when top-of-beam is EOS and no global score. 178 | if self.nextYs[-1][0] in self._eos: 179 | self.eosTop = True 180 | 181 | def done(self): 182 | return self.eosTop and len(self.finished) >=self.size 183 | 184 | def getFinal(self): 185 | if len(self.finished) == 0: 186 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 187 | self.finished.sort(key=lambda a: -a[0]) 188 | if len(self.finished) != self.size: 189 | unfinished=[] 190 | for i in range(self.nextYs[-1].size(0)): 191 | if self.nextYs[-1][i] not in self._eos: 192 | s = self.scores[i] 193 | unfinished.append((s, len(self.nextYs) - 1, i)) 194 | unfinished.sort(key=lambda a: -a[0]) 195 | self.finished+=unfinished[:self.size-len(self.finished)] 196 | return self.finished[:self.size] 197 | 198 | def getHyp(self, beam_res): 199 | """ 200 | Walk back to construct the full hypothesis. 
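Each element of `beam_res` is a (score, timestep, k) tuple; starting from beam slot k at that timestep, the stored backpointers in `self.prevKs` are followed backwards and the collected tokens are reversed into a hypothesis.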
201 | """ 202 | hyps=[] 203 | for _,timestep, k in beam_res: 204 | hyp = [] 205 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 206 | hyp.append(self.nextYs[j+1][k]) 207 | k = self.prevKs[j][k] 208 | hyps.append(hyp[::-1]) 209 | return hyps 210 | 211 | def buildTargetTokens(self, preds): 212 | sentence=[] 213 | for pred in preds: 214 | tokens = [] 215 | for tok in pred: 216 | tokens.append(tok) 217 | if tok in self._eos: 218 | break 219 | sentence.append(tokens) 220 | return sentence 221 | 222 | 223 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/README.md: -------------------------------------------------------------------------------- 1 | # Code Generation 2 | 3 | ## Data Download 4 | 5 | ```bash 6 | mkdir dataset 7 | cd dataset 8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/train.json 9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/dev.json 10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/test.json 11 | cd .. 12 | ``` 13 | 14 | ## Dependency 15 | 16 | - pip install torch 17 | - pip install transformers 18 | 19 | ## Fine-Tune Setting 20 | 21 | Here we provide fine-tune settings for code generation, whose results are reported in the paper. 22 | 23 | ```shell 24 | # Training 25 | python run.py \ 26 | --do_train \ 27 | --do_eval \ 28 | --model_name_or_path microsoft/unixcoder-base \ 29 | --train_filename dataset/train.json \ 30 | --dev_filename dataset/dev.json \ 31 | --output_dir saved_models \ 32 | --max_source_length 350 \ 33 | --max_target_length 150 \ 34 | --beam_size 3 \ 35 | --train_batch_size 32 \ 36 | --eval_batch_size 32 \ 37 | --learning_rate 5e-5 \ 38 | --gradient_accumulation_steps 1 \ 39 | --num_train_epochs 30 40 | 41 | # Output results 42 | python run.py \ 43 | --do_test \ 44 | --model_name_or_path microsoft/unixcoder-base \ 45 | --test_filename dataset/test.json \ 46 | --output_dir saved_models \ 47 | --max_source_length 350 \ 48 | --max_target_length 150 \ 49 | --beam_size 3 \ 50 | --train_batch_size 32 \ 51 | --eval_batch_size 32 \ 52 | --learning_rate 5e-5 \ 53 | --gradient_accumulation_steps 1 \ 54 | --num_train_epochs 30 55 | ``` 56 | 57 | Prediction results on the test set are saved in ```saved_models/predictions.txt```. To obtain the test set score, you need to send the predictions to codexglue@microsoft.com. 58 | 59 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU.
17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams upto a given maximum order from an input segment. 30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | methods. 35 | 36 | Returns: 37 | The Counter containing all n-grams upto max_order in segment 38 | with a count of how many times each n-gram occurred. 39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
/ ratio) 109 | 110 | bleu = geo_mean * bp 111 | 112 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 113 | 114 | 115 | def _bleu(ref_file, trans_file, subword_option=None): 116 | max_order = 4 117 | smooth = True 118 | ref_files = [ref_file] 119 | reference_text = [] 120 | for reference_filename in ref_files: 121 | with open(reference_filename) as fh: 122 | reference_text.append(fh.readlines()) 123 | per_segment_references = [] 124 | for references in zip(*reference_text): 125 | reference_list = [] 126 | for reference in references: 127 | reference_list.append(reference.strip().split()) 128 | per_segment_references.append(reference_list) 129 | translations = [] 130 | with open(trans_file) as fh: 131 | for line in fh: 132 | translations.append(line.strip().split()) 133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 134 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Build Sequence-to-Sequence. 12 | 13 | Parameters: 14 | 15 | * `encoder`- encoder of seq2seq model. e.g. roberta 16 | * `decoder`- decoder of seq2seq model. e.g. transformer 17 | * `config`- configuration of encoder model. 18 | * `beam_size`- beam size for beam search. 19 | * `max_length`- max length of target for beam search. 20 | * `sos_id`- start of symbol ids in target for beam search. 21 | * `eos_id`- end of symbol ids in target for beam search.
22 | """ 23 | def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None): 24 | super(Seq2Seq, self).__init__() 25 | self.encoder = encoder 26 | self.decoder=decoder 27 | self.config=config 28 | self.register_buffer( 29 | "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024) 30 | ) 31 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight 34 | self.lsm = nn.LogSoftmax(dim=-1) 35 | 36 | self.beam_size = beam_size 37 | self.max_length = max_length 38 | self.sos_id = sos_id 39 | self.eos_id = eos_id 40 | 41 | def forward(self, source_ids, target_ids=None): 42 | if target_ids is None: 43 | return self.generate(source_ids) 44 | 45 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None] 46 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True) 47 | ids = torch.cat((source_ids,target_ids),-1) 48 | mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool() 49 | mask = mask & ids[:,None,:].ne(1) 50 | 51 | out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state 52 | lm_logits = self.lm_head(out) 53 | # Shift so that tokens < n predict n 54 | active_loss = target_ids[..., 1:].ne(1).view(-1) 55 | shift_logits = lm_logits[..., :-1, :].contiguous() 56 | shift_labels = target_ids[..., 1:].contiguous() 57 | # Flatten the tokens 58 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 59 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 60 | shift_labels.view(-1)[active_loss]) 61 | 62 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 63 | return outputs 64 | 65 | def generate(self, source_ids): 66 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None] 67 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True) 68 | preds = [] 69 | zero = torch.cuda.LongTensor(1).fill_(0) 70 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy()) 71 | for i in range(source_ids.shape[0]): 72 | context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y] 73 | for y in encoder_output.past_key_values] 74 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 75 | input_ids = beam.getCurrentState() 76 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1) 77 | for _ in range(self.max_length): 78 | if beam.done(): 79 | break 80 | 81 | ids = torch.cat((context_ids,input_ids),-1) 82 | mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool() 83 | mask = mask & ids[:,None,:].ne(1) 84 | out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state 85 | hidden_states = out[:,-1,:] 86 | out = self.lsm(self.lm_head(hidden_states)).data 87 | beam.advance(out) 88 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 89 | input_ids = torch.cat((input_ids,beam.getCurrentState()),-1) 90 | hyp = beam.getHyp(beam.getFinal()) 91 | pred = beam.buildTargetTokens(hyp)[:self.beam_size] 92 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 93 | preds.append(torch.cat(pred,0).unsqueeze(0)) 94 | 95 | preds = torch.cat(preds,0) 96 | 97 | return preds 98 | 99 | 100 | 101 | class Beam(object): 102 | def __init__(self, size,sos,eos): 103 | self.size = size 104 | self.tt = torch.cuda 105 | # The 
score for each translation on the beam. 106 | self.scores = self.tt.FloatTensor(size).zero_() 107 | # The backpointers at each time-step. 108 | self.prevKs = [] 109 | # The outputs at each time-step. 110 | self.nextYs = [self.tt.LongTensor(size) 111 | .fill_(0)] 112 | self.nextYs[0][0] = sos 113 | # Has EOS topped the beam yet. 114 | self._eos = eos 115 | self.eosTop = False 116 | # Time and k pair for finished. 117 | self.finished = [] 118 | 119 | def getCurrentState(self): 120 | "Get the outputs for the current timestep." 121 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 122 | return batch 123 | 124 | def getCurrentOrigin(self): 125 | "Get the backpointers for the current timestep." 126 | return self.prevKs[-1] 127 | 128 | def advance(self, wordLk): 129 | """ 130 | Given prob over words for every last beam `wordLk` and attention 131 | `attnOut`: Compute and update the beam search. 132 | 133 | Parameters: 134 | 135 | * `wordLk`- probs of advancing from the last step (K x words) 136 | * `attnOut`- attention at the last step 137 | 138 | Returns: True if beam search is complete. 139 | """ 140 | numWords = wordLk.size(1) 141 | 142 | # Sum the previous scores. 143 | if len(self.prevKs) > 0: 144 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 145 | 146 | # Don't let EOS have children. 147 | for i in range(self.nextYs[-1].size(0)): 148 | if self.nextYs[-1][i] == self._eos: 149 | beamLk[i] = -1e20 150 | else: 151 | beamLk = wordLk[0] 152 | flatBeamLk = beamLk.view(-1) 153 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 154 | 155 | self.scores = bestScores 156 | 157 | # bestScoresId is flattened beam x word array, so calculate which 158 | # word and beam each score came from 159 | prevK = bestScoresId // numWords 160 | self.prevKs.append(prevK) 161 | self.nextYs.append((bestScoresId - prevK * numWords)) 162 | 163 | 164 | for i in range(self.nextYs[-1].size(0)): 165 | if self.nextYs[-1][i] == self._eos: 166 | s = self.scores[i] 167 | self.finished.append((s, len(self.nextYs) - 1, i)) 168 | 169 | # End condition is when top-of-beam is EOS and no global score. 170 | if self.nextYs[-1][0] == self._eos: 171 | self.eosTop = True 172 | 173 | def done(self): 174 | return self.eosTop and len(self.finished) >=self.size 175 | 176 | def getFinal(self): 177 | if len(self.finished) == 0: 178 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 179 | self.finished.sort(key=lambda a: -a[0]) 180 | if len(self.finished) != self.size: 181 | unfinished=[] 182 | for i in range(self.nextYs[-1].size(0)): 183 | if self.nextYs[-1][i] != self._eos: 184 | s = self.scores[i] 185 | unfinished.append((s, len(self.nextYs) - 1, i)) 186 | unfinished.sort(key=lambda a: -a[0]) 187 | self.finished+=unfinished[:self.size-len(self.finished)] 188 | return self.finished[:self.size] 189 | 190 | def getHyp(self, beam_res): 191 | """ 192 | Walk back to construct the full hypothesis. 
193 | """ 194 | hyps=[] 195 | for _,timestep, k in beam_res: 196 | hyp = [] 197 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 198 | hyp.append(self.nextYs[j+1][k]) 199 | k = self.prevKs[j][k] 200 | hyps.append(hyp[::-1]) 201 | return hyps 202 | 203 | def buildTargetTokens(self, preds): 204 | sentence=[] 205 | for pred in preds: 206 | tokens = [] 207 | for tok in pred: 208 | if tok==self._eos: 209 | break 210 | tokens.append(tok) 211 | sentence.append(tokens) 212 | return sentence 213 | 214 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-generation/run.sh: -------------------------------------------------------------------------------- 1 | pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html > log.txt 2>&1 2 | pip install sklearn scipy transformers tqdm > log.txt 2>&1 3 | CUDA_VISIBLE_DEVICES=15,12,13,14 4 | lang=java #programming language 5 | lr=5e-5 6 | batch_size=32 7 | accm_steps=1 8 | beam_size=3 9 | source_length=512 10 | target_length=150 11 | data_dir=../../dataset 12 | output_dir=saved_models/$lang 13 | train_file=$data_dir/train.json 14 | dev_file=$data_dir/dev.json 15 | epochs=30 16 | pretrained_model=../../../pretrained-model/UniXcoder-base/ 17 | 18 | mkdir -p $output_dir 19 | python run.py \ 20 | --do_train \ 21 | --do_eval \ 22 | --model_name_or_path $pretrained_model \ 23 | --train_filename $train_file \ 24 | --dev_filename $dev_file \ 25 | --tokenizer_name roberta-base \ 26 | --output_dir $output_dir \ 27 | --max_source_length $source_length \ 28 | --max_target_length $target_length \ 29 | --beam_size $beam_size \ 30 | --train_batch_size $batch_size \ 31 | --eval_batch_size $batch_size \ 32 | --learning_rate $lr \ 33 | --gradient_accumulation_steps $accm_steps \ 34 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log 35 | 36 | 37 | batch_size=64 38 | dev_file=$data_dir/dev.json 39 | test_file=$data_dir/test.json 40 | test_model=$output_dir/checkpoint-best-score/pytorch_model.bin #checkpoint for test 41 | 42 | python run.py \ 43 | --do_test \ 44 | --model_name_or_path $pretrained_model \ 45 | --load_model_path $test_model \ 46 | --dev_filename $dev_file \ 47 | --test_filename $test_file \ 48 | --output_dir $output_dir \ 49 | --max_source_length $source_length \ 50 | --max_target_length $target_length \ 51 | --beam_size $beam_size \ 52 | --gradient_accumulation_steps $accm_steps \ 53 | --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log 54 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Code Search 4 | 5 | ## Data Download 6 | 7 | #### 1. AdvTest dataset 8 | 9 | ```bash 10 | mkdir dataset && cd dataset 11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/NL-code-search-Adv/dataset.zip 12 | unzip dataset.zip && rm -r dataset.zip && mv dataset AdvTest && cd AdvTest 13 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip 14 | unzip python.zip && python preprocess.py && rm -r python && rm -r *.pkl && rm python.zip 15 | cd ../.. 16 | ``` 17 | 18 | #### 2. 
CosQA dataset 19 | 20 | ```bash 21 | cd dataset 22 | mkdir cosqa && cd cosqa 23 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/code_idx_map.txt 24 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-dev-500.json 25 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-test-500.json 26 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-train-19604.json 27 | cd ../.. 28 | ``` 29 | 30 | #### 3. CSN dataset 31 | 32 | ```bash 33 | cd dataset 34 | wget https://github.com/microsoft/CodeBERT/raw/master/GraphCodeBERT/codesearch/dataset.zip 35 | unzip dataset.zip && rm -r dataset.zip && mv dataset CSN && cd CSN 36 | bash run.sh 37 | cd ../.. 38 | ``` 39 | 40 | 41 | 42 | ## Dependency 43 | 44 | - pip install torch 45 | - pip install transformers 46 | 47 | ## Zero-Shot Setting 48 | 49 | We first provide scripts for zero-shot code search. Candidates are ranked by the cosine similarity between the UniXcoder hidden-state embeddings of the code and the NL query. 50 | 51 | #### 1. AdvTest dataset 52 | 53 | ```bash 54 | python run.py \ 55 | --output_dir saved_models/AdvTest \ 56 | --model_name_or_path microsoft/unixcoder-base \ 57 | --do_zero_shot \ 58 | --do_test \ 59 | --test_data_file dataset/AdvTest/test.jsonl \ 60 | --codebase_file dataset/AdvTest/test.jsonl \ 61 | --num_train_epochs 2 \ 62 | --code_length 256 \ 63 | --nl_length 128 \ 64 | --train_batch_size 64 \ 65 | --eval_batch_size 64 \ 66 | --learning_rate 2e-5 \ 67 | --seed 123456 68 | ``` 69 | 70 | #### 2. CosQA dataset 71 | 72 | ```bash 73 | python run.py \ 74 | --output_dir saved_models/cosqa \ 75 | --model_name_or_path microsoft/unixcoder-base \ 76 | --do_zero_shot \ 77 | --do_test \ 78 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \ 79 | --codebase_file dataset/cosqa/code_idx_map.txt \ 80 | --num_train_epochs 10 \ 81 | --code_length 256 \ 82 | --nl_length 128 \ 83 | --train_batch_size 64 \ 84 | --eval_batch_size 64 \ 85 | --learning_rate 2e-5 \ 86 | --seed 123456 87 | ``` 88 | 89 | #### 3. CSN dataset 90 | 91 | ```bash 92 | lang=python 93 | python run.py \ 94 | --output_dir saved_models/CSN/$lang \ 95 | --model_name_or_path microsoft/unixcoder-base \ 96 | --do_zero_shot \ 97 | --do_test \ 98 | --test_data_file dataset/CSN/$lang/test.jsonl \ 99 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 100 | --num_train_epochs 10 \ 101 | --code_length 256 \ 102 | --nl_length 128 \ 103 | --train_batch_size 64 \ 104 | --eval_batch_size 64 \ 105 | --learning_rate 2e-5 \ 106 | --seed 123456 107 | ``` 108 | 109 | 110 | 111 | ## Fine-Tune Setting 112 | 113 | Here we provide fine-tune settings for code search, whose results are reported in the paper. 114 | 115 | #### 1.
AdvTest dataset 116 | 117 | ```shell 118 | # Training 119 | python run.py \ 120 | --output_dir saved_models/AdvTest \ 121 | --model_name_or_path microsoft/unixcoder-base \ 122 | --do_train \ 123 | --train_data_file dataset/AdvTest/train.jsonl \ 124 | --eval_data_file dataset/AdvTest/valid.jsonl \ 125 | --codebase_file dataset/AdvTest/valid.jsonl \ 126 | --num_train_epochs 2 \ 127 | --code_length 256 \ 128 | --nl_length 128 \ 129 | --train_batch_size 64 \ 130 | --eval_batch_size 64 \ 131 | --learning_rate 2e-5 \ 132 | --seed 123456 133 | 134 | # Evaluating 135 | python run.py \ 136 | --output_dir saved_models/AdvTest \ 137 | --model_name_or_path microsoft/unixcoder-base \ 138 | --do_test \ 139 | --test_data_file dataset/AdvTest/test.jsonl \ 140 | --codebase_file dataset/AdvTest/test.jsonl \ 141 | --num_train_epochs 2 \ 142 | --code_length 256 \ 143 | --nl_length 128 \ 144 | --train_batch_size 64 \ 145 | --eval_batch_size 64 \ 146 | --learning_rate 2e-5 \ 147 | --seed 123456 148 | ``` 149 | #### 2. CosQA dataset 150 | 151 | ```bash 152 | # Training 153 | python run.py \ 154 | --output_dir saved_models/cosqa \ 155 | --model_name_or_path microsoft/unixcoder-base \ 156 | --do_train \ 157 | --train_data_file dataset/cosqa/cosqa-retrieval-train-19604.json \ 158 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \ 159 | --codebase_file dataset/cosqa/code_idx_map.txt \ 160 | --num_train_epochs 10 \ 161 | --code_length 256 \ 162 | --nl_length 128 \ 163 | --train_batch_size 64 \ 164 | --eval_batch_size 64 \ 165 | --learning_rate 2e-5 \ 166 | --seed 123456 167 | 168 | # Evaluating 169 | python run.py \ 170 | --output_dir saved_models/cosqa \ 171 | --model_name_or_path microsoft/unixcoder-base \ 172 | --do_eval \ 173 | --do_test \ 174 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \ 175 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \ 176 | --codebase_file dataset/cosqa/code_idx_map.txt \ 177 | --num_train_epochs 10 \ 178 | --code_length 256 \ 179 | --nl_length 128 \ 180 | --train_batch_size 64 \ 181 | --eval_batch_size 64 \ 182 | --learning_rate 2e-5 \ 183 | --seed 123456 184 | ``` 185 | 186 | #### 3. CSN dataset 187 | 188 | ```bash 189 | # Training 190 | lang=python 191 | python run.py \ 192 | --output_dir saved_models/CSN/$lang \ 193 | --model_name_or_path microsoft/unixcoder-base \ 194 | --do_train \ 195 | --train_data_file dataset/CSN/$lang/train.jsonl \ 196 | --eval_data_file dataset/CSN/$lang/valid.jsonl \ 197 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 198 | --num_train_epochs 10 \ 199 | --code_length 256 \ 200 | --nl_length 128 \ 201 | --train_batch_size 64 \ 202 | --eval_batch_size 64 \ 203 | --learning_rate 2e-5 \ 204 | --seed 123456 205 | 206 | # Evaluating 207 | python run.py \ 208 | --output_dir saved_models/CSN/$lang \ 209 | --model_name_or_path microsoft/unixcoder-base \ 210 | --do_eval \ 211 | --do_test \ 212 | --eval_data_file dataset/CSN/$lang/valid.jsonl \ 213 | --test_data_file dataset/CSN/$lang/test.jsonl \ 214 | --codebase_file dataset/CSN/$lang/codebase.jsonl \ 215 | --num_train_epochs 10 \ 216 | --code_length 256 \ 217 | --nl_length 128 \ 218 | --train_batch_size 64 \ 219 | --eval_batch_size 64 \ 220 | --learning_rate 2e-5 \ 221 | --seed 123456 222 | 223 | ``` 224 | 225 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation.
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-search/model.py: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import torch.nn as nn
import torch

class Model(nn.Module):
    """Bi-encoder over a shared UniXcoder backbone: the same encoder embeds
    either code or natural language into one vector space."""
    def __init__(self, encoder):
        super(Model, self).__init__()
        self.encoder = encoder

    def forward(self, code_inputs=None, nl_inputs=None):
        if code_inputs is not None:
            # ne(1): non-padding positions (the RoBERTa pad token id is 1)
            outputs = self.encoder(code_inputs, attention_mask=code_inputs.ne(1))[0]
            # mean-pool hidden states over non-padding tokens
            outputs = (outputs * code_inputs.ne(1)[:, :, None]).sum(1) / code_inputs.ne(1).sum(-1)[:, None]
            # L2-normalize so dot products are cosine similarities
            return torch.nn.functional.normalize(outputs, p=2, dim=1)
        else:
            outputs = self.encoder(nl_inputs, attention_mask=nl_inputs.ne(1))[0]
            outputs = (outputs * nl_inputs.ne(1)[:, :, None]).sum(1) / nl_inputs.ne(1).sum(-1)[:, None]
            return torch.nn.functional.normalize(outputs, p=2, dim=1)
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-summarization/README.md: --------------------------------------------------------------------------------
# Code Summarization

## Data Download

```bash
wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Text/code-to-text/dataset.zip
unzip dataset.zip
rm dataset.zip
cd dataset
wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip
wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/ruby.zip
wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/javascript.zip
wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/go.zip
wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/php.zip

unzip python.zip
unzip java.zip
unzip ruby.zip
unzip javascript.zip
unzip go.zip
unzip php.zip
rm *.zip
rm *.pkl

python preprocess.py
rm -r */final
cd ..
```

## Dependency

- pip install torch
- pip install transformers

## Fine-Tune Setting

Here we provide the fine-tuning settings for code summarization; the results reported in the paper were obtained with these settings.

```shell
lang=python

# Training
python run.py \
    --do_train \
    --do_eval \
    --model_name_or_path microsoft/unixcoder-base \
    --train_filename dataset/$lang/train.jsonl \
    --dev_filename dataset/$lang/valid.jsonl \
    --output_dir saved_models/$lang \
    --max_source_length 256 \
    --max_target_length 128 \
    --beam_size 10 \
    --train_batch_size 48 \
    --eval_batch_size 48 \
    --learning_rate 5e-5 \
    --gradient_accumulation_steps 2 \
    --num_train_epochs 10

# Evaluating
python run.py \
    --do_test \
    --model_name_or_path microsoft/unixcoder-base \
    --test_filename dataset/$lang/test.jsonl \
    --output_dir saved_models/$lang \
    --max_source_length 256 \
    --max_target_length 128 \
    --beam_size 10 \
    --train_batch_size 48 \
    --eval_batch_size 48 \
    --learning_rate 5e-5 \
    --gradient_accumulation_steps 2 \
    --num_train_epochs 10
```
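After `--do_test`, the generated summaries are scored with the `bleu.py` script below via its `computeMaps`/`bleuFromMaps` helpers. A minimal sketch of scoring a predictions file against a gold file (both are tab-separated `id<TAB>text` lines, as `computeMaps` assumes; the paths are illustrative):

```python
from bleu import computeMaps, bleuFromMaps

with open("saved_models/python/predictions.txt") as f:  # illustrative path
    predictions = f.readlines()

gold_map, prediction_map = computeMaps(predictions, "saved_models/python/gold.txt")
print(bleuFromMaps(gold_map, prediction_map)[0])        # smoothed BLEU-4, scaled to 0-100
```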
-------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/code-summarization/bleu.py: --------------------------------------------------------------------------------
#!/usr/bin/python

'''
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
'''

# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $

'''Provides:

cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.

score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.

The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
'''

import sys, math, re, xml.sax.saxutils
import subprocess
import os

# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0

preserve_case = False
eff_ref_len = "shortest"

normalize1 = [
    ('<skipped>', ''),          # strip "skipped" tags
    (r'-\n', ''),               # strip end-of-line hyphenation and join lines
    (r'\n', ' '),               # join lines
#    (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

normalize2 = [
    (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 '),  # tokenize punctuation. apostrophe is missing
    (r'([^0-9])([\.,])', r'\1 \2 '),               # tokenize period and comma unless preceded by a digit
    (r'([\.,])([^0-9])', r' \1 \2'),               # tokenize period and comma unless followed by a digit
    (r'([0-9])(-)', r'\1 \2 ')                     # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
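# For example (illustrative): normalize("Hello, world!") -> ['hello', ',', 'world', '!']
# -- case is folded, punctuation is split into separate tokens, and a token list is returned.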
def normalize(s):
    '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.'''
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if (nonorm):
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {'&quot;': '"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
        s = s.lower()  # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()

def count_ngrams(words, n=4):
    counts = {}
    for k in range(1, n+1):
        for i in range(len(words)-k+1):
            ngram = tuple(words[i:i+k])
            counts[ngram] = counts.get(ngram, 0)+1
    return counts

def cook_refs(refs, n=4):
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.'''

    refs = [normalize(ref) for ref in refs]
    maxcounts = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    return ([len(ref) for ref in refs], maxcounts)

def cook_test(test, item, n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.'''
    (reflens, refmaxcounts) = item
    test = normalize(test)
    result = {}
    result["testlen"] = len(test)

    # Calculate effective reference sentence length.

    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens))/len(reflens)
    elif eff_ref_len == "closest":
        min_diff = None
        for reflen in reflens:
            if min_diff is None or abs(reflen-len(test)) < min_diff:
                min_diff = abs(reflen-len(test))
                result['reflen'] = reflen

    result["guess"] = [max(len(test)-k+1, 0) for k in range(1, n+1)]

    result['correct'] = [0]*n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.items():
        result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram, 0), count)

    return result

def score_cooked(allcomps, n=4, ground=0, smooth=1):
    totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0]*n, 'correct': [0]*n}
    for comps in allcomps:
        for key in ['testlen', 'reflen']:
            totalcomps[key] += comps[key]
        for key in ['guess', 'correct']:
            for k in range(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    all_bleus = []
    for k in range(n):
        correct = totalcomps['correct'][k]
        guess = totalcomps['guess'][k]
        addsmooth = 0
        if smooth == 1 and k > 0:
            addsmooth = 1
        logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(guess + addsmooth + sys.float_info.min)
        if guess == 0:
            all_bleus.append(-10000000)
        else:
            all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))

    logbleu /= float(n)
    all_bleus.insert(0, logbleu)

    # Brevity penalty (in log space): penalize hypotheses shorter than the references.
    brevPenalty = min(0, 1 - float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1))
    for i in range(len(all_bleus)):
        if i == 0:
            all_bleus[i] += brevPenalty
        all_bleus[i] = math.exp(all_bleus[i])
    return all_bleus
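# Example of the three-phase API above (illustrative values): references are cooked once,
# then any number of hypotheses can be cooked and scored against them cheaply.
#   refs = cook_refs(["the cat sat on the mat", "a cat was on the mat"])
#   test = cook_test("the cat is on the mat", refs)
#   score_cooked([test])[0]   # smoothed corpus-level BLEU in [0, 1]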
def bleu(refs, candidate, ground=0, smooth=1):
    refs = cook_refs(refs)
    test = cook_test(candidate, refs)
    return score_cooked([test], ground=ground, smooth=smooth)

def splitPuncts(line):
    return ' '.join(re.findall(r"[\w]+|[^\s\w]", line))

def computeMaps(predictions, goldfile):
    predictionMap = {}
    goldMap = {}
    gf = open(goldfile, 'r')

    for row in predictions:
        cols = row.strip().split('\t')
        if len(cols) == 1:
            (rid, pred) = (cols[0], '')
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    for row in gf:
        (rid, pred) = row.split('\t')
        if rid in predictionMap:  # Only insert if the id exists for the method
            if rid not in goldMap:
                goldMap[rid] = []
            goldMap[rid].append(splitPuncts(pred.strip().lower()))

    sys.stderr.write('Total: ' + str(len(goldMap)) + '\n')
    return (goldMap, predictionMap)


# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
    score = [0] * 5
    num = 0.0

    for key in m1:
        if key in m2:
            bl = bleu(m1[key], m2[key][0])
            score = [score[i] + bl[i] for i in range(0, len(bl))]
            num += 1
    return [s * 100.0 / num for s in score]

if __name__ == '__main__':
    reference_file = sys.argv[1]
    predictions = []
    for row in sys.stdin:
        predictions.append(row)
    (goldMap, predictionMap) = computeMaps(predictions, reference_file)
    print(bleuFromMaps(goldMap, predictionMap)[0])
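The `Seq2Seq` wrapper in the next file reuses a single UniXcoder as both encoder and decoder: source tokens attend to each other bidirectionally, while each target token attends causally to the source and to earlier target tokens. A minimal sketch of how such a combined mask can be built (`pad_id=1` as in the model below; this builds the full square mask rather than the row slice the model actually feeds the decoder):

```python
import torch

def seq2seq_mask(source_ids, target_ids, pad_id=1):
    """Bidirectional self-attention over the source, causal attention elsewhere."""
    src_len = source_ids.size(1)
    total = src_len + target_ids.size(1)
    ids = torch.cat((source_ids, target_ids), -1)
    causal = torch.tril(torch.ones(total, total, dtype=torch.bool))  # lower-triangular pattern
    causal[:src_len, :src_len] = True                                # source sees all of source
    return causal[None, :, :] & ids[:, None, :].ne(pad_id)           # drop padding keys

src = torch.randint(2, 100, (2, 5))  # fake non-pad token ids
tgt = torch.randint(2, 100, (2, 3))
print(seq2seq_mask(src, tgt).shape)  # torch.Size([2, 8, 8])
```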
22 | """ 23 | def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None): 24 | super(Seq2Seq, self).__init__() 25 | self.encoder = encoder 26 | self.decoder=decoder 27 | self.config=config 28 | self.register_buffer( 29 | "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024) 30 | ) 31 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight 34 | self.lsm = nn.LogSoftmax(dim=-1) 35 | 36 | self.beam_size = beam_size 37 | self.max_length = max_length 38 | self.sos_id = sos_id 39 | self.eos_id = eos_id 40 | 41 | def forward(self, source_ids, target_ids=None): 42 | if target_ids is None: 43 | return self.generate(source_ids) 44 | 45 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None] 46 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True) 47 | ids = torch.cat((source_ids,target_ids),-1) 48 | mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool() 49 | mask = mask & ids[:,None,:].ne(1) 50 | 51 | out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state 52 | lm_logits = self.lm_head(out) 53 | # Shift so that tokens < n predict n 54 | active_loss = target_ids[..., 1:].ne(1).view(-1) 55 | shift_logits = lm_logits[..., :-1, :].contiguous() 56 | shift_labels = target_ids[..., 1:].contiguous() 57 | # Flatten the tokens 58 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 59 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss], 60 | shift_labels.view(-1)[active_loss]) 61 | 62 | outputs = loss,loss*active_loss.sum(),active_loss.sum() 63 | return outputs 64 | 65 | def generate(self, source_ids): 66 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None] 67 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True) 68 | preds = [] 69 | zero = torch.cuda.LongTensor(1).fill_(0) 70 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy()) 71 | for i in range(source_ids.shape[0]): 72 | context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y] 73 | for y in encoder_output.past_key_values] 74 | beam = Beam(self.beam_size,self.sos_id,self.eos_id) 75 | input_ids = beam.getCurrentState() 76 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1) 77 | for _ in range(self.max_length): 78 | if beam.done(): 79 | break 80 | 81 | ids = torch.cat((context_ids,input_ids),-1) 82 | mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool() 83 | mask = mask & ids[:,None,:].ne(1) 84 | out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state 85 | hidden_states = out[:,-1,:] 86 | out = self.lsm(self.lm_head(hidden_states)).data 87 | beam.advance(out) 88 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 89 | input_ids = torch.cat((input_ids,beam.getCurrentState()),-1) 90 | hyp = beam.getHyp(beam.getFinal()) 91 | pred = beam.buildTargetTokens(hyp)[:self.beam_size] 92 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred] 93 | preds.append(torch.cat(pred,0).unsqueeze(0)) 94 | 95 | preds = torch.cat(preds,0) 96 | 97 | return preds 98 | 99 | 100 | 101 | class Beam(object): 102 | def __init__(self, size,sos,eos): 103 | self.size = size 104 | self.tt = torch.cuda 105 | # The 
score for each translation on the beam. 106 | self.scores = self.tt.FloatTensor(size).zero_() 107 | # The backpointers at each time-step. 108 | self.prevKs = [] 109 | # The outputs at each time-step. 110 | self.nextYs = [self.tt.LongTensor(size) 111 | .fill_(0)] 112 | self.nextYs[0][0] = sos 113 | # Has EOS topped the beam yet. 114 | self._eos = eos 115 | self.eosTop = False 116 | # Time and k pair for finished. 117 | self.finished = [] 118 | 119 | def getCurrentState(self): 120 | "Get the outputs for the current timestep." 121 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1) 122 | return batch 123 | 124 | def getCurrentOrigin(self): 125 | "Get the backpointers for the current timestep." 126 | return self.prevKs[-1] 127 | 128 | def advance(self, wordLk): 129 | """ 130 | Given prob over words for every last beam `wordLk` and attention 131 | `attnOut`: Compute and update the beam search. 132 | 133 | Parameters: 134 | 135 | * `wordLk`- probs of advancing from the last step (K x words) 136 | * `attnOut`- attention at the last step 137 | 138 | Returns: True if beam search is complete. 139 | """ 140 | numWords = wordLk.size(1) 141 | 142 | # Sum the previous scores. 143 | if len(self.prevKs) > 0: 144 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 145 | 146 | # Don't let EOS have children. 147 | for i in range(self.nextYs[-1].size(0)): 148 | if self.nextYs[-1][i] == self._eos: 149 | beamLk[i] = -1e20 150 | else: 151 | beamLk = wordLk[0] 152 | flatBeamLk = beamLk.view(-1) 153 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 154 | 155 | self.scores = bestScores 156 | 157 | # bestScoresId is flattened beam x word array, so calculate which 158 | # word and beam each score came from 159 | prevK = bestScoresId // numWords 160 | self.prevKs.append(prevK) 161 | self.nextYs.append((bestScoresId - prevK * numWords)) 162 | 163 | 164 | for i in range(self.nextYs[-1].size(0)): 165 | if self.nextYs[-1][i] == self._eos: 166 | s = self.scores[i] 167 | self.finished.append((s, len(self.nextYs) - 1, i)) 168 | 169 | # End condition is when top-of-beam is EOS and no global score. 170 | if self.nextYs[-1][0] == self._eos: 171 | self.eosTop = True 172 | 173 | def done(self): 174 | return self.eosTop and len(self.finished) >=self.size 175 | 176 | def getFinal(self): 177 | if len(self.finished) == 0: 178 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 179 | self.finished.sort(key=lambda a: -a[0]) 180 | if len(self.finished) != self.size: 181 | unfinished=[] 182 | for i in range(self.nextYs[-1].size(0)): 183 | if self.nextYs[-1][i] != self._eos: 184 | s = self.scores[i] 185 | unfinished.append((s, len(self.nextYs) - 1, i)) 186 | unfinished.sort(key=lambda a: -a[0]) 187 | self.finished+=unfinished[:self.size-len(self.finished)] 188 | return self.finished[:self.size] 189 | 190 | def getHyp(self, beam_res): 191 | """ 192 | Walk back to construct the full hypothesis. 
193 | """ 194 | hyps=[] 195 | for _,timestep, k in beam_res: 196 | hyp = [] 197 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 198 | hyp.append(self.nextYs[j+1][k]) 199 | k = self.prevKs[j][k] 200 | hyps.append(hyp[::-1]) 201 | return hyps 202 | 203 | def buildTargetTokens(self, preds): 204 | sentence=[] 205 | for pred in preds: 206 | tokens = [] 207 | for tok in pred: 208 | if tok==self._eos: 209 | break 210 | tokens.append(tok) 211 | sentence.append(tokens) 212 | return sentence 213 | 214 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Zero-shot Code-to-Code Search 4 | 5 | Given a source code as the query, the task aims to retrieve codes with the same semantics from a collection of candidates in zero-shot setting. We collect 11,744/15,594/23,530 functions from [CodeNet](https://github.com/IBM/Project_CodeNet) corpus in Ruby/Python/Java. Each function solves one of 4,053 problems. 6 | 7 | 8 | 9 | ## Data Download 10 | 11 | ```bash 12 | cd dataset 13 | wget https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet.tar.gz 14 | tar -xvf Project_CodeNet.tar.gz 15 | python preprocess.py 16 | cd .. 17 | ``` 18 | 19 | 20 | 21 | ## Dependency 22 | 23 | - pip install torch 24 | - pip install transformers 25 | 26 | 27 | 28 | ## Zero-Shot Setting 29 | 30 | ```bash 31 | source_lang=ruby 32 | target_lang=python 33 | python run.py \ 34 | --model_name_or_path microsoft/unixcoder-base \ 35 | --query_data_file dataset/${source_lang}_with_func.jsonl \ 36 | --candidate_data_file dataset/${target_lang}_with_func.jsonl \ 37 | --query_lang ${source_lang} \ 38 | --candidate_lang ${target_lang} \ 39 | --code_length 512 \ 40 | --eval_batch_size 256 41 | ``` 42 | 43 | 44 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import torch.nn as nn 4 | import torch 5 | class Model(nn.Module): 6 | def __init__(self, encoder): 7 | super(Model, self).__init__() 8 | self.encoder = encoder 9 | 10 | def forward(self, code_inputs=None, nl_inputs=None, cls=False): 11 | if code_inputs is not None: 12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0] 13 | outputs = (outputs * code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(1)[:,None] 14 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 15 | else: 16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0] 17 | outputs = (outputs * nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(1)[:,None] 18 | return torch.nn.functional.normalize(outputs, p=2, dim=1) 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /UniXcoder/downstream-tasks/zero-shot-search/run.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 20 | """ 21 | import sys 22 | 23 | 24 | import argparse 25 | import logging 26 | import os 27 | import pickle 28 | import random 29 | import torch 30 | import json 31 | import numpy as np 32 | from tqdm import tqdm 33 | from model import Model 34 | from torch.nn import CrossEntropyLoss, MSELoss 35 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset 36 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, 37 | RobertaConfig, RobertaModel, RobertaTokenizer) 38 | import re 39 | from io import StringIO 40 | import tokenize 41 | 42 | logger = logging.getLogger(__name__) 43 | 44 | 45 | def remove_comments_and_docstrings(source,lang): 46 | if lang in ['python']: 47 | io_obj = StringIO(source) 48 | out = "" 49 | prev_toktype = tokenize.INDENT 50 | last_lineno = -1 51 | last_col = 0 52 | for tok in tokenize.generate_tokens(io_obj.readline): 53 | token_type = tok[0] 54 | token_string = tok[1] 55 | start_line, start_col = tok[2] 56 | end_line, end_col = tok[3] 57 | ltext = tok[4] 58 | if start_line > last_lineno: 59 | last_col = 0 60 | if start_col > last_col: 61 | out += (" " * (start_col - last_col)) 62 | # Remove comments: 63 | if token_type == tokenize.COMMENT: 64 | pass 65 | # This series of conditionals removes docstrings: 66 | elif token_type == tokenize.STRING: 67 | if prev_toktype != tokenize.INDENT: 68 | # This is likely a docstring; double-check we're not inside an operator: 69 | if prev_toktype != tokenize.NEWLINE: 70 | if start_col > 0: 71 | out += token_string 72 | else: 73 | out += token_string 74 | prev_toktype = token_type 75 | last_col = end_col 76 | last_lineno = end_line 77 | temp=[] 78 | for x in out.split('\n'): 79 | if x.strip()!="": 80 | temp.append(x) 81 | return '\n'.join(temp) 82 | elif lang in ['ruby']: 83 | return source 84 | else: 85 | def replacer(match): 86 | s = match.group(0) 87 | if s.startswith('/'): 88 | return " " # note: a space and not an empty string 89 | else: 90 | return s 91 | pattern = re.compile( 92 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 93 | re.DOTALL | re.MULTILINE 94 | ) 95 | temp=[] 96 | for x in re.sub(pattern, replacer, source).split('\n'): 97 | if x.strip()!="": 98 | temp.append(x) 99 | return '\n'.join(temp) 100 | 101 | 102 | class InputFeatures(object): 103 | """A single training/test features for a example.""" 104 | def __init__(self, 105 | code_tokens, 106 | code_ids, 107 | index, 108 | label 109 | 110 | ): 111 | self.code_tokens = code_tokens 112 | self.code_ids = code_ids 113 | self.index = index 114 | self.label = label 115 | 116 | 117 | def convert_examples_to_features(js,tokenizer,args,lang): 118 | """convert examples to token ids""" 119 | if "func" in js: 120 | code = " 
".join(remove_comments_and_docstrings(js['func'],lang).split()) 121 | else: 122 | code = " ".join(remove_comments_and_docstrings(js['code'],lang).split()) 123 | code_tokens = tokenizer.tokenize(code)[:args.code_length-4] 124 | code_tokens =[tokenizer.cls_token,"",tokenizer.sep_token]+code_tokens+[tokenizer.sep_token] 125 | code_ids = tokenizer.convert_tokens_to_ids(code_tokens) 126 | padding_length = args.code_length - len(code_ids) 127 | code_ids += [tokenizer.pad_token_id]*padding_length 128 | return InputFeatures(code_tokens,code_ids,js["index"],int(js['label'])) 129 | 130 | class TextDataset(Dataset): 131 | def __init__(self, tokenizer, args, file_path, lang): 132 | self.examples = [] 133 | data = [] 134 | with open(file_path) as f: 135 | for i, line in enumerate(f): 136 | line = line.strip() 137 | js = json.loads(line) 138 | data.append(js) 139 | 140 | for js in data: 141 | self.examples.append(convert_examples_to_features(js,tokenizer,args,lang)) 142 | 143 | for idx, example in enumerate(self.examples[:1]): 144 | logger.info("*** Example ***") 145 | logger.info("label: {}".format(example.label)) 146 | logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens])) 147 | logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids)))) 148 | 149 | self.label_examples={} 150 | for e in self.examples: 151 | if e.label not in self.label_examples: 152 | self.label_examples[e.label]=[] 153 | self.label_examples[e.label].append(e) 154 | 155 | def __len__(self): 156 | return len(self.examples) 157 | 158 | def __getitem__(self, i): 159 | return (torch.tensor(self.examples[i].code_ids),torch.tensor(self.examples[i].label)) 160 | 161 | 162 | 163 | def evaluate(args, model, tokenizer, file_name, candidate_file_name): 164 | query_dataset = TextDataset(tokenizer, args, file_name, args.query_lang) 165 | query_sampler = SequentialSampler(query_dataset) 166 | query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=args.eval_batch_size,num_workers=4) 167 | 168 | candidate_dataset = TextDataset(tokenizer, args, candidate_file_name, args.candidate_lang) 169 | candidate_sampler = SequentialSampler(candidate_dataset) 170 | candidate_dataloader = DataLoader(candidate_dataset, sampler=candidate_sampler, batch_size=args.eval_batch_size, num_workers=4) 171 | 172 | # Eval! 
    logger.info("***** Running evaluation *****")
    logger.info("  Num Query = %d", len(query_dataset))
    logger.info("  Num Candidate = %d", len(candidate_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)


    model.eval()
    query_vecs = []
    query_labels = []
    candidate_vecs = []
    candidate_labels = []
    # Obtain query vectors
    for batch in query_dataloader:
        code_inputs = batch[0].to(args.device)
        label = batch[1].to(args.device)
        with torch.no_grad():
            code_vec = model(code_inputs=code_inputs)
            query_vecs.append(code_vec.cpu().numpy())
            query_labels.append(label.cpu().numpy())

    # Obtain candidate vectors
    for batch in candidate_dataloader:
        code_inputs = batch[0].to(args.device)
        label = batch[1].to(args.device)
        with torch.no_grad():
            code_vec = model(code_inputs=code_inputs)
            candidate_vecs.append(code_vec.cpu().numpy())
            candidate_labels.append(label.cpu().numpy())

    model.train()

    # Calculate cosine score (vectors are L2-normalized, so a dot product is cosine similarity)
    query_vecs = np.concatenate(query_vecs, 0)
    candidate_vecs = np.concatenate(candidate_vecs, 0)
    query_labels = list(np.concatenate(query_labels, 0))
    candidate_labels = list(np.concatenate(candidate_labels, 0))
    candidate_indexs = [candidate_dataset.examples[i].index for i in range(len(candidate_dataset))]
    query_indexs = [query_dataset.examples[i].index for i in range(len(query_dataset))]
    scores = np.matmul(query_vecs, candidate_vecs.T)

    # Calculate MAP score
    sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:, ::-1]
    MAP = []
    results = {}
    for i in range(scores.shape[0]):
        cont = 0
        label = int(query_labels[i])
        query_index = query_indexs[i]
        results[query_index] = [label, candidate_labels[sort_ids[i][0]], candidate_indexs[sort_ids[i][0]]]
        Avep = []
        for j, index in enumerate(list(sort_ids[i])):
            if query_index == candidate_indexs[index]:
                # skip the query itself when it appears in the candidate pool
                cont += 1
                continue
            if int(candidate_labels[index]) == label:
                # precision at this hit: correct-so-far / rank (rank excludes the skipped query)
                Avep.append((len(Avep)+1)/(j+1-cont))
        if len(Avep) != 0:
            MAP.append(sum(Avep)/len(Avep))

    result = {
        "eval_map": float(np.mean(MAP))
    }
    return result



def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--query_data_file", default=None, type=str, required=False,
                        help="Query data file (a jsonl file).")
    parser.add_argument("--candidate_data_file", default=None, type=str, required=False,
                        help="Candidate data file (a jsonl file).")
    parser.add_argument("--model_name_or_path", default=None, type=str,
                        help="The model checkpoint for weights initialization.")

    parser.add_argument("--query_lang", default=None, type=str, required=False,
                        help="Programming language of the queries.")
    parser.add_argument("--candidate_lang", default=None, type=str,
                        help="Programming language of the candidates.")


    parser.add_argument("--code_length", default=256, type=int,
                        help="Optional code input sequence length after tokenization.")
    parser.add_argument("--eval_batch_size", default=4, type=int,
                        help="Batch size for evaluation.")



    # parse arguments
    args = parser.parse_args()

    # set log
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
    # set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    logger.info("device: %s, n_gpu: %s", device, args.n_gpu)


    # build model
    tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    config = RobertaConfig.from_pretrained(args.model_name_or_path)
    model = RobertaModel.from_pretrained(args.model_name_or_path)


    model = Model(model)
    logger.info("Training/evaluation parameters %s", args)
    model.to(args.device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)


    result = evaluate(args, model, tokenizer, args.query_data_file, args.candidate_data_file)
    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(round(result[key]*100, 2)))



if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /UniXcoder/unixcoder.py: --------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

class UniXcoder(nn.Module):
    def __init__(self, model_name):
        """
        Build UniXcoder.

        Parameters:

        * `model_name`- huggingface model card name. e.g. microsoft/unixcoder-base
        """
        super(UniXcoder, self).__init__()
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.config = RobertaConfig.from_pretrained(model_name)
        self.config.is_decoder = True
        self.model = RobertaModel.from_pretrained(model_name, config=self.config)

        self.register_buffer("bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1, 1024, 1024))
        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
        self.lm_head.weight = self.model.embeddings.word_embeddings.weight
        self.lsm = nn.LogSoftmax(dim=-1)

        self.tokenizer.add_tokens(["<mask0>"], special_tokens=True)

    def tokenize(self, inputs, mode="<encoder-only>", max_length=512, padding=False):
        """
        Convert string to token ids

        Parameters:

        * `inputs`- list of input strings.
        * `max_length`- The maximum total source sequence length after tokenization.
        * `padding`- whether to pad source sequence length to max_length.
        * `mode`- which mode the sequence will use. i.e. <encoder-only>, <decoder-only>, <encoder-decoder>
        """
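        # Layouts produced below (sketch):
        #   <encoder-only>:    [CLS] <encoder-only> [SEP] tokens ... [SEP]   (keeps the first tokens)
        #   <decoder-only>:    [CLS] <decoder-only> [SEP] tokens ...         (keeps the *last* tokens, no trailing [SEP])
        #   <encoder-decoder>: [CLS] <encoder-decoder> [SEP] tokens ... [SEP]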
        assert mode in ["<encoder-only>", "<decoder-only>", "<encoder-decoder>"]

        tokenizer = self.tokenizer

        tokens_ids = []
        for x in inputs:
            tokens = tokenizer.tokenize(x)
            if mode == "<encoder-only>":
                tokens = tokens[:max_length-4]
                tokens = [tokenizer.cls_token, mode, tokenizer.sep_token] + tokens + [tokenizer.sep_token]
            elif mode == "<decoder-only>":
                tokens = tokens[-(max_length-3):]
                tokens = [tokenizer.cls_token, mode, tokenizer.sep_token] + tokens
            else:
                tokens = tokens[:max_length-5]
                tokens = [tokenizer.cls_token, mode, tokenizer.sep_token] + tokens + [tokenizer.sep_token]

            tokens_id = tokenizer.convert_tokens_to_ids(tokens)
            if padding:
                tokens_id = tokens_id + [self.config.pad_token_id] * (max_length-len(tokens_id))
            tokens_ids.append(tokens_id)
        return tokens_ids

    def decode(self, source_ids):
        """ Convert token ids to string """
        predictions = []
        for x in source_ids:
            prediction = []
            for y in x:
                t = y.cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = self.tokenizer.decode(t, clean_up_tokenization_spaces=False)
                prediction.append(text)
            predictions.append(prediction)
        return predictions

    def forward(self, source_ids):
        """ Obtain token embeddings and sentence embeddings """
        mask = source_ids.ne(self.config.pad_token_id)
        token_embeddings = self.model(source_ids, attention_mask=mask.unsqueeze(1) * mask.unsqueeze(2))[0]
        # sentence embedding = mean of token embeddings over non-padding positions
        sentence_embeddings = (token_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1)
        return token_embeddings, sentence_embeddings

    def generate(self, source_ids, decoder_only=True, eos_id=None, beam_size=5, max_length=64):
        """ Generate sequence given context (source_ids) """

        # Set encoder attention mask: bidirectional for <encoder-decoder>, unidirectional for <decoder-only>
        if decoder_only:
            mask = self.bias[:, :source_ids.size(-1), :source_ids.size(-1)]
        else:
            mask = source_ids.ne(self.config.pad_token_id)
            mask = mask.unsqueeze(1) * mask.unsqueeze(2)

        if eos_id is None:
            eos_id = self.config.eos_token_id

        device = source_ids.device

        # Decoding using beam search
        preds = []
        zero = torch.LongTensor(1).fill_(0).to(device)
        source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
        length = source_ids.size(-1)
        encoder_output = self.model(source_ids, attention_mask=mask)
        for i in range(source_ids.shape[0]):
            context = [[x[i:i+1, :, :source_len[i]].repeat(beam_size, 1, 1, 1) for x in y]
                       for y in encoder_output.past_key_values]
            beam = Beam(beam_size, eos_id, device)
            input_ids = beam.getCurrentState().clone()
            context_ids = source_ids[i:i+1, :source_len[i]].repeat(beam_size, 1)
            out = encoder_output.last_hidden_state[i:i+1, :source_len[i]].repeat(beam_size, 1, 1)
            for _ in range(max_length):
                if beam.done():
                    break
                if _ == 0:
                    hidden_states = out[:, -1, :]
                    out = self.lsm(self.lm_head(hidden_states)).data
                    beam.advance(out)
                    input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
                    input_ids = beam.getCurrentState().clone()
                else:
                    length = context_ids.size(-1)+input_ids.size(-1)
                    out = self.model(input_ids, attention_mask=self.bias[:, context_ids.size(-1):length, :length],
                                     past_key_values=context).last_hidden_state
                    hidden_states = out[:, -1, :]
                    out = self.lsm(self.lm_head(hidden_states)).data
                    beam.advance(out)
input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 131 | input_ids = torch.cat((input_ids,beam.getCurrentState().clone()),-1) 132 | hyp = beam.getHyp(beam.getFinal()) 133 | pred = beam.buildTargetTokens(hyp)[:beam_size] 134 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(max_length-len(p))).view(1,-1) for p in pred] 135 | preds.append(torch.cat(pred,0).unsqueeze(0)) 136 | 137 | preds = torch.cat(preds,0) 138 | 139 | return preds 140 | 141 | 142 | 143 | class Beam(object): 144 | def __init__(self, size, eos, device): 145 | self.size = size 146 | self.device = device 147 | # The score for each translation on the beam. 148 | self.scores = torch.FloatTensor(size).zero_().to(device) 149 | # The backpointers at each time-step. 150 | self.prevKs = [] 151 | # The outputs at each time-step. 152 | self.nextYs = [torch.LongTensor(size).fill_(0).to(device)] 153 | # Has EOS topped the beam yet. 154 | self._eos = eos 155 | self.eosTop = False 156 | # Time and k pair for finished. 157 | self.finished = [] 158 | 159 | def getCurrentState(self): 160 | "Get the outputs for the current timestep." 161 | batch = self.nextYs[-1].view(-1, 1) 162 | return batch 163 | 164 | def getCurrentOrigin(self): 165 | "Get the backpointers for the current timestep." 166 | return self.prevKs[-1] 167 | 168 | def advance(self, wordLk): 169 | """ 170 | Given prob over words for every last beam `wordLk` and attention 171 | `attnOut`: Compute and update the beam search. 172 | 173 | Parameters: 174 | 175 | * `wordLk`- probs of advancing from the last step (K x words) 176 | * `attnOut`- attention at the last step 177 | 178 | Returns: True if beam search is complete. 179 | """ 180 | numWords = wordLk.size(1) 181 | 182 | # Sum the previous scores. 183 | if len(self.prevKs) > 0: 184 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 185 | 186 | # Don't let EOS have children. 187 | for i in range(self.nextYs[-1].size(0)): 188 | if self.nextYs[-1][i] == self._eos: 189 | beamLk[i] = -1e20 190 | else: 191 | beamLk = wordLk[0] 192 | flatBeamLk = beamLk.view(-1) 193 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 194 | 195 | self.scores = bestScores 196 | 197 | # bestScoresId is flattened beam x word array, so calculate which 198 | # word and beam each score came from 199 | prevK = bestScoresId // numWords 200 | self.prevKs.append(prevK) 201 | self.nextYs.append((bestScoresId - prevK * numWords)) 202 | 203 | 204 | for i in range(self.nextYs[-1].size(0)): 205 | if self.nextYs[-1][i] == self._eos: 206 | s = self.scores[i] 207 | self.finished.append((s, len(self.nextYs) - 1, i)) 208 | 209 | # End condition is when top-of-beam is EOS and no global score. 210 | if self.nextYs[-1][0] == self._eos: 211 | self.eosTop = True 212 | 213 | def done(self): 214 | return self.eosTop and len(self.finished) >= self.size 215 | 216 | def getFinal(self): 217 | if len(self.finished) == 0: 218 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 219 | self.finished.sort(key=lambda a: -a[0]) 220 | if len(self.finished) != self.size: 221 | unfinished=[] 222 | for i in range(self.nextYs[-1].size(0)): 223 | if self.nextYs[-1][i] != self._eos: 224 | s = self.scores[i] 225 | unfinished.append((s, len(self.nextYs) - 1, i)) 226 | unfinished.sort(key=lambda a: -a[0]) 227 | self.finished+=unfinished[:self.size-len(self.finished)] 228 | return self.finished[:self.size] 229 | 230 | def getHyp(self, beam_res): 231 | """ 232 | Walk back to construct the full hypothesis. 
233 | """ 234 | hyps=[] 235 | for _,timestep, k in beam_res: 236 | hyp = [] 237 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 238 | hyp.append(self.nextYs[j+1][k]) 239 | k = self.prevKs[j][k] 240 | hyps.append(hyp[::-1]) 241 | return hyps 242 | 243 | def buildTargetTokens(self, preds): 244 | sentence=[] 245 | for pred in preds: 246 | tokens = [] 247 | for tok in pred: 248 | if tok==self._eos: 249 | break 250 | tokens.append(tok) 251 | sentence.append(tokens) 252 | return sentence 253 | 254 | --------------------------------------------------------------------------------