","").strip() for x in predictions[0]])
179 | ```
180 |
181 | ```python
182 | ['Write JSON to file', 'Write json to file', 'Write a json file']
183 | ```
184 |
185 | ## 5. Fine-tuning
186 |
187 | For the downstream tasks reported in the paper, please refer to the corresponding folders under [downstream-tasks](https://github.com/guoday/CodeBERT/tree/master/UniXcoder/downstream-tasks).
188 |
189 |
190 |
191 | # Reference
192 | If you use this code or CodeXGLUE, please consider citing us.
193 |
194 |     @article{guo2022unixcoder,
195 |       title={UniXcoder: Unified Cross-Modal Pre-training for Code Representation},
196 |       author={Guo, Daya and Lu, Shuai and Duan, Nan and Wang, Yanlin and Zhou, Ming and Yin, Jian},
197 |       journal={arXiv preprint arXiv:2203.03850},
198 |       year={2022}
199 |     }
200 |
201 |
202 |
203 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/clone-detection/BCB/README.md:
--------------------------------------------------------------------------------
1 | # Clone Detection (BigCloneDetection)
2 |
3 | ## Data Download
4 |
5 | ```bash
6 | mkdir dataset
7 | cd dataset
8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/data.jsonl
9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/test.txt
10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/train.txt
11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Clone-detection-BigCloneBench/dataset/valid.txt
12 | cd ..
13 |
14 | ```
15 |
16 | ## Dependency
17 |
18 | - pip install torch
19 | - pip install transformers
20 |
21 | ## Fine-Tune
22 |
23 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper.
24 |
25 | ```shell
26 | # Training
27 | python run.py \
28 | --output_dir saved_models \
29 | --model_name_or_path microsoft/unixcoder-base \
30 | --do_train \
31 | --train_data_file dataset/train.txt \
32 | --eval_data_file dataset/valid.txt \
33 | --num_train_epochs 1 \
34 | --block_size 512 \
35 | --train_batch_size 16 \
36 | --eval_batch_size 32 \
37 | --learning_rate 5e-5 \
38 | --max_grad_norm 1.0 \
39 | --seed 123456
40 |
41 | # Evaluating
42 | python run.py \
43 | --output_dir saved_models \
44 | --model_name_or_path microsoft/unixcoder-base \
45 | --do_test \
46 | --test_data_file dataset/test.txt \
47 | --num_train_epochs 1 \
48 | --block_size 512 \
49 | --train_batch_size 16 \
50 | --eval_batch_size 32 \
51 | --learning_rate 5e-5 \
52 | --max_grad_norm 1.0 \
53 | --seed 123456
54 | ```
55 |
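For reference, the downloaded files fit together as follows: `data.jsonl` holds one Java function per line, and each line of `train.txt`/`valid.txt`/`test.txt` references a pair of functions plus a clone label. A minimal loader sketch, assuming the CodeXGLUE format (keys `idx` and `func` in `data.jsonl`, tab-separated pair files):

```python
import json

# Map each function index to its source code (assumes CodeXGLUE keys "idx" and "func")
code_by_idx = {}
with open("dataset/data.jsonl") as f:
    for line in f:
        js = json.loads(line)
        code_by_idx[js["idx"]] = js["func"]

# Each pair line: "idx1<TAB>idx2<TAB>label", where label 1 = clone, 0 = not a clone
with open("dataset/train.txt") as f:
    for line in f:
        idx1, idx2, label = line.strip().split("\t")
        pair = (code_by_idx[idx1], code_by_idx[idx2], int(label))  # model input
```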
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/clone-detection/BCB/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | import torch
4 | import torch.nn as nn
6 | from torch.autograd import Variable
7 | import copy
8 | import torch.nn.functional as F
9 | from torch.nn import CrossEntropyLoss, MSELoss
10 |
11 | class RobertaClassificationHead(nn.Module):
12 | """Head for sentence-level classification tasks."""
13 |
14 | def __init__(self, config):
15 | super().__init__()
16 | self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
17 | self.dropout = nn.Dropout(0.1)
18 | self.out_proj = nn.Linear(config.hidden_size, 2)
19 |
20 | def forward(self, x):
21 | x = x.reshape(-1,x.size(-1)*2)
22 | x = self.dropout(x)
23 | x = self.dense(x)
24 | x = torch.tanh(x)
25 | x = self.dropout(x)
26 | x = self.out_proj(x)
27 | return x
28 |
29 | class Model(nn.Module):
30 | def __init__(self, encoder,config,tokenizer,args):
31 | super(Model, self).__init__()
32 | self.encoder = encoder
33 | self.config = config
34 | self.tokenizer = tokenizer
35 | self.classifier = RobertaClassificationHead(config)
36 | self.args = args
37 |
38 |
39 |     def forward(self, input_ids=None,labels=None): 
40 |         input_ids = input_ids.view(-1,self.args.block_size)  # flatten code pairs: (bs, 2, block) -> (2*bs, block)
41 |         outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0]  # token id 1 is the pad id
42 |         outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens
43 |         outputs = outputs.reshape(-1,2,outputs.size(-1))  # regroup the two snippets of each pair
44 |         outputs = torch.nn.functional.normalize(outputs, p=2, dim=-1)  # L2-normalize embeddings
45 |         cos_sim = (outputs[:,0]*outputs[:,1]).sum(-1)  # cosine similarity of each pair
46 | 
47 |         if labels is not None:
48 |             loss = ((cos_sim-labels.float())**2).mean()  # MSE between similarity and the 0/1 clone label
49 |             return loss,cos_sim
50 |         else:
51 |             return cos_sim
52 |
53 |
54 |
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/clone-detection/BCB/run.sh:
--------------------------------------------------------------------------------
1 | model=../../../../pretrained-model/UniXcoder-base
2 | mkdir saved_models
3 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \
4 | --output_dir=./saved_models \
5 | --model_type=roberta \
6 | --model_name_or_path=$model \
7 | --do_train \
8 | --train_data_file=../../dataset/train.txt \
9 | --eval_data_file=../../dataset/valid.txt \
10 | --test_data_file=../../dataset/test.txt \
11 | --epoch 1 \
12 | --block_size 512 \
13 | --train_batch_size 16 \
14 | --eval_batch_size 32 \
15 | --learning_rate 5e-5 \
16 | --max_grad_norm 1.0 \
17 | --evaluate_during_training \
18 | --seed 123456 2>&1| tee saved_models/train.log
19 |
20 | CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py \
21 | --output_dir=./saved_models \
22 | --model_type=roberta \
23 | --model_name_or_path=$model \
24 | --do_eval \
25 | --do_test \
26 | --train_data_file=../../dataset/train.txt \
27 | --eval_data_file=../../dataset/valid.txt \
28 | --test_data_file=../../dataset/test.txt \
29 | --epoch 1 \
30 | --block_size 512 \
31 | --train_batch_size 16 \
32 | --eval_batch_size 32 \
33 | --learning_rate 5e-5 \
34 | --max_grad_norm 1.0 \
35 | --evaluate_during_training \
36 | --seed 123456 2>&1| tee saved_models/test.log
37 |
38 | python ../evaluator/evaluator.py -a ../../dataset/test.txt -p saved_models/predictions.txt 2>&1| tee saved_models/score.log
39 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/clone-detection/POJ-104/README.md:
--------------------------------------------------------------------------------
1 | # Clone Detection (POJ-104)
2 |
3 | ## Data Download
4 |
5 | ```bash
6 | cd dataset
7 | pip install gdown
8 | gdown https://drive.google.com/uc?id=0B2i-vWnOu7MxVlJwQXN6eVNONUU
9 | tar -xvf programs.tar.gz
10 | python preprocess.py
11 | cd ..
12 | ```
13 |
14 | ## Dependency
15 |
16 | - pip install torch
17 | - pip install transformers
18 |
19 | ## Fine-Tune
20 |
21 | Here we provide fine-tune settings for clone detection, whose results are reported in the paper.
22 |
23 | ```shell
24 | # Training
25 | python run.py \
26 | --output_dir saved_models \
27 | --model_name_or_path microsoft/unixcoder-base \
28 | --do_train \
29 | --train_data_file dataset/train.jsonl \
30 | --eval_data_file dataset/valid.jsonl \
31 | --test_data_file dataset/test.jsonl \
32 | --num_train_epochs 2 \
33 | --block_size 400 \
34 | --train_batch_size 8 \
35 | --eval_batch_size 16 \
36 | --learning_rate 2e-5 \
37 | --max_grad_norm 1.0 \
38 | --seed 123456
39 |
40 | # Evaluating
41 | python run.py \
42 | --output_dir saved_models \
43 | --model_name_or_path microsoft/unixcoder-base \
44 | --do_eval \
45 | --do_test \
46 | --eval_data_file dataset/valid.jsonl \
47 | --test_data_file dataset/test.jsonl \
48 | --num_train_epochs 2 \
49 | --block_size 400 \
50 | --train_batch_size 8 \
51 | --eval_batch_size 16 \
52 | --learning_rate 2e-5 \
53 | --max_grad_norm 1.0 \
54 | --seed 123456
55 | ```
56 |
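Each line of the resulting `train.jsonl`/`valid.jsonl`/`test.jsonl` is a JSON object with `label` (the POJ problem id), `index` (a global example id), and `code` (the program text); see `dataset/preprocess.py` below. A minimal sanity-check sketch:

```python
import json

# Inspect the first record of the preprocessed training split
with open("dataset/train.jsonl") as f:
    js = json.loads(next(f))
print(js["label"], js["index"], js["code"][:80])
```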
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/clone-detection/POJ-104/dataset/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | import os
4 | import json
5 | from tqdm import tqdm
6 | def files(path):
7 | g = os.walk(path)
8 | file=[]
9 | for path,dir_list,file_list in g:
10 | for file_name in file_list:
11 | file.append(os.path.join(path, file_name))
12 | return file
13 |
14 | cont=0
15 | with open("train.jsonl",'w') as f:
16 |     for i in tqdm(range(1,65),total=64):  # classes 1-64 form the training split
17 | items=files("ProgramData/{}".format(i))
18 | for item in items:
19 | js={}
20 | js['label']=item.split('/')[1]
21 | js['index']=str(cont)
22 | js['code']=open(item,encoding='latin-1').read()
23 | f.write(json.dumps(js)+'\n')
24 | cont+=1
25 |
26 | with open("valid.jsonl",'w') as f:
27 |     for i in tqdm(range(65,81),total=16):  # classes 65-80 form the validation split
28 | items=files("ProgramData/{}".format(i))
29 | for item in items:
30 | js={}
31 | js['label']=item.split('/')[1]
32 | js['index']=str(cont)
33 | js['code']=open(item,encoding='latin-1').read()
34 | f.write(json.dumps(js)+'\n')
35 | cont+=1
36 |
37 | with open("test.jsonl",'w') as f:
38 |     for i in tqdm(range(81,105),total=24):  # classes 81-104 form the test split (POJ-104 has 104 classes)
39 | items=files("ProgramData/{}".format(i))
40 | for item in items:
41 | js={}
42 | js['label']=item.split('/')[1]
43 | js['index']=str(cont)
44 | js['code']=open(item,encoding='latin-1').read()
45 | f.write(json.dumps(js)+'\n')
46 | cont+=1
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/clone-detection/POJ-104/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | import torch
4 | import torch.nn as nn
6 | from torch.autograd import Variable
7 | import copy
8 | import torch.nn.functional as F
9 | from torch.nn import CrossEntropyLoss, MSELoss
10 |
11 |
12 |
13 | class Model(nn.Module):
14 | def __init__(self, encoder,config,tokenizer,args):
15 | super(Model, self).__init__()
16 | self.encoder = encoder
17 | self.config=config
18 | self.tokenizer=tokenizer
19 | self.args=args
20 |
21 |
22 |     def forward(self, input_ids=None,p_input_ids=None,n_input_ids=None,labels=None): 
23 |         bs,_ = input_ids.size()
24 |         input_ids = torch.cat((input_ids,p_input_ids,n_input_ids),0)  # stack anchor, positive and negative examples
25 | 
26 |         outputs = self.encoder(input_ids,attention_mask=input_ids.ne(1))[0]  # token id 1 is the pad id
27 |         outputs = (outputs * input_ids.ne(1)[:,:,None]).sum(1)/input_ids.ne(1).sum(1)[:,None]  # mean-pool over non-pad tokens
28 |         outputs = torch.nn.functional.normalize(outputs, p=2, dim=1)  # L2-normalize embeddings
29 |         outputs = outputs.split(bs,0)  # (anchor, positive, negative) embeddings
30 | 
31 |         prob_1 = (outputs[0]*outputs[1]).sum(-1)*20  # anchor-positive similarity, scaled by 20 (inverse temperature)
32 |         prob_2 = (outputs[0]*outputs[2]).sum(-1)*20  # anchor-negative similarity
33 |         temp = torch.cat((outputs[0],outputs[1]),0)  # in-batch candidates serve as extra negatives
34 |         temp_labels = torch.cat((labels,labels),0)
35 |         prob_3 = torch.mm(outputs[0],temp.t())*20
36 |         mask = labels[:,None]==temp_labels[None,:]
37 |         prob_3 = prob_3*(1-mask.float())-1e9*mask.float()  # exclude candidates that share the anchor's label
38 | 
39 |         prob = torch.softmax(torch.cat((prob_1[:,None],prob_2[:,None],prob_3),-1),-1)
40 |         loss = torch.log(prob[:,0]+1e-10)  # InfoNCE-style objective: maximize probability of the positive
41 |         loss = -loss.mean()
42 |         return loss,outputs[0]
43 |
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-completion/README.md:
--------------------------------------------------------------------------------
1 | # Code Completion
2 |
3 | ## Dependency
4 |
5 | - pip install torch
6 | - pip install transformers
7 | - pip install javalang
8 |
9 | ## Data Download
10 |
11 | ```bash
12 | unzip dataset.zip
13 |
14 | cd dataset/javaCorpus/
15 | bash download.sh
16 | python preprocess.py --base_dir=token_completion --output_dir=./
17 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/javaCorpus/line_completion/test.json
18 |
19 | cd ../py150
20 | bash download.sh
21 | python preprocess.py --base_dir=py150_files --output_dir=./
22 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/CodeCompletion-line/dataset/py150/line_completion/test.json
23 |
24 | cd ../..
25 | ```
26 |
27 |
28 |
29 | ## Fine-Tune Setting
30 |
31 | Here we provide fine-tune settings for code completion, whose results are reported in the paper.
32 |
33 | #### JavaCorpus Dataset
34 |
35 | ```shell
36 | # Training
37 | python run.py \
38 | --do_train \
39 | --do_eval \
40 | --lang java \
41 | --model_name_or_path microsoft/unixcoder-base \
42 | --train_filename dataset/javaCorpus/train.txt \
43 | --dev_filename dataset/javaCorpus/dev.json \
44 | --output_dir saved_models/javaCorpus \
45 | --max_source_length 936 \
46 | --max_target_length 64 \
47 | --beam_size 5 \
48 | --train_batch_size 32 \
49 | --gradient_accumulation_steps 1 \
50 | --eval_batch_size 32 \
51 | --learning_rate 2e-5 \
52 | --num_train_epochs 10
53 |
54 | # Output predictions of test set
55 | python run.py \
56 | --do_test \
57 | --lang java \
58 | --model_name_or_path microsoft/unixcoder-base \
59 | --load_model_path saved_models/javaCorpus/checkpoint-best-acc/pytorch_model.bin \
60 | --test_filename dataset/javaCorpus/test.json \
61 | --output_dir saved_models/javaCorpus \
62 | --max_source_length 936 \
63 | --max_target_length 64 \
64 | --beam_size 5 \
65 | --eval_batch_size 32
66 | ```
67 |
68 | Prediction results on the test set are saved in ```saved_models/javaCorpus/predictions.txt```. To obtain the test set score, send the predictions to codexglue@microsoft.com.
69 |
70 |
71 | #### PY150 Dataset
72 |
73 | ```shell
74 | # Training
75 | python run.py \
76 | --do_train \
77 | --do_eval \
78 | --lang python \
79 | --model_name_or_path microsoft/unixcoder-base \
80 | --train_filename dataset/py150/train.txt \
81 | --dev_filename dataset/py150/dev.json \
82 | --output_dir saved_models/py150 \
83 | --max_source_length 936 \
84 | --max_target_length 64 \
85 | --beam_size 5 \
86 | --train_batch_size 32 \
87 | --gradient_accumulation_steps 1 \
88 | --eval_batch_size 32 \
89 | --learning_rate 2e-4 \
90 | --num_train_epochs 10
91 |
92 | # Output predictions of test set
93 | python run.py \
94 | --do_test \
95 | --lang python \
96 | --model_name_or_path microsoft/unixcoder-base \
97 | --load_model_path saved_models/py150/checkpoint-best-acc/pytorch_model.bin \
98 | --test_filename dataset/py150/test.json \
99 | --output_dir saved_models/py150 \
100 | --max_source_length 936 \
101 | --max_target_length 64 \
102 | --beam_size 5 \
103 | --eval_batch_size 32
104 | ```
105 |
106 | Prediction results on the test set are saved in ```saved_models/py150/predictions.txt```. To obtain the test set score, send the predictions to codexglue@microsoft.com.
107 |
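Since the test-set answers are withheld (hence the e-mail submission), a local sanity check is to point `--test_filename` at the dev split instead and score the resulting `predictions.txt` against it. A hedged sketch, assuming dev records follow the `{"input": ..., "gt": ...}` line-completion format, using `difflib` as an approximation of the official edit-similarity metric:

```python
import difflib
import json

def edit_similarity(a: str, b: str) -> float:
    # Character-level similarity in [0, 1]; approximates the official edit-similarity metric
    return difflib.SequenceMatcher(None, a, b).ratio()

with open("dataset/javaCorpus/dev.json") as f:
    gts = [json.loads(line)["gt"].strip() for line in f]
with open("saved_models/javaCorpus/predictions.txt") as f:
    preds = [line.strip() for line in f]

em = sum(p == g for p, g in zip(preds, gts)) / len(gts)
es = sum(edit_similarity(p, g) for p, g in zip(preds, gts)) / len(gts)
print(f"Exact match: {em:.2%}  Edit similarity: {es:.2%}")
```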
108 |
109 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-completion/dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guoday/CodeBERT/d24ffecfaf2d411a87f53e41c331dc514465c51f/UniXcoder/downstream-tasks/code-completion/dataset.zip
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-completion/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import torch.nn as nn
7 | from torch.autograd import Variable
8 | import copy
9 | class Seq2Seq(nn.Module):
10 | """
11 |         Build Sequence-to-Sequence.
12 |
13 | Parameters:
14 |
15 | * `encoder`- encoder of seq2seq model. e.g. roberta
16 | * `decoder`- decoder of seq2seq model. e.g. transformer
17 | * `config`- configuration of encoder model.
18 | * `beam_size`- beam size for beam search.
19 | * `max_length`- max length of target for beam search.
20 | * `sos_id`- start of symbol ids in target for beam search.
21 | * `eos_id`- end of symbol ids in target for beam search.
22 | """
23 | def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None):
24 | super(Seq2Seq, self).__init__()
25 | self.encoder = encoder
26 | self.decoder=decoder
27 | self.config=config
28 | self.register_buffer(
29 | "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30 | )
31 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
32 | self.lm_head.weight=self.encoder.embeddings.word_embeddings.weight
33 | self.lsm = nn.LogSoftmax(dim=-1)
34 |
35 | self.beam_size=beam_size
36 | self.max_length=max_length
37 | self.sos_id=sos_id
38 | self.eos_id=eos_id
39 |
40 |
41 | def tie_weights(self):
42 | """ Make sure we are sharing the input and output embeddings.
43 | Export to TorchScript can't handle parameter sharing so we are cloning them instead.
44 | """
45 | self._tie_or_clone_weights(self.lm_head,
46 | self.encoder.embeddings.word_embeddings)
47 |
48 | def forward(self, source_ids,train=False):
49 |         max_length = source_ids.ne(1).sum(-1).max()  # longest non-pad sequence in the batch
50 |         source_ids = source_ids[:,:max_length]  # drop padding columns shared by all examples
51 | if train:
52 | length = source_ids.size(-1)
53 | out = self.decoder(source_ids,attention_mask=self.bias[:,:length,:length]).last_hidden_state
54 | lm_logits = self.lm_head(out)
55 | # Shift so that tokens < n predict n
56 | active_loss = source_ids[..., 1:].ne(1).view(-1)
57 | shift_logits = lm_logits[..., :-1, :].contiguous()
58 | shift_labels = source_ids[..., 1:].contiguous()
59 | # Flatten the tokens
60 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
61 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
62 | shift_labels.view(-1)[active_loss])
63 |
64 | outputs = loss,loss*active_loss.sum(),active_loss.sum()
65 | return outputs
66 | else:
67 | #Predict
68 | preds=[]
69 | zero=torch.cuda.LongTensor(1).fill_(0)
70 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71 | length = source_ids.size(-1)
72 | encoder_output = self.decoder(source_ids,attention_mask=self.bias[:,:length,:length])
73 | for i in range(source_ids.shape[0]):
74 | context=[[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
75 | for y in encoder_output.past_key_values]
76 | beam = Beam(self.beam_size,self.sos_id,self.eos_id)
77 | input_ids=beam.getCurrentState()
78 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
79 | out = encoder_output.last_hidden_state[i:i+1,:source_len[i]].repeat(self.beam_size,1,1)
80 | for _ in range(self.max_length):
81 | if beam.done():
82 | break
83 | if _ == 0:
84 | hidden_states=out[:,-1,:]
85 | out = self.lsm(self.lm_head(hidden_states)).data
86 | beam.advance(out)
87 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
88 | input_ids=beam.getCurrentState()
89 | else:
90 | length = context_ids.size(-1)+input_ids.size(-1)
91 | out = self.decoder(input_ids,attention_mask=self.bias[:,context_ids.size(-1):length,:length],
92 | past_key_values=context).last_hidden_state
93 | hidden_states=out[:,-1,:]
94 | out = self.lsm(self.lm_head(hidden_states)).data
95 | beam.advance(out)
96 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
97 | input_ids=torch.cat((input_ids,beam.getCurrentState()),-1)
98 | hyp= beam.getHyp(beam.getFinal())
99 | pred=beam.buildTargetTokens(hyp)[:self.beam_size]
100 | pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
101 | preds.append(torch.cat(pred,0).unsqueeze(0))
102 |
103 | preds=torch.cat(preds,0)
104 |
105 | return preds
106 |
107 |
108 |
109 | class Beam(object):
110 | def __init__(self, size, sos, eos):
111 | self.size = size
112 | self.tt = torch.cuda
113 | # The score for each translation on the beam.
114 | self.scores = self.tt.FloatTensor(size).zero_()
115 | # The backpointers at each time-step.
116 | self.prevKs = []
117 | # The outputs at each time-step.
118 | self.nextYs = [self.tt.LongTensor(size)
119 | .fill_(0)]
120 | self.nextYs[0][:] = sos
121 | # Has EOS topped the beam yet.
122 | self._eos = eos
123 | self.eosTop = False
124 | # Time and k pair for finished.
125 | self.finished = []
126 |
127 | def getCurrentState(self):
128 | "Get the outputs for the current timestep."
129 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
130 | return batch
131 |
132 | def getCurrentOrigin(self):
133 | "Get the backpointers for the current timestep."
134 | return self.prevKs[-1]
135 |
136 | def advance(self, wordLk):
137 | """
138 | Given prob over words for every last beam `wordLk` and attention
139 | `attnOut`: Compute and update the beam search.
140 |
141 | Parameters:
142 |
143 | * `wordLk`- probs of advancing from the last step (K x words)
144 | * `attnOut`- attention at the last step
145 |
146 | Returns: True if beam search is complete.
147 | """
148 | numWords = wordLk.size(1)
149 |
150 | # Sum the previous scores.
151 | if len(self.prevKs) > 0:
152 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
153 |
154 | # Don't let EOS have children.
155 | for i in range(self.nextYs[-1].size(0)):
156 | if self.nextYs[-1][i] in self._eos:
157 | beamLk[i] = -1e20
158 | else:
159 | beamLk = wordLk[0]
160 | flatBeamLk = beamLk.view(-1)
161 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
162 |
163 | self.scores = bestScores
164 |
165 | # bestScoresId is flattened beam x word array, so calculate which
166 | # word and beam each score came from
167 | prevK = bestScoresId // numWords
168 | self.prevKs.append(prevK)
169 | self.nextYs.append((bestScoresId - prevK * numWords))
170 |
171 |
172 | for i in range(self.nextYs[-1].size(0)):
173 | if self.nextYs[-1][i] in self._eos:
174 | s = self.scores[i]
175 | self.finished.append((s, len(self.nextYs) - 1, i))
176 |
177 | # End condition is when top-of-beam is EOS and no global score.
178 | if self.nextYs[-1][0] in self._eos:
179 | self.eosTop = True
180 |
181 | def done(self):
182 | return self.eosTop and len(self.finished) >=self.size
183 |
184 | def getFinal(self):
185 | if len(self.finished) == 0:
186 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
187 | self.finished.sort(key=lambda a: -a[0])
188 | if len(self.finished) != self.size:
189 | unfinished=[]
190 | for i in range(self.nextYs[-1].size(0)):
191 | if self.nextYs[-1][i] not in self._eos:
192 | s = self.scores[i]
193 | unfinished.append((s, len(self.nextYs) - 1, i))
194 | unfinished.sort(key=lambda a: -a[0])
195 | self.finished+=unfinished[:self.size-len(self.finished)]
196 | return self.finished[:self.size]
197 |
198 | def getHyp(self, beam_res):
199 | """
200 | Walk back to construct the full hypothesis.
201 | """
202 | hyps=[]
203 | for _,timestep, k in beam_res:
204 | hyp = []
205 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
206 | hyp.append(self.nextYs[j+1][k])
207 | k = self.prevKs[j][k]
208 | hyps.append(hyp[::-1])
209 | return hyps
210 |
211 | def buildTargetTokens(self, preds):
212 | sentence=[]
213 | for pred in preds:
214 | tokens = []
215 | for tok in pred:
216 | tokens.append(tok)
217 | if tok in self._eos:
218 | break
219 | sentence.append(tokens)
220 | return sentence
221 |
222 |
223 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-generation/README.md:
--------------------------------------------------------------------------------
1 | # Code Generation
2 |
3 | ## Data Download
4 |
5 | ```bash
6 | mkdir dataset
7 | cd dataset
8 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/train.json
9 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/dev.json
10 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/text-to-code/dataset/concode/test.json
11 | cd ..
12 | ```
13 |
14 | ## Dependency
15 |
16 | - pip install torch
17 | - pip install transformers
18 |
19 | ## Fine-Tune Setting
20 |
21 | Here we provide fine-tune settings for code generation, whose results are reported in the paper.
22 |
23 | ```shell
24 | # Training
25 | python run.py \
26 | --do_train \
27 | --do_eval \
28 | --model_name_or_path microsoft/unixcoder-base \
29 | --train_filename dataset/train.json \
30 | --dev_filename dataset/dev.json \
31 | --output_dir saved_models \
32 | --max_source_length 350 \
33 | --max_target_length 150 \
34 | --beam_size 3 \
35 | --train_batch_size 32 \
36 | --eval_batch_size 32 \
37 | --learning_rate 5e-5 \
38 | --gradient_accumulation_steps 1 \
39 | --num_train_epochs 30
40 |
41 | # Output results
42 | python run.py \
43 | --do_test \
44 | --model_name_or_path microsoft/unixcoder-base \
45 | --test_filename dataset/test.json \
46 | --output_dir saved_models \
47 | --max_source_length 350 \
48 | --max_target_length 150 \
49 | --beam_size 3 \
50 | --train_batch_size 32 \
51 | --eval_batch_size 32 \
52 | --learning_rate 5e-5 \
53 | --gradient_accumulation_steps 1 \
54 | --num_train_epochs 30
55 | ```
56 |
57 | Prediction results on the test set are saved in ```saved_models/predictions.txt```. To obtain the test set score, send the predictions to codexglue@microsoft.com.
58 |
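While the official test score requires the e-mail submission above, the bundled `bleu.py` exposes a `_bleu(ref_file, trans_file)` helper that scores any hypothesis file against a reference file you hold locally (one whitespace-tokenized sentence per line). A minimal sketch with hypothetical dev-split paths:

```python
from bleu import _bleu

# Hypothetical paths: references and predictions, one tokenized line each
score = _bleu("saved_models/dev.gold", "saved_models/dev.output")
print(f"Smoothed BLEU-4: {score}")
```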
59 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-generation/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Python implementation of BLEU and smooth-BLEU.
17 |
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 |
24 | import collections
25 | import math
26 |
27 |
28 | def _get_ngrams(segment, max_order):
29 | """Extracts all n-grams upto a given maximum order from an input segment.
30 |
31 | Args:
32 | segment: text segment from which n-grams will be extracted.
33 | max_order: maximum length in tokens of the n-grams returned by this
34 | methods.
35 |
36 | Returns:
37 | The Counter containing all n-grams upto max_order in segment
38 | with a count of how many times each n-gram occurred.
39 | """
40 | ngram_counts = collections.Counter()
41 | for order in range(1, max_order + 1):
42 | for i in range(0, len(segment) - order + 1):
43 | ngram = tuple(segment[i:i+order])
44 | ngram_counts[ngram] += 1
45 | return ngram_counts
46 |
47 |
48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49 | smooth=False):
50 | """Computes BLEU score of translated segments against one or more references.
51 |
52 | Args:
53 | reference_corpus: list of lists of references for each translation. Each
54 | reference should be tokenized into a list of tokens.
55 | translation_corpus: list of translations to score. Each translation
56 | should be tokenized into a list of tokens.
57 | max_order: Maximum n-gram order to use when computing BLEU score.
58 | smooth: Whether or not to apply Lin et al. 2004 smoothing.
59 |
60 | Returns:
61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
62 | precisions and brevity penalty.
63 | """
64 | matches_by_order = [0] * max_order
65 | possible_matches_by_order = [0] * max_order
66 | reference_length = 0
67 | translation_length = 0
68 | for (references, translation) in zip(reference_corpus,
69 | translation_corpus):
70 | reference_length += min(len(r) for r in references)
71 | translation_length += len(translation)
72 |
73 | merged_ref_ngram_counts = collections.Counter()
74 | for reference in references:
75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76 | translation_ngram_counts = _get_ngrams(translation, max_order)
77 | overlap = translation_ngram_counts & merged_ref_ngram_counts
78 | for ngram in overlap:
79 | matches_by_order[len(ngram)-1] += overlap[ngram]
80 | for order in range(1, max_order+1):
81 | possible_matches = len(translation) - order + 1
82 | if possible_matches > 0:
83 | possible_matches_by_order[order-1] += possible_matches
84 |
85 | precisions = [0] * max_order
86 | for i in range(0, max_order):
87 | if smooth:
88 | precisions[i] = ((matches_by_order[i] + 1.) /
89 | (possible_matches_by_order[i] + 1.))
90 | else:
91 | if possible_matches_by_order[i] > 0:
92 | precisions[i] = (float(matches_by_order[i]) /
93 | possible_matches_by_order[i])
94 | else:
95 | precisions[i] = 0.0
96 |
97 | if min(precisions) > 0:
98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99 | geo_mean = math.exp(p_log_sum)
100 | else:
101 | geo_mean = 0
102 |
103 | ratio = float(translation_length) / reference_length
104 |
105 | if ratio > 1.0:
106 | bp = 1.
107 | else:
108 | bp = math.exp(1 - 1. / ratio)
109 |
110 | bleu = geo_mean * bp
111 |
112 | return (bleu, precisions, bp, ratio, translation_length, reference_length)
113 |
114 |
115 | def _bleu(ref_file, trans_file, subword_option=None):
116 | max_order = 4
117 | smooth = True
118 | ref_files = [ref_file]
119 | reference_text = []
120 | for reference_filename in ref_files:
121 | with open(reference_filename) as fh:
122 | reference_text.append(fh.readlines())
123 | per_segment_references = []
124 | for references in zip(*reference_text):
125 | reference_list = []
126 | for reference in references:
127 | reference_list.append(reference.strip().split())
128 | per_segment_references.append(reference_list)
129 | translations = []
130 | with open(trans_file) as fh:
131 | for line in fh:
132 | translations.append(line.strip().split())
133 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134 | return round(100 * bleu_score,2)
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-generation/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import torch.nn as nn
7 | from torch.autograd import Variable
8 | import copy
9 | class Seq2Seq(nn.Module):
10 | """
11 |         Build Sequence-to-Sequence.
12 |
13 | Parameters:
14 |
15 | * `encoder`- encoder of seq2seq model. e.g. roberta
16 | * `decoder`- decoder of seq2seq model. e.g. transformer
17 | * `config`- configuration of encoder model.
18 | * `beam_size`- beam size for beam search.
19 | * `max_length`- max length of target for beam search.
20 | * `sos_id`- start of symbol ids in target for beam search.
21 | * `eos_id`- end of symbol ids in target for beam search.
22 | """
23 | def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24 | super(Seq2Seq, self).__init__()
25 | self.encoder = encoder
26 | self.decoder=decoder
27 | self.config=config
28 | self.register_buffer(
29 | "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30 | )
31 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33 | self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34 | self.lsm = nn.LogSoftmax(dim=-1)
35 |
36 | self.beam_size = beam_size
37 | self.max_length = max_length
38 | self.sos_id = sos_id
39 | self.eos_id = eos_id
40 |
41 | def forward(self, source_ids, target_ids=None):
42 | if target_ids is None:
43 | return self.generate(source_ids)
44 |
45 |         mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]  # full (bidirectional) attention over non-pad source tokens
46 |         encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47 |         ids = torch.cat((source_ids,target_ids),-1)
48 |         mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()  # causal mask for the target positions
49 |         mask = mask & ids[:,None,:].ne(1)  # additionally mask out pad tokens
50 |
51 | out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52 | lm_logits = self.lm_head(out)
53 | # Shift so that tokens < n predict n
54 | active_loss = target_ids[..., 1:].ne(1).view(-1)
55 | shift_logits = lm_logits[..., :-1, :].contiguous()
56 | shift_labels = target_ids[..., 1:].contiguous()
57 | # Flatten the tokens
58 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60 | shift_labels.view(-1)[active_loss])
61 |
62 | outputs = loss,loss*active_loss.sum(),active_loss.sum()
63 | return outputs
64 |
65 | def generate(self, source_ids):
66 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68 | preds = []
69 | zero = torch.cuda.LongTensor(1).fill_(0)
70 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71 | for i in range(source_ids.shape[0]):
72 | context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73 | for y in encoder_output.past_key_values]
74 | beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75 | input_ids = beam.getCurrentState()
76 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77 | for _ in range(self.max_length):
78 | if beam.done():
79 | break
80 |
81 | ids = torch.cat((context_ids,input_ids),-1)
82 | mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83 | mask = mask & ids[:,None,:].ne(1)
84 | out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85 | hidden_states = out[:,-1,:]
86 | out = self.lsm(self.lm_head(hidden_states)).data
87 | beam.advance(out)
88 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89 | input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90 | hyp = beam.getHyp(beam.getFinal())
91 | pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93 | preds.append(torch.cat(pred,0).unsqueeze(0))
94 |
95 | preds = torch.cat(preds,0)
96 |
97 | return preds
98 |
99 |
100 |
101 | class Beam(object):
102 | def __init__(self, size,sos,eos):
103 | self.size = size
104 | self.tt = torch.cuda
105 | # The score for each translation on the beam.
106 | self.scores = self.tt.FloatTensor(size).zero_()
107 | # The backpointers at each time-step.
108 | self.prevKs = []
109 | # The outputs at each time-step.
110 | self.nextYs = [self.tt.LongTensor(size)
111 | .fill_(0)]
112 | self.nextYs[0][0] = sos
113 | # Has EOS topped the beam yet.
114 | self._eos = eos
115 | self.eosTop = False
116 | # Time and k pair for finished.
117 | self.finished = []
118 |
119 | def getCurrentState(self):
120 | "Get the outputs for the current timestep."
121 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122 | return batch
123 |
124 | def getCurrentOrigin(self):
125 | "Get the backpointers for the current timestep."
126 | return self.prevKs[-1]
127 |
128 | def advance(self, wordLk):
129 | """
130 | Given prob over words for every last beam `wordLk` and attention
131 | `attnOut`: Compute and update the beam search.
132 |
133 | Parameters:
134 |
135 | * `wordLk`- probs of advancing from the last step (K x words)
136 | * `attnOut`- attention at the last step
137 |
138 | Returns: True if beam search is complete.
139 | """
140 | numWords = wordLk.size(1)
141 |
142 | # Sum the previous scores.
143 | if len(self.prevKs) > 0:
144 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145 |
146 | # Don't let EOS have children.
147 | for i in range(self.nextYs[-1].size(0)):
148 | if self.nextYs[-1][i] == self._eos:
149 | beamLk[i] = -1e20
150 | else:
151 | beamLk = wordLk[0]
152 | flatBeamLk = beamLk.view(-1)
153 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154 |
155 | self.scores = bestScores
156 |
157 | # bestScoresId is flattened beam x word array, so calculate which
158 | # word and beam each score came from
159 | prevK = bestScoresId // numWords
160 | self.prevKs.append(prevK)
161 | self.nextYs.append((bestScoresId - prevK * numWords))
162 |
163 |
164 | for i in range(self.nextYs[-1].size(0)):
165 | if self.nextYs[-1][i] == self._eos:
166 | s = self.scores[i]
167 | self.finished.append((s, len(self.nextYs) - 1, i))
168 |
169 | # End condition is when top-of-beam is EOS and no global score.
170 | if self.nextYs[-1][0] == self._eos:
171 | self.eosTop = True
172 |
173 | def done(self):
174 | return self.eosTop and len(self.finished) >=self.size
175 |
176 | def getFinal(self):
177 | if len(self.finished) == 0:
178 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179 | self.finished.sort(key=lambda a: -a[0])
180 | if len(self.finished) != self.size:
181 | unfinished=[]
182 | for i in range(self.nextYs[-1].size(0)):
183 | if self.nextYs[-1][i] != self._eos:
184 | s = self.scores[i]
185 | unfinished.append((s, len(self.nextYs) - 1, i))
186 | unfinished.sort(key=lambda a: -a[0])
187 | self.finished+=unfinished[:self.size-len(self.finished)]
188 | return self.finished[:self.size]
189 |
190 | def getHyp(self, beam_res):
191 | """
192 | Walk back to construct the full hypothesis.
193 | """
194 | hyps=[]
195 | for _,timestep, k in beam_res:
196 | hyp = []
197 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198 | hyp.append(self.nextYs[j+1][k])
199 | k = self.prevKs[j][k]
200 | hyps.append(hyp[::-1])
201 | return hyps
202 |
203 | def buildTargetTokens(self, preds):
204 | sentence=[]
205 | for pred in preds:
206 | tokens = []
207 | for tok in pred:
208 | if tok==self._eos:
209 | break
210 | tokens.append(tok)
211 | sentence.append(tokens)
212 | return sentence
213 |
214 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-generation/run.sh:
--------------------------------------------------------------------------------
1 | pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html > log.txt 2>&1
2 | pip install scikit-learn scipy transformers tqdm >> log.txt 2>&1
3 | export CUDA_VISIBLE_DEVICES=15,12,13,14
4 | lang=java #programming language
5 | lr=5e-5
6 | batch_size=32
7 | accm_steps=1
8 | beam_size=3
9 | source_length=512
10 | target_length=150
11 | data_dir=../../dataset
12 | output_dir=saved_models/$lang
13 | train_file=$data_dir/train.json
14 | dev_file=$data_dir/dev.json
15 | epochs=30
16 | pretrained_model=../../../pretrained-model/UniXcoder-base/
17 |
18 | mkdir -p $output_dir
19 | python run.py \
20 | --do_train \
21 | --do_eval \
22 | --model_name_or_path $pretrained_model \
23 | --train_filename $train_file \
24 | --dev_filename $dev_file \
25 | --tokenizer_name roberta-base \
26 | --output_dir $output_dir \
27 | --max_source_length $source_length \
28 | --max_target_length $target_length \
29 | --beam_size $beam_size \
30 | --train_batch_size $batch_size \
31 | --eval_batch_size $batch_size \
32 | --learning_rate $lr \
33 | --gradient_accumulation_steps $accm_steps \
34 | --num_train_epochs $epochs 2>&1| tee $output_dir/train.log
35 |
36 |
37 | batch_size=64
38 | dev_file=$data_dir/dev.json
39 | test_file=$data_dir/test.json
40 | test_model=$output_dir/checkpoint-best-score/pytorch_model.bin #checkpoint for test
41 |
42 | python run.py \
43 | --do_test \
44 | --model_name_or_path $pretrained_model \
45 | --load_model_path $test_model \
46 | --dev_filename $dev_file \
47 | --test_filename $test_file \
48 | --output_dir $output_dir \
49 | --max_source_length $source_length \
50 | --max_target_length $target_length \
51 | --beam_size $beam_size \
52 | --gradient_accumulation_steps $accm_steps \
53 | --eval_batch_size $batch_size 2>&1| tee $output_dir/test.log
54 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-search/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Code Search
4 |
5 | ## Data Download
6 |
7 | #### 1. AdvTest dataset
8 |
9 | ```bash
10 | mkdir dataset && cd dataset
11 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Text-Code/NL-code-search-Adv/dataset.zip
12 | unzip dataset.zip && rm -r dataset.zip && mv dataset AdvTest && cd AdvTest
13 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
14 | unzip python.zip && python preprocess.py && rm -r python && rm -r *.pkl && rm python.zip
15 | cd ../..
16 | ```
17 |
18 | #### 2. CosQA dataset
19 |
20 | ```bash
21 | cd dataset
22 | mkdir cosqa && cd cosqa
23 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/code_idx_map.txt
24 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-dev-500.json
25 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-test-500.json
26 | wget https://github.com/Jun-jie-Huang/CoCLR/raw/main/data/search/cosqa-retrieval-train-19604.json
27 | cd ../..
28 | ```
29 |
30 | #### 3. CSN dataset
31 |
32 | ```bash
33 | cd dataset
34 | wget https://github.com/microsoft/CodeBERT/raw/master/GraphCodeBERT/codesearch/dataset.zip
35 | unzip dataset.zip && rm -r dataset.zip && mv dataset CSN && cd CSN
36 | bash run.sh
37 | cd ../..
38 | ```
39 |
40 |
41 |
42 | ## Dependency
43 |
44 | - pip install torch
45 | - pip install transformers
46 |
47 | ## Zero-Shot Setting
48 |
49 | We first provide scripts for zero-shot code search, where candidates are ranked by the cosine similarity between the mean-pooled UniXcoder hidden states of the natural language query and the code.
50 |
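Concretely, the zero-shot scoring mirrors `model.py` in this folder: mask out padding (token id 1), mean-pool the hidden states, L2-normalize, and take the dot product. A minimal sketch (omitting UniXcoder's mode-prefix tokens for brevity; see `unixcoder.py` in the repository root for the exact input format):

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
model = AutoModel.from_pretrained("microsoft/unixcoder-base")

@torch.no_grad()
def embed(text: str) -> torch.Tensor:
    ids = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)["input_ids"]
    mask = ids.ne(tokenizer.pad_token_id)
    hidden = model(ids, attention_mask=mask)[0]
    emb = (hidden * mask[:, :, None]).sum(1) / mask.sum(1)[:, None]  # mean-pool non-pad tokens
    return torch.nn.functional.normalize(emb, p=2, dim=1)           # unit length

query = embed("write json to file")
code = embed("def write(path, obj): json.dump(obj, open(path, 'w'))")
print((query * code).sum(-1).item())  # cosine similarity
```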
51 | #### 1. AdvTest dataset
52 |
53 | ```bash
54 | python run.py \
55 | --output_dir saved_models/AdvTest \
56 | --model_name_or_path microsoft/unixcoder-base \
57 | --do_zero_shot \
58 | --do_test \
59 | --test_data_file dataset/AdvTest/test.jsonl \
60 | --codebase_file dataset/AdvTest/test.jsonl \
61 | --num_train_epochs 2 \
62 | --code_length 256 \
63 | --nl_length 128 \
64 | --train_batch_size 64 \
65 | --eval_batch_size 64 \
66 | --learning_rate 2e-5 \
67 | --seed 123456
68 | ```
69 |
70 | #### 2. CosQA dataset
71 |
72 | ```bash
73 | python run.py \
74 | --output_dir saved_models/cosqa \
75 | --model_name_or_path microsoft/unixcoder-base \
76 | --do_zero_shot \
77 | --do_test \
78 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \
79 | --codebase_file dataset/cosqa/code_idx_map.txt \
80 | --num_train_epochs 10 \
81 | --code_length 256 \
82 | --nl_length 128 \
83 | --train_batch_size 64 \
84 | --eval_batch_size 64 \
85 | --learning_rate 2e-5 \
86 | --seed 123456
87 | ```
88 |
89 | #### 3. CSN dataset
90 |
91 | ```bash
92 | lang=python
93 | python run.py \
94 | --output_dir saved_models/CSN/$lang \
95 | --model_name_or_path microsoft/unixcoder-base \
96 | --do_zero_shot \
97 | --do_test \
98 | --test_data_file dataset/CSN/$lang/test.jsonl \
99 | --codebase_file dataset/CSN/$lang/codebase.jsonl \
100 | --num_train_epochs 10 \
101 | --code_length 256 \
102 | --nl_length 128 \
103 | --train_batch_size 64 \
104 | --eval_batch_size 64 \
105 | --learning_rate 2e-5 \
106 | --seed 123456
107 | ```
108 |
109 |
110 |
111 | ## Fine-Tune Setting
112 |
113 | Here we provide fine-tune settings for code search, whose results are reported in the paper.
114 |
115 | #### 1. AdvTest dataset
116 |
117 | ```shell
118 | # Training
119 | python run.py \
120 | --output_dir saved_models/AdvTest \
121 | --model_name_or_path microsoft/unixcoder-base \
122 | --do_train \
123 | --train_data_file dataset/AdvTest/train.jsonl \
124 | --eval_data_file dataset/AdvTest/valid.jsonl \
125 | --codebase_file dataset/AdvTest/valid.jsonl \
126 | --num_train_epochs 2 \
127 | --code_length 256 \
128 | --nl_length 128 \
129 | --train_batch_size 64 \
130 | --eval_batch_size 64 \
131 | --learning_rate 2e-5 \
132 | --seed 123456
133 |
134 | # Evaluating
135 | python run.py \
136 | --output_dir saved_models/AdvTest \
137 | --model_name_or_path microsoft/unixcoder-base \
138 | --do_test \
139 | --test_data_file dataset/AdvTest/test.jsonl \
140 | --codebase_file dataset/AdvTest/test.jsonl \
141 | --num_train_epochs 2 \
142 | --code_length 256 \
143 | --nl_length 128 \
144 | --train_batch_size 64 \
145 | --eval_batch_size 64 \
146 | --learning_rate 2e-5 \
147 | --seed 123456
148 | ```
149 | #### 2. CosQA dataset
150 |
151 | ```bash
152 | # Training
153 | python run.py \
154 | --output_dir saved_models/cosqa \
155 | --model_name_or_path microsoft/unixcoder-base \
156 | --do_train \
157 | --train_data_file dataset/cosqa/cosqa-retrieval-train-19604.json \
158 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \
159 | --codebase_file dataset/cosqa/code_idx_map.txt \
160 | --num_train_epochs 10 \
161 | --code_length 256 \
162 | --nl_length 128 \
163 | --train_batch_size 64 \
164 | --eval_batch_size 64 \
165 | --learning_rate 2e-5 \
166 | --seed 123456
167 |
168 | # Evaluating
169 | python run.py \
170 | --output_dir saved_models/cosqa \
171 | --model_name_or_path microsoft/unixcoder-base \
172 | --do_eval \
173 | --do_test \
174 | --eval_data_file dataset/cosqa/cosqa-retrieval-dev-500.json \
175 | --test_data_file dataset/cosqa/cosqa-retrieval-test-500.json \
176 | --codebase_file dataset/cosqa/code_idx_map.txt \
177 | --num_train_epochs 10 \
178 | --code_length 256 \
179 | --nl_length 128 \
180 | --train_batch_size 64 \
181 | --eval_batch_size 64 \
182 | --learning_rate 2e-5 \
183 | --seed 123456
184 | ```
185 |
186 | #### 3. CSN dataset
187 |
188 | ```bash
189 | # Training
190 | lang=python
191 | python run.py \
192 |     --output_dir saved_models/CSN/$lang \
193 | --model_name_or_path microsoft/unixcoder-base \
194 | --do_train \
195 | --train_data_file dataset/CSN/$lang/train.jsonl \
196 | --eval_data_file dataset/CSN/$lang/valid.jsonl \
197 | --codebase_file dataset/CSN/$lang/codebase.jsonl \
198 | --num_train_epochs 10 \
199 | --code_length 256 \
200 | --nl_length 128 \
201 | --train_batch_size 64 \
202 | --eval_batch_size 64 \
203 | --learning_rate 2e-5 \
204 | --seed 123456
205 |
206 | # Evaluating
207 | python run.py \
209 |     --output_dir saved_models/CSN/$lang \
209 | --model_name_or_path microsoft/unixcoder-base \
210 | --do_eval \
211 | --do_test \
212 | --eval_data_file dataset/CSN/$lang/valid.jsonl \
213 | --test_data_file dataset/CSN/$lang/test.jsonl \
214 | --codebase_file dataset/CSN/$lang/codebase.jsonl \
215 | --num_train_epochs 10 \
216 | --code_length 256 \
217 | --nl_length 128 \
218 | --train_batch_size 64 \
219 | --eval_batch_size 64 \
220 | --learning_rate 2e-5 \
221 | --seed 123456
222 |
223 | ```
224 |
225 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-search/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | import torch.nn as nn
4 | import torch
5 | class Model(nn.Module):
6 | def __init__(self, encoder):
7 | super(Model, self).__init__()
8 | self.encoder = encoder
9 |
10 |     def forward(self, code_inputs=None, nl_inputs=None): 
11 |         if code_inputs is not None:
12 |             outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0]  # token id 1 is the pad id
13 |             outputs = (outputs*code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(-1)[:,None]  # mean-pool over non-pad tokens
14 |             return torch.nn.functional.normalize(outputs, p=2, dim=1)  # unit length: dot product = cosine similarity
15 |         else:
16 |             outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0]
17 |             outputs = (outputs*nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(-1)[:,None]  # same pooling for the query
18 |             return torch.nn.functional.normalize(outputs, p=2, dim=1)
19 |
20 |
21 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-summarization/README.md:
--------------------------------------------------------------------------------
1 | # Code Summarization
2 |
3 | ## Data Download
4 |
5 | ```bash
6 | wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Text/code-to-text/dataset.zip
7 | unzip dataset.zip
8 | rm dataset.zip
9 | cd dataset
10 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
11 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip
12 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/ruby.zip
13 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/javascript.zip
14 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/go.zip
15 | wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/php.zip
16 |
17 | unzip python.zip
18 | unzip java.zip
19 | unzip ruby.zip
20 | unzip javascript.zip
21 | unzip go.zip
22 | unzip php.zip
23 | rm *.zip
24 | rm *.pkl
25 |
26 | python preprocess.py
27 | rm -r */final
28 | cd ..
29 | ```
30 |
31 | ## Dependency
32 |
33 | - pip install torch
34 | - pip install transformers
35 |
36 | ## Fine-Tune Setting
37 |
38 | Here we provide fine-tune settings for code summarization, whose results are reported in the paper.
39 |
40 | ```shell
41 | lang=python
42 |
43 | # Training
44 | python run.py \
45 | --do_train \
46 | --do_eval \
47 | --model_name_or_path microsoft/unixcoder-base \
48 | --train_filename dataset/$lang/train.jsonl \
49 | --dev_filename dataset/$lang/valid.jsonl \
50 | --output_dir saved_models/$lang \
51 | --max_source_length 256 \
52 | --max_target_length 128 \
53 | --beam_size 10 \
54 | --train_batch_size 48 \
55 | --eval_batch_size 48 \
56 | --learning_rate 5e-5 \
57 | --gradient_accumulation_steps 2 \
58 | --num_train_epochs 10
59 |
60 | # Evaluating
61 | python run.py \
62 | --do_test \
63 | --model_name_or_path microsoft/unixcoder-base \
64 | --test_filename dataset/$lang/test.jsonl \
65 | --output_dir saved_models/$lang \
66 | --max_source_length 256 \
67 | --max_target_length 128 \
68 | --beam_size 10 \
69 | --train_batch_size 48 \
70 | --eval_batch_size 48 \
71 | --learning_rate 5e-5 \
72 | --gradient_accumulation_steps 2 \
73 | --num_train_epochs 10
74 | ```
75 |
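Evaluation uses the bundled `bleu.py` (smoothed BLEU-4, see below). Given a gold file and predictions in the `id<TAB>summary` row format expected by its `computeMaps` helper, scoring looks like this sketch (file names are hypothetical):

```python
from bleu import bleuFromMaps, computeMaps

# Hypothetical paths; both hold one "id<TAB>summary" row per line
with open("saved_models/python/test.output") as f:
    predictions = f.readlines()

gold_map, prediction_map = computeMaps(predictions, "saved_models/python/test.gold")
print(f"Smoothed BLEU-4: {bleuFromMaps(gold_map, prediction_map)[0]:.2f}")
```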
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-summarization/bleu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | '''
4 | This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
5 | '''
6 |
7 | # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
8 |
9 | '''Provides:
10 |
11 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
12 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
13 | score_cooked(alltest, n=4): Score a list of cooked test sentences.
14 |
15 | score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
16 |
17 | The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
18 | '''
19 |
20 | import sys, math, re, xml.sax.saxutils
21 | import subprocess
22 | import os
23 |
24 | # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
25 | nonorm = 0
26 |
27 | preserve_case = False
28 | eff_ref_len = "shortest"
29 |
30 | normalize1 = [
31 |     ('<skipped>', ''), # strip "skipped" tags
32 | (r'-\n', ''), # strip end-of-line hyphenation and join lines
33 | (r'\n', ' '), # join lines
34 | # (r'(\d)\s+(?=\d)', r'\1'), # join digits
35 | ]
36 | normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
37 |
38 | normalize2 = [
39 | (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing
40 | (r'([^0-9])([\.,])',r'\1 \2 '), # tokenize period and comma unless preceded by a digit
41 | (r'([\.,])([^0-9])',r' \1 \2'), # tokenize period and comma unless followed by a digit
42 | (r'([0-9])(-)',r'\1 \2 ') # tokenize dash when preceded by a digit
43 | ]
44 | normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
45 |
46 | def normalize(s):
47 | '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.'''
48 | # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
49 | if (nonorm):
50 | return s.split()
51 | if type(s) is not str:
52 | s = " ".join(s)
53 | # language-independent part:
54 | for (pattern, replace) in normalize1:
55 | s = re.sub(pattern, replace, s)
56 |     s = xml.sax.saxutils.unescape(s, {'&quot;':'"'})
57 | # language-dependent part (assuming Western languages):
58 | s = " %s " % s
59 | if not preserve_case:
60 | s = s.lower() # this might not be identical to the original
61 | for (pattern, replace) in normalize2:
62 | s = re.sub(pattern, replace, s)
63 | return s.split()
64 |
65 | def count_ngrams(words, n=4):
66 | counts = {}
67 | for k in range(1,n+1):
68 | for i in range(len(words)-k+1):
69 | ngram = tuple(words[i:i+k])
70 | counts[ngram] = counts.get(ngram, 0)+1
71 | return counts
72 |
73 | def cook_refs(refs, n=4):
74 | '''Takes a list of reference sentences for a single segment
75 | and returns an object that encapsulates everything that BLEU
76 | needs to know about them.'''
77 |
78 | refs = [normalize(ref) for ref in refs]
79 | maxcounts = {}
80 | for ref in refs:
81 | counts = count_ngrams(ref, n)
82 | for (ngram,count) in counts.items():
83 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
84 | return ([len(ref) for ref in refs], maxcounts)
85 |
86 | def cook_test(test, item, n=4):
87 | '''Takes a test sentence and returns an object that
88 | encapsulates everything that BLEU needs to know about it.'''
89 | (reflens, refmaxcounts)=item
90 | test = normalize(test)
91 | result = {}
92 | result["testlen"] = len(test)
93 |
94 | # Calculate effective reference sentence length.
95 |
96 | if eff_ref_len == "shortest":
97 | result["reflen"] = min(reflens)
98 | elif eff_ref_len == "average":
99 | result["reflen"] = float(sum(reflens))/len(reflens)
100 | elif eff_ref_len == "closest":
101 | min_diff = None
102 | for reflen in reflens:
103 | if min_diff is None or abs(reflen-len(test)) < min_diff:
104 | min_diff = abs(reflen-len(test))
105 | result['reflen'] = reflen
106 |
107 | result["guess"] = [max(len(test)-k+1,0) for k in range(1,n+1)]
108 |
109 | result['correct'] = [0]*n
110 | counts = count_ngrams(test, n)
111 | for (ngram, count) in counts.items():
112 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
113 |
114 | return result
115 |
116 | def score_cooked(allcomps, n=4, ground=0, smooth=1):
117 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
118 | for comps in allcomps:
119 | for key in ['testlen','reflen']:
120 | totalcomps[key] += comps[key]
121 | for key in ['guess','correct']:
122 | for k in range(n):
123 | totalcomps[key][k] += comps[key][k]
124 | logbleu = 0.0
125 | all_bleus = []
126 | for k in range(n):
127 | correct = totalcomps['correct'][k]
128 | guess = totalcomps['guess'][k]
129 | addsmooth = 0
130 | if smooth == 1 and k > 0:
131 | addsmooth = 1
132 | logbleu += math.log(correct + addsmooth + sys.float_info.min)-math.log(guess + addsmooth+ sys.float_info.min)
133 | if guess == 0:
134 | all_bleus.append(-10000000)
135 | else:
136 | all_bleus.append(math.log(correct + sys.float_info.min)-math.log( guess ))
137 |
138 | logbleu /= float(n)
139 | all_bleus.insert(0, logbleu)
140 |
141 | brevPenalty = min(0,1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1))
142 | for i in range(len(all_bleus)):
143 | if i ==0:
144 | all_bleus[i] += brevPenalty
145 | all_bleus[i] = math.exp(all_bleus[i])
146 | return all_bleus
147 |
148 | def bleu(refs, candidate, ground=0, smooth=1):
149 | refs = cook_refs(refs)
150 | test = cook_test(candidate, refs)
151 | return score_cooked([test], ground=ground, smooth=smooth)
152 |
153 | def splitPuncts(line):
154 | return ' '.join(re.findall(r"[\w]+|[^\s\w]", line))
155 |
156 | def computeMaps(predictions, goldfile):
157 | predictionMap = {}
158 | goldMap = {}
159 | gf = open(goldfile, 'r')
160 |
161 | for row in predictions:
162 | cols = row.strip().split('\t')
163 | if len(cols) == 1:
164 | (rid, pred) = (cols[0], '')
165 | else:
166 | (rid, pred) = (cols[0], cols[1])
167 | predictionMap[rid] = [splitPuncts(pred.strip().lower())]
168 |
169 | for row in gf:
170 | (rid, pred) = row.split('\t')
171 | if rid in predictionMap: # Only insert if the id exists for the method
172 | if rid not in goldMap:
173 | goldMap[rid] = []
174 | goldMap[rid].append(splitPuncts(pred.strip().lower()))
175 |
176 | sys.stderr.write('Total: ' + str(len(goldMap)) + '\n')
177 | return (goldMap, predictionMap)
178 |
179 |
180 | #m1 is the reference map
181 | #m2 is the prediction map
182 | def bleuFromMaps(m1, m2):
183 | score = [0] * 5
184 | num = 0.0
185 |
186 | for key in m1:
187 | if key in m2:
188 | bl = bleu(m1[key], m2[key][0])
189 | score = [ score[i] + bl[i] for i in range(0, len(bl))]
190 | num += 1
191 | return [s * 100.0 / num for s in score]
192 |
193 | if __name__ == '__main__':
194 | reference_file = sys.argv[1]
195 | predictions = []
196 | for row in sys.stdin:
197 | predictions.append(row)
198 | (goldMap, predictionMap) = computeMaps(predictions, reference_file)
199 |     print(bleuFromMaps(goldMap, predictionMap)[0])
200 |
201 |
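For reference, a minimal sketch of driving this smoothed-BLEU script from Python rather than the shell (`predictions.txt` and `references.txt` are hypothetical file names; both hold `id<TAB>text` lines, and the shell equivalent is `python bleu.py references.txt < predictions.txt`):

```python
# Sketch: assumes this file is saved as bleu.py and importable; main() is guarded.
from bleu import computeMaps, bleuFromMaps

# Hypothetical inputs: one "id<TAB>text" line per example.
with open("predictions.txt") as f:
    predictions = f.readlines()

(goldMap, predictionMap) = computeMaps(predictions, "references.txt")
# Index 0 is the brevity-penalized corpus BLEU-4, already scaled to 0-100.
print(bleuFromMaps(goldMap, predictionMap)[0])
```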
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/code-summarization/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch
7 | from torch.autograd import Variable
8 | import copy
9 | class Seq2Seq(nn.Module):
10 | """
11 |     Build Sequence-to-Sequence.
12 |
13 | Parameters:
14 |
15 | * `encoder`- encoder of seq2seq model. e.g. roberta
16 | * `decoder`- decoder of seq2seq model. e.g. transformer
17 | * `config`- configuration of encoder model.
18 | * `beam_size`- beam size for beam search.
19 | * `max_length`- max length of target for beam search.
20 |         * `sos_id`- id of the start-of-sequence symbol in target for beam search.
21 |         * `eos_id`- id of the end-of-sequence symbol in target for beam search.
22 | """
23 | def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24 | super(Seq2Seq, self).__init__()
25 | self.encoder = encoder
26 | self.decoder=decoder
27 | self.config=config
28 | self.register_buffer(
29 | "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30 | )
31 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33 | self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34 | self.lsm = nn.LogSoftmax(dim=-1)
35 |
36 | self.beam_size = beam_size
37 | self.max_length = max_length
38 | self.sos_id = sos_id
39 | self.eos_id = eos_id
40 |
41 | def forward(self, source_ids, target_ids=None):
42 | if target_ids is None:
43 | return self.generate(source_ids)
44 |
45 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
46 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47 | ids = torch.cat((source_ids,target_ids),-1)
48 | mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
49 | mask = mask & ids[:,None,:].ne(1)
50 |
51 | out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52 | lm_logits = self.lm_head(out)
53 | # Shift so that tokens < n predict n
54 | active_loss = target_ids[..., 1:].ne(1).view(-1)
55 | shift_logits = lm_logits[..., :-1, :].contiguous()
56 | shift_labels = target_ids[..., 1:].contiguous()
57 | # Flatten the tokens
58 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60 | shift_labels.view(-1)[active_loss])
61 |
62 | outputs = loss,loss*active_loss.sum(),active_loss.sum()
63 | return outputs
64 |
65 | def generate(self, source_ids):
66 | mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67 | encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68 | preds = []
69 | zero = torch.cuda.LongTensor(1).fill_(0)
70 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71 | for i in range(source_ids.shape[0]):
72 | context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73 | for y in encoder_output.past_key_values]
74 | beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75 | input_ids = beam.getCurrentState()
76 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77 | for _ in range(self.max_length):
78 | if beam.done():
79 | break
80 |
81 | ids = torch.cat((context_ids,input_ids),-1)
82 | mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83 | mask = mask & ids[:,None,:].ne(1)
84 | out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85 | hidden_states = out[:,-1,:]
86 | out = self.lsm(self.lm_head(hidden_states)).data
87 | beam.advance(out)
88 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89 | input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90 | hyp = beam.getHyp(beam.getFinal())
91 | pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93 | preds.append(torch.cat(pred,0).unsqueeze(0))
94 |
95 | preds = torch.cat(preds,0)
96 |
97 | return preds
98 |
99 |
100 |
101 | class Beam(object):
102 | def __init__(self, size,sos,eos):
103 | self.size = size
104 | self.tt = torch.cuda
105 | # The score for each translation on the beam.
106 | self.scores = self.tt.FloatTensor(size).zero_()
107 | # The backpointers at each time-step.
108 | self.prevKs = []
109 | # The outputs at each time-step.
110 | self.nextYs = [self.tt.LongTensor(size)
111 | .fill_(0)]
112 | self.nextYs[0][0] = sos
113 | # Has EOS topped the beam yet.
114 | self._eos = eos
115 | self.eosTop = False
116 | # Time and k pair for finished.
117 | self.finished = []
118 |
119 | def getCurrentState(self):
120 | "Get the outputs for the current timestep."
121 | batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122 | return batch
123 |
124 | def getCurrentOrigin(self):
125 | "Get the backpointers for the current timestep."
126 | return self.prevKs[-1]
127 |
128 | def advance(self, wordLk):
129 | """
130 |         Given the log probabilities over words for every beam at the
131 |         last step (`wordLk`), advance the beam search by one step.
132 | 
133 |         Parameters:
134 | 
135 |         * `wordLk`- log probs of advancing from the last step (K x words),
136 |           as produced by a LogSoftmax over the vocabulary.
137 | 
138 |         Updates the beam state in place; use `done()` to check completion.
139 | """
140 | numWords = wordLk.size(1)
141 |
142 | # Sum the previous scores.
143 | if len(self.prevKs) > 0:
144 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145 |
146 | # Don't let EOS have children.
147 | for i in range(self.nextYs[-1].size(0)):
148 | if self.nextYs[-1][i] == self._eos:
149 | beamLk[i] = -1e20
150 | else:
151 | beamLk = wordLk[0]
152 | flatBeamLk = beamLk.view(-1)
153 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154 |
155 | self.scores = bestScores
156 |
157 | # bestScoresId is flattened beam x word array, so calculate which
158 | # word and beam each score came from
159 | prevK = bestScoresId // numWords
160 | self.prevKs.append(prevK)
161 | self.nextYs.append((bestScoresId - prevK * numWords))
162 |
163 |
164 | for i in range(self.nextYs[-1].size(0)):
165 | if self.nextYs[-1][i] == self._eos:
166 | s = self.scores[i]
167 | self.finished.append((s, len(self.nextYs) - 1, i))
168 |
169 | # End condition is when top-of-beam is EOS and no global score.
170 | if self.nextYs[-1][0] == self._eos:
171 | self.eosTop = True
172 |
173 | def done(self):
174 | return self.eosTop and len(self.finished) >=self.size
175 |
176 | def getFinal(self):
177 | if len(self.finished) == 0:
178 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179 | self.finished.sort(key=lambda a: -a[0])
180 | if len(self.finished) != self.size:
181 | unfinished=[]
182 | for i in range(self.nextYs[-1].size(0)):
183 | if self.nextYs[-1][i] != self._eos:
184 | s = self.scores[i]
185 | unfinished.append((s, len(self.nextYs) - 1, i))
186 | unfinished.sort(key=lambda a: -a[0])
187 | self.finished+=unfinished[:self.size-len(self.finished)]
188 | return self.finished[:self.size]
189 |
190 | def getHyp(self, beam_res):
191 | """
192 | Walk back to construct the full hypothesis.
193 | """
194 | hyps=[]
195 | for _,timestep, k in beam_res:
196 | hyp = []
197 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198 | hyp.append(self.nextYs[j+1][k])
199 | k = self.prevKs[j][k]
200 | hyps.append(hyp[::-1])
201 | return hyps
202 |
203 | def buildTargetTokens(self, preds):
204 | sentence=[]
205 | for pred in preds:
206 | tokens = []
207 | for tok in pred:
208 | if tok==self._eos:
209 | break
210 | tokens.append(tok)
211 | sentence.append(tokens)
212 | return sentence
213 |
214 |
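For orientation, a minimal sketch of how this class is typically wired up, based on its constructor signature above (assumptions: `microsoft/unixcoder-base` weights and a shared encoder/decoder network; note that `generate` allocates `torch.cuda` tensors, so beam search as written requires a GPU):

```python
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
from model import Seq2Seq  # the class defined in this file

config = RobertaConfig.from_pretrained("microsoft/unixcoder-base")
config.is_decoder = True  # expose past_key_values for the decoding pass
tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")
encoder = RobertaModel.from_pretrained("microsoft/unixcoder-base", config=config)

# The same network serves as encoder and decoder; only the attention mask differs.
model = Seq2Seq(encoder=encoder, decoder=encoder, config=config,
                beam_size=5, max_length=64,
                sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],
                eos_id=tokenizer.sep_token_id)
```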
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/zero-shot-search/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Zero-shot Code-to-Code Search
4 |
5 | Given a piece of source code as the query, the task aims to retrieve code with the same semantics from a collection of candidates in a zero-shot setting. We collect 11,744/15,594/23,530 functions from the [CodeNet](https://github.com/IBM/Project_CodeNet) corpus in Ruby/Python/Java. Each function solves one of 4,053 problems. A sketch of the MAP evaluation is shown below.
6 |
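The reported metric is mean average precision (MAP) over the ranked candidate list. A minimal sketch of the computation on toy data (the real `run.py` additionally excludes each query itself from its own candidate ranking):

```python
import numpy as np

def mean_average_precision(scores, query_labels, cand_labels):
    # scores: (num_query, num_cand) similarity matrix; since the embeddings are
    # L2-normalized, a plain dot product is already a cosine similarity.
    aps = []
    for i in range(scores.shape[0]):
        order = np.argsort(-scores[i])          # candidates, best first
        hits, precisions = 0, []
        for rank, j in enumerate(order, start=1):
            if cand_labels[j] == query_labels[i]:
                hits += 1
                precisions.append(hits / rank)  # precision at this recall point
        if precisions:
            aps.append(sum(precisions) / len(precisions))
    return float(np.mean(aps))
```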
7 |
8 |
9 | ## Data Download
10 |
11 | ```bash
12 | cd dataset
13 | wget https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet.tar.gz
14 | tar -xvf Project_CodeNet.tar.gz
15 | python preprocess.py
16 | cd ..
17 | ```
18 |
19 |
20 |
21 | ## Dependency
22 |
23 | - pip install torch
24 | - pip install transformers
25 |
26 |
27 |
28 | ## Zero-Shot Setting
29 |
30 | ```bash
31 | source_lang=ruby
32 | target_lang=python
33 | python run.py \
34 | --model_name_or_path microsoft/unixcoder-base \
35 | --query_data_file dataset/${source_lang}_with_func.jsonl \
36 | --candidate_data_file dataset/${target_lang}_with_func.jsonl \
37 | --query_lang ${source_lang} \
38 | --candidate_lang ${target_lang} \
39 | --code_length 512 \
40 | --eval_batch_size 256
41 | ```
42 |
43 |
44 |
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/zero-shot-search/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | import torch.nn as nn
4 | import torch
5 | class Model(nn.Module):
6 | def __init__(self, encoder):
7 | super(Model, self).__init__()
8 | self.encoder = encoder
9 |
10 | def forward(self, code_inputs=None, nl_inputs=None, cls=False):
11 | if code_inputs is not None:
12 | outputs = self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[0]
13 | outputs = (outputs * code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(1)[:,None]
14 | return torch.nn.functional.normalize(outputs, p=2, dim=1)
15 | else:
16 | outputs = self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[0]
17 | outputs = (outputs * nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(1)[:,None]
18 | return torch.nn.functional.normalize(outputs, p=2, dim=1)
19 |
20 |
21 |
22 |
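A brief usage sketch for this wrapper (assuming `microsoft/unixcoder-base`, whose pad token id of 1 matches the `.ne(1)` masking above). Because the outputs are L2-normalized mean-pooled embeddings, a dot product between any two of them is a cosine similarity. Note that `run.py` below additionally prepends UniXcoder's mode token when building inputs; this sketch omits that step:

```python
import torch
from transformers import RobertaModel, RobertaTokenizer
from model import Model  # the class defined in this file

tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")
model = Model(RobertaModel.from_pretrained("microsoft/unixcoder-base")).eval()

code = "def add(a, b): return a + b"
ids = torch.tensor([tokenizer.encode(code, max_length=64, truncation=True)])
with torch.no_grad():
    vec = model(code_inputs=ids)  # shape (1, hidden_size), unit-norm rows
```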
--------------------------------------------------------------------------------
/UniXcoder/downstream-tasks/zero-shot-search/run.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Zero-shot code-to-code search evaluation: encodes query and candidate functions with a
18 | pre-trained UniXcoder encoder, ranks candidates by cosine similarity of the normalized
19 | embeddings, and reports MAP.
20 | """
21 | import sys
22 |
23 |
24 | import argparse
25 | import logging
26 | import os
27 | import pickle
28 | import random
29 | import torch
30 | import json
31 | import numpy as np
32 | from tqdm import tqdm
33 | from model import Model
34 | from torch.nn import CrossEntropyLoss, MSELoss
35 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
36 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
37 | RobertaConfig, RobertaModel, RobertaTokenizer)
38 | import re
39 | from io import StringIO
40 | import tokenize
41 |
42 | logger = logging.getLogger(__name__)
43 |
44 |
45 | def remove_comments_and_docstrings(source,lang):
46 | if lang in ['python']:
47 | io_obj = StringIO(source)
48 | out = ""
49 | prev_toktype = tokenize.INDENT
50 | last_lineno = -1
51 | last_col = 0
52 | for tok in tokenize.generate_tokens(io_obj.readline):
53 | token_type = tok[0]
54 | token_string = tok[1]
55 | start_line, start_col = tok[2]
56 | end_line, end_col = tok[3]
57 | ltext = tok[4]
58 | if start_line > last_lineno:
59 | last_col = 0
60 | if start_col > last_col:
61 | out += (" " * (start_col - last_col))
62 | # Remove comments:
63 | if token_type == tokenize.COMMENT:
64 | pass
65 | # This series of conditionals removes docstrings:
66 | elif token_type == tokenize.STRING:
67 | if prev_toktype != tokenize.INDENT:
68 | # This is likely a docstring; double-check we're not inside an operator:
69 | if prev_toktype != tokenize.NEWLINE:
70 | if start_col > 0:
71 | out += token_string
72 | else:
73 | out += token_string
74 | prev_toktype = token_type
75 | last_col = end_col
76 | last_lineno = end_line
77 | temp=[]
78 | for x in out.split('\n'):
79 | if x.strip()!="":
80 | temp.append(x)
81 | return '\n'.join(temp)
82 | elif lang in ['ruby']:
83 | return source
84 | else:
85 | def replacer(match):
86 | s = match.group(0)
87 | if s.startswith('/'):
88 | return " " # note: a space and not an empty string
89 | else:
90 | return s
91 | pattern = re.compile(
92 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
93 | re.DOTALL | re.MULTILINE
94 | )
95 | temp=[]
96 | for x in re.sub(pattern, replacer, source).split('\n'):
97 | if x.strip()!="":
98 | temp.append(x)
99 | return '\n'.join(temp)
100 |
101 |
102 | class InputFeatures(object):
103 | """A single training/test features for a example."""
104 | def __init__(self,
105 | code_tokens,
106 | code_ids,
107 | index,
108 | label
109 |
110 | ):
111 | self.code_tokens = code_tokens
112 | self.code_ids = code_ids
113 | self.index = index
114 | self.label = label
115 |
116 |
117 | def convert_examples_to_features(js,tokenizer,args,lang):
118 | """convert examples to token ids"""
119 | if "func" in js:
120 | code = " ".join(remove_comments_and_docstrings(js['func'],lang).split())
121 | else:
122 | code = " ".join(remove_comments_and_docstrings(js['code'],lang).split())
123 | code_tokens = tokenizer.tokenize(code)[:args.code_length-4]
124 |     code_tokens = [tokenizer.cls_token,"<encoder-only>",tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
125 | code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
126 | padding_length = args.code_length - len(code_ids)
127 | code_ids += [tokenizer.pad_token_id]*padding_length
128 | return InputFeatures(code_tokens,code_ids,js["index"],int(js['label']))
129 |
130 | class TextDataset(Dataset):
131 | def __init__(self, tokenizer, args, file_path, lang):
132 | self.examples = []
133 | data = []
134 | with open(file_path) as f:
135 | for i, line in enumerate(f):
136 | line = line.strip()
137 | js = json.loads(line)
138 | data.append(js)
139 |
140 | for js in data:
141 | self.examples.append(convert_examples_to_features(js,tokenizer,args,lang))
142 |
143 | for idx, example in enumerate(self.examples[:1]):
144 | logger.info("*** Example ***")
145 | logger.info("label: {}".format(example.label))
146 | logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
147 | logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
148 |
149 | self.label_examples={}
150 | for e in self.examples:
151 | if e.label not in self.label_examples:
152 | self.label_examples[e.label]=[]
153 | self.label_examples[e.label].append(e)
154 |
155 | def __len__(self):
156 | return len(self.examples)
157 |
158 | def __getitem__(self, i):
159 | return (torch.tensor(self.examples[i].code_ids),torch.tensor(self.examples[i].label))
160 |
161 |
162 |
163 | def evaluate(args, model, tokenizer, file_name, candidate_file_name):
164 | query_dataset = TextDataset(tokenizer, args, file_name, args.query_lang)
165 | query_sampler = SequentialSampler(query_dataset)
166 | query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=args.eval_batch_size,num_workers=4)
167 |
168 | candidate_dataset = TextDataset(tokenizer, args, candidate_file_name, args.candidate_lang)
169 | candidate_sampler = SequentialSampler(candidate_dataset)
170 | candidate_dataloader = DataLoader(candidate_dataset, sampler=candidate_sampler, batch_size=args.eval_batch_size, num_workers=4)
171 |
172 | # Eval!
173 | logger.info("***** Running evaluation *****")
174 | logger.info(" Num Query = %d", len(query_dataset))
175 | logger.info(" Num Candidate = %d", len(candidate_dataset))
176 | logger.info(" Batch size = %d", args.eval_batch_size)
177 |
178 |
179 | model.eval()
180 | query_vecs = []
181 | query_labels = []
182 | candidate_vecs = []
183 | candidate_labels = []
184 | # Obtain query vectors
185 | for batch in query_dataloader:
186 | code_inputs = batch[0].to(args.device)
187 | label = batch[1].to(args.device)
188 | with torch.no_grad():
189 | code_vec = model(code_inputs=code_inputs)
190 | query_vecs.append(code_vec.cpu().numpy())
191 | query_labels.append(label.cpu().numpy())
192 |
193 | # Obtain candidate vectors
194 | for batch in candidate_dataloader:
195 | code_inputs = batch[0].to(args.device)
196 | label = batch[1].to(args.device)
197 | with torch.no_grad():
198 | code_vec = model(code_inputs=code_inputs)
199 | candidate_vecs.append(code_vec.cpu().numpy())
200 | candidate_labels.append(label.cpu().numpy())
201 |
202 | model.train()
203 |
204 | # Calculate cosine score
205 | query_vecs = np.concatenate(query_vecs,0)
206 | candidate_vecs = np.concatenate(candidate_vecs,0)
207 | query_labels = list(np.concatenate(query_labels,0))
208 | candidate_labels = list(np.concatenate(candidate_labels,0))
209 | candidate_indexs =[candidate_dataset.examples[i].index for i in range(len(candidate_dataset))]
210 | query_indexs = [query_dataset.examples[i].index for i in range(len(query_dataset))]
211 | scores = np.matmul(query_vecs,candidate_vecs.T)
212 |
213 | # Calculate MAP score
214 | sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:,::-1]
215 | MAP=[]
216 | results = {}
217 | for i in range(scores.shape[0]):
218 | cont=0
219 | label=int(query_labels[i])
220 | query_index = query_indexs[i]
221 | results[query_index] = [label,candidate_labels[sort_ids[i][0]],candidate_indexs[sort_ids[i][0]]]
222 | Avep = []
223 | for j,index in enumerate(list(sort_ids[i])):
224 | if query_index==candidate_indexs[index]:
225 | cont+=1
226 | continue
227 | if int(candidate_labels[index])==label:
228 | Avep.append((len(Avep)+1)/(j+1-cont))
229 | if len(Avep)!=0:
230 | MAP.append(sum(Avep)/len(Avep))
231 |
232 | result = {
233 | "eval_map":float(np.mean(MAP))
234 | }
235 | return result
236 |
237 |
238 |
239 | def main():
240 | parser = argparse.ArgumentParser()
241 |
242 | ## Required parameters
243 | parser.add_argument("--query_data_file", default=None, type=str, required=False,
244 |                         help="The input query data file (a jsonl file).")
245 | parser.add_argument("--candidate_data_file", default=None, type=str, required=False,
246 |                         help="The input candidate data file (a jsonl file).")
247 | parser.add_argument("--model_name_or_path", default=None, type=str,
248 | help="The model checkpoint for weights initialization.")
249 |
250 | parser.add_argument("--query_lang", default=None, type=str, required=False,
251 | help="Programming language of query.")
252 | parser.add_argument("--candidate_lang", default=None, type=str,
253 | help="Programming language of candidate.")
254 |
255 |
256 | parser.add_argument("--code_length", default=256, type=int,
257 | help="Optional Code input sequence length after tokenization.")
258 | parser.add_argument("--eval_batch_size", default=4, type=int,
259 | help="Batch size for evaluation.")
260 |
261 |
262 |
263 | #print arguments
264 | args = parser.parse_args()
265 |
266 | #set log
267 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
268 | datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
269 | #set device
270 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
271 | args.n_gpu = torch.cuda.device_count()
272 | args.device = device
273 | logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
274 |
275 |
276 | #build model
277 | tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
278 | config = RobertaConfig.from_pretrained(args.model_name_or_path)
279 | model = RobertaModel.from_pretrained(args.model_name_or_path)
280 |
281 |
282 | model=Model(model)
283 | logger.info("Training/evaluation parameters %s", args)
284 | model.to(args.device)
285 |
286 | if args.n_gpu > 1:
287 | model = torch.nn.DataParallel(model)
288 |
289 |
290 | result=evaluate(args, model, tokenizer,args.query_data_file,args.candidate_data_file)
291 | logger.info("***** Eval results *****")
292 | for key in sorted(result.keys()):
293 | logger.info(" %s = %s", key, str(round(result[key]*100,2)))
294 |
295 |
296 |
297 | if __name__ == "__main__":
298 | main()
299 |
300 |
301 |
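To make the preprocessing above concrete, a small check of `remove_comments_and_docstrings` on a toy snippet (output shown approximately; exact whitespace depends on Python's tokenizer):

```python
from run import remove_comments_and_docstrings  # this file; main() is guarded

src = '''def add(a, b):
    """Add two numbers."""
    return a + b  # inline comment
'''
print(remove_comments_and_docstrings(src, "python"))
# Roughly:
# def add(a, b):
#     return a + b
```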
--------------------------------------------------------------------------------
/UniXcoder/unixcoder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import torch.nn as nn
6 | from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
7 |
8 | class UniXcoder(nn.Module):
9 | def __init__(self, model_name):
10 | """
11 | Build UniXcoder.
12 |
13 | Parameters:
14 |
15 | * `model_name`- huggingface model card name. e.g. microsoft/unixcoder-base
16 | """
17 | super(UniXcoder, self).__init__()
18 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
19 | self.config = RobertaConfig.from_pretrained(model_name)
20 | self.config.is_decoder = True
21 | self.model = RobertaModel.from_pretrained(model_name, config=self.config)
22 |
23 | self.register_buffer("bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024))
24 | self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
25 | self.lm_head.weight = self.model.embeddings.word_embeddings.weight
26 | self.lsm = nn.LogSoftmax(dim=-1)
27 |
28 |         self.tokenizer.add_tokens(["<mask0>"],special_tokens=True)
29 |
30 |     def tokenize(self, inputs, mode="<encoder-only>", max_length=512, padding=False):
31 | """
32 | Convert string to token ids
33 |
34 | Parameters:
35 |
36 | * `inputs`- list of input strings.
37 | * `max_length`- The maximum total source sequence length after tokenization.
38 | * `padding`- whether to pad source sequence length to max_length.
39 |         * `mode`- which mode the sequence will use. i.e. <encoder-only>, <decoder-only>, <encoder-decoder>
40 | """
41 |         assert mode in ["<encoder-only>", "<decoder-only>", "<encoder-decoder>"]
42 |
43 | tokenizer = self.tokenizer
44 |
45 | tokens_ids = []
46 | for x in inputs:
47 | tokens = tokenizer.tokenize(x)
48 |             if mode == "<encoder-only>":
49 | tokens = tokens[:max_length-4]
50 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token]
51 |             elif mode == "<decoder-only>":
52 | tokens = tokens[-(max_length-3):]
53 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens
54 | else:
55 | tokens = tokens[:max_length-5]
56 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token]
57 |
58 | tokens_id = tokenizer.convert_tokens_to_ids(tokens)
59 | if padding:
60 |                 tokens_id = tokens_id + [self.config.pad_token_id] * (max_length-len(tokens_id))
61 | tokens_ids.append(tokens_id)
62 | return tokens_ids
63 |
64 | def decode(self, source_ids):
65 | """ Convert token ids to string """
66 | predictions = []
67 | for x in source_ids:
68 | prediction = []
69 | for y in x:
70 | t = y.cpu().numpy()
71 | t = list(t)
72 | if 0 in t:
73 | t = t[:t.index(0)]
74 | text = self.tokenizer.decode(t,clean_up_tokenization_spaces=False)
75 | prediction.append(text)
76 | predictions.append(prediction)
77 | return predictions
78 |
79 | def forward(self, source_ids):
80 | """ Obtain token embeddings and sentence embeddings """
81 | mask = source_ids.ne(self.config.pad_token_id)
82 | token_embeddings = self.model(source_ids,attention_mask = mask.unsqueeze(1) * mask.unsqueeze(2))[0]
83 |         sentence_embeddings = (token_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1)
84 | return token_embeddings, sentence_embeddings
85 |
86 | def generate(self, source_ids, decoder_only = True, eos_id = None, beam_size = 5, max_length = 64):
87 | """ Generate sequence given context (source_ids) """
88 |
89 |         # Set encoder attention mask matrix: bidirectional for <encoder-decoder>, unidirectional for <decoder-only>
90 | if decoder_only:
91 | mask = self.bias[:,:source_ids.size(-1),:source_ids.size(-1)]
92 | else:
93 | mask = source_ids.ne(self.config.pad_token_id)
94 | mask = mask.unsqueeze(1) * mask.unsqueeze(2)
95 |
96 | if eos_id is None:
97 | eos_id = self.config.eos_token_id
98 |
99 | device = source_ids.device
100 |
101 | # Decoding using beam search
102 | preds = []
103 | zero = torch.LongTensor(1).fill_(0).to(device)
104 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
105 | length = source_ids.size(-1)
106 | encoder_output = self.model(source_ids,attention_mask=mask)
107 | for i in range(source_ids.shape[0]):
108 | context = [[x[i:i+1,:,:source_len[i]].repeat(beam_size,1,1,1) for x in y]
109 | for y in encoder_output.past_key_values]
110 | beam = Beam(beam_size,eos_id,device)
111 | input_ids = beam.getCurrentState().clone()
112 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(beam_size,1)
113 | out = encoder_output.last_hidden_state[i:i+1,:source_len[i]].repeat(beam_size,1,1)
114 | for _ in range(max_length):
115 | if beam.done():
116 | break
117 | if _ == 0:
118 | hidden_states = out[:,-1,:]
119 | out = self.lsm(self.lm_head(hidden_states)).data
120 | beam.advance(out)
121 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
122 | input_ids = beam.getCurrentState().clone()
123 | else:
124 | length = context_ids.size(-1)+input_ids.size(-1)
125 | out = self.model(input_ids,attention_mask=self.bias[:,context_ids.size(-1):length,:length],
126 | past_key_values=context).last_hidden_state
127 | hidden_states = out[:,-1,:]
128 | out = self.lsm(self.lm_head(hidden_states)).data
129 | beam.advance(out)
130 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
131 | input_ids = torch.cat((input_ids,beam.getCurrentState().clone()),-1)
132 | hyp = beam.getHyp(beam.getFinal())
133 | pred = beam.buildTargetTokens(hyp)[:beam_size]
134 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(max_length-len(p))).view(1,-1) for p in pred]
135 | preds.append(torch.cat(pred,0).unsqueeze(0))
136 |
137 | preds = torch.cat(preds,0)
138 |
139 | return preds
140 |
141 |
142 |
143 | class Beam(object):
144 | def __init__(self, size, eos, device):
145 | self.size = size
146 | self.device = device
147 | # The score for each translation on the beam.
148 | self.scores = torch.FloatTensor(size).zero_().to(device)
149 | # The backpointers at each time-step.
150 | self.prevKs = []
151 | # The outputs at each time-step.
152 | self.nextYs = [torch.LongTensor(size).fill_(0).to(device)]
153 | # Has EOS topped the beam yet.
154 | self._eos = eos
155 | self.eosTop = False
156 | # Time and k pair for finished.
157 | self.finished = []
158 |
159 | def getCurrentState(self):
160 | "Get the outputs for the current timestep."
161 | batch = self.nextYs[-1].view(-1, 1)
162 | return batch
163 |
164 | def getCurrentOrigin(self):
165 | "Get the backpointers for the current timestep."
166 | return self.prevKs[-1]
167 |
168 | def advance(self, wordLk):
169 | """
170 |         Given the log probabilities over words for every beam at the
171 |         last step (`wordLk`), advance the beam search by one step.
172 | 
173 |         Parameters:
174 | 
175 |         * `wordLk`- log probs of advancing from the last step (K x words),
176 |           as produced by a LogSoftmax over the vocabulary.
177 | 
178 |         Updates the beam state in place; use `done()` to check completion.
179 | """
180 | numWords = wordLk.size(1)
181 |
182 | # Sum the previous scores.
183 | if len(self.prevKs) > 0:
184 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
185 |
186 | # Don't let EOS have children.
187 | for i in range(self.nextYs[-1].size(0)):
188 | if self.nextYs[-1][i] == self._eos:
189 | beamLk[i] = -1e20
190 | else:
191 | beamLk = wordLk[0]
192 | flatBeamLk = beamLk.view(-1)
193 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
194 |
195 | self.scores = bestScores
196 |
197 | # bestScoresId is flattened beam x word array, so calculate which
198 | # word and beam each score came from
199 | prevK = bestScoresId // numWords
200 | self.prevKs.append(prevK)
201 | self.nextYs.append((bestScoresId - prevK * numWords))
202 |
203 |
204 | for i in range(self.nextYs[-1].size(0)):
205 | if self.nextYs[-1][i] == self._eos:
206 | s = self.scores[i]
207 | self.finished.append((s, len(self.nextYs) - 1, i))
208 |
209 | # End condition is when top-of-beam is EOS and no global score.
210 | if self.nextYs[-1][0] == self._eos:
211 | self.eosTop = True
212 |
213 | def done(self):
214 | return self.eosTop and len(self.finished) >= self.size
215 |
216 | def getFinal(self):
217 | if len(self.finished) == 0:
218 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
219 | self.finished.sort(key=lambda a: -a[0])
220 | if len(self.finished) != self.size:
221 | unfinished=[]
222 | for i in range(self.nextYs[-1].size(0)):
223 | if self.nextYs[-1][i] != self._eos:
224 | s = self.scores[i]
225 | unfinished.append((s, len(self.nextYs) - 1, i))
226 | unfinished.sort(key=lambda a: -a[0])
227 | self.finished+=unfinished[:self.size-len(self.finished)]
228 | return self.finished[:self.size]
229 |
230 | def getHyp(self, beam_res):
231 | """
232 | Walk back to construct the full hypothesis.
233 | """
234 | hyps=[]
235 | for _,timestep, k in beam_res:
236 | hyp = []
237 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
238 | hyp.append(self.nextYs[j+1][k])
239 | k = self.prevKs[j][k]
240 | hyps.append(hyp[::-1])
241 | return hyps
242 |
243 | def buildTargetTokens(self, preds):
244 | sentence=[]
245 | for pred in preds:
246 | tokens = []
247 | for tok in pred:
248 | if tok==self._eos:
249 | break
250 | tokens.append(tok)
251 | sentence.append(tokens)
252 | return sentence
253 |
254 |
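A minimal embedding example for this class (the model card name is real; `<encoder-only>` mode gives bidirectional attention over the input, per the `tokenize` docstring above):

```python
import torch
from unixcoder import UniXcoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base").to(device)

code = "def f(a, b): return a + b"
tokens_ids = model.tokenize([code], max_length=512, mode="<encoder-only>")
source_ids = torch.tensor(tokens_ids).to(device)
token_embeddings, sentence_embedding = model(source_ids)
print(sentence_embedding.shape)  # (1, hidden_size)
```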
--------------------------------------------------------------------------------