├── README.md
├── Standard_LSTM
│   ├── .gitignore
│   ├── Backward.m
│   ├── Batch.m
│   ├── Forward.m
│   ├── LSTM.m
│   ├── decode_beam.m
│   ├── decode_greedy.m
│   ├── lstmUnit.m
│   ├── softmax.m
│   └── test.m
├── a.txt
├── hier_LSTM
│   ├── Backward.m
│   ├── Batch.m
│   ├── Forward.m
│   ├── ReadData.m
│   ├── decode_greedy.m
│   ├── hier_LSTM.m
│   ├── lstmUnit.m
│   ├── softmax.m
│   └── test.m
├── hier_LSTM_Attention
│   ├── .hier_LSTM_Att.m.swp
│   ├── Attention.m
│   ├── Attention_Decode.m
│   ├── Backward.m
│   ├── Batch.m
│   ├── Forward.m
│   ├── ReadData.m
│   ├── decode_greedy.m
│   ├── hier_LSTM_Att.m
│   ├── lstmUnit.m
│   ├── softmax.m
│   └── test.m
├── index.html
└── misc
    ├── aggregateMatrix.m
    ├── oneMatrix.m
    ├── randSimpleMatrix.m
    ├── randomMatrix.m
    ├── sigmoid.m
    ├── sigmoidPrime.m
    ├── smallMatrix.m
    ├── tanhPrime.m
    └── zeroMatrix.m

/README.md:
--------------------------------------------------------------------------------
1 | # A Hierarchical Neural Autoencoder for Paragraphs and Documents
2 | 
3 | Implementations of the three models presented in the paper "A Hierarchical Neural Autoencoder for Paragraphs and Documents" by Jiwei Li, Minh-Thang Luong and Dan Jurafsky, ACL 2015.
4 | 
5 | ## Requirements:
6 | GPU
7 | 
8 | MATLAB >= 2014b
9 | 
10 | memory >= 4GB
11 | 
12 | 
13 | 
14 | ## Folders
15 | Standard_LSTM: Standard LSTM Autoencoder
16 | 
17 | hier_LSTM: Hierarchical LSTM Autoencoder
18 | 
19 | hier_LSTM_Attention: Hierarchical LSTM Autoencoder with Attention
20 | 
21 | ## Download [Data](http://cs.stanford.edu/~bdlijiwei/data.tar)
22 | - `dictionary`: vocabulary
23 | - `train_permute.txt`: training data for the standard model. Each line corresponds to one document/paragraph.
24 | - `train_source_permute_segment.txt`: source training data for the hierarchical models. Each line corresponds to one sentence; an empty line marks the end of a document. Documents are reversed.
25 | - `test_source_permute_segment.txt`: test data for the hierarchical models, in the same format.
26 | 
27 | Training takes roughly 2-3 weeks for the standard model and 4-6 weeks for the hierarchical models on a K40 GPU.
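
For reference, a minimal sketch of how to launch training and decoding from MATLAB, assuming the data archive has been unpacked to `../data/` relative to the model folders and that trained weights live under `save_parameter/` (the paths hard-coded in the scripts):

```matlab
% train the standard autoencoder; reads ../data/train_permute.txt and
% periodically writes weights to save_parameter/
cd Standard_LSTM
LSTM

% greedy decoding of ../data/test.txt with the saved weights;
% decoded token ids are appended to a.txt
test
```

The hierarchical variants are started the same way through `hier_LSTM/hier_LSTM.m` and `hier_LSTM_Attention/hier_LSTM_Att.m`.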
28 | 29 | 30 | For any question or bug with the code, feel free to contact jiweil@stanford.edu 31 | 32 | ```latex 33 | @article{li2015hierarchical, 34 | title={A Hierarchical Neural Autoencoder for Paragraphs and Documents}, 35 | author={Li, Jiwei and Luong, Minh-Thang and Jurafsky, Dan}, 36 | journal={arXiv preprint arXiv:1506.01057}, 37 | year={2015} 38 | } 39 | ``` 40 | -------------------------------------------------------------------------------- /Standard_LSTM/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | -------------------------------------------------------------------------------- /Standard_LSTM/Backward.m: -------------------------------------------------------------------------------- 1 | function[grad]=Backward(batch,grad,parameter,lstms,all_c_t)%backward 2 | dh = cell(parameter.layer_num, 1); 3 | dc = cell(parameter.layer_num, 1); 4 | Word=batch.Word; 5 | N=size(Word,1); 6 | T=batch.MaxLen; 7 | MaxLenSource=batch.MaxLenSource; 8 | MaxLenTarget=batch.MaxLenTarget; 9 | for ll=1:parameter.layer_num 10 | grad.W_T{ll}=zeroMatrix(size(parameter.W_T{ll})); 11 | grad.W_S{ll}=zeroMatrix(size(parameter.W_S{ll})); 12 | end 13 | 14 | zeroState=zeroMatrix([parameter.hidden,size(batch.Word,1)]); 15 | for ll=parameter.layer_num:-1:1 16 | dh{ll} = zeroState; 17 | dc{ll} = zeroState; 18 | end 19 | wordCount = 0; 20 | numInputWords=size(Word,1)*size(Word,2); 21 | allEmbGrads=zeroMatrix([parameter.dimension,numInputWords]); 22 | for t=T-1:-1:1 23 | unmaskedIds=batch.Left{t}; 24 | for ll=parameter.layer_num:-1:1 25 | if ll==parameter.layer_num 26 | if(t>MaxLenSource-1) 27 | dh{ll}=dh{ll}+grad.ht{:,t-MaxLenSource+1}; 28 | end 29 | end 30 | if t==1 c_t_1 = []; 31 | else c_t_1 = all_c_t{ll, t-1}; 32 | end 33 | c_t = all_c_t{ll, t}; 34 | lstm = lstms{ll, t}; 35 | [lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1, dc{ll}, dh{ll},ll, t,zeroState,parameter,MaxLenSource);%backward propagation 36 | if (t>MaxLenSource)grad.W_T{ll}=grad.W_T{ll}+lstm_grad.W; 37 | else grad.W_S{ll}=grad.W_S{ll}+lstm_grad.W; 38 | end 39 | dc{ll} = lstm_grad.dc; 40 | dh{ll} = lstm_grad.input(end-parameter.hidden+1:end,:); 41 | if ll==1%deal with word emebddings 42 | embIndices=batch.Word(unmaskedIds,t)'; 43 | embGrad = lstm_grad.input(1:parameter.dimension,unmaskedIds); 44 | numWords = length(embIndices); 45 | allEmbIndices(wordCount+1:wordCount+numWords) = embIndices; 46 | allEmbGrads(:, wordCount+1:wordCount+numWords) = embGrad; 47 | wordCount = wordCount + numWords; 48 | else 49 | dh{ll-1}(:,unmaskedIds)=dh{ll-1}(:,unmaskedIds)+lstm_grad.input(1:parameter.hidden,unmaskedIds); 50 | end 51 | end 52 | end 53 | allEmbGrads(:, wordCount+1:end) = []; 54 | allEmbIndices(wordCount+1:end) = []; 55 | [grad.W_emb, grad.indices] = aggregateMatrix(allEmbGrads, allEmbIndices); 56 | for ll=1:parameter.layer_num 57 | grad.W_T{ll}=grad.W_T{ll}/N;%divided by number of sentences 58 | grad.W_S{ll}=grad.W_S{ll}/N; 59 | end 60 | grad.W_emb=grad.W_emb/N; 61 | grad.soft_W=grad.soft_W/N; 62 | clear allEmbGrads; 63 | clear allEmbIndices; 64 | clear allEmbGrads; 65 | end 66 | 67 | function[lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1, dc, dh, ll, t, zero_state,parameter,MaxLenSource) 68 | dc = arrayfun(@plusMult, dc, lstm.o_gate, dh); 69 | do = arrayfun(@sigmoidPrimeTriple, lstm.o_gate, c_t, dh); 70 | di = arrayfun(@sigmoidPrimeTriple, lstm.i_gate, lstm.a_signal, dc); 71 | 72 | if t>1 73 | df = arrayfun(@sigmoidPrimeTriple, lstm.f_gate, c_t_1, dc); 74 | else 75 | df = zero_state; 76 | end 77 | 
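% lstm_grad.dc below is the gradient carried back to the previous cell state
% (c_t_1 enters the forward pass only through f_gate.*c_t_1, so dc_t_1 = f_gate.*dc),
% and the four pre-activation gradients are stacked as d_ifoa=[di;df;do;dl] so that
% lstm_grad.W = d_ifoa*input' lines up with the stacked weight matrix
% [W_i; W_f; W_o; W_a] used in lstmUnit.m.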
lstm_grad.dc = lstm.f_gate.*dc; 78 | dl = arrayfun(@tanhPrimeTriple, lstm.a_signal, lstm.i_gate, dc); 79 | d_ifoa = [di; df; do; dl]; 80 | lstm_grad.W = d_ifoa*lstm.input'; %dw 81 | if t>MaxLenSource 82 | lstm_grad.input = parameter.W_T{ll}'*d_ifoa;% dx dh 83 | else 84 | lstm_grad.input = parameter.W_S{ll}'*d_ifoa; 85 | end 86 | if parameter.dropout~=0 87 | if ll==1 88 | lstm_grad.input(1:parameter.dimension, :) = lstm_grad.input(1:parameter.dimension, :).*lstm.drop_left; 89 | else lstm_grad.input(1:parameter.hidden, :) = lstm_grad.input(1:parameter.hidden, :).*lstm.drop_left; 90 | end 91 | end 92 | if parameter.clip==1%if values of gradients are too large, clip them 93 | lstm_grad.input=arrayfun(@clipBackward, lstm_grad.input); 94 | lstm_grad.dc = arrayfun(@clipBackward, lstm_grad.dc); 95 | end 96 | clear dc; clear do; clear di; clear df; clear d_ifoa; 97 | end 98 | 99 | 100 | function [value] = plusTanhPrimeTriple(t, x, y, z) 101 | value = t + (1-x*x)*y*z; 102 | end 103 | function [value] = tanhPrimeTriple(x, y, z) 104 | value = (1-x*x)*y*z; 105 | end 106 | function [value] = plusMult(x, y, z) 107 | value = x + y*z; 108 | end 109 | function [value] = sigmoidPrimeTriple(x, y, z) 110 | value = x*(1-x)*y*z; 111 | end 112 | 113 | function [clippedValue] = clipBackward(x) 114 | if x>1000 clippedValue = single(1000); 115 | elseif x<-1000 clippedValue = single(-1000); 116 | else clippedValue =single(x); 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /Standard_LSTM/Batch.m: -------------------------------------------------------------------------------- 1 | classdef Batch < handle 2 | properties 3 | MaxLenSource=-1; 4 | MaxLenTarget=-1; 5 | MaxLen=-1; 6 | Word=[]; 7 | Delete={}; 8 | Left={}; 9 | Mask={}; 10 | Label=[]; 11 | SourceLength=[]; 12 | N_word=0; 13 | end 14 | end 15 | 16 | 17 | -------------------------------------------------------------------------------- /Standard_LSTM/Forward.m: -------------------------------------------------------------------------------- 1 | function[lstms,all_h_t,all_c_t]=Forward(batch,parameter,isTraining)%Forward 2 | N=size(batch.Word,1); 3 | zeroState=zeroMatrix([parameter.hidden,N]); 4 | if isTraining==1 5 | T=batch.MaxLen; 6 | else 7 | T=batch.MaxLenSource; 8 | end 9 | all_h_t=cell(parameter.layer_num,T); 10 | all_c_t=cell(parameter.layer_num,T); 11 | lstms = cell(parameter.layer_num,T); 12 | 13 | for ll=1:parameter.layer_num 14 | for tt=1:T 15 | all_h_t{ll,tt}=zeroMatrix([parameter.hidden,N]); 16 | all_c_t{ll,tt}=zeroMatrix([parameter.hidden,N]); 17 | end 18 | end 19 | for t=1:T 20 | for ll=1:parameter.layer_num 21 | if t0 % GPU exists 8 | parameter.isGPU = 1; 9 | gpuDevice(1); 10 | else 11 | print('no gpu ! ! ! ! !'); 12 | end 13 | 14 | parameter.dimension=1000; 15 | parameter.alpha=0.1; %learning rate 16 | parameter.layer_num=4; %number of layer 17 | parameter.hidden=1000; 18 | parameter.lstm_out_tanh=0; 19 | parameter.Initial=0.08; 20 | parameter.dropout=0.2; %drop-out rate 21 | params.lstm_out_tanh=0; 22 | parameter.isTraining=1; 23 | parameter.CheckGrad=0; %whether check gradient or not. 24 | parameter.PreTrainEmb=0; %whether using pre-trained embeddings 25 | parameter.update_embedding=1; %whether update word embeddings 26 | parameter.batch_size=32; %mini-batch size 27 | parameter.Source_Target_Same_Language=1; 28 | %whether source and target is of the same language. For author-encoder task, it is. 
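% maxGradNorm rescales the learning rate in update_parameter when the overall
% gradient norm exceeds this threshold; clip additionally hard-limits
% activations (in lstmUnit.m) and gradients (in lstmUnitGrad) element-wise.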
29 | parameter.maxGradNorm=1; %gradient clipping 30 | parameter.clip=1; 31 | 32 | parameter.lr=5; 33 | parameter.read=0; 34 | 35 | if parameter.Source_Target_Same_Language==1 36 | parameter.Vocab=25001; %vocabulary size plus document-end token 37 | parameter.stop=parameter.Vocab; %document-end token 38 | else 39 | parameter.SourceVocab=20; 40 | parameter.TargetVocab=20; 41 | parameter.stop=parameter.TargetVocab; 42 | parameter.Vocab=parameter.SourceVocab+parameter.TargetVocab; 43 | end 44 | 45 | if parameter.CheckGrad==1¶meter.dropout~=0 %use identical dropout-vector for gradient checking 46 | parameter.drop_left_1=randSimpleMatrix([parameter.dimension,1])<1-parameter.dropout; 47 | parameter.drop_left=randSimpleMatrix([parameter.hidden,1])<1-parameter.dropout; 48 | end 49 | 50 | parameter.nonlinear_gate_f = @sigmoid; 51 | parameter.nonlinear_gate_f_prime = @sigmoidPrime; 52 | parameter.nonlinear_f = @tanh; 53 | parameter.nonlinear_f_prime = @tanhPrime; 54 | 55 | 56 | train_source_file='../data/train_permute.txt'; 57 | train_target_file='../data/train_permute.txt'; 58 | test_source_file='../data/test.txt'; 59 | test_target_file='../data/test.txt'; 60 | 61 | 62 | if 1==0 63 | train_source_file='../data/very_small.txt'; 64 | train_target_file='../data/very_small.txt'; 65 | end 66 | 67 | if 1==0 68 | train_source_file='../toy.txt'; 69 | train_target_file='../toy.txt'; 70 | test_source_file='../toy.txt'; 71 | test_target_file='../toy.txt'; 72 | end 73 | % above is files for gradient checking 74 | 75 | if parameter.read==1 76 | disp('read'); 77 | parameter=ReadParameter(parameter); %read from exisitng parameter 78 | else [parameter]=Initial(parameter); %rand initialization 79 | end 80 | 81 | iter=0; 82 | 83 | disp('begin') 84 | 85 | while 1 86 | iter=iter+1; 87 | End=0; 88 | fd_train_source=fopen(train_source_file); %read source 89 | fd_train_target=fopen(train_target_file); %read target 90 | sum_cost=0; 91 | sum_num=0; 92 | batch_n=0; 93 | while 1 94 | batch_n=batch_n+1; 95 | [batch,End]=ReadTrainData(fd_train_source,fd_train_target,parameter); %transform data to batches 96 | 97 | if End~=1 || (End==1&& length(batch.Word)~=0) 98 | %not the end of document 99 | [lstm,all_h_t,c]=Forward(batch,parameter,1); 100 | %LSTM Forward 101 | [batch_cost,grad]=softmax(all_h_t(parameter.layer_num,:),batch,parameter); 102 | %softmax 103 | clear all_h_t; 104 | if (isnan(batch_cost)||isinf(batch_cost)) &&End~=1 105 | %if gradient explodes 106 | if parameter.clip==1 107 | fprintf('die !! 
Hopeless!!\n'); 108 | disp('read done'); 109 | parameter=pre_parameter; 110 | %load parameters stores from last step, and skip lately used batches 111 | disp(batch_n) 112 | else parameter.clip=1; 113 | end 114 | if End==1 break; 115 | %end of documents 116 | else continue; 117 | end 118 | end 119 | if parameter.isTraining==1 120 | grad=Backward(batch,grad,parameter,lstm,c); 121 | %backward propagation 122 | disp('backward done') 123 | [parameter]=update_parameter(parameter,grad); 124 | %update parameter 125 | clear lstm; 126 | clear c; 127 | end 128 | end 129 | if End==1 130 | fclose(fd_train_source); 131 | fclose(fd_train_target); 132 | break; 133 | end 134 | 135 | if mod(batch_n,500)==0 136 | %save parameter every 500 batches 137 | pre_parameter=parameter; 138 | end 139 | end 140 | SaveParameter(parameter,iter); 141 | %save parameter 142 | end 143 | end 144 | 145 | function[parameter]=ReadParameter(parameter) 146 | %read parameter 147 | for ll=1:parameter.layer_num 148 | W_file=strcat('save_parameter/_W_S',num2str(ll)); 149 | parameter.W_S{ll}=gpuArray(load(W_file)); 150 | W_file=strcat('save_parameter/_W_T',num2str(ll)); 151 | parameter.W_T{ll}=gpuArray(load(W_file)); 152 | end 153 | parameter.vect=gpuArray(load('save_parameter/_v')); 154 | parameter.soft_W=gpuArray(load('save_parameter/_soft_W')); 155 | end 156 | 157 | function SaveParameter(parameter,iter) 158 | if iter~=-1 159 | all=strcat('save_parameter/',int2str(iter)); 160 | all=strcat(all,'_'); 161 | all=strcat(all,num2str(parameter.alpha)); 162 | else 163 | all='save_parameter/'; 164 | end 165 | for ll=1:parameter.layer_num 166 | W_file=strcat(all,'_W_T'); 167 | W_file=strcat(W_file,num2str(ll)); 168 | dlmwrite(W_file,parameter.W_T{ll}); 169 | end 170 | for ll=1:parameter.layer_num 171 | W_file=strcat(all,'_W_S'); 172 | W_file=strcat(W_file,num2str(ll)); 173 | dlmwrite(W_file,parameter.W_S{ll}); 174 | end 175 | v_file=strcat(all,'_v'); 176 | dlmwrite(v_file,parameter.vect); 177 | soft_W_file=strcat(all,'_soft_W'); 178 | dlmwrite(soft_W_file,parameter.soft_W); 179 | end 180 | 181 | function[parameter]=update_parameter(parameter,grad) 182 | %update parameters 183 | norm=computeGradNorm(grad,parameter); 184 | %compute normalization 185 | if norm>parameter.maxGradNorm 186 | lr=parameter.alpha*parameter.maxGradNorm/norm; 187 | %normalizing 188 | else lr=parameter.alpha; 189 | end 190 | for ll=1:parameter.layer_num 191 | parameter.W_T{ll}=parameter.W_T{ll}-lr*grad.W_T{ll}; 192 | parameter.W_S{ll}=parameter.W_S{ll}-lr*grad.W_S{ll}; 193 | end 194 | parameter.soft_W=parameter.soft_W-lr*grad.soft_W; 195 | parameter.vect(:,grad.indices)=parameter.vect(:,grad.indices)-lr*grad.W_emb; 196 | end 197 | 198 | function[parameter]=Initial(parameter) 199 | %random initialization 200 | m=parameter.Initial; 201 | for i=1:parameter.layer_num 202 | if i==1 203 | parameter.W_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 204 | parameter.W_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 205 | else 206 | parameter.W_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 207 | parameter.W_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 208 | end 209 | end 210 | parameter.vect=randomMatrix(parameter.Initial,[parameter.dimension,parameter.Vocab]); 211 | if parameter.Source_Target_Same_Language==1 212 | parameter.soft_W=randomMatrix(parameter.Initial,[parameter.Vocab,parameter.hidden]); 213 | else 214 | 
parameter.soft_W=randomMatrix(parameter.Initial,[parameter.TargetVocab,parameter.hidden]); 215 | end 216 | end 217 | 218 | function[current_batch,End]=ReadTrainData(fd_s,fd_t,parameter) 219 | tline_s = fgets(fd_s); 220 | tline_t = fgets(fd_t); 221 | i=0; 222 | Source={};Target={}; 223 | End=0; 224 | while ischar(tline_s) 225 | i=i+1; 226 | text_s=deblank(tline_s); 227 | text_t=deblank(tline_t); 228 | if parameter.Source_Target_Same_Language~=1 229 | Source{i}=wrev(str2num(text_s))+parameter.TargetVocab; 230 | %reverse inputs 231 | else 232 | Source{i}=wrev(str2num(text_s)); 233 | %reverse inputs 234 | end 235 | Target{i}=[str2num(text_t),parameter.stop]; 236 | %add document_end_token 237 | if i==parameter.batch_size 238 | break; 239 | end 240 | tline_s = fgets(fd_s); 241 | tline_t = fgets(fd_t); 242 | end 243 | if ischar(tline_s)==0 244 | End=1; 245 | end 246 | current_batch=Batch(); 247 | N=length(Source); 248 | for j=1:N 249 | source_length=length(Source{j}); 250 | current_batch.SourceLength=[current_batch.SourceLength,source_length]; 251 | if source_length>current_batch.MaxLenSource 252 | current_batch.MaxLenSource=source_length; 253 | end 254 | target_length=length(Target{j}); 255 | if target_length>current_batch.MaxLenTarget 256 | current_batch.MaxLenTarget=target_length; 257 | end 258 | end 259 | total_length=current_batch.MaxLenSource+current_batch.MaxLenTarget; 260 | current_batch.MaxLen=total_length; 261 | current_batch.Word=ones(N,total_length); 262 | Mask=ones(N,total_length); 263 | % Mask: labeling positions where no words exisit. The purpose is to work on sentences in bulk making program faster 264 | for j=1:N 265 | source_length=length(Source{j}); 266 | target_length=length(Target{j}); 267 | current_batch.Word(j,current_batch.MaxLenSource-source_length+1:current_batch.MaxLenSource)=Source{j}; 268 | %words within sentences 269 | current_batch.Word(j,current_batch.MaxLenSource+1:current_batch.MaxLenSource+target_length)=Target{j}; 270 | Mask(j,1:current_batch.MaxLenSource-source_length)=0; 271 | Mask(j,current_batch.MaxLenSource+target_length+1:end)=0; 272 | % label positions without tokens 0 273 | current_batch.N_word=current_batch.N_word+target_length; 274 | end 275 | for j=1:total_length 276 | current_batch.Delete{j}=find(Mask(:,j)==0); 277 | current_batch.Left{j}=find(Mask(:,j)==1); 278 | end 279 | current_batch.Mask=Mask; 280 | end 281 | 282 | %gradient check 283 | function check_soft_W(value1,i,j,batch,parameter) 284 | e=0.001; 285 | parameter.soft_W(i,j)=parameter.soft_W(i,j)+e; 286 | [lstms,h,c]=Forward(batch,parameter,1); 287 | [cost1,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 288 | parameter.soft_W(i,j)=parameter.soft_W(i,j)-2*e; 289 | [lstms,h,c]=Forward(batch,parameter,1); 290 | [cost2,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 291 | value2=(cost1-cost2)/(2*e); 292 | value1 293 | value2 294 | value1-value2 295 | end 296 | 297 | 298 | function check_target_W(value1,ll,i,j,batch,parameter) 299 | e=0.001; 300 | parameter.W_T{ll}(i,j)=parameter.W_T{ll}(i,j)+e; 301 | [lstms,h,c]=Forward(batch,parameter,1); 302 | [cost1,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 303 | parameter.W_T{ll}(i,j)=parameter.W_T{ll}(i,j)-2*e; 304 | [lstms,h,c]=Forward(batch,parameter,1); 305 | [cost2,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 306 | parameter.W_T{ll}(i,j)=parameter.W_T{ll}(i,j)+e; 307 | value2=(cost1-cost2)/(2*e); 308 | value1 309 | value2 310 | value1-value2 311 | end 312 | 313 | function 
check_source_W(value1,ll,i,j,batch,parameter) 314 | e=0.001; 315 | parameter.W_S{ll}(i,j)=parameter.W_S{ll}(i,j)+e; 316 | [lstms,h,c]=Forward(batch,parameter,1); 317 | [cost1,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 318 | parameter.W_S{ll}(i,j)=parameter.W_S{ll}(i,j)-2*e; 319 | [lstms,h,c]=Forward(batch,parameter,1); 320 | [cost2,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 321 | parameter.W_S{ll}(i,j)=parameter.W_S{ll}(i,j)+e; 322 | value2=(cost1-cost2)/(2*e); 323 | value1 324 | value2 325 | value1-value2 326 | end 327 | 328 | 329 | function check_vect(value1,i,j,batch,parameter) 330 | e=0.001; 331 | parameter.vect(i,j)=parameter.vect(i,j)+e; 332 | [lstms,h,c]=Forward(batch,parameter,1); 333 | [cost1,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 334 | parameter.vect(i,j)=parameter.vect(i,j)-2*e; 335 | [lstms,h,c]=Forward(batch,parameter,1); 336 | [cost2,grad]=softmax(h(parameter.layer_num,:),batch,parameter); 337 | parameter.vect(i,j)=parameter.vect(i,j)+e; 338 | value2=(cost1-cost2)/(2*e); 339 | value1 340 | value2 341 | value1-value2 342 | end 343 | 344 | function[norm]=computeGradNorm(grad,parameter) %compute gradient norm 345 | norm=0; 346 | for ii=1:parameter.layer_num 347 | norm=norm+double(sum(grad.W_S{ii}(:).^2)); 348 | norm=norm+double(sum(grad.W_T{ii}(:).^2)); 349 | end 350 | norm=sqrt(norm); 351 | end 352 | -------------------------------------------------------------------------------- /Standard_LSTM/decode_beam.m: -------------------------------------------------------------------------------- 1 | function[]=decode_beam(parameter,TestBatches,filename) 2 | disp('decode') 3 | parameter.beamSize=7; 4 | for batch_index=1:length(TestBatches) 5 | batch_index 6 | batch=TestBatches{batch_index}; 7 | max_length=floor(batch.MaxLenSource*1); 8 | Word=batch.Word; 9 | N=size(Word,1); 10 | SourceLength=batch.SourceLength; 11 | [lstms,last_h_t,last_c_t]=Forward(batch,parameter,0); 12 | numElements =N*parameter.beamSize; 13 | translations=-zeroMatrix([N,max_length]); 14 | [first_scores,first_words]=BeamStep(parameter,last_h_t{parameter.layer_num},1); 15 | beamHistory=oneMatrix([numElements,max_length]); 16 | beamHistory(:,1)=first_words(:); 17 | beamScores=first_scores(:)'; 18 | beamStates=cell(parameter.layer_num,1); 19 | for ll=1:parameter.layer_num 20 | beamStates{ll}.c_t=reshape(repmat(last_c_t{ll},parameter.beamSize,1),parameter.hidden,numElements); 21 | beamStates{ll}.h_t=reshape(repmat(last_h_t{ll},parameter.beamSize,1),parameter.hidden,numElements); 22 | end 23 | decoded=zeros(1,N); 24 | INDECE=oneMatrix([1,numElements]); 25 | for position=1:max_length 26 | words=beamHistory(:,position); 27 | for ll=1:parameter.layer_num 28 | if ll == 1 29 | x_t=parameter.vect(:,words); 30 | else 31 | x_t=beamStates{ll-1}.h_t; 32 | end 33 | h_t_1 = beamStates{ll}.h_t; 34 | c_t_1 = beamStates{ll}.c_t; 35 | [beamStates{ll}, h_t, c_t]=lstmUnit(parameter.W_T{ll},parameter,x_t,h_t_1, c_t_1, ll, -1,0); 36 | beamStates{ll}.h_t = h_t; 37 | beamStates{ll}.c_t = c_t; 38 | end 39 | [all_next_scores,all_next_words]=BeamStep(parameter,beamStates{parameter.layer_num}.h_t,0); 40 | all_next_scores=bsxfun(@plus,all_next_scores,beamScores); 41 | all_next_scores=reshape(all_next_scores,[parameter.beamSize*parameter.beamSize,N]); 42 | all_next_words=reshape(all_next_words,[parameter.beamSize*parameter.beamSize,N]); 43 | [all_next_scores,indices]=sort(all_next_scores,'descend'); 44 | for sentId=1:N 45 | if decoded(sentId)==1 continue; end 46 | sorted_Indices = indices(:, sentId); 47 
| sorted_next_words=all_next_words(sorted_Indices,sentId); 48 | end_index=find(sorted_next_words(1:parameter.beamSize)==parameter.stop,1); 49 | if sorted_next_words(1)==parameter.stop&&position>floor(SourceLength(sentId)/2) 50 | decoded(sentId)=1; 51 | end_index=sorted_Indices(1); 52 | previous_index=floor((end_index-1)/parameter.beamSize)+1; 53 | translations(sentId,1:position+1)=[beamHistory((sentId-1)*parameter.beamSize+previous_index,1:position),parameter.stop]; 54 | continue; 55 | end 56 | if position==max_length 57 | end_index=sorted_Indices(1); 58 | previous_index=floor((end_index-1)/parameter.beamSize)+1; 59 | translations(sentId,1:position+1)=[beamHistory((sentId-1)*parameter.beamSize+previous_index,1:position),sorted_next_words(1)]; 60 | continue; 61 | end 62 | next_word_index=find(sorted_Indices~=parameter.stop,parameter.beamSize); 63 | beamScores(parameter.beamSize*(sentId-1)+1:parameter.beamSize*sentId)=all_next_scores(next_word_index,sentId); 64 | previous_index=(sentId-1)*parameter.beamSize+ floor((sorted_Indices(next_word_index)-1)/parameter.beamSize)+1; 65 | T=(sentId-1)*parameter.beamSize+1:(sentId-1)*parameter.beamSize+parameter.beamSize; 66 | beamHistory(T,:)=beamHistory(previous_index,:); 67 | beamHistory(T,position+1)=sorted_next_words(1:parameter.beamSize); 68 | INDECE(T)=previous_index; 69 | end 70 | if sum(decoded)==N 71 | break; 72 | end 73 | for ll=1:parameter.layer_num 74 | beamStates{ll}.c_t=beamStates{ll}.c_t(:,INDECE); 75 | beamStates{ll}.h_t=beamStates{ll}.h_t(:,INDECE); 76 | end 77 | end 78 | 79 | for senId=1:N 80 | vector=translations(senId,:); 81 | vector=vector(1:floor(1*batch.SourceLength(senId))); 82 | stop_sign=find(vector==parameter.stop); 83 | if length(stop_sign)==0 84 | dlmwrite(filename,vector,'delimiter',' ','-append'); 85 | else 86 | dlmwrite(filename,vector(1:stop_sign-1),'delimiter',' ','-append'); 87 | end 88 | end 89 | end 90 | end 91 | 92 | function[select_logP,select_words]=BeamStep(parameter,h_t,isFirst) 93 | if isFirst==1 scores=parameter.soft_W(2:end-1,:)*h_t; 94 | else scores=parameter.soft_W(2:end,:)*h_t; 95 | end 96 | mx = max(scores); 97 | scores = bsxfun(@minus, scores, mx); 98 | logP=bsxfun(@minus, scores, log(sum(exp(scores)))); 99 | [sortedLogP, sortedWords]=sort(logP, 'descend'); 100 | select_words=1+sortedWords(1:parameter.beamSize, :); 101 | select_logP=sortedLogP(1:parameter.beamSize, :); 102 | probs=exp(scores); 103 | norms = sum(probs, 1); 104 | probs=bsxfun(@rdivide, probs, norms); 105 | end 106 | -------------------------------------------------------------------------------- /Standard_LSTM/decode_greedy.m: -------------------------------------------------------------------------------- 1 | function[Translations]=decode_greedy(parameter,TestBatches,filename) 2 | % greedy decode 3 | disp('decode') 4 | Translations={}; 5 | for batch_index=1:length(TestBatches) 6 | if mod(batch_index,20)==0 7 | batch_index 8 | end 9 | batch=TestBatches{batch_index}; 10 | max_length=floor(batch.MaxLenSource*1.5); 11 | Word=batch.Word; 12 | N=size(Word,1); 13 | SourceLength=batch.SourceLength; 14 | [lstms,all_h_t,all_c_t]=Forward(batch,parameter,0); % obtain vectors for source sentences 15 | last_h_t=all_h_t(:,size(Word,2)); 16 | last_c_t=all_c_t(:,size(Word,2)); 17 | first_words=BeamStep(parameter,last_h_t{parameter.layer_num},1); 18 | beamHistory=oneMatrix([N,max_length]); % history 19 | beamHistory(:,1)=first_words(:); % decode first words 20 | beamStates=cell(parameter.layer_num,1); 21 | for ll=1:parameter.layer_num 22 | 
beamStates{ll}.c_t=last_c_t{ll}; 23 | beamStates{ll}.h_t=last_h_t{ll}; 24 | end 25 | 26 | for position=1:max_length 27 | words=beamHistory(:,position); 28 | for ll=1:parameter.layer_num 29 | if ll == 1 30 | x_t=parameter.vect(:,words); 31 | else 32 | x_t=beamStates{ll-1}.h_t; 33 | end 34 | h_t_1 = beamStates{ll}.h_t; 35 | c_t_1 = beamStates{ll}.c_t; 36 | [beamStates{ll}, h_t, c_t]=lstmUnit(parameter.W_T{ll},parameter,x_t,h_t_1, c_t_1, ll, -1,0); % lstmUnit, get current hidden vectors 37 | beamStates{ll}.h_t = h_t; 38 | beamStates{ll}.c_t = c_t; 39 | end 40 | all_next_words=BeamStep(parameter,beamStates{parameter.layer_num}.h_t,0); 41 | % predict next words given current hidden time step vectors 42 | beamHistory(:,position+1)=all_next_words; 43 | % add them to decoding history 44 | end 45 | for senId=1:N 46 | vector=beamHistory(senId,:); 47 | vector=vector(1:floor(1.5*batch.SourceLength(senId))); 48 | stop_sign=find(vector==parameter.stop); 49 | if length(stop_sign)==0 50 | dlmwrite(filename,vector,'delimiter',' ','-append'); 51 | else 52 | dlmwrite(filename,vector(1:stop_sign-1),'delimiter',' ','-append'); 53 | end 54 | end 55 | end 56 | end 57 | 58 | function[select_words]=BeamStep(parameter,h_t,isFirst) % next word prediction given hidden vectors 59 | if isFirst==1 scores=parameter.soft_W(1:end-1,:)*h_t; 60 | else scores=parameter.soft_W*h_t; 61 | end 62 | mx = max(scores); 63 | scores = bsxfun(@minus, scores, mx); 64 | logP=bsxfun(@minus, scores, log(sum(exp(scores)))); 65 | [sortedLogP, sortedWords]=sort(logP, 'descend'); 66 | select_words=sortedWords(1, :); 67 | 68 | probs_=exp(scores); 69 | norms = sum(probs_, 1); 70 | probs=bsxfun(@rdivide, probs_, norms); 71 | end 72 | -------------------------------------------------------------------------------- /Standard_LSTM/lstmUnit.m: -------------------------------------------------------------------------------- 1 | function[lstm, h_t, c_t]=lstmUnit(W,parameter,x_t,h_t_1, c_t_1, ll, t,isTraining)%lstm unit calculation 2 | if parameter.dropout~=0&&isTraining==1 3 | if parameter.CheckGrad==1 4 | if ll==1 5 | drop_left=repmat(parameter.drop_left_1,1,size(x_t,2)); 6 | else 7 | drop_left=repmat(parameter.drop_left,1,size(x_t,2)); 8 | end 9 | else 10 | drop_left=randSimpleMatrix(size(x_t))<1-parameter.dropout; 11 | end 12 | x_t=x_t.*drop_left; 13 | end 14 | input=[x_t; h_t_1]; 15 | ifoa_linear = W*input; 16 | ifo_gate=parameter.nonlinear_gate_f(ifoa_linear(1:3*parameter.hidden,:)); 17 | i_gate = ifo_gate(1:parameter.hidden, :); 18 | f_gate = ifo_gate(parameter.hidden+1:2*parameter.hidden,:); 19 | o_gate =ifo_gate(parameter.hidden*2+1:3*parameter.hidden,:); 20 | a_signal = parameter.nonlinear_f(ifoa_linear(3*parameter.hidden+1:4*parameter.hidden,:)); 21 | c_t=f_gate.*c_t_1 + i_gate.*a_signal; 22 | if parameter.lstm_out_tanh 23 | f_c_t = parameter.nonlinear_f(c_t); 24 | h_t = o_gate.*f_c_t; 25 | else 26 | h_t = o_gate.*c_t; 27 | end 28 | if parameter.clip==1%if values are too large, clip the values. 
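% clipForward (defined at the bottom of this file) hard-limits each entry of
% c_t and h_t to the range [-50, 50] and casts to single precision, keeping the
% recurrent state values bounded.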
29 | c_t = arrayfun(@clipForward, c_t); 30 | h_t = arrayfun(@clipForward, h_t); 31 | end 32 | 33 | lstm.input = input; 34 | lstm.i_gate = i_gate; 35 | lstm.f_gate = f_gate; 36 | lstm.o_gate = o_gate; 37 | lstm.a_signal = a_signal; 38 | if parameter.lstm_out_tanh 39 | lstm.f_c_t = f_c_t; 40 | else 41 | lstm.c_t = c_t; 42 | end 43 | if isTraining==1&¶meter.dropout~=0 44 | lstm.drop_left=drop_left; 45 | end 46 | end 47 | 48 | function [clippedValue] = clipForward(x) 49 | if x>50 clippedValue = single(50); 50 | elseif x<-50 clippedValue = single(-50); 51 | else clippedValue =single(x); 52 | end 53 | end 54 | 55 | -------------------------------------------------------------------------------- /Standard_LSTM/softmax.m: -------------------------------------------------------------------------------- 1 | function[total_cost,grad]=softmax(topHidVecs,batch,parameter) 2 | MaxLenSource=batch.MaxLenSource; 3 | MaxLenTarget=batch.MaxLenTarget; 4 | MaxLen=batch.MaxLen; 5 | Word=batch.Word; 6 | Mask=batch.Mask; 7 | 8 | step_size=ceil(500/size(Word,1));%calculate softmax in bulk 9 | total_cost=0; 10 | grad.soft_W=zeroMatrix(size(parameter.soft_W)); 11 | step_size=1; 12 | for Begin=MaxLenSource+1:step_size:MaxLen 13 | End=Begin+step_size-1; 14 | if End>MaxLen 15 | End=MaxLen; 16 | end 17 | N_examples=size(Word,1)*(End-Begin+1); 18 | predict_Words=reshape(Word(:,Begin:End),1,N_examples); 19 | mask=reshape(Mask(:,Begin:End),1,N_examples);%remove positions with no words 20 | h_t=[topHidVecs{:,Begin-1:End-1}]; 21 | [cost,grad_softmax_h]=batchSoftmax(h_t,mask,predict_Words,parameter); 22 | total_cost=total_cost+cost; 23 | grad.soft_W=grad.soft_W+grad_softmax_h.soft_W; 24 | for j=Begin:End 25 | grad.ht{1,j-MaxLenSource}=grad_softmax_h.h(:,(j-Begin)*size(Word,1)+1:(j-Begin+1)*size(Word,1)); 26 | end 27 | end 28 | total_cost=total_cost/size(Word,1); 29 | clear predict_Words; clear mask; 30 | clear grad_softmax_h; 31 | end 32 | 33 | function[cost,softmax_grad]=batchSoftmax(h_t,mask,predict_Words,parameter) 34 | %softmax matrix 35 | unmaskedIds=find(mask==1); 36 | scores=parameter.soft_W*h_t; 37 | mx = max(scores,[],1); 38 | scores=bsxfun(@minus,scores,mx); 39 | scores=exp(scores); 40 | norms = sum(scores, 1); 41 | if length(find(mask==0))==0 42 | scores=bsxfun(@rdivide, scores, norms); 43 | else 44 | scores=bsxfun(@times,scores, mask./norms); 45 | end 46 | scoreIndices = sub2ind(size(scores),predict_Words(unmaskedIds),unmaskedIds); 47 | cost=sum(-log(scores(scoreIndices))); 48 | scores(scoreIndices) =scores(scoreIndices) - 1; 49 | softmax_grad.soft_W=scores*h_t'; %(N_word*examples)*(examples*diemsnion)=N_word*diemsnion; 50 | softmax_grad.h=(scores'*parameter.soft_W)';%(diemsnion*N_word)*(N_word*examples)=dimension*examples 51 | clear scores; 52 | clear norms; 53 | end 54 | -------------------------------------------------------------------------------- /Standard_LSTM/test.m: -------------------------------------------------------------------------------- 1 | function[]=test() 2 | clear; 3 | %matlabpool open 16 4 | addpath('../misc'); 5 | n= gpuDeviceCount; 6 | parameter.isGPU = 0; 7 | 8 | if n>0 % GPU exists 9 | parameter.isGPU = 1; 10 | gpuDevice(2); 11 | else 12 | print('no gpu ! ! ! ! 
!'); 13 | end 14 | 15 | parameter.dimension=1000; 16 | parameter.alpha=0.1; 17 | parameter.layer_num=4; 18 | parameter.hidden=1000; 19 | parameter.lstm_out_tanh=0; 20 | parameter.Initial=0.08; 21 | parameter.dropout=0.2; 22 | params.lstm_out_tanh=0; 23 | parameter.isTraining=1; 24 | parameter.CheckGrad=0; 25 | parameter.PreTrainEmb=0; 26 | parameter.update_embedding=1; 27 | parameter.batch_size=32; 28 | parameter.Source_Target_Same_Language=1; 29 | parameter.maxGradNorm=1; 30 | parameter.clip=1; 31 | 32 | parameter.lr=5; 33 | parameter.read=1; 34 | 35 | if parameter.Source_Target_Same_Language==1 36 | parameter.Vocab=25001; 37 | parameter.stop=parameter.Vocab; 38 | else 39 | parameter.SourceVocab=20; 40 | parameter.TargetVocab=20; 41 | parameter.stop=parameter.TargetVocab; 42 | parameter.Vocab=parameter.SourceVocab+parameter.TargetVocab; 43 | end 44 | 45 | if parameter.CheckGrad==1¶meter.dropout~=0 46 | parameter.drop_left_1=randSimpleMatrix([parameter.dimension,1])<1-parameter.dropout; 47 | parameter.drop_left=randSimpleMatrix([parameter.hidden,1])<1-parameter.dropout; 48 | end 49 | %alpha: learning rate for minibatch 50 | 51 | parameter.nonlinear_gate_f = @sigmoid; 52 | parameter.nonlinear_gate_f_prime = @sigmoidPrime; 53 | parameter.nonlinear_f = @tanh; 54 | parameter.nonlinear_f_prime = @tanhPrime; 55 | 56 | 57 | train_source_file='../data/train_permute.txt'; 58 | train_target_file='../data/train_permute.txt'; 59 | test_source_file='../data/test.txt'; 60 | test_target_file='../data/test.txt'; 61 | 62 | parameter=ReadParameter(parameter); 63 | Test=ReadTestData(test_source_file,parameter); 64 | TestBatches=GetTestBatch(Test,32,parameter); 65 | disp('decode begin')%greedy decoding 66 | decode_greedy(parameter,TestBatches,'a.txt'); 67 | end 68 | 69 | function[Batches]=GetTestBatch(Source,batch_size,parameter) 70 | N_batch=ceil(length(Source)/batch_size); 71 | Batches={}; 72 | for i=1:N_batch 73 | Begin=batch_size*(i-1)+1; 74 | End=batch_size*i; 75 | if End>length(Source) 76 | End=length(Source); 77 | end 78 | current_batch=Batch(); 79 | for j=Begin:End 80 | source_length=length(Source{j}); 81 | current_batch.SourceLength=[current_batch.SourceLength,source_length]; 82 | if source_length>current_batch.MaxLenSource 83 | current_batch.MaxLenSource=source_length; 84 | end 85 | end 86 | current_batch.Word=ones(End-Begin+1,current_batch.MaxLenSource); 87 | Mask=ones(End-Begin+1,current_batch.MaxLenSource); 88 | for j=Begin:End 89 | source_length=length(Source{j}); 90 | current_batch.Word(j-Begin+1,current_batch.MaxLenSource-source_length+1:current_batch.MaxLenSource)=Source{j}; 91 | Mask(j-Begin+1,1:current_batch.MaxLenSource-source_length)=0; 92 | end 93 | for j=1:current_batch.MaxLenSource 94 | current_batch.Delete{j}=find(Mask(:,j)==0); 95 | current_batch.Left{j}=find(Mask(:,j)==1); 96 | end 97 | current_batch.Mask=Mask; 98 | Batches{i}=current_batch; 99 | end 100 | end 101 | 102 | 103 | function[Source]=ReadTestData(source_file,parameter) 104 | fd_s=fopen(source_file); 105 | Source={}; 106 | i=0; 107 | tline_s = fgets(fd_s); 108 | while ischar(tline_s) 109 | i=i+1; 110 | text_s=deblank(tline_s); 111 | if parameter.Source_Target_Same_Language~=1 112 | Source{i}=wrev(str2num(text_s))+parameter.TargetVocab; 113 | else 114 | Source{i}=wrev(str2num(text_s)); 115 | end 116 | tline_s = fgets(fd_s); 117 | end 118 | end 119 | 120 | function[parameter]=ReadParameter(parameter) 121 | for ll=1:parameter.layer_num 122 | W_file=strcat('save_parameter/_W_S',num2str(ll)); 123 | 
parameter.W_S{ll}=gpuArray(load(W_file)); 124 | W_file=strcat('save_parameter/_W_T',num2str(ll)); 125 | parameter.W_T{ll}=gpuArray(load(W_file)); 126 | end 127 | parameter.vect=gpuArray(load('save_parameter/_v')); 128 | parameter.soft_W=gpuArray(load('save_parameter/_soft_W')); 129 | end 130 | 131 | -------------------------------------------------------------------------------- /a.txt: -------------------------------------------------------------------------------- 1 | done 2 | -------------------------------------------------------------------------------- /hier_LSTM/Backward.m: -------------------------------------------------------------------------------- 1 | function[grad]=Backward(current_batch,result,grad,parameter) 2 | grad=Backward_Target(current_batch,result,grad,parameter); 3 | % backward propagatin for target sentences 4 | grad=Backward_Source_Sen(current_batch,result,grad,parameter); 5 | % backward propagatin for sources sentences 6 | end 7 | 8 | function[grad]=Backward_Source_Sen(docbatch,result,grad,parameter) 9 | num_sentence=size(docbatch.target_sen_matrix,1); 10 | for ll=1:parameter.layer_num 11 | dh{ll}=grad.source_h{ll,1}; % dh 12 | dc{ll}=grad.source_c{ll,1}; % dc 13 | end 14 | N=size(docbatch.source_sen_matrix,1); 15 | % number of documents in current batches 16 | T=size(docbatch.source_sen_matrix,2); 17 | % total time steps at sentence level 18 | zeroState=zeroMatrix([parameter.hidden,N]); 19 | clear grad.source_h; 20 | clear grad.source_c; 21 | d_source_sen=zeroMatrix([parameter.hidden,docbatch.num_of_source_sen]); 22 | for sen_tt=T:-1:1 23 | for ll=parameter.layer_num:-1:1 24 | if sen_tt==1 25 | c_t_1 =zeroState; 26 | else 27 | c_t_1=result.c_t_source_sen{ll,sen_tt-1}; 28 | end 29 | c_t=result.c_t_source_sen{ll,sen_tt}; 30 | lstm=result.lstms_source_sen{ll,sen_tt}; 31 | dh{ll}(:,docbatch.source_delete{sen_tt})=0; 32 | dc{ll}(:,docbatch.source_delete{sen_tt})=0; 33 | % set values to zero for positions without words 34 | if sen_tt==1 35 | is_begin=1; 36 | else 37 | is_begin=0; 38 | end 39 | [lstm_grad]=lstmUnitGrad(lstm,c_t, c_t_1,dc{ll},dh{ll},ll,sen_tt,zeroState,parameter,parameter.Sen_S{ll},is_begin); 40 | % backward calculation for lstm unit at source sentence level 41 | lstm_grad.input(:,docbatch.source_delete{sen_tt})=0; 42 | dc{ll}(:,docbatch.source_delete{sen_tt})=0; 43 | dc{ll}=lstm_grad.dc; 44 | grad.Sen_S{ll}=grad.Sen_S{ll}+lstm_grad.W; 45 | 46 | lstm_grad.input(:,docbatch.source_delete{sen_tt})=0; 47 | dh{ll}=lstm_grad.input(end-parameter.hidden+1:end,:); 48 | dc{ll}=lstm_grad.dc; 49 | if ll~=1 50 | dh{ll-1}=dh{ll-1}+lstm_grad.input(1:parameter.hidden,:); 51 | else 52 | sen_index=docbatch.source_sen_matrix(:,sen_tt); 53 | d_source_sen(:,sen_index(docbatch.source_left{sen_tt}))=lstm_grad.input(1:parameter.hidden,docbatch.source_left{sen_tt}); 54 | % if ll==1, gradient for emebddings at sentence level 55 | end 56 | end 57 | end 58 | clear dh; 59 | clear dc; 60 | clear lstm_grad; 61 | clear result.c_t_source_sen; 62 | clear result.lstms_source_sen; 63 | 64 | grad=DealSentenceSource(d_source_sen,docbatch,result,grad,parameter); 65 | % backward propagation to source sentences at word le 66 | for ll=1:parameter.layer_num 67 | grad.Word_S{ll}=grad.Word_S{ll}/num_sentence; 68 | grad.Sen_S{ll}=grad.Sen_S{ll}/num_sentence; 69 | end 70 | grad.vect=grad.vect/num_sentence; 71 | end 72 | 73 | function[grad]=DealSentenceSource(d_source_sen,docbatch,result,grad,parameter) 74 | % backward propagation to source sentences at word level 75 | wordCount = 0; 76 | for 
i=1:length(docbatch.source_smallBatch) 77 | source_smallBatch=docbatch.source_smallBatch{i}; 78 | Word=source_smallBatch.Word; 79 | N=size(Word,1); 80 | % number of sentences for current batch 81 | T=size(Word,2); 82 | % total number of time steps 83 | zeroState=zeroMatrix([parameter.hidden,N]); 84 | for ll=1:parameter.layer_num 85 | dh{ll}=zeroState; 86 | dc{ll}=zeroState; 87 | end 88 | for word_tt=T:-1:1 89 | unmaskedIds=source_smallBatch.Left{word_tt}; 90 | for ll=parameter.layer_num:-1:1 91 | if word_tt==T&&ll==parameter.layer_num 92 | dh{ll}=d_source_sen(:,source_smallBatch.Sen); 93 | end 94 | if word_tt==1 95 | c_t_1 =zeroState; 96 | else c_t_1=result.c_t_source_word{i}{ll,word_tt-1}; 97 | end 98 | c_t=result.c_t_source_word{i}{ll,word_tt}; 99 | lstm=result.lstms_source_word{i}{ll,word_tt}; 100 | if word_tt==1 101 | is_begin=1; 102 | else 103 | is_begin=0; 104 | end 105 | [lstm_grad]=lstmUnitGrad(lstm,c_t, c_t_1,dc{ll},dh{ll},ll,word_tt,zeroState,parameter,parameter.Word_S{ll},is_begin); 106 | % backward calculation for lstm unit at source word level 107 | grad.Word_S{ll}=grad.Word_S{ll}+lstm_grad.W; 108 | lstm_grad.input(:,source_smallBatch.Delete{word_tt})=0; 109 | dh{ll}=lstm_grad.input(end-parameter.hidden+1:end,:); 110 | dc{ll}=lstm_grad.dc; 111 | if ll~=1 112 | dh{ll-1}=dh{ll-1}+lstm_grad.input(1:parameter.hidden,:); 113 | else 114 | % gradient for word tokens 115 | embIndices=Word(unmaskedIds,word_tt)'; 116 | embGrad = lstm_grad.input(1:parameter.dimension,unmaskedIds); 117 | numWords = length(embIndices); 118 | allEmbIndices(wordCount+1:wordCount+numWords) = embIndices; 119 | allEmbGrads(:, wordCount+1:wordCount+numWords) = embGrad; 120 | wordCount = wordCount + numWords; 121 | end 122 | end 123 | end 124 | end 125 | allEmbGrads(:, wordCount+1:end) = []; 126 | allEmbIndices(wordCount+1:end) = []; 127 | [W_emb,indices] = aggregateMatrix(allEmbGrads, allEmbIndices); 128 | grad.vect(:,indices)=grad.vect(:,indices)+W_emb; 129 | clear allEmbIndices; 130 | clear allEmbGrads; 131 | clear W_emb; 132 | clear indices; 133 | clear result.c_t_source_word; 134 | clear result.h_t_source_word; 135 | clear result.lstms_source_word; 136 | end 137 | 138 | 139 | function[grad]=Backward_Target(current_batch,result,grad,parameter) 140 | % backward propagatin for target sentences 141 | num_sentence=size(current_batch.target_sen_matrix,1); 142 | for ll=1:parameter.layer_num 143 | grad.Word_T{ll}=zeroMatrix(size(parameter.Word_T{ll})); 144 | grad.Word_S{ll}=zeroMatrix(size(parameter.Word_S{ll})); 145 | grad.Sen_S{ll}=zeroMatrix(size(parameter.Sen_S{ll})); 146 | grad.Sen_T{ll}=zeroMatrix(size(parameter.Sen_T{ll})); 147 | end 148 | grad.vect=zeroMatrix(size(parameter.vect)); 149 | 150 | T_target_sen=size(current_batch.target_sen_matrix,2); 151 | zeroState=zeroMatrix([parameter.hidden, size(current_batch.target_sen_matrix,1)]); 152 | L=[]; 153 | 154 | for sen_tt=T_target_sen-1:-1:0 155 | target_sen=result.Target_sen{sen_tt+1}; 156 | grad=DealSentenceTarget(current_batch,target_sen,sen_tt+1,grad,L,parameter,result); 157 | % back propagation for target sentences 158 | grad.vect(:,grad.indices)=grad.vect(:,grad.indices)+grad.W_emb; 159 | clear grad.indices; 160 | clear grad.W_emb; 161 | if sen_tt==0 162 | continue; 163 | end 164 | for ll=parameter.layer_num:-1:1 165 | if sen_tt==1 166 | dim=size(result.c_t_source_sen); 167 | c_t_1=result.c_t_source_sen{ll,dim(2)}; 168 | else c_t_1=result.c_t_target_sen{ll,sen_tt-1}; 169 | end 170 | c_t=result.c_t_target_sen{ll,sen_tt}; 171 | 
lstm=result.lstms_target_sen{ll,sen_tt}; 172 | [lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1,grad.target_sen_c{ll,sen_tt},grad.target_sen_h{ll,sen_tt},ll,sen_tt,zeroState,parameter,parameter.Sen_T{ll},0); 173 | % backpropagation for current lstm unit at sentence level 174 | lstm_grad.input(:,current_batch.target_delete{sen_tt+1})=0; 175 | lstm_grad.dc(:,current_batch.target_delete{sen_tt+1})=0; 176 | if sen_tt~=1 177 | grad.target_sen_h{ll,sen_tt-1}=lstm_grad.input(end-parameter.hidden+1:end,:)+grad.target_sen_h{ll,sen_tt-1}; 178 | % if sentence index is not 1, gradient update for sentnence from previous time-step 179 | grad.target_sen_c{ll,sen_tt-1}=lstm_grad.dc+grad.target_sen_c{ll,sen_tt-1}; 180 | else 181 | % else gradient update for vector representations from source sentences 182 | grad.source_h{ll,1}=grad.source_h{ll,1}+lstm_grad.input(end-parameter.hidden+1:end,:); 183 | grad.source_c{ll,1}=grad.source_c{ll,1}+lstm_grad.dc; 184 | end 185 | grad.Sen_T{ll}=grad.Sen_T{ll}+lstm_grad.W; 186 | if ll~=1 187 | grad.target_sen_h{ll-1,sen_tt}=lstm_grad.input(1:parameter.hidden,:)+grad.target_sen_h{ll-1,sen_tt}; 188 | else 189 | L=lstm_grad.input(1:parameter.hidden,:); 190 | end 191 | end 192 | end 193 | for ll=1:parameter.layer_num 194 | grad.Word_T{ll}=grad.Word_T{ll}/num_sentence; 195 | grad.Sen_T{ll}=grad.Sen_T{ll}/num_sentence; 196 | end 197 | clear grad.target_sen_c; 198 | clear grad.target_sen_h; 199 | clear zeroState; 200 | clear lstm_grad; 201 | clear c_t_1; 202 | clear c_t; 203 | clear lstm; 204 | clear grad.ht; 205 | clear lstm_grad; 206 | end 207 | 208 | function[grad]=DealSentenceTarget(docbatch,target_sen,sen_tt,grad,end_grad,parameter,result) 209 | % back propagation to target sentences 210 | target_word=docbatch.target_word{sen_tt}; 211 | Word=target_word.Word; 212 | Word_Delete=target_word.Delete; 213 | Word_Left=target_word.Left; 214 | N=size(Word,1); 215 | T=size(Word,2); 216 | zeroState=zeroMatrix([parameter.hidden,N]); 217 | for ll=parameter.layer_num:-1:1 218 | dh{ll} = zeroState; 219 | dc{ll} = zeroState; 220 | end 221 | wordCount = 0; 222 | numInputWords=size(Word,1)*size(Word,2); 223 | allEmbGrads=zeroMatrix([parameter.dimension,numInputWords]); 224 | allEmbIndices=[]; 225 | for word_tt=T:-1:1 226 | if word_tt==T &&length(end_grad)==0 227 | continue; 228 | end 229 | unmaskedIds=Word_Left{word_tt}; 230 | for ll=parameter.layer_num:-1:1 231 | if ll==parameter.layer_num 232 | if word_tt==T 233 | if length(end_grad)~=0 234 | dh{ll}=dh{ll}+end_grad; 235 | end 236 | else 237 | dh{ll}=dh{ll}+grad.ht{sen_tt}{:,word_tt}; 238 | end 239 | end 240 | if word_tt==1&& sen_tt==1 241 | dim=size(result.c_t_source_sen); 242 | c_t_1=result.c_t_source_sen{ll,dim(2)}; 243 | elseif word_tt==1&& sen_tt~=1 244 | c_t_1=result.c_t_target_sen{ll,sen_tt-1}; 245 | else c_t_1=target_sen.c_t_target_word{ll,word_tt-1}; 246 | end 247 | c_t =target_sen.c_t_target_word{ll,word_tt}; 248 | lstm =target_sen.lstms{ll,word_tt}; 249 | %dh{ll}(:,Word_Delete{word_tt})=0; 250 | 251 | if length(Word_Delete{word_tt})~=0 252 | dh{ll}(:,Word_Delete{word_tt})=0; 253 | dc{ll}(:,Word_Delete{word_tt})=0; 254 | end 255 | % set values to 0 256 | 257 | [lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1, dc{ll}, dh{ll},ll,word_tt,zeroState,parameter,parameter.Word_T{ll},0); 258 | % back propagation for current word level lstm unit 259 | grad.Word_T{ll}=grad.Word_T{ll}+lstm_grad.W; 260 | dc{ll} = lstm_grad.dc; 261 | dc{ll}(:,Word_Delete{word_tt})=0; 262 | dh{ll} = lstm_grad.input(end-parameter.hidden+1:end,:); 263 | if 
length(end_grad)~=0 && length(Word_Delete{word_tt})~=0&&ll==parameter.layer_num 264 | dh{ll}(:,Word_Delete{word_tt})=end_grad(:,Word_Delete{word_tt}); 265 | % gradients for positions that have no words stay the same 266 | end 267 | 268 | 269 | if word_tt==1 270 | if sen_tt==1 271 | grad.source_h{ll,1}=grad.source_h{ll,1}+dh{ll}; 272 | grad.source_c{ll,1}=grad.source_c{ll,1}+dc{ll}; 273 | else 274 | grad.target_sen_h{ll,sen_tt-1}=grad.target_sen_h{ll,sen_tt-1}+dh{ll}; 275 | grad.target_sen_c{ll,sen_tt-1}=grad.target_sen_c{ll,sen_tt-1}+dc{ll}; 276 | end 277 | end 278 | if ll==1 279 | % gradients for word embeddings 280 | embIndices=Word(unmaskedIds,word_tt)'; 281 | embGrad = lstm_grad.input(1:parameter.dimension,unmaskedIds); 282 | numWords = length(embIndices); 283 | allEmbIndices(wordCount+1:wordCount+numWords) = embIndices; 284 | allEmbGrads(:, wordCount+1:wordCount+numWords) = embGrad; 285 | wordCount = wordCount + numWords; 286 | else 287 | dh{ll-1}(:,unmaskedIds)=dh{ll-1}(:,unmaskedIds)+lstm_grad.input(1:parameter.hidden,unmaskedIds); 288 | end 289 | end 290 | end 291 | allEmbGrads(:, wordCount+1:end) = []; 292 | allEmbIndices(wordCount+1:end) = []; 293 | [grad.W_emb, grad.indices] = aggregateMatrix(allEmbGrads, allEmbIndices); 294 | clear dc; 295 | clear dh; 296 | clear allEmbGrads; 297 | clear allEmbIndices; 298 | clear allEmbGrads; 299 | clear lstm_grad; 300 | end 301 | 302 | function[lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1, dc, dh, ll, t, zero_state,parameter,W,very_begin) 303 | % lstm gradient calculation 304 | dc = arrayfun(@plusMult, dc, lstm.o_gate, dh); 305 | do = arrayfun(@sigmoidPrimeTriple, lstm.o_gate, c_t, dh); 306 | di = arrayfun(@sigmoidPrimeTriple, lstm.i_gate, lstm.a_signal, dc); 307 | 308 | if very_begin==1 309 | df = zero_state; 310 | else 311 | df = arrayfun(@sigmoidPrimeTriple, lstm.f_gate, c_t_1, dc); 312 | end 313 | lstm_grad.dc = lstm.f_gate.*dc; 314 | dl = arrayfun(@tanhPrimeTriple, lstm.a_signal, lstm.i_gate, dc); 315 | d_ifoa = [di; df; do; dl]; 316 | lstm_grad.W = d_ifoa*lstm.input'; %dw 317 | lstm_grad.input=W'*d_ifoa; 318 | if parameter.dropout~=0 319 | if ll==1 320 | lstm_grad.input(1:parameter.dimension, :) = lstm_grad.input(1:parameter.dimension, :).*lstm.drop_left; 321 | else lstm_grad.input(1:parameter.hidden, :) = lstm_grad.input(1:parameter.hidden, :).*lstm.drop_left; 322 | end 323 | end 324 | if parameter.clip==1 325 | lstm_grad.input=arrayfun(@clipBackward, lstm_grad.input); 326 | lstm_grad.dc = arrayfun(@clipBackward, lstm_grad.dc); 327 | end 328 | clear dc; clear do; clear di; clear df; clear d_ifoa; 329 | end 330 | 331 | 332 | function [value] = plusTanhPrimeTriple(t, x, y, z) 333 | value = t + (1-x*x)*y*z; 334 | end 335 | function [value] = tanhPrimeTriple(x, y, z) 336 | value = (1-x*x)*y*z; 337 | end 338 | function [value] = plusMult(x, y, z) 339 | value = x + y*z; 340 | end 341 | function [value] = sigmoidPrimeTriple(x, y, z) 342 | value = x*(1-x)*y*z; 343 | end 344 | 345 | function [clippedValue] = clipBackward(x) 346 | if x>1000 clippedValue = single(1000); 347 | elseif x<-1000 clippedValue = single(-1000); 348 | else clippedValue =single(x); 349 | end 350 | end 351 | -------------------------------------------------------------------------------- /hier_LSTM/Batch.m: -------------------------------------------------------------------------------- 1 | classdef Batch < handle 2 | properties 3 | MaxLenSource=-1; 4 | MaxLenTarget=-1; 5 | MaxLen=-1; 6 | Word=[]; 7 | Delete={}; 8 | Left={}; 9 | Mask={}; 10 | Label=[]; 11 | SourceLength=[]; 12 
| N_word=0; 13 | end 14 | end 15 | 16 | 17 | -------------------------------------------------------------------------------- /hier_LSTM/Forward.m: -------------------------------------------------------------------------------- 1 | function[result]=Forward(docbatch,parameter,isTraining) 2 | [result]=Forward_Source_Word(docbatch,parameter,isTraining); 3 | % forward calculation at word level for each sentence 4 | [result]=Forward_Source_Sen(result,docbatch,parameter,isTraining); 5 | % forward calculation at sentence level for each document 6 | if isTraining==1 7 | [result]=Forward_Target(result,docbatch,parameter,isTraining); 8 | end 9 | end 10 | 11 | function[result]=Forward_Source_Word(docbatch,parameter,isTraining) 12 | % forward calculation at word level for each sentence 13 | sourceBatch=docbatch.source_smallBatch; 14 | result.source_sen_vector=[]; 15 | for i=1:length(sourceBatch) 16 | batch=sourceBatch{i}; 17 | T=batch.max_length; 18 | h_t_source_word=cell(parameter.layer_num,T); 19 | % store h_t 20 | result.c_t_source_word{i}=cell(parameter.layer_num,T); 21 | % store c_t 22 | result.lstms_source_word{i} = cell(parameter.layer_num,T); 23 | % store gate values for lstm units 24 | N=size(batch.Word,1); 25 | zeroState=zeroMatrix([parameter.hidden,N]); 26 | for ll=1:parameter.layer_num 27 | for tt=1:T 28 | h_t_source_word{ll,tt}=zeroState; 29 | result.c_t_source_word{i}{ll,tt}=zeroState; 30 | end 31 | end 32 | for t=1:T 33 | for ll=1:parameter.layer_num 34 | W=parameter.Word_S{ll}; 35 | % W for word level composition in source 36 | if t==1 37 | h_t_1=zeroState; 38 | c_t_1=zeroState; 39 | else 40 | c_t_1=result.c_t_source_word{i}{ll, t-1}; 41 | h_t_1=h_t_source_word{ll, t-1}; 42 | end 43 | if ll==1 44 | x_t=parameter.vect(:,batch.Word(:,t)); 45 | else 46 | x_t=h_t_source_word{ll-1,t}; 47 | end 48 | x_t(:,batch.Delete{t})=0; 49 | h_t_1(:,batch.Delete{t})=0; 50 | c_t_1(:,batch.Delete{t})=0; 51 | % set postion that do not have words to zero 52 | [result.lstms_source_word{i}{ll, t},h_t_source_word{ll, t},result.c_t_source_word{i}{ll, t}]=lstmUnit(W,parameter,x_t,h_t_1,c_t_1,ll,t,isTraining); 53 | % compute lstm unit 54 | if t==T && ll==parameter.layer_num 55 | result.source_sen_vector=[result.source_sen_vector,h_t_source_word{ll,t}]; 56 | % store vector embeddings for each source sentence 57 | clear h_t_source_word; 58 | end 59 | end 60 | end 61 | end 62 | clear x_t; 63 | clear h_t_1; 64 | clear c_t_1; 65 | end 66 | 67 | function[result]=Forward_Source_Sen(result,docbatch,parameter,isTraining) 68 | % forward calculation at sentence level for each document 69 | T=docbatch.max_source_sen; 70 | h_t_source_sen=cell(parameter.layer_num,T); 71 | result.c_t_source_sen=cell(parameter.layer_num,T); 72 | result.lstms_source_sen=cell(parameter.layer_num,T); 73 | result.source_sen=cell(parameter.layer_num,1); 74 | 75 | N=size(docbatch.source_sen_matrix,1); 76 | zeroState=zeroMatrix([parameter.hidden,N]); 77 | for ll=1:parameter.layer_num 78 | for tt=1:T 79 | h_t_source_sen{ll,tt}=zeroState; 80 | result.c_t_source_sen{ll,tt}=zeroState; 81 | end 82 | end 83 | for t=1:T 84 | for ll=1:parameter.layer_num 85 | W=parameter.Sen_S{ll}; 86 | % sentence-level compostion 87 | if t==1 88 | h_t_1=zeroState; 89 | c_t_1 =zeroState; 90 | else 91 | c_t_1 =result.c_t_source_sen{ll, t-1}; 92 | h_t_1 =h_t_source_sen{ll, t-1}; 93 | end 94 | if ll==1 95 | x_t=result.source_sen_vector(:,docbatch.source_sen_matrix(:,t)); 96 | % sentence-level embedding 97 | else 98 | x_t=h_t_source_sen{ll-1,t}; 99 | end 100 | 
x_t(:,docbatch.source_delete{t})=0; 101 | h_t_1(:,docbatch.source_delete{t})=0; 102 | c_t_1(:,docbatch.source_delete{t})=0; 103 | % set values for deleted postion to 0 104 | [result.lstms_source_sen{ll, t},h_t_source_sen{ll, t},result.c_t_source_sen{ll, t}]=lstmUnit(W,parameter,x_t,h_t_1,c_t_1,ll,t,isTraining); 105 | % lstm unit calculation at sentence level 106 | if t==T 107 | result.source_sen{ll,1}=h_t_source_sen{ll, t}; 108 | % store vector embeddings for documents 109 | end 110 | end 111 | end 112 | clear result.source_sen_vector; 113 | clear h_t_source_sen; 114 | clear x_t; 115 | clear h_t_1; 116 | clear c_t_1; 117 | end 118 | 119 | function[result]=Forward_Target(result,docbatch,parameter,isTraining) 120 | % forward for target documents 121 | T=docbatch.max_target_sen; 122 | result.h_t_target_sen=cell(parameter.layer_num,T); 123 | % sentence level 124 | result.c_t_target_sen=cell(parameter.layer_num,T); 125 | result.lstms_target_sen=cell(parameter.layer_num,T); 126 | N=size(docbatch.target_sen_matrix,1); 127 | zeroState=zeroMatrix([parameter.hidden,N]); 128 | 129 | for ll=1:parameter.layer_num 130 | for tt=1:T 131 | result.h_t_target_sen{ll,tt}=zeroState; 132 | result.c_t_target_sen{ll,tt}=zeroState; 133 | end 134 | end 135 | result.Target_sen={}; 136 | for sen_tt=1:T 137 | for ll=1:parameter.layer_num 138 | W=parameter.Sen_T{ll}; 139 | % sentnece compositions for target 140 | if sen_tt==1 141 | % if sentence index is 1, h_t_1 and c_t_1 are from outputs from source sentences 142 | h_t_1=result.source_sen{ll,1}; 143 | dim=size(result.c_t_source_sen); 144 | c_t_1=result.c_t_source_sen{ll,dim(2)}; 145 | else 146 | c_t_1 =result.c_t_target_sen{ll, sen_tt-1}; 147 | % otherwise, c_t_1 are lstm outputs from last time step 148 | h_t_1 =result.h_t_target_sen{ll, sen_tt-1}; 149 | end 150 | if ll==1 151 | Word_List=docbatch.target_word{sen_tt}.Word; 152 | Word_Delete=docbatch.target_word{sen_tt}.Delete; 153 | if sen_tt==1 154 | M1=result.source_sen(:,1); 155 | dim=size(result.c_t_source_sen); 156 | M2=result.c_t_source_sen(:,dim(2)); 157 | else 158 | M1=result.h_t_target_sen(:,sen_tt-1); 159 | M2=result.c_t_target_sen(:,sen_tt-1); 160 | end 161 | result.Target_sen{sen_tt}=Forward_Target_Word(M1,M2,Word_List,Word_Delete,docbatch,parameter,isTraining); 162 | % if layer number is 1, x_t sentence level embeddings based on its containing words; compute sentence level embeddings 163 | x_t=result.Target_sen{sen_tt}.h_t_target_word{parameter.layer_num,size(Word_List,2)}; 164 | else 165 | x_t=result.h_t_target_sen{ll-1,sen_tt}; 166 | % else, x_t is outputs from last layer lstm 167 | end 168 | x_t(:,docbatch.target_delete{sen_tt})=0; 169 | h_t_1(:,docbatch.target_delete{sen_tt})=0; 170 | c_t_1(:,docbatch.target_delete{sen_tt})=0; 171 | % set deleted postions to 0 value 172 | [result.lstms_target_sen{ll,sen_tt},result.h_t_target_sen{ll,sen_tt},result.c_t_target_sen{ll,sen_tt}]=lstmUnit(W,parameter,x_t,h_t_1,c_t_1,ll,sen_tt,isTraining); 173 | % lstm unit calculation 174 | end 175 | end 176 | clear target_sen_vector; 177 | clear x_t; 178 | clear h_t_1; 179 | clear c_t_1; 180 | end 181 | 182 | function[target_sen]=Forward_Target_Word(h_t_sen,c_t_sen,Word_List,Word_Delete,docbatch,parameter,isTraining) 183 | % obtain sentence level embeddings for target sentences 184 | N=size(Word_List,1); 185 | T=size(Word_List,2); 186 | target_sen.h_t_target_word=cell(parameter.layer_num,T); 187 | target_sen.c_t_target_word=cell(parameter.layer_num,T); 188 | target_sen.lstms=cell(parameter.layer_num,T); 189 | 
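% At t==1 the word-level states below are initialized from the sentence-level
% h_t_sen/c_t_sen passed in by Forward_Target, so each target word is predicted
% conditioned on the document/sentence context accumulated so far; for t>1 the
% previous word-level states are used instead.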
zeroState=zeroMatrix([parameter.hidden,N]); 190 | 191 | for ll=1:parameter.layer_num 192 | for tt=1:T 193 | target_sen.h_t_target_word{ll,tt}=zeroState; 194 | target_sen.c_t_target_word{ll,tt}=zeroState; 195 | end 196 | end 197 | for t=1:T 198 | for ll=1:parameter.layer_num 199 | W=parameter.Word_T{ll}; 200 | if t==1 201 | h_t_1=h_t_sen{ll,1}; 202 | c_t_1=c_t_sen{ll,1}; 203 | else 204 | c_t_1 =target_sen.c_t_target_word{ll, t-1}; 205 | h_t_1 =target_sen.h_t_target_word{ll, t-1}; 206 | end 207 | if ll==1 208 | x_t=parameter.vect(:,Word_List(:,t)); 209 | % if ll=1, x_t are correspondent word embeddings 210 | else 211 | x_t=target_sen.h_t_target_word{ll-1,t}; 212 | % else x_t are outputs from previous layer 213 | end 214 | x_t(:,Word_Delete{t})=0; 215 | h_t_1(:,Word_Delete{t})=0; 216 | c_t_1(:,Word_Delete{t})=0; 217 | % set deleted positions to 0 218 | [target_sen.lstms{ll, t},target_sen.h_t_target_word{ll, t},target_sen.c_t_target_word{ll, t}]=lstmUnit(W,parameter,x_t,h_t_1,c_t_1,ll,t,isTraining); 219 | % lstm unit calculations 220 | if t~=1 221 | target_sen.h_t_target_word{ll, t}(:,Word_Delete{t})=target_sen.h_t_target_word{ll,t-1}(:,Word_Delete{t}); 222 | % sentence representations stay the same for deleted positions 223 | end 224 | end 225 | end 226 | clear h_t_1; 227 | clear c_t_1; 228 | clear x_t; 229 | end 230 | -------------------------------------------------------------------------------- /hier_LSTM/ReadData.m: -------------------------------------------------------------------------------- 1 | function[batch,Stop]=ReadData(fd_s,fd_t,parameter,isTraining) 2 | tline_s = fgets(fd_s); 3 | Target={}; 4 | Stop=0; 5 | doc_index=1; 6 | sen_index=0; 7 | i=0; 8 | 9 | length_array=[]; 10 | num_of_sen=0; 11 | max_source_sen=0; 12 | while ischar(tline_s) 13 | % read source sentences and documents. Each line corresponds to one sentence. 
Documents are separated by an empty line 14 | text_s=deblank(tline_s); 15 | if length(text_s)==0 16 | sourceDoc{doc_index}=[i-sen_index+1,i]; % indexes for sentences within current document 17 | doc_index=doc_index+1; % document index 18 | if sen_index>max_source_sen 19 | max_source_sen=sen_index; 20 | end 21 | sen_index=0; 22 | if doc_index==parameter.batch_size+1 % batchsize 23 | break; 24 | end 25 | else 26 | i=i+1; 27 | sen_index=sen_index+1; 28 | num_of_sen=num_of_sen+1; 29 | if parameter.Source_Target_Same_Language~=1 30 | Source{i}=str2num(text_s)+parameter.TargetVocab; % Source sentences 31 | else 32 | Source{i}=str2num(text_s); 33 | end 34 | length_array=[length_array;length(Source{i})]; % list of sentence length 35 | end 36 | tline_s = fgets(fd_s); 37 | end 38 | if ischar(tline_s)==0 39 | Stop=1; 40 | end 41 | batch.num_of_source_sen=num_of_sen; 42 | % number of sentences in current batch 43 | source_sen_matrix=ones(length(sourceDoc),max_source_sen); 44 | % document-sentence matrix, each line corresponds indexes of sentences within one document 45 | source_sen_mask=ones(length(sourceDoc),max_source_sen); 46 | % due to different lengths for different sentences, some positions in document-sentence matrix are masked (not sentences at that positon) 47 | num_word_in_doc=[]; 48 | for i=1:length(sourceDoc) 49 | L=sourceDoc{i}(1):sourceDoc{i}(2); 50 | n_word=0; 51 | for j=sourceDoc{i}(1):sourceDoc{i}(2) 52 | n_word=n_word+length(Source{j}); 53 | end 54 | num_word_in_doc=[num_word_in_doc,n_word]; 55 | l=length(L); 56 | source_sen_matrix(i,max_source_sen-l+1:max_source_sen)=L; 57 | % sentence indexes within current document 58 | source_sen_mask(i,1:max_source_sen-l)=0; 59 | % mask postions where no sentneces resize 60 | end 61 | batch.num_word_in_doc=num_word_in_doc; 62 | source_delete={}; 63 | for j=1:size(source_sen_mask,2) 64 | source_delete{j}=find(source_sen_mask(:,j)==0); 65 | % matrix positons where no sentences resize for current time-step 66 | source_left{j}=find(source_sen_mask(:,j)==1); 67 | % matrix positons with sentences for current time-step in doc-sentence matrix 68 | end 69 | [a,b]=sort(length_array); 70 | % sort sentences by length . 
Stack sentences with similar lenght into one matrix for forward computation for time saving purposes 71 | c=ones(length(b),1); 72 | c(b)=1:length(b); 73 | source_sen_matrix=c(source_sen_matrix); 74 | % re-formulate document-sentence matrix using new sentence indexes (index ordered by sentence length) 75 | if size(source_sen_matrix,2)==1 76 | source_sen_matrix=source_sen_matrix'; 77 | end 78 | 79 | source_smallBatch={}; 80 | % store sentences of similar length in matrixes 81 | sen_size=32; 82 | num_small=ceil(num_of_sen/sen_size); 83 | for i=1:num_small 84 | Begin=sen_size*(i-1)+1; 85 | End=sen_size*i; 86 | if End>num_of_sen 87 | End=num_of_sen; 88 | end 89 | max_length=-1; 90 | for j=Begin:End 91 | if length(Source{b(j)})>max_length 92 | max_length=length(Source{b(j)}); 93 | end 94 | end 95 | N=End-Begin+1; 96 | source_smallBatch{i}.max_length=max_length; 97 | % max length 98 | source_smallBatch{i}.Word=ones(N,max_length); 99 | source_smallBatch{i}.Mask=ones(N,max_length); 100 | source_smallBatch{i}.Sen=Begin:End; 101 | for j=Begin:End 102 | source_length=length(Source{b(j)}); 103 | source_smallBatch{i}.Word(j-Begin+1,max_length-source_length+1:max_length)=Source{b(j)}; 104 | % each line corresponds words within current sentence 105 | source_smallBatch{i}.Mask(j-Begin+1,1:max_length-source_length)=0; 106 | % mask positions without any word 107 | end 108 | for j=1:max_length 109 | source_smallBatch{i}.Delete{j}=find(source_smallBatch{i}.Mask(:,j)==0); 110 | % label postions without any words 111 | source_smallBatch{i}.Left{j}=find(source_smallBatch{i}.Mask(:,j)==1); 112 | % label positions with words 113 | end 114 | end 115 | 116 | if isTraining==1 117 | End=0; 118 | doc_index=1; 119 | sen_index=0; 120 | i=0; 121 | Target={}; 122 | target_doc_index_array=[]; 123 | target_sen_index_array=[]; 124 | max_target_sen=0; 125 | length_array=[]; 126 | num_of_sen=0; 127 | tline_t = fgets(fd_t); 128 | targetDoc={}; 129 | while ischar(tline_t) 130 | % read target sentences/documents 131 | text_t=deblank(tline_t); 132 | if length(text_t)==0 133 | % end of a document 134 | Target{i}(length(Target{i}))=parameter.doc_stop; 135 | % append doc_stop indicator to the end 136 | targetDoc{doc_index}=[i-sen_index+1,i]; 137 | % sentence indexes within current document 138 | doc_index=doc_index+1; 139 | if sen_index>max_target_sen 140 | max_target_sen=sen_index; 141 | end 142 | sen_index=0; 143 | if doc_index==parameter.batch_size+1 144 | break; 145 | end 146 | else 147 | i=i+1; 148 | num_of_sen=num_of_sen+1; 149 | sen_index=sen_index+1; 150 | Target{i}=[str2num(text_t),parameter.sen_stop]; 151 | % append sentence-stop indicator to the end 152 | length_array=[length_array;length(Target{i})]; 153 | end 154 | tline_t = fgets(fd_t); 155 | end 156 | 157 | target_sen_matrix=ones(length(targetDoc),max_target_sen); 158 | % document-sentnece matrix for taget, each line correspond to sentnece indexes within current document 159 | target_sen_mask=ones(length(targetDoc),max_target_sen); 160 | % masking positions where no sentnece resize 161 | for i=1:length(targetDoc) 162 | L=targetDoc{i}(1):targetDoc{i}(2); 163 | l=length(L); 164 | target_sen_matrix(i,1:l)=L; 165 | target_sen_mask(i,l+1:max_target_sen)=0; 166 | end 167 | target_word={}; 168 | for i=1:size(target_sen_matrix,2) 169 | Max_l=1; 170 | sen_list=target_sen_matrix(:,i); 171 | mask_list=target_sen_mask(:,i); 172 | for j=1:length(sen_list) 173 | if mask_list(j)==0 174 | continue; 175 | end 176 | index=sen_list(j); 177 | Length=length(Target{index}); 178 | if 
Length>Max_l 179 | Max_l=Length; 180 | end 181 | end 182 | target_word{i}.Word=ones(length(sen_list),Max_l); 183 | % each line corresponds words within current sentence 184 | target_word{i}.Mask=ones(length(sen_list),Max_l); 185 | % mask positions where no words resize 186 | target_word{i}.End=[]; 187 | for j=1:length(sen_list) 188 | if mask_list(j)==0 189 | target_word{i}.Mask(j,:)=0; 190 | continue; 191 | end 192 | index=sen_list(j); 193 | Length=length(Target{index}); 194 | target_word{i}.Word(j,1:Length)=Target{index}; 195 | target_word{i}.Mask(j,Length+1:end)=0; 196 | end 197 | for j=1:size(target_word{i}.Mask,2) 198 | target_word{i}.Delete{j}=find(target_word{i}.Mask(:,j)==0); 199 | % label positions that have no words for current time-step 200 | target_word{i}.Left{j}=find(target_word{i}.Mask(:,j)==1); 201 | % label positions that have words for current time-step 202 | end 203 | end 204 | target_delete={}; 205 | for j=1:size(target_sen_mask,2) 206 | target_delete{j}=find(target_sen_mask(:,j)==0); 207 | end 208 | 209 | end 210 | 211 | batch.source_smallBatch=source_smallBatch; 212 | batch.max_source_sen=max_source_sen; 213 | 214 | batch.source_sen_matrix=source_sen_matrix; 215 | batch.source_sen_mask=source_sen_mask; 216 | batch.source_delete=source_delete; 217 | batch.source_left=source_left; 218 | clear source_sen_matrix; 219 | clear source_sen_mask; 220 | clear source_smallBatch; 221 | clear source_delete; 222 | clear source_left; 223 | 224 | if isTraining==1 225 | batch.target_sen_matrix=target_sen_matrix; 226 | batch.target_sen_mask=target_sen_mask; 227 | batch.target_delete=target_delete; 228 | batch.target_word=target_word; 229 | batch.max_target_sen=max_target_sen; 230 | 231 | clear target_sen_matrix; 232 | clear target_sen_mask; 233 | clear target_doc_index_array; 234 | clear target_delete; 235 | clear target_word; 236 | end 237 | end 238 | -------------------------------------------------------------------------------- /hier_LSTM/decode_greedy.m: -------------------------------------------------------------------------------- 1 | function[Translations]=decode_greedy(parameter,batch,filename) 2 | T=size(batch.source_sen_matrix,2); 3 | % total number of time-steps 4 | N=size(batch.source_sen_matrix,1); 5 | % number of sentences 6 | L=length(batch.source_smallBatch); 7 | Max_Length=1.5*max(batch.num_word_in_doc); 8 | 9 | result=Forward(batch,parameter,0); 10 | % obtain embeddings for inputs 11 | first_words=BeamStep(parameter,result.source_sen{parameter.layer_num,1},1); 12 | % decode first words 13 | beamHistory=oneMatrix([N,Max_Length]); 14 | % store in history 15 | beamHistory(:,1)=first_words(:); 16 | 17 | for ll=1:parameter.layer_num 18 | beamSenStates_h{ll}=result.source_sen{ll,1}; 19 | beamWordStates_h{ll}=result.source_sen{ll,1}; 20 | beamSenStates_c{ll}=result.c_t_source_sen{ll,T}; 21 | beamWordStates_c{ll}=result.c_t_source_sen{ll,T}; 22 | end 23 | % update current lstm states 24 | for position=1:Max_Length 25 | words=beamHistory(:,position); 26 | % get earliest predicted words 27 | for ll=1:parameter.layer_num 28 | if ll==1 29 | x_t=parameter.vect(:,words); 30 | else 31 | x_t=beamWordStates_h{ll-1}; 32 | end 33 | h_t_1=beamWordStates_h{ll}; 34 | c_t_1=beamWordStates_c{ll}; 35 | [A,beamWordStates_h{ll},beamWordStates_c{ll}]=lstmUnit(parameter.Word_T{ll},parameter,x_t,h_t_1,c_t_1,ll,-1,0); 36 | % lstm unit calculation, get lsmt status for current time step at word level 37 | end 38 | end_sen_index=find(words==parameter.sen_stop); 39 | % if predicted words incode 
sentence-ending indicators 40 | if length(end_sen_index)~=0 41 | for ll=1:parameter.layer_num 42 | if ll==1 43 | x_t=beamWordStates_h{ll}(:,end_sen_index); 44 | else 45 | x_t=beamSenStates_h{ll-1}(:,end_sen_index); 46 | end 47 | h_t_1=beamSenStates_h{ll}(:,end_sen_index); 48 | c_t_1=beamSenStates_c{ll}(:,end_sen_index); 49 | [A,beamSenStates_h{ll}(:,end_sen_index),beamSenStates_c{ll}(:,end_sen_index)]=lstmUnit(parameter.Sen_T{ll},parameter,x_t,h_t_1,c_t_1,ll,-1,0); 50 | % update lsmt status for current time step at sentence level 51 | beamWordStates_h{ll}(:,end_sen_index)=beamSenStates_h{ll}(:,end_sen_index); 52 | beamWordStates_c{ll}(:,end_sen_index)=beamSenStates_c{ll}(:,end_sen_index); 53 | end 54 | end 55 | all_next_words=BeamStep(parameter,beamWordStates_h{parameter.layer_num},0); 56 | % predict next word 57 | beamHistory(:,position+1)=all_next_words; 58 | end 59 | for senId=1:N 60 | vector=beamHistory(senId,:); 61 | vector=vector(1:floor(1.5*batch.num_word_in_doc(senId))); 62 | % maximium length of 1.5 * length of orginal inputs 63 | stop_sign=find(vector==parameter.doc_stop); 64 | if length(stop_sign)==0 65 | vector=vector; 66 | else 67 | vector=vector(1:stop_sign-1); 68 | end 69 | M=find(vector==parameter.sen_stop); 70 | vector(M)=[]; 71 | % remove sentence-ending token 72 | dlmwrite(filename,vector,'delimiter',' ','-append'); 73 | end 74 | end 75 | 76 | function[select_words]=BeamStep(parameter,h_t,isFirst) 77 | if isFirst==1 scores=parameter.soft_W(1:end-2,:)*h_t; 78 | else scores=parameter.soft_W*h_t; 79 | end 80 | % could not predict sentence-ending token or document-ending token for first word 81 | mx = max(scores); 82 | scores = bsxfun(@minus, scores, mx); 83 | logP=bsxfun(@minus, scores, log(sum(exp(scores)))); 84 | [sortedLogP, sortedWords]=sort(logP, 'descend'); 85 | select_words=sortedWords(1, :); 86 | 87 | probs_=exp(scores); 88 | norms = sum(probs_, 1); 89 | probs=bsxfun(@rdivide, probs_, norms); 90 | end 91 | -------------------------------------------------------------------------------- /hier_LSTM/hier_LSTM.m: -------------------------------------------------------------------------------- 1 | function[]=hier_LSTM() 2 | clear; 3 | addpath('../misc'); 4 | n= gpuDeviceCount; 5 | parameter.isGPU = 0; 6 | 7 | if n>0 % GPU exists 8 | parameter.isGPU = 1; 9 | gpuDevice(1); 10 | else 11 | print('no gpu ! ! ! ! !'); 12 | end 13 | 14 | parameter.dimension=1000; 15 | parameter.alpha=0.1; %learning rate 16 | parameter.layer_num=4; %number of layer 17 | parameter.hidden=1000; 18 | parameter.lstm_out_tanh=0; 19 | parameter.Initial=0.1; 20 | parameter.dropout=0.2; %drop-out rate 21 | params.lstm_out_tanh=0; 22 | parameter.isTraining=1; 23 | parameter.CheckGrad=0; %whether check gradient or not. 24 | parameter.PreTrainEmb=0; 25 | %whether using pre-trained embeddings 26 | parameter.update_embedding=1; 27 | %whether update word embeddings 28 | parameter.batch_size=32; 29 | parameter.Source_Target_Same_Language=1; 30 | %whether source and target is of the same language. For author-encoder task, it is. 
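% --------------------------------------------------------------------------
% Editor's note (illustrative sketch, not part of the original source): the
% dropout rate set above is consumed inside hier_LSTM/lstmUnit.m (listed
% later) as a random keep-mask applied to the layer input at training time,
% roughly:
%
%   drop_left = randSimpleMatrix(size(x_t)) < 1 - parameter.dropout;  % keep each unit with prob 1-dropout
%   x_t       = x_t .* drop_left;                                     % zero out the dropped units
%
% so with dropout = 0.2 about 80% of the input units survive at each step.
% --------------------------------------------------------------------------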
31 | parameter.maxGradNorm=1; 32 | parameter.clip=0; 33 | 34 | parameter.lr=5; 35 | parameter.read=0; 36 | 37 | if parameter.Source_Target_Same_Language==1 38 | parameter.Vocab=25002; 39 | %vocabulary size plus sentence-end and document-end 40 | parameter.sen_stop=parameter.Vocab-1; %sentence-end 41 | parameter.doc_stop=parameter.Vocab; %document-end 42 | else 43 | parameter.SourceVocab=20; 44 | parameter.TargetVocab=20; 45 | parameter.stop=parameter.TargetVocab; 46 | parameter.Vocab=parameter.SourceVocab+parameter.TargetVocab; 47 | end 48 | 49 | if parameter.CheckGrad==1¶meter.dropout~=0 50 | parameter.drop_left=randSimpleMatrix([parameter.hidden,1])<1-parameter.dropout; 51 | end 52 | %alpha: learning rate for minibatch 53 | 54 | parameter.nonlinear_gate_f = @sigmoid; 55 | parameter.nonlinear_gate_f_prime = @sigmoidPrime; 56 | parameter.nonlinear_f = @tanh; 57 | parameter.nonlinear_f_prime = @tanhPrime; 58 | 59 | 60 | train_source_file='../data/train_source_permute_segment.txt'; 61 | train_target_file='../data/train_target_permute_segment.txt'; 62 | if 1==0 63 | train_source_file='../../LSTM_Encode_Decode/data/train_source_permute_segment.txt'; 64 | train_target_file='../../LSTM_Encode_Decode/data/train_target_permute_segment.txt'; 65 | end 66 | 67 | if parameter.read==1 68 | disp('read'); 69 | parameter=ReadParameter(parameter); 70 | else [parameter]=Initial(parameter); 71 | end 72 | 73 | iter=0; 74 | 75 | disp('begin') 76 | 77 | while 1 78 | iter=iter+1 79 | End=0; 80 | fd_train_source=fopen(train_source_file); 81 | fd_train_target=fopen(train_target_file); 82 | sum_cost=0; 83 | sum_num=0; 84 | batch_n=0; 85 | while 1 86 | batch_n=batch_n+1; 87 | batch_n 88 | [current_batch,End]=ReadData(fd_train_source,fd_train_target,parameter,1); 89 | % read one batch 90 | if End~=1 || (End==1&& length(current_batch.source_smallBatch)~=0) 91 | result=Forward(current_batch,parameter,1); % Forward 92 | [batch_cost,grad]=softmax(result,current_batch,parameter); % SoftMax 93 | if (isnan(batch_cost)||isinf(batch_cost)) &&End~=1 94 | % if batch_cost is nan, load ealier parameters and skip latest batches 95 | if parameter.clip==1 96 | fprintf('die !! 
Hopeless!!\n'); 97 | disp('read done'); 98 | parameter=pre_parameter; 99 | disp(batch_n) 100 | else parameter.clip=1; 101 | end 102 | if End==1 break; 103 | else continue; 104 | end 105 | end 106 | if parameter.isTraining==1 107 | grad=Backward(current_batch,result,grad,parameter); 108 | % backward propagation 109 | clear result; 110 | if 1==0 111 | check(grad,current_batch,parameter); 112 | % check gradient 113 | end 114 | [parameter]=update_parameter(parameter,grad); 115 | % update parameter 116 | clear lstm; 117 | clear c; 118 | end 119 | end 120 | if End==1 121 | fclose(fd_train_source); 122 | fclose(fd_train_target); 123 | break; 124 | end 125 | if mod(batch_n,100)==0 126 | pre_parameter=parameter; 127 | % store parameters from last 100 batches in case of gradient explosion 128 | end 129 | 130 | end 131 | SaveParameter(parameter,iter); 132 | % save parameters 133 | end 134 | end 135 | 136 | function[]=check(grad,current_batch,parameter) 137 | % check gradients 138 | if 1==0 139 | check_vect(grad.vect(1,1),1,1,current_batch,parameter); 140 | check_vect(grad.vect(1,2),1,2,current_batch,parameter); 141 | check_vect(grad.vect(1,3),1,3,current_batch,parameter); 142 | check_vect(grad.vect(1,4),1,4,current_batch,parameter); 143 | check_vect(grad.vect(1,5),1,5,current_batch,parameter); 144 | end 145 | if 1==1 146 | disp('word_T') 147 | check_word_T(grad.Word_T{1}(1,1),1,1,1,current_batch,parameter); 148 | check_word_T(grad.Word_T{2}(1,1),2,1,1,current_batch,parameter); 149 | check_word_T(grad.Word_T{3}(1,1),3,1,1,current_batch,parameter); 150 | check_word_T(grad.Word_T{4}(1,1),4,1,1,current_batch,parameter); 151 | end 152 | if 1==1 153 | disp('sen_T') 154 | check_sen_T(grad.Sen_T{1}(1,1),1,1,1,current_batch,parameter); 155 | check_sen_T(grad.Sen_T{2}(1,1),2,1,1,current_batch,parameter); 156 | check_sen_T(grad.Sen_T{3}(1,1),3,1,1,current_batch,parameter); 157 | check_sen_T(grad.Sen_T{4}(1,1),4,1,1,current_batch,parameter); 158 | end 159 | if 1==1 160 | disp('sen_S') 161 | check_sen_S(grad.Sen_S{1}(1,1),1,1,1,current_batch,parameter); 162 | check_sen_S(grad.Sen_S{1}(1,2),1,1,2,current_batch,parameter); 163 | check_sen_S(grad.Sen_S{1}(1,7),1,1,7,current_batch,parameter); 164 | check_sen_S(grad.Sen_S{1}(1,8),1,1,8,current_batch,parameter); 165 | end 166 | if 1==1 167 | check_word_S(grad.Word_S{1}(1,1),1,1,1,current_batch,parameter); 168 | check_word_S(grad.Word_S{1}(1,2),1,1,2,current_batch,parameter); 169 | check_word_S(grad.Word_S{2}(1,1),2,1,1,current_batch,parameter); 170 | check_word_S(grad.Word_S{2}(1,2),2,1,2,current_batch,parameter); 171 | check_word_S(grad.Word_S{3}(1,1),3,1,1,current_batch,parameter); 172 | check_word_S(grad.Word_S{3}(1,2),3,1,2,current_batch,parameter); 173 | end 174 | end 175 | 176 | function[parameter]=ReadParameter(parameter) 177 | for ll=1:parameter.layer_num 178 | W_file=strcat('save_parameter/_Word_S',num2str(ll)); 179 | parameter.Word_S{ll}=gpuArray(load(W_file)); 180 | W_file=strcat('save_parameter/_Word_T',num2str(ll)); 181 | parameter.Word_T{ll}=gpuArray(load(W_file)); 182 | W_file=strcat('save_parameter/_Sen_S',num2str(ll)); 183 | parameter.Sen_S{ll}=gpuArray(load(W_file)); 184 | W_file=strcat('save_parameter/_Sen_T',num2str(ll)); 185 | parameter.Sen_T{ll}=gpuArray(load(W_file)); 186 | end 187 | parameter.vect=gpuArray(load('save_parameter/_v')); 188 | parameter.soft_W=gpuArray(load('save_parameter/_soft_W')); 189 | end 190 | 191 | function SaveParameter(parameter,iter) 192 | % save parameters 193 | if iter~=-1 194 | 
all=strcat('save_parameter/',int2str(iter)); 195 | all=strcat(all,'_'); 196 | all=strcat(all,num2str(parameter.alpha)); 197 | else 198 | all='save_parameter/'; 199 | end 200 | for ll=1:parameter.layer_num 201 | W_file=strcat(all,'_Word_T'); 202 | W_file=strcat(W_file,num2str(ll)); 203 | dlmwrite(W_file,parameter.Word_T{ll}); 204 | end 205 | for ll=1:parameter.layer_num 206 | W_file=strcat(all,'_Word_S'); 207 | W_file=strcat(W_file,num2str(ll)); 208 | dlmwrite(W_file,parameter.Word_S{ll}); 209 | end 210 | for ll=1:parameter.layer_num 211 | W_file=strcat(all,'_Sen_T'); 212 | W_file=strcat(W_file,num2str(ll)); 213 | dlmwrite(W_file,parameter.Sen_T{ll}); 214 | end 215 | for ll=1:parameter.layer_num 216 | W_file=strcat(all,'_Sen_S'); 217 | W_file=strcat(W_file,num2str(ll)); 218 | dlmwrite(W_file,parameter.Sen_S{ll}); 219 | end 220 | 221 | v_file=strcat(all,'_v'); 222 | dlmwrite(v_file,parameter.vect); 223 | soft_W_file=strcat(all,'_soft_W'); 224 | dlmwrite(soft_W_file,parameter.soft_W); 225 | end 226 | 227 | function[parameter]=update_parameter(parameter,grad) 228 | % update parameters 229 | norm=computeGradNorm(grad,parameter); 230 | if norm>parameter.maxGradNorm 231 | lr=parameter.alpha*parameter.maxGradNorm/norm; 232 | else lr=parameter.alpha; 233 | end 234 | for ll=1:parameter.layer_num 235 | parameter.Word_S{ll}=parameter.Word_S{ll}-lr*grad.Word_S{ll}; 236 | parameter.Word_T{ll}=parameter.Word_T{ll}-lr*grad.Word_T{ll}; 237 | parameter.Sen_S{ll}=parameter.Sen_S{ll}-lr*grad.Sen_S{ll}; 238 | parameter.Sen_T{ll}=parameter.Sen_T{ll}-lr*grad.Sen_T{ll}; 239 | end 240 | parameter.soft_W=parameter.soft_W-lr*grad.soft_W; 241 | parameter.vect=parameter.vect-lr*grad.vect; 242 | clear grad; 243 | end 244 | 245 | function[parameter]=Initial(parameter) 246 | %random initialization 247 | m=parameter.Initial; 248 | for i=1:parameter.layer_num 249 | if i==1 250 | parameter.Word_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 251 | parameter.Word_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 252 | else 253 | parameter.Word_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 254 | parameter.Word_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 255 | end 256 | parameter.Sen_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 257 | parameter.Sen_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 258 | end 259 | parameter.vect=randomMatrix(parameter.Initial,[parameter.dimension,parameter.Vocab]); 260 | if parameter.Source_Target_Same_Language==1 261 | parameter.soft_W=randomMatrix(parameter.Initial,[parameter.Vocab,parameter.hidden]); 262 | else 263 | parameter.soft_W=randomMatrix(parameter.Initial,[parameter.TargetVocab,parameter.hidden]); 264 | end 265 | end 266 | 267 | function[Batches]=GetTestBatch(Source,batch_size,parameter) 268 | N_batch=ceil(length(Source)/batch_size); 269 | Batches={}; 270 | for i=1:N_batch 271 | Begin=batch_size*(i-1)+1; 272 | End=batch_size*i; 273 | if End>length(Source) 274 | End=length(Source); 275 | end 276 | current_batch=Batch(); 277 | for j=Begin:End 278 | source_length=length(Source{j}); 279 | current_batch.SourceLength=[current_batch.SourceLength,source_length]; 280 | if source_length>current_batch.MaxLenSource 281 | current_batch.MaxLenSource=source_length; 282 | end 283 | end 284 | 
current_batch.Word=ones(End-Begin+1,current_batch.MaxLenSource); 285 | Mask=ones(End-Begin+1,current_batch.MaxLenSource); 286 | for j=Begin:End 287 | source_length=length(Source{j}); 288 | current_batch.Word(j-Begin+1,current_batch.MaxLenSource-source_length+1:current_batch.MaxLenSource)=Source{j}; 289 | Mask(j-Begin+1,1:current_batch.MaxLenSource-source_length)=0; 290 | end 291 | for j=1:current_batch.MaxLenSource 292 | current_batch.Delete{j}=find(Mask(:,j)==0); 293 | current_batch.Left{j}=find(Mask(:,j)==1); 294 | end 295 | current_batch.Mask=Mask; 296 | Batches{i}=current_batch; 297 | end 298 | end 299 | 300 | function[Source]=ReadTestData(source_file,parameter) 301 | fd_s=fopen(source_file); 302 | tline_s = fgets(fd_s); 303 | i=0; 304 | Source={}; 305 | while ischar(tline_s) 306 | i=i+1; 307 | text_s=deblank(tline_s); 308 | if parameter.Source_Target_Same_Language~=1 309 | Source{i}=wrev(str2num(text_s))+parameter.TargetVocab; 310 | else 311 | Source{i}=wrev(str2num(text_s)); 312 | end 313 | tline_s = fgets(fd_s); 314 | end 315 | end 316 | 317 | 318 | 319 | function check_vect(value1,i,j,current_batch,parameter) 320 | e=0.001; 321 | parameter.vect(i,j)=parameter.vect(i,j)+e; 322 | result=Forward(current_batch,parameter,1); 323 | [cost1,grad]=softmax(result,current_batch,parameter); 324 | parameter.vect(i,j)=parameter.vect(i,j)-2*e; 325 | result=Forward(current_batch,parameter,1); 326 | [cost2,grad]=softmax(result,current_batch,parameter); 327 | parameter.vect(i,j)=parameter.vect(i,j)+e; 328 | value2=(cost1-cost2)/(2*e); 329 | [value1,value2] 330 | value1-value2 331 | end 332 | 333 | function check_soft_W(value1,i,j,current_batch,parameter) 334 | e=0.001; 335 | parameter.soft_W(i,j)=parameter.soft_W(i,j)+e; 336 | result=Forward(current_batch,parameter,1); 337 | [cost1,grad]=softmax(result,current_batch,parameter); 338 | parameter.soft_W(i,j)=parameter.soft_W(i,j)-2*e; 339 | result=Forward(current_batch,parameter,1); 340 | [cost2,grad]=softmax(result,current_batch,parameter); 341 | parameter.soft_W(i,j)=parameter.soft_W(i,j)+e; 342 | value2=(cost1-cost2)/(2*e); 343 | value1 344 | value2 345 | value1-value2 346 | end 347 | 348 | 349 | function check_word_T(value1,ll,i,j,current_batch,parameter) 350 | e=1e-3; 351 | parameter.Word_T{ll}(i,j)=parameter.Word_T{ll}(i,j)+e; 352 | result=Forward(current_batch,parameter,1); 353 | [cost1,grad]=softmax(result,current_batch,parameter); 354 | parameter.Word_T{ll}(i,j)=parameter.Word_T{ll}(i,j)-2*e; 355 | result=Forward(current_batch,parameter,1); 356 | [cost2,grad]=softmax(result,current_batch,parameter); 357 | parameter.Word_T{ll}(i,j)=parameter.Word_T{ll}(i,j)+e; 358 | value2=(cost1-cost2)/(2*e); 359 | value1 360 | value2 361 | value1-value2 362 | end 363 | 364 | function check_word_S(value1,ll,i,j,current_batch,parameter) 365 | e=0.001; 366 | parameter.Word_S{ll}(i,j)=parameter.Word_S{ll}(i,j)+e; 367 | result=Forward(current_batch,parameter,1); 368 | [cost1,grad]=softmax(result,current_batch,parameter); 369 | parameter.Word_S{ll}(i,j)=parameter.Word_S{ll}(i,j)-2*e; 370 | result=Forward(current_batch,parameter,1); 371 | [cost2,grad]=softmax(result,current_batch,parameter); 372 | parameter.Word_S{ll}(i,j)=parameter.Word_S{ll}(i,j)+e; 373 | value2=(cost1-cost2)/(2*e); 374 | [value1,value2] 375 | value1-value2 376 | end 377 | 378 | 379 | function check_sen_S(value1,ll,i,j,current_batch,parameter) 380 | e=0.001; 381 | parameter.Sen_S{ll}(i,j)=parameter.Sen_S{ll}(i,j)+e; 382 | result=Forward(current_batch,parameter,1); 383 | 
[cost1,grad]=softmax(result,current_batch,parameter); 384 | parameter.Sen_S{ll}(i,j)=parameter.Sen_S{ll}(i,j)-2*e; 385 | result=Forward(current_batch,parameter,1); 386 | [cost2,grad]=softmax(result,current_batch,parameter); 387 | parameter.Sen_S{ll}(i,j)=parameter.Sen_S{ll}(i,j)+e; 388 | value2=(cost1-cost2)/(2*e); 389 | [value1,value2] 390 | value1-value2 391 | end 392 | 393 | function check_sen_T(value1,ll,i,j,current_batch,parameter) 394 | e=0.001; 395 | parameter.Sen_T{ll}(i,j)=parameter.Sen_T{ll}(i,j)+e; 396 | result=Forward(current_batch,parameter,1); 397 | [cost1,grad]=softmax(result,current_batch,parameter); 398 | parameter.Sen_T{ll}(i,j)=parameter.Sen_T{ll}(i,j)-2*e; 399 | result=Forward(current_batch,parameter,1); 400 | [cost2,grad]=softmax(result,current_batch,parameter); 401 | parameter.Sen_T{ll}(i,j)=parameter.Sen_T{ll}(i,j)+e; 402 | value2=(cost1-cost2)/(2*e); 403 | value1 404 | value2 405 | value1-value2 406 | end 407 | 408 | 409 | 410 | function[norm]=computeGradNorm(grad,parameter) 411 | % compute gradient norms 412 | norm=0; 413 | for ii=1:parameter.layer_num 414 | norm=norm+double(sum(grad.Word_S{ii}(:).^2)); 415 | norm=norm+double(sum(grad.Word_T{ii}(:).^2)); 416 | norm=norm+double(sum(grad.Sen_S{ii}(:).^2)); 417 | norm=norm+double(sum(grad.Sen_T{ii}(:).^2)); 418 | end 419 | norm=sqrt(norm); 420 | end 421 | -------------------------------------------------------------------------------- /hier_LSTM/lstmUnit.m: -------------------------------------------------------------------------------- 1 | function[lstm, h_t, c_t]=lstmUnit(W,parameter,x_t,h_t_1, c_t_1, ll, t,isTraining) 2 | if parameter.dropout~=0&&isTraining==1 3 | if parameter.CheckGrad==1 4 | drop_left=repmat(parameter.drop_left,1,size(x_t,2)); 5 | else 6 | drop_left=randSimpleMatrix(size(x_t))<1-parameter.dropout; 7 | end 8 | x_t=x_t.*drop_left; 9 | end 10 | input=[x_t; h_t_1]; 11 | ifoa_linear = W*input; 12 | ifo_gate=parameter.nonlinear_gate_f(ifoa_linear(1:3*parameter.hidden,:)); 13 | i_gate = ifo_gate(1:parameter.hidden, :); 14 | f_gate = ifo_gate(parameter.hidden+1:2*parameter.hidden,:); 15 | o_gate =ifo_gate(parameter.hidden*2+1:3*parameter.hidden,:); 16 | a_signal = parameter.nonlinear_f(ifoa_linear(3*parameter.hidden+1:4*parameter.hidden,:)); 17 | c_t=f_gate.*c_t_1 + i_gate.*a_signal; 18 | if parameter.lstm_out_tanh 19 | f_c_t = parameter.nonlinear_f(c_t); 20 | h_t = o_gate.*f_c_t; 21 | else 22 | h_t = o_gate.*c_t; 23 | end 24 | %c_t(c_t>parameter.clipForward)=parameter.clipForward; 25 | %c_t(c_t<-parameter.clipForward)=-parameter.clipForward; 26 | %h_t(h_t>parameter.clipForward)=parameter.clipForward; 27 | %h_t(h_t<-parameter.clipForward)=-parameter.clipForward; 28 | 29 | lstm.input = input; 30 | lstm.i_gate = i_gate; 31 | lstm.f_gate = f_gate; 32 | lstm.o_gate = o_gate; 33 | lstm.a_signal = a_signal; 34 | if parameter.lstm_out_tanh 35 | lstm.f_c_t = f_c_t; 36 | else 37 | lstm.c_t = c_t; 38 | end 39 | if isTraining==1&¶meter.dropout~=0 40 | lstm.drop_left=drop_left; 41 | end 42 | end 43 | 44 | function [clippedValue] = clipForward(x) 45 | if x>50 clippedValue = single(50); 46 | elseif x<-50 clippedValue = single(-50); 47 | else clippedValue =single(x); 48 | end 49 | end 50 | 51 | -------------------------------------------------------------------------------- /hier_LSTM/softmax.m: -------------------------------------------------------------------------------- 1 | function[total_cost,grad]=softmax(result,docbatch,parameter) 2 | total_cost=0; 3 | grad.soft_W=zeroMatrix(size(parameter.soft_W)); 4 | 
step_size=1; 5 | num_word=0; 6 | N=size(docbatch.target_sen_matrix,1); 7 | zeroState=zeroMatrix([parameter.hidden,N]); 8 | for ll=1:parameter.layer_num 9 | grad.source_h{ll,1}=zeroState; 10 | grad.source_c{ll,1}=zeroState; 11 | for sen_tt=1:length(result.Target_sen)-1 12 | grad.target_sen_h{ll,sen_tt}=zeroState; 13 | grad.target_sen_c{ll,sen_tt}=zeroState; 14 | end 15 | end 16 | for sen_tt=1:length(result.Target_sen) 17 | Word_List=docbatch.target_word{sen_tt}.Word; 18 | Word_Mask=docbatch.target_word{sen_tt}.Mask; 19 | num_word=num_word+length(find(Word_Mask==1)); 20 | N=size(Word_List,1); 21 | T=size(Word_List,2); 22 | target_sen=result.Target_sen{sen_tt}; 23 | N_examples=size(Word_List,1)*size(Word_List,2); 24 | predict_Words=reshape(Word_List,1,N_examples); 25 | mask=reshape(Word_Mask,1,N_examples); 26 | h_t=[]; 27 | if sen_tt==1 28 | h_t=[h_t,result.source_sen{parameter.layer_num,1}]; 29 | else 30 | dim=size(result.h_t_target_sen); 31 | h_t=[h_t,result.h_t_target_sen{dim(1),sen_tt-1}]; 32 | end 33 | dim=size(target_sen.h_t_target_word); 34 | h_t=[h_t,[target_sen.h_t_target_word{parameter.layer_num,1:dim(2)-1}]]; 35 | [cost,grad_softmax_h]=batchSoftmax(h_t,mask,predict_Words,parameter); 36 | total_cost=total_cost+cost; 37 | grad.soft_W=grad.soft_W+grad_softmax_h.soft_W; 38 | if sen_tt==1 39 | grad.source_h{parameter.layer_num,1}=grad_softmax_h.h(:,1:N); 40 | else 41 | grad.target_sen_h{parameter.layer_num,sen_tt-1}=grad_softmax_h.h(:,1:N); 42 | end 43 | for i=1:T-1 44 | grad.ht{sen_tt}{1,i}=grad_softmax_h.h(:,N*i+1:N*(i+1)); 45 | end 46 | end 47 | total_cost=total_cost/N; 48 | grad.soft_W=grad.soft_W/N; 49 | clear predict_Words; clear mask; 50 | clear grad_softmax_h; 51 | end 52 | 53 | function[cost,softmax_grad]=batchSoftmax(h_t,mask,predict_Words,parameter) 54 | unmaskedIds=find(mask==1); 55 | scores=parameter.soft_W*h_t; 56 | mx = max(scores,[],1); 57 | scores=bsxfun(@minus,scores,mx); 58 | scores=exp(scores); 59 | norms = sum(scores, 1); 60 | if length(find(mask==0))==0 61 | scores=bsxfun(@rdivide, scores, norms); 62 | else 63 | scores=bsxfun(@times,scores, mask./norms); 64 | end 65 | scoreIndices = sub2ind(size(scores),predict_Words(unmaskedIds),unmaskedIds); 66 | cost=sum(-log(scores(scoreIndices))); 67 | scores(scoreIndices) =scores(scoreIndices) - 1; 68 | softmax_grad.soft_W=scores*h_t'; %(N_word*examples)*(examples*diemsnion)=N_word*diemsnion; 69 | softmax_grad.h=(scores'*parameter.soft_W)';%(diemsnion*N_word)*(N_word*examples)=dimension*examples 70 | clear scores; 71 | clear norms; 72 | end 73 | -------------------------------------------------------------------------------- /hier_LSTM/test.m: -------------------------------------------------------------------------------- 1 | function[]=test() 2 | clear; 3 | 4 | addpath('../misc'); 5 | n= gpuDeviceCount; 6 | parameter.isGPU = 0; 7 | 8 | if n>0 % GPU exists 9 | parameter.isGPU = 1; 10 | gpuDevice(2); 11 | else 12 | print('no gpu ! ! ! ! !'); 13 | end 14 | 15 | parameter.dimension=100; 16 | parameter.alpha=0.1; %learning rate 17 | parameter.layer_num=4; %number of layer 18 | parameter.hidden=100; 19 | parameter.lstm_out_tanh=0; 20 | parameter.Initial=0.1; 21 | parameter.dropout=0.2; %drop-out rate 22 | params.lstm_out_tanh=0; 23 | parameter.isTraining=1; 24 | parameter.CheckGrad=0; %whether check gradient or not. 
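% --------------------------------------------------------------------------
% Editor's note (illustrative sketch, not part of the original source): the
% CheckGrad flag below refers to the finite-difference checks implemented by
% the check_* helpers in hier_LSTM/hier_LSTM.m. For a single weight w the
% pattern is a central difference compared against the analytic gradient:
%
%   e = 1e-3;
%   w = w + e;   result = Forward(batch, parameter, 1); [cost1, ~] = softmax(result, batch, parameter);
%   w = w - 2*e; result = Forward(batch, parameter, 1); [cost2, ~] = softmax(result, batch, parameter);
%   w = w + e;                               % restore the weight
%   numeric = (cost1 - cost2) / (2 * e);     % should closely match the analytic gradient entry
% --------------------------------------------------------------------------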
25 | parameter.PreTrainEmb=0; 26 | %whether using pre-trained embeddings 27 | parameter.update_embedding=1; 28 | %whether update word embeddings 29 | parameter.batch_size=16; 30 | parameter.Source_Target_Same_Language=1; 31 | %whether source and target is of the same language. 32 | parameter.maxGradNorm=1; 33 | parameter.clip=0; 34 | 35 | parameter.lr=5; 36 | parameter.read=0; 37 | 38 | if parameter.Source_Target_Same_Language==1 39 | parameter.Vocab=25002; 40 | %vocabulary size plus sentence-end and document-end 41 | parameter.sen_stop=parameter.Vocab-1; %sentence-end 42 | parameter.doc_stop=parameter.Vocab; %document-end 43 | else 44 | parameter.SourceVocab=20; 45 | parameter.TargetVocab=20; 46 | parameter.stop=parameter.TargetVocab; 47 | parameter.Vocab=parameter.SourceVocab+parameter.TargetVocab; 48 | end 49 | 50 | if parameter.CheckGrad==1¶meter.dropout~=0 51 | parameter.drop_left=randSimpleMatrix([parameter.hidden,1])<1-parameter.dropout; 52 | end 53 | %alpha: learning rate for minibatch 54 | 55 | parameter.nonlinear_gate_f = @sigmoid; 56 | parameter.nonlinear_gate_f_prime = @sigmoidPrime; 57 | parameter.nonlinear_f = @tanh; 58 | parameter.nonlinear_f_prime = @tanhPrime; 59 | [parameter]=ReadParameter(parameter); 60 | 61 | test_source_file='../data/test_segment.txt'; 62 | 63 | while 1 64 | fd_test_source=fopen(test_source_file); 65 | [current_batch,End]=ReadData(fd_test_source,[],parameter,0); 66 | if End~=1 || (End==1&& length(current_batch.source_smallBatch)~=0) 67 | decode_greedy(parameter,current_batch,'a.txt'); 68 | end 69 | if End==1 70 | break; 71 | end 72 | end 73 | end 74 | 75 | function[parameter]=ReadParameter(parameter) 76 | for ll=1:parameter.layer_num 77 | W_file=strcat('save_parameter/_W_S',num2str(ll)); 78 | parameter.W_S{ll}=gpuArray(load(W_file)); 79 | W_file=strcat('save_parameter/_W_T',num2str(ll)); 80 | parameter.W_T{ll}=gpuArray(load(W_file)); 81 | end 82 | parameter.vect=gpuArray(load('save_parameter/_v')); 83 | parameter.soft_W=gpuArray(load('save_parameter/_soft_W')); 84 | end 85 | 86 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/.hier_LSTM_Att.m.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiweil/Hierarchical-Neural-Autoencoder/2cea5c0b687e6d3dfb20dee4bec97aec6b57a95f/hier_LSTM_Attention/.hier_LSTM_Att.m.swp -------------------------------------------------------------------------------- /hier_LSTM_Attention/Attention.m: -------------------------------------------------------------------------------- 1 | function[result]=Attention(docbatch,result,h_t,parameter,sen_tt) 2 | % attention models, linking current target sentences to all input source sentences 3 | N=size(docbatch.source_sen_matrix,1); 4 | T=size(docbatch.source_sen_matrix,2); 5 | result.attention_input{sen_tt}=[result.source_each_sen;repmat(h_t,1,T)]; 6 | % concatenate source sentence embeddings with target 7 | result.attention_vector{sen_tt}=tanh(parameter.Attention_W*result.attention_input{sen_tt}); 8 | % compositions 9 | scores=parameter.Attention_U'*result.attention_vector{sen_tt}; 10 | % linking scors 11 | Matrix=reshape(scores,N,T); 12 | Matrix=exp(Matrix); 13 | Matrix=Matrix.*docbatch.source_sen_mask; 14 | norms=sum(Matrix,2); 15 | result.scores{sen_tt}=bsxfun(@rdivide,Matrix, norms); 16 | % normalization 17 | result.dA{sen_tt}=gpuArray(); 18 | for i=1:N 19 | 
result.dA{sen_tt}=[result.dA{sen_tt};diag(result.scores{sen_tt}(i,:))-result.scores{sen_tt}(i,:)'*result.scores{sen_tt}(i,:)]; 20 | % calculate derivatives for later backward propagation usage 21 | end 22 | scores=reshape(result.scores{sen_tt},1,N*T); 23 | 24 | M=bsxfun(@times,result.source_each_sen,scores); 25 | % multiply weights with source embeddings 26 | result.sum_vector{sen_tt}=zeroMatrix([parameter.hidden,N]); 27 | for i=1:T 28 | result.sum_vector{sen_tt}=result.sum_vector{sen_tt}+M(:,(i-1)*N+1:i*N); 29 | % weighted sum 30 | end 31 | clear Matrix; 32 | clear norms; 33 | clear scores; 34 | clear M; 35 | end 36 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/Attention_Decode.m: -------------------------------------------------------------------------------- 1 | function[sum_vector]=Attention_Decode(docbatch,result,h_t,parameter,Index) 2 | 3 | N=length(Index); 4 | T=size(docbatch.source_sen_matrix,2); 5 | source_each_sen=[]; 6 | for i=1:T 7 | index=N*(T-1)+Index; 8 | source_each_sen=[source_each_sen,result.source_each_sen(:,index)]; 9 | end 10 | 11 | attention_input=[source_each_sen;repmat(h_t,1,T)]; 12 | attention_vector=tanh(parameter.Attention_W*attention_input); 13 | scores=parameter.Attention_U'*attention_vector; 14 | 15 | Matrix=reshape(scores,N,T); 16 | Matrix=exp(Matrix); 17 | Matrix=Matrix.*docbatch.source_sen_mask(Index,:); 18 | norms=sum(Matrix,2); 19 | scores=bsxfun(@rdivide,Matrix, norms); 20 | scores=reshape(scores,1,N*T); 21 | 22 | M=bsxfun(@times,source_each_sen,scores); 23 | sum_vector=zeroMatrix([parameter.hidden,N]); 24 | for i=1:T 25 | sum_vector=sum_vector+M(:,(i-1)*N+1:i*N); 26 | end 27 | clear Matrix; 28 | clear norms; 29 | clear scores; 30 | clear M; 31 | end 32 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/Backward.m: -------------------------------------------------------------------------------- 1 | function[grad]=Backward(current_batch,result,grad,parameter) 2 | grad=Backward_Target(current_batch,result,grad,parameter); 3 | % backward propagatin for target sentences 4 | grad=Backward_Source_Sen(current_batch,result,grad,parameter); 5 | % backward propagatin for sources sentences 6 | end 7 | 8 | function[grad]=Backward_Source_Sen(docbatch,result,grad,parameter) 9 | num_sentence=size(docbatch.target_sen_matrix,1); 10 | for ll=1:parameter.layer_num 11 | dh{ll}=grad.source_h{ll,1}; % dh 12 | dc{ll}=grad.source_c{ll,1}; % dc 13 | end 14 | N=size(docbatch.source_sen_matrix,1); 15 | % number of documents in current batches 16 | T=size(docbatch.source_sen_matrix,2); 17 | % total time steps at sentence level 18 | zeroState=zeroMatrix([parameter.hidden,N]); 19 | clear grad.source_h; 20 | clear grad.source_c; 21 | d_source_sen=zeroMatrix([parameter.hidden,docbatch.num_of_source_sen]); 22 | for sen_tt=T:-1:1 23 | for ll=parameter.layer_num:-1:1 24 | if ll==parameter.layer_num 25 | dh{ll}=dh{ll}+grad.source_sen(:,N*(sen_tt-1)+1:N*sen_tt); 26 | end 27 | if sen_tt==1 28 | c_t_1 =zeroState; 29 | else 30 | c_t_1=result.c_t_source_sen{ll,sen_tt-1}; 31 | end 32 | c_t=result.c_t_source_sen{ll,sen_tt}; 33 | lstm=result.lstms_source_sen{ll,sen_tt}; 34 | dh{ll}(:,docbatch.source_delete{sen_tt})=0; 35 | dc{ll}(:,docbatch.source_delete{sen_tt})=0; 36 | % set values to zero for positions without words 37 | if sen_tt==1 38 | is_begin=1; 39 | else 40 | is_begin=0; 41 | end 42 | [lstm_grad]=lstmUnitGrad(lstm,c_t, 
c_t_1,dc{ll},dh{ll},ll,sen_tt,zeroState,parameter,parameter.Sen_S{ll},is_begin); 43 | % backward calculation for lstm unit at source sentence level 44 | lstm_grad.input(:,docbatch.source_delete{sen_tt})=0; 45 | dc{ll}(:,docbatch.source_delete{sen_tt})=0; 46 | dc{ll}=lstm_grad.dc; 47 | grad.Sen_S{ll}=grad.Sen_S{ll}+lstm_grad.W; 48 | 49 | lstm_grad.input(:,docbatch.source_delete{sen_tt})=0; 50 | dh{ll}=lstm_grad.input(end-parameter.hidden+1:end,:); 51 | dc{ll}=lstm_grad.dc; 52 | if ll~=1 53 | dh{ll-1}=dh{ll-1}+lstm_grad.input(1:parameter.hidden,:); 54 | else 55 | sen_index=docbatch.source_sen_matrix(:,sen_tt); 56 | d_source_sen(:,sen_index(docbatch.source_left{sen_tt}))=lstm_grad.input(1:parameter.hidden,docbatch.source_left{sen_tt}); 57 | end 58 | end 59 | end 60 | clear dh; 61 | clear dc; 62 | clear lstm_grad; 63 | clear result.c_t_source_sen; 64 | clear result.lstms_source_sen; 65 | 66 | grad=DealSentenceSource(d_source_sen,docbatch,result,grad,parameter); 67 | % backward propagation to source sentences at word level 68 | for ll=1:parameter.layer_num 69 | grad.Word_S{ll}=grad.Word_S{ll}/num_sentence; 70 | grad.Sen_S{ll}=grad.Sen_S{ll}/num_sentence; 71 | end 72 | grad.vect=grad.vect/num_sentence; 73 | end 74 | 75 | function[grad]=DealSentenceSource(d_source_sen,docbatch,result,grad,parameter) 76 | % backward propagation to source sentences at word level 77 | wordCount = 0; 78 | for i=1:length(docbatch.source_smallBatch) 79 | source_smallBatch=docbatch.source_smallBatch{i}; 80 | Word=source_smallBatch.Word; 81 | N=size(Word,1); 82 | % number of sentences for current batch 83 | T=size(Word,2); 84 | % total number of time steps 85 | zeroState=zeroMatrix([parameter.hidden,N]); 86 | for ll=1:parameter.layer_num 87 | dh{ll}=zeroState; 88 | dc{ll}=zeroState; 89 | end 90 | for word_tt=T:-1:1 91 | unmaskedIds=source_smallBatch.Left{word_tt}; 92 | for ll=parameter.layer_num:-1:1 93 | if word_tt==T&&ll==parameter.layer_num 94 | dh{ll}=d_source_sen(:,source_smallBatch.Sen); 95 | end 96 | if word_tt==1 97 | c_t_1 =zeroState; 98 | else c_t_1=result.c_t_source_word{i}{ll,word_tt-1}; 99 | end 100 | c_t=result.c_t_source_word{i}{ll,word_tt}; 101 | lstm=result.lstms_source_word{i}{ll,word_tt}; 102 | if word_tt==1 103 | is_begin=1; 104 | else 105 | is_begin=0; 106 | end 107 | [lstm_grad]=lstmUnitGrad(lstm,c_t, c_t_1,dc{ll},dh{ll},ll,word_tt,zeroState,parameter,parameter.Word_S{ll},is_begin); 108 | % backward calculation for lstm unit at source word level 109 | grad.Word_S{ll}=grad.Word_S{ll}+lstm_grad.W; 110 | lstm_grad.input(:,source_smallBatch.Delete{word_tt})=0; 111 | dh{ll}=lstm_grad.input(end-parameter.hidden+1:end,:); 112 | dc{ll}=lstm_grad.dc; 113 | if ll~=1 114 | dh{ll-1}=dh{ll-1}+lstm_grad.input(1:parameter.hidden,:); 115 | else 116 | % gradient for word tokens 117 | embIndices=Word(unmaskedIds,word_tt)'; 118 | embGrad = lstm_grad.input(1:parameter.dimension,unmaskedIds); 119 | numWords = length(embIndices); 120 | allEmbIndices(wordCount+1:wordCount+numWords) = embIndices; 121 | allEmbGrads(:, wordCount+1:wordCount+numWords) = embGrad; 122 | wordCount = wordCount + numWords; 123 | end 124 | end 125 | end 126 | end 127 | allEmbGrads(:, wordCount+1:end) = []; 128 | allEmbIndices(wordCount+1:end) = []; 129 | [W_emb,indices] = aggregateMatrix(allEmbGrads, allEmbIndices); 130 | grad.vect(:,indices)=grad.vect(:,indices)+W_emb; 131 | clear allEmbIndices; 132 | clear allEmbGrads; 133 | clear W_emb; 134 | clear indices; 135 | clear result.c_t_source_word; 136 | clear result.h_t_source_word; 137 | clear 
result.lstms_source_word; 138 | end 139 | 140 | 141 | function[grad]=Backward_Target(current_batch,result,grad,parameter) 142 | % backward propagatin for target sentences 143 | num_sentence=size(current_batch.target_sen_matrix,1); 144 | for ll=1:parameter.layer_num 145 | grad.Word_T{ll}=zeroMatrix(size(parameter.Word_T{ll})); 146 | grad.Word_S{ll}=zeroMatrix(size(parameter.Word_S{ll})); 147 | grad.Sen_S{ll}=zeroMatrix(size(parameter.Sen_S{ll})); 148 | grad.Sen_T{ll}=zeroMatrix(size(parameter.Sen_T{ll})); 149 | end 150 | grad.Attention_U=zeroMatrix(size(parameter.Attention_U)); 151 | grad.Attention_W=zeroMatrix(size(parameter.Attention_W)); 152 | grad.vect=zeroMatrix(size(parameter.vect)); 153 | 154 | N_target_sen=size(current_batch.target_sen_matrix,1); 155 | T_target_sen=size(current_batch.target_sen_matrix,2); 156 | zeroState=zeroMatrix([parameter.hidden, size(current_batch.target_sen_matrix,1)]); 157 | L=[]; 158 | N_source=size(current_batch.source_sen_matrix,1); 159 | T_source=size(current_batch.source_sen_matrix,2); 160 | grad.source_sen=zeroMatrix([parameter.hidden,N_source*T_source]); 161 | 162 | for sen_tt=T_target_sen-1:-1:0 163 | target_sen=result.Target_sen{sen_tt+1}; 164 | grad=DealSentenceTarget(current_batch,target_sen,sen_tt+1,grad,L,parameter,result); 165 | grad.vect(:,grad.indices)=grad.vect(:,grad.indices)+grad.W_emb; 166 | clear grad.indices; 167 | clear grad.W_emb; 168 | if sen_tt==0 169 | continue; 170 | end 171 | for ll=parameter.layer_num:-1:1 172 | if sen_tt==1 173 | dim=size(result.c_t_source_sen); 174 | c_t_1=result.c_t_source_sen{ll,dim(2)}; 175 | else c_t_1=result.c_t_target_sen{ll,sen_tt-1}; 176 | end 177 | c_t=result.c_t_target_sen{ll,sen_tt}; 178 | lstm=result.lstms_target_sen{ll,sen_tt}; 179 | [lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1,grad.target_sen_c{ll,sen_tt},grad.target_sen_h{ll,sen_tt},ll,sen_tt,zeroState,parameter,parameter.Sen_T{ll},0); 180 | % backpropagation for current lstm unit at sentence level 181 | lstm_grad.input(:,current_batch.target_delete{sen_tt+1})=0; 182 | lstm_grad.dc(:,current_batch.target_delete{sen_tt+1})=0; 183 | if sen_tt~=1 184 | grad.target_sen_h{ll,sen_tt-1}=lstm_grad.input(end-parameter.hidden+1:end,:)+grad.target_sen_h{ll,sen_tt-1}; 185 | grad.target_sen_c{ll,sen_tt-1}=lstm_grad.dc+grad.target_sen_c{ll,sen_tt-1}; 186 | else 187 | grad.source_h{ll,1}=grad.source_h{ll,1}+lstm_grad.input(end-parameter.hidden+1:end,:); 188 | grad.source_c{ll,1}=grad.source_c{ll,1}+lstm_grad.dc; 189 | end 190 | grad.Sen_T{ll}=grad.Sen_T{ll}+lstm_grad.W; 191 | if ll~=1 192 | grad.target_sen_h{ll-1,sen_tt}=lstm_grad.input(1:parameter.hidden,:)+grad.target_sen_h{ll-1,sen_tt}; 193 | else 194 | L=lstm_grad.input(1:parameter.hidden,:); 195 | dm=lstm_grad.input(parameter.hidden+1:2*parameter.hidden,:); 196 | % derivatives for attention vectors 197 | rep_dm=repmat(dm,1,T_source); 198 | grad.source_sen=grad.source_sen+bsxfun(@times,rep_dm,reshape(result.scores{sen_tt},1,N_source*T_source)); 199 | da=sum(result.source_each_sen.*rep_dm); 200 | da=reshape(da,N_source,T_source); 201 | % derivatives for attention weights (normalzied) 202 | dp=gpuArray(); 203 | for i=1:N_source 204 | dp=[dp;da(i,:)*result.dA{sen_tt}(T_source*(i-1)+1:T_source*i,:)]; 205 | end 206 | % derivatives for attention values before normalization 207 | dp=reshape(dp,1,N_source*T_source); 208 | grad.Attention_U=grad.Attention_U+sum(bsxfun(@times,result.attention_vector{sen_tt},dp),2); 209 | % derivatives for Attention_U 210 | 
d_attention_vector=arrayfun(@tanhPrimeDouble,result.attention_vector{sen_tt},parameter.Attention_U*dp); 211 | % derivatives for attention vectors: attention-value=U^T* vectors 212 | grad.Attention_W=grad.Attention_W+d_attention_vector*result.attention_input{sen_tt}'; 213 | % derivatives for attention vectors: vectors= tanh (W * concatenations) 214 | d_input=parameter.Attention_W'*d_attention_vector; 215 | clear d_attention_vector; 216 | grad.source_sen=grad.source_sen+d_input(1:parameter.hidden,:); 217 | V=zeroMatrix([parameter.hidden,N_target_sen]); 218 | for i=1:T_source 219 | V=V+d_input(parameter.hidden+1:2*parameter.hidden,N_source*(i-1)+1:N_source*i); 220 | end 221 | if sen_tt~=1 222 | grad.target_sen_h{parameter.layer_num,sen_tt-1}=grad.target_sen_h{parameter.layer_num,sen_tt-1}+V; 223 | else 224 | grad.source_h{parameter.layer_num,1}=grad.source_h{parameter.layer_num,1}+V; 225 | end 226 | clear V; 227 | clear d_input; 228 | end 229 | end 230 | end 231 | clear result.attention_input; 232 | clear result.attention_vector; 233 | clear result.dA; 234 | clear result.scores; 235 | clear result.sum_vector; 236 | 237 | for ll=1:parameter.layer_num 238 | grad.Word_T{ll}=grad.Word_T{ll}/num_sentence; 239 | grad.Sen_T{ll}=grad.Sen_T{ll}/num_sentence; 240 | end 241 | grad.Attention_U=grad.Attention_U/num_sentence; 242 | grad.Attention_W=grad.Attention_W/num_sentence; 243 | clear grad.target_sen_c; 244 | clear grad.target_sen_h; 245 | clear zeroState; 246 | clear lstm_grad; 247 | clear c_t_1; 248 | clear c_t; 249 | clear lstm; 250 | clear grad.ht; 251 | clear lstm_grad; 252 | end 253 | 254 | 255 | function[grad]=DealSentenceTarget(docbatch,target_sen,sen_tt,grad,end_grad,parameter,result) 256 | % back propagation to target sentences 257 | target_word=docbatch.target_word{sen_tt}; 258 | Word=target_word.Word; 259 | Word_Delete=target_word.Delete; 260 | Word_Left=target_word.Left; 261 | N=size(Word,1); 262 | T=size(Word,2); 263 | zeroState=zeroMatrix([parameter.hidden,N]); 264 | for ll=parameter.layer_num:-1:1 265 | dh{ll} = zeroState; 266 | dc{ll} = zeroState; 267 | end 268 | wordCount = 0; 269 | numInputWords=size(Word,1)*size(Word,2); 270 | allEmbGrads=zeroMatrix([parameter.dimension,numInputWords]); 271 | allEmbIndices=[]; 272 | for word_tt=T:-1:1 273 | if word_tt==T &&length(end_grad)==0 274 | continue; 275 | end 276 | unmaskedIds=Word_Left{word_tt}; 277 | for ll=parameter.layer_num:-1:1 278 | if ll==parameter.layer_num 279 | if word_tt==T 280 | if length(end_grad)~=0 281 | dh{ll}=dh{ll}+end_grad; 282 | end 283 | else 284 | dh{ll}=dh{ll}+grad.ht{sen_tt}{:,word_tt}; 285 | end 286 | end 287 | if word_tt==1&& sen_tt==1 288 | dim=size(result.c_t_source_sen); 289 | c_t_1=result.c_t_source_sen{ll,dim(2)}; 290 | elseif word_tt==1&& sen_tt~=1 291 | c_t_1=result.c_t_target_sen{ll,sen_tt-1}; 292 | else c_t_1=target_sen.c_t_target_word{ll,word_tt-1}; 293 | end 294 | c_t =target_sen.c_t_target_word{ll,word_tt}; 295 | lstm =target_sen.lstms{ll,word_tt}; 296 | %dh{ll}(:,Word_Delete{word_tt})=0; 297 | 298 | if length(Word_Delete{word_tt})~=0 299 | dh{ll}(:,Word_Delete{word_tt})=0; 300 | dc{ll}(:,Word_Delete{word_tt})=0; 301 | end 302 | % set values to 0 303 | [lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1, dc{ll}, dh{ll},ll,word_tt,zeroState,parameter,parameter.Word_T{ll},0); 304 | % back propagation for current word level lstm unit 305 | grad.Word_T{ll}=grad.Word_T{ll}+lstm_grad.W; 306 | dc{ll} = lstm_grad.dc; 307 | dc{ll}(:,Word_Delete{word_tt})=0; 308 | dh{ll} = lstm_grad.input(end-parameter.hidden+1:end,:); 
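% Editor's note (clarifying comment, not in the original source): lstm_grad.input
% is stacked in the same order as the lstmUnit input vector, so its first block
% (parameter.dimension rows when ll==1, parameter.hidden rows otherwise) is the
% gradient flowing to the layer input x_t, and the last parameter.hidden rows,
% taken just above, are the gradient flowing to the previous hidden state h_t_1.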
309 | if length(end_grad)~=0 && length(Word_Delete{word_tt})~=0&&ll==parameter.layer_num 310 | dh{ll}(:,Word_Delete{word_tt})=end_grad(:,Word_Delete{word_tt}); 311 | % gradients for positions that have no words stay the same 312 | end 313 | 314 | 315 | if word_tt==1 316 | if sen_tt==1 317 | grad.source_h{ll,1}=grad.source_h{ll,1}+dh{ll}; 318 | grad.source_c{ll,1}=grad.source_c{ll,1}+dc{ll}; 319 | else 320 | grad.target_sen_h{ll,sen_tt-1}=grad.target_sen_h{ll,sen_tt-1}+dh{ll}; 321 | grad.target_sen_c{ll,sen_tt-1}=grad.target_sen_c{ll,sen_tt-1}+dc{ll}; 322 | end 323 | end 324 | if ll==1 325 | % gradients for word embeddings 326 | embIndices=Word(unmaskedIds,word_tt)'; 327 | embGrad = lstm_grad.input(1:parameter.dimension,unmaskedIds); 328 | numWords = length(embIndices); 329 | allEmbIndices(wordCount+1:wordCount+numWords) = embIndices; 330 | allEmbGrads(:, wordCount+1:wordCount+numWords) = embGrad; 331 | wordCount = wordCount + numWords; 332 | else 333 | dh{ll-1}(:,unmaskedIds)=dh{ll-1}(:,unmaskedIds)+lstm_grad.input(1:parameter.hidden,unmaskedIds); 334 | end 335 | end 336 | end 337 | allEmbGrads(:, wordCount+1:end) = []; 338 | allEmbIndices(wordCount+1:end) = []; 339 | [grad.W_emb, grad.indices] = aggregateMatrix(allEmbGrads, allEmbIndices); 340 | clear dc; 341 | clear dh; 342 | clear allEmbGrads; 343 | clear allEmbIndices; 344 | clear allEmbGrads; 345 | clear lstm_grad; 346 | end 347 | 348 | function[lstm_grad]=lstmUnitGrad(lstm, c_t, c_t_1, dc, dh, ll, t, zero_state,parameter,W,very_begin) 349 | dc = arrayfun(@plusMult, dc, lstm.o_gate, dh); 350 | do = arrayfun(@sigmoidPrimeTriple, lstm.o_gate, c_t, dh); 351 | di = arrayfun(@sigmoidPrimeTriple, lstm.i_gate, lstm.a_signal, dc); 352 | 353 | if very_begin==1 354 | df = zero_state; 355 | else 356 | df = arrayfun(@sigmoidPrimeTriple, lstm.f_gate, c_t_1, dc); 357 | end 358 | lstm_grad.dc = lstm.f_gate.*dc; 359 | dl = arrayfun(@tanhPrimeTriple, lstm.a_signal, lstm.i_gate, dc); 360 | d_ifoa = [di; df; do; dl]; 361 | lstm_grad.W = d_ifoa*lstm.input'; %dw 362 | lstm_grad.input=W'*d_ifoa; 363 | if parameter.dropout~=0 364 | if ll==1 365 | lstm_grad.input(1:parameter.dimension, :) = lstm_grad.input(1:parameter.dimension, :).*lstm.drop_left; 366 | else lstm_grad.input(1:parameter.hidden, :) = lstm_grad.input(1:parameter.hidden, :).*lstm.drop_left; 367 | end 368 | end 369 | if parameter.clip==1 370 | lstm_grad.input=arrayfun(@clipBackward, lstm_grad.input); 371 | lstm_grad.dc = arrayfun(@clipBackward, lstm_grad.dc); 372 | end 373 | clear dc; clear do; clear di; clear df; clear d_ifoa; 374 | end 375 | function[value]=dTanh(x) 376 | value=(1-x*x); 377 | end 378 | 379 | function [value] = plusTanhPrimeTriple(t, x, y, z) 380 | value = t + (1-x*x)*y*z; 381 | end 382 | function [value] = tanhPrimeDouble(x, y) 383 | value = (1-x*x)*y; 384 | end 385 | function [value] = tanhPrimeTriple(x, y, z) 386 | value = (1-x*x)*y*z; 387 | end 388 | function [value] = plusMult(x, y, z) 389 | value = x + y*z; 390 | end 391 | function [value] = sigmoidPrimeTriple(x, y, z) 392 | value = x*(1-x)*y*z; 393 | end 394 | 395 | function [clippedValue] = clipBackward(x) 396 | if x>1000 clippedValue = single(1000); 397 | elseif x<-1000 clippedValue = single(-1000); 398 | else clippedValue =single(x); 399 | end 400 | end 401 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/Batch.m: -------------------------------------------------------------------------------- 1 | classdef Batch < handle 2 | properties 3 | MaxLenSource=-1; 4 | 
MaxLenTarget=-1; 5 | MaxLen=-1; 6 | Word=[]; 7 | Delete={}; 8 | Left={}; 9 | Mask={}; 10 | Label=[]; 11 | SourceLength=[]; 12 | N_word=0; 13 | end 14 | end 15 | 16 | 17 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/Forward.m: -------------------------------------------------------------------------------- 1 | function[result]=Forward(docbatch,parameter,isTraining) 2 | [result]=Forward_Source_Word(docbatch,parameter,isTraining); 3 | % forward calculation at word level for each sentence 4 | [result]=Forward_Source_Sen(result,docbatch,parameter,isTraining); 5 | % forward calculation at sentence level for each document 6 | if isTraining==1 7 | [result]=Forward_Target(result,docbatch,parameter,isTraining); 8 | end 9 | end 10 | 11 | function[result]=Forward_Source_Word(docbatch,parameter,isTraining) 12 | % forward calculation at word level for each sentence 13 | sourceBatch=docbatch.source_smallBatch; 14 | result.source_sen_vector=[]; 15 | for i=1:length(sourceBatch) 16 | batch=sourceBatch{i}; 17 | T=batch.max_length; 18 | h_t_source_word=cell(parameter.layer_num,T); 19 | % store h_t 20 | result.c_t_source_word{i}=cell(parameter.layer_num,T); 21 | % store c_t 22 | result.lstms_source_word{i} = cell(parameter.layer_num,T); 23 | % store gate values for lstm units 24 | N=size(batch.Word,1); 25 | zeroState=zeroMatrix([parameter.hidden,N]); 26 | for ll=1:parameter.layer_num 27 | for tt=1:T 28 | h_t_source_word{ll,tt}=zeroState; 29 | result.c_t_source_word{i}{ll,tt}=zeroState; 30 | end 31 | end 32 | for t=1:T 33 | for ll=1:parameter.layer_num 34 | W=parameter.Word_S{ll}; 35 | % W for word level composition in source 36 | if t==1 37 | h_t_1=zeroState; 38 | c_t_1=zeroState; 39 | else 40 | c_t_1=result.c_t_source_word{i}{ll, t-1}; 41 | h_t_1=h_t_source_word{ll, t-1}; 42 | end 43 | if ll==1 44 | x_t=parameter.vect(:,batch.Word(:,t)); 45 | else 46 | x_t=h_t_source_word{ll-1,t}; 47 | end 48 | x_t(:,batch.Delete{t})=0; 49 | h_t_1(:,batch.Delete{t})=0; 50 | c_t_1(:,batch.Delete{t})=0; 51 | % set postion that do not have words to zero 52 | [result.lstms_source_word{i}{ll, t},h_t_source_word{ll, t},result.c_t_source_word{i}{ll, t}]=lstmUnit(W,parameter,x_t,[],h_t_1,c_t_1,ll,t,isTraining); 53 | % compute lstm unit 54 | if t==T && ll==parameter.layer_num 55 | result.source_sen_vector=[result.source_sen_vector,h_t_source_word{ll,t}]; 56 | % store vector embeddings for each source sentence 57 | clear h_t_source_word; 58 | end 59 | end 60 | end 61 | end 62 | clear x_t; 63 | clear h_t_1; 64 | clear c_t_1; 65 | end 66 | 67 | function[result]=Forward_Source_Sen(result,docbatch,parameter,isTraining) 68 | % forward calculation at sentence level for each document 69 | T=docbatch.max_source_sen; 70 | h_t_source_sen=cell(parameter.layer_num,T); 71 | result.c_t_source_sen=cell(parameter.layer_num,T); 72 | result.lstms_source_sen=cell(parameter.layer_num,T); 73 | result.source_sen=cell(parameter.layer_num,1); 74 | result.source_each_sen=gpuArray(); 75 | 76 | N=size(docbatch.source_sen_matrix,1); 77 | zeroState=zeroMatrix([parameter.hidden,N]); 78 | for ll=1:parameter.layer_num 79 | for tt=1:T 80 | h_t_source_sen{ll,tt}=zeroState; 81 | result.c_t_source_sen{ll,tt}=zeroState; 82 | end 83 | end 84 | for t=1:T 85 | for ll=1:parameter.layer_num 86 | W=parameter.Sen_S{ll}; 87 | % sentence-level compostion 88 | if t==1 89 | h_t_1=zeroState; 90 | c_t_1 =zeroState; 91 | else 92 | c_t_1 =result.c_t_source_sen{ll, t-1}; 93 | h_t_1 =h_t_source_sen{ll, t-1}; 94 | end 95 | if ll==1 
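% Editor's note (clarifying comment, not in the original source): at the first
% sentence-level layer the input x_t is the sentence vector produced by the
% word-level LSTM (top-layer hidden state at the last word position), looked up
% through the document-to-sentence index matrix on the next line.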
96 | x_t=result.source_sen_vector(:,docbatch.source_sen_matrix(:,t)); 97 | % compute sentence-level embedding 98 | else 99 | x_t=h_t_source_sen{ll-1,t}; 100 | end 101 | x_t(:,docbatch.source_delete{t})=0; 102 | h_t_1(:,docbatch.source_delete{t})=0; 103 | c_t_1(:,docbatch.source_delete{t})=0; 104 | % set values for deleted postion to 0 105 | [result.lstms_source_sen{ll, t},h_t_source_sen{ll, t},result.c_t_source_sen{ll, t}]=lstmUnit(W,parameter,x_t,[],h_t_1,c_t_1,ll,t,isTraining); 106 | % lstm unit calculation at sentence level 107 | if t==T 108 | result.source_sen{ll,1}=h_t_source_sen{ll, t}; 109 | % store vector embeddings for documents 110 | end 111 | if ll==parameter.layer_num 112 | result.source_each_sen=[result.source_each_sen,h_t_source_sen{parameter.layer_num,t}]; 113 | end 114 | end 115 | end 116 | clear result.source_sen_vector; 117 | clear h_t_source_sen; 118 | clear x_t; 119 | clear h_t_1; 120 | clear c_t_1; 121 | end 122 | 123 | function[result]=Forward_Target(result,docbatch,parameter,isTraining) 124 | % forward for target documents 125 | T=docbatch.max_target_sen; 126 | result.h_t_target_sen=cell(parameter.layer_num,T); 127 | % sentence level embeddings 128 | result.c_t_target_sen=cell(parameter.layer_num,T); 129 | result.lstms_target_sen=cell(parameter.layer_num,T); 130 | N=size(docbatch.target_sen_matrix,1); 131 | zeroState=zeroMatrix([parameter.hidden,N]); 132 | 133 | for ll=1:parameter.layer_num 134 | for tt=1:T 135 | result.h_t_target_sen{ll,tt}=zeroState; 136 | result.c_t_target_sen{ll,tt}=zeroState; 137 | end 138 | end 139 | result.Target_sen={}; 140 | for sen_tt=1:T 141 | for ll=1:parameter.layer_num 142 | W=parameter.Sen_T{ll}; 143 | % sentnece compositions for target 144 | if sen_tt==1 145 | % if sentence index is 1, h_t_1 and c_t_1 are from outputs from source sentences 146 | h_t_1=result.source_sen{ll,1}; 147 | dim=size(result.c_t_source_sen); 148 | c_t_1=result.c_t_source_sen{ll,dim(2)}; 149 | else 150 | % otherwise, c_t_1 are lstm outputs from last time step 151 | c_t_1 =result.c_t_target_sen{ll, sen_tt-1}; 152 | h_t_1 =result.h_t_target_sen{ll, sen_tt-1}; 153 | end 154 | if ll==1 155 | Word_List=docbatch.target_word{sen_tt}.Word; 156 | Word_Delete=docbatch.target_word{sen_tt}.Delete; 157 | if sen_tt==1 158 | M1=result.source_sen(:,1); 159 | dim=size(result.c_t_source_sen); 160 | M2=result.c_t_source_sen(:,dim(2)); 161 | else 162 | M1=result.h_t_target_sen(:,sen_tt-1); 163 | M2=result.c_t_target_sen(:,sen_tt-1); 164 | end 165 | result=Forward_Target_Word(result,M1,M2,Word_List,Word_Delete,docbatch,parameter,isTraining,sen_tt); 166 | x_t=result.Target_sen{sen_tt}.h_t_target_word{parameter.layer_num,size(Word_List,2)}; 167 | result=Attention(docbatch,result,M1{parameter.layer_num,1},parameter,sen_tt); 168 | % compute attention values 169 | m_t=result.sum_vector{sen_tt}; 170 | else 171 | x_t=result.h_t_target_sen{ll-1,sen_tt}; 172 | m_t=[]; 173 | end 174 | x_t(:,docbatch.target_delete{sen_tt})=0; 175 | h_t_1(:,docbatch.target_delete{sen_tt})=0; 176 | c_t_1(:,docbatch.target_delete{sen_tt})=0; 177 | % set deleted postions to 0 value 178 | [result.lstms_target_sen{ll,sen_tt},result.h_t_target_sen{ll,sen_tt},result.c_t_target_sen{ll,sen_tt}]=lstmUnit(W,parameter,x_t,m_t,h_t_1,c_t_1,ll,sen_tt,isTraining); 179 | % lstm unit calculation 180 | end 181 | end 182 | clear target_sen_vector; 183 | clear x_t; 184 | clear h_t_1; 185 | clear c_t_1; 186 | end 187 | 188 | 
function[result]=Forward_Target_Word(result,h_t_sen,c_t_sen,Word_List,Word_Delete,docbatch,parameter,isTraining,sen_tt) 189 | % obtain sentence level embeddings for target sentences 190 | N=size(Word_List,1); 191 | T=size(Word_List,2); 192 | target_sen.h_t_target_word=cell(parameter.layer_num,T); 193 | target_sen.c_t_target_word=cell(parameter.layer_num,T); 194 | target_sen.lstms=cell(parameter.layer_num,T); 195 | zeroState=zeroMatrix([parameter.hidden,N]); 196 | 197 | for ll=1:parameter.layer_num 198 | for tt=1:T 199 | target_sen.h_t_target_word{ll,tt}=zeroState; 200 | target_sen.c_t_target_word{ll,tt}=zeroState; 201 | end 202 | end 203 | for t=1:T 204 | for ll=1:parameter.layer_num 205 | W=parameter.Word_T{ll}; 206 | if t==1 207 | h_t_1=h_t_sen{ll,1}; 208 | c_t_1=c_t_sen{ll,1}; 209 | else 210 | c_t_1 =target_sen.c_t_target_word{ll, t-1}; 211 | h_t_1 =target_sen.h_t_target_word{ll, t-1}; 212 | end 213 | if ll==1 214 | x_t=parameter.vect(:,Word_List(:,t)); 215 | % if ll=1, x_t are correspondent word embeddings 216 | else 217 | x_t=target_sen.h_t_target_word{ll-1,t}; 218 | % else x_t are outputs from previous layer 219 | end 220 | x_t(:,Word_Delete{t})=0; 221 | h_t_1(:,Word_Delete{t})=0; 222 | c_t_1(:,Word_Delete{t})=0; 223 | % set deleted positions to 0 224 | [target_sen.lstms{ll, t},target_sen.h_t_target_word{ll, t},target_sen.c_t_target_word{ll, t}]=lstmUnit(W,parameter,x_t,[],h_t_1,c_t_1,ll,t,isTraining); 225 | % lstm unit calculations 226 | if t~=1 227 | target_sen.h_t_target_word{ll, t}(:,Word_Delete{t})=target_sen.h_t_target_word{ll,t-1}(:,Word_Delete{t}); 228 | % sentence representations stay the same for deleted positions 229 | end 230 | end 231 | end 232 | result.Target_sen{sen_tt}=target_sen; 233 | clear h_t_1; 234 | clear c_t_1; 235 | clear x_t; 236 | end 237 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/ReadData.m: -------------------------------------------------------------------------------- 1 | function[batch,Stop]=ReadData(fd_s,fd_t,parameter) 2 | tline_s = fgets(fd_s); 3 | Target={}; 4 | Stop=0; 5 | doc_index=1; 6 | sen_index=0; 7 | i=0; 8 | 9 | length_array=[]; 10 | num_of_sen=0; 11 | max_source_sen=0; 12 | while ischar(tline_s) 13 | text_s=deblank(tline_s); 14 | if length(text_s)==0 15 | sourceDoc{doc_index}=[i-sen_index+1,i]; 16 | doc_index=doc_index+1; 17 | if sen_index>max_source_sen 18 | max_source_sen=sen_index; 19 | end 20 | sen_index=0; 21 | if doc_index==parameter.batch_size+1 22 | break; 23 | end 24 | else 25 | i=i+1; 26 | sen_index=sen_index+1; 27 | num_of_sen=num_of_sen+1; 28 | if parameter.Source_Target_Same_Language~=1 29 | Source{i}=str2num(text_s)+parameter.TargetVocab; 30 | else 31 | Source{i}=str2num(text_s); 32 | end 33 | length_array=[length_array;length(Source{i})]; 34 | end 35 | tline_s = fgets(fd_s); 36 | end 37 | if ischar(tline_s)==0 38 | Stop=1; 39 | end 40 | if exist('sourceDoc')==0 41 | Stop=1; 42 | batch.source_smallBatch=[]; 43 | return; 44 | end 45 | batch.num_of_source_sen=num_of_sen; 46 | source_sen_matrix=ones(length(sourceDoc),max_source_sen); 47 | source_sen_mask=ones(length(sourceDoc),max_source_sen); 48 | num_word_in_doc=[]; 49 | for i=1:length(sourceDoc) 50 | L=sourceDoc{i}(1):sourceDoc{i}(2); 51 | n_word=0; 52 | for j=sourceDoc{i}(1):sourceDoc{i}(2) 53 | n_word=n_word+length(Source{j}); 54 | end 55 | num_word_in_doc=[num_word_in_doc,n_word]; 56 | l=length(L); 57 | source_sen_matrix(i,max_source_sen-l+1:max_source_sen)=L; 58 | source_sen_mask(i,1:max_source_sen-l)=0; 59 | end 
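end
% Padding convention: source sentences are right-aligned in source_sen_matrix
% (sentence ids fill the trailing columns, mask zeros the leading ones), while the
% target sentences built further below are left-aligned, with padding and mask zeros
% in the trailing columns.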
60 | batch.num_word_in_doc=num_word_in_doc; 61 | source_delete={}; 62 | for j=1:size(source_sen_mask,2) 63 | source_delete{j}=find(source_sen_mask(:,j)==0); 64 | source_left{j}=find(source_sen_mask(:,j)==1); 65 | end 66 | [a,b]=sort(length_array); 67 | c=ones(length(b),1); 68 | c(b)=1:length(b); 69 | source_sen_matrix=c(source_sen_matrix); 70 | if size(source_sen_matrix,2)==1 71 | source_sen_matrix=source_sen_matrix'; 72 | end 73 | 74 | source_smallBatch={}; 75 | sen_size=32; 76 | num_small=ceil(num_of_sen/sen_size); 77 | for i=1:num_small 78 | Begin=sen_size*(i-1)+1; 79 | End=sen_size*i; 80 | if End>num_of_sen 81 | End=num_of_sen; 82 | end 83 | max_length=-1; 84 | for j=Begin:End 85 | if length(Source{b(j)})>max_length 86 | max_length=length(Source{b(j)}); 87 | end 88 | end 89 | N=End-Begin+1; 90 | source_smallBatch{i}.max_length=max_length; 91 | source_smallBatch{i}.Word=ones(N,max_length); 92 | source_smallBatch{i}.Mask=ones(N,max_length); 93 | source_smallBatch{i}.Sen=Begin:End; 94 | for j=Begin:End 95 | source_length=length(Source{b(j)}); 96 | source_smallBatch{i}.Word(j-Begin+1,max_length-source_length+1:max_length)=Source{b(j)}; 97 | source_smallBatch{i}.Mask(j-Begin+1,1:max_length-source_length)=0; 98 | end 99 | for j=1:max_length 100 | source_smallBatch{i}.Delete{j}=find(source_smallBatch{i}.Mask(:,j)==0); 101 | source_smallBatch{i}.Left{j}=find(source_smallBatch{i}.Mask(:,j)==1); 102 | end 103 | %disp('source') 104 | %source_smallBatch{i}.Word 105 | %source_smallBatch{i}.Mask 106 | end 107 | 108 | 109 | End=0; 110 | doc_index=1; 111 | sen_index=0; 112 | i=0; 113 | Target={}; 114 | target_doc_index_array=[]; 115 | target_sen_index_array=[]; 116 | max_target_sen=0; 117 | length_array=[]; 118 | num_of_sen=0; 119 | tline_t = fgets(fd_t); 120 | targetDoc={}; 121 | while ischar(tline_t) 122 | text_t=deblank(tline_t); 123 | if length(text_t)==0 124 | Target{i}(length(Target{i}))=parameter.doc_stop; 125 | targetDoc{doc_index}=[i-sen_index+1,i]; 126 | doc_index=doc_index+1; 127 | if sen_index>max_target_sen 128 | max_target_sen=sen_index; 129 | end 130 | sen_index=0; 131 | if doc_index==parameter.batch_size+1 132 | break; 133 | end 134 | else 135 | i=i+1; 136 | num_of_sen=num_of_sen+1; 137 | sen_index=sen_index+1; 138 | Target{i}=[str2num(text_t),parameter.sen_stop]; 139 | length_array=[length_array;length(Target{i})]; 140 | end 141 | tline_t = fgets(fd_t); 142 | end 143 | 144 | target_sen_matrix=ones(length(targetDoc),max_target_sen); 145 | target_sen_mask=ones(length(targetDoc),max_target_sen); 146 | for i=1:length(targetDoc) 147 | L=targetDoc{i}(1):targetDoc{i}(2); 148 | l=length(L); 149 | target_sen_matrix(i,1:l)=L; 150 | target_sen_mask(i,l+1:max_target_sen)=0; 151 | end 152 | target_word={}; 153 | for i=1:size(target_sen_matrix,2) 154 | Max_l=1; 155 | sen_list=target_sen_matrix(:,i); 156 | mask_list=target_sen_mask(:,i); 157 | for j=1:length(sen_list) 158 | if mask_list(j)==0 159 | continue; 160 | end 161 | index=sen_list(j); 162 | Length=length(Target{index}); 163 | if Length>Max_l 164 | Max_l=Length; 165 | end 166 | end 167 | target_word{i}.Word=ones(length(sen_list),Max_l); 168 | target_word{i}.Mask=ones(length(sen_list),Max_l); 169 | target_word{i}.End=[]; 170 | for j=1:length(sen_list) 171 | if mask_list(j)==0 172 | target_word{i}.Mask(j,:)=0; 173 | continue; 174 | end 175 | index=sen_list(j); 176 | Length=length(Target{index}); 177 | target_word{i}.Word(j,1:Length)=Target{index}; 178 | target_word{i}.Mask(j,Length+1:end)=0; 179 | end 180 | for j=1:size(target_word{i}.Mask,2) 
181 | target_word{i}.Delete{j}=find(target_word{i}.Mask(:,j)==0); 182 | target_word{i}.Left{j}=find(target_word{i}.Mask(:,j)==1); 183 | end 184 | end 185 | target_delete={}; 186 | for j=1:size(target_sen_mask,2) 187 | target_delete{j}=find(target_sen_mask(:,j)==0); 188 | end 189 | 190 | batch.source_smallBatch=source_smallBatch; 191 | batch.max_source_sen=max_source_sen; 192 | batch.max_target_sen=max_target_sen; 193 | 194 | batch.target_sen_matrix=target_sen_matrix; 195 | batch.source_sen_matrix=source_sen_matrix; 196 | batch.target_sen_mask=target_sen_mask; 197 | batch.source_sen_mask=source_sen_mask; 198 | batch.source_delete=source_delete; 199 | batch.source_left=source_left; 200 | 201 | batch.target_delete=target_delete; 202 | batch.target_word=target_word; 203 | 204 | 205 | clear target_sen_matrix; 206 | clear source_sen_matrix; 207 | clear target_sen_mask; 208 | clear source_sen_mask; 209 | clear source_smallBatch; 210 | clear target_doc_index_array; 211 | 212 | clear source_delete; 213 | clear source_left; 214 | clear target_delete; 215 | 216 | clear target_word; 217 | end 218 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/decode_greedy.m: -------------------------------------------------------------------------------- 1 | function[Translations]=decode_greedy(parameter,batch,filename) 2 | T=size(batch.source_sen_matrix,2); 3 | N=size(batch.source_sen_matrix,1); 4 | L=length(batch.source_smallBatch); 5 | Max_Length=1.5*max(batch.num_word_in_doc); 6 | 7 | result=Forward(batch,parameter,0); 8 | first_words=BeamStep(parameter,result.source_sen{parameter.layer_num,1},1); 9 | beamHistory=oneMatrix([N,Max_Length]); 10 | beamHistory(:,1)=first_words(:); 11 | 12 | for ll=1:parameter.layer_num 13 | beamSenStates_h{ll}=result.source_sen{ll,1}; 14 | beamWordStates_h{ll}=result.source_sen{ll,1}; 15 | beamSenStates_c{ll}=result.c_t_source_sen{ll,T}; 16 | beamWordStates_c{ll}=result.c_t_source_sen{ll,T}; 17 | end 18 | for position=1:Max_Length 19 | words=beamHistory(:,position); 20 | for ll=1:parameter.layer_num 21 | if ll==1 22 | x_t=parameter.vect(:,words); 23 | else 24 | x_t=beamWordStates_h{ll-1}; 25 | end 26 | h_t_1=beamWordStates_h{ll}; 27 | c_t_1=beamWordStates_c{ll}; 28 | [A,beamWordStates_h{ll},beamWordStates_c{ll}]=lstmUnit(parameter.Word_T{ll},parameter,x_t,[],h_t_1,c_t_1,ll,-1,0); 29 | end 30 | end_sen_index=find(words==parameter.sen_stop); 31 | if length(end_sen_index)~=0 32 | for ll=1:parameter.layer_num 33 | if ll==1 34 | x_t=beamWordStates_h{ll}(:,end_sen_index); 35 | m_t=Attention_Decode(batch,result,beamWordStates_h{parameter.layer_num}(:,end_sen_index),parameter,end_sen_index); 36 | else 37 | x_t=beamSenStates_h{ll-1}(:,end_sen_index); 38 | m_t=[]; 39 | end 40 | h_t_1=beamSenStates_h{ll}(:,end_sen_index); 41 | c_t_1=beamSenStates_c{ll}(:,end_sen_index); 42 | [A,beamSenStates_h{ll}(:,end_sen_index),beamSenStates_c{ll}(:,end_sen_index)]=lstmUnit(parameter.Sen_T{ll},parameter,x_t,m_t,h_t_1,c_t_1,ll,-1,0); 43 | beamWordStates_h{ll}(:,end_sen_index)=beamSenStates_h{ll}(:,end_sen_index); 44 | beamWordStates_c{ll}(:,end_sen_index)=beamSenStates_c{ll}(:,end_sen_index); 45 | end 46 | end 47 | all_next_words=BeamStep(parameter,beamWordStates_h{parameter.layer_num},0); 48 | beamHistory(:,position+1)=all_next_words; 49 | end 50 | for senId=1:N 51 | vector=beamHistory(senId,:); 52 | vector=vector(1:floor(1.5*batch.num_word_in_doc(senId))); 53 | batch.num_word_in_doc(senId) 54 | stop_sign=find(vector==parameter.doc_stop); 55 | if 
length(stop_sign)==0 56 | dlmwrite(filename,vector,'delimiter',' ','-append'); 57 | else 58 | dlmwrite(filename,vector(1:stop_sign-1),'delimiter',' ','-append'); 59 | end 60 | end 61 | end 62 | 63 | function[select_words]=BeamStep(parameter,h_t,isFirst) 64 | if isFirst==1 scores=parameter.soft_W(1:end-2,:)*h_t; 65 | else scores=parameter.soft_W*h_t; 66 | end 67 | mx = max(scores); 68 | scores = bsxfun(@minus, scores, mx); 69 | logP=bsxfun(@minus, scores, log(sum(exp(scores)))); 70 | [sortedLogP, sortedWords]=sort(logP, 'descend'); 71 | select_words=sortedWords(1, :); 72 | 73 | probs_=exp(scores); 74 | norms = sum(probs_, 1); 75 | probs=bsxfun(@rdivide, probs_, norms); 76 | end 77 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/hier_LSTM_Att.m: -------------------------------------------------------------------------------- 1 | function[]=hier_LSTM_Att() 2 | clear; 3 | addpath('../misc'); 4 | n= gpuDeviceCount; 5 | parameter.isGPU = 0; 6 | 7 | if n>0 % GPU exists 8 | parameter.isGPU = 1; 9 | gpuDevice(1); 10 | else 11 | disp('no gpu ! ! ! ! !'); 12 | end 13 | 14 | parameter.dimension=1000; 15 | parameter.alpha=0.1; %learning rate 16 | parameter.layer_num=4; %number of layers 17 | parameter.hidden=1000; 18 | parameter.lstm_out_tanh=0; 19 | parameter.Initial=0.08; 20 | parameter.dropout=0; 21 | params.lstm_out_tanh=0; 22 | parameter.isTraining=1; 23 | parameter.CheckGrad=0; 24 | parameter.PreTrainEmb=0; 25 | %whether to use pre-trained embeddings 26 | parameter.update_embedding=1; 27 | %whether to update word embeddings 28 | parameter.batch_size=16; 29 | parameter.Source_Target_Same_Language=1; 30 | %whether source and target are in the same language. For the autoencoder task, they are. 31 | parameter.maxGradNorm=1; 32 | parameter.clip=0; 33 | 34 | parameter.lr=5; 35 | parameter.read=0; 36 | 37 | if parameter.Source_Target_Same_Language==1 38 | parameter.Vocab=25002; 39 | %vocabulary size plus sentence-end and document-end 40 | parameter.sen_stop=parameter.Vocab-1; %sentence-end 41 | parameter.doc_stop=parameter.Vocab; %document-end 42 | else 43 | parameter.SourceVocab=20; 44 | parameter.TargetVocab=20; 45 | parameter.stop=parameter.TargetVocab; 46 | parameter.Vocab=parameter.SourceVocab+parameter.TargetVocab; 47 | end 48 | 49 | if parameter.CheckGrad==1&&parameter.dropout~=0 50 | parameter.drop_left=randSimpleMatrix([parameter.hidden,1])<1-parameter.dropout; 51 | end 52 | %alpha: learning rate for minibatch 53 | 54 | parameter.nonlinear_gate_f = @sigmoid; 55 | parameter.nonlinear_gate_f_prime = @sigmoidPrime; 56 | parameter.nonlinear_f = @tanh; 57 | parameter.nonlinear_f_prime = @tanhPrime; 58 | 59 | 60 | train_source_file='data/train_source_permute_segment.txt'; 61 | train_target_file='data/train_target_permute_segment.txt'; 62 | 63 | if 1==0 64 | disp('small data testing') 65 | train_source_file='small_data/train_source_permute_segment_small.txt'; 66 | train_target_file='small_data/train_target_permute_segment_small.txt'; 67 | end 68 | 69 | 70 | if parameter.read==1 71 | disp('read'); 72 | parameter=ReadParameter(parameter); 73 | % read parameters 74 | else [parameter]=Initial(parameter); 75 | end 76 | 77 | pre_parameter=parameter; 78 | 79 | iter=0; 80 | 81 | disp('begin') 82 | 83 | while 1 84 | iter=iter+1 85 | fd_train_source=fopen(train_source_file); 86 | fd_train_target=fopen(train_target_file); 87 | sum_cost=0; 88 | sum_num=0; 89 | batch_n=0; 90 | while 1 91 | batch_n=batch_n+1; 92 | 
[current_batch,End]=ReadData(fd_train_source,fd_train_target,parameter); 93 | % read one batch 94 | if End~=1 || (End==1&& length(current_batch.source_smallBatch)~=0) 95 | result=Forward(current_batch,parameter,1); 96 | % Forward 97 | [batch_cost,grad]=softmax(result,current_batch,parameter); 98 | if (isnan(batch_cost)||isinf(batch_cost)) &&End~=1 99 | % if batch_cost is nan, load ealier parameters and skip latest batches 100 | if parameter.clip==1 101 | fprintf('die !! Hopeless!!\n'); 102 | disp('read done'); 103 | parameter=pre_parameter; 104 | disp(batch_n) 105 | else parameter.clip=1; 106 | end 107 | if End==1 break; 108 | else continue; 109 | end 110 | end 111 | if parameter.isTraining==1 112 | grad=Backward(current_batch,result,grad,parameter); 113 | % backward propagation 114 | clear result; 115 | if 1==0 116 | check(grad,current_batch,parameter) 117 | % check gradient 118 | end 119 | [parameter]=update_parameter(parameter,grad); 120 | % update parameter 121 | clear lstm; 122 | clear c; 123 | end 124 | end 125 | if End==1 126 | fclose(fd_train_source); 127 | fclose(fd_train_target); 128 | break; 129 | end 130 | 131 | if mod(batch_n,100)==0 132 | pre_parameter=parameter; 133 | % store parameters from last 100 batches in case of gradient explosion 134 | end 135 | end 136 | 137 | SaveParameter(parameter,iter); 138 | end 139 | end 140 | function[]=check(grad,current_batch,parameter) 141 | check_Attention_U(grad.Attention_U(1),1,current_batch,parameter); 142 | check_Attention_W(grad.Attention_W(1,1),1,1,current_batch,parameter); 143 | if 1==0 144 | check_vect(grad.vect(1,1),1,1,current_batch,parameter); 145 | check_vect(grad.vect(1,2),1,2,current_batch,parameter); 146 | check_vect(grad.vect(1,3),1,3,current_batch,parameter); 147 | check_vect(grad.vect(1,4),1,4,current_batch,parameter); 148 | check_vect(grad.vect(1,5),1,5,current_batch,parameter); 149 | end 150 | if 1==0 151 | disp('word_T') 152 | check_word_T(grad.Word_T{1}(1,1),1,1,1,current_batch,parameter); 153 | check_word_T(grad.Word_T{2}(1,1),2,1,1,current_batch,parameter); 154 | check_word_T(grad.Word_T{3}(1,1),3,1,1,current_batch,parameter); 155 | check_word_T(grad.Word_T{4}(1,1),4,1,1,current_batch,parameter); 156 | end 157 | if 1==1 158 | disp('sen_T') 159 | check_sen_T(grad.Sen_T{1}(1,1),1,1,1,current_batch,parameter); 160 | check_sen_T(grad.Sen_T{2}(1,1),2,1,1,current_batch,parameter); 161 | check_sen_T(grad.Sen_T{3}(1,1),3,1,1,current_batch,parameter); 162 | check_sen_T(grad.Sen_T{4}(1,1),4,1,1,current_batch,parameter); 163 | end 164 | if 1==1 165 | disp('sen_S') 166 | check_sen_S(grad.Sen_S{1}(1,1),1,1,1,current_batch,parameter); 167 | check_sen_S(grad.Sen_S{1}(1,2),1,1,2,current_batch,parameter); 168 | check_sen_S(grad.Sen_S{1}(1,7),1,1,7,current_batch,parameter); 169 | check_sen_S(grad.Sen_S{1}(1,8),1,1,8,current_batch,parameter); 170 | end 171 | if 1==1 172 | disp('word_S') 173 | check_word_S(grad.Word_S{1}(1,1),1,1,1,current_batch,parameter); 174 | check_word_S(grad.Word_S{1}(1,2),1,1,2,current_batch,parameter); 175 | check_word_S(grad.Word_S{2}(1,1),2,1,1,current_batch,parameter); 176 | check_word_S(grad.Word_S{2}(1,2),2,1,2,current_batch,parameter); 177 | check_word_S(grad.Word_S{3}(1,1),3,1,1,current_batch,parameter); 178 | check_word_S(grad.Word_S{3}(1,2),3,1,2,current_batch,parameter); 179 | end 180 | end 181 | 182 | function[parameter]=ReadParameter(parameter) 183 | % read parameters 184 | for ll=1:parameter.layer_num 185 | W_file=strcat('save_parameter/_Word_S',num2str(ll)); 186 | 
parameter.Word_S{ll}=gpuArray(load(W_file)); 187 | W_file=strcat('save_parameter/_Word_T',num2str(ll)); 188 | parameter.Word_T{ll}=gpuArray(load(W_file)); 189 | W_file=strcat('save_parameter/_Sen_S',num2str(ll)); 190 | parameter.Sen_S{ll}=gpuArray(load(W_file)); 191 | W_file=strcat('save_parameter/_Sen_T',num2str(ll)); 192 | parameter.Sen_T{ll}=gpuArray(load(W_file)); 193 | end 194 | parameter.Attention_W=gpuArray(load('save_parameter/_Attention_W')); 195 | parameter.Attention_U=gpuArray(load('save_parameter/_Attention_U')); 196 | parameter.vect=gpuArray(load('save_parameter/_v')); 197 | parameter.soft_W=gpuArray(load('save_parameter/_soft_W')); 198 | end 199 | 200 | function SaveParameter(parameter,iter) 201 | % save parameters 202 | if iter~=-1 203 | all=strcat('save_parameter/',int2str(iter)); 204 | all=strcat(all,'_'); 205 | all=strcat(all,num2str(parameter.alpha)); 206 | else 207 | all='save_parameter/'; 208 | end 209 | for ll=1:parameter.layer_num 210 | W_file=strcat(all,'_Word_T'); 211 | W_file=strcat(W_file,num2str(ll)); 212 | dlmwrite(W_file,parameter.Word_T{ll}); 213 | end 214 | for ll=1:parameter.layer_num 215 | W_file=strcat(all,'_Word_S'); 216 | W_file=strcat(W_file,num2str(ll)); 217 | dlmwrite(W_file,parameter.Word_S{ll}); 218 | end 219 | for ll=1:parameter.layer_num 220 | W_file=strcat(all,'_Sen_T'); 221 | W_file=strcat(W_file,num2str(ll)); 222 | dlmwrite(W_file,parameter.Sen_T{ll}); 223 | end 224 | for ll=1:parameter.layer_num 225 | W_file=strcat(all,'_Sen_S'); 226 | W_file=strcat(W_file,num2str(ll)); 227 | dlmwrite(W_file,parameter.Sen_S{ll}); 228 | end 229 | W_file=strcat(all,'_Attention_W'); 230 | dlmwrite(W_file,parameter.Attention_W) 231 | U_file=strcat(all,'_Attention_U'); 232 | dlmwrite(U_file,parameter.Attention_U) 233 | 234 | v_file=strcat(all,'_v'); 235 | dlmwrite(v_file,parameter.vect); 236 | soft_W_file=strcat(all,'_soft_W'); 237 | dlmwrite(soft_W_file,parameter.soft_W); 238 | end 239 | 240 | function[parameter]=update_parameter(parameter,grad) 241 | % update parameters 242 | norm=computeGradNorm(grad,parameter); 243 | % compute norm 244 | if norm>parameter.maxGradNorm 245 | lr=parameter.alpha*parameter.maxGradNorm/norm; 246 | else lr=parameter.alpha; 247 | end 248 | for ll=1:parameter.layer_num 249 | parameter.Word_S{ll}=parameter.Word_S{ll}-lr*grad.Word_S{ll}; 250 | parameter.Word_T{ll}=parameter.Word_T{ll}-lr*grad.Word_T{ll}; 251 | parameter.Sen_S{ll}=parameter.Sen_S{ll}-lr*grad.Sen_S{ll}; 252 | parameter.Sen_T{ll}=parameter.Sen_T{ll}-lr*grad.Sen_T{ll}; 253 | end 254 | parameter.Attention_U=parameter.Attention_U-lr*grad.Attention_U; 255 | parameter.Attention_W=parameter.Attention_W-lr*grad.Attention_W; 256 | parameter.soft_W=parameter.soft_W-lr*grad.soft_W; 257 | parameter.vect=parameter.vect-lr*grad.vect; 258 | clear grad; 259 | end 260 | 261 | function[parameter]=Initial(parameter) 262 | %random initialization 263 | m=parameter.Initial; 264 | for i=1:parameter.layer_num 265 | if i==1 266 | parameter.Word_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 267 | parameter.Word_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 268 | else 269 | parameter.Word_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 270 | parameter.Word_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 271 | end 272 | parameter.Sen_S{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,parameter.dimension+parameter.hidden]); 273 | if i==1 
274 | parameter.Sen_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,3*parameter.hidden]); 275 | else 276 | parameter.Sen_T{i}=randomMatrix(parameter.Initial,[4*parameter.hidden,2*parameter.hidden]); 277 | end 278 | 279 | end 280 | parameter.vect=randomMatrix(parameter.Initial,[parameter.dimension,parameter.Vocab]); 281 | if parameter.Source_Target_Same_Language==1 282 | parameter.soft_W=randomMatrix(parameter.Initial,[parameter.Vocab,parameter.hidden]); 283 | else 284 | parameter.soft_W=randomMatrix(parameter.Initial,[parameter.TargetVocab,parameter.hidden]); 285 | end 286 | parameter.Attention_W=randomMatrix(parameter.Initial,[parameter.hidden,parameter.hidden*2]); 287 | parameter.Attention_U=randomMatrix(parameter.Initial,[parameter.hidden,1]); 288 | end 289 | 290 | function check_Attention_W(value1,i,j,current_batch,parameter) 291 | e=0.001; 292 | parameter.Attention_W(i,j)=parameter.Attention_W(i,j)+e; 293 | result=Forward(current_batch,parameter,1); 294 | [cost1,grad]=softmax(result,current_batch,parameter); 295 | parameter.Attention_W(i,j)=parameter.Attention_W(i,j)-2*e; 296 | result=Forward(current_batch,parameter,1); 297 | [cost2,grad]=softmax(result,current_batch,parameter); 298 | parameter.Attention_W(i,j)=parameter.Attention_W(i,j)+e; 299 | value2=(cost1-cost2)/(2*e); 300 | [value1,value2] 301 | value1-value2 302 | end 303 | 304 | function check_Attention_U(value1,i,current_batch,parameter) 305 | e=0.001; 306 | parameter.Attention_U(i)=parameter.Attention_U(i)+e; 307 | result=Forward(current_batch,parameter,1); 308 | [cost1,grad]=softmax(result,current_batch,parameter); 309 | parameter.Attention_U(i)=parameter.Attention_U(i)-2*e; 310 | result=Forward(current_batch,parameter,1); 311 | [cost2,grad]=softmax(result,current_batch,parameter); 312 | parameter.Attention_U(i)=parameter.Attention_U(i)+e; 313 | value2=(cost1-cost2)/(2*e); 314 | [value1,value2] 315 | value1-value2 316 | end 317 | 318 | function check_vect(value1,i,j,current_batch,parameter) 319 | e=0.001; 320 | parameter.vect(i,j)=parameter.vect(i,j)+e; 321 | result=Forward(current_batch,parameter,1); 322 | [cost1,grad]=softmax(result,current_batch,parameter); 323 | parameter.vect(i,j)=parameter.vect(i,j)-2*e; 324 | result=Forward(current_batch,parameter,1); 325 | [cost2,grad]=softmax(result,current_batch,parameter); 326 | parameter.vect(i,j)=parameter.vect(i,j)+e; 327 | value2=(cost1-cost2)/(2*e); 328 | [value1,value2] 329 | value1-value2 330 | end 331 | 332 | function check_soft_W(value1,i,j,current_batch,parameter) 333 | e=0.001; 334 | parameter.soft_W(i,j)=parameter.soft_W(i,j)+e; 335 | result=Forward(current_batch,parameter,1); 336 | [cost1,grad]=softmax(result,current_batch,parameter); 337 | parameter.soft_W(i,j)=parameter.soft_W(i,j)-2*e; 338 | result=Forward(current_batch,parameter,1); 339 | [cost2,grad]=softmax(result,current_batch,parameter); 340 | parameter.soft_W(i,j)=parameter.soft_W(i,j)+e; 341 | value2=(cost1-cost2)/(2*e); 342 | value1 343 | value2 344 | value1-value2 345 | end 346 | 347 | 348 | function check_word_T(value1,ll,i,j,current_batch,parameter) 349 | e=1e-3; 350 | parameter.Word_T{ll}(i,j)=parameter.Word_T{ll}(i,j)+e; 351 | result=Forward(current_batch,parameter,1); 352 | [cost1,grad]=softmax(result,current_batch,parameter); 353 | parameter.Word_T{ll}(i,j)=parameter.Word_T{ll}(i,j)-2*e; 354 | result=Forward(current_batch,parameter,1); 355 | [cost2,grad]=softmax(result,current_batch,parameter); 356 | parameter.Word_T{ll}(i,j)=parameter.Word_T{ll}(i,j)+e; 357 | value2=(cost1-cost2)/(2*e); 358 
| [value1,value2] 359 | value1-value2 360 | end 361 | 362 | function check_word_S(value1,ll,i,j,current_batch,parameter) 363 | e=0.001; 364 | parameter.Word_S{ll}(i,j)=parameter.Word_S{ll}(i,j)+e; 365 | result=Forward(current_batch,parameter,1); 366 | [cost1,grad]=softmax(result,current_batch,parameter); 367 | parameter.Word_S{ll}(i,j)=parameter.Word_S{ll}(i,j)-2*e; 368 | result=Forward(current_batch,parameter,1); 369 | [cost2,grad]=softmax(result,current_batch,parameter); 370 | parameter.Word_S{ll}(i,j)=parameter.Word_S{ll}(i,j)+e; 371 | value2=(cost1-cost2)/(2*e); 372 | [value1,value2] 373 | value1-value2 374 | end 375 | 376 | 377 | function check_sen_S(value1,ll,i,j,current_batch,parameter) 378 | e=0.001; 379 | parameter.Sen_S{ll}(i,j)=parameter.Sen_S{ll}(i,j)+e; 380 | result=Forward(current_batch,parameter,1); 381 | [cost1,grad]=softmax(result,current_batch,parameter); 382 | parameter.Sen_S{ll}(i,j)=parameter.Sen_S{ll}(i,j)-2*e; 383 | result=Forward(current_batch,parameter,1); 384 | [cost2,grad]=softmax(result,current_batch,parameter); 385 | parameter.Sen_S{ll}(i,j)=parameter.Sen_S{ll}(i,j)+e; 386 | value2=(cost1-cost2)/(2*e); 387 | [value1,value2] 388 | value1-value2 389 | end 390 | 391 | function check_sen_T(value1,ll,i,j,current_batch,parameter) 392 | e=0.001; 393 | parameter.Sen_T{ll}(i,j)=parameter.Sen_T{ll}(i,j)+e; 394 | result=Forward(current_batch,parameter,1); 395 | [cost1,grad]=softmax(result,current_batch,parameter); 396 | parameter.Sen_T{ll}(i,j)=parameter.Sen_T{ll}(i,j)-2*e; 397 | result=Forward(current_batch,parameter,1); 398 | [cost2,grad]=softmax(result,current_batch,parameter); 399 | parameter.Sen_T{ll}(i,j)=parameter.Sen_T{ll}(i,j)+e; 400 | value2=(cost1-cost2)/(2*e); 401 | value1 402 | value2 403 | value1-value2 404 | end 405 | 406 | 407 | 408 | function[norm]=computeGradNorm(grad,parameter) 409 | % compute gradient norm 410 | norm=0; 411 | for ii=1:parameter.layer_num 412 | norm=norm+double(sum(grad.Word_S{ii}(:).^2)); 413 | norm=norm+double(sum(grad.Word_T{ii}(:).^2)); 414 | norm=norm+double(sum(grad.Sen_S{ii}(:).^2)); 415 | norm=norm+double(sum(grad.Sen_T{ii}(:).^2)); 416 | end 417 | norm=sqrt(norm); 418 | end 419 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/lstmUnit.m: -------------------------------------------------------------------------------- 1 | function[lstm, h_t, c_t]=lstmUnit(W,parameter,x_t,m_t,h_t_1, c_t_1, ll, t,isTraining) 2 | if parameter.dropout~=0&&isTraining==1 3 | if parameter.CheckGrad==1 4 | drop_left=repmat(parameter.drop_left,1,size(x_t,2)); 5 | else 6 | drop_left=randSimpleMatrix(size(x_t))<1-parameter.dropout; 7 | end 8 | x_t=x_t.*drop_left; 9 | end 10 | input=[x_t;m_t;h_t_1]; 11 | ifoa_linear = W*input; 12 | ifo_gate=parameter.nonlinear_gate_f(ifoa_linear(1:3*parameter.hidden,:)); 13 | i_gate = ifo_gate(1:parameter.hidden, :); 14 | f_gate = ifo_gate(parameter.hidden+1:2*parameter.hidden,:); 15 | o_gate =ifo_gate(parameter.hidden*2+1:3*parameter.hidden,:); 16 | a_signal = parameter.nonlinear_f(ifoa_linear(3*parameter.hidden+1:4*parameter.hidden,:)); 17 | c_t=f_gate.*c_t_1 + i_gate.*a_signal; 18 | if parameter.lstm_out_tanh 19 | f_c_t = parameter.nonlinear_f(c_t); 20 | h_t = o_gate.*f_c_t; 21 | else 22 | h_t = o_gate.*c_t; 23 | end 24 | %c_t(c_t>parameter.clipForward)=parameter.clipForward; 25 | %c_t(c_t<-parameter.clipForward)=-parameter.clipForward; 26 | %h_t(h_t>parameter.clipForward)=parameter.clipForward; 27 | %h_t(h_t<-parameter.clipForward)=-parameter.clipForward; 28 | 29 | 
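% With the default parameter.lstm_out_tanh=0 set in hier_LSTM_Att.m, the hidden state
% computed above is h_t = o_gate.*c_t, i.e. the output gate multiplies the raw cell
% state; setting lstm_out_tanh=1 recovers the standard form h_t = o_gate.*tanh(c_t).
% The lstm struct assembled below caches the gate activations needed by the backward pass.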
lstm.input = input; 30 | lstm.i_gate = i_gate; 31 | lstm.f_gate = f_gate; 32 | lstm.o_gate = o_gate; 33 | lstm.a_signal = a_signal; 34 | if parameter.lstm_out_tanh 35 | lstm.f_c_t = f_c_t; 36 | else 37 | lstm.c_t = c_t; 38 | end 39 | if isTraining==1&&parameter.dropout~=0 40 | lstm.drop_left=drop_left; 41 | end 42 | end 43 | 44 | function [clippedValue] = clipForward(x) 45 | if x>50 clippedValue = single(50); 46 | elseif x<-50 clippedValue = single(-50); 47 | else clippedValue =single(x); 48 | end 49 | end 50 | 51 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/softmax.m: -------------------------------------------------------------------------------- 1 | function[total_cost,grad]=softmax(result,docbatch,parameter) 2 | total_cost=0; 3 | grad.soft_W=zeroMatrix(size(parameter.soft_W)); 4 | step_size=1; 5 | num_word=0; 6 | N=size(docbatch.target_sen_matrix,1); 7 | zeroState=zeroMatrix([parameter.hidden,N]); 8 | for ll=1:parameter.layer_num 9 | grad.source_h{ll,1}=zeroState; 10 | grad.source_c{ll,1}=zeroState; 11 | for sen_tt=1:length(result.Target_sen)-1 12 | grad.target_sen_h{ll,sen_tt}=zeroState; 13 | grad.target_sen_c{ll,sen_tt}=zeroState; 14 | end 15 | end 16 | for sen_tt=1:length(result.Target_sen) 17 | Word_List=docbatch.target_word{sen_tt}.Word; 18 | Word_Mask=docbatch.target_word{sen_tt}.Mask; 19 | num_word=num_word+length(find(Word_Mask==1)); 20 | N=size(Word_List,1); 21 | T=size(Word_List,2); 22 | target_sen=result.Target_sen{sen_tt}; 23 | N_examples=size(Word_List,1)*size(Word_List,2); 24 | predict_Words=reshape(Word_List,1,N_examples); 25 | mask=reshape(Word_Mask,1,N_examples); 26 | h_t=[]; 27 | if sen_tt==1 28 | h_t=[h_t,result.source_sen{parameter.layer_num,1}]; 29 | else 30 | dim=size(result.h_t_target_sen); 31 | h_t=[h_t,result.h_t_target_sen{dim(1),sen_tt-1}]; 32 | end 33 | dim=size(target_sen.h_t_target_word); 34 | h_t=[h_t,[target_sen.h_t_target_word{parameter.layer_num,1:dim(2)-1}]]; 35 | [cost,grad_softmax_h]=batchSoftmax(h_t,mask,predict_Words,parameter); 36 | total_cost=total_cost+cost; 37 | grad.soft_W=grad.soft_W+grad_softmax_h.soft_W; 38 | if sen_tt==1 39 | grad.source_h{parameter.layer_num,1}=grad_softmax_h.h(:,1:N); 40 | else 41 | grad.target_sen_h{parameter.layer_num,sen_tt-1}=grad_softmax_h.h(:,1:N); 42 | end 43 | for i=1:T-1 44 | grad.ht{sen_tt}{1,i}=grad_softmax_h.h(:,N*i+1:N*(i+1)); 45 | end 46 | end 47 | total_cost=total_cost/N; 48 | grad.soft_W=grad.soft_W/N; 49 | clear predict_Words; clear mask; 50 | clear grad_softmax_h; 51 | end 52 | 53 | function[cost,softmax_grad]=batchSoftmax(h_t,mask,predict_Words,parameter) 54 | unmaskedIds=find(mask==1); 55 | scores=parameter.soft_W*h_t; 56 | mx = max(scores,[],1); 57 | scores=bsxfun(@minus,scores,mx); 58 | scores=exp(scores); 59 | norms = sum(scores, 1); 60 | if length(find(mask==0))==0 61 | scores=bsxfun(@rdivide, scores, norms); 62 | else 63 | scores=bsxfun(@times,scores, mask./norms); 64 | end 65 | scoreIndices = sub2ind(size(scores),predict_Words(unmaskedIds),unmaskedIds); 66 | cost=sum(-log(scores(scoreIndices))); 67 | scores(scoreIndices) =scores(scoreIndices) - 1; 68 | softmax_grad.soft_W=scores*h_t'; %(N_word*examples)*(examples*dimension)=N_word*dimension; 69 | softmax_grad.h=(scores'*parameter.soft_W)';%(dimension*N_word)*(N_word*examples)=dimension*examples 70 | clear scores; 71 | clear norms; 72 | end 73 | -------------------------------------------------------------------------------- /hier_LSTM_Attention/test.m:
-------------------------------------------------------------------------------- 1 | function[]=test() 2 | clear; 3 | 4 | addpath('../misc'); 5 | n= gpuDeviceCount; 6 | parameter.isGPU = 0; 7 | 8 | if n>0 % GPU exists 9 | parameter.isGPU = 1; 10 | gpuDevice(2); 11 | else 12 | disp('no gpu ! ! ! ! !'); 13 | end 14 | 15 | parameter.dimension=100; 16 | parameter.alpha=0.1; %learning rate 17 | parameter.layer_num=4; %number of layers 18 | parameter.hidden=100; 19 | parameter.lstm_out_tanh=0; 20 | parameter.Initial=0.1; 21 | parameter.dropout=0.2; %drop-out rate 22 | params.lstm_out_tanh=0; 23 | parameter.isTraining=1; 24 | parameter.CheckGrad=0; %whether to check gradients or not. 25 | parameter.PreTrainEmb=0; 26 | %whether to use pre-trained embeddings 27 | parameter.update_embedding=1; 28 | %whether to update word embeddings 29 | parameter.batch_size=16; 30 | parameter.Source_Target_Same_Language=1; 31 | %whether source and target are in the same language. 32 | parameter.maxGradNorm=1; 33 | parameter.clip=0; 34 | 35 | parameter.lr=5; 36 | parameter.read=0; 37 | 38 | if parameter.Source_Target_Same_Language==1 39 | parameter.Vocab=25002; 40 | %vocabulary size plus sentence-end and document-end 41 | parameter.sen_stop=parameter.Vocab-1; %sentence-end 42 | parameter.doc_stop=parameter.Vocab; %document-end 43 | else 44 | parameter.SourceVocab=20; 45 | parameter.TargetVocab=20; 46 | parameter.stop=parameter.TargetVocab; 47 | parameter.Vocab=parameter.SourceVocab+parameter.TargetVocab; 48 | end 49 | 50 | if parameter.CheckGrad==1&&parameter.dropout~=0 51 | parameter.drop_left=randSimpleMatrix([parameter.hidden,1])<1-parameter.dropout; 52 | end 53 | %alpha: learning rate for minibatch 54 | 55 | parameter.nonlinear_gate_f = @sigmoid; 56 | parameter.nonlinear_gate_f_prime = @sigmoidPrime; 57 | parameter.nonlinear_f = @tanh; 58 | parameter.nonlinear_f_prime = @tanhPrime; 59 | [parameter]=ReadParameter(parameter); 60 | 61 | test_source_file='../data/test_segment.txt'; 62 | 63 | while 1 64 | fd_test_source=fopen(test_source_file); 65 | [current_batch,End]=ReadData(fd_test_source,[],parameter,0); 66 | if End~=1 || (End==1&& length(current_batch.source_smallBatch)~=0) 67 | decode_greedy(parameter,current_batch,'a.txt'); 68 | end 69 | if End==1 70 | break; 71 | end 72 | end 73 | end 74 | 75 | function[parameter]=ReadParameter(parameter) 76 | % read parameters 77 | for ll=1:parameter.layer_num 78 | W_file=strcat('save_parameter/_Word_S',num2str(ll)); 79 | parameter.Word_S{ll}=gpuArray(load(W_file)); 80 | W_file=strcat('save_parameter/_Word_T',num2str(ll)); 81 | parameter.Word_T{ll}=gpuArray(load(W_file)); 82 | W_file=strcat('save_parameter/_Sen_S',num2str(ll)); 83 | parameter.Sen_S{ll}=gpuArray(load(W_file)); 84 | W_file=strcat('save_parameter/_Sen_T',num2str(ll)); 85 | parameter.Sen_T{ll}=gpuArray(load(W_file)); 86 | end 87 | parameter.Attention_W=gpuArray(load('save_parameter/_Attention_W')); 88 | parameter.Attention_U=gpuArray(load('save_parameter/_Attention_U')); 89 | parameter.vect=gpuArray(load('save_parameter/_v')); 90 | parameter.soft_W=gpuArray(load('save_parameter/_soft_W')); 91 | end 92 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- (static HTML landing page, "Computing and Communication | University IT"; markup not preserved in this dump)
-------------------------------------------------------------------------------- /misc/aggregateMatrix.m: -------------------------------------------------------------------------------- 1 | function [accumX, uniqIndices] = aggregateMatrix(X, indices) 2 | %%% 3 | % Fast way of aggregating column vectors based on indices which contain repetitive values. 4 | % X = [101:105; 11:15]; 5 | % indices = [1 4 4 2 2] 6 | % aggregateMatrix(X, indices) returns: 7 | % accumX = 8 | % 101 209 205 9 | % 11 29 25 10 | % uniqIndices = [1 2 4] 11 | % 12 | % Thang Luong @ 2015, 13 | %%% 14 | 15 | [uniqIndices, ~, J] = unique(indices); 16 | numUniqIndices = length(uniqIndices); 17 | numEmbGrads = length(indices); 18 | 19 | if numEmbGrads==1 20 | accumX = X; 21 | else 22 | sparseMatrix = zeros(numEmbGrads, numUniqIndices,'double', 'gpuArray'); 23 | sparseIndices = sub2ind([numEmbGrads, numUniqIndices], 1:numEmbGrads, J'); 24 | sparseMatrix(sparseIndices) = ones(numEmbGrads, 1); 25 | accumX = X*sparseMatrix; 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /misc/oneMatrix.m: -------------------------------------------------------------------------------- 1 | function [result] = oneMatrix(size) 2 | result =ones(size, 'double', 'gpuArray'); 3 | end 4 | -------------------------------------------------------------------------------- /misc/randSimpleMatrix.m: -------------------------------------------------------------------------------- 1 | function [result]=randSimpleMatrix(size) 2 | result=rand(size,'double', 'gpuArray'); 3 | end 4 | -------------------------------------------------------------------------------- /misc/randomMatrix.m: -------------------------------------------------------------------------------- 1 | function [result] = randomMatrix(rangeSize, size) 2 | result = 2*rangeSize * (rand(size,'double', 'gpuArray') - 0.5); 3 | end 4 | -------------------------------------------------------------------------------- /misc/sigmoid.m: -------------------------------------------------------------------------------- 1 | function y = sigmoid(x) 2 | y = 1 ./ (1 + exp(-x)); -------------------------------------------------------------------------------- /misc/sigmoidPrime.m: -------------------------------------------------------------------------------- 1 | function [result] = sigmoidPrime(z) 2 | result = z.*(1-z); 3 | -------------------------------------------------------------------------------- /misc/smallMatrix.m: -------------------------------------------------------------------------------- 1 | function [result] = smallMatrix(size) 2 | result =10^(-50)*ones(size, 'double', 'gpuArray'); 3 | end 4 | -------------------------------------------------------------------------------- /misc/tanhPrime.m: -------------------------------------------------------------------------------- 1 | function [result] = tanhPrime(z) 2 | result = (1-z.^2); 3 | -------------------------------------------------------------------------------- /misc/zeroMatrix.m: -------------------------------------------------------------------------------- 1 | function [result] = zeroMatrix(size) 2 | result = zeros(size, 'double', 'gpuArray'); 3 | end 4 | --------------------------------------------------------------------------------
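The `misc/` helpers are thin wrappers around `gpuArray` allocation plus the activation derivatives used throughout the code. A minimal usage sketch follows; it is hypothetical (not part of the repository) and assumes a CUDA-capable GPU, since every helper allocates on the device.

```matlab
% Hypothetical usage of the misc helpers; requires a GPU because they allocate gpuArray.
addpath('misc');
W = randomMatrix(0.08, [400, 200]);        % uniform in [-0.08, 0.08], the range used by Initial()
Z = zeroMatrix([100, 16]);                 % zero hidden state for a batch of 16
X = gpuArray([101:105; 11:15]);            % toy embedding-gradient columns
[accumX, uniqIndices] = aggregateMatrix(X, [1 4 4 2 2]);
% accumX sums the columns of X that share an index; uniqIndices = [1 2 4],
% matching the worked example in the aggregateMatrix.m header comment above.
```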