├── AutoExtend
│   ├── AutoExtend.m
│   ├── columnNormalize.m
│   ├── costFunc.m
│   ├── costFuncColumnNorm.m
│   ├── costFuncLexeme.m
│   ├── getVectors.m
│   ├── gradient.m
│   ├── gradientChecking.m
│   ├── gradientColumnNorm.m
│   ├── gradientLexeme.m
│   ├── learnAffineMapMatrix.m
│   ├── learnLinearMapMatrix.m
│   ├── learnTranslationMatrix.m
│   ├── loadBinaryFile.m
│   ├── loadSynsetFile.m
│   ├── loadTxtFile.m
│   └── writeVectors.m
├── IMS Features
│   ├── CSynsetCosineFeatureExtractor.java
│   ├── CSynsetProductFeatureExtractor.java
│   └── CSynsetRawFeatureExtractor.java
├── LICENSE
└── WordNetExtractor
    ├── Shared.java
    └── WordNetExtractor.java

/AutoExtend/AutoExtend.m:
--------------------------------------------------------------------------------
1 | function [] = AutoExtend(varargin)
2 | 
3 | if (nargin == 8)
4 |     folder = varargin{1};
5 |     normalizeWeights = varargin{2};
6 |     sWeight = varargin{3};
7 |     lWeight = varargin{4};
8 |     rWeight = varargin{5};
9 |     nWeight = varargin{6};
10 |     experiment = varargin{7};
11 |     RelationFiles = varargin{8};
12 | 
13 |     settings = [true false false];
14 |     weights = [sWeight lWeight rWeight nWeight];
15 | 
16 | elseif (nargin == 10)
17 |     folder = varargin{1};
18 |     normalizeWeights = varargin{2};
19 |     sWeight = varargin{3};
20 |     lWeight = varargin{4};
21 |     rWeight = varargin{5};
22 |     startNormalizedED = varargin{6};
23 |     normWhenPossibleED = varargin{7};
24 |     endWhenNotED = varargin{8};
25 |     experiment = varargin{9};
26 |     RelationFiles = varargin{10};
27 | 
28 |     settings = [startNormalizedED normWhenPossibleED endWhenNotED];
29 |     weights = [sWeight lWeight rWeight 0];
30 | 
31 | else
32 |     folder = '[...]';
33 |     normalizeWeights = true;
34 |     sWeight = 0.20;
35 |     lWeight = 0.20;
36 |     rWeight = 0.60;
37 |     startNormalizedED = false;
38 |     normWhenPossibleED = false;
39 |     endWhenNotED = false;
40 |     experiment = 'naive';
41 |     RelationFiles = cell(4,1);
42 |     RelationFiles{1} = 'hypernym.txt';
43 |     RelationFiles{2} = 'verbGroup.txt';
44 |     RelationFiles{3} = 'similar.txt';
45 |     RelationFiles{4} = 'antonym.txt';
46 |     RelationFiles = []; % note: this overrides the list above, so the default run uses no relation files
47 | 
48 |     settings = [startNormalizedED normWhenPossibleED endWhenNotED];
49 |     weights = [sWeight lWeight rWeight 0];
50 | end
51 | 
52 | normalizeVectors = false;
53 | 
54 | if ~exist(strcat(folder, experiment), 'dir')
55 |     fprintf('Folder does not exist. Creating %s\n', strcat(folder, experiment));
56 |     mkdir(strcat(folder, experiment));
57 | end
58 | 
59 | if exist(strcat(folder, experiment, '/iota.txt'), 'file')
60 |     fprintf('Model %s already exists. Skipped\n', experiment);
61 |     return;
62 | end
63 | 
64 | [W , ~] = loadTxtFile(strcat(folder, 'words.txt'));
65 | dim = size(W,2); %dim = 1;
66 | num_iters = 1000; %num_iters = 0;
67 | 
68 | [DictS, DictSID] = loadSynsetFile(folder);
69 | 
70 | countSynsets = length(DictSID);
71 | countWords = size(W,1);
72 | 
73 | save(strcat(folder, experiment, '/settings.mat'), '-regexp', '^[^WD]');
74 | 
75 | if (normalizeVectors == true)
76 |     W = normr(W);
77 | end
78 | 
79 | Table = readtable(strcat(folder, 'lexemes.txt'), 'ReadVariableNames', false, 'Delimiter', ' ');
80 | ThetaMap = table2array(Table(:, 1:2));
81 | Iota = sparse(ThetaMap(:,1),ThetaMap(:,2),ones(size(ThetaMap,1),1),countWords,countSynsets);
82 | Theta = Iota';
83 | 
84 | % create relation matrix - will do a squared error of relation pairs
85 | RelationMap = [];
86 | for i=1:size(RelationFiles, 1)
87 |     Table = readtable(strcat(folder, RelationFiles{i}), 'ReadVariableNames', false, 'Delimiter', ' ');
88 |     if isempty(Table)
89 |         continue;
90 |     end
91 |     RelationMap = [RelationMap ; table2array(Table(:, 1:2))];
92 | end
93 | 
94 | if (~isempty(RelationMap))
95 |     fprintf('Creating Relation Matrix. %d relations found.\n', size(RelationMap,1));
96 |     rFrom = [(1:size(RelationMap,1))'; (1:size(RelationMap,1))']; % size(...,1), not length, so a single relation pair is handled correctly
97 |     rTo = [RelationMap(:,1); RelationMap(:,2)];
98 |     rValue = [ones(size(RelationMap,1),1); (-1 * ones(size(RelationMap,1),1))];
99 |     R = sparse(rFrom,rTo,rValue,size(RelationMap,1),countSynsets);
100 | else
101 |     fprintf('Relation Matrix is empty.\n');
102 |     R = zeros(1, countSynsets);
103 | end
104 | 
105 | if (normalizeWeights == true)
106 |     sWeight = sWeight / countWords;
107 |     lWeight = lWeight / nnz(Theta);
108 |     if isempty(RelationMap), rWeight = 0; else, rWeight = rWeight / size(RelationMap,1); end % avoid division by zero when no relations are loaded
109 |     weights(1:3) = [sWeight lWeight rWeight];
110 |     weights = weights / norm(weights,1);
111 | end
112 | 
113 | trainModel(folder, dim, num_iters, countSynsets, countWords, W, Theta, Iota, R, weights, settings, experiment);
114 | end
115 | 
116 | function [] = trainModel(folder, dim, num_iters, countSynsets, countWords, W, Theta, Iota, R, weights, settings, experiment)
117 | 
118 | if ~exist(strcat(folder, experiment, '/debug'), 'dir')
119 |     mkdir(strcat(folder, experiment, '/debug'));
120 | end
121 | %delete(strcat(folder, experiment, '/debug/dim_finished_*'));
122 | 
123 | J_history = zeros(num_iters,5);
124 | lastNormIter = 0;
125 | 
126 | fprintf('Starting parallel computation on %d dimensions.\n', dim);
127 | %poolobj = parpool('local',30);
128 | ThetaValues = NaN(nnz(Theta),dim);
129 | IotaValues = NaN(nnz(Iota),dim);
130 | 
131 | for d=1:dim %parfor
132 | 
133 |     dimFilename = strcat(folder, experiment, '/debug/ThetaIota_', num2str(d), '.mat');
134 | 
135 |     if exist(dimFilename, 'file')
136 |         [Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim] = loadVariables(dimFilename);
137 |     else
138 |         Theta_dim = NaN;
139 |         Iota_dim = NaN;
140 |         J_history_dim = NaN;
141 |         lastNormIter_dim = NaN;
142 |         saveVariables(dimFilename, Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim);
143 | 
144 |         w = W(:,d);
145 | 
146 |         debugFilename = strcat(folder, experiment, '/debug/dim_', num2str(d), '.txt');
147 |         debugFilenameFinished = strcat(folder, experiment, '/debug/dim_finished_', num2str(d), '.txt');
148 |         debugFile = fopen(debugFilename, 'w');
149 | 
150 |         [Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim] = trainDimension(num_iters, countSynsets, countWords, w, Theta, Iota, R, weights, settings, debugFile);
151 |         saveVariables(dimFilename, Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim);
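% Note on the checkpointing scheme above: the first saveVariables call writes
% placeholder NaNs before training starts, so another worker scanning the debug
% folder sees this dimension as claimed and skips it; the second call overwrites
% the placeholders with the trained per-dimension values, which lets an
% interrupted run resume without retraining finished dimensions.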
152 | 153 | fclose(debugFile); 154 | movefile(debugFilename,debugFilenameFinished); 155 | end 156 | 157 | if (length(Theta_dim) == nnz(Theta) && length(Iota_dim) == nnz(Iota)) 158 | ThetaValues(:,d) = Theta_dim; 159 | IotaValues(:,d) = Iota_dim; 160 | J_history = J_history + (J_history_dim ./ dim); 161 | lastNormIter = lastNormIter + (lastNormIter_dim / dim); 162 | end 163 | end 164 | 165 | fprintf('Parallel computation completed.\n'); 166 | 167 | % looking for missing values 168 | for d=1:dim 169 | if (any(isnan(ThetaValues(:,d))) || any(isnan(IotaValues(:,d)))) 170 | dimFilename = strcat(folder, experiment, '/debug/ThetaIota_', num2str(d), '.mat'); 171 | 172 | if exist(dimFilename, 'file') 173 | [Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim] = loadVariables(dimFilename); 174 | end 175 | 176 | if (length(Theta_dim) == nnz(Theta) && length(Iota_dim) == nnz(Iota)) 177 | ThetaValues(:,d) = Theta_dim; 178 | IotaValues(:,d) = Iota_dim; 179 | J_history = J_history + (J_history_dim ./ dim); 180 | lastNormIter = lastNormIter + (lastNormIter_dim / dim); 181 | end 182 | end 183 | end 184 | 185 | % if still not all values available 186 | if (any(isnan(ThetaValues(:))) || any(isnan(IotaValues(:)))) 187 | fprintf('Not all values available (process not master). Process ended.\n'); 188 | return; 189 | end 190 | 191 | fprintf('Saving values ...'); 192 | 193 | %load(strcat(folder, experiment, '/debug/ThetaIota.mat')); 194 | save(strcat(folder, experiment, '/debug/ThetaIota.mat'),'ThetaValues', 'IotaValues'); 195 | delete(strcat(folder, experiment, '/debug/ThetaIota_*')); 196 | 197 | % print convergence matrix 198 | fName = strcat(folder, experiment, '/convergence.mat'); 199 | save(fName,'J_history','lastNormIter'); 200 | 201 | % print theta matrix 202 | [synset, word, ~] = find(Theta); 203 | mat1 = [word synset ThetaValues]; 204 | fName = strcat(folder, experiment, '/theta.txt'); 205 | dlmwrite(fName,mat1,'delimiter',' ','newline','pc','precision',6); 206 | 207 | % print iota matrix 208 | [word, synset, ~] = find(Iota); 209 | mat2 = [word synset IotaValues]; 210 | fName = strcat(folder, experiment, '/iota.txt'); 211 | dlmwrite(fName,mat2,'delimiter',' ','newline','pc','precision',6); 212 | 213 | % Plot the convergence graph 214 | for i=2:num_iters 215 | if (J_history(i,:) == J_history(i-1,:)) 216 | num_iters = i; 217 | break; 218 | end 219 | end 220 | h=figure('Visible','off'); 221 | hax = axes; 222 | hold on; 223 | plot(1:num_iters, (J_history(1:num_iters,1) / max(J_history(:,1))), '-', 'Color', [0 0.8 1], 'LineWidth', 2); 224 | plot(1:num_iters, (J_history(1:num_iters,2) / max(J_history(:,2))), '-', 'Color', [1 0.4 0], 'LineWidth', 2); 225 | plot(1:num_iters, (J_history(1:num_iters,3) / max(J_history(:,3))), '-', 'Color', [0 0.5 0], 'LineWidth', 2); 226 | plot(1:num_iters, (J_history(1:num_iters,4) / max(J_history(:,4))), '-', 'Color', [0.7 0 0.7], 'LineWidth', 2); 227 | plot(1:num_iters, (J_history(1:num_iters,5) / max(J_history(:,5))), '-', 'Color', [0.3 0.3 0.3]); 228 | line([lastNormIter lastNormIter],get(hax,'YLim'),'Color',[0.7 0 0.7]); 229 | legend('autoencoder','lexeme', 'relations', 'norm', 'learning rate'); 230 | xlabel('iteration'); 231 | ylabel('average error'); 232 | fName = strcat(folder, experiment, '/convergence.jpg'); 233 | saveas(h,fName); % here you save the figure 234 | close(h); 235 | 236 | fprintf(' done!\n'); 237 | 238 | end 239 | 240 | function [var1, var2, var3, var4] = loadVariables(filename) 241 | load(filename); 242 | end 243 | 244 | function saveVariables(filename, 
var1, var2, var3, var4) 245 | save(filename,'var1', 'var2', 'var3', 'var4'); 246 | end 247 | 248 | function [EValues, DValues, J_history, lastNormIter] = trainDimension(num_iters, ~, countWords, w, E, D, R, weights, settings, debugFile) 249 | 250 | learningRate = 0.00005; 251 | fprintf(debugFile, 'Starting computation with learning rate %f\n',learningRate); 252 | 253 | J_history = zeros(num_iters,5); 254 | 255 | if (settings(1) == true) 256 | % normalize matrizes 257 | fprintf(debugFile, 'Normalizing matrices at start.\n'); 258 | E = columnNormalize(E); 259 | D = columnNormalize(D); 260 | end 261 | 262 | lastNormIter = 0; 263 | iter = 1; 264 | while iter <= num_iters 265 | 266 | fprintf(debugFile, 'Iteration %d/%d\n', iter, num_iters); 267 | 268 | grad_E = sparse(size(E,1),size(E,2)); 269 | grad_D = sparse(size(D,1),size(D,2)); 270 | J1 = 0; 271 | J2 = 0; 272 | J3 = 0; 273 | J4 = 0; 274 | 275 | % update with respect to autoencoder 276 | if (weights(1) > 0) 277 | [J1, grads_E, grads_D] = gradient(w, E, D, w, 'both'); 278 | %gradientChecking(w, E, D, R, grads_E, grads_D, iter, weights, 'J1', 0.00001); 279 | grad_E = grad_E + (weights(1) * grads_E); 280 | grad_D = grad_D + (weights(1) * grads_D); 281 | end 282 | 283 | % update with respect to lexeme 284 | if (weights(2) > 0) 285 | [J2, gradl_E, gradl_D] = gradientLexeme(w, E, D); 286 | %gradientChecking(w, E, D, R, gradl_E, gradl_D, iter, weights, 'J2', 0.00001); 287 | grad_E = grad_E + (weights(2) * gradl_E); 288 | grad_D = grad_D + (weights(2) * gradl_D); 289 | end 290 | 291 | 292 | % update with respect to relations 293 | if (weights(3) > 0) 294 | [J3, gradr_E, ~] = gradient(w, E, R, zeros(size(R,1),1) , 'onlyE'); 295 | %gradientChecking(w, E, D, R, gradr_E, grad_D, iter, weights, 'J3', 0.00001); 296 | grad_E = grad_E + (weights(3) * gradr_E); 297 | end 298 | 299 | 300 | % update with respect to column norm 301 | if (weights(4) > 0) 302 | [J4_E, gradn_E] = gradientColumnNorm(E); 303 | [J4_D, gradn_D] = gradientColumnNorm(D); 304 | J4 = J4_E + J4_D; 305 | %gradientChecking(w, E, D, R, gradn_E, gradn_D, iter, weights, 'J4', 0.00001); 306 | grad_E = grad_E + (weights(4) * gradn_E); 307 | grad_D = grad_D + (weights(4) * gradn_D); 308 | end 309 | 310 | J = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 311 | J_history(iter,1) = J1 / countWords; 312 | J_history(iter,2) = J2 / nnz(E); 313 | J_history(iter,3) = J3 / size(R, 1); 314 | J_history(iter,4) = J4 / (size(E,2) + size(D,2)); 315 | J_history(iter,5) = learningRate; 316 | 317 | fprintf(debugFile, 'Error J: %8.3f\n', J); 318 | fprintf(debugFile, 'Error J1: %5.4f %8.3f %5.4f\n', J1 * weights(1)/J, J1 * weights(1), J_history(iter,1)); 319 | fprintf(debugFile, 'Error J2: %5.4f %8.3f %5.4f\n', J2 * weights(2)/J, J2 * weights(2), J_history(iter,2)); 320 | fprintf(debugFile, 'Error J3: %5.4f %8.3f %5.4f\n', J3 * weights(3)/J, J3 * weights(3), J_history(iter,3)); 321 | fprintf(debugFile, 'Error J4: %5.4f %8.3f %5.4f\n', J4 * weights(4)/J, J4 * weights(4), J_history(iter,4)); 322 | 323 | E_new = E - (learningRate * grad_E); 324 | E_new = keepSparsity(E, E_new); 325 | 326 | D_new = D - (learningRate * grad_D); 327 | D_new = keepSparsity(D, D_new); 328 | 329 | % get new cost 330 | if (weights(1) > 0) 331 | J1 = costFunc(w, E_new, D_new, w); 332 | end 333 | if (weights(2) > 0) 334 | J2 = costFuncLexeme(w, E_new, D_new); 335 | end 336 | if (weights(3) > 0) 337 | J3 = costFunc(w, E_new, R, zeros(size(R,1),1)); 338 | end 339 | if (weights(4) > 0) 340 | J4 = 
costFuncColumnNorm(E_new) + costFuncColumnNorm(D_new); 341 | end 342 | J_new = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 343 | 344 | fprintf(debugFile, 'New Error J: %8.3f\n', J_new); 345 | fprintf(debugFile, 'New Error J1: %5.4f %8.3f\n', J1 * weights(1)/J_new, J1 * weights(1)); 346 | fprintf(debugFile, 'New Error J2: %5.4f %8.3f\n', J2 * weights(2)/J_new, J2 * weights(2)); 347 | fprintf(debugFile, 'New Error J3: %5.4f %8.3f\n', J3 * weights(3)/J_new, J3 * weights(3)); 348 | fprintf(debugFile, 'New Error J4: %5.4f %8.3f\n', J4 * weights(4)/J_new, J4 * weights(4)); 349 | 350 | % check if error increased 351 | if J_new > J 352 | 353 | fprintf(debugFile, 'Error increased\n'); 354 | 355 | % reduce learning rate 356 | learningRate = learningRate / 3; 357 | fprintf(debugFile, 'New Learning Rate: %f\n', learningRate); 358 | 359 | if learningRate < 0.000001 360 | 361 | fprintf(debugFile, 'Learning Rate to small. Calculation stopped\n\n'); 362 | 363 | for i=iter+1:num_iters 364 | J_history(i,:) = J_history(iter,:); 365 | end 366 | 367 | break; 368 | end 369 | 370 | else 371 | 372 | fprintf(debugFile, 'Error decreased\n'); 373 | 374 | % increasing learning rate 375 | learningRate = learningRate * 1.1; 376 | fprintf(debugFile, 'New Learning Rate: %f\n', learningRate); 377 | 378 | % update matrix 379 | E = E_new; 380 | D = D_new; 381 | 382 | if (settings(2) == true) 383 | 384 | fprintf(debugFile, 'Trying to normalize matrices\n'); 385 | 386 | % normalize matrizes 387 | E_new = columnNormalize(E_new); 388 | E_new = keepSparsity(E, E_new); 389 | D_new = columnNormalize(D_new); 390 | D_new = keepSparsity(D, D_new); 391 | 392 | % get new cost 393 | if (weights(1) > 0) 394 | J1 = costFunc(w, E_new, D_new, w); 395 | end 396 | if (weights(2) > 0) 397 | J2 = costFuncLexeme(w, E_new, D_new); 398 | end 399 | if (weights(3) > 0) 400 | J3 = costFunc(w, E_new, R, zeros(size(R,1),1)); 401 | end 402 | if (weights(4) > 0) 403 | J4 = costFuncColumnNorm(E_new) + costFuncColumnNorm(D_new); 404 | end 405 | J_norm = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 406 | 407 | fprintf(debugFile, 'Norm Error J: %f\n', J_norm); 408 | fprintf(debugFile, 'Norm Error J1: %5.4f %8.3f\n', J1 * weights(1)/J_norm, J1 * weights(1)); 409 | fprintf(debugFile, 'Norm Error J2: %5.4f %8.3f\n', J2 * weights(2)/J_norm, J2 * weights(2)); 410 | fprintf(debugFile, 'Norm Error J3: %5.4f %8.3f\n', J3 * weights(3)/J_norm, J3 * weights(3)); 411 | fprintf(debugFile, 'Norm Error J4: %5.4f %8.3f\n', J4 * weights(4)/J_norm, J4 * weights(4)); 412 | 413 | if J_norm < J 414 | 415 | fprintf(debugFile, 'Error decreased. Matrix normalized\n'); 416 | 417 | % update matrix 418 | E = E_new; 419 | D = D_new; 420 | 421 | lastNormIter = iter; 422 | else 423 | fprintf(debugFile, 'Error increased. Matrix not normalized\n'); 424 | 425 | % if only as long as normalization possible 426 | if settings(3) == true 427 | 428 | fprintf(debugFile, 'Normalization not possible. Calculation stopped\n\n'); 429 | 430 | for i=iter+1:num_iters 431 | J_history(i,:) = J_history(iter,:); 432 | end 433 | 434 | break; 435 | end 436 | end 437 | 438 | end 439 | 440 | fprintf(debugFile, 'Iteration finished.\n\n'); 441 | 442 | % do next iteration 443 | iter = iter + 1; 444 | 445 | end 446 | end 447 | 448 | fprintf(debugFile, 'Calculation finished. 
Learned values will be returned'); 449 | 450 | EValues = nonzeros(E); 451 | DValues = nonzeros(D); 452 | 453 | end 454 | 455 | function [E_new] = keepSparsity(E, E_new) 456 | 457 | while (nnz(E) ~= nnz(E_new)) 458 | [r1,c1,~] = find(E); 459 | [r2,c2,~] = find(E_new); 460 | for l=1:length(r1) 461 | if (r1(l) ~= r2(l) || c1(l) ~= c2(l)) 462 | E_new(r1(l),c1(l)) = eps; 463 | break; 464 | end 465 | end 466 | end 467 | end -------------------------------------------------------------------------------- /AutoExtend/columnNormalize.m: -------------------------------------------------------------------------------- 1 | function [ A ] = columnNormalize( A ) 2 | 3 | if issparse(A) 4 | A = columnNormalizeSparse( A ); 5 | else 6 | A = columnNormalizeFull( A ); 7 | end 8 | 9 | end 10 | 11 | function [ A ] = columnNormalizeSparse( A ) 12 | 13 | 14 | [i,j,values] = find(A); 15 | 16 | colSum = (ones(1, size(A,1)) * A)'; 17 | values = values ./ colSum(j); 18 | 19 | A = sparse(i,j,values,size(A,1),size(A,2)); 20 | 21 | end 22 | 23 | function [ A ] = columnNormalizeFull( A ) 24 | 25 | colSum = sum(A,1); 26 | 27 | for i=1:size(A,2) 28 | 29 | A(:,i) = A(:,i) ./ colSum(i); 30 | 31 | end 32 | 33 | end 34 | 35 | -------------------------------------------------------------------------------- /AutoExtend/costFunc.m: -------------------------------------------------------------------------------- 1 | function [J] = costFunc(x, E, D, x_expected) 2 | 3 | x_predict = (D * (E * x)); 4 | x_diff = x_predict - x_expected; 5 | J = sum(x_diff.^2); 6 | 7 | end -------------------------------------------------------------------------------- /AutoExtend/costFuncColumnNorm.m: -------------------------------------------------------------------------------- 1 | function [J] = costFuncColumnNorm(A) 2 | 3 | [~,j,values] = find(A); 4 | error = NaN(size(A,2),1); 5 | 6 | for l=1:length(j) 7 | if isnan(error(j(l))) 8 | error(j(l)) = 1 - values(l); 9 | else 10 | error(j(l)) = error(j(l)) - values(l); 11 | end 12 | end 13 | 14 | error(isnan(error)) = 0; 15 | J = sum(error.^2); 16 | 17 | end 18 | 19 | -------------------------------------------------------------------------------- /AutoExtend/costFuncLexeme.m: -------------------------------------------------------------------------------- 1 | function [J] = costFuncLexeme(w, E, D) 2 | 3 | if (nnz(E) ~= nnz(D)) 4 | msgID = 'MY:BadLengthED'; 5 | msg = 'Sparsity of encode and decode not matching.'; 6 | baseException = MException(msgID,msg); 7 | throw(baseException); 8 | end 9 | 10 | [synset,word,value] = find(E); 11 | %L1 = sortrows([word synset value],[1 2]); 12 | L1 = [word synset value]; 13 | lexeme1 = L1(:,3) .* w(L1(:,1)); 14 | 15 | s = E * w; 16 | %[word,synset,value] = find(D); 17 | %L2 = sortrows([word synset value],[1 2]); 18 | [synset,word,value] = find(D'); 19 | L2 = [word synset value]; 20 | lexeme2 = L2(:,3) .* s(L2(:,2)); 21 | 22 | diff = lexeme1 - lexeme2; 23 | J = sum(diff.^2); 24 | 25 | end -------------------------------------------------------------------------------- /AutoExtend/getVectors.m: -------------------------------------------------------------------------------- 1 | function [ X , y ] = getVectors( dictX, A, dictA ) 2 | 3 | X = zeros(size(dictX,1),size(A,2)); 4 | y = zeros(size(dictX,1),1); 5 | 6 | for i=1:size(dictX,1) 7 | ind = strcmp(dictX{i}, dictA); 8 | if (any(ind)) 9 | y(i) = find(ind,1); 10 | X(i,:) = A(y(i),:); 11 | end 12 | end 13 | 14 | end 15 | 16 | -------------------------------------------------------------------------------- 
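A minimal sketch (not a file from this repository) of how the pieces above fit together: costFunc measures the squared round-trip error ||D(Ex) - x||^2 for one embedding dimension x, and gradient.m returns its derivatives restricted to the sparsity patterns of E and D. The toy sizes and the encode/decode matrices below are invented for illustration.

% toy setup: 4 words, 3 synsets, one embedding dimension
nW = 4; nS = 3;
x = randn(nW,1);                          % one dimension of all word vectors
E = sparse([1 1 2 3 3], [1 2 2 3 4], rand(5,1), nS, nW);
E = columnNormalize(E);                   % encode: words -> synsets
D = columnNormalize(E');                  % decode: synsets -> words, transposed pattern
J = costFunc(x, E, D, x);                 % squared reconstruction error
[J, grad_E, ~] = gradient(x, E, D, x, 'both');
% finite-difference check of one nonzero entry of E, as gradientChecking.m does
[i, j, v] = find(E); epsilon = 1e-6;
Ep = E; Ep(i(1), j(1)) = v(1) + epsilon;
Em = E; Em(i(1), j(1)) = v(1) - epsilon;
numGrad = (costFunc(x, Ep, D, x) - costFunc(x, Em, D, x)) / (2 * epsilon);
fprintf('analytic %g vs numeric %g\n', full(grad_E(i(1), j(1))), numGrad);
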
/AutoExtend/gradient.m: -------------------------------------------------------------------------------- 1 | function [J, grad_E, grad_D] = gradient(x, E, D, x_expected, mode) 2 | 3 | % precalculations 4 | x_predict = (D * (E * x)); 5 | x_diff = x_predict - x_expected; 6 | d = (D' * x_diff); 7 | e = E * x; 8 | 9 | % calculate error 10 | J = sum(x_diff.^2); 11 | 12 | if ~strcmp(mode,'onlyD') 13 | 14 | % calculate derivate for E 15 | [row,column,~] = find(E); 16 | %for l=1:size(row) 17 | % i = row(l); 18 | % j = column(l); 19 | % grad_values(l) = 2 * d(i) * x(j); %((x_predict - x_expected)' * D(:,i)) * x(j); but we use precalculations 20 | %end 21 | grad_values = 2 * d(row) .* x(column); 22 | grad_E = sparse(row,column,grad_values,size(E,1),size(E,2)); 23 | else 24 | grad_E = NaN; 25 | end 26 | 27 | if ~strcmp(mode,'onlyE') 28 | 29 | % calculate derivate for D 30 | [row,column,~] = find(D); 31 | %for l=1:size(row) 32 | % i = row(l); 33 | % j = column(l); 34 | % grad_values(l) = 2 * x_diff(i) * e(j); %(x_predict(i) - x_expected(i)) * (E(j,:) * x); but we use precalculations 35 | %end 36 | grad_values = 2 * x_diff(row) .* e(column); 37 | grad_D = sparse(row,column,grad_values,size(D,1),size(D,2)); 38 | else 39 | grad_D = NaN; 40 | end 41 | 42 | 43 | 44 | end 45 | -------------------------------------------------------------------------------- /AutoExtend/gradientChecking.m: -------------------------------------------------------------------------------- 1 | function [] = gradientChecking(w, E, D, R, grad_E, grad_D, iter, weights, mode, epsilon) 2 | 3 | fprintf('Gradient checking in iteration: %3d\n', iter); 4 | 5 | E_epsilon = E; 6 | [row,column,value] = find(E); 7 | grad = zeros(10,2); 8 | e = 1; 9 | for l=randi(length(row),1,10) 10 | E_epsilon(row(l), column(l)) = value(l) + epsilon; 11 | J_1 = getCost(w, E_epsilon, D, R, weights, mode); 12 | 13 | 14 | E_epsilon(row(l), column(l)) = value(l) - epsilon; 15 | J_2 = getCost(w, E_epsilon, D, R, weights, mode); 16 | 17 | grad(e,1) = (J_1 - J_2) / (2 * epsilon); % num 18 | grad(e,2) = grad_E(row(l),column(l)); % analis 19 | 20 | E_epsilon(row(l), column(l)) = value(l); 21 | 22 | e = e + 1; 23 | end 24 | fprintf('Difference in E: %g\n', norm(grad(:,1)-grad(:,2))/norm(grad(:,1)+grad(:,2))); 25 | 26 | D_epsilon = D; 27 | [row,column,value] = find(D); 28 | grad = zeros(10,2); 29 | e = 1; 30 | for l=randi(length(row),1,10) 31 | D_epsilon(row(l), column(l)) = value(l) + epsilon; 32 | J_1 = getCost(w, E, D_epsilon, R, weights, mode); 33 | 34 | D_epsilon(row(l), column(l)) = value(l) - epsilon; 35 | J_2 = getCost(w, E, D_epsilon, R, weights, mode); 36 | 37 | grad(e,1) = (J_1 - J_2) / (2 * epsilon); % num 38 | grad(e,2) = grad_D(row(l),column(l)); % analis 39 | 40 | D_epsilon(row(l), column(l)) = value(l); 41 | 42 | e = e + 1; 43 | end 44 | fprintf('Difference in D: %g\n', norm(grad(:,1)-grad(:,2))/norm(grad(:,1)+grad(:,2))); 45 | 46 | end 47 | 48 | function J = getCost(w, E, D, R, weights, mode) 49 | 50 | if strcmp(mode, 'J1') 51 | J = costFunc(w, E, D, w); 52 | elseif strcmp(mode, 'J2') 53 | J = costFuncLexeme(w, E, D); 54 | elseif strcmp(mode, 'J3') 55 | J = costFunc(w, E, R, zeros(size(R,1),1)); 56 | elseif strcmp(mode, 'J4') 57 | J = costFuncColumnNorm(E) + costFuncColumnNorm(D); 58 | elseif strcmp(mode, 'R1') 59 | J = costFuncR1(w, E); 60 | elseif strcmp(mode, 'R2') 61 | J = costFuncR2(w, E, D, R); 62 | else 63 | J1 = costFunc(w, E, D, w); 64 | J2 = costFuncLexeme(w, E, D); 65 | J3 = costFunc(w, E, R, zeros(size(R,1),1)); 66 | J4 = 
costFuncColumnNorm(E) + costFuncColumnNorm(D); 67 | J = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 68 | end 69 | 70 | end 71 | 72 | -------------------------------------------------------------------------------- /AutoExtend/gradientColumnNorm.m: -------------------------------------------------------------------------------- 1 | function [J, grad_A] = gradientColumnNorm(A) 2 | 3 | [i,j,values] = find(A); 4 | error = NaN(size(A,2),1); 5 | 6 | for l=1:length(j) 7 | if isnan(error(j(l))) 8 | error(j(l)) = 1 - values(l); 9 | else 10 | error(j(l)) = error(j(l)) - values(l); 11 | end 12 | end 13 | 14 | error(isnan(error)) = 0; 15 | J = sum(error.^2); 16 | 17 | for l=1:length(j) 18 | values(l) = -2 * error(j(l)); 19 | end 20 | 21 | grad_A = sparse(i,j,values,size(A,1),size(A,2)); 22 | 23 | end 24 | 25 | -------------------------------------------------------------------------------- /AutoExtend/gradientLexeme.m: -------------------------------------------------------------------------------- 1 | function [J, grad_E, grad_D] = gradientLexeme(w, E, D) 2 | 3 | [synset,word,value] = find(E); 4 | %L1 = sortrows([word synset value],[1 2]); 5 | L1 = [word synset value]; 6 | lexeme1 = L1(:,3) .* w(L1(:,1)); 7 | 8 | s = E * w; 9 | %[word,synset,value] = find(D); 10 | %L2 = sortrows([word synset value],[1 2]); 11 | [synset,word,value] = find(D'); 12 | L2 = [word synset value]; 13 | lexeme2 = L2(:,3) .* s(L2(:,2)); 14 | 15 | diff = lexeme1 - lexeme2; 16 | J = sum(diff.^2); 17 | 18 | %new 19 | %d = (D' * x_diff); 20 | %grad1 = 2 * w(L1(:,1)) .* d(row); 21 | %old 22 | grad1 = 2 * w(L1(:,1)) .* diff; 23 | %end 24 | grad2 = -2 * s(L2(:,2)) .* diff; 25 | 26 | grad_E = sparse(L1(:,2),L1(:,1),grad1,size(E,1),size(E,2)); 27 | grad_D = sparse(L2(:,1),L2(:,2),grad2,size(D,1),size(D,2)); 28 | end 29 | -------------------------------------------------------------------------------- /AutoExtend/learnAffineMapMatrix.m: -------------------------------------------------------------------------------- 1 | function [T] = learnAffineMapMatrix(X, Y) 2 | 3 | % add biased term 4 | X = [X ones(size(X,1), 1)]; 5 | 6 | % learn affine map matrix 7 | T = (X' * X) \ (X' * Y); -------------------------------------------------------------------------------- /AutoExtend/learnLinearMapMatrix.m: -------------------------------------------------------------------------------- 1 | function [T] = learnLinearMapMatrix(X, Y) 2 | 3 | % learn linear map matrix 4 | T = (X' * X) \ (X' * Y); 5 | 6 | % create transformation matrix 7 | T = [T ; zeros(1, size(T,2))]; -------------------------------------------------------------------------------- /AutoExtend/learnTranslationMatrix.m: -------------------------------------------------------------------------------- 1 | function [T] = learnTranslationMatrix(X, Y) 2 | 3 | % learn translation vector 4 | t = mean(Y - X,1); 5 | 6 | % create transformation matrix 7 | T = eye(size(X,2), size(Y,2)); 8 | T = [T ; t]; -------------------------------------------------------------------------------- /AutoExtend/loadBinaryFile.m: -------------------------------------------------------------------------------- 1 | function [A, dictA ] = loadBinaryFile( varargin ) 2 | 3 | if (nargin == 1) 4 | filename = varargin{1}; 5 | max = -1; 6 | fprintf('Reading word vectors ... '); 7 | elseif (nargin == 2) 8 | filename = varargin{1}; 9 | max = varargin{2}; 10 | fprintf('Reading word vectors (up to %d) ... 
', max);
11 | else
12 |     fprintf('Reading word vectors - Error in number of arguments');
13 |     return;
14 | end
15 | 
16 | fid = fopen(filename);
17 | 
18 | stringbuffer = blanks(300);
19 | 
20 | for j=1:300;
21 |     c = fread(fid,1,'uchar');
22 | 
23 |     if c == 10 || c == 32
24 |         break;
25 |     end
26 | 
27 |     stringbuffer(j) = c;
28 | end
29 | words = str2double(stringbuffer(1:j-1));
30 | 
31 | for j=1:300;
32 |     c = fread(fid,1,'uchar');
33 | 
34 |     if c == 10 || c == 32
35 |         break;
36 |     end
37 | 
38 |     stringbuffer(j) = c;
39 | end
40 | dim = str2double(stringbuffer(1:j-1));
41 | 
42 | if (max > 0)
43 |     words = max;
44 | end
45 | dictA = cell(words, 1);
46 | A = zeros(words,dim);
47 | 
48 | for i=1:words;
49 | 
50 |     for j=1:300;
51 |         c = fread(fid,1,'uchar');
52 | 
53 |         if c == 10
54 |             c = fread(fid,1,'uchar');
55 |         end
56 | 
57 |         if c == 32
58 |             break;
59 |         end
60 | 
61 |         stringbuffer(j) = c;
62 |     end
63 |     dictA{i} = stringbuffer(1:j-1);
64 | 
65 |     A(i,:) = fread(fid,dim,'single');
66 | 
67 | end
68 | fclose(fid);
69 | fprintf('done!\n');
--------------------------------------------------------------------------------
/AutoExtend/loadSynsetFile.m:
--------------------------------------------------------------------------------
1 | function [dictS, dictSID] = loadSynsetFile(folder)
2 | 
3 | fileID = fopen(strcat(folder, 'synsets.txt'));
4 | Table = textscan(fileID, '%s\t%s\n', 'CollectOutput',1);
5 | dictSID = Table{1,1}(:, 1);
6 | dictS = Table{1,1}(:, 2);
7 | fclose(fileID);
8 | end
--------------------------------------------------------------------------------
/AutoExtend/loadTxtFile.m:
--------------------------------------------------------------------------------
1 | function [A, dictA, dictPOS] = loadTxtFile( filename )
2 | 
3 | fprintf('Reading word vectors ... ');
4 | 
5 | fileID = fopen(filename);
6 | line = fgetl(fileID);
7 | dim = length(strfind(line,' '));
8 | 
9 | frewind(fileID);
10 | 
11 | textformat = ['%s', repmat(' %f',1,dim)];
12 | Table = textscan(fileID,textformat);
13 | dictA = Table{1,1}(:, 1);
14 | A = zeros(length(dictA),dim);
15 | for d=1:dim
16 |     A(:,d) = Table{1, d+1}; % textscan returns a cell array, so index it directly (table2array would fail here)
17 | end
18 | 
19 | fclose(fileID);
20 | 
21 | if nargout > 2
22 | 
23 |     [dictA, dictPOS] = strtok(dictA, '%');
24 |     dictPOS = strrep(dictPOS, '%', '');
25 | 
26 | else
27 | 
28 |     dictA = strrep(dictA, '%n', '');
29 |     dictA = strrep(dictA, '%v', '');
30 |     dictA = strrep(dictA, '%a', '');
31 |     dictA = strrep(dictA, '%r', '');
32 |     dictA = strrep(dictA, '%u', '');
33 | 
34 | end
35 | 
36 | fprintf('done!\n');
37 | 
38 | end
--------------------------------------------------------------------------------
/AutoExtend/writeVectors.m:
--------------------------------------------------------------------------------
1 | function [] = writeVectors(varargin)
2 | 
3 | folder = varargin{1};
4 | experiment = varargin{2};
5 | 
6 | writeWords = true;
7 | writeSynsets = true;
8 | writeLexemes = false;
9 | 
10 | if (nargin == 5)
11 |     writeWords = varargin{3};
12 |     writeSynsets = varargin{4};
13 |     writeLexemes = varargin{5};
14 | end
15 | 
16 | file = strcat(folder, experiment, '/outputVectors.txt');
17 | 
18 | [W , dictW] = loadTxtFile(strcat(folder, 'words.txt'));
19 | [dictS, dictSID] = loadSynsetFile(folder);
20 | 
21 | Theta = importdata(strcat(folder, experiment, '/theta.txt'), ' ');
22 | fprintf('Calculating synset vectors ... 
'); 23 | S = zeros(size(dictS, 1), size(W,2)); 24 | for l=1:size(Theta, 1) 25 | w = Theta(l,1); 26 | s = Theta(l,2); 27 | theta = Theta(l, 3:end); 28 | S(s,:) = S(s,:) + (W(w,:) .* theta); 29 | end 30 | fprintf('done!\n'); 31 | 32 | outputSize = 0; 33 | if (writeWords == true) 34 | outputSize = outputSize + size(dictW, 1); 35 | end 36 | if (writeLexemes == true) 37 | outputSize = outputSize + size(Theta, 1); 38 | end 39 | if (writeSynsets == true) 40 | outputSize = outputSize + size(dictS, 1); 41 | end 42 | 43 | fid = fopen(file, 'w'); 44 | fprintf(fid, '%d %d\n',outputSize, size(W,2)); 45 | fclose(fid); 46 | 47 | if (writeWords == true) 48 | fprintf('Writing word vectors ... '); 49 | writeToFile(file, 'a', W, dictW); 50 | fprintf('done!\n'); 51 | end 52 | 53 | if (writeSynsets == true) 54 | 55 | fprintf('Writing synset vectors ... '); 56 | writeToFile(file, 'a', S, dictS); 57 | fprintf('done!\n'); 58 | end 59 | 60 | if (writeLexemes == true) 61 | 62 | Iota = importdata(strcat(folder, experiment, '/iota.txt'), ' '); 63 | Theta = sortrows(Theta, [1 2]); 64 | Iota = sortrows(Iota, [1 2]); 65 | 66 | if (sum(sum(Theta(:,1:2)-Iota(:,1:2))) ~= 0) 67 | fprintf('Iota and Theta file do not match. Lexemes vector might be screwed.\n'); 68 | end 69 | 70 | fprintf('Calculating lexeme vectors ... '); 71 | L = zeros(size(Theta, 1), size(W,2)); 72 | dictL = cell(size(Theta, 1), 1); 73 | for l=1:size(Theta, 1) 74 | w = Theta(l,1); 75 | s = Theta(l,2); 76 | theta = Theta(l, 3:end); 77 | iota = Iota(l, 3:end); 78 | L(l,:) = ((W(w,:) .* theta) + (S(s,:) .* iota)) / 2; 79 | dictL{l} = strcat(dictW{w}, '-', dictSID{s}); 80 | end 81 | fprintf('done!\n'); 82 | 83 | fprintf('Writing lexeme vectors ... '); 84 | writeToFile(file, 'a', L, dictL); 85 | fprintf('done!\n'); 86 | end 87 | 88 | end 89 | 90 | function [] = writeToFile(file, mode, A, dictA) 91 | 92 | fid = fopen(file, mode); 93 | 94 | for i=1:size(dictA,1) 95 | fprintf(fid, '%s', dictA{i}); 96 | fprintf(fid,' %f',A(i,:)); 97 | fprintf(fid,'\n'); 98 | end 99 | 100 | fclose(fid); 101 | 102 | end 103 | -------------------------------------------------------------------------------- /IMS Features/CSynsetCosineFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IMS (It Makes Sense) -- NUS WSD System 3 | * Copyright (c) 2010 National University of Singapore. 4 | * All Rights Reserved. 5 | */ 6 | package sg.edu.nus.comp.nlp.ims.feature; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.HashSet; 14 | 15 | import sg.edu.nus.comp.nlp.ims.corpus.AItem; 16 | import sg.edu.nus.comp.nlp.ims.corpus.ICorpus; 17 | import sg.edu.nus.comp.nlp.ims.corpus.ISentence; 18 | import sg.edu.nus.comp.nlp.ims.util.CSurroundingWordFilter; 19 | 20 | /** 21 | * Synset Cosine feature extractor. 
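For each candidate synset of the target lemma, it emits one real-valued feature: the cosine similarity between the synset vector and the sum of the (filtered) surrounding-word vectors.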
22 | */ 23 | public class CSynsetCosineFeatureExtractor implements IFeatureExtractor { 24 | 25 | // the Synsets and corresponding vectors 26 | protected ArrayList m_Synsets = new ArrayList(); 27 | protected ArrayList m_SynsetVectors = new ArrayList(); 28 | 29 | // corpus to be extracted 30 | protected ICorpus m_Corpus = null; 31 | 32 | // index of current instance 33 | protected int m_Index = -1; 34 | 35 | // current sentence to process 36 | protected ISentence m_Sentence = null; 37 | 38 | // item index in current sentence 39 | protected int m_IndexInSentence; 40 | 41 | // item length 42 | protected int m_InstanceLength; 43 | 44 | // index of Synset feature 45 | protected int m_SynsetIndex = -1; 46 | 47 | // sentence before current sentence 48 | protected int m_Left; 49 | 50 | // sentence after current sentence 51 | protected int m_Right; 52 | 53 | // surrounding words of current instance 54 | protected HashSet m_SurroundingWordSet = new HashSet(); 55 | 56 | // vector of surroundings word of current instance 57 | protected float[] m_SurroundingWordVector; 58 | 59 | // current lemma to process 60 | protected String m_Lemma; 61 | protected String m_POS; 62 | 63 | // stop words filter 64 | protected CSurroundingWordFilter m_Filter = CSurroundingWordFilter.getInstance(); 65 | 66 | // current feature 67 | protected IFeature m_CurrentFeature = null; 68 | 69 | protected static HashMap wordVectors = new HashMap(); 70 | 71 | protected static int g_LIDX = AItem.Features.LEMMA.ordinal(); 72 | protected static int g_TIDX = AItem.Features.TOKEN.ordinal(); 73 | protected static int g_PIDX = AItem.Features.POS.ordinal(); 74 | 75 | protected static int DIM_SIZE; 76 | 77 | 78 | /** 79 | * constructor 80 | */ 81 | public CSynsetCosineFeatureExtractor() { 82 | 83 | createWordVectorSet(); 84 | 85 | this.m_Left = Integer.MAX_VALUE; 86 | this.m_Right = Integer.MAX_VALUE; 87 | } 88 | 89 | /* 90 | * (non-Javadoc) 91 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#getCurrentInstanceID() 92 | */ 93 | @Override 94 | public String getCurrentInstanceID() { 95 | if (this.validIndex(this.m_Index)) { 96 | return this.m_Corpus.getValue(this.m_Index, "id"); 97 | } 98 | return null; 99 | } 100 | 101 | /* 102 | * (non-Javadoc) 103 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#hasNext() 104 | */ 105 | @Override 106 | public boolean hasNext() { 107 | if (this.m_CurrentFeature != null) { 108 | return true; 109 | } 110 | if (this.validIndex(this.m_Index)) { 111 | this.m_CurrentFeature = this.getNext(); 112 | if (this.m_CurrentFeature != null) { 113 | return true; 114 | } 115 | } 116 | return false; 117 | } 118 | 119 | /** 120 | * get the next feature of current instance 121 | * 122 | * @return feature 123 | */ 124 | protected IFeature getNext() { 125 | IFeature feature = null; 126 | if (this.m_SynsetIndex >= 0 && this.m_SynsetIndex < this.m_Synsets.size()) { 127 | feature = new CDoubleFeature(); 128 | feature.setKey(this.m_Synsets.get(this.m_SynsetIndex)); 129 | feature.setValue(this.getSynsetFeature(this.m_SynsetIndex)); 130 | this.m_SynsetIndex++; 131 | } 132 | return feature; 133 | } 134 | 135 | /** 136 | * get the part-of-speech of item p_Index + m_IndexInSentence 137 | * 138 | * @param p_Index 139 | * index 140 | * @return feature value 141 | */ 142 | protected String getSynsetFeature(int p_Index) { 143 | 144 | float cosine = getCosine(this.m_SurroundingWordVector, this.m_SynsetVectors.get(p_Index)); 145 | return Float.toString(cosine); 146 | } 147 | 148 | private float getCosine(float[] vector1, 
float[] vector2) 149 | { 150 | float value = 0; 151 | float len1 = 0; 152 | float len2 = 0; 153 | 154 | for (int i = 0; i < vector1.length; i++) { 155 | 156 | value += vector1[i] * vector2[i]; 157 | len1 += vector1[i] * vector1[i]; 158 | len2 += vector2[i] * vector2[i]; 159 | } 160 | 161 | return (float)(value/(Math.sqrt(len1)*Math.sqrt(len2))); 162 | } 163 | 164 | private void createWordVectorSet() 165 | { 166 | if (wordVectors.size() > 0) 167 | return; 168 | 169 | // path to word and synset vectors 170 | String path = sg.edu.nus.comp.nlp.ims.implement.CTester.svFile; 171 | 172 | BufferedReader br = null; 173 | try 174 | { 175 | br = new BufferedReader(new FileReader(path));; 176 | 177 | String key = null; 178 | 179 | String line = br.readLine(); 180 | String[] lineSplited = line.split(" "); 181 | 182 | DIM_SIZE = Integer.parseInt(lineSplited[1]); 183 | 184 | while ((line = br.readLine()) != null) { 185 | 186 | lineSplited = line.split(" "); 187 | 188 | key = lineSplited[0]; 189 | 190 | float vector[] = new float[DIM_SIZE]; 191 | 192 | for (int j = 0; j < DIM_SIZE; j++) { 193 | vector[j] += Float.parseFloat(lineSplited[j + 1]); 194 | } 195 | 196 | wordVectors.put(key, vector); 197 | } 198 | 199 | } catch (IOException e) 200 | { 201 | e.printStackTrace(); 202 | } 203 | } 204 | 205 | /** 206 | * check the validity of index 207 | * 208 | * @param p_Index 209 | * index 210 | * @return valid or not 211 | */ 212 | protected boolean validIndex(int p_Index) { 213 | if (this.m_Corpus != null && this.m_Corpus.size() > p_Index 214 | && p_Index >= 0) { 215 | return true; 216 | } 217 | return false; 218 | } 219 | 220 | /* 221 | * (non-Javadoc) 222 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#next() 223 | */ 224 | @Override 225 | public IFeature next() { 226 | IFeature feature = null; 227 | if (this.hasNext()) { 228 | feature = this.m_CurrentFeature; 229 | this.m_CurrentFeature = null; 230 | } 231 | return feature; 232 | } 233 | 234 | /* 235 | * (non-Javadoc) 236 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#restart() 237 | */ 238 | @Override 239 | public boolean restart() { 240 | this.m_SynsetIndex = 0; 241 | this.m_CurrentFeature = null; 242 | return this.validIndex(this.m_Index); 243 | } 244 | 245 | /* 246 | * (non-Javadoc) 247 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCorpus(sg.edu.nus.comp.nlp.ims.corpus.ICorpus) 248 | */ 249 | @Override 250 | public boolean setCorpus(ICorpus p_Corpus) { 251 | if (p_Corpus == null) { 252 | return false; 253 | } 254 | this.m_Corpus = p_Corpus; 255 | this.m_Index = 0; 256 | this.restart(); 257 | this.m_Index = -1; 258 | this.m_IndexInSentence = -1; 259 | this.m_InstanceLength = -1; 260 | return true; 261 | } 262 | 263 | /** 264 | * check whether word is in stop word list or contains no alphabet 265 | * 266 | * @param p_Word 267 | * word 268 | * @return true if it should be filtered, else false 269 | */ 270 | public boolean filter(String p_Word) { 271 | return this.m_Filter.filter(p_Word); 272 | } 273 | 274 | /* 275 | * (non-Javadoc) 276 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCurrentInstance(int) 277 | */ 278 | @Override 279 | public boolean setCurrentInstance(int p_Index) { 280 | if (this.validIndex(p_Index)) { 281 | this.m_Index = p_Index; 282 | this.m_IndexInSentence = this.m_Corpus.getIndexInSentence(p_Index); 283 | this.m_InstanceLength = this.m_Corpus.getLength(p_Index); 284 | int currentSent = this.m_Corpus.getSentenceID(p_Index); 285 | this.m_Sentence = 
this.m_Corpus.getSentence(this.m_Corpus 286 | .getSentenceID(p_Index)); 287 | this.m_Synsets.clear(); 288 | this.m_SynsetVectors.clear(); 289 | this.m_SurroundingWordSet.clear(); 290 | this.m_SurroundingWordVector = new float[DIM_SIZE]; 291 | 292 | this.m_Lemma = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_LIDX); 293 | this.m_POS = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_PIDX); 294 | String posID = "%3"; 295 | 296 | if (this.m_POS.contains("NN")) 297 | posID = "%1"; 298 | else if (this.m_POS.contains("VB")) 299 | posID = "%2"; 300 | else if (this.m_POS.contains("JJ")) 301 | posID = "%3"; 302 | else if (this.m_POS.contains("RB")) 303 | posID = "%4"; 304 | else 305 | posID = "%"; 306 | // add possible synsets 307 | for (String key : wordVectors.keySet()) 308 | { 309 | if (key.startsWith(this.m_Lemma + posID) || key.contains("," + this.m_Lemma + posID) || 310 | key.startsWith(this.m_Lemma + ",") || key.contains("," + this.m_Lemma + ",") || 311 | key.equals(this.m_Lemma)) 312 | { 313 | this.m_Synsets.add(key); 314 | this.m_SynsetVectors.add(wordVectors.get(key)); 315 | } 316 | } 317 | 318 | String keyWord = null; 319 | int lower = this.m_Corpus.getLowerBoundary(currentSent); 320 | int upper = this.m_Corpus.getUpperBoundary(currentSent); 321 | for (int sentIdx = lower; sentIdx < upper; sentIdx++) { 322 | if (currentSent - sentIdx > this.m_Left 323 | || sentIdx - currentSent > this.m_Right) { 324 | continue; 325 | } 326 | ISentence sentence = this.m_Corpus.getSentence(sentIdx); 327 | if (sentence != null) { 328 | for (int i = 0; i < sentence.size(); i++) { 329 | keyWord = sentence.getItem(i).get(g_TIDX); 330 | if (this.filter(keyWord)) { 331 | continue; 332 | } 333 | keyWord = sentence.getItem(i).get(g_LIDX); 334 | if ((sentIdx != currentSent || i < this.m_IndexInSentence || i >= this.m_IndexInSentence + this.m_InstanceLength) 335 | && !this.m_SurroundingWordSet.contains(keyWord)) 336 | { 337 | this.m_SurroundingWordSet.add(keyWord); 338 | if (wordVectors.containsKey(keyWord)) 339 | { 340 | float[] vector = wordVectors.get(keyWord); 341 | for (int j = 0; j < vector.length; j++) { 342 | this.m_SurroundingWordVector[j] += vector[j]; 343 | } 344 | } 345 | } 346 | } 347 | } 348 | } 349 | this.restart(); 350 | return true; 351 | } 352 | return false; 353 | } 354 | 355 | } 356 | -------------------------------------------------------------------------------- /IMS Features/CSynsetProductFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IMS (It Makes Sense) -- NUS WSD System 3 | * Copyright (c) 2010 National University of Singapore. 4 | * All Rights Reserved. 5 | */ 6 | package sg.edu.nus.comp.nlp.ims.feature; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.HashSet; 14 | 15 | import sg.edu.nus.comp.nlp.ims.corpus.AItem; 16 | import sg.edu.nus.comp.nlp.ims.corpus.ICorpus; 17 | import sg.edu.nus.comp.nlp.ims.corpus.ISentence; 18 | import sg.edu.nus.comp.nlp.ims.util.CSurroundingWordFilter; 19 | 20 | /** 21 | * Synset Product feature extractor. 
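For each candidate synset of the target lemma, it emits one feature per vector dimension: the element-wise product of the summed surrounding-word vector and the synset vector in that dimension.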
22 | */ 23 | public class CSynsetProductFeatureExtractor implements IFeatureExtractor { 24 | 25 | // the Synsets and corresponding vectors 26 | protected ArrayList m_Synsets = new ArrayList(); 27 | protected ArrayList m_SynsetVectors = new ArrayList(); 28 | 29 | // corpus to be extracted 30 | protected ICorpus m_Corpus = null; 31 | 32 | // index of current instance 33 | protected int m_Index = -1; 34 | 35 | // current sentence to process 36 | protected ISentence m_Sentence = null; 37 | 38 | // item index in current sentence 39 | protected int m_IndexInSentence; 40 | 41 | // item length 42 | protected int m_InstanceLength; 43 | 44 | // index of Synset feature 45 | protected int m_FeatureIndex = -1; 46 | 47 | // sentence before current sentence 48 | protected int m_Left; 49 | 50 | // sentence after current sentence 51 | protected int m_Right; 52 | 53 | // surrounding words of current instance 54 | protected HashSet m_SurroundingWordSet = new HashSet(); 55 | 56 | // vector of surroundings word of current instance 57 | protected float[] m_SurroundingWordVector; 58 | 59 | // current lemma to process 60 | protected String m_Lemma; 61 | protected String m_POS; 62 | 63 | // stop words filter 64 | protected CSurroundingWordFilter m_Filter = CSurroundingWordFilter.getInstance(); 65 | 66 | // current feature 67 | protected IFeature m_CurrentFeature = null; 68 | 69 | protected static HashMap wordVectors = new HashMap(); 70 | 71 | protected static int g_LIDX = AItem.Features.LEMMA.ordinal(); 72 | protected static int g_TIDX = AItem.Features.TOKEN.ordinal(); 73 | protected static int g_PIDX = AItem.Features.POS.ordinal(); 74 | 75 | protected static int DIM_SIZE; 76 | 77 | 78 | /** 79 | * constructor 80 | */ 81 | public CSynsetProductFeatureExtractor() { 82 | 83 | createWordVectorSet(); 84 | 85 | this.m_Left = Integer.MAX_VALUE; 86 | this.m_Right = Integer.MAX_VALUE; 87 | } 88 | 89 | /* 90 | * (non-Javadoc) 91 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#getCurrentInstanceID() 92 | */ 93 | @Override 94 | public String getCurrentInstanceID() { 95 | if (this.validIndex(this.m_Index)) { 96 | return this.m_Corpus.getValue(this.m_Index, "id"); 97 | } 98 | return null; 99 | } 100 | 101 | /* 102 | * (non-Javadoc) 103 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#hasNext() 104 | */ 105 | @Override 106 | public boolean hasNext() { 107 | if (this.m_CurrentFeature != null) { 108 | return true; 109 | } 110 | if (this.validIndex(this.m_Index)) { 111 | this.m_CurrentFeature = this.getNext(); 112 | if (this.m_CurrentFeature != null) { 113 | return true; 114 | } 115 | } 116 | return false; 117 | } 118 | 119 | /** 120 | * get the next feature of current instance 121 | * 122 | * @return feature 123 | */ 124 | protected IFeature getNext() { 125 | IFeature feature = null; 126 | if (this.m_FeatureIndex >= 0 && this.m_FeatureIndex < this.m_Synsets.size() * DIM_SIZE) { 127 | feature = new CDoubleFeature(); 128 | int index = this.m_FeatureIndex / DIM_SIZE; 129 | int dimension = this.m_FeatureIndex % DIM_SIZE; 130 | feature.setKey(dimension + "_" + this.m_Synsets.get(index)); 131 | feature.setValue(this.getSynsetFeature(index, dimension)); 132 | this.m_FeatureIndex++; 133 | } 134 | return feature; 135 | } 136 | 137 | /** 138 | * get the part-of-speech of item p_Index + m_IndexInSentence 139 | * 140 | * @param p_Index 141 | * index 142 | * @return feature value 143 | */ 144 | protected String getSynsetFeature(int index, int dimension) { 145 | 146 | float result = this.m_SurroundingWordVector[dimension] 
* this.m_SynsetVectors.get(index)[dimension]; 147 | return Float.toString(result); 148 | } 149 | 150 | private void createWordVectorSet() 151 | { 152 | if (wordVectors.size() > 0) 153 | return; 154 | 155 | // path to word and synset vectors 156 | String path = sg.edu.nus.comp.nlp.ims.implement.CTester.svFile; 157 | 158 | System.err.println("Reading word and synsets vector from:"); 159 | System.err.println(path); 160 | 161 | BufferedReader br = null; 162 | try 163 | { 164 | br = new BufferedReader(new FileReader(path));; 165 | 166 | String key = null; 167 | 168 | String line = br.readLine(); 169 | String[] lineSplited = line.split(" "); 170 | 171 | DIM_SIZE = Integer.parseInt(lineSplited[1]); 172 | 173 | while ((line = br.readLine()) != null) { 174 | 175 | lineSplited = line.split(" "); 176 | 177 | key = lineSplited[0]; 178 | 179 | float vector[] = new float[DIM_SIZE]; 180 | 181 | for (int j = 0; j < DIM_SIZE; j++) { 182 | vector[j] += Float.parseFloat(lineSplited[j + 1]); 183 | } 184 | 185 | wordVectors.put(key, vector); 186 | } 187 | 188 | } catch (IOException e) 189 | { 190 | e.printStackTrace(); 191 | } 192 | 193 | System.err.println("Done!"); 194 | } 195 | 196 | /** 197 | * check the validity of index 198 | * 199 | * @param p_Index 200 | * index 201 | * @return valid or not 202 | */ 203 | protected boolean validIndex(int p_Index) { 204 | if (this.m_Corpus != null && this.m_Corpus.size() > p_Index 205 | && p_Index >= 0) { 206 | return true; 207 | } 208 | return false; 209 | } 210 | 211 | /* 212 | * (non-Javadoc) 213 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#next() 214 | */ 215 | @Override 216 | public IFeature next() { 217 | IFeature feature = null; 218 | if (this.hasNext()) { 219 | feature = this.m_CurrentFeature; 220 | this.m_CurrentFeature = null; 221 | } 222 | return feature; 223 | } 224 | 225 | /* 226 | * (non-Javadoc) 227 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#restart() 228 | */ 229 | @Override 230 | public boolean restart() { 231 | this.m_FeatureIndex = 0; 232 | this.m_CurrentFeature = null; 233 | return this.validIndex(this.m_Index); 234 | } 235 | 236 | /* 237 | * (non-Javadoc) 238 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCorpus(sg.edu.nus.comp.nlp.ims.corpus.ICorpus) 239 | */ 240 | @Override 241 | public boolean setCorpus(ICorpus p_Corpus) { 242 | if (p_Corpus == null) { 243 | return false; 244 | } 245 | this.m_Corpus = p_Corpus; 246 | this.m_Index = 0; 247 | this.restart(); 248 | this.m_Index = -1; 249 | this.m_IndexInSentence = -1; 250 | this.m_InstanceLength = -1; 251 | return true; 252 | } 253 | 254 | /** 255 | * check whether word is in stop word list or contains no alphabet 256 | * 257 | * @param p_Word 258 | * word 259 | * @return true if it should be filtered, else false 260 | */ 261 | public boolean filter(String p_Word) { 262 | return this.m_Filter.filter(p_Word); 263 | } 264 | 265 | /* 266 | * (non-Javadoc) 267 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCurrentInstance(int) 268 | */ 269 | @Override 270 | public boolean setCurrentInstance(int p_Index) { 271 | if (this.validIndex(p_Index)) { 272 | this.m_Index = p_Index; 273 | this.m_IndexInSentence = this.m_Corpus.getIndexInSentence(p_Index); 274 | this.m_InstanceLength = this.m_Corpus.getLength(p_Index); 275 | int currentSent = this.m_Corpus.getSentenceID(p_Index); 276 | this.m_Sentence = this.m_Corpus.getSentence(this.m_Corpus 277 | .getSentenceID(p_Index)); 278 | this.m_Synsets.clear(); 279 | this.m_SynsetVectors.clear(); 280 | 
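// the surrounding-word set and context vector are rebuilt below by summing the
// embeddings of all unfiltered lemmas within the allowed sentence window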
this.m_SurroundingWordSet.clear(); 281 | this.m_SurroundingWordVector = new float[DIM_SIZE]; 282 | 283 | this.m_Lemma = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_LIDX); 284 | this.m_POS = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_PIDX); 285 | String posID = "%3"; 286 | 287 | if (this.m_POS.contains("NN")) 288 | posID = "%1"; 289 | else if (this.m_POS.contains("VB")) 290 | posID = "%2"; 291 | else if (this.m_POS.contains("JJ")) 292 | posID = "%3"; 293 | else if (this.m_POS.contains("RB")) 294 | posID = "%4"; 295 | else 296 | posID = "%"; 297 | // add possible synsets 298 | for (String key : wordVectors.keySet()) 299 | { 300 | if (key.startsWith(this.m_Lemma + posID) || key.contains("," + this.m_Lemma + posID) || 301 | key.startsWith(this.m_Lemma + ",") || key.contains("," + this.m_Lemma + ",") || 302 | key.equals(this.m_Lemma)) 303 | { 304 | this.m_Synsets.add(key); 305 | this.m_SynsetVectors.add(wordVectors.get(key)); 306 | } 307 | } 308 | 309 | String keyWord = null; 310 | int lower = this.m_Corpus.getLowerBoundary(currentSent); 311 | int upper = this.m_Corpus.getUpperBoundary(currentSent); 312 | for (int sentIdx = lower; sentIdx < upper; sentIdx++) { 313 | if (currentSent - sentIdx > this.m_Left 314 | || sentIdx - currentSent > this.m_Right) { 315 | continue; 316 | } 317 | ISentence sentence = this.m_Corpus.getSentence(sentIdx); 318 | if (sentence != null) { 319 | for (int i = 0; i < sentence.size(); i++) { 320 | keyWord = sentence.getItem(i).get(g_TIDX); 321 | if (this.filter(keyWord)) { 322 | continue; 323 | } 324 | keyWord = sentence.getItem(i).get(g_LIDX); 325 | if ((sentIdx != currentSent || i < this.m_IndexInSentence || i >= this.m_IndexInSentence + this.m_InstanceLength) 326 | && !this.m_SurroundingWordSet.contains(keyWord)) 327 | { 328 | this.m_SurroundingWordSet.add(keyWord); 329 | if (wordVectors.containsKey(keyWord)) 330 | { 331 | float[] vector = wordVectors.get(keyWord); 332 | for (int j = 0; j < vector.length; j++) { 333 | this.m_SurroundingWordVector[j] += vector[j]; 334 | } 335 | } 336 | } 337 | } 338 | } 339 | } 340 | this.restart(); 341 | return true; 342 | } 343 | return false; 344 | } 345 | 346 | } 347 | -------------------------------------------------------------------------------- /IMS Features/CSynsetRawFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IMS (It Makes Sense) -- NUS WSD System 3 | * Copyright (c) 2010 National University of Singapore. 4 | * All Rights Reserved. 5 | */ 6 | package sg.edu.nus.comp.nlp.ims.feature; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.HashSet; 14 | 15 | import sg.edu.nus.comp.nlp.ims.corpus.AItem; 16 | import sg.edu.nus.comp.nlp.ims.corpus.ICorpus; 17 | import sg.edu.nus.comp.nlp.ims.corpus.ISentence; 18 | import sg.edu.nus.comp.nlp.ims.util.CSurroundingWordFilter; 19 | 20 | /** 21 | * Synset Raw feature extractor. 
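It emits the raw vector components as features: first the summed surrounding-word vector (keyed "<dim>_sentence"), then each candidate synset's vector, one feature per dimension.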
22 | */
23 | public class CSynsetRawFeatureExtractor implements IFeatureExtractor {
24 | 
25 |     // the Synsets and corresponding vectors
26 |     protected ArrayList<String> m_Synsets = new ArrayList<String>();
27 |     protected ArrayList<float[]> m_SynsetVectors = new ArrayList<float[]>();
28 | 
29 |     // corpus to be extracted
30 |     protected ICorpus m_Corpus = null;
31 | 
32 |     // index of current instance
33 |     protected int m_Index = -1;
34 | 
35 |     // current sentence to process
36 |     protected ISentence m_Sentence = null;
37 | 
38 |     // item index in current sentence
39 |     protected int m_IndexInSentence;
40 | 
41 |     // item length
42 |     protected int m_InstanceLength;
43 | 
44 |     // index of Synset feature
45 |     protected int m_FeatureIndex = -1;
46 | 
47 |     // number of sentences before the current sentence to consider
48 |     protected int m_Left;
49 | 
50 |     // number of sentences after the current sentence to consider
51 |     protected int m_Right;
52 | 
53 |     // surrounding words of current instance
54 |     protected HashSet<String> m_SurroundingWordSet = new HashSet<String>();
55 | 
56 |     // sum vector of the surrounding words of the current instance
57 |     protected float[] m_SurroundingWordVector;
58 | 
59 |     // current lemma to process
60 |     protected String m_Lemma;
61 |     protected String m_POS;
62 | 
63 |     // stop words filter
64 |     protected CSurroundingWordFilter m_Filter = CSurroundingWordFilter.getInstance();
65 | 
66 |     // current feature
67 |     protected IFeature m_CurrentFeature = null;
68 | 
69 |     protected static HashMap<String, float[]> wordVectors = new HashMap<String, float[]>();
70 | 
71 |     protected static int g_LIDX = AItem.Features.LEMMA.ordinal();
72 |     protected static int g_TIDX = AItem.Features.TOKEN.ordinal();
73 |     protected static int g_PIDX = AItem.Features.POS.ordinal();
74 | 
75 |     protected static int DIM_SIZE;
76 | 
77 | 
78 |     /**
79 |      * constructor
80 |      */
81 |     public CSynsetRawFeatureExtractor() {
82 | 
83 |         createWordVectorSet();
84 | 
85 |         this.m_Left = Integer.MAX_VALUE;
86 |         this.m_Right = Integer.MAX_VALUE;
87 |     }
88 | 
89 |     /*
90 |      * (non-Javadoc)
91 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#getCurrentInstanceID()
92 |      */
93 |     @Override
94 |     public String getCurrentInstanceID() {
95 |         if (this.validIndex(this.m_Index)) {
96 |             return this.m_Corpus.getValue(this.m_Index, "id");
97 |         }
98 |         return null;
99 |     }
100 | 
101 |     /*
102 |      * (non-Javadoc)
103 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#hasNext()
104 |      */
105 |     @Override
106 |     public boolean hasNext() {
107 |         if (this.m_CurrentFeature != null) {
108 |             return true;
109 |         }
110 |         if (this.validIndex(this.m_Index)) {
111 |             this.m_CurrentFeature = this.getNext();
112 |             if (this.m_CurrentFeature != null) {
113 |                 return true;
114 |             }
115 |         }
116 |         return false;
117 |     }
118 | 
119 |     /**
120 |      * get the next feature of current instance
121 |      *
122 |      * @return feature
123 |      */
124 |     protected IFeature getNext() {
125 |         IFeature feature = null;
126 |         if (this.m_FeatureIndex >= 0 && this.m_FeatureIndex < (this.m_Synsets.size() + 1) * DIM_SIZE) {
127 |             feature = new CDoubleFeature();
128 |             int index = (this.m_FeatureIndex / DIM_SIZE) - 1;
129 |             int dimension = this.m_FeatureIndex % DIM_SIZE;
130 |             if (index == -1)
131 |                 feature.setKey(dimension + "_sentence");
132 |             else
133 |                 feature.setKey(dimension + "_" + this.m_Synsets.get(index));
134 |             feature.setValue(this.getSynsetFeature(index, dimension));
135 |             this.m_FeatureIndex++;
136 |         }
137 |         return feature;
138 |     }
139 | 
140 |     /**
141 |      * get the feature value of the synset at position index (index -1
142 |      * selects the sentence context vector) in the given dimension
143 |      *
144 |      * @param index synset index, -1 for the sentence vector
145 |      * @return feature value
146 |      */
147 |     protected String getSynsetFeature(int index, int dimension) {
148 | 
149 |         if (index == -1)
150 |             return Float.toString(this.m_SurroundingWordVector[dimension]);
151 |         else
152 |             return Float.toString(this.m_SynsetVectors.get(index)[dimension]);
153 |     }
154 | 
155 |     private void createWordVectorSet()
156 |     {
157 |         if (wordVectors.size() > 0)
158 |             return;
159 | 
160 |         // path to word and synset vectors
161 |         String path = sg.edu.nus.comp.nlp.ims.implement.CTester.svFile;
162 | 
163 |         BufferedReader br = null;
164 |         try
165 |         {
166 |             br = new BufferedReader(new FileReader(path));
167 | 
168 |             String key = null;
169 | 
170 |             String line = br.readLine();
171 |             String[] lineSplited = line.split(" ");
172 | 
173 |             DIM_SIZE = Integer.parseInt(lineSplited[1]);
174 | 
175 |             while ((line = br.readLine()) != null) {
176 | 
177 |                 lineSplited = line.split(" ");
178 | 
179 |                 key = lineSplited[0];
180 | 
181 |                 float[] vector = new float[DIM_SIZE];
182 | 
183 |                 for (int j = 0; j < DIM_SIZE; j++) {
184 |                     vector[j] = Float.parseFloat(lineSplited[j + 1]);
185 |                 }
186 | 
187 |                 wordVectors.put(key, vector);
188 |             }
189 |             br.close();
190 |         } catch (IOException e)
191 |         {
192 |             e.printStackTrace();
193 |         }
194 |     }
195 | 
196 |     /**
197 |      * check the validity of index
198 |      *
199 |      * @param p_Index
200 |      *            index
201 |      * @return valid or not
202 |      */
203 |     protected boolean validIndex(int p_Index) {
204 |         if (this.m_Corpus != null && this.m_Corpus.size() > p_Index
205 |                 && p_Index >= 0) {
206 |             return true;
207 |         }
208 |         return false;
209 |     }
210 | 
211 |     /*
212 |      * (non-Javadoc)
213 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#next()
214 |      */
215 |     @Override
216 |     public IFeature next() {
217 |         IFeature feature = null;
218 |         if (this.hasNext()) {
219 |             feature = this.m_CurrentFeature;
220 |             this.m_CurrentFeature = null;
221 |         }
222 |         return feature;
223 |     }
224 | 
225 |     /*
226 |      * (non-Javadoc)
227 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#restart()
228 |      */
229 |     @Override
230 |     public boolean restart() {
231 |         this.m_FeatureIndex = 0;
232 |         this.m_CurrentFeature = null;
233 |         return this.validIndex(this.m_Index);
234 |     }
235 | 
236 |     /*
237 |      * (non-Javadoc)
238 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCorpus(sg.edu.nus.comp.nlp.ims.corpus.ICorpus)
239 |      */
240 |     @Override
241 |     public boolean setCorpus(ICorpus p_Corpus) {
242 |         if (p_Corpus == null) {
243 |             return false;
244 |         }
245 |         this.m_Corpus = p_Corpus;
246 |         this.m_Index = 0;
247 |         this.restart();
248 |         this.m_Index = -1;
249 |         this.m_IndexInSentence = -1;
250 |         this.m_InstanceLength = -1;
251 |         return true;
252 |     }
253 | 
254 |     /**
255 |      * check whether word is in stop word list or contains no alphabet
256 |      *
257 |      * @param p_Word
258 |      *            word
259 |      * @return true if it should be filtered, else false
260 |      */
261 |     public boolean filter(String p_Word) {
262 |         return this.m_Filter.filter(p_Word);
263 |     }
264 | 
265 |     /*
266 |      * (non-Javadoc)
267 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCurrentInstance(int)
268 |      */
269 |     @Override
270 |     public boolean setCurrentInstance(int p_Index) {
271 |         if (this.validIndex(p_Index)) {
272 |             this.m_Index = p_Index;
273 |             this.m_IndexInSentence = this.m_Corpus.getIndexInSentence(p_Index);
274 |             this.m_InstanceLength = this.m_Corpus.getLength(p_Index);
275 |             int currentSent = this.m_Corpus.getSentenceID(p_Index);
276 |             this.m_Sentence = this.m_Corpus.getSentence(currentSent);
277 | 
278 |             this.m_Synsets.clear();
279 |             this.m_SynsetVectors.clear();
280 |             this.m_SurroundingWordSet.clear();
281 |             this.m_SurroundingWordVector = new float[DIM_SIZE];
282 | 
283 |             this.m_Lemma = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_LIDX);
284 |             this.m_POS = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_PIDX);
285 |             String posID;
286 |             // WordNet sense-key POS ids: %1 noun, %2 verb, %3 adjective, %4 adverb
287 |             if (this.m_POS.contains("NN"))
288 |                 posID = "%1";
289 |             else if (this.m_POS.contains("VB"))
290 |                 posID = "%2";
291 |             else if (this.m_POS.contains("JJ"))
292 |                 posID = "%3";
293 |             else if (this.m_POS.contains("RB"))
294 |                 posID = "%4";
295 |             else
296 |                 posID = "%";
297 |             // add possible synsets
298 |             for (String key : wordVectors.keySet())
299 |             {
300 |                 if (key.startsWith(this.m_Lemma + posID) || key.contains("," + this.m_Lemma + posID) ||
301 |                     key.startsWith(this.m_Lemma + ",") || key.contains("," + this.m_Lemma + ",") ||
302 |                     key.equals(this.m_Lemma))
303 |                 {
304 |                     this.m_Synsets.add(key);
305 |                     this.m_SynsetVectors.add(wordVectors.get(key));
306 |                 }
307 |             }
308 | 
309 |             String keyWord = null;
310 |             int lower = this.m_Corpus.getLowerBoundary(currentSent);
311 |             int upper = this.m_Corpus.getUpperBoundary(currentSent);
312 |             for (int sentIdx = lower; sentIdx < upper; sentIdx++) {
313 |                 if (currentSent - sentIdx > this.m_Left
314 |                         || sentIdx - currentSent > this.m_Right) {
315 |                     continue;
316 |                 }
317 |                 ISentence sentence = this.m_Corpus.getSentence(sentIdx);
318 |                 if (sentence != null) {
319 |                     for (int i = 0; i < sentence.size(); i++) {
320 |                         keyWord = sentence.getItem(i).get(g_TIDX);
321 |                         if (this.filter(keyWord)) {
322 |                             continue;
323 |                         }
324 |                         keyWord = sentence.getItem(i).get(g_LIDX);
325 |                         if ((sentIdx != currentSent || i < this.m_IndexInSentence || i >= this.m_IndexInSentence + this.m_InstanceLength)
326 |                                 && !this.m_SurroundingWordSet.contains(keyWord))
327 |                         {
328 |                             this.m_SurroundingWordSet.add(keyWord);
329 |                             if (wordVectors.containsKey(keyWord))
330 |                             {
331 |                                 float[] vector = wordVectors.get(keyWord);
332 |                                 for (int j = 0; j < vector.length; j++) {
333 |                                     this.m_SurroundingWordVector[j] += vector[j];
334 |                                 }
335 |                             }
336 |                         }
337 |                     }
338 |                 }
339 |             }
340 |             this.restart();
341 |             return true;
342 |         }
343 |         return false;
344 |     }
345 | 
346 | }
347 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 casaro
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
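The feature layout produced by CSynsetRawFeatureExtractor.getNext() above is worth spelling out: feature indices enumerate one sentence-context vector followed by one vector per candidate synset, DIM_SIZE values each. A minimal, self-contained sketch of that index arithmetic (the class name FeatureLayoutDemo and the synset IDs are illustrative, not part of the project):

public class FeatureLayoutDemo {
    public static void main(String[] args) {
        int DIM_SIZE = 3; // toy dimensionality; the real value comes from the vector file header
        String[] synsets = { "wn-3.0-00001740-n", "wn-3.0-00002098-n" }; // made-up candidates
        int total = (synsets.length + 1) * DIM_SIZE;
        for (int f = 0; f < total; f++) {
            int index = (f / DIM_SIZE) - 1;  // -1 selects the sentence-context vector
            int dimension = f % DIM_SIZE;
            String key = (index == -1)
                    ? dimension + "_sentence"
                    : dimension + "_" + synsets[index];
            System.out.println(f + " -> " + key); // e.g. "0 -> 0_sentence", "3 -> 0_wn-3.0-00001740-n"
        }
    }
}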
22 | 
--------------------------------------------------------------------------------
/WordNetExtractor/Shared.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedInputStream;
2 | import java.io.BufferedOutputStream;
3 | import java.io.BufferedReader;
4 | import java.io.DataInputStream;
5 | import java.io.DataOutputStream;
6 | import java.io.FileInputStream;
7 | import java.io.FileNotFoundException;
8 | import java.io.FileOutputStream;
9 | import java.io.FileReader;
10 | import java.io.IOException;
11 | import java.io.InputStream;
12 | import java.io.PrintWriter;
13 | import java.io.UnsupportedEncodingException;
14 | import java.util.HashMap;
15 | import java.util.Iterator;
16 | import java.util.Map;
17 | import java.util.Map.Entry;
18 | 
19 | 
20 | public class Shared
21 | {
22 |     public static HashMap<String, float[]> WordMap = new HashMap<String, float[]>();
23 | 
24 |     public static int words;
25 |     public static int size;
26 | 
27 |     private static final int MAX_SIZE = 50;
28 | 
29 |     public static void loadTxtModel(String path)
30 |     {
31 |         BufferedReader br = null;
32 |         try
33 |         {
34 |             br = new BufferedReader(new FileReader(path));
35 | 
36 |             String line = br.readLine();
37 |             String[] lineSplited = line.split(" ");
38 | 
39 |             words = Integer.parseInt(lineSplited[0]);
40 |             size = Integer.parseInt(lineSplited[1]);
41 | 
42 |             float vector = 0;
43 | 
44 |             String key = null;
45 |             float[] value = null;
46 |             for (int i = 0; i < words; i++) {
47 | 
48 |                 line = br.readLine();
49 |                 lineSplited = line.split(" ");
50 | 
51 |                 key = lineSplited[0];
52 |                 value = new float[size];
53 |                 for (int j = 0; j < size; j++) {
54 |                     vector = Float.parseFloat(lineSplited[j + 1]);
55 |                     value[j] = vector;
56 |                 }
57 | 
58 |                 WordMap.put(key, value);
59 |             }
60 |             br.close();
61 |         } catch (FileNotFoundException e)
62 |         {
63 |             e.printStackTrace();
64 |         } catch (IOException e)
65 |         {
66 |             e.printStackTrace();
67 |         }
68 |     }
69 | 
70 |     public static void loadGoogleModel(String path)
71 |     {
72 |         DataInputStream dis = null;
73 |         BufferedInputStream bis = null;
74 |         double len = 0;
75 |         float vector = 0;
76 |         try
77 |         {
78 |             bis = new BufferedInputStream(new FileInputStream(path));
79 |             dis = new DataInputStream(bis);
80 | 
81 |             words = Integer.parseInt(readString(dis));
82 |             size = Integer.parseInt(readString(dis));
83 | 
84 |             String key;
85 |             float[] value = null;
86 |             float[] valueN = null;
87 |             for (int i = 0; i < words; i++)
88 |             {
89 |                 key = readString(dis);
90 |                 value = new float[size];
91 |                 valueN = new float[size];
92 |                 len = 0;
93 |                 for (int j = 0; j < size; j++)
94 |                 {
95 |                     vector = readFloat(dis);
96 |                     len += vector * vector;
97 |                     value[j] = (float) vector;
98 |                 }
99 |                 len = Math.sqrt(len);
100 | 
101 |                 for (int j = 0; j < size; j++)
102 |                 {
103 |                     valueN[j] = value[j] / (float) len;
104 |                 }
105 | 
106 |                 WordMap.put(key, value); // note: the normalized copy valueN is computed but never stored
107 |             }
108 | 
109 |             bis.close();
110 |             dis.close();
111 |         } catch (FileNotFoundException e)
112 |         {
113 |             e.printStackTrace();
114 |         } catch (IOException e)
115 |         {
116 |             e.printStackTrace();
117 |         }
118 |     }
119 | 
120 |     public static void saveGoogleModel(String path)
121 |     {
122 |         DataOutputStream dos = null;
123 |         BufferedOutputStream bos = null;
124 | 
125 |         try
126 |         {
127 |             bos = new BufferedOutputStream(new FileOutputStream(path));
128 |             dos = new DataOutputStream(bos);
129 | 
130 |             dos.writeBytes(Integer.toString(words));
131 |             dos.writeByte(' ');
132 |             dos.writeBytes(Integer.toString(size));
133 |             dos.writeByte('\n');
134 | 
135 |             Iterator<Map.Entry<String, float[]>> it = WordMap.entrySet().iterator();
136 |             while (it.hasNext())
137 |             {
138 |                 Map.Entry<String, float[]> pairs = it.next();
139 |                 String key = pairs.getKey();
140 |                 float[] value = pairs.getValue();
141 | 
142 |                 dos.writeBytes(key);
143 |                 dos.writeByte(' ');
144 | 
145 |                 for (int j = 0; j < size; j++)
146 |                 {
147 |                     //dos.writeFloat(value[j]); // would write big-endian
148 |                     dos.writeInt(Integer.reverseBytes(Float.floatToIntBits(value[j]))); // little-endian, as word2vec expects
149 |                 }
150 |                 it.remove(); // avoids a ConcurrentModificationException
151 |             }
152 | 
153 |             bos.close();
154 |             dos.close();
155 |         } catch (FileNotFoundException e)
156 |         {
157 |             e.printStackTrace();
158 |         } catch (IOException e)
159 |         {
160 |             e.printStackTrace();
161 |         }
162 |     }
163 | 
164 |     public static void convertGoogleModel(String path, String filename)
165 |     {
166 |         PrintWriter writer;
167 |         DataInputStream dis = null;
168 |         BufferedInputStream bis = null;
169 | 
170 |         float vector = 0;
171 |         try
172 |         {
173 |             bis = new BufferedInputStream(new FileInputStream(path));
174 |             dis = new DataInputStream(bis);
175 |             writer = new PrintWriter(filename, "UTF-8");
176 | 
177 |             words = Integer.parseInt(readString(dis));
178 |             size = Integer.parseInt(readString(dis));
179 | 
180 |             String key;
181 |             float[] value = null;
182 |             float[] valueUnknown = new float[size];
183 |             for (int i = 0; i < words; i++)
184 |             {
185 |                 key = readString(dis);
186 |                 key = key.toLowerCase();
187 |                 value = new float[size];
188 |                 for (int j = 0; j < size; j++)
189 |                 {
190 |                     vector = readFloat(dis);
191 |                     value[j] = (float) vector;
192 |                     if (i >= words-100000)
193 |                         valueUnknown[j] += ((float) vector / 100000); // average over the last (rarest) 100000 words
194 |                 }
195 | 
196 |                 if (WordMap.containsKey(key))
197 |                     continue;
198 | 
199 |                 writer.print(key + " ");
200 |                 writer.print(getVectorAsString(value) + "\n");
201 | 
202 |                 WordMap.put(key, value);
203 |             }
204 | 
205 |             WordMap.put("<unknown>", valueUnknown); // assumption: the angle-bracket token was lost in extraction
206 |             writer.print("<unknown>" + " ");
207 |             writer.print(getVectorAsString(valueUnknown) + "\n");
208 | 
209 |             bis.close();
210 |             dis.close();
211 |             writer.close();
212 |         } catch (FileNotFoundException e)
213 |         {
214 |             e.printStackTrace();
215 |         } catch (IOException e)
216 |         {
217 |             e.printStackTrace();
218 |         }
219 | 
220 |         System.out.printf("%8d / %8d\n", WordMap.size(), words);
221 |     }
222 | 
223 |     public static void saveTxtModel(String filename)
224 |     {
225 |         // create file
226 |         PrintWriter writer;
227 |         try
228 |         {
229 |             writer = new PrintWriter(filename, "UTF-8");
230 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
231 |         {
232 |             e.printStackTrace();
233 | 
234 |             return;
235 |         }
236 | 
237 |         writer.print(Integer.toString(words));
238 |         writer.print(" ");
239 |         writer.print(Integer.toString(size));
240 |         writer.print("\n");
241 | 
242 |         // loop through all words
243 |         Iterator<Map.Entry<String, float[]>> it = WordMap.entrySet().iterator();
244 |         while (it.hasNext())
245 |         {
246 |             Map.Entry<String, float[]> pairs = it.next();
247 |             String key = pairs.getKey();
248 |             float[] value = pairs.getValue();
249 | 
250 |             writer.print(key + " ");
251 |             writer.print(getVectorAsString(value) + "\n");
252 |         }
253 | 
254 |         writer.close();
255 |     }
256 | 
257 |     public static String getVectorAsString(float[] vector)
258 |     {
259 |         StringBuilder sb = new StringBuilder();
260 | 
261 |         for (int b = 0; b < size; b++)
262 |         {
263 |             sb.append(vector[b]);
264 |             sb.append(" ");
265 |         }
266 | 
267 |         return sb.toString().trim();
268 |     }
269 | 
270 |     private static float readFloat(InputStream is)
271 |     {
272 |         byte[] bytes = new byte[4];
273 |         try
274 |         {
275 |             is.read(bytes);
276 |         } catch (IOException e)
277 |         {
278 |             e.printStackTrace();
279 |         }
280 |         float f = getFloat(bytes);
281 |         return f;
282 |     }
283 | 
284 |     private static float getFloat(byte[] b)
285 |     {
286 |         int accum = 0;
287 |         accum = accum | (b[0] & 0xff) << 0;
288 |         accum = accum | (b[1] & 0xff) << 8;
289 |         accum = accum | (b[2] & 0xff) << 16;
290 |         accum = accum | (b[3] & 0xff) << 24;
291 |         return Float.intBitsToFloat(accum); // little-endian byte order
292 |     }
293 | 
294 |     private static String readString(DataInputStream dis)
295 |     {
296 |         byte[] bytes = new byte[MAX_SIZE];
297 |         StringBuilder sb = new StringBuilder();
298 |         try
299 |         {
300 |             byte b = dis.readByte();
301 |             int i = -1;
302 | 
303 |             if (b == 10) // skip a leading newline
304 |                 b = dis.readByte();
305 | 
306 |             while (b != 32 && b != 10) // read until space or newline
307 |             {
308 |                 i++;
309 |                 bytes[i] = b;
310 |                 b = dis.readByte();
311 |                 if (i == MAX_SIZE - 1)
312 |                 {
313 |                     sb.append(new String(bytes));
314 |                     i = -1;
315 |                     bytes = new byte[MAX_SIZE];
316 |                 }
317 |             }
318 |             sb.append(new String(bytes, 0, i + 1));
319 | 
320 |         } catch (IOException e)
321 |         {
322 |             e.printStackTrace();
323 |         }
324 |         String s = sb.toString();
325 |         return s;
326 |     }
327 | 
328 |     public static void createSyntacticVec(String path, String pathLemmaMap, String filename)
329 |     {
330 |         // create file
331 |         PrintWriter writer;
332 |         try
333 |         {
334 |             writer = new PrintWriter(filename, "UTF-8");
335 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
336 |         {
337 |             e.printStackTrace();
338 | 
339 |             return;
340 |         }
341 | 
342 |         HashMap<String, float[]> leadingLemma = new HashMap<String, float[]>();
343 | 
344 |         DataInputStream dis = null;
345 |         BufferedInputStream bis = null;
346 |         BufferedReader br = null;
347 |         double len = 0;
348 |         float vector = 0;
349 |         try
350 |         {
351 |             bis = new BufferedInputStream(new FileInputStream(path));
352 |             dis = new DataInputStream(bis);
353 |             br = new BufferedReader(new FileReader(pathLemmaMap));
354 | 
355 |             words = Integer.parseInt(readString(dis));
356 |             size = Integer.parseInt(readString(dis));
357 | 
358 |             String key;
359 |             float[] value = null;
360 |             for (int i = 0; i < words; i++)
361 |             {
362 |                 key = readString(dis);
363 |                 value = new float[size];
364 |                 len = 0;
365 |                 for (int j = 0; j < size; j++)
366 |                 {
367 |                     vector = readFloat(dis);
368 |                     len += vector * vector;
369 |                     value[j] = vector;
370 |                 }
371 |                 len = Math.sqrt(len); // vector length; computed but not used below
372 | 
373 |                 String line = br.readLine();
374 |                 String keyLemma = line.split("\t")[1]; // skip entries whose second column is empty
375 | 
376 |                 if (keyLemma.equals(""))
377 |                     continue;
378 | 
379 |                 keyLemma = line.split("\t")[0]; // use the first column as the lemma key
380 | 
381 |                 if (leadingLemma.containsKey(keyLemma))
382 |                 {
383 |                     float[] diff = leadingLemma.get(keyLemma);
384 |                     for (int j = 0; j < size; j++)
385 |                     {
386 |                         diff[j] -= value[j]; // note: mutates the vector stored in leadingLemma
387 |                     }
388 |                     writer.print(normalizeLemma(key) + " ");
389 |                     writer.print(getVectorAsString(diff) + "\n");
390 |                 }
391 |                 else
392 |                 {
393 |                     leadingLemma.put(keyLemma, value);
394 |                 }
395 |             }
396 |             br.close();
397 |             writer.close();
398 |             bis.close();
399 |             dis.close();
400 |         } catch (FileNotFoundException e)
401 |         {
402 |             e.printStackTrace();
403 |         } catch (IOException e)
404 |         {
405 |             e.printStackTrace();
406 |         }
407 |     }
408 | 
409 |     public static String normalizeText(String s)
410 |     {
411 |         s = s.replace('’', '\'');
412 |         s = s.replace('′', '\'');
413 |         s = s.replace("''", " ");
414 |         s = s.replace("'", " ' ");
415 |         s = s.replace('“', '"');
416 |         s = s.replace('”', '"');
417 |         s = s.replace("\"", " \" ");
418 |         s = s.replace(".", " . ");
419 |         s = s.replace(",", " , ");
420 |         s = s.replace("(", " ( ");
421 |         s = s.replace(")", " ) ");
422 |         s = s.replace("!", " ! ");
"); 423 | s = s.replace(';', ' '); 424 | s = s.replace(':', ' '); 425 | s = s.replace("-", " - "); 426 | s = s.replace('=', ' '); 427 | s = s.replace('*', ' '); 428 | s = s.replace('|', ' '); 429 | s = s.replace('«', ' '); 430 | s = s.replace(" ", " "); 431 | s = s.replace(" ", " "); 432 | 433 | s = s.trim(); 434 | 435 | s = s.toLowerCase(); 436 | 437 | return s; 438 | } 439 | 440 | public static String normalizeLemma(String s) 441 | { 442 | s = s.replaceAll("\\(..?\\)", ""); 443 | 444 | s = normalizeText(s); 445 | 446 | s = s.replace(" ", "_"); 447 | 448 | return s; 449 | } 450 | } 451 | -------------------------------------------------------------------------------- /WordNetExtractor/WordNetExtractor.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedInputStream; 2 | import java.io.BufferedOutputStream; 3 | import java.io.BufferedReader; 4 | import java.io.DataInputStream; 5 | import java.io.DataOutputStream; 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.FileNotFoundException; 9 | import java.io.FileOutputStream; 10 | import java.io.FileReader; 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.io.PrintWriter; 14 | import java.io.UnsupportedEncodingException; 15 | import java.util.HashMap; 16 | import java.util.HashSet; 17 | import java.util.Iterator; 18 | import java.util.List; 19 | import java.util.Map; 20 | import java.util.Map.Entry; 21 | import java.util.Scanner; 22 | 23 | import net.didion.jwnl.JWNL; 24 | import net.didion.jwnl.JWNLException; 25 | import net.didion.jwnl.data.IndexWord; 26 | import net.didion.jwnl.data.IndexWordSet; 27 | import net.didion.jwnl.data.POS; 28 | import net.didion.jwnl.data.Pointer; 29 | import net.didion.jwnl.data.PointerType; 30 | import net.didion.jwnl.data.Synset; 31 | import net.didion.jwnl.data.Word; 32 | import net.didion.jwnl.dictionary.Dictionary; 33 | import net.didion.jwnl.dictionary.MorphologicalProcessor; 34 | 35 | @SuppressWarnings("unchecked") 36 | public class WordNetExtractor 37 | { 38 | private static HashMap WordIndex = new HashMap(); 39 | private static HashMap SynsetIndex = new HashMap(); 40 | private static Dictionary dictionary; 41 | 42 | public static void main(String[] args) throws IOException, JWNLException 43 | { 44 | // path to JWNL prop xml file 45 | JWNL.initialize(new FileInputStream("[...]")); 46 | dictionary = Dictionary.getInstance(); 47 | 48 | // path to input word embeddings 49 | String file_name = "[...]"; 50 | 51 | // path to output folder 52 | String folder = "[...]"; 53 | 54 | if (file_name.endsWith(".bin")) 55 | Shared.loadGoogleModel(file_name); 56 | else 57 | Shared.loadTxtModel(file_name); 58 | 59 | JWNL.Version ver = JWNL.getVersion(); 60 | System.out.printf("RESOURCE: WN " + ver.toString() + "\n"); 61 | System.out.printf("VECTORS: " + folder + "\n"); 62 | System.out.printf("TARGET: " + folder + "\n"); 63 | 64 | extractWordsAndSynsets(folder + "words.txt", 65 | folder + "synsets.txt", 66 | folder + "lexemes.txt", 67 | folder + "glosses.txt"); 68 | 69 | extractSynsetRelations(folder + "hypernym.txt", PointerType.HYPERNYM); 70 | extractSynsetRelations(folder + "similar.txt", PointerType.SIMILAR_TO); 71 | extractSynsetRelations(folder + "verbGroup.txt", PointerType.VERB_GROUP); 72 | extractSynsetRelations(folder + "antonym.txt", PointerType.ANTONYM); 73 | 74 | System.out.printf("DONE"); 75 | } 76 | 77 | private static void extractWordsAndSynsets(String filenameWords, String 
78 |     {
79 |         // create file
80 |         PrintWriter writerWords, writerSynsets, writerLexemes, writerGlosses;
81 |         try
82 |         {
83 |             writerWords = new PrintWriter(filenameWords, "UTF-8");
84 |             writerSynsets = new PrintWriter(filenameSynsets, "UTF-8");
85 |             writerLexemes = new PrintWriter(filenameLexemes, "UTF-8");
86 |             writerGlosses = new PrintWriter(filenameGlosses, "UTF-8");
87 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
88 |         {
89 |             e.printStackTrace();
90 | 
91 |             return;
92 |         }
93 | 
94 |         int wordCounter = 0;
95 |         int synsetCounter = 0;
96 |         int synsetCounterAll = 0;
97 |         int lexemCounter = 0;
98 |         int lexemCounterAll = 0;
99 | 
100 |         HashSet<String> oov = new HashSet<String>();
101 | 
102 |         for (Object pos : POS.getAllPOS())
103 |         {
104 |             Iterator<Synset> itr = dictionary.getSynsetIterator((POS) pos);
105 |             while (itr.hasNext())
106 |             {
107 |                 Synset synset = itr.next();
108 |                 String synsetId = getId(synset);
109 |                 ++synsetCounterAll;
110 | 
111 |                 SynsetIndex.put(synsetId, synsetCounterAll);
112 | 
113 |                 // export synset
114 |                 writerSynsets.print(synsetId + " ");
115 | 
116 |                 float[] naiveSynsetVector = new float[Shared.size];
117 |                 int wordsInSynset = 0;
118 | 
119 |                 for (Word word : synset.getWords())
120 |                 {
121 |                     ++lexemCounterAll;
122 | 
123 |                     String lemma = word.getLemma();
124 |                     lemma = Shared.normalizeLemma(lemma);
125 | 
126 |                     // if the plain lemma is not in the corpus, retry with a POS-tag suffix
127 |                     if (!Shared.WordMap.containsKey(lemma))
128 |                     {
129 |                         lemma = lemma + "%" + synset.getPOS().getKey();
130 | 
131 |                         // skip words that are not in corpus
132 |                         if (!Shared.WordMap.containsKey(lemma))
133 |                         {
134 |                             oov.add(lemma);
135 |                             continue;
136 |                         }
137 |                     }
138 | 
139 |                     ++wordsInSynset;
140 |                     for (int b = 0; b < Shared.size; b++)
141 |                     {
142 |                         naiveSynsetVector[b] += Shared.WordMap.get(lemma)[b];
143 |                     }
144 | 
145 |                     if (!WordIndex.containsKey(lemma))
146 |                     {
147 |                         writerWords.print(lemma + " " + Shared.getVectorAsString(Shared.WordMap.get(lemma)) + "\n");
148 |                         WordIndex.put(lemma, ++wordCounter);
149 |                     }
150 | 
151 |                     ++lexemCounter;
152 | 
153 |                     String sensekey = synset.getSenseKey(word.getLemma());
154 | 
155 |                     writerSynsets.print(sensekey + ",");
156 |                     writerLexemes.print(WordIndex.get(lemma) + " " + synsetCounterAll + "\n");
157 |                 }
158 |                 writerSynsets.print("\n");
159 | 
160 |                 // get gloss vector and normalize length of it
161 |                 float[] glossVector = getGlossVector(synset);
162 |                 if (wordsInSynset != 0)
163 |                 {
164 |                     float lenNSV = 0, lenGloss = 0;
165 |                     for (int b = 0; b < Shared.size; b++)
166 |                     {
167 |                         naiveSynsetVector[b] /= wordsInSynset;
168 |                         lenNSV += naiveSynsetVector[b] * naiveSynsetVector[b];
169 |                         lenGloss += glossVector[b] * glossVector[b];
170 |                     }
171 |                     lenNSV = (float)Math.sqrt(lenNSV);
172 |                     lenGloss = (float)Math.sqrt(lenGloss);
173 |                     for (int b = 0; b < Shared.size; b++)
174 |                     {
175 |                         glossVector[b] *= (lenNSV / lenGloss); // rescale the gloss vector to the naive synset vector's length
176 |                     }
177 |                 }
178 |                 else
179 |                 {
180 |                     float lenGloss = 0;
181 |                     for (int b = 0; b < Shared.size; b++)
182 |                     {
183 |                         lenGloss += glossVector[b] * glossVector[b];
184 |                     }
185 |                     lenGloss = (float)Math.sqrt(lenGloss);
186 |                     for (int b = 0; b < Shared.size; b++)
187 |                     {
188 |                         glossVector[b] /= lenGloss;
189 |                     }
190 |                 }
191 | 
192 |                 writerGlosses.print(synsetId + " " + Shared.getVectorAsString(glossVector) + "\n");
193 | 
194 |                 if (wordsInSynset != 0)
195 |                     ++synsetCounter;
196 |                 else
197 |                     SynsetIndex.put(synsetId, -1);
198 |             }
199 |         }
200 | 
201 |         writerWords.close();
202 |         writerSynsets.close();
203 |         writerLexemes.close();
204 |         writerGlosses.close();
205 | 
206 |         System.out.printf(" Words: %8d / %8d\n", wordCounter, wordCounter + oov.size());
207 |         System.out.printf(" Synset: %8d / %8d\n", synsetCounter, synsetCounterAll);
208 |         System.out.printf(" Lexemes: %8d / %8d\n", lexemCounter, lexemCounterAll);
209 |     }
210 | 
211 |     private static String getId(Synset synset)
212 |     {
213 |         JWNL.Version ver = JWNL.getVersion();
214 | 
215 |         String id = "wn-" + ver.getNumber() + "-" + String.format("%08d", synset.getOffset()) + "-" + synset.getPOS().getKey(); // e.g. "wn-3.0-00001740-n"
216 | 
217 |         return id;
218 |     }
219 | 
220 |     private static float[] getGlossVector(Synset synset)
221 |     {
222 |         String gloss = Shared.normalizeText(synset.getGloss());
223 | 
224 |         float[] vector = new float[Shared.size];
225 |         for (String word : gloss.split(" "))
226 |         {
227 |             if (Shared.WordMap.containsKey(word))
228 |             {
229 |                 for (int b = 0; b < Shared.size; b++)
230 |                 {
231 |                     vector[b] += Shared.WordMap.get(word)[b];
232 |                 }
233 |             }
234 |         }
235 | 
236 |         return vector;
237 |     }
238 | 
239 |     private static void extractSynsetRelations(String filename, PointerType pointer) throws JWNLException
240 |     {
241 |         HashMap<String, Integer> affectedPOS = new HashMap<String, Integer>();
242 | 
243 |         // create file
244 |         PrintWriter writer;
245 |         try
246 |         {
247 |             writer = new PrintWriter(filename, "UTF-8");
248 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
249 |         {
250 |             e.printStackTrace();
251 | 
252 |             return;
253 |         }
254 | 
255 |         for (Object pos : POS.getAllPOS())
256 |         {
257 |             Iterator<Synset> itr = dictionary.getSynsetIterator((POS) pos);
258 |             while (itr.hasNext())
259 |             {
260 |                 Synset synset = itr.next();
261 |                 String synsetId = getId(synset);
262 | 
263 |                 Pointer[] pointers = synset.getPointers(pointer);
264 |                 for (Pointer p : pointers)
265 |                 {
266 |                     Synset targetSynset = p.getTargetSynset();
267 |                     String targetId = getId(targetSynset);
268 | 
269 |                     String key = targetSynset.getPOS().getLabel();
270 |                     if (affectedPOS.containsKey(key))
271 |                     {
272 |                         affectedPOS.put(key, affectedPOS.get(key) + 1);
273 |                     }
274 |                     else
275 |                     {
276 |                         affectedPOS.put(key, 1);
277 |                     }
278 | 
279 |                     if (SynsetIndex.get(synsetId) < 0 || SynsetIndex.get(targetId) < 0)
280 |                         continue;
281 | 
282 |                     writer.print(SynsetIndex.get(synsetId));
283 |                     writer.print(" ");
284 |                     writer.print(SynsetIndex.get(targetId));
285 |                     writer.print("\n");
286 |                 }
287 |             }
288 |         }
289 | 
290 |         writer.close();
291 | 
292 |         System.out.printf("Extracted %s: done!\n", pointer.getLabel());
293 |         Iterator<Map.Entry<String, Integer>> it = affectedPOS.entrySet().iterator();
294 |         while (it.hasNext())
295 |         {
296 |             Map.Entry<String, Integer> pairs = it.next();
297 |             String key = pairs.getKey();
298 |             int value = pairs.getValue();
299 | 
300 |             System.out.printf(" %s: %d\n", key, value);
301 |         }
302 |     }
303 | }
304 | 
--------------------------------------------------------------------------------
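Each relation file written by extractSynsetRelations above holds one "source target" pair of 1-based synset indices per line; the indices refer to the order in which synsets were written to synsets.txt, and pairs involving a synset without any in-vocabulary word (indexed as -1) are skipped. A minimal, self-contained reader sketch (the class name ReadRelations is illustrative, not part of the project):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReadRelations {
    // parse a relation file such as hypernym.txt into (source, target) index pairs
    public static List<int[]> load(String path) throws IOException {
        List<int[]> pairs = new ArrayList<int[]>();
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.split(" ");
                pairs.add(new int[] { Integer.parseInt(parts[0]),    // source synset index
                                      Integer.parseInt(parts[1]) }); // target synset index
            }
        }
        return pairs;
    }
}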