├── AutoExtend
│   ├── AutoExtend.m
│   ├── columnNormalize.m
│   ├── costFunc.m
│   ├── costFuncColumnNorm.m
│   ├── costFuncLexeme.m
│   ├── getVectors.m
│   ├── gradient.m
│   ├── gradientChecking.m
│   ├── gradientColumnNorm.m
│   ├── gradientLexeme.m
│   ├── learnAffineMapMatrix.m
│   ├── learnLinearMapMatrix.m
│   ├── learnTranslationMatrix.m
│   ├── loadBinaryFile.m
│   ├── loadSynsetFile.m
│   ├── loadTxtFile.m
│   └── writeVectors.m
├── IMS Features
│   ├── CSynsetCosineFeatureExtractor.java
│   ├── CSynsetProductFeatureExtractor.java
│   └── CSynsetRawFeatureExtractor.java
├── LICENSE
└── WordNetExtractor
    ├── Shared.java
    └── WordNetExtractor.java

/AutoExtend/AutoExtend.m:
--------------------------------------------------------------------------------
1 | function [] = AutoExtend(varargin)
2 | 
3 | if (nargin == 8)
4 |     folder = varargin{1};
5 |     normalizeWeights = varargin{2};
6 |     sWeight = varargin{3};
7 |     lWeight = varargin{4};
8 |     rWeight = varargin{5};
9 |     nWeight = varargin{6};
10 |     experiment = varargin{7};
11 |     RelationFiles = varargin{8};
12 | 
13 |     settings = [true false false];
14 |     weights = [sWeight lWeight rWeight nWeight];
15 | 
16 | elseif (nargin == 10)
17 |     folder = varargin{1};
18 |     normalizeWeights = varargin{2};
19 |     sWeight = varargin{3};
20 |     lWeight = varargin{4};
21 |     rWeight = varargin{5};
22 |     startNormalizedED = varargin{6};
23 |     normWhenPossibleED = varargin{7};
24 |     endWhenNotED = varargin{8};
25 |     experiment = varargin{9};
26 |     RelationFiles = varargin{10};
27 | 
28 |     settings = [startNormalizedED normWhenPossibleED endWhenNotED];
29 |     weights = [sWeight lWeight rWeight 0];
30 | 
31 | else
32 |     folder = '[...]';
33 |     normalizeWeights = true;
34 |     sWeight = 0.20;
35 |     lWeight = 0.20;
36 |     rWeight = 0.60;
37 |     startNormalizedED = false;
38 |     normWhenPossibleED = false;
39 |     endWhenNotED = false;
40 |     experiment = 'naive';
41 |     RelationFiles = cell(4,1);
42 |     RelationFiles{1} = 'hypernym.txt';
43 |     RelationFiles{2} = 'verbGroup.txt';
44 |     RelationFiles{3} = 'similar.txt';
45 |     RelationFiles{4} = 'antonym.txt';
46 |     RelationFiles = []; % note: this overrides the list above, so the default run uses no relation files
47 | 
48 |     settings = [startNormalizedED normWhenPossibleED endWhenNotED];
49 |     weights = [sWeight lWeight rWeight 0];
50 | end
51 | 
52 | normalizeVectors = false;
53 | 
54 | if ~exist(strcat(folder, experiment), 'dir')
55 |     fprintf('Folder does not exist. Creating %s\n', strcat(folder, experiment));
56 |     mkdir(strcat(folder, experiment));
57 | end
58 | 
59 | if exist(strcat(folder, experiment, '/iota.txt'), 'file')
60 |     fprintf('Model %s already exists. Skipped\n', experiment);
61 |     return;
62 | end
63 | 
64 | [W , ~] = loadTxtFile(strcat(folder, 'words.txt'));
65 | dim = size(W,2); %dim = 1;
66 | num_iters = 1000; %num_iters = 0;
67 | 
68 | [DictS, DictSID] = loadSynsetFile(folder);
69 | 
70 | countSynsets = length(DictSID);
71 | countWords = size(W,1);
72 | 
73 | save(strcat(folder, experiment, '/settings.mat'), '-regexp', '^[^WD]');
74 | 
75 | if (normalizeVectors == true)
76 |     W = normr(W);
77 | end
78 | 
79 | Table = readtable(strcat(folder, 'lexemes.txt'), 'ReadVariableNames', false, 'Delimiter', ' ');
80 | ThetaMap = table2array(Table(:, 1:2));
81 | Iota = sparse(ThetaMap(:,1),ThetaMap(:,2),ones(size(ThetaMap,1),1),countWords,countSynsets);
82 | Theta = Iota';
83 | 
84 | % create relation matrix - will do a squared error of relation pairs
85 | RelationMap = [];
86 | for i=1:size(RelationFiles, 1)
87 |     Table = readtable(strcat(folder, RelationFiles{i}), 'ReadVariableNames', false, 'Delimiter', ' ');
88 |     if isempty(Table)
89 |         continue;
90 |     end
91 |     RelationMap = [RelationMap ; table2array(Table(:, 1:2))];
92 | end
93 | 
94 | if (~isempty(RelationMap))
95 |     fprintf('Creating Relation Matrix. %d relations found.\n', size(RelationMap,1));
96 |     rFrom = [(1:size(RelationMap,1))'; (1:size(RelationMap,1))']; % size(...,1), not length, so a single relation pair is handled correctly
97 |     rTo = [RelationMap(:,1); RelationMap(:,2)];
98 |     rValue = [ones(size(RelationMap,1),1); (-1 * ones(size(RelationMap,1),1))];
99 |     R = sparse(rFrom,rTo,rValue,size(RelationMap,1),countSynsets);
100 | else
101 |     fprintf('Relation Matrix is empty.\n');
102 |     R = zeros(1, countSynsets);
103 | end
104 | 
105 | if (normalizeWeights == true)
106 |     sWeight = sWeight / countWords;
107 |     lWeight = lWeight / nnz(Theta);
108 |     if isempty(RelationMap), rWeight = 0; else, rWeight = rWeight / size(RelationMap,1); end % avoid division by zero when no relations are loaded
109 |     weights(1:3) = [sWeight lWeight rWeight];
110 |     weights = weights / norm(weights,1);
111 | end
112 | 
113 | trainModel(folder, dim, num_iters, countSynsets, countWords, W, Theta, Iota, R, weights, settings, experiment);
114 | end
115 | 
116 | function [] = trainModel(folder, dim, num_iters, countSynsets, countWords, W, Theta, Iota, R, weights, settings, experiment)
117 | 
118 | if ~exist(strcat(folder, experiment, '/debug'), 'dir')
119 |     mkdir(strcat(folder, experiment, '/debug'));
120 | end
121 | %delete(strcat(folder, experiment, '/debug/dim_finished_*'));
122 | 
123 | J_history = zeros(num_iters,5);
124 | lastNormIter = 0;
125 | 
126 | fprintf('Starting parallel computation on %d dimensions.\n', dim);
127 | %poolobj = parpool('local',30);
128 | ThetaValues = NaN(nnz(Theta),dim);
129 | IotaValues = NaN(nnz(Iota),dim);
130 | 
131 | for d=1:dim %parfor
132 | 
133 |     dimFilename = strcat(folder, experiment, '/debug/ThetaIota_', num2str(d), '.mat');
134 | 
135 |     if exist(dimFilename, 'file')
136 |         [Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim] = loadVariables(dimFilename);
137 |     else
138 |         Theta_dim = NaN;
139 |         Iota_dim = NaN;
140 |         J_history_dim = NaN;
141 |         lastNormIter_dim = NaN;
142 |         saveVariables(dimFilename, Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim);
143 | 
144 |         w = W(:,d);
145 | 
146 |         debugFilename = strcat(folder, experiment, '/debug/dim_', num2str(d), '.txt');
147 |         debugFilenameFinished = strcat(folder, experiment, '/debug/dim_finished_', num2str(d), '.txt');
148 |         debugFile = fopen(debugFilename, 'w');
149 | 
150 |         [Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim] = trainDimension(num_iters, countSynsets, countWords, w, Theta, Iota, R, weights, settings, debugFile);
151 |         saveVariables(dimFilename, Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim);
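% Note on the checkpointing scheme above: the first saveVariables call writes
% placeholder NaNs before training starts, so another worker scanning the debug
% folder sees this dimension as claimed and skips it; the second call overwrites
% the placeholders with the trained per-dimension values, which lets an
% interrupted run resume without retraining finished dimensions.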
152 | 153 | fclose(debugFile); 154 | movefile(debugFilename,debugFilenameFinished); 155 | end 156 | 157 | if (length(Theta_dim) == nnz(Theta) && length(Iota_dim) == nnz(Iota)) 158 | ThetaValues(:,d) = Theta_dim; 159 | IotaValues(:,d) = Iota_dim; 160 | J_history = J_history + (J_history_dim ./ dim); 161 | lastNormIter = lastNormIter + (lastNormIter_dim / dim); 162 | end 163 | end 164 | 165 | fprintf('Parallel computation completed.\n'); 166 | 167 | % looking for missing values 168 | for d=1:dim 169 | if (any(isnan(ThetaValues(:,d))) || any(isnan(IotaValues(:,d)))) 170 | dimFilename = strcat(folder, experiment, '/debug/ThetaIota_', num2str(d), '.mat'); 171 | 172 | if exist(dimFilename, 'file') 173 | [Theta_dim, Iota_dim, J_history_dim, lastNormIter_dim] = loadVariables(dimFilename); 174 | end 175 | 176 | if (length(Theta_dim) == nnz(Theta) && length(Iota_dim) == nnz(Iota)) 177 | ThetaValues(:,d) = Theta_dim; 178 | IotaValues(:,d) = Iota_dim; 179 | J_history = J_history + (J_history_dim ./ dim); 180 | lastNormIter = lastNormIter + (lastNormIter_dim / dim); 181 | end 182 | end 183 | end 184 | 185 | % if still not all values available 186 | if (any(isnan(ThetaValues(:))) || any(isnan(IotaValues(:)))) 187 | fprintf('Not all values available (process not master). Process ended.\n'); 188 | return; 189 | end 190 | 191 | fprintf('Saving values ...'); 192 | 193 | %load(strcat(folder, experiment, '/debug/ThetaIota.mat')); 194 | save(strcat(folder, experiment, '/debug/ThetaIota.mat'),'ThetaValues', 'IotaValues'); 195 | delete(strcat(folder, experiment, '/debug/ThetaIota_*')); 196 | 197 | % print convergence matrix 198 | fName = strcat(folder, experiment, '/convergence.mat'); 199 | save(fName,'J_history','lastNormIter'); 200 | 201 | % print theta matrix 202 | [synset, word, ~] = find(Theta); 203 | mat1 = [word synset ThetaValues]; 204 | fName = strcat(folder, experiment, '/theta.txt'); 205 | dlmwrite(fName,mat1,'delimiter',' ','newline','pc','precision',6); 206 | 207 | % print iota matrix 208 | [word, synset, ~] = find(Iota); 209 | mat2 = [word synset IotaValues]; 210 | fName = strcat(folder, experiment, '/iota.txt'); 211 | dlmwrite(fName,mat2,'delimiter',' ','newline','pc','precision',6); 212 | 213 | % Plot the convergence graph 214 | for i=2:num_iters 215 | if (J_history(i,:) == J_history(i-1,:)) 216 | num_iters = i; 217 | break; 218 | end 219 | end 220 | h=figure('Visible','off'); 221 | hax = axes; 222 | hold on; 223 | plot(1:num_iters, (J_history(1:num_iters,1) / max(J_history(:,1))), '-', 'Color', [0 0.8 1], 'LineWidth', 2); 224 | plot(1:num_iters, (J_history(1:num_iters,2) / max(J_history(:,2))), '-', 'Color', [1 0.4 0], 'LineWidth', 2); 225 | plot(1:num_iters, (J_history(1:num_iters,3) / max(J_history(:,3))), '-', 'Color', [0 0.5 0], 'LineWidth', 2); 226 | plot(1:num_iters, (J_history(1:num_iters,4) / max(J_history(:,4))), '-', 'Color', [0.7 0 0.7], 'LineWidth', 2); 227 | plot(1:num_iters, (J_history(1:num_iters,5) / max(J_history(:,5))), '-', 'Color', [0.3 0.3 0.3]); 228 | line([lastNormIter lastNormIter],get(hax,'YLim'),'Color',[0.7 0 0.7]); 229 | legend('autoencoder','lexeme', 'relations', 'norm', 'learning rate'); 230 | xlabel('iteration'); 231 | ylabel('average error'); 232 | fName = strcat(folder, experiment, '/convergence.jpg'); 233 | saveas(h,fName); % here you save the figure 234 | close(h); 235 | 236 | fprintf(' done!\n'); 237 | 238 | end 239 | 240 | function [var1, var2, var3, var4] = loadVariables(filename) 241 | load(filename); 242 | end 243 | 244 | function saveVariables(filename, 
var1, var2, var3, var4) 245 | save(filename,'var1', 'var2', 'var3', 'var4'); 246 | end 247 | 248 | function [EValues, DValues, J_history, lastNormIter] = trainDimension(num_iters, ~, countWords, w, E, D, R, weights, settings, debugFile) 249 | 250 | learningRate = 0.00005; 251 | fprintf(debugFile, 'Starting computation with learning rate %f\n',learningRate); 252 | 253 | J_history = zeros(num_iters,5); 254 | 255 | if (settings(1) == true) 256 | % normalize matrizes 257 | fprintf(debugFile, 'Normalizing matrices at start.\n'); 258 | E = columnNormalize(E); 259 | D = columnNormalize(D); 260 | end 261 | 262 | lastNormIter = 0; 263 | iter = 1; 264 | while iter <= num_iters 265 | 266 | fprintf(debugFile, 'Iteration %d/%d\n', iter, num_iters); 267 | 268 | grad_E = sparse(size(E,1),size(E,2)); 269 | grad_D = sparse(size(D,1),size(D,2)); 270 | J1 = 0; 271 | J2 = 0; 272 | J3 = 0; 273 | J4 = 0; 274 | 275 | % update with respect to autoencoder 276 | if (weights(1) > 0) 277 | [J1, grads_E, grads_D] = gradient(w, E, D, w, 'both'); 278 | %gradientChecking(w, E, D, R, grads_E, grads_D, iter, weights, 'J1', 0.00001); 279 | grad_E = grad_E + (weights(1) * grads_E); 280 | grad_D = grad_D + (weights(1) * grads_D); 281 | end 282 | 283 | % update with respect to lexeme 284 | if (weights(2) > 0) 285 | [J2, gradl_E, gradl_D] = gradientLexeme(w, E, D); 286 | %gradientChecking(w, E, D, R, gradl_E, gradl_D, iter, weights, 'J2', 0.00001); 287 | grad_E = grad_E + (weights(2) * gradl_E); 288 | grad_D = grad_D + (weights(2) * gradl_D); 289 | end 290 | 291 | 292 | % update with respect to relations 293 | if (weights(3) > 0) 294 | [J3, gradr_E, ~] = gradient(w, E, R, zeros(size(R,1),1) , 'onlyE'); 295 | %gradientChecking(w, E, D, R, gradr_E, grad_D, iter, weights, 'J3', 0.00001); 296 | grad_E = grad_E + (weights(3) * gradr_E); 297 | end 298 | 299 | 300 | % update with respect to column norm 301 | if (weights(4) > 0) 302 | [J4_E, gradn_E] = gradientColumnNorm(E); 303 | [J4_D, gradn_D] = gradientColumnNorm(D); 304 | J4 = J4_E + J4_D; 305 | %gradientChecking(w, E, D, R, gradn_E, gradn_D, iter, weights, 'J4', 0.00001); 306 | grad_E = grad_E + (weights(4) * gradn_E); 307 | grad_D = grad_D + (weights(4) * gradn_D); 308 | end 309 | 310 | J = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 311 | J_history(iter,1) = J1 / countWords; 312 | J_history(iter,2) = J2 / nnz(E); 313 | J_history(iter,3) = J3 / size(R, 1); 314 | J_history(iter,4) = J4 / (size(E,2) + size(D,2)); 315 | J_history(iter,5) = learningRate; 316 | 317 | fprintf(debugFile, 'Error J: %8.3f\n', J); 318 | fprintf(debugFile, 'Error J1: %5.4f %8.3f %5.4f\n', J1 * weights(1)/J, J1 * weights(1), J_history(iter,1)); 319 | fprintf(debugFile, 'Error J2: %5.4f %8.3f %5.4f\n', J2 * weights(2)/J, J2 * weights(2), J_history(iter,2)); 320 | fprintf(debugFile, 'Error J3: %5.4f %8.3f %5.4f\n', J3 * weights(3)/J, J3 * weights(3), J_history(iter,3)); 321 | fprintf(debugFile, 'Error J4: %5.4f %8.3f %5.4f\n', J4 * weights(4)/J, J4 * weights(4), J_history(iter,4)); 322 | 323 | E_new = E - (learningRate * grad_E); 324 | E_new = keepSparsity(E, E_new); 325 | 326 | D_new = D - (learningRate * grad_D); 327 | D_new = keepSparsity(D, D_new); 328 | 329 | % get new cost 330 | if (weights(1) > 0) 331 | J1 = costFunc(w, E_new, D_new, w); 332 | end 333 | if (weights(2) > 0) 334 | J2 = costFuncLexeme(w, E_new, D_new); 335 | end 336 | if (weights(3) > 0) 337 | J3 = costFunc(w, E_new, R, zeros(size(R,1),1)); 338 | end 339 | if (weights(4) > 0) 340 | J4 = 
costFuncColumnNorm(E_new) + costFuncColumnNorm(D_new); 341 | end 342 | J_new = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 343 | 344 | fprintf(debugFile, 'New Error J: %8.3f\n', J_new); 345 | fprintf(debugFile, 'New Error J1: %5.4f %8.3f\n', J1 * weights(1)/J_new, J1 * weights(1)); 346 | fprintf(debugFile, 'New Error J2: %5.4f %8.3f\n', J2 * weights(2)/J_new, J2 * weights(2)); 347 | fprintf(debugFile, 'New Error J3: %5.4f %8.3f\n', J3 * weights(3)/J_new, J3 * weights(3)); 348 | fprintf(debugFile, 'New Error J4: %5.4f %8.3f\n', J4 * weights(4)/J_new, J4 * weights(4)); 349 | 350 | % check if error increased 351 | if J_new > J 352 | 353 | fprintf(debugFile, 'Error increased\n'); 354 | 355 | % reduce learning rate 356 | learningRate = learningRate / 3; 357 | fprintf(debugFile, 'New Learning Rate: %f\n', learningRate); 358 | 359 | if learningRate < 0.000001 360 | 361 | fprintf(debugFile, 'Learning Rate to small. Calculation stopped\n\n'); 362 | 363 | for i=iter+1:num_iters 364 | J_history(i,:) = J_history(iter,:); 365 | end 366 | 367 | break; 368 | end 369 | 370 | else 371 | 372 | fprintf(debugFile, 'Error decreased\n'); 373 | 374 | % increasing learning rate 375 | learningRate = learningRate * 1.1; 376 | fprintf(debugFile, 'New Learning Rate: %f\n', learningRate); 377 | 378 | % update matrix 379 | E = E_new; 380 | D = D_new; 381 | 382 | if (settings(2) == true) 383 | 384 | fprintf(debugFile, 'Trying to normalize matrices\n'); 385 | 386 | % normalize matrizes 387 | E_new = columnNormalize(E_new); 388 | E_new = keepSparsity(E, E_new); 389 | D_new = columnNormalize(D_new); 390 | D_new = keepSparsity(D, D_new); 391 | 392 | % get new cost 393 | if (weights(1) > 0) 394 | J1 = costFunc(w, E_new, D_new, w); 395 | end 396 | if (weights(2) > 0) 397 | J2 = costFuncLexeme(w, E_new, D_new); 398 | end 399 | if (weights(3) > 0) 400 | J3 = costFunc(w, E_new, R, zeros(size(R,1),1)); 401 | end 402 | if (weights(4) > 0) 403 | J4 = costFuncColumnNorm(E_new) + costFuncColumnNorm(D_new); 404 | end 405 | J_norm = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 406 | 407 | fprintf(debugFile, 'Norm Error J: %f\n', J_norm); 408 | fprintf(debugFile, 'Norm Error J1: %5.4f %8.3f\n', J1 * weights(1)/J_norm, J1 * weights(1)); 409 | fprintf(debugFile, 'Norm Error J2: %5.4f %8.3f\n', J2 * weights(2)/J_norm, J2 * weights(2)); 410 | fprintf(debugFile, 'Norm Error J3: %5.4f %8.3f\n', J3 * weights(3)/J_norm, J3 * weights(3)); 411 | fprintf(debugFile, 'Norm Error J4: %5.4f %8.3f\n', J4 * weights(4)/J_norm, J4 * weights(4)); 412 | 413 | if J_norm < J 414 | 415 | fprintf(debugFile, 'Error decreased. Matrix normalized\n'); 416 | 417 | % update matrix 418 | E = E_new; 419 | D = D_new; 420 | 421 | lastNormIter = iter; 422 | else 423 | fprintf(debugFile, 'Error increased. Matrix not normalized\n'); 424 | 425 | % if only as long as normalization possible 426 | if settings(3) == true 427 | 428 | fprintf(debugFile, 'Normalization not possible. Calculation stopped\n\n'); 429 | 430 | for i=iter+1:num_iters 431 | J_history(i,:) = J_history(iter,:); 432 | end 433 | 434 | break; 435 | end 436 | end 437 | 438 | end 439 | 440 | fprintf(debugFile, 'Iteration finished.\n\n'); 441 | 442 | % do next iteration 443 | iter = iter + 1; 444 | 445 | end 446 | end 447 | 448 | fprintf(debugFile, 'Calculation finished. 
Learned values will be returned'); 449 | 450 | EValues = nonzeros(E); 451 | DValues = nonzeros(D); 452 | 453 | end 454 | 455 | function [E_new] = keepSparsity(E, E_new) 456 | 457 | while (nnz(E) ~= nnz(E_new)) 458 | [r1,c1,~] = find(E); 459 | [r2,c2,~] = find(E_new); 460 | for l=1:length(r1) 461 | if (r1(l) ~= r2(l) || c1(l) ~= c2(l)) 462 | E_new(r1(l),c1(l)) = eps; 463 | break; 464 | end 465 | end 466 | end 467 | end -------------------------------------------------------------------------------- /AutoExtend/columnNormalize.m: -------------------------------------------------------------------------------- 1 | function [ A ] = columnNormalize( A ) 2 | 3 | if issparse(A) 4 | A = columnNormalizeSparse( A ); 5 | else 6 | A = columnNormalizeFull( A ); 7 | end 8 | 9 | end 10 | 11 | function [ A ] = columnNormalizeSparse( A ) 12 | 13 | 14 | [i,j,values] = find(A); 15 | 16 | colSum = (ones(1, size(A,1)) * A)'; 17 | values = values ./ colSum(j); 18 | 19 | A = sparse(i,j,values,size(A,1),size(A,2)); 20 | 21 | end 22 | 23 | function [ A ] = columnNormalizeFull( A ) 24 | 25 | colSum = sum(A,1); 26 | 27 | for i=1:size(A,2) 28 | 29 | A(:,i) = A(:,i) ./ colSum(i); 30 | 31 | end 32 | 33 | end 34 | 35 | -------------------------------------------------------------------------------- /AutoExtend/costFunc.m: -------------------------------------------------------------------------------- 1 | function [J] = costFunc(x, E, D, x_expected) 2 | 3 | x_predict = (D * (E * x)); 4 | x_diff = x_predict - x_expected; 5 | J = sum(x_diff.^2); 6 | 7 | end -------------------------------------------------------------------------------- /AutoExtend/costFuncColumnNorm.m: -------------------------------------------------------------------------------- 1 | function [J] = costFuncColumnNorm(A) 2 | 3 | [~,j,values] = find(A); 4 | error = NaN(size(A,2),1); 5 | 6 | for l=1:length(j) 7 | if isnan(error(j(l))) 8 | error(j(l)) = 1 - values(l); 9 | else 10 | error(j(l)) = error(j(l)) - values(l); 11 | end 12 | end 13 | 14 | error(isnan(error)) = 0; 15 | J = sum(error.^2); 16 | 17 | end 18 | 19 | -------------------------------------------------------------------------------- /AutoExtend/costFuncLexeme.m: -------------------------------------------------------------------------------- 1 | function [J] = costFuncLexeme(w, E, D) 2 | 3 | if (nnz(E) ~= nnz(D)) 4 | msgID = 'MY:BadLengthED'; 5 | msg = 'Sparsity of encode and decode not matching.'; 6 | baseException = MException(msgID,msg); 7 | throw(baseException); 8 | end 9 | 10 | [synset,word,value] = find(E); 11 | %L1 = sortrows([word synset value],[1 2]); 12 | L1 = [word synset value]; 13 | lexeme1 = L1(:,3) .* w(L1(:,1)); 14 | 15 | s = E * w; 16 | %[word,synset,value] = find(D); 17 | %L2 = sortrows([word synset value],[1 2]); 18 | [synset,word,value] = find(D'); 19 | L2 = [word synset value]; 20 | lexeme2 = L2(:,3) .* s(L2(:,2)); 21 | 22 | diff = lexeme1 - lexeme2; 23 | J = sum(diff.^2); 24 | 25 | end -------------------------------------------------------------------------------- /AutoExtend/getVectors.m: -------------------------------------------------------------------------------- 1 | function [ X , y ] = getVectors( dictX, A, dictA ) 2 | 3 | X = zeros(size(dictX,1),size(A,2)); 4 | y = zeros(size(dictX,1),1); 5 | 6 | for i=1:size(dictX,1) 7 | ind = strcmp(dictX{i}, dictA); 8 | if (any(ind)) 9 | y(i) = find(ind,1); 10 | X(i,:) = A(y(i),:); 11 | end 12 | end 13 | 14 | end 15 | 16 | -------------------------------------------------------------------------------- 
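A minimal sketch (not a file from this repository) of how the pieces above fit together: costFunc measures the squared round-trip error ||D(Ex) - x||^2 for one embedding dimension x, and gradient.m returns its derivatives restricted to the sparsity patterns of E and D. The toy sizes and the encode/decode matrices below are invented for illustration.

% toy setup: 4 words, 3 synsets, one embedding dimension
nW = 4; nS = 3;
x = randn(nW,1);                          % one dimension of all word vectors
E = sparse([1 1 2 3 3], [1 2 2 3 4], rand(5,1), nS, nW);
E = columnNormalize(E);                   % encode: words -> synsets
D = columnNormalize(E');                  % decode: synsets -> words, transposed pattern
J = costFunc(x, E, D, x);                 % squared reconstruction error
[J, grad_E, ~] = gradient(x, E, D, x, 'both');
% finite-difference check of one nonzero entry of E, as gradientChecking.m does
[i, j, v] = find(E); epsilon = 1e-6;
Ep = E; Ep(i(1), j(1)) = v(1) + epsilon;
Em = E; Em(i(1), j(1)) = v(1) - epsilon;
numGrad = (costFunc(x, Ep, D, x) - costFunc(x, Em, D, x)) / (2 * epsilon);
fprintf('analytic %g vs numeric %g\n', full(grad_E(i(1), j(1))), numGrad);
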
/AutoExtend/gradient.m: -------------------------------------------------------------------------------- 1 | function [J, grad_E, grad_D] = gradient(x, E, D, x_expected, mode) 2 | 3 | % precalculations 4 | x_predict = (D * (E * x)); 5 | x_diff = x_predict - x_expected; 6 | d = (D' * x_diff); 7 | e = E * x; 8 | 9 | % calculate error 10 | J = sum(x_diff.^2); 11 | 12 | if ~strcmp(mode,'onlyD') 13 | 14 | % calculate derivate for E 15 | [row,column,~] = find(E); 16 | %for l=1:size(row) 17 | % i = row(l); 18 | % j = column(l); 19 | % grad_values(l) = 2 * d(i) * x(j); %((x_predict - x_expected)' * D(:,i)) * x(j); but we use precalculations 20 | %end 21 | grad_values = 2 * d(row) .* x(column); 22 | grad_E = sparse(row,column,grad_values,size(E,1),size(E,2)); 23 | else 24 | grad_E = NaN; 25 | end 26 | 27 | if ~strcmp(mode,'onlyE') 28 | 29 | % calculate derivate for D 30 | [row,column,~] = find(D); 31 | %for l=1:size(row) 32 | % i = row(l); 33 | % j = column(l); 34 | % grad_values(l) = 2 * x_diff(i) * e(j); %(x_predict(i) - x_expected(i)) * (E(j,:) * x); but we use precalculations 35 | %end 36 | grad_values = 2 * x_diff(row) .* e(column); 37 | grad_D = sparse(row,column,grad_values,size(D,1),size(D,2)); 38 | else 39 | grad_D = NaN; 40 | end 41 | 42 | 43 | 44 | end 45 | -------------------------------------------------------------------------------- /AutoExtend/gradientChecking.m: -------------------------------------------------------------------------------- 1 | function [] = gradientChecking(w, E, D, R, grad_E, grad_D, iter, weights, mode, epsilon) 2 | 3 | fprintf('Gradient checking in iteration: %3d\n', iter); 4 | 5 | E_epsilon = E; 6 | [row,column,value] = find(E); 7 | grad = zeros(10,2); 8 | e = 1; 9 | for l=randi(length(row),1,10) 10 | E_epsilon(row(l), column(l)) = value(l) + epsilon; 11 | J_1 = getCost(w, E_epsilon, D, R, weights, mode); 12 | 13 | 14 | E_epsilon(row(l), column(l)) = value(l) - epsilon; 15 | J_2 = getCost(w, E_epsilon, D, R, weights, mode); 16 | 17 | grad(e,1) = (J_1 - J_2) / (2 * epsilon); % num 18 | grad(e,2) = grad_E(row(l),column(l)); % analis 19 | 20 | E_epsilon(row(l), column(l)) = value(l); 21 | 22 | e = e + 1; 23 | end 24 | fprintf('Difference in E: %g\n', norm(grad(:,1)-grad(:,2))/norm(grad(:,1)+grad(:,2))); 25 | 26 | D_epsilon = D; 27 | [row,column,value] = find(D); 28 | grad = zeros(10,2); 29 | e = 1; 30 | for l=randi(length(row),1,10) 31 | D_epsilon(row(l), column(l)) = value(l) + epsilon; 32 | J_1 = getCost(w, E, D_epsilon, R, weights, mode); 33 | 34 | D_epsilon(row(l), column(l)) = value(l) - epsilon; 35 | J_2 = getCost(w, E, D_epsilon, R, weights, mode); 36 | 37 | grad(e,1) = (J_1 - J_2) / (2 * epsilon); % num 38 | grad(e,2) = grad_D(row(l),column(l)); % analis 39 | 40 | D_epsilon(row(l), column(l)) = value(l); 41 | 42 | e = e + 1; 43 | end 44 | fprintf('Difference in D: %g\n', norm(grad(:,1)-grad(:,2))/norm(grad(:,1)+grad(:,2))); 45 | 46 | end 47 | 48 | function J = getCost(w, E, D, R, weights, mode) 49 | 50 | if strcmp(mode, 'J1') 51 | J = costFunc(w, E, D, w); 52 | elseif strcmp(mode, 'J2') 53 | J = costFuncLexeme(w, E, D); 54 | elseif strcmp(mode, 'J3') 55 | J = costFunc(w, E, R, zeros(size(R,1),1)); 56 | elseif strcmp(mode, 'J4') 57 | J = costFuncColumnNorm(E) + costFuncColumnNorm(D); 58 | elseif strcmp(mode, 'R1') 59 | J = costFuncR1(w, E); 60 | elseif strcmp(mode, 'R2') 61 | J = costFuncR2(w, E, D, R); 62 | else 63 | J1 = costFunc(w, E, D, w); 64 | J2 = costFuncLexeme(w, E, D); 65 | J3 = costFunc(w, E, R, zeros(size(R,1),1)); 66 | J4 = 
costFuncColumnNorm(E) + costFuncColumnNorm(D); 67 | J = (J1 * weights(1)) + (J2 * weights(2)) + (J3 * weights(3)) + (J4 * weights(4)); 68 | end 69 | 70 | end 71 | 72 | -------------------------------------------------------------------------------- /AutoExtend/gradientColumnNorm.m: -------------------------------------------------------------------------------- 1 | function [J, grad_A] = gradientColumnNorm(A) 2 | 3 | [i,j,values] = find(A); 4 | error = NaN(size(A,2),1); 5 | 6 | for l=1:length(j) 7 | if isnan(error(j(l))) 8 | error(j(l)) = 1 - values(l); 9 | else 10 | error(j(l)) = error(j(l)) - values(l); 11 | end 12 | end 13 | 14 | error(isnan(error)) = 0; 15 | J = sum(error.^2); 16 | 17 | for l=1:length(j) 18 | values(l) = -2 * error(j(l)); 19 | end 20 | 21 | grad_A = sparse(i,j,values,size(A,1),size(A,2)); 22 | 23 | end 24 | 25 | -------------------------------------------------------------------------------- /AutoExtend/gradientLexeme.m: -------------------------------------------------------------------------------- 1 | function [J, grad_E, grad_D] = gradientLexeme(w, E, D) 2 | 3 | [synset,word,value] = find(E); 4 | %L1 = sortrows([word synset value],[1 2]); 5 | L1 = [word synset value]; 6 | lexeme1 = L1(:,3) .* w(L1(:,1)); 7 | 8 | s = E * w; 9 | %[word,synset,value] = find(D); 10 | %L2 = sortrows([word synset value],[1 2]); 11 | [synset,word,value] = find(D'); 12 | L2 = [word synset value]; 13 | lexeme2 = L2(:,3) .* s(L2(:,2)); 14 | 15 | diff = lexeme1 - lexeme2; 16 | J = sum(diff.^2); 17 | 18 | %new 19 | %d = (D' * x_diff); 20 | %grad1 = 2 * w(L1(:,1)) .* d(row); 21 | %old 22 | grad1 = 2 * w(L1(:,1)) .* diff; 23 | %end 24 | grad2 = -2 * s(L2(:,2)) .* diff; 25 | 26 | grad_E = sparse(L1(:,2),L1(:,1),grad1,size(E,1),size(E,2)); 27 | grad_D = sparse(L2(:,1),L2(:,2),grad2,size(D,1),size(D,2)); 28 | end 29 | -------------------------------------------------------------------------------- /AutoExtend/learnAffineMapMatrix.m: -------------------------------------------------------------------------------- 1 | function [T] = learnAffineMapMatrix(X, Y) 2 | 3 | % add biased term 4 | X = [X ones(size(X,1), 1)]; 5 | 6 | % learn affine map matrix 7 | T = (X' * X) \ (X' * Y); -------------------------------------------------------------------------------- /AutoExtend/learnLinearMapMatrix.m: -------------------------------------------------------------------------------- 1 | function [T] = learnLinearMapMatrix(X, Y) 2 | 3 | % learn linear map matrix 4 | T = (X' * X) \ (X' * Y); 5 | 6 | % create transformation matrix 7 | T = [T ; zeros(1, size(T,2))]; -------------------------------------------------------------------------------- /AutoExtend/learnTranslationMatrix.m: -------------------------------------------------------------------------------- 1 | function [T] = learnTranslationMatrix(X, Y) 2 | 3 | % learn translation vector 4 | t = mean(Y - X,1); 5 | 6 | % create transformation matrix 7 | T = eye(size(X,2), size(Y,2)); 8 | T = [T ; t]; -------------------------------------------------------------------------------- /AutoExtend/loadBinaryFile.m: -------------------------------------------------------------------------------- 1 | function [A, dictA ] = loadBinaryFile( varargin ) 2 | 3 | if (nargin == 1) 4 | filename = varargin{1}; 5 | max = -1; 6 | fprintf('Reading word vectors ... '); 7 | elseif (nargin == 2) 8 | filename = varargin{1}; 9 | max = varargin{2}; 10 | fprintf('Reading word vectors (up to %d) ... 
', max);
11 | else
12 |     fprintf('Reading word vectors - Error in number of arguments');
13 |     return;
14 | end
15 | 
16 | fid = fopen(filename);
17 | 
18 | stringbuffer = blanks(300);
19 | 
20 | for j=1:300;
21 |     c = fread(fid,1,'uchar');
22 | 
23 |     if c == 10 || c == 32
24 |         break;
25 |     end
26 | 
27 |     stringbuffer(j) = c;
28 | end
29 | words = str2double(stringbuffer(1:j-1));
30 | 
31 | for j=1:300;
32 |     c = fread(fid,1,'uchar');
33 | 
34 |     if c == 10 || c == 32
35 |         break;
36 |     end
37 | 
38 |     stringbuffer(j) = c;
39 | end
40 | dim = str2double(stringbuffer(1:j-1));
41 | 
42 | if (max > 0)
43 |     words = max;
44 | end
45 | dictA = cell(words, 1);
46 | A = zeros(words,dim);
47 | 
48 | for i=1:words;
49 | 
50 |     for j=1:300;
51 |         c = fread(fid,1,'uchar');
52 | 
53 |         if c == 10
54 |             c = fread(fid,1,'uchar');
55 |         end
56 | 
57 |         if c == 32
58 |             break;
59 |         end
60 | 
61 |         stringbuffer(j) = c;
62 |     end
63 |     dictA{i} = stringbuffer(1:j-1);
64 | 
65 |     A(i,:) = fread(fid,dim,'single');
66 | 
67 | end
68 | fclose(fid);
69 | fprintf('done!\n');
--------------------------------------------------------------------------------
/AutoExtend/loadSynsetFile.m:
--------------------------------------------------------------------------------
1 | function [dictS, dictSID] = loadSynsetFile(folder)
2 | 
3 | fileID = fopen(strcat(folder, 'synsets.txt'));
4 | Table = textscan(fileID, '%s\t%s\n', 'CollectOutput',1);
5 | dictSID = Table{1,1}(:, 1);
6 | dictS = Table{1,1}(:, 2);
7 | fclose(fileID);
8 | end
--------------------------------------------------------------------------------
/AutoExtend/loadTxtFile.m:
--------------------------------------------------------------------------------
1 | function [A, dictA, dictPOS] = loadTxtFile( filename )
2 | 
3 | fprintf('Reading word vectors ... ');
4 | 
5 | fileID = fopen(filename);
6 | line = fgetl(fileID);
7 | dim = length(strfind(line,' '));
8 | 
9 | frewind(fileID);
10 | 
11 | textformat = ['%s', repmat(' %f',1,dim)];
12 | Table = textscan(fileID,textformat);
13 | dictA = Table{1,1}(:, 1);
14 | A = zeros(length(dictA),dim);
15 | for d=1:dim
16 |     A(:,d) = Table{1, d+1}; % textscan returns a cell array, so index it directly (table2array would fail here)
17 | end
18 | 
19 | fclose(fileID);
20 | 
21 | if nargout > 2
22 | 
23 |     [dictA, dictPOS] = strtok(dictA, '%');
24 |     dictPOS = strrep(dictPOS, '%', '');
25 | 
26 | else
27 | 
28 |     dictA = strrep(dictA, '%n', '');
29 |     dictA = strrep(dictA, '%v', '');
30 |     dictA = strrep(dictA, '%a', '');
31 |     dictA = strrep(dictA, '%r', '');
32 |     dictA = strrep(dictA, '%u', '');
33 | 
34 | end
35 | 
36 | fprintf('done!\n');
37 | 
38 | end
--------------------------------------------------------------------------------
/AutoExtend/writeVectors.m:
--------------------------------------------------------------------------------
1 | function [] = writeVectors(varargin)
2 | 
3 | folder = varargin{1};
4 | experiment = varargin{2};
5 | 
6 | writeWords = true;
7 | writeSynsets = true;
8 | writeLexemes = false;
9 | 
10 | if (nargin == 5)
11 |     writeWords = varargin{3};
12 |     writeSynsets = varargin{4};
13 |     writeLexemes = varargin{5};
14 | end
15 | 
16 | file = strcat(folder, experiment, '/outputVectors.txt');
17 | 
18 | [W , dictW] = loadTxtFile(strcat(folder, 'words.txt'));
19 | [dictS, dictSID] = loadSynsetFile(folder);
20 | 
21 | Theta = importdata(strcat(folder, experiment, '/theta.txt'), ' ');
22 | fprintf('Calculating synset vectors ... 
'); 23 | S = zeros(size(dictS, 1), size(W,2)); 24 | for l=1:size(Theta, 1) 25 | w = Theta(l,1); 26 | s = Theta(l,2); 27 | theta = Theta(l, 3:end); 28 | S(s,:) = S(s,:) + (W(w,:) .* theta); 29 | end 30 | fprintf('done!\n'); 31 | 32 | outputSize = 0; 33 | if (writeWords == true) 34 | outputSize = outputSize + size(dictW, 1); 35 | end 36 | if (writeLexemes == true) 37 | outputSize = outputSize + size(Theta, 1); 38 | end 39 | if (writeSynsets == true) 40 | outputSize = outputSize + size(dictS, 1); 41 | end 42 | 43 | fid = fopen(file, 'w'); 44 | fprintf(fid, '%d %d\n',outputSize, size(W,2)); 45 | fclose(fid); 46 | 47 | if (writeWords == true) 48 | fprintf('Writing word vectors ... '); 49 | writeToFile(file, 'a', W, dictW); 50 | fprintf('done!\n'); 51 | end 52 | 53 | if (writeSynsets == true) 54 | 55 | fprintf('Writing synset vectors ... '); 56 | writeToFile(file, 'a', S, dictS); 57 | fprintf('done!\n'); 58 | end 59 | 60 | if (writeLexemes == true) 61 | 62 | Iota = importdata(strcat(folder, experiment, '/iota.txt'), ' '); 63 | Theta = sortrows(Theta, [1 2]); 64 | Iota = sortrows(Iota, [1 2]); 65 | 66 | if (sum(sum(Theta(:,1:2)-Iota(:,1:2))) ~= 0) 67 | fprintf('Iota and Theta file do not match. Lexemes vector might be screwed.\n'); 68 | end 69 | 70 | fprintf('Calculating lexeme vectors ... '); 71 | L = zeros(size(Theta, 1), size(W,2)); 72 | dictL = cell(size(Theta, 1), 1); 73 | for l=1:size(Theta, 1) 74 | w = Theta(l,1); 75 | s = Theta(l,2); 76 | theta = Theta(l, 3:end); 77 | iota = Iota(l, 3:end); 78 | L(l,:) = ((W(w,:) .* theta) + (S(s,:) .* iota)) / 2; 79 | dictL{l} = strcat(dictW{w}, '-', dictSID{s}); 80 | end 81 | fprintf('done!\n'); 82 | 83 | fprintf('Writing lexeme vectors ... '); 84 | writeToFile(file, 'a', L, dictL); 85 | fprintf('done!\n'); 86 | end 87 | 88 | end 89 | 90 | function [] = writeToFile(file, mode, A, dictA) 91 | 92 | fid = fopen(file, mode); 93 | 94 | for i=1:size(dictA,1) 95 | fprintf(fid, '%s', dictA{i}); 96 | fprintf(fid,' %f',A(i,:)); 97 | fprintf(fid,'\n'); 98 | end 99 | 100 | fclose(fid); 101 | 102 | end 103 | -------------------------------------------------------------------------------- /IMS Features/CSynsetCosineFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IMS (It Makes Sense) -- NUS WSD System 3 | * Copyright (c) 2010 National University of Singapore. 4 | * All Rights Reserved. 5 | */ 6 | package sg.edu.nus.comp.nlp.ims.feature; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.HashSet; 14 | 15 | import sg.edu.nus.comp.nlp.ims.corpus.AItem; 16 | import sg.edu.nus.comp.nlp.ims.corpus.ICorpus; 17 | import sg.edu.nus.comp.nlp.ims.corpus.ISentence; 18 | import sg.edu.nus.comp.nlp.ims.util.CSurroundingWordFilter; 19 | 20 | /** 21 | * Synset Cosine feature extractor. 
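For each candidate synset of the target lemma, it emits one real-valued feature: the cosine similarity between the synset vector and the sum of the (filtered) surrounding-word vectors.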
22 | */ 23 | public class CSynsetCosineFeatureExtractor implements IFeatureExtractor { 24 | 25 | // the Synsets and corresponding vectors 26 | protected ArrayList m_Synsets = new ArrayList(); 27 | protected ArrayList m_SynsetVectors = new ArrayList(); 28 | 29 | // corpus to be extracted 30 | protected ICorpus m_Corpus = null; 31 | 32 | // index of current instance 33 | protected int m_Index = -1; 34 | 35 | // current sentence to process 36 | protected ISentence m_Sentence = null; 37 | 38 | // item index in current sentence 39 | protected int m_IndexInSentence; 40 | 41 | // item length 42 | protected int m_InstanceLength; 43 | 44 | // index of Synset feature 45 | protected int m_SynsetIndex = -1; 46 | 47 | // sentence before current sentence 48 | protected int m_Left; 49 | 50 | // sentence after current sentence 51 | protected int m_Right; 52 | 53 | // surrounding words of current instance 54 | protected HashSet m_SurroundingWordSet = new HashSet(); 55 | 56 | // vector of surroundings word of current instance 57 | protected float[] m_SurroundingWordVector; 58 | 59 | // current lemma to process 60 | protected String m_Lemma; 61 | protected String m_POS; 62 | 63 | // stop words filter 64 | protected CSurroundingWordFilter m_Filter = CSurroundingWordFilter.getInstance(); 65 | 66 | // current feature 67 | protected IFeature m_CurrentFeature = null; 68 | 69 | protected static HashMap wordVectors = new HashMap(); 70 | 71 | protected static int g_LIDX = AItem.Features.LEMMA.ordinal(); 72 | protected static int g_TIDX = AItem.Features.TOKEN.ordinal(); 73 | protected static int g_PIDX = AItem.Features.POS.ordinal(); 74 | 75 | protected static int DIM_SIZE; 76 | 77 | 78 | /** 79 | * constructor 80 | */ 81 | public CSynsetCosineFeatureExtractor() { 82 | 83 | createWordVectorSet(); 84 | 85 | this.m_Left = Integer.MAX_VALUE; 86 | this.m_Right = Integer.MAX_VALUE; 87 | } 88 | 89 | /* 90 | * (non-Javadoc) 91 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#getCurrentInstanceID() 92 | */ 93 | @Override 94 | public String getCurrentInstanceID() { 95 | if (this.validIndex(this.m_Index)) { 96 | return this.m_Corpus.getValue(this.m_Index, "id"); 97 | } 98 | return null; 99 | } 100 | 101 | /* 102 | * (non-Javadoc) 103 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#hasNext() 104 | */ 105 | @Override 106 | public boolean hasNext() { 107 | if (this.m_CurrentFeature != null) { 108 | return true; 109 | } 110 | if (this.validIndex(this.m_Index)) { 111 | this.m_CurrentFeature = this.getNext(); 112 | if (this.m_CurrentFeature != null) { 113 | return true; 114 | } 115 | } 116 | return false; 117 | } 118 | 119 | /** 120 | * get the next feature of current instance 121 | * 122 | * @return feature 123 | */ 124 | protected IFeature getNext() { 125 | IFeature feature = null; 126 | if (this.m_SynsetIndex >= 0 && this.m_SynsetIndex < this.m_Synsets.size()) { 127 | feature = new CDoubleFeature(); 128 | feature.setKey(this.m_Synsets.get(this.m_SynsetIndex)); 129 | feature.setValue(this.getSynsetFeature(this.m_SynsetIndex)); 130 | this.m_SynsetIndex++; 131 | } 132 | return feature; 133 | } 134 | 135 | /** 136 | * get the part-of-speech of item p_Index + m_IndexInSentence 137 | * 138 | * @param p_Index 139 | * index 140 | * @return feature value 141 | */ 142 | protected String getSynsetFeature(int p_Index) { 143 | 144 | float cosine = getCosine(this.m_SurroundingWordVector, this.m_SynsetVectors.get(p_Index)); 145 | return Float.toString(cosine); 146 | } 147 | 148 | private float getCosine(float[] vector1, 
float[] vector2) 149 | { 150 | float value = 0; 151 | float len1 = 0; 152 | float len2 = 0; 153 | 154 | for (int i = 0; i < vector1.length; i++) { 155 | 156 | value += vector1[i] * vector2[i]; 157 | len1 += vector1[i] * vector1[i]; 158 | len2 += vector2[i] * vector2[i]; 159 | } 160 | 161 | return (float)(value/(Math.sqrt(len1)*Math.sqrt(len2))); 162 | } 163 | 164 | private void createWordVectorSet() 165 | { 166 | if (wordVectors.size() > 0) 167 | return; 168 | 169 | // path to word and synset vectors 170 | String path = sg.edu.nus.comp.nlp.ims.implement.CTester.svFile; 171 | 172 | BufferedReader br = null; 173 | try 174 | { 175 | br = new BufferedReader(new FileReader(path));; 176 | 177 | String key = null; 178 | 179 | String line = br.readLine(); 180 | String[] lineSplited = line.split(" "); 181 | 182 | DIM_SIZE = Integer.parseInt(lineSplited[1]); 183 | 184 | while ((line = br.readLine()) != null) { 185 | 186 | lineSplited = line.split(" "); 187 | 188 | key = lineSplited[0]; 189 | 190 | float vector[] = new float[DIM_SIZE]; 191 | 192 | for (int j = 0; j < DIM_SIZE; j++) { 193 | vector[j] += Float.parseFloat(lineSplited[j + 1]); 194 | } 195 | 196 | wordVectors.put(key, vector); 197 | } 198 | 199 | } catch (IOException e) 200 | { 201 | e.printStackTrace(); 202 | } 203 | } 204 | 205 | /** 206 | * check the validity of index 207 | * 208 | * @param p_Index 209 | * index 210 | * @return valid or not 211 | */ 212 | protected boolean validIndex(int p_Index) { 213 | if (this.m_Corpus != null && this.m_Corpus.size() > p_Index 214 | && p_Index >= 0) { 215 | return true; 216 | } 217 | return false; 218 | } 219 | 220 | /* 221 | * (non-Javadoc) 222 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#next() 223 | */ 224 | @Override 225 | public IFeature next() { 226 | IFeature feature = null; 227 | if (this.hasNext()) { 228 | feature = this.m_CurrentFeature; 229 | this.m_CurrentFeature = null; 230 | } 231 | return feature; 232 | } 233 | 234 | /* 235 | * (non-Javadoc) 236 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#restart() 237 | */ 238 | @Override 239 | public boolean restart() { 240 | this.m_SynsetIndex = 0; 241 | this.m_CurrentFeature = null; 242 | return this.validIndex(this.m_Index); 243 | } 244 | 245 | /* 246 | * (non-Javadoc) 247 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCorpus(sg.edu.nus.comp.nlp.ims.corpus.ICorpus) 248 | */ 249 | @Override 250 | public boolean setCorpus(ICorpus p_Corpus) { 251 | if (p_Corpus == null) { 252 | return false; 253 | } 254 | this.m_Corpus = p_Corpus; 255 | this.m_Index = 0; 256 | this.restart(); 257 | this.m_Index = -1; 258 | this.m_IndexInSentence = -1; 259 | this.m_InstanceLength = -1; 260 | return true; 261 | } 262 | 263 | /** 264 | * check whether word is in stop word list or contains no alphabet 265 | * 266 | * @param p_Word 267 | * word 268 | * @return true if it should be filtered, else false 269 | */ 270 | public boolean filter(String p_Word) { 271 | return this.m_Filter.filter(p_Word); 272 | } 273 | 274 | /* 275 | * (non-Javadoc) 276 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCurrentInstance(int) 277 | */ 278 | @Override 279 | public boolean setCurrentInstance(int p_Index) { 280 | if (this.validIndex(p_Index)) { 281 | this.m_Index = p_Index; 282 | this.m_IndexInSentence = this.m_Corpus.getIndexInSentence(p_Index); 283 | this.m_InstanceLength = this.m_Corpus.getLength(p_Index); 284 | int currentSent = this.m_Corpus.getSentenceID(p_Index); 285 | this.m_Sentence = 
this.m_Corpus.getSentence(this.m_Corpus 286 | .getSentenceID(p_Index)); 287 | this.m_Synsets.clear(); 288 | this.m_SynsetVectors.clear(); 289 | this.m_SurroundingWordSet.clear(); 290 | this.m_SurroundingWordVector = new float[DIM_SIZE]; 291 | 292 | this.m_Lemma = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_LIDX); 293 | this.m_POS = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_PIDX); 294 | String posID = "%3"; 295 | 296 | if (this.m_POS.contains("NN")) 297 | posID = "%1"; 298 | else if (this.m_POS.contains("VB")) 299 | posID = "%2"; 300 | else if (this.m_POS.contains("JJ")) 301 | posID = "%3"; 302 | else if (this.m_POS.contains("RB")) 303 | posID = "%4"; 304 | else 305 | posID = "%"; 306 | // add possible synsets 307 | for (String key : wordVectors.keySet()) 308 | { 309 | if (key.startsWith(this.m_Lemma + posID) || key.contains("," + this.m_Lemma + posID) || 310 | key.startsWith(this.m_Lemma + ",") || key.contains("," + this.m_Lemma + ",") || 311 | key.equals(this.m_Lemma)) 312 | { 313 | this.m_Synsets.add(key); 314 | this.m_SynsetVectors.add(wordVectors.get(key)); 315 | } 316 | } 317 | 318 | String keyWord = null; 319 | int lower = this.m_Corpus.getLowerBoundary(currentSent); 320 | int upper = this.m_Corpus.getUpperBoundary(currentSent); 321 | for (int sentIdx = lower; sentIdx < upper; sentIdx++) { 322 | if (currentSent - sentIdx > this.m_Left 323 | || sentIdx - currentSent > this.m_Right) { 324 | continue; 325 | } 326 | ISentence sentence = this.m_Corpus.getSentence(sentIdx); 327 | if (sentence != null) { 328 | for (int i = 0; i < sentence.size(); i++) { 329 | keyWord = sentence.getItem(i).get(g_TIDX); 330 | if (this.filter(keyWord)) { 331 | continue; 332 | } 333 | keyWord = sentence.getItem(i).get(g_LIDX); 334 | if ((sentIdx != currentSent || i < this.m_IndexInSentence || i >= this.m_IndexInSentence + this.m_InstanceLength) 335 | && !this.m_SurroundingWordSet.contains(keyWord)) 336 | { 337 | this.m_SurroundingWordSet.add(keyWord); 338 | if (wordVectors.containsKey(keyWord)) 339 | { 340 | float[] vector = wordVectors.get(keyWord); 341 | for (int j = 0; j < vector.length; j++) { 342 | this.m_SurroundingWordVector[j] += vector[j]; 343 | } 344 | } 345 | } 346 | } 347 | } 348 | } 349 | this.restart(); 350 | return true; 351 | } 352 | return false; 353 | } 354 | 355 | } 356 | -------------------------------------------------------------------------------- /IMS Features/CSynsetProductFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IMS (It Makes Sense) -- NUS WSD System 3 | * Copyright (c) 2010 National University of Singapore. 4 | * All Rights Reserved. 5 | */ 6 | package sg.edu.nus.comp.nlp.ims.feature; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.HashSet; 14 | 15 | import sg.edu.nus.comp.nlp.ims.corpus.AItem; 16 | import sg.edu.nus.comp.nlp.ims.corpus.ICorpus; 17 | import sg.edu.nus.comp.nlp.ims.corpus.ISentence; 18 | import sg.edu.nus.comp.nlp.ims.util.CSurroundingWordFilter; 19 | 20 | /** 21 | * Synset Product feature extractor. 
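For each candidate synset of the target lemma, it emits one feature per vector dimension: the element-wise product of the summed surrounding-word vector and the synset vector in that dimension.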
22 | */ 23 | public class CSynsetProductFeatureExtractor implements IFeatureExtractor { 24 | 25 | // the Synsets and corresponding vectors 26 | protected ArrayList m_Synsets = new ArrayList(); 27 | protected ArrayList m_SynsetVectors = new ArrayList(); 28 | 29 | // corpus to be extracted 30 | protected ICorpus m_Corpus = null; 31 | 32 | // index of current instance 33 | protected int m_Index = -1; 34 | 35 | // current sentence to process 36 | protected ISentence m_Sentence = null; 37 | 38 | // item index in current sentence 39 | protected int m_IndexInSentence; 40 | 41 | // item length 42 | protected int m_InstanceLength; 43 | 44 | // index of Synset feature 45 | protected int m_FeatureIndex = -1; 46 | 47 | // sentence before current sentence 48 | protected int m_Left; 49 | 50 | // sentence after current sentence 51 | protected int m_Right; 52 | 53 | // surrounding words of current instance 54 | protected HashSet m_SurroundingWordSet = new HashSet(); 55 | 56 | // vector of surroundings word of current instance 57 | protected float[] m_SurroundingWordVector; 58 | 59 | // current lemma to process 60 | protected String m_Lemma; 61 | protected String m_POS; 62 | 63 | // stop words filter 64 | protected CSurroundingWordFilter m_Filter = CSurroundingWordFilter.getInstance(); 65 | 66 | // current feature 67 | protected IFeature m_CurrentFeature = null; 68 | 69 | protected static HashMap wordVectors = new HashMap(); 70 | 71 | protected static int g_LIDX = AItem.Features.LEMMA.ordinal(); 72 | protected static int g_TIDX = AItem.Features.TOKEN.ordinal(); 73 | protected static int g_PIDX = AItem.Features.POS.ordinal(); 74 | 75 | protected static int DIM_SIZE; 76 | 77 | 78 | /** 79 | * constructor 80 | */ 81 | public CSynsetProductFeatureExtractor() { 82 | 83 | createWordVectorSet(); 84 | 85 | this.m_Left = Integer.MAX_VALUE; 86 | this.m_Right = Integer.MAX_VALUE; 87 | } 88 | 89 | /* 90 | * (non-Javadoc) 91 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#getCurrentInstanceID() 92 | */ 93 | @Override 94 | public String getCurrentInstanceID() { 95 | if (this.validIndex(this.m_Index)) { 96 | return this.m_Corpus.getValue(this.m_Index, "id"); 97 | } 98 | return null; 99 | } 100 | 101 | /* 102 | * (non-Javadoc) 103 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#hasNext() 104 | */ 105 | @Override 106 | public boolean hasNext() { 107 | if (this.m_CurrentFeature != null) { 108 | return true; 109 | } 110 | if (this.validIndex(this.m_Index)) { 111 | this.m_CurrentFeature = this.getNext(); 112 | if (this.m_CurrentFeature != null) { 113 | return true; 114 | } 115 | } 116 | return false; 117 | } 118 | 119 | /** 120 | * get the next feature of current instance 121 | * 122 | * @return feature 123 | */ 124 | protected IFeature getNext() { 125 | IFeature feature = null; 126 | if (this.m_FeatureIndex >= 0 && this.m_FeatureIndex < this.m_Synsets.size() * DIM_SIZE) { 127 | feature = new CDoubleFeature(); 128 | int index = this.m_FeatureIndex / DIM_SIZE; 129 | int dimension = this.m_FeatureIndex % DIM_SIZE; 130 | feature.setKey(dimension + "_" + this.m_Synsets.get(index)); 131 | feature.setValue(this.getSynsetFeature(index, dimension)); 132 | this.m_FeatureIndex++; 133 | } 134 | return feature; 135 | } 136 | 137 | /** 138 | * get the part-of-speech of item p_Index + m_IndexInSentence 139 | * 140 | * @param p_Index 141 | * index 142 | * @return feature value 143 | */ 144 | protected String getSynsetFeature(int index, int dimension) { 145 | 146 | float result = this.m_SurroundingWordVector[dimension] 
* this.m_SynsetVectors.get(index)[dimension]; 147 | return Float.toString(result); 148 | } 149 | 150 | private void createWordVectorSet() 151 | { 152 | if (wordVectors.size() > 0) 153 | return; 154 | 155 | // path to word and synset vectors 156 | String path = sg.edu.nus.comp.nlp.ims.implement.CTester.svFile; 157 | 158 | System.err.println("Reading word and synsets vector from:"); 159 | System.err.println(path); 160 | 161 | BufferedReader br = null; 162 | try 163 | { 164 | br = new BufferedReader(new FileReader(path));; 165 | 166 | String key = null; 167 | 168 | String line = br.readLine(); 169 | String[] lineSplited = line.split(" "); 170 | 171 | DIM_SIZE = Integer.parseInt(lineSplited[1]); 172 | 173 | while ((line = br.readLine()) != null) { 174 | 175 | lineSplited = line.split(" "); 176 | 177 | key = lineSplited[0]; 178 | 179 | float vector[] = new float[DIM_SIZE]; 180 | 181 | for (int j = 0; j < DIM_SIZE; j++) { 182 | vector[j] += Float.parseFloat(lineSplited[j + 1]); 183 | } 184 | 185 | wordVectors.put(key, vector); 186 | } 187 | 188 | } catch (IOException e) 189 | { 190 | e.printStackTrace(); 191 | } 192 | 193 | System.err.println("Done!"); 194 | } 195 | 196 | /** 197 | * check the validity of index 198 | * 199 | * @param p_Index 200 | * index 201 | * @return valid or not 202 | */ 203 | protected boolean validIndex(int p_Index) { 204 | if (this.m_Corpus != null && this.m_Corpus.size() > p_Index 205 | && p_Index >= 0) { 206 | return true; 207 | } 208 | return false; 209 | } 210 | 211 | /* 212 | * (non-Javadoc) 213 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#next() 214 | */ 215 | @Override 216 | public IFeature next() { 217 | IFeature feature = null; 218 | if (this.hasNext()) { 219 | feature = this.m_CurrentFeature; 220 | this.m_CurrentFeature = null; 221 | } 222 | return feature; 223 | } 224 | 225 | /* 226 | * (non-Javadoc) 227 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#restart() 228 | */ 229 | @Override 230 | public boolean restart() { 231 | this.m_FeatureIndex = 0; 232 | this.m_CurrentFeature = null; 233 | return this.validIndex(this.m_Index); 234 | } 235 | 236 | /* 237 | * (non-Javadoc) 238 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCorpus(sg.edu.nus.comp.nlp.ims.corpus.ICorpus) 239 | */ 240 | @Override 241 | public boolean setCorpus(ICorpus p_Corpus) { 242 | if (p_Corpus == null) { 243 | return false; 244 | } 245 | this.m_Corpus = p_Corpus; 246 | this.m_Index = 0; 247 | this.restart(); 248 | this.m_Index = -1; 249 | this.m_IndexInSentence = -1; 250 | this.m_InstanceLength = -1; 251 | return true; 252 | } 253 | 254 | /** 255 | * check whether word is in stop word list or contains no alphabet 256 | * 257 | * @param p_Word 258 | * word 259 | * @return true if it should be filtered, else false 260 | */ 261 | public boolean filter(String p_Word) { 262 | return this.m_Filter.filter(p_Word); 263 | } 264 | 265 | /* 266 | * (non-Javadoc) 267 | * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCurrentInstance(int) 268 | */ 269 | @Override 270 | public boolean setCurrentInstance(int p_Index) { 271 | if (this.validIndex(p_Index)) { 272 | this.m_Index = p_Index; 273 | this.m_IndexInSentence = this.m_Corpus.getIndexInSentence(p_Index); 274 | this.m_InstanceLength = this.m_Corpus.getLength(p_Index); 275 | int currentSent = this.m_Corpus.getSentenceID(p_Index); 276 | this.m_Sentence = this.m_Corpus.getSentence(this.m_Corpus 277 | .getSentenceID(p_Index)); 278 | this.m_Synsets.clear(); 279 | this.m_SynsetVectors.clear(); 280 | 
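// the surrounding-word set and context vector are rebuilt below by summing the
// embeddings of all unfiltered lemmas within the allowed sentence window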
this.m_SurroundingWordSet.clear(); 281 | this.m_SurroundingWordVector = new float[DIM_SIZE]; 282 | 283 | this.m_Lemma = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_LIDX); 284 | this.m_POS = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_PIDX); 285 | String posID = "%3"; 286 | 287 | if (this.m_POS.contains("NN")) 288 | posID = "%1"; 289 | else if (this.m_POS.contains("VB")) 290 | posID = "%2"; 291 | else if (this.m_POS.contains("JJ")) 292 | posID = "%3"; 293 | else if (this.m_POS.contains("RB")) 294 | posID = "%4"; 295 | else 296 | posID = "%"; 297 | // add possible synsets 298 | for (String key : wordVectors.keySet()) 299 | { 300 | if (key.startsWith(this.m_Lemma + posID) || key.contains("," + this.m_Lemma + posID) || 301 | key.startsWith(this.m_Lemma + ",") || key.contains("," + this.m_Lemma + ",") || 302 | key.equals(this.m_Lemma)) 303 | { 304 | this.m_Synsets.add(key); 305 | this.m_SynsetVectors.add(wordVectors.get(key)); 306 | } 307 | } 308 | 309 | String keyWord = null; 310 | int lower = this.m_Corpus.getLowerBoundary(currentSent); 311 | int upper = this.m_Corpus.getUpperBoundary(currentSent); 312 | for (int sentIdx = lower; sentIdx < upper; sentIdx++) { 313 | if (currentSent - sentIdx > this.m_Left 314 | || sentIdx - currentSent > this.m_Right) { 315 | continue; 316 | } 317 | ISentence sentence = this.m_Corpus.getSentence(sentIdx); 318 | if (sentence != null) { 319 | for (int i = 0; i < sentence.size(); i++) { 320 | keyWord = sentence.getItem(i).get(g_TIDX); 321 | if (this.filter(keyWord)) { 322 | continue; 323 | } 324 | keyWord = sentence.getItem(i).get(g_LIDX); 325 | if ((sentIdx != currentSent || i < this.m_IndexInSentence || i >= this.m_IndexInSentence + this.m_InstanceLength) 326 | && !this.m_SurroundingWordSet.contains(keyWord)) 327 | { 328 | this.m_SurroundingWordSet.add(keyWord); 329 | if (wordVectors.containsKey(keyWord)) 330 | { 331 | float[] vector = wordVectors.get(keyWord); 332 | for (int j = 0; j < vector.length; j++) { 333 | this.m_SurroundingWordVector[j] += vector[j]; 334 | } 335 | } 336 | } 337 | } 338 | } 339 | } 340 | this.restart(); 341 | return true; 342 | } 343 | return false; 344 | } 345 | 346 | } 347 | -------------------------------------------------------------------------------- /IMS Features/CSynsetRawFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IMS (It Makes Sense) -- NUS WSD System 3 | * Copyright (c) 2010 National University of Singapore. 4 | * All Rights Reserved. 5 | */ 6 | package sg.edu.nus.comp.nlp.ims.feature; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.HashSet; 14 | 15 | import sg.edu.nus.comp.nlp.ims.corpus.AItem; 16 | import sg.edu.nus.comp.nlp.ims.corpus.ICorpus; 17 | import sg.edu.nus.comp.nlp.ims.corpus.ISentence; 18 | import sg.edu.nus.comp.nlp.ims.util.CSurroundingWordFilter; 19 | 20 | /** 21 | * Synset Raw feature extractor. 
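It emits the raw vector components as features: first the summed surrounding-word vector (keyed "<dim>_sentence"), then each candidate synset's vector, one feature per dimension.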
22 | */
23 | public class CSynsetRawFeatureExtractor implements IFeatureExtractor {
24 | 
25 |     // the Synsets and corresponding vectors
26 |     protected ArrayList<String> m_Synsets = new ArrayList<String>();
27 |     protected ArrayList<float[]> m_SynsetVectors = new ArrayList<float[]>();
28 | 
29 |     // corpus to be extracted
30 |     protected ICorpus m_Corpus = null;
31 | 
32 |     // index of current instance
33 |     protected int m_Index = -1;
34 | 
35 |     // current sentence to process
36 |     protected ISentence m_Sentence = null;
37 | 
38 |     // item index in current sentence
39 |     protected int m_IndexInSentence;
40 | 
41 |     // item length
42 |     protected int m_InstanceLength;
43 | 
44 |     // index of Synset feature
45 |     protected int m_FeatureIndex = -1;
46 | 
47 |     // number of sentences before the current sentence to consider
48 |     protected int m_Left;
49 | 
50 |     // number of sentences after the current sentence to consider
51 |     protected int m_Right;
52 | 
53 |     // surrounding words of current instance
54 |     protected HashSet<String> m_SurroundingWordSet = new HashSet<String>();
55 | 
56 |     // sum vector of the surrounding words of the current instance
57 |     protected float[] m_SurroundingWordVector;
58 | 
59 |     // current lemma to process
60 |     protected String m_Lemma;
61 |     protected String m_POS;
62 | 
63 |     // stop words filter
64 |     protected CSurroundingWordFilter m_Filter = CSurroundingWordFilter.getInstance();
65 | 
66 |     // current feature
67 |     protected IFeature m_CurrentFeature = null;
68 | 
69 |     protected static HashMap<String, float[]> wordVectors = new HashMap<String, float[]>();
70 | 
71 |     protected static int g_LIDX = AItem.Features.LEMMA.ordinal();
72 |     protected static int g_TIDX = AItem.Features.TOKEN.ordinal();
73 |     protected static int g_PIDX = AItem.Features.POS.ordinal();
74 | 
75 |     protected static int DIM_SIZE;
76 | 
77 | 
78 |     /**
79 |      * constructor
80 |      */
81 |     public CSynsetRawFeatureExtractor() {
82 | 
83 |         createWordVectorSet();
84 | 
85 |         this.m_Left = Integer.MAX_VALUE;
86 |         this.m_Right = Integer.MAX_VALUE;
87 |     }
88 | 
89 |     /*
90 |      * (non-Javadoc)
91 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#getCurrentInstanceID()
92 |      */
93 |     @Override
94 |     public String getCurrentInstanceID() {
95 |         if (this.validIndex(this.m_Index)) {
96 |             return this.m_Corpus.getValue(this.m_Index, "id");
97 |         }
98 |         return null;
99 |     }
100 | 
101 |     /*
102 |      * (non-Javadoc)
103 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#hasNext()
104 |      */
105 |     @Override
106 |     public boolean hasNext() {
107 |         if (this.m_CurrentFeature != null) {
108 |             return true;
109 |         }
110 |         if (this.validIndex(this.m_Index)) {
111 |             this.m_CurrentFeature = this.getNext();
112 |             if (this.m_CurrentFeature != null) {
113 |                 return true;
114 |             }
115 |         }
116 |         return false;
117 |     }
118 | 
119 |     /**
120 |      * get the next feature of current instance
121 |      *
122 |      * @return feature
123 |      */
124 |     protected IFeature getNext() {
125 |         IFeature feature = null;
126 |         if (this.m_FeatureIndex >= 0 && this.m_FeatureIndex < (this.m_Synsets.size() + 1) * DIM_SIZE) {
127 |             feature = new CDoubleFeature();
128 |             int index = (this.m_FeatureIndex / DIM_SIZE) - 1;
129 |             int dimension = this.m_FeatureIndex % DIM_SIZE;
130 |             if (index == -1)
131 |                 feature.setKey(dimension + "_sentence");
132 |             else
133 |                 feature.setKey(dimension + "_" + this.m_Synsets.get(index));
134 |             feature.setValue(this.getSynsetFeature(index, dimension));
135 |             this.m_FeatureIndex++;
136 |         }
137 |         return feature;
138 |     }
139 | 
140 |     /**
141 |      * get the feature value of the synset at position index (index -1
142 |      * selects the sentence context vector) in the given dimension
143 |      *
144 |      * @param index synset index, -1 for the sentence vector
145 |      * @return feature value
146 |      */
147 |     protected String getSynsetFeature(int index, int dimension) {
148 | 
149 |         if (index == -1)
150 |             return Float.toString(this.m_SurroundingWordVector[dimension]);
151 |         else
152 |             return Float.toString(this.m_SynsetVectors.get(index)[dimension]);
153 |     }
154 | 
155 |     private void createWordVectorSet()
156 |     {
157 |         if (wordVectors.size() > 0)
158 |             return;
159 | 
160 |         // path to word and synset vectors
161 |         String path = sg.edu.nus.comp.nlp.ims.implement.CTester.svFile;
162 | 
163 |         BufferedReader br = null;
164 |         try
165 |         {
166 |             br = new BufferedReader(new FileReader(path));
167 | 
168 |             String key = null;
169 | 
170 |             String line = br.readLine();
171 |             String[] lineSplited = line.split(" ");
172 | 
173 |             DIM_SIZE = Integer.parseInt(lineSplited[1]);
174 | 
175 |             while ((line = br.readLine()) != null) {
176 | 
177 |                 lineSplited = line.split(" ");
178 | 
179 |                 key = lineSplited[0];
180 | 
181 |                 float[] vector = new float[DIM_SIZE];
182 | 
183 |                 for (int j = 0; j < DIM_SIZE; j++) {
184 |                     vector[j] = Float.parseFloat(lineSplited[j + 1]);
185 |                 }
186 | 
187 |                 wordVectors.put(key, vector);
188 |             }
189 |             br.close();
190 |         } catch (IOException e)
191 |         {
192 |             e.printStackTrace();
193 |         }
194 |     }
195 | 
196 |     /**
197 |      * check the validity of index
198 |      *
199 |      * @param p_Index
200 |      *            index
201 |      * @return valid or not
202 |      */
203 |     protected boolean validIndex(int p_Index) {
204 |         if (this.m_Corpus != null && this.m_Corpus.size() > p_Index
205 |                 && p_Index >= 0) {
206 |             return true;
207 |         }
208 |         return false;
209 |     }
210 | 
211 |     /*
212 |      * (non-Javadoc)
213 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#next()
214 |      */
215 |     @Override
216 |     public IFeature next() {
217 |         IFeature feature = null;
218 |         if (this.hasNext()) {
219 |             feature = this.m_CurrentFeature;
220 |             this.m_CurrentFeature = null;
221 |         }
222 |         return feature;
223 |     }
224 | 
225 |     /*
226 |      * (non-Javadoc)
227 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#restart()
228 |      */
229 |     @Override
230 |     public boolean restart() {
231 |         this.m_FeatureIndex = 0;
232 |         this.m_CurrentFeature = null;
233 |         return this.validIndex(this.m_Index);
234 |     }
235 | 
236 |     /*
237 |      * (non-Javadoc)
238 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCorpus(sg.edu.nus.comp.nlp.ims.corpus.ICorpus)
239 |      */
240 |     @Override
241 |     public boolean setCorpus(ICorpus p_Corpus) {
242 |         if (p_Corpus == null) {
243 |             return false;
244 |         }
245 |         this.m_Corpus = p_Corpus;
246 |         this.m_Index = 0;
247 |         this.restart();
248 |         this.m_Index = -1;
249 |         this.m_IndexInSentence = -1;
250 |         this.m_InstanceLength = -1;
251 |         return true;
252 |     }
253 | 
254 |     /**
255 |      * check whether word is in stop word list or contains no alphabet
256 |      *
257 |      * @param p_Word
258 |      *            word
259 |      * @return true if it should be filtered, else false
260 |      */
261 |     public boolean filter(String p_Word) {
262 |         return this.m_Filter.filter(p_Word);
263 |     }
264 | 
265 |     /*
266 |      * (non-Javadoc)
267 |      * @see sg.edu.nus.comp.nlp.ims.feature.IFeatureExtractor#setCurrentInstance(int)
268 |      */
269 |     @Override
270 |     public boolean setCurrentInstance(int p_Index) {
271 |         if (this.validIndex(p_Index)) {
272 |             this.m_Index = p_Index;
273 |             this.m_IndexInSentence = this.m_Corpus.getIndexInSentence(p_Index);
274 |             this.m_InstanceLength = this.m_Corpus.getLength(p_Index);
275 |             int currentSent = this.m_Corpus.getSentenceID(p_Index);
276 |             this.m_Sentence = this.m_Corpus.getSentence(currentSent);
277 | 
278 |             this.m_Synsets.clear();
279 |             this.m_SynsetVectors.clear();
280 |             this.m_SurroundingWordSet.clear();
281 |             this.m_SurroundingWordVector = new float[DIM_SIZE];
282 | 
283 |             this.m_Lemma = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_LIDX);
284 |             this.m_POS = this.m_Sentence.getItem(this.m_IndexInSentence).get(g_PIDX);
285 |             String posID;
286 |             // WordNet sense-key POS ids: %1 noun, %2 verb, %3 adjective, %4 adverb
287 |             if (this.m_POS.contains("NN"))
288 |                 posID = "%1";
289 |             else if (this.m_POS.contains("VB"))
290 |                 posID = "%2";
291 |             else if (this.m_POS.contains("JJ"))
292 |                 posID = "%3";
293 |             else if (this.m_POS.contains("RB"))
294 |                 posID = "%4";
295 |             else
296 |                 posID = "%";
297 |             // add possible synsets
298 |             for (String key : wordVectors.keySet())
299 |             {
300 |                 if (key.startsWith(this.m_Lemma + posID) || key.contains("," + this.m_Lemma + posID) ||
301 |                     key.startsWith(this.m_Lemma + ",") || key.contains("," + this.m_Lemma + ",") ||
302 |                     key.equals(this.m_Lemma))
303 |                 {
304 |                     this.m_Synsets.add(key);
305 |                     this.m_SynsetVectors.add(wordVectors.get(key));
306 |                 }
307 |             }
308 | 
309 |             String keyWord = null;
310 |             int lower = this.m_Corpus.getLowerBoundary(currentSent);
311 |             int upper = this.m_Corpus.getUpperBoundary(currentSent);
312 |             for (int sentIdx = lower; sentIdx < upper; sentIdx++) {
313 |                 if (currentSent - sentIdx > this.m_Left
314 |                         || sentIdx - currentSent > this.m_Right) {
315 |                     continue;
316 |                 }
317 |                 ISentence sentence = this.m_Corpus.getSentence(sentIdx);
318 |                 if (sentence != null) {
319 |                     for (int i = 0; i < sentence.size(); i++) {
320 |                         keyWord = sentence.getItem(i).get(g_TIDX);
321 |                         if (this.filter(keyWord)) {
322 |                             continue;
323 |                         }
324 |                         keyWord = sentence.getItem(i).get(g_LIDX);
325 |                         if ((sentIdx != currentSent || i < this.m_IndexInSentence || i >= this.m_IndexInSentence + this.m_InstanceLength)
326 |                                 && !this.m_SurroundingWordSet.contains(keyWord))
327 |                         {
328 |                             this.m_SurroundingWordSet.add(keyWord);
329 |                             if (wordVectors.containsKey(keyWord))
330 |                             {
331 |                                 float[] vector = wordVectors.get(keyWord);
332 |                                 for (int j = 0; j < vector.length; j++) {
333 |                                     this.m_SurroundingWordVector[j] += vector[j];
334 |                                 }
335 |                             }
336 |                         }
337 |                     }
338 |                 }
339 |             }
340 |             this.restart();
341 |             return true;
342 |         }
343 |         return false;
344 |     }
345 | 
346 | }
347 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 casaro
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
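The feature layout produced by CSynsetRawFeatureExtractor.getNext() above is worth spelling out: feature indices enumerate one sentence-context vector followed by one vector per candidate synset, DIM_SIZE values each. A minimal, self-contained sketch of that index arithmetic (the class name FeatureLayoutDemo and the synset IDs are illustrative, not part of the project):

public class FeatureLayoutDemo {
    public static void main(String[] args) {
        int DIM_SIZE = 3; // toy dimensionality; the real value comes from the vector file header
        String[] synsets = { "wn-3.0-00001740-n", "wn-3.0-00002098-n" }; // made-up candidates
        int total = (synsets.length + 1) * DIM_SIZE;
        for (int f = 0; f < total; f++) {
            int index = (f / DIM_SIZE) - 1;  // -1 selects the sentence-context vector
            int dimension = f % DIM_SIZE;
            String key = (index == -1)
                    ? dimension + "_sentence"
                    : dimension + "_" + synsets[index];
            System.out.println(f + " -> " + key); // e.g. "0 -> 0_sentence", "3 -> 0_wn-3.0-00001740-n"
        }
    }
}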
22 | 
--------------------------------------------------------------------------------
/WordNetExtractor/Shared.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedInputStream;
2 | import java.io.BufferedOutputStream;
3 | import java.io.BufferedReader;
4 | import java.io.DataInputStream;
5 | import java.io.DataOutputStream;
6 | import java.io.FileInputStream;
7 | import java.io.FileNotFoundException;
8 | import java.io.FileOutputStream;
9 | import java.io.FileReader;
10 | import java.io.IOException;
11 | import java.io.InputStream;
12 | import java.io.PrintWriter;
13 | import java.io.UnsupportedEncodingException;
14 | import java.util.HashMap;
15 | import java.util.Iterator;
16 | import java.util.Map;
17 | import java.util.Map.Entry;
18 | 
19 | 
20 | public class Shared
21 | {
22 |     public static HashMap<String, float[]> WordMap = new HashMap<String, float[]>();
23 | 
24 |     public static int words;
25 |     public static int size;
26 | 
27 |     private static final int MAX_SIZE = 50;
28 | 
29 |     public static void loadTxtModel(String path)
30 |     {
31 |         BufferedReader br = null;
32 |         try
33 |         {
34 |             br = new BufferedReader(new FileReader(path));
35 | 
36 |             String line = br.readLine();
37 |             String[] lineSplited = line.split(" ");
38 | 
39 |             words = Integer.parseInt(lineSplited[0]);
40 |             size = Integer.parseInt(lineSplited[1]);
41 | 
42 |             float vector = 0;
43 | 
44 |             String key = null;
45 |             float[] value = null;
46 |             for (int i = 0; i < words; i++) {
47 | 
48 |                 line = br.readLine();
49 |                 lineSplited = line.split(" ");
50 | 
51 |                 key = lineSplited[0];
52 |                 value = new float[size];
53 |                 for (int j = 0; j < size; j++) {
54 |                     vector = Float.parseFloat(lineSplited[j + 1]);
55 |                     value[j] = vector;
56 |                 }
57 | 
58 |                 WordMap.put(key, value);
59 |             }
60 |             br.close();
61 |         } catch (FileNotFoundException e)
62 |         {
63 |             e.printStackTrace();
64 |         } catch (IOException e)
65 |         {
66 |             e.printStackTrace();
67 |         }
68 |     }
69 | 
70 |     public static void loadGoogleModel(String path)
71 |     {
72 |         DataInputStream dis = null;
73 |         BufferedInputStream bis = null;
74 |         double len = 0;
75 |         float vector = 0;
76 |         try
77 |         {
78 |             bis = new BufferedInputStream(new FileInputStream(path));
79 |             dis = new DataInputStream(bis);
80 | 
81 |             words = Integer.parseInt(readString(dis));
82 |             size = Integer.parseInt(readString(dis));
83 | 
84 |             String key;
85 |             float[] value = null;
86 |             float[] valueN = null;
87 |             for (int i = 0; i < words; i++)
88 |             {
89 |                 key = readString(dis);
90 |                 value = new float[size];
91 |                 valueN = new float[size];
92 |                 len = 0;
93 |                 for (int j = 0; j < size; j++)
94 |                 {
95 |                     vector = readFloat(dis);
96 |                     len += vector * vector;
97 |                     value[j] = (float) vector;
98 |                 }
99 |                 len = Math.sqrt(len);
100 | 
101 |                 for (int j = 0; j < size; j++)
102 |                 {
103 |                     valueN[j] = value[j] / (float) len;
104 |                 }
105 | 
106 |                 WordMap.put(key, value); // note: the normalized copy valueN is computed but never stored
107 |             }
108 | 
109 |             bis.close();
110 |             dis.close();
111 |         } catch (FileNotFoundException e)
112 |         {
113 |             e.printStackTrace();
114 |         } catch (IOException e)
115 |         {
116 |             e.printStackTrace();
117 |         }
118 |     }
119 | 
120 |     public static void saveGoogleModel(String path)
121 |     {
122 |         DataOutputStream dos = null;
123 |         BufferedOutputStream bos = null;
124 | 
125 |         try
126 |         {
127 |             bos = new BufferedOutputStream(new FileOutputStream(path));
128 |             dos = new DataOutputStream(bos);
129 | 
130 |             dos.writeBytes(Integer.toString(words));
131 |             dos.writeByte(' ');
132 |             dos.writeBytes(Integer.toString(size));
133 |             dos.writeByte('\n');
134 | 
135 |             Iterator<Map.Entry<String, float[]>> it = WordMap.entrySet().iterator();
136 |             while (it.hasNext())
137 |             {
138 |                 Map.Entry<String, float[]> pairs = it.next();
139 |                 String key = pairs.getKey();
140 |                 float[] value = pairs.getValue();
141 | 
142 |                 dos.writeBytes(key);
143 |                 dos.writeByte(' ');
144 | 
145 |                 for (int j = 0; j < size; j++)
146 |                 {
147 |                     //dos.writeFloat(value[j]); // would write big-endian
148 |                     dos.writeInt(Integer.reverseBytes(Float.floatToIntBits(value[j]))); // little-endian, as word2vec expects
149 |                 }
150 |                 it.remove(); // avoids a ConcurrentModificationException
151 |             }
152 | 
153 |             bos.close();
154 |             dos.close();
155 |         } catch (FileNotFoundException e)
156 |         {
157 |             e.printStackTrace();
158 |         } catch (IOException e)
159 |         {
160 |             e.printStackTrace();
161 |         }
162 |     }
163 | 
164 |     public static void convertGoogleModel(String path, String filename)
165 |     {
166 |         PrintWriter writer;
167 |         DataInputStream dis = null;
168 |         BufferedInputStream bis = null;
169 | 
170 |         float vector = 0;
171 |         try
172 |         {
173 |             bis = new BufferedInputStream(new FileInputStream(path));
174 |             dis = new DataInputStream(bis);
175 |             writer = new PrintWriter(filename, "UTF-8");
176 | 
177 |             words = Integer.parseInt(readString(dis));
178 |             size = Integer.parseInt(readString(dis));
179 | 
180 |             String key;
181 |             float[] value = null;
182 |             float[] valueUnknown = new float[size];
183 |             for (int i = 0; i < words; i++)
184 |             {
185 |                 key = readString(dis);
186 |                 key = key.toLowerCase();
187 |                 value = new float[size];
188 |                 for (int j = 0; j < size; j++)
189 |                 {
190 |                     vector = readFloat(dis);
191 |                     value[j] = (float) vector;
192 |                     if (i >= words-100000)
193 |                         valueUnknown[j] += ((float) vector / 100000); // average over the last (rarest) 100000 words
194 |                 }
195 | 
196 |                 if (WordMap.containsKey(key))
197 |                     continue;
198 | 
199 |                 writer.print(key + " ");
200 |                 writer.print(getVectorAsString(value) + "\n");
201 | 
202 |                 WordMap.put(key, value);
203 |             }
204 | 
205 |             WordMap.put("<unknown>", valueUnknown); // assumption: the angle-bracket token was lost in extraction
206 |             writer.print("<unknown>" + " ");
207 |             writer.print(getVectorAsString(valueUnknown) + "\n");
208 | 
209 |             bis.close();
210 |             dis.close();
211 |             writer.close();
212 |         } catch (FileNotFoundException e)
213 |         {
214 |             e.printStackTrace();
215 |         } catch (IOException e)
216 |         {
217 |             e.printStackTrace();
218 |         }
219 | 
220 |         System.out.printf("%8d / %8d\n", WordMap.size(), words);
221 |     }
222 | 
223 |     public static void saveTxtModel(String filename)
224 |     {
225 |         // create file
226 |         PrintWriter writer;
227 |         try
228 |         {
229 |             writer = new PrintWriter(filename, "UTF-8");
230 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
231 |         {
232 |             e.printStackTrace();
233 | 
234 |             return;
235 |         }
236 | 
237 |         writer.print(Integer.toString(words));
238 |         writer.print(" ");
239 |         writer.print(Integer.toString(size));
240 |         writer.print("\n");
241 | 
242 |         // loop through all words
243 |         Iterator<Map.Entry<String, float[]>> it = WordMap.entrySet().iterator();
244 |         while (it.hasNext())
245 |         {
246 |             Map.Entry<String, float[]> pairs = it.next();
247 |             String key = pairs.getKey();
248 |             float[] value = pairs.getValue();
249 | 
250 |             writer.print(key + " ");
251 |             writer.print(getVectorAsString(value) + "\n");
252 |         }
253 | 
254 |         writer.close();
255 |     }
256 | 
257 |     public static String getVectorAsString(float[] vector)
258 |     {
259 |         StringBuilder sb = new StringBuilder();
260 | 
261 |         for (int b = 0; b < size; b++)
262 |         {
263 |             sb.append(vector[b]);
264 |             sb.append(" ");
265 |         }
266 | 
267 |         return sb.toString().trim();
268 |     }
269 | 
270 |     private static float readFloat(InputStream is)
271 |     {
272 |         byte[] bytes = new byte[4];
273 |         try
274 |         {
275 |             is.read(bytes);
276 |         } catch (IOException e)
277 |         {
278 |             e.printStackTrace();
279 |         }
280 |         float f = getFloat(bytes);
281 |         return f;
282 |     }
283 | 
284 |     private static float getFloat(byte[] b)
285 |     {
286 |         int accum = 0;
287 |         accum = accum | (b[0] & 0xff) << 0;
288 |         accum = accum | (b[1] & 0xff) << 8;
289 |         accum = accum | (b[2] & 0xff) << 16;
290 |         accum = accum | (b[3] & 0xff) << 24;
291 |         return Float.intBitsToFloat(accum); // little-endian byte order
292 |     }
293 | 
294 |     private static String readString(DataInputStream dis)
295 |     {
296 |         byte[] bytes = new byte[MAX_SIZE];
297 |         StringBuilder sb = new StringBuilder();
298 |         try
299 |         {
300 |             byte b = dis.readByte();
301 |             int i = -1;
302 | 
303 |             if (b == 10) // skip a leading newline
304 |                 b = dis.readByte();
305 | 
306 |             while (b != 32 && b != 10) // read until space or newline
307 |             {
308 |                 i++;
309 |                 bytes[i] = b;
310 |                 b = dis.readByte();
311 |                 if (i == MAX_SIZE - 1)
312 |                 {
313 |                     sb.append(new String(bytes));
314 |                     i = -1;
315 |                     bytes = new byte[MAX_SIZE];
316 |                 }
317 |             }
318 |             sb.append(new String(bytes, 0, i + 1));
319 | 
320 |         } catch (IOException e)
321 |         {
322 |             e.printStackTrace();
323 |         }
324 |         String s = sb.toString();
325 |         return s;
326 |     }
327 | 
328 |     public static void createSyntacticVec(String path, String pathLemmaMap, String filename)
329 |     {
330 |         // create file
331 |         PrintWriter writer;
332 |         try
333 |         {
334 |             writer = new PrintWriter(filename, "UTF-8");
335 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
336 |         {
337 |             e.printStackTrace();
338 | 
339 |             return;
340 |         }
341 | 
342 |         HashMap<String, float[]> leadingLemma = new HashMap<String, float[]>();
343 | 
344 |         DataInputStream dis = null;
345 |         BufferedInputStream bis = null;
346 |         BufferedReader br = null;
347 |         double len = 0;
348 |         float vector = 0;
349 |         try
350 |         {
351 |             bis = new BufferedInputStream(new FileInputStream(path));
352 |             dis = new DataInputStream(bis);
353 |             br = new BufferedReader(new FileReader(pathLemmaMap));
354 | 
355 |             words = Integer.parseInt(readString(dis));
356 |             size = Integer.parseInt(readString(dis));
357 | 
358 |             String key;
359 |             float[] value = null;
360 |             for (int i = 0; i < words; i++)
361 |             {
362 |                 key = readString(dis);
363 |                 value = new float[size];
364 |                 len = 0;
365 |                 for (int j = 0; j < size; j++)
366 |                 {
367 |                     vector = readFloat(dis);
368 |                     len += vector * vector;
369 |                     value[j] = vector;
370 |                 }
371 |                 len = Math.sqrt(len); // vector length; computed but not used below
372 | 
373 |                 String line = br.readLine();
374 |                 String keyLemma = line.split("\t")[1]; // skip entries whose second column is empty
375 | 
376 |                 if (keyLemma.equals(""))
377 |                     continue;
378 | 
379 |                 keyLemma = line.split("\t")[0]; // use the first column as the lemma key
380 | 
381 |                 if (leadingLemma.containsKey(keyLemma))
382 |                 {
383 |                     float[] diff = leadingLemma.get(keyLemma);
384 |                     for (int j = 0; j < size; j++)
385 |                     {
386 |                         diff[j] -= value[j]; // note: mutates the vector stored in leadingLemma
387 |                     }
388 |                     writer.print(normalizeLemma(key) + " ");
389 |                     writer.print(getVectorAsString(diff) + "\n");
390 |                 }
391 |                 else
392 |                 {
393 |                     leadingLemma.put(keyLemma, value);
394 |                 }
395 |             }
396 |             br.close();
397 |             writer.close();
398 |             bis.close();
399 |             dis.close();
400 |         } catch (FileNotFoundException e)
401 |         {
402 |             e.printStackTrace();
403 |         } catch (IOException e)
404 |         {
405 |             e.printStackTrace();
406 |         }
407 |     }
408 | 
409 |     public static String normalizeText(String s)
410 |     {
411 |         s = s.replace('’', '\'');
412 |         s = s.replace('′', '\'');
413 |         s = s.replace("''", " ");
414 |         s = s.replace("'", " ' ");
415 |         s = s.replace('“', '"');
416 |         s = s.replace('”', '"');
417 |         s = s.replace("\"", " \" ");
418 |         s = s.replace(".", " . ");
419 |         s = s.replace(",", " , ");
420 |         s = s.replace("(", " ( ");
421 |         s = s.replace(")", " ) ");
422 |         s = s.replace("!", " ! ");
"); 423 | s = s.replace(';', ' '); 424 | s = s.replace(':', ' '); 425 | s = s.replace("-", " - "); 426 | s = s.replace('=', ' '); 427 | s = s.replace('*', ' '); 428 | s = s.replace('|', ' '); 429 | s = s.replace('«', ' '); 430 | s = s.replace(" ", " "); 431 | s = s.replace(" ", " "); 432 | 433 | s = s.trim(); 434 | 435 | s = s.toLowerCase(); 436 | 437 | return s; 438 | } 439 | 440 | public static String normalizeLemma(String s) 441 | { 442 | s = s.replaceAll("\\(..?\\)", ""); 443 | 444 | s = normalizeText(s); 445 | 446 | s = s.replace(" ", "_"); 447 | 448 | return s; 449 | } 450 | } 451 | -------------------------------------------------------------------------------- /WordNetExtractor/WordNetExtractor.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedInputStream; 2 | import java.io.BufferedOutputStream; 3 | import java.io.BufferedReader; 4 | import java.io.DataInputStream; 5 | import java.io.DataOutputStream; 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.FileNotFoundException; 9 | import java.io.FileOutputStream; 10 | import java.io.FileReader; 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.io.PrintWriter; 14 | import java.io.UnsupportedEncodingException; 15 | import java.util.HashMap; 16 | import java.util.HashSet; 17 | import java.util.Iterator; 18 | import java.util.List; 19 | import java.util.Map; 20 | import java.util.Map.Entry; 21 | import java.util.Scanner; 22 | 23 | import net.didion.jwnl.JWNL; 24 | import net.didion.jwnl.JWNLException; 25 | import net.didion.jwnl.data.IndexWord; 26 | import net.didion.jwnl.data.IndexWordSet; 27 | import net.didion.jwnl.data.POS; 28 | import net.didion.jwnl.data.Pointer; 29 | import net.didion.jwnl.data.PointerType; 30 | import net.didion.jwnl.data.Synset; 31 | import net.didion.jwnl.data.Word; 32 | import net.didion.jwnl.dictionary.Dictionary; 33 | import net.didion.jwnl.dictionary.MorphologicalProcessor; 34 | 35 | @SuppressWarnings("unchecked") 36 | public class WordNetExtractor 37 | { 38 | private static HashMap WordIndex = new HashMap(); 39 | private static HashMap SynsetIndex = new HashMap(); 40 | private static Dictionary dictionary; 41 | 42 | public static void main(String[] args) throws IOException, JWNLException 43 | { 44 | // path to JWNL prop xml file 45 | JWNL.initialize(new FileInputStream("[...]")); 46 | dictionary = Dictionary.getInstance(); 47 | 48 | // path to input word embeddings 49 | String file_name = "[...]"; 50 | 51 | // path to output folder 52 | String folder = "[...]"; 53 | 54 | if (file_name.endsWith(".bin")) 55 | Shared.loadGoogleModel(file_name); 56 | else 57 | Shared.loadTxtModel(file_name); 58 | 59 | JWNL.Version ver = JWNL.getVersion(); 60 | System.out.printf("RESOURCE: WN " + ver.toString() + "\n"); 61 | System.out.printf("VECTORS: " + folder + "\n"); 62 | System.out.printf("TARGET: " + folder + "\n"); 63 | 64 | extractWordsAndSynsets(folder + "words.txt", 65 | folder + "synsets.txt", 66 | folder + "lexemes.txt", 67 | folder + "glosses.txt"); 68 | 69 | extractSynsetRelations(folder + "hypernym.txt", PointerType.HYPERNYM); 70 | extractSynsetRelations(folder + "similar.txt", PointerType.SIMILAR_TO); 71 | extractSynsetRelations(folder + "verbGroup.txt", PointerType.VERB_GROUP); 72 | extractSynsetRelations(folder + "antonym.txt", PointerType.ANTONYM); 73 | 74 | System.out.printf("DONE"); 75 | } 76 | 77 | private static void extractWordsAndSynsets(String filenameWords, String 
78 |     {
79 |         // create file
80 |         PrintWriter writerWords, writerSynsets, writerLexemes, writerGlosses;
81 |         try
82 |         {
83 |             writerWords = new PrintWriter(filenameWords, "UTF-8");
84 |             writerSynsets = new PrintWriter(filenameSynsets, "UTF-8");
85 |             writerLexemes = new PrintWriter(filenameLexemes, "UTF-8");
86 |             writerGlosses = new PrintWriter(filenameGlosses, "UTF-8");
87 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
88 |         {
89 |             e.printStackTrace();
90 | 
91 |             return;
92 |         }
93 | 
94 |         int wordCounter = 0;
95 |         int synsetCounter = 0;
96 |         int synsetCounterAll = 0;
97 |         int lexemCounter = 0;
98 |         int lexemCounterAll = 0;
99 | 
100 |         HashSet<String> oov = new HashSet<String>();
101 | 
102 |         for (Object pos : POS.getAllPOS())
103 |         {
104 |             Iterator<Synset> itr = dictionary.getSynsetIterator((POS) pos);
105 |             while (itr.hasNext())
106 |             {
107 |                 Synset synset = itr.next();
108 |                 String synsetId = getId(synset);
109 |                 ++synsetCounterAll;
110 | 
111 |                 SynsetIndex.put(synsetId, synsetCounterAll);
112 | 
113 |                 // export synset
114 |                 writerSynsets.print(synsetId + " ");
115 | 
116 |                 float[] naiveSynsetVector = new float[Shared.size];
117 |                 int wordsInSynset = 0;
118 | 
119 |                 for (Word word : synset.getWords())
120 |                 {
121 |                     ++lexemCounterAll;
122 | 
123 |                     String lemma = word.getLemma();
124 |                     lemma = Shared.normalizeLemma(lemma);
125 | 
126 |                     // if the plain lemma is not in the corpus, retry with a POS-tag suffix
127 |                     if (!Shared.WordMap.containsKey(lemma))
128 |                     {
129 |                         lemma = lemma + "%" + synset.getPOS().getKey();
130 | 
131 |                         // skip words that are not in corpus
132 |                         if (!Shared.WordMap.containsKey(lemma))
133 |                         {
134 |                             oov.add(lemma);
135 |                             continue;
136 |                         }
137 |                     }
138 | 
139 |                     ++wordsInSynset;
140 |                     for (int b = 0; b < Shared.size; b++)
141 |                     {
142 |                         naiveSynsetVector[b] += Shared.WordMap.get(lemma)[b];
143 |                     }
144 | 
145 |                     if (!WordIndex.containsKey(lemma))
146 |                     {
147 |                         writerWords.print(lemma + " " + Shared.getVectorAsString(Shared.WordMap.get(lemma)) + "\n");
148 |                         WordIndex.put(lemma, ++wordCounter);
149 |                     }
150 | 
151 |                     ++lexemCounter;
152 | 
153 |                     String sensekey = synset.getSenseKey(word.getLemma());
154 | 
155 |                     writerSynsets.print(sensekey + ",");
156 |                     writerLexemes.print(WordIndex.get(lemma) + " " + synsetCounterAll + "\n");
157 |                 }
158 |                 writerSynsets.print("\n");
159 | 
160 |                 // get gloss vector and normalize length of it
161 |                 float[] glossVector = getGlossVector(synset);
162 |                 if (wordsInSynset != 0)
163 |                 {
164 |                     float lenNSV = 0, lenGloss = 0;
165 |                     for (int b = 0; b < Shared.size; b++)
166 |                     {
167 |                         naiveSynsetVector[b] /= wordsInSynset;
168 |                         lenNSV += naiveSynsetVector[b] * naiveSynsetVector[b];
169 |                         lenGloss += glossVector[b] * glossVector[b];
170 |                     }
171 |                     lenNSV = (float)Math.sqrt(lenNSV);
172 |                     lenGloss = (float)Math.sqrt(lenGloss);
173 |                     for (int b = 0; b < Shared.size; b++)
174 |                     {
175 |                         glossVector[b] *= (lenNSV / lenGloss); // rescale the gloss vector to the naive synset vector's length
176 |                     }
177 |                 }
178 |                 else
179 |                 {
180 |                     float lenGloss = 0;
181 |                     for (int b = 0; b < Shared.size; b++)
182 |                     {
183 |                         lenGloss += glossVector[b] * glossVector[b];
184 |                     }
185 |                     lenGloss = (float)Math.sqrt(lenGloss);
186 |                     for (int b = 0; b < Shared.size; b++)
187 |                     {
188 |                         glossVector[b] /= lenGloss;
189 |                     }
190 |                 }
191 | 
192 |                 writerGlosses.print(synsetId + " " + Shared.getVectorAsString(glossVector) + "\n");
193 | 
194 |                 if (wordsInSynset != 0)
195 |                     ++synsetCounter;
196 |                 else
197 |                     SynsetIndex.put(synsetId, -1);
198 |             }
199 |         }
200 | 
201 |         writerWords.close();
202 |         writerSynsets.close();
203 |         writerLexemes.close();
204 |         writerGlosses.close();
205 | 
206 |         System.out.printf(" Words: %8d / %8d\n", wordCounter, wordCounter + oov.size());
207 |         System.out.printf(" Synset: %8d / %8d\n", synsetCounter, synsetCounterAll);
208 |         System.out.printf(" Lexemes: %8d / %8d\n", lexemCounter, lexemCounterAll);
209 |     }
210 | 
211 |     private static String getId(Synset synset)
212 |     {
213 |         JWNL.Version ver = JWNL.getVersion();
214 | 
215 |         String id = "wn-" + ver.getNumber() + "-" + String.format("%08d", synset.getOffset()) + "-" + synset.getPOS().getKey(); // e.g. "wn-3.0-00001740-n"
216 | 
217 |         return id;
218 |     }
219 | 
220 |     private static float[] getGlossVector(Synset synset)
221 |     {
222 |         String gloss = Shared.normalizeText(synset.getGloss());
223 | 
224 |         float[] vector = new float[Shared.size];
225 |         for (String word : gloss.split(" "))
226 |         {
227 |             if (Shared.WordMap.containsKey(word))
228 |             {
229 |                 for (int b = 0; b < Shared.size; b++)
230 |                 {
231 |                     vector[b] += Shared.WordMap.get(word)[b];
232 |                 }
233 |             }
234 |         }
235 | 
236 |         return vector;
237 |     }
238 | 
239 |     private static void extractSynsetRelations(String filename, PointerType pointer) throws JWNLException
240 |     {
241 |         HashMap<String, Integer> affectedPOS = new HashMap<String, Integer>();
242 | 
243 |         // create file
244 |         PrintWriter writer;
245 |         try
246 |         {
247 |             writer = new PrintWriter(filename, "UTF-8");
248 |         } catch (FileNotFoundException | UnsupportedEncodingException e)
249 |         {
250 |             e.printStackTrace();
251 | 
252 |             return;
253 |         }
254 | 
255 |         for (Object pos : POS.getAllPOS())
256 |         {
257 |             Iterator<Synset> itr = dictionary.getSynsetIterator((POS) pos);
258 |             while (itr.hasNext())
259 |             {
260 |                 Synset synset = itr.next();
261 |                 String synsetId = getId(synset);
262 | 
263 |                 Pointer[] pointers = synset.getPointers(pointer);
264 |                 for (Pointer p : pointers)
265 |                 {
266 |                     Synset targetSynset = p.getTargetSynset();
267 |                     String targetId = getId(targetSynset);
268 | 
269 |                     String key = targetSynset.getPOS().getLabel();
270 |                     if (affectedPOS.containsKey(key))
271 |                     {
272 |                         affectedPOS.put(key, affectedPOS.get(key) + 1);
273 |                     }
274 |                     else
275 |                     {
276 |                         affectedPOS.put(key, 1);
277 |                     }
278 | 
279 |                     if (SynsetIndex.get(synsetId) < 0 || SynsetIndex.get(targetId) < 0)
280 |                         continue;
281 | 
282 |                     writer.print(SynsetIndex.get(synsetId));
283 |                     writer.print(" ");
284 |                     writer.print(SynsetIndex.get(targetId));
285 |                     writer.print("\n");
286 |                 }
287 |             }
288 |         }
289 | 
290 |         writer.close();
291 | 
292 |         System.out.printf("Extracted %s: done!\n", pointer.getLabel());
293 |         Iterator<Map.Entry<String, Integer>> it = affectedPOS.entrySet().iterator();
294 |         while (it.hasNext())
295 |         {
296 |             Map.Entry<String, Integer> pairs = it.next();
297 |             String key = pairs.getKey();
298 |             int value = pairs.getValue();
299 | 
300 |             System.out.printf(" %s: %d\n", key, value);
301 |         }
302 |     }
303 | }
304 | 
--------------------------------------------------------------------------------
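Each relation file written by extractSynsetRelations above holds one "source target" pair of 1-based synset indices per line; the indices refer to the order in which synsets were written to synsets.txt, and pairs involving a synset without any in-vocabulary word (indexed as -1) are skipped. A minimal, self-contained reader sketch (the class name ReadRelations is illustrative, not part of the project):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReadRelations {
    // parse a relation file such as hypernym.txt into (source, target) index pairs
    public static List<int[]> load(String path) throws IOException {
        List<int[]> pairs = new ArrayList<int[]>();
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.split(" ");
                pairs.add(new int[] { Integer.parseInt(parts[0]),    // source synset index
                                      Integer.parseInt(parts[1]) }); // target synset index
            }
        }
        return pairs;
    }
}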