├── .gitignore ├── LICENSE ├── README ├── Rakefile ├── VERSION.yml ├── ext ├── extconf.rb ├── porter.c └── porter_wrap.c ├── fast-stemmer.gemspec ├── lib ├── fast-stemmer.rb └── fast_stemmer.rb └── test └── fast_stemmer_test.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw? 2 | ext/Makefile 3 | .DS_Store 4 | coverage 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008 Roman Shterenzon 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Fast-stemmer is simply a wrapper around a thread-safe C implementation 2 | of the Porter stemming algorithm. 
3 | 4 | This gem adds a String#stem method, and it conflicts with the stemmer gem. 5 | It's an order of magnitude faster (and uses much less memory) than the latter. 6 | 7 | For the original work please see: 8 | http://tartarus.org/~martin/PorterStemmer/ 9 | 10 | Gemfile: 11 | gem 'fast-stemmer' 12 | 13 | Usage: 14 | 15 | require 'rubygems' 16 | require 'fast_stemmer' 17 | 18 | Stemmer::stem_word('running') # -> 'run' 19 | 'running'.stem # -> 'run' 20 | 21 | 22 | COPYRIGHT 23 | ========= 24 | 25 | Copyright (c) 2008 Roman Shterenzon. See LICENSE for details. 26 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rake' 2 | 3 | begin 4 | require 'jeweler' 5 | Jeweler::Tasks.new do |s| 6 | s.name = "fast-stemmer" 7 | s.summary = %Q{Fast Porter stemmer based on a C version of algorithm} 8 | s.email = "romanbsd@yahoo.com" 9 | s.homepage = "http://github.com/romanbsd/fast-stemmer" 10 | s.description = s.summary 11 | s.authors = ["Roman Shterenzon"] 12 | s.extensions = ['ext/extconf.rb'] 13 | s.files = FileList["[A-Z]*", "{ext,lib,test}/**/*"] 14 | end 15 | Jeweler::RubygemsDotOrgTasks.new 16 | rescue LoadError 17 | puts "Jeweler not available. 
Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com" 18 | end 19 | 20 | require 'rdoc/task' 21 | Rake::RDocTask.new do |rdoc| 22 | rdoc.rdoc_dir = 'rdoc' 23 | rdoc.title = 'fast-stemmer' 24 | rdoc.options << '--line-numbers' << '--inline-source' 25 | rdoc.rdoc_files.include('README*') 26 | rdoc.rdoc_files.include('lib/**/*.rb') 27 | end 28 | 29 | require 'rake/testtask' 30 | Rake::TestTask.new(:test) do |t| 31 | t.libs << 'lib' << 'test' 32 | t.pattern = 'test/**/*_test.rb' 33 | t.verbose = false 34 | end 35 | 36 | task :default => :test 37 | -------------------------------------------------------------------------------- /VERSION.yml: -------------------------------------------------------------------------------- 1 | --- 2 | :major: 1 3 | :minor: 0 4 | :patch: 2 5 | :build: 6 | -------------------------------------------------------------------------------- /ext/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | RbConfig::MAKEFILE_CONFIG['CC'] = ENV['CC'] if ENV['CC'] 3 | create_makefile('stemmer') 4 | -------------------------------------------------------------------------------- /ext/porter.c: -------------------------------------------------------------------------------- 1 | 2 | /* This is the Porter stemming algorithm, coded up as thread-safe ANSI C 3 | by the author. 4 | 5 | It may be regarded as canonical, in that it follows the algorithm 6 | presented in 7 | 8 | Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 9 | no. 3, pp 130-137, 10 | 11 | only differing from it at the points marked --DEPARTURE-- below. 
12 | 13 | See also http://www.tartarus.org/~martin/PorterStemmer 14 | 15 | The algorithm as described in the paper could be exactly replicated 16 | by adjusting the points of DEPARTURE, but this is barely necessary, 17 | because (a) the points of DEPARTURE are definitely improvements, and 18 | (b) no encoding of the Porter stemmer I have seen is anything like 19 | as exact as this version, even with the points of DEPARTURE! 20 | 21 | You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which 22 | 'stem' takes a list of inputs and sends the stemmed equivalent to 23 | stdout. 24 | 25 | The algorithm as encoded here is particularly fast. 26 | 27 | Release 2 (the more old-fashioned, non-thread-safe version may be 28 | regarded as release 1.) 29 | */ 30 | 31 | #include <stdlib.h> /* for malloc, free */ 32 | #include <string.h> /* for memcmp, memmove */ 33 | 34 | /* You will probably want to move the following declarations to a central 35 | header file. 36 | */ 37 | 38 | struct stemmer; 39 | 40 | extern struct stemmer * create_stemmer(void); 41 | extern void free_stemmer(struct stemmer * z); 42 | 43 | extern int stem(struct stemmer * z, char * b, int k); 44 | 45 | 46 | 47 | /* The main part of the stemming algorithm starts here. 48 | */ 49 | 50 | #define TRUE 1 51 | #define FALSE 0 52 | 53 | /* stemmer is a structure for a few local bits of data, 54 | */ 55 | 56 | struct stemmer { 57 | char * b; /* buffer for word to be stemmed */ 58 | int k; /* offset to the end of the string */ 59 | int j; /* a general offset into the string */ 60 | }; 61 | 62 | 63 | /* Member b is a buffer holding a word to be stemmed. The letters are in 64 | b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as 65 | the stemming progresses. Zero termination is not in fact used in the 66 | algorithm. 67 | 68 | Note that only lower case sequences are stemmed. Forcing to lower case 69 | should be done before stem(...) is called. 
70 | 71 | 72 | Typical usage is: 73 | 74 | struct stemmer * z = create_stemmer(); 75 | char b[] = "pencils"; 76 | int res = stem(z, b, 6); 77 | /- stem the 7 characters of b[0] to b[6]. The result, res, 78 | will be 5 (the 's' is removed). -/ 79 | free_stemmer(z); 80 | */ 81 | 82 | 83 | extern struct stemmer * create_stemmer(void) 84 | { 85 | return (struct stemmer *) malloc(sizeof(struct stemmer)); 86 | /* assume malloc succeeds */ 87 | } 88 | 89 | extern void free_stemmer(struct stemmer * z) 90 | { 91 | free(z); 92 | } 93 | 94 | 95 | /* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here 96 | and below we drop 'z->' in comments.) 97 | */ 98 | 99 | static int cons(struct stemmer * z, int i) 100 | { switch (z->b[i]) 101 | { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; 102 | case 'y': return (i == 0) ? TRUE : !cons(z, i - 1); 103 | default: return TRUE; 104 | } 105 | } 106 | 107 | /* m(z) measures the number of consonant sequences between 0 and j. if c is 108 | a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 109 | presence, 110 | 111 | <c><v> gives 0 112 | <c>vc<v> gives 1 113 | <c>vcvc<v> gives 2 114 | <c>vcvcvc<v> gives 3 115 | .... 116 | */ 117 | 118 | static int m(struct stemmer * z) 119 | { int n = 0; 120 | int i = 0; 121 | int j = z->j; 122 | while(TRUE) 123 | { if (i > j) return n; 124 | if (! cons(z, i)) break; i++; 125 | } 126 | i++; 127 | while(TRUE) 128 | { while(TRUE) 129 | { if (i > j) return n; 130 | if (cons(z, i)) break; 131 | i++; 132 | } 133 | i++; 134 | n++; 135 | while(TRUE) 136 | { if (i > j) return n; 137 | if (! cons(z, i)) break; 138 | i++; 139 | } 140 | i++; 141 | } 142 | } 143 | 144 | /* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */ 145 | 146 | static int vowelinstem(struct stemmer * z) 147 | { 148 | int j = z->j; 149 | int i; for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE; 150 | return FALSE; 151 | } 152 | 153 | /* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. 
*/ 154 | 155 | static int doublec(struct stemmer * z, int j) 156 | { 157 | char * b = z->b; 158 | if (j < 1) return FALSE; 159 | if (b[j] != b[j - 1]) return FALSE; 160 | return cons(z, j); 161 | } 162 | 163 | /* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 164 | and also if the second c is not w,x or y. this is used when trying to 165 | restore an e at the end of a short word. e.g. 166 | 167 | cav(e), lov(e), hop(e), crim(e), but 168 | snow, box, tray. 169 | 170 | */ 171 | 172 | static int cvc(struct stemmer * z, int i) 173 | { if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE; 174 | { int ch = z->b[i]; 175 | if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE; 176 | } 177 | return TRUE; 178 | } 179 | 180 | /* ends(z, s) is TRUE <=> 0,...k ends with the string s. */ 181 | /* s is length-prefixed: s[0] holds the suffix length and s[1]..s[length] hold its characters. On a match, j is set to k - length, the offset just before the suffix; on a mismatch j is left untouched. */ 182 | static int ends(struct stemmer * z, char * s) 183 | { int length = s[0]; 184 | char * b = z->b; 185 | int k = z->k; 186 | if (s[length] != b[k]) return FALSE; /* tiny speed-up */ 187 | if (length > k + 1) return FALSE; 188 | if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE; 189 | z->j = k-length; 190 | return TRUE; 191 | } 192 | 193 | /* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting 194 | k. */ 195 | /* s uses the same length-prefixed layout as in ends(): s[0] is the number of characters copied from s + 1. */ 196 | static void setto(struct stemmer * z, char * s) 197 | { int length = s[0]; 198 | int j = z->j; 199 | memmove(z->b + j + 1, s + 1, length); 200 | z->k = j+length; 201 | } 202 | 203 | /* r(z, s) replaces everything after j with s (via setto) only when m(z) > 0; it is used further down. */ 204 | 205 | static void r(struct stemmer * z, char * s) { if (m(z) > 0) setto(z, s); } 206 | 207 | /* step1ab(z) gets rid of plurals and -ed or -ing. e.g. 
208 | 209 | caresses -> caress 210 | ponies -> poni 211 | ties -> ti 212 | caress -> caress 213 | cats -> cat 214 | 215 | feed -> feed 216 | agreed -> agree 217 | disabled -> disable 218 | 219 | matting -> mat 220 | mating -> mate 221 | meeting -> meet 222 | milling -> mill 223 | messing -> mess 224 | 225 | meetings -> meet 226 | 227 | */ 228 | /* Suffix literals are length-prefixed ("\04" "sses" is the length 4 followed by "sses"). The 's' branch maps sses -> ss and ies -> i, otherwise it drops a final s unless the letter before it is also s. Then eed -> ee when m(z) > 0; otherwise -ed / -ing is removed when the stem contains a vowel, followed by at -> ate, bl -> ble, iz -> ize, or a doubled final consonant reduced to one (kept doubled for l, s, z), or an e appended when m(z) == 1 and the stem ends consonant-vowel-consonant. */ 229 | static void step1ab(struct stemmer * z) 230 | { 231 | char * b = z->b; 232 | if (b[z->k] == 's') 233 | { if (ends(z, "\04" "sses")) z->k -= 2; else 234 | if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else 235 | if (b[z->k - 1] != 's') z->k--; 236 | } 237 | if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else 238 | if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z)) 239 | { z->k = z->j; 240 | if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else 241 | if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else 242 | if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else 243 | if (doublec(z, z->k)) 244 | { z->k--; 245 | { int ch = b[z->k]; 246 | if (ch == 'l' || ch == 's' || ch == 'z') z->k++; 247 | } 248 | } 249 | else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e"); 250 | } 251 | } 252 | 253 | /* step1c(z) turns terminal y to i when there is another vowel in the stem. */ 254 | 255 | static void step1c(struct stemmer * z) 256 | { 257 | if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i'; 258 | } 259 | 260 | 261 | /* step2(z) maps double suffices to single ones. so -ization ( = -ize plus 262 | -ation) maps to -ize etc. note that the string before the suffix must give 263 | m(z) > 0. 
*/ 264 | /* Dispatches on the penultimate letter b[k-1] so that only the suffixes which can match are tested; r() applies the replacement only when m(z) > 0. NOTE(review): reading b[k-1] assumes k >= 1 on entry - confirm the caller guarantees it. */ 265 | static void step2(struct stemmer * z) { switch (z->b[z->k-1]) 266 | { 267 | case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; } 268 | if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; } 269 | break; 270 | case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; } 271 | if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; } 272 | break; 273 | case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; } 274 | break; 275 | case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/ 276 | 277 | /* To match the published algorithm, replace this line with 278 | case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */ 279 | 280 | if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; } 281 | if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; } 282 | if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; } 283 | if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; } 284 | break; 285 | case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; } 286 | if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; } 287 | if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; } 288 | break; 289 | case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; } 290 | if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; } 291 | if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; } 292 | if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; } 293 | break; 294 | case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; } 295 | if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; } 296 | if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; } 297 | break; 298 | case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/ 299 | 300 | /* To match the published algorithm, delete this line */ 301 | 302 | } } 303 | 304 | /* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. 
*/ 305 | /* Dispatches on the last letter b[k]; an empty replacement ("\00" "") simply removes the matched suffix. */ 306 | static void step3(struct stemmer * z) { switch (z->b[z->k]) 307 | { 308 | case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; } 309 | if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; } 310 | if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; } 311 | break; 312 | case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; } 313 | break; 314 | case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; } 315 | if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; } 316 | break; 317 | case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; } 318 | break; 319 | } } 320 | 321 | /* step4(z) takes off -ant, -ence etc., in context vcvc. */ 322 | /* Each case breaks out of the switch when a suffix matched (ends() has set j) and returns when none did, so the final line strips the suffix only if the remaining stem has m(z) > 1. For -ion the stem must end in s or t; the j >= 0 test keeps that check from reading b[-1] when the whole word is "ion". */ 323 | static void step4(struct stemmer * z) 324 | { switch (z->b[z->k-1]) 325 | { case 'a': if (ends(z, "\02" "al")) break; return; 326 | case 'c': if (ends(z, "\04" "ance")) break; 327 | if (ends(z, "\04" "ence")) break; return; 328 | case 'e': if (ends(z, "\02" "er")) break; return; 329 | case 'i': if (ends(z, "\02" "ic")) break; return; 330 | case 'l': if (ends(z, "\04" "able")) break; 331 | if (ends(z, "\04" "ible")) break; return; 332 | case 'n': if (ends(z, "\03" "ant")) break; 333 | if (ends(z, "\05" "ement")) break; 334 | if (ends(z, "\04" "ment")) break; 335 | if (ends(z, "\03" "ent")) break; return; 336 | case 'o': if (ends(z, "\03" "ion") && z->j >= 0 && (z->b[z->j] == 's' || z->b[z->j] == 't')) break; 337 | if (ends(z, "\02" "ou")) break; return; 338 | /* takes care of -ous */ 339 | case 's': if (ends(z, "\03" "ism")) break; return; 340 | case 't': if (ends(z, "\03" "ate")) break; 341 | if (ends(z, "\03" "iti")) break; return; 342 | case 'u': if (ends(z, "\03" "ous")) break; return; 343 | case 'v': if (ends(z, "\03" "ive")) break; return; 344 | case 'z': if (ends(z, "\03" "ize")) break; return; 345 | default: return; 346 | } 347 | if (m(z) > 1) z->k = z->j; 348 | } 349 | 350 | /* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if 351 | m(z) > 1. 
*/ 352 | 353 | static void step5(struct stemmer * z) 354 | { 355 | char * b = z->b; 356 | z->j = z->k; 357 | if (b[z->k] == 'e') 358 | { int a = m(z); 359 | if (a > 1 || a == 1 && !cvc(z, z->k - 1)) z->k--; 360 | } 361 | if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--; 362 | } 363 | 364 | /* In stem(z, b, k), b is a char pointer, and the string to be stemmed is 365 | from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not 366 | important. The stemmer adjusts the characters b[0] ... b[k] and returns 367 | the new end-point of the string, k'. Stemming never increases word 368 | length, so 0 <= k' <= k. 369 | */ 370 | 371 | extern int stem(struct stemmer * z, char * b, int k) 372 | { 373 | if (k <= 1) return k; /*-DEPARTURE-*/ 374 | z->b = b; z->k = k; /* copy the parameters into z */ 375 | 376 | /* With this line, strings of length 1 or 2 don't go through the 377 | stemming process, although no mention is made of this in the 378 | published algorithm. Remove the line to match the published 379 | algorithm. 
*/ 380 | 381 | step1ab(z); if (z->k > 0) { step1c(z); step2(z); step3(z); step4(z); step5(z); } /* step1ab can leave a one-letter word (k == 0, e.g. "ies" -> "i"); step2 and step4 index b[k-1], so the remaining steps run only when k > 0 */ 382 | return z->k; 383 | } 384 | 385 | /*--------------------stemmer definition ends here------------------------*/ 386 | #if 0 387 | #include <stdio.h> 388 | #include <stdlib.h> /* for malloc, free */ 389 | #include <ctype.h> /* for isupper, islower, tolower */ 390 | 391 | static char * s; /* buffer for words to be stemmed */ 392 | 393 | #define INC 50 /* size units in which s is increased */ 394 | static int i_max = INC; /* maximum offset in s */ 395 | 396 | #define LETTER(ch) (isupper(ch) || islower(ch)) 397 | 398 | void stemfile(struct stemmer * z, FILE * f) 399 | { while(TRUE) 400 | { int ch = getc(f); 401 | if (ch == EOF) return; 402 | if (LETTER(ch)) 403 | { int i = 0; 404 | while(TRUE) 405 | { if (i == i_max) 406 | { i_max += INC; 407 | s = realloc(s, i_max + 1); 408 | } 409 | ch = tolower(ch); /* forces lower case */ 410 | 411 | s[i] = ch; i++; 412 | ch = getc(f); 413 | if (!LETTER(ch)) { ungetc(ch,f); break; } 414 | } 415 | s[stem(z, s, i - 1) + 1] = 0; 416 | /* the previous line calls the stemmer and uses its result to 417 | zero-terminate the string in s */ 418 | printf("%s",s); 419 | } 420 | else putchar(ch); 421 | } 422 | } 423 | 424 | int main(int argc, char * argv[]) 425 | { int i; 426 | 427 | struct stemmer * z = create_stemmer(); 428 | 429 | s = (char *) malloc(i_max + 1); 430 | for (i = 1; i < argc; i++) 431 | { FILE * f = fopen(argv[i],"r"); 432 | if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); } 433 | stemfile(z, f); 434 | } 435 | free(s); 436 | 437 | free_stemmer(z); 438 | 439 | return 0; 440 | } 441 | #endif 442 | -------------------------------------------------------------------------------- /ext/porter_wrap.c: -------------------------------------------------------------------------------- 1 | #include <ruby.h> 2 | 3 | #ifndef RSTRING_PTR 4 | #define RSTRING_PTR(str) (RSTRING(str)->ptr) 5 | #endif 6 | 7 | extern struct stemmer * create_stemmer(void); 8 | extern void 
free_stemmer(struct stemmer * z); 9 | extern int stem(struct stemmer * z, char * b, int k); 10 | 11 | /* copied from porter.c */ 12 | struct stemmer { 13 | char * b; /* buffer for word to be stemmed */ 14 | int k; /* offset to the end of the string */ 15 | int j; /* a general offset into the string */ 16 | }; 17 | /* Stemmer.stem_word(arg): copies arg (converted with StringValue) into a scratch buffer, stems the copy in place with stem() and returns the result as a new String. Only the bytes before the first NUL are stemmed, because the length is taken with strlen. */ 18 | static VALUE stem_word(VALUE self, VALUE arg) 19 | { 20 | int length; 21 | char *word; 22 | struct stemmer z; 23 | VALUE str, rv; 24 | 25 | str = StringValue(arg); 26 | word = ALLOC_N(char, RSTRING_LEN(str) + 1); /* Ruby allocator: raises NoMemoryError on failure instead of returning NULL */ 27 | strncpy(word, RSTRING_PTR(str), RSTRING_LEN(str)); 28 | word[RSTRING_LEN(str)] = '\0'; 29 | /* signed arithmetic: an empty word gives k == -1, which stem() returns unchanged, so word[0] is terminated below */ 30 | length = stem(&z, word, (int)strlen(word) - 1); 31 | word[length+1] = 0; 32 | rv = rb_str_new2(word); 33 | xfree(word); /* memory from ALLOC_N must be released with xfree */ 34 | return rv; 35 | } 36 | 37 | VALUE mStemmer; 38 | 39 | void Init_stemmer(void) { 40 | mStemmer = rb_define_module("Stemmer"); 41 | rb_define_module_function(mStemmer, "stem_word", stem_word, 1); 42 | } 43 | -------------------------------------------------------------------------------- /fast-stemmer.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = "fast-stemmer" 8 | s.version = "1.0.2" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 11 | s.authors = ["Roman Shterenzon"] 12 | s.date = "2013-02-06" 13 | s.description = "Fast Porter stemmer based on a C version of algorithm" 14 | s.license = "BSD" 15 | s.email = "romanbsd@yahoo.com" 16 | s.extensions = ["ext/extconf.rb"] 17 | s.extra_rdoc_files = [ 18 | "LICENSE", 19 | "README" 20 | ] 21 | s.files = [ 22 | "LICENSE", 23 | "README", 24 | "Rakefile", 25 | "VERSION.yml", 26 | "ext/Makefile", 27 | "ext/extconf.rb", 28 | "ext/porter.c", 29 | "ext/porter_wrap.c", 30 | "lib/fast-stemmer.rb", 31 | "lib/fast_stemmer.rb", 32 | "test/fast_stemmer_test.rb" 33 | ] 34 | s.homepage = "http://github.com/romanbsd/fast-stemmer" 35 | s.require_paths = ["lib"] 36 | s.rubygems_version = "1.8.23" 37 | s.summary = "Fast Porter stemmer based on a C version of algorithm" 38 | 39 | if s.respond_to? :specification_version then 40 | s.specification_version = 3 41 | 42 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 43 | else 44 | end 45 | else 46 | end 47 | end 48 | 49 | -------------------------------------------------------------------------------- /lib/fast-stemmer.rb: -------------------------------------------------------------------------------- 1 | require 'fast_stemmer' 2 | -------------------------------------------------------------------------------- /lib/fast_stemmer.rb: -------------------------------------------------------------------------------- 1 | require 'stemmer' 2 | 3 | class String 4 | def stem 5 | Stemmer.stem_word(self) 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /test/fast_stemmer_test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'test/unit' 3 | require File.join(File.dirname(__FILE__), '..', 'lib', 'fast_stemmer') 4 | 5 | class TestStemmer < Test::Unit::TestCase 6 | def setup 7 | @stems = { 'riding' => 'ride', 8 | 'forestalled' => 'forestal', 9 | 'combined' => 
'combin', 10 | 'ran' => 'ran', 11 | 'seen' => 'seen', 12 | 'excused' => 'excus' 13 | } 14 | end 15 | 16 | def test_stems 17 | @stems.each {|stem| assert_equal(stem[1], stem[0].stem)} 18 | end 19 | end 20 | --------------------------------------------------------------------------------