├── .gitignore
├── LICENSE
├── README
├── Rakefile
├── VERSION.yml
├── ext
    ├── extconf.rb
    ├── porter.c
    └── porter_wrap.c
├── fast-stemmer.gemspec
├── lib
    ├── fast-stemmer.rb
    └── fast_stemmer.rb
└── test
    └── fast_stemmer_test.rb


/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw?
2 | ext/Makefile
3 | .DS_Store
4 | coverage
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2008 Roman Shterenzon
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | Fast-stemmer is simply a wrapper around the thread-safe C implementation
 2 | of the Porter stemming algorithm.
 3 | 
 4 | This gem adds a String#stem method, and it conflicts with the stemmer gem.
 5 | It's an order of magnitude faster (and uses much less memory) than the latter.
 6 | 
 7 | For the original work please see:
 8 | http://tartarus.org/~martin/PorterStemmer/
 9 | 
10 | Gemfile:
11 |   gem 'fast-stemmer'
12 | 
13 | Usage:
14 | 
15 |   require 'rubygems'
16 |   require 'fast_stemmer'
17 | 
18 |   Stemmer::stem_word('running') # -> 'run'
19 |   'running'.stem                # -> 'run'
20 | 
21 | 
22 | COPYRIGHT
23 | =========
24 | 
25 | Copyright (c) 2008 Roman Shterenzon. See LICENSE for details.
26 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'rake'
 2 | 
 3 | begin
 4 |   require 'jeweler'
 5 |   Jeweler::Tasks.new do |s|
 6 |     s.name = "fast-stemmer"
 7 |     s.summary = %Q{Fast Porter stemmer based on a C version of algorithm}
 8 |     s.email = "romanbsd@yahoo.com"
 9 |     s.homepage = "http://github.com/romanbsd/fast-stemmer"
10 |     s.description = s.summary
11 |     s.authors = ["Roman Shterenzon"]
12 |     s.extensions = ['ext/extconf.rb']
13 |     s.files = FileList["[A-Z]*", "{ext,lib,test}/**/*"]
14 |   end
15 |   Jeweler::RubygemsDotOrgTasks.new
16 | rescue LoadError
17 |   puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
18 | end
19 | 
20 | require 'rdoc/task'
21 | Rake::RDocTask.new do |rdoc| # API documentation task
22 |   rdoc.rdoc_dir = 'rdoc' # output directory
23 |   rdoc.title = 'fast-stemmer'
24 |   rdoc.options << '--line-numbers' << '--inline-source'
25 |   rdoc.rdoc_files.include('README*') # documented sources: the README plus every .rb under lib/
26 |   rdoc.rdoc_files.include('lib/**/*.rb')
27 | end
28 | 
29 | require 'rake/testtask'
30 | Rake::TestTask.new(:test) do |t| # defines the :test task
31 |   t.libs << 'lib' << 'test' # NOTE(review): ext/ is not added, so the compiled 'stemmer' extension must already be loadable - confirm
32 |   t.pattern = 'test/**/*_test.rb' # every *_test.rb below test/
33 |   t.verbose = false
34 | end
35 | 
36 | task :default => :test # a bare `rake` runs the tests
37 | 


--------------------------------------------------------------------------------
/VERSION.yml:
--------------------------------------------------------------------------------
1 | ---
2 | :major: 1
3 | :minor: 0
4 | :patch: 2
5 | :build: 
6 | 


--------------------------------------------------------------------------------
/ext/extconf.rb:
--------------------------------------------------------------------------------
1 | require 'mkmf'
2 | RbConfig::MAKEFILE_CONFIG['CC'] = ENV['CC'] if ENV['CC'] # use the compiler named in $CC, when one is set
3 | create_makefile('stemmer') # extension name; lib/fast_stemmer.rb loads it with require 'stemmer'
4 | 


--------------------------------------------------------------------------------
/ext/porter.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
  3 |    by the author.
  4 | 
  5 |    It may be regarded as canonical, in that it follows the algorithm
  6 |    presented in
  7 | 
  8 |    Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  9 |    no. 3, pp 130-137,
 10 | 
 11 |    only differing from it at the points marked --DEPARTURE-- below.
 12 | 
 13 |    See also http://www.tartarus.org/~martin/PorterStemmer
 14 | 
 15 |    The algorithm as described in the paper could be exactly replicated
 16 |    by adjusting the points of DEPARTURE, but this is barely necessary,
 17 |    because (a) the points of DEPARTURE are definitely improvements, and
 18 |    (b) no encoding of the Porter stemmer I have seen is anything like
 19 |    as exact as this version, even with the points of DEPARTURE!
 20 | 
 21 |    You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
 22 |    'stem' takes a list of inputs and sends the stemmed equivalent to
 23 |    stdout.
 24 | 
 25 |    The algorithm as encoded here is particularly fast.
 26 | 
 27 |    Release 2 (the more old-fashioned, non-thread-safe version may be
 28 |    regarded as release 1.)
 29 | */
 30 | 
 31 | #include <stdlib.h>  /* for malloc, free */
 32 | #include <string.h>  /* for memcmp, memmove */
 33 | 
 34 | /* You will probably want to move the following declarations to a central
 35 |    header file.
 36 | */
 37 | 
 38 | struct stemmer;
 39 | 
 40 | extern struct stemmer * create_stemmer(void);
 41 | extern void free_stemmer(struct stemmer * z);
 42 | 
 43 | extern int stem(struct stemmer * z, char * b, int k);
 44 | 
 45 | 
 46 | 
 47 | /* The main part of the stemming algorithm starts here.
 48 | */
 49 | 
 50 | #define TRUE 1
 51 | #define FALSE 0
 52 | 
 53 | /* stemmer is a structure for a few local bits of data,
 54 | */
 55 | 
 56 | struct stemmer {
 57 |    char * b;       /* buffer for word to be stemmed */
 58 |    int k;          /* offset to the end of the string */
 59 |    int j;          /* a general offset into the string */
 60 | };
 61 | 
 62 | 
 63 | /* Member b is a buffer holding a word to be stemmed. The letters are in
 64 |    b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as
 65 |    the stemming progresses. Zero termination is not in fact used in the
 66 |    algorithm.
 67 | 
 68 |    Note that only lower case sequences are stemmed. Forcing to lower case
 69 |    should be done before stem(...) is called.
 70 | 
 71 | 
 72 |    Typical usage is:
 73 | 
 74 |        struct stemmer * z = create_stemmer();
 75 |        char b[] = "pencils";
 76 |        int res = stem(z, b, 6);
 77 |            /- stem the 7 characters of b[0] to b[6]. The result, res,
 78 |               will be 5 (the 's' is removed). -/
 79 |        free_stemmer(z);
 80 | */
 81 | 
 82 | 
 83 | extern struct stemmer * create_stemmer(void)
 84 | {
 85 |     return (struct stemmer *) malloc(sizeof(struct stemmer));
 86 |     /* assumes malloc succeeds (no NULL check); b and k stay unset until stem() */
 87 | }
 88 | 
 89 | extern void free_stemmer(struct stemmer * z)
 90 | {
 91 |     free(z); /* releases only the struct; the word buffer z->b is not freed here */
 92 | }
 93 | 
 94 | 
 95 | /* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here
 96 |    and below we drop 'z->' in comments.)
 97 | */
 98 | 
 99 | static int cons(struct stemmer * z, int i)
100 | {  char c = z->b[i];
101 |    if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') return FALSE;
102 |    /* 'y' is a consonant at the start of the word or right after a vowel */
103 |    if (c == 'y' && i != 0) return !cons(z, i - 1);
104 |    return TRUE;
105 | }
106 | 
107 | /* m(z) measures the number of consonant sequences between 0 and j. if c is
108 |    a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
109 |    presence,
110 | 
111 |       <c><v>       gives 0
112 |       <c>vc<v>     gives 1
113 |       <c>vcvc<v>   gives 2
114 |       <c>vcvcvc<v> gives 3
115 |       ....
116 | */
117 | 
118 | /* Counts the vc pairs in b[0..j], i.e. the measure of the stem; cons()
119 |    decides which letters are consonants. */
120 | static int m(struct stemmer * z)
121 | {  int count = 0;                  /* vc pairs seen so far */
122 |    int pos = 0;
123 |    int last = z->j;
124 |    /* Phase 1: skip the optional leading consonant run <c>. */
125 |    while (pos <= last && cons(z, pos))
126 |       pos++;
127 |    if (pos > last) return count;
128 |    pos++;                          /* step past the first vowel */
129 |    /* Phase 2: every pass through this loop consumes one vowel run and
130 |       the consonant run after it, i.e. one "vc" pair. */
131 |    for (;;)
132 |    {  while (pos <= last && !cons(z, pos))   /* rest of the vowel run */
133 |          pos++;
134 |       if (pos > last) return count;
135 |       pos++;                       /* first consonant after the vowels ... */
136 |       count++;                     /* ... completes another vc pair */
137 |       while (pos <= last && cons(z, pos))    /* rest of the consonant run */
138 |          pos++;
139 |       if (pos > last) return count;
140 |       pos++;                       /* step past the vowel that ended the run */
141 |    }
142 | }
143 | 
144 | /* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */
145 | 
146 | static int vowelinstem(struct stemmer * z)
147 | {  int pos = 0;
148 |    /* walk b[0..j] and stop at the first letter that is not a consonant */
149 |    while (pos <= z->j && cons(z, pos)) pos++;
150 |    return (pos <= z->j) ? TRUE : FALSE;
151 | }
152 | 
153 | /* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */
154 | 
155 | static int doublec(struct stemmer * z, int j)
156 | {
157 |    /* needs two positions holding the same letter ... */
158 |    if (j >= 1 && z->b[j] == z->b[j - 1])
159 |       return cons(z, j);   /* ... and that letter must be a consonant */
160 |    return FALSE;
161 | }
162 | 
163 | /* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
164 |    and also if the second c is not w,x or y. this is used when trying to
165 |    restore an e at the end of a short word. e.g.
166 | 
167 |       cav(e), lov(e), hop(e), crim(e), but
168 |       snow, box, tray.
169 | 
170 | */
171 | 
172 | static int cvc(struct stemmer * z, int i)
173 | {  char last;
174 |    if (i < 2) return FALSE;                       /* fewer than three letters */
175 |    if (!(cons(z, i) && !cons(z, i - 1) && cons(z, i - 2))) return FALSE;
176 |    last = z->b[i];
177 |    return (last != 'w' && last != 'x' && last != 'y') ? TRUE : FALSE;
178 | }
179 | 
180 | /* ends(z, s) is TRUE <=> 0,...k ends with the string s. */
181 | 
182 | static int ends(struct stemmer * z, char * s)
183 | {  /* s is length-prefixed: s[0] holds the suffix length, s[1..] its letters */
184 |    int n = s[0];
185 |    int start = z->k - n + 1;              /* where the suffix would begin in b */
186 |    if (s[n] != z->b[z->k]) return FALSE;  /* cheap reject on the last letter */
187 |    if (start < 0) return FALSE;           /* suffix is longer than the word */
188 |    if (memcmp(z->b + start, s + 1, n) != 0) return FALSE;
189 |    z->j = start - 1;                      /* j = last letter before the suffix */
190 |    return TRUE;
191 | }
192 | 
193 | /* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting
194 |    k. */
195 | 
196 | static void setto(struct stemmer * z, char * s)
197 | {  int n = s[0];                    /* s is length-prefixed: s[0] is its length */
198 |    char * dest = z->b + z->j + 1;   /* first position after b[j] */
199 |    memmove(dest, s + 1, n);
200 |    z->k = z->j + n;
201 | }
202 | 
203 | /* r(z, s) calls setto(z, s), but only when m(z) > 0; step2 and step3 use it. */
204 | 
205 | static void r(struct stemmer * z, char * s) { if (m(z) > 0) setto(z, s); }
206 | 
207 | /* step1ab(z) gets rid of plurals and -ed or -ing. e.g.
208 | 
209 |        caresses  ->  caress
210 |        ponies    ->  poni
211 |        ties      ->  ti
212 |        caress    ->  caress
213 |        cats      ->  cat
214 | 
215 |        feed      ->  feed
216 |        agreed    ->  agree
217 |        disabled  ->  disable
218 | 
219 |        matting   ->  mat
220 |        mating    ->  mate
221 |        meeting   ->  meet
222 |        milling   ->  mill
223 |        messing   ->  mess
224 | 
225 |        meetings  ->  meet
226 | 
227 | */
228 | 
229 | static void step1ab(struct stemmer * z)
230 | {
231 |    char * b = z->b;
232 |    if (b[z->k] == 's') /* plural endings */
233 |    {  if (ends(z, "\04" "sses")) z->k -= 2; else /* -sses -> -ss */
234 |       if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else /* -ies -> -i */
235 |       if (b[z->k - 1] != 's') z->k--; /* drop a single final s, keep -ss */
236 |    }
237 |    if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else /* -eed -> -ee when m > 0 */
238 |    if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
239 |    {  z->k = z->j; /* cut off the -ed / -ing just matched */
240 |       if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
241 |       if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
242 |       if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
243 |       if (doublec(z, z->k))
244 |       {  z->k--; /* undouble the final consonant ... */
245 |          {  int ch = b[z->k];
246 |             if (ch == 'l' || ch == 's' || ch == 'z') z->k++; /* ... unless it is l, s or z */
247 |          }
248 |       }
249 |       else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e"); /* short cvc stem: append e */
250 |    }
251 | }
252 | 
253 | /* step1c(z) turns terminal y to i when there is another vowel in the stem. */
254 | 
255 | static void step1c(struct stemmer * z)
256 | {  if (!ends(z, "\01" "y")) return;          /* a match also sets j = k - 1 */
257 |    if (vowelinstem(z)) z->b[z->k] = 'i';     /* a vowel occurs before the y */
258 | }
259 | 
260 | 
261 | /* step2(z) maps double suffixes to single ones. so -ization ( = -ize plus
262 |    -ation) maps to -ize etc. note that the string before the suffix must give
263 |    m(z) > 0. */
264 | 
265 | static void step2(struct stemmer * z) { switch (z->b[z->k-1]) /* dispatch on the penultimate letter */
266 | {
267 |    case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
268 |              if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
269 |              break;
270 |    case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
271 |              if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
272 |              break;
273 |    case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
274 |              break;
275 |    case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/
276 | 
277 |  /* To match the published algorithm, replace this line with
278 |     case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */
279 | 
280 |              if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
281 |              if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
282 |              if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
283 |              if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
284 |              break;
285 |    case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
286 |              if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
287 |              if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
288 |              break;
289 |    case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
290 |              if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
291 |              if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
292 |              if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
293 |              break;
294 |    case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
295 |              if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
296 |              if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
297 |              break;
298 |    case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/
299 | 
300 |  /* To match the published algorithm, delete the case 'g' line above */
301 | 
302 | } }
303 | 
304 | /* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */
305 | 
306 | static void step3(struct stemmer * z) { switch (z->b[z->k]) /* dispatch on the last letter */
307 | {
308 |    case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; }
309 |              if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; } /* empty replacement: suffix removed */
310 |              if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; }
311 |              break;
312 |    case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; }
313 |              break;
314 |    case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; }
315 |              if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; }
316 |              break;
317 |    case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; }
318 |              break;
319 | } }
320 | 
321 | /* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */
322 | 
323 | static void step4(struct stemmer * z)
324 | {  switch (z->b[z->k-1]) /* dispatch on the penultimate letter; no suffix match => return unchanged */
325 |    {  case 'a': if (ends(z, "\02" "al")) break; return;
326 |       case 'c': if (ends(z, "\04" "ance")) break;
327 |                 if (ends(z, "\04" "ence")) break; return;
328 |       case 'e': if (ends(z, "\02" "er")) break; return;
329 |       case 'i': if (ends(z, "\02" "ic")) break; return;
330 |       case 'l': if (ends(z, "\04" "able")) break;
331 |                 if (ends(z, "\04" "ible")) break; return;
332 |       case 'n': if (ends(z, "\03" "ant")) break;
333 |                 if (ends(z, "\05" "ement")) break;
334 |                 if (ends(z, "\04" "ment")) break;
335 |                 if (ends(z, "\03" "ent")) break; return;
336 |       case 'o': if (ends(z, "\03" "ion") && z->j >= 0 && (z->b[z->j] == 's' || z->b[z->j] == 't')) break; /* j is -1 when the word is just "ion" */
337 |                 if (ends(z, "\02" "ou")) break; return;
338 |                 /* takes care of -ous */
339 |       case 's': if (ends(z, "\03" "ism")) break; return;
340 |       case 't': if (ends(z, "\03" "ate")) break;
341 |                 if (ends(z, "\03" "iti")) break; return;
342 |       case 'u': if (ends(z, "\03" "ous")) break; return;
343 |       case 'v': if (ends(z, "\03" "ive")) break; return;
344 |       case 'z': if (ends(z, "\03" "ize")) break; return;
345 |       default: return;
346 |    }
347 |    if (m(z) > 1) z->k = z->j; /* reached only after a suffix matched: strip it when m > 1 */
348 | }
349 | 
350 | /* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
351 |    m(z) > 1. */
352 | 
353 | static void step5(struct stemmer * z)
354 | {
355 |    int measure;
356 |    z->j = z->k;                         /* m(z) now measures the whole word */
357 |    measure = m(z);
358 |    if (z->b[z->k] == 'e' && (measure > 1 || (measure == 1 && !cvc(z, z->k - 1))))
359 |       z->k--;                           /* drop the final -e */
360 |    /* -ll -> -l; j and b are untouched above, so the measure is still valid */
361 |    if (z->b[z->k] == 'l' && doublec(z, z->k) && measure > 1) z->k--;
362 | }
363 | 
364 | /* In stem(z, b, k), b is a char pointer, and the string to be stemmed is
365 |    from b[0] to b[k] inclusive.  Possibly b[k+1] == '\0', but it is not
366 |    important. The stemmer adjusts the characters b[0] ... b[k] and returns
367 |    the new end-point of the string, k'. Stemming never increases word
368 |    length, so 0 <= k' <= k.
369 | */
370 | 
371 | extern int stem(struct stemmer * z, char * b, int k)
372 | {
373 |    /* Strings of length 1 or 2 don't go through the stemming process,
374 |       although no mention is made of this in the published algorithm.
375 |       Remove the next line to match the published algorithm. */
376 |    if (k <= 1) return k; /*-DEPARTURE-*/
377 |    z->b = b; z->k = k; /* copy the parameters into z */
378 |    step1ab(z);
379 |    /* step1ab can shrink the word to one letter ("ies" -> "i", "aed" -> "a").
380 |       step2 and step4 read b[k-1], so the later steps run only when k > 0. */
381 |    if (z->k > 0) { step1c(z); step2(z); step3(z); step4(z); step5(z); }
382 |    return z->k;
383 | }
384 | 
385 | /*--------------------stemmer definition ends here------------------------*/
386 | #if 0
387 | #include <stdio.h>
388 | #include <stdlib.h>      /* for malloc, free */
389 | #include <ctype.h>       /* for isupper, islower, tolower */
390 | 
391 | static char * s;         /* buffer for words to be stemmed */
392 | 
393 | #define INC 50           /* size units in which s is increased */
394 | static int i_max = INC;  /* maximum offset in s */
395 | 
396 | #define LETTER(ch) (isupper(ch) || islower(ch))
397 | 
398 | void stemfile(struct stemmer * z, FILE * f)
399 | {  while(TRUE) /* copy f to stdout, replacing each run of letters by its stem */
400 |    {  int ch = getc(f);
401 |       if (ch == EOF) return;
402 |       if (LETTER(ch))
403 |       {  int i = 0;
404 |          while(TRUE)
405 |          {  if (i == i_max)
406 |             {  i_max += INC;
407 |                s = realloc(s, i_max + 1); /* grow the shared buffer; the result is not checked for NULL */
408 |             }
409 |             ch = tolower(ch); /* forces lower case */
410 | 
411 |             s[i] = ch; i++;
412 |             ch = getc(f);
413 |             if (!LETTER(ch)) { ungetc(ch,f); break; }
414 |          }
415 |          s[stem(z, s, i - 1) + 1] = 0;
416 |          /* the previous line calls the stemmer and uses its result to
417 |             zero-terminate the string in s */
418 |          printf("%s",s);
419 |       }
420 |       else putchar(ch); /* non-letters are echoed unchanged */
421 |    }
422 | }
423 | 
424 | /* Demo driver: stems every file named on the command line to stdout.
425 |    Exits with status 1 as soon as a file cannot be opened. */
426 | int main(int argc, char * argv[])
427 | {  int i;
428 |    struct stemmer * z = create_stemmer();
429 |    s = (char *) malloc(i_max + 1);
430 |    for (i = 1; i < argc; i++)
431 |    {  FILE * f = fopen(argv[i],"r");
432 |       if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
433 |       stemfile(z, f);
434 |       fclose(f);   /* previously every opened file was left open */
435 |    }
436 |    /* stemfile may have reallocated s; free whatever it points to now */
437 |    free(s);
438 |    free_stemmer(z);
439 |    return 0;
440 | }
441 | #endif
442 | 


--------------------------------------------------------------------------------
/ext/porter_wrap.c:
--------------------------------------------------------------------------------
 1 | #include <ruby.h>
 2 | 
 3 | #ifndef RSTRING_PTR
 4 | #define RSTRING_PTR(str) (RSTRING(str)->ptr)
 5 | #endif
 6 | 
 7 | extern struct stemmer * create_stemmer(void);
 8 | extern void free_stemmer(struct stemmer * z);
 9 | extern int stem(struct stemmer * z, char * b, int k);
10 | 
11 | /* copied from porter.c */
12 | struct stemmer {
13 | 	char * b;       /* buffer for word to be stemmed */
14 | 	int k;          /* offset to the end of the string */
15 | 	int j;          /* a general offset into the string */
16 | };
17 | 
18 | /* Stemmer.stem_word(word): returns a new String holding the Porter stem of
19 |    word; the argument itself is left untouched. */
20 | static VALUE stem_word(VALUE self, VALUE arg)
21 | {
22 | 	size_t len;
23 | 	int end = -1;           /* offset of the stem's last char; -1 => empty result */
24 | 	char *word;
25 | 	struct stemmer z;
26 | 	VALUE str = StringValue(arg), rv;
27 | 	word = xmalloc(RSTRING_LEN(str) + 1);   /* raises NoMemoryError instead of returning NULL */
28 | 	memcpy(word, RSTRING_PTR(str), RSTRING_LEN(str));
29 | 	word[RSTRING_LEN(str)] = '\0';
30 | 	len = strlen(word);     /* as before, the word ends at its first NUL byte */
31 | 	if (len > 0) end = stem(&z, word, (int)len - 1);   /* "" no longer relies on size_t wrap-around */
32 | 	rv = rb_str_new(word, end + 1);
33 | 	xfree(word);
34 | 	return rv;
35 | }
36 | 
37 | VALUE mStemmer;
38 | 
39 | void Init_stemmer(void) { /* extension entry point: sets up the Stemmer module */
40 | 	mStemmer = rb_define_module("Stemmer");
41 | 	rb_define_module_function(mStemmer, "stem_word", stem_word, 1); /* Stemmer.stem_word(word), one argument */
42 | }
43 | 


--------------------------------------------------------------------------------
/fast-stemmer.gemspec:
--------------------------------------------------------------------------------
 1 | # Generated by jeweler
 2 | # DO NOT EDIT THIS FILE DIRECTLY
 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 4 | # -*- encoding: utf-8 -*-
 5 | 
 6 | Gem::Specification.new do |s|
 7 |   s.name = "fast-stemmer"
 8 |   s.version = "1.0.2"
 9 | 
10 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11 |   s.authors = ["Roman Shterenzon"]
12 |   s.date = "2013-02-06"
13 |   s.description = "Fast Porter stemmer based on a C version of algorithm"
14 |   s.license = "BSD"
15 |   s.email = "romanbsd@yahoo.com"
16 |   s.extensions = ["ext/extconf.rb"]
17 |   s.extra_rdoc_files = [
18 |     "LICENSE",
19 |     "README"
20 |   ]
21 |   s.files = [
22 |     "LICENSE",
23 |     "README",
24 |     "Rakefile",
25 |     "VERSION.yml",
26 |     "ext/Makefile",
27 |     "ext/extconf.rb",
28 |     "ext/porter.c",
29 |     "ext/porter_wrap.c",
30 |     "lib/fast-stemmer.rb",
31 |     "lib/fast_stemmer.rb",
32 |     "test/fast_stemmer_test.rb"
33 |   ]
34 |   s.homepage = "http://github.com/romanbsd/fast-stemmer"
35 |   s.require_paths = ["lib"]
36 |   s.rubygems_version = "1.8.23"
37 |   s.summary = "Fast Porter stemmer based on a C version of algorithm"
38 | 
39 |   if s.respond_to? :specification_version then
40 |     s.specification_version = 3
41 | 
42 |     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
43 |     else
44 |     end
45 |   else
46 |   end
47 | end
48 | 
49 | 


--------------------------------------------------------------------------------
/lib/fast-stemmer.rb:
--------------------------------------------------------------------------------
1 | require 'fast_stemmer'
2 | 


--------------------------------------------------------------------------------
/lib/fast_stemmer.rb:
--------------------------------------------------------------------------------
1 | require 'stemmer'
2 | 
3 | class String # reopens the core class to add #stem
4 |   def stem # returns Stemmer.stem_word(self), i.e. the stem of this string
5 |     Stemmer.stem_word(self)
6 |   end
7 | end
8 | 


--------------------------------------------------------------------------------
/test/fast_stemmer_test.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | require 'test/unit'
 3 | require File.join(File.dirname(__FILE__), '..', 'lib', 'fast_stemmer')
 4 | 
 5 | class TestStemmer < Test::Unit::TestCase
 6 | 	def setup
 7 | 		@stems = { 'riding' => 'ride',
 8 | 			'forestalled' => 'forestal',
 9 | 			'combined' => 'combin',
10 | 			'ran' => 'ran',
11 | 			'seen' => 'seen',
12 | 			'excused' => 'excus'
13 | 		}
14 | 	end
15 | 
16 | 	def test_stems
17 | 		@stems.each {|stem| assert_equal(stem[1], stem[0].stem)}
18 | 	end
19 | end
20 | 


--------------------------------------------------------------------------------