├── .gitignore ├── libstemmer-ffi.swig ├── libstemmer_c ├── Makefile ├── src_c │ ├── stem_UTF_8_dutch.h │ ├── stem_UTF_8_danish.h │ ├── stem_UTF_8_english.h │ ├── stem_UTF_8_finnish.h │ ├── stem_UTF_8_french.h │ ├── stem_UTF_8_german.h │ ├── stem_UTF_8_italian.h │ ├── stem_UTF_8_porter.h │ ├── stem_UTF_8_russian.h │ ├── stem_UTF_8_spanish.h │ ├── stem_UTF_8_swedish.h │ ├── stem_UTF_8_turkish.h │ ├── stem_KOI8_R_russian.h │ ├── stem_UTF_8_romanian.h │ ├── stem_UTF_8_hungarian.h │ ├── stem_UTF_8_norwegian.h │ ├── stem_ISO_8859_1_dutch.h │ ├── stem_UTF_8_portuguese.h │ ├── stem_ISO_8859_1_danish.h │ ├── stem_ISO_8859_1_english.h │ ├── stem_ISO_8859_1_finnish.h │ ├── stem_ISO_8859_1_french.h │ ├── stem_ISO_8859_1_german.h │ ├── stem_ISO_8859_1_italian.h │ ├── stem_ISO_8859_1_porter.h │ ├── stem_ISO_8859_1_spanish.h │ ├── stem_ISO_8859_1_swedish.h │ ├── stem_ISO_8859_2_romanian.h │ ├── stem_ISO_8859_1_hungarian.h │ ├── stem_ISO_8859_1_norwegian.h │ ├── stem_ISO_8859_1_portuguese.h │ ├── stem_ISO_8859_1_norwegian.c │ ├── stem_UTF_8_norwegian.c │ ├── stem_ISO_8859_1_swedish.c │ ├── stem_UTF_8_swedish.c │ ├── stem_ISO_8859_1_danish.c │ └── stem_UTF_8_danish.c ├── runtime │ ├── api.h │ ├── api.c │ └── header.h ├── mkinc_utf8.mak ├── MANIFEST ├── libstemmer │ ├── libstemmer.c │ ├── libstemmer_c.in │ ├── libstemmer_utf8.c │ ├── modules_utf8.txt │ ├── modules.txt │ ├── modules_utf8.h │ └── modules.h ├── mkinc.mak ├── include │ └── libstemmer.h ├── README └── examples │ └── stemwords.c ├── package.lisp ├── cl-libstemmer.asd ├── README.md ├── stopwords.lisp ├── stopwords ├── hu.txt ├── fi.txt ├── da.txt ├── sv.txt ├── fr.txt ├── nl.txt ├── nb.txt ├── pt.txt ├── de.txt ├── it.txt ├── en.txt ├── es.txt └── ru.txt ├── libstemmer-ffi.lisp └── cl-libstemmer.lisp /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.o 3 | stemwords 4 | -------------------------------------------------------------------------------- 
/libstemmer-ffi.swig: -------------------------------------------------------------------------------- 1 | %module "libstemmer-ffi" 2 | %include "libstemmer_c/include/libstemmer.h" 3 | -------------------------------------------------------------------------------- /libstemmer_c/Makefile: -------------------------------------------------------------------------------- 1 | include mkinc.mak 2 | CFLAGS=-fPIC 3 | all: libstemmer.o stemwords 4 | libstemmer.o: $(snowball_sources:.c=.o) 5 | $(AR) -cru $@ $^ 6 | libstemmer.so: $(snowball_sources:.c=.o) 7 | $(CC) -shared -o $@ $^ 8 | stemwords: examples/stemwords.o libstemmer.o 9 | $(CC) -o $@ $^ 10 | clean: 11 | rm -f stemwords *.so *.o src_c/*.o runtime/*.o libstemmer/*.o 12 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_dutch.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * dutch_UTF_8_create_env(void); 9 | extern void dutch_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int dutch_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_danish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * danish_UTF_8_create_env(void); 9 | extern void danish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int danish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- 
/libstemmer_c/src_c/stem_UTF_8_english.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * english_UTF_8_create_env(void); 9 | extern void english_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int english_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_finnish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * finnish_UTF_8_create_env(void); 9 | extern void finnish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int finnish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_french.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * french_UTF_8_create_env(void); 9 | extern void french_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int french_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_german.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | 
extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * german_UTF_8_create_env(void); 9 | extern void german_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int german_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_italian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * italian_UTF_8_create_env(void); 9 | extern void italian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int italian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_porter.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * porter_UTF_8_create_env(void); 9 | extern void porter_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int porter_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_russian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * russian_UTF_8_create_env(void); 9 | extern void russian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int russian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 
| #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_spanish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * spanish_UTF_8_create_env(void); 9 | extern void spanish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int spanish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_swedish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * swedish_UTF_8_create_env(void); 9 | extern void swedish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int swedish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_turkish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * turkish_UTF_8_create_env(void); 9 | extern void turkish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int turkish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_KOI8_R_russian.h: -------------------------------------------------------------------------------- 1 | 2 | /* 
This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * russian_KOI8_R_create_env(void); 9 | extern void russian_KOI8_R_close_env(struct SN_env * z); 10 | 11 | extern int russian_KOI8_R_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_romanian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * romanian_UTF_8_create_env(void); 9 | extern void romanian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int romanian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_hungarian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * hungarian_UTF_8_create_env(void); 9 | extern void hungarian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int hungarian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_norwegian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * norwegian_UTF_8_create_env(void); 9 | extern void 
norwegian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int norwegian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_dutch.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * dutch_ISO_8859_1_create_env(void); 9 | extern void dutch_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int dutch_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_portuguese.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * portuguese_UTF_8_create_env(void); 9 | extern void portuguese_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int portuguese_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_danish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * danish_ISO_8859_1_create_env(void); 9 | extern void danish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int danish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | 
-------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_english.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * english_ISO_8859_1_create_env(void); 9 | extern void english_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int english_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_finnish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * finnish_ISO_8859_1_create_env(void); 9 | extern void finnish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int finnish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_french.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * french_ISO_8859_1_create_env(void); 9 | extern void french_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int french_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_german.h: 
-------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * german_ISO_8859_1_create_env(void); 9 | extern void german_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int german_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_italian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * italian_ISO_8859_1_create_env(void); 9 | extern void italian_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int italian_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_porter.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * porter_ISO_8859_1_create_env(void); 9 | extern void porter_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int porter_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_spanish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef 
__cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * spanish_ISO_8859_1_create_env(void); 9 | extern void spanish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int spanish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_swedish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * swedish_ISO_8859_1_create_env(void); 9 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int swedish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_2_romanian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * romanian_ISO_8859_2_create_env(void); 9 | extern void romanian_ISO_8859_2_close_env(struct SN_env * z); 10 | 11 | extern int romanian_ISO_8859_2_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * hungarian_ISO_8859_1_create_env(void); 9 | extern void hungarian_ISO_8859_1_close_env(struct SN_env * 
z); 10 | 11 | extern int hungarian_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void); 9 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * portuguese_ISO_8859_1_create_env(void); 9 | extern void portuguese_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int portuguese_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /package.lisp: -------------------------------------------------------------------------------- 1 | ;;;; package.lisp 2 | 3 | (defpackage #:cl-libstemmer 4 | (:use #:cl #:alexandria #:serapeum) 5 | (:export :*default-encoding* 6 | :stemmer :stemmer-language :stemmer-encoding 7 | :stem-all 8 | :with-stemmer :stem 9 | :load-stemmer :close-stemmer 10 | :no-such-stemmer 11 | :no-such-stemmer-language 12 | :no-such-stemmer-encoding 13 | :stop-word-p :list-stop-words) 14 | (:nicknames #:libstemmer)) 15 | 
-------------------------------------------------------------------------------- /cl-libstemmer.asd: -------------------------------------------------------------------------------- 1 | ;;;; cl-libstemmer.asd 2 | 3 | (defpackage #:cl-libstemmer.asdf 4 | (:use #:cl #:asdf)) 5 | 6 | (in-package #:cl-libstemmer.asdf) 7 | 8 | (defun wrap-package (fn) 9 | (let ((*package* (find-package :cl-libstemmer))) 10 | (funcall fn))) 11 | 12 | (defsystem #:cl-libstemmer 13 | :serial t 14 | :description "Snowball stemming algorithms (FFI)" 15 | :author "Paul M. Rodriguez " 16 | :license "MIT" 17 | :depends-on (#:alexandria 18 | #:serapeum 19 | #:trivial-garbage 20 | #:cffi 21 | #:bordeaux-threads 22 | #:uiop) 23 | :components ((:file "package") 24 | (:file "libstemmer-ffi" 25 | :around-compile wrap-package) 26 | (:file "stopwords") 27 | (:file "cl-libstemmer"))) 28 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/api.h: -------------------------------------------------------------------------------- 1 | 2 | typedef unsigned char symbol; 3 | 4 | /* Or replace 'char' above with 'short' for 16 bit characters. 5 | 6 | More precisely, replace 'char' with whatever type guarantees the 7 | character width you need. Note however that sizeof(symbol) should divide 8 | HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise 9 | there is an alignment problem. In the unlikely event of a problem here, 10 | consult Martin Porter. 
11 | 12 | */ 13 | 14 | struct SN_env { 15 | symbol * p; 16 | int c; int l; int lb; int bra; int ket; 17 | symbol * * S; 18 | int * I; 19 | unsigned char * B; 20 | }; 21 | 22 | extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); 23 | extern void SN_close_env(struct SN_env * z, int S_size); 24 | 25 | extern int SN_set_current(struct SN_env * z, int size, const symbol * s); 26 | 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The [Snowball][snowball] project has defined stemming algorithms for 2 | 17 languages. Libstemmer provides these algorithms as a C library. 3 | 4 | CL-LIBSTEMMER includes the full source of libstemmer, and will attempt 5 | to build and load `libstemmer.so` when it is first loaded. Obviously 6 | this will only work on a system with `make`. 7 | 8 | The preferred way to use CL-LIBSTEMMER is with `stem-all`: 9 | 10 | (libstemmer:stem-all '("visible" "irradiate" "vainglorious" "habitat") 11 | :en) 12 | => '("visibl" "irradi" "vainglori" "habitat"), T 13 | 14 | `stem-all` takes a list of words, a language (as a two or three letter 15 | abbreviation) and, optionally, an encoding. If such a stemmer exists, 16 | stem-all returns the stemmed words; otherwise, it returns the list of 17 | words unchanged. The second value is T if stemming was actually done. 18 | 19 | You can also stem incrementally, using `with-stemmer` and `stem`: 20 | 21 | (libstemmer:with-stemmer (stemmer :en) 22 | (libstemmer:stem stemmer "resplendent")) 23 | => "resplend" 24 | 25 | There are also unbalanced `load-stemmer` and `close-stemmer` 26 | functions. Bear in mind that loading a stemmer is relatively 27 | expensive: for best results, stem in large batches. 28 | 29 | Besides libstemmer itself, CL-LIBSTEMMER also includes the lists of 30 | stop words compiled by the Snowball project. 
31 | 32 | (libstemmer:stop-word-p "is" :es) => T 33 | 34 | [snowball]: http://snowball.tartarus.org/index.php 35 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/api.c: -------------------------------------------------------------------------------- 1 | 2 | #include /* for calloc, free */ 3 | #include "header.h" 4 | 5 | extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) 6 | { 7 | struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); 8 | if (z == NULL) return NULL; 9 | z->p = create_s(); 10 | if (z->p == NULL) goto error; 11 | if (S_size) 12 | { 13 | int i; 14 | z->S = (symbol * *) calloc(S_size, sizeof(symbol *)); 15 | if (z->S == NULL) goto error; 16 | 17 | for (i = 0; i < S_size; i++) 18 | { 19 | z->S[i] = create_s(); 20 | if (z->S[i] == NULL) goto error; 21 | } 22 | } 23 | 24 | if (I_size) 25 | { 26 | z->I = (int *) calloc(I_size, sizeof(int)); 27 | if (z->I == NULL) goto error; 28 | } 29 | 30 | if (B_size) 31 | { 32 | z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char)); 33 | if (z->B == NULL) goto error; 34 | } 35 | 36 | return z; 37 | error: 38 | SN_close_env(z, S_size); 39 | return NULL; 40 | } 41 | 42 | extern void SN_close_env(struct SN_env * z, int S_size) 43 | { 44 | if (z == NULL) return; 45 | if (S_size) 46 | { 47 | int i; 48 | for (i = 0; i < S_size; i++) 49 | { 50 | lose_s(z->S[i]); 51 | } 52 | free(z->S); 53 | } 54 | free(z->I); 55 | free(z->B); 56 | if (z->p) lose_s(z->p); 57 | free(z); 58 | } 59 | 60 | extern int SN_set_current(struct SN_env * z, int size, const symbol * s) 61 | { 62 | int err = replace_s(z, 0, z->l, size, s, NULL); 63 | z->c = 0; 64 | return err; 65 | } 66 | 67 | -------------------------------------------------------------------------------- /libstemmer_c/mkinc_utf8.mak: -------------------------------------------------------------------------------- 1 | # libstemmer/mkinc_utf8.mak: List of stemming module source files 2 | # 
3 | # This file is generated by mkmodules.pl from a list of module names. 4 | # Do not edit manually. 5 | # 6 | # Modules included by this file are: danish, dutch, english, finnish, french, 7 | # german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | # russian, spanish, swedish, turkish 9 | 10 | snowball_sources= \ 11 | src_c/stem_UTF_8_danish.c \ 12 | src_c/stem_UTF_8_dutch.c \ 13 | src_c/stem_UTF_8_english.c \ 14 | src_c/stem_UTF_8_finnish.c \ 15 | src_c/stem_UTF_8_french.c \ 16 | src_c/stem_UTF_8_german.c \ 17 | src_c/stem_UTF_8_hungarian.c \ 18 | src_c/stem_UTF_8_italian.c \ 19 | src_c/stem_UTF_8_norwegian.c \ 20 | src_c/stem_UTF_8_porter.c \ 21 | src_c/stem_UTF_8_portuguese.c \ 22 | src_c/stem_UTF_8_romanian.c \ 23 | src_c/stem_UTF_8_russian.c \ 24 | src_c/stem_UTF_8_spanish.c \ 25 | src_c/stem_UTF_8_swedish.c \ 26 | src_c/stem_UTF_8_turkish.c \ 27 | runtime/api.c \ 28 | runtime/utilities.c \ 29 | libstemmer/libstemmer_utf8.c 30 | 31 | snowball_headers= \ 32 | src_c/stem_UTF_8_danish.h \ 33 | src_c/stem_UTF_8_dutch.h \ 34 | src_c/stem_UTF_8_english.h \ 35 | src_c/stem_UTF_8_finnish.h \ 36 | src_c/stem_UTF_8_french.h \ 37 | src_c/stem_UTF_8_german.h \ 38 | src_c/stem_UTF_8_hungarian.h \ 39 | src_c/stem_UTF_8_italian.h \ 40 | src_c/stem_UTF_8_norwegian.h \ 41 | src_c/stem_UTF_8_porter.h \ 42 | src_c/stem_UTF_8_portuguese.h \ 43 | src_c/stem_UTF_8_romanian.h \ 44 | src_c/stem_UTF_8_russian.h \ 45 | src_c/stem_UTF_8_spanish.h \ 46 | src_c/stem_UTF_8_swedish.h \ 47 | src_c/stem_UTF_8_turkish.h \ 48 | include/libstemmer.h \ 49 | libstemmer/modules_utf8.h \ 50 | runtime/api.h \ 51 | runtime/header.h 52 | 53 | -------------------------------------------------------------------------------- /stopwords.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:libstemmer) 2 | 3 | ;;; NB All of the included stopwords files have been re-encoded as 4 | ;;; UTF-8. 5 | 6 | ;; TODO Use tries. 
(defun snarf-stopwords-file (file)
  "Read FILE and return its stop words as a list of strings.
Each line is cut at the first `|' (everything after it is dropped),
trimmed of surrounding whitespace, and discarded if nothing is left."
  (loop for line in (lines (read-file-into-string file))
        for word = (trim-whitespace (subseq line 0 (position #\| line)))
        unless (emptyp word)
          collect word))

(defun snarf-stopwords (lang)
  "Return the stop words found in stopwords/LANG.txt, resolved relative
to the CL-LIBSTEMMER system directory."
  (snarf-stopwords-file
   (merge-pathnames (make-pathname :name lang :type "txt")
                    (asdf:system-relative-pathname
                     :cl-libstemmer
                     "stopwords/"))))

(defun stopwords ()
  "Return an alist of (LANG . SET), one entry per *.txt file in the
stopwords/ directory, where SET is an EQUAL hash table built from that
file's words."
  (let ((dir (asdf:system-relative-pathname :cl-libstemmer "stopwords/")))
    (mapcar (lambda (path)
              (let ((lang (pathname-name path)))
                (cons lang
                      (set-hash-table (snarf-stopwords lang)
                                      :strict nil
                                      :test 'equal))))
            (directory (merge-pathnames "*.txt" dir)))))

;; Table from language name (the stop-word file's base name) to that
;; language's stop-word set; built once via LOAD-TIME-VALUE.
(defparameter *stopwords*
  (load-time-value
   (alist-hash-table
    (stopwords)
    :test 'equal)
   t))

(defun stop-words (lang &optional (table *stopwords*))
  "Look up the stop-word set for LANG (downcased via STRING-DOWNCASE) in
TABLE. When LANG has no entry, an empty dict is returned instead."
  (let ((key (string-downcase lang)))
    (gethash key table #.(dict))))

(defun stop-word-p (word lang &key (table *stopwords*))
  "Return the entry for WORD in LANG's stop-word set, or NIL if WORD is
not in it. WORD must be a string."
  (check-type word string)
  (let ((set (stop-words lang table)))
    (nth-value 0 (gethash word set))))

(defun list-stop-words (lang)
  "Return the stop words recorded for LANG as a list."
  (let ((table (stop-words lang)))
    (if table
        (hash-table-values table)
        '())))
src_c/stem_ISO_8859_1_hungarian.c 15 | src_c/stem_ISO_8859_1_hungarian.h 16 | src_c/stem_ISO_8859_1_italian.c 17 | src_c/stem_ISO_8859_1_italian.h 18 | src_c/stem_ISO_8859_1_norwegian.c 19 | src_c/stem_ISO_8859_1_norwegian.h 20 | src_c/stem_ISO_8859_1_porter.c 21 | src_c/stem_ISO_8859_1_porter.h 22 | src_c/stem_ISO_8859_1_portuguese.c 23 | src_c/stem_ISO_8859_1_portuguese.h 24 | src_c/stem_ISO_8859_1_spanish.c 25 | src_c/stem_ISO_8859_1_spanish.h 26 | src_c/stem_ISO_8859_1_swedish.c 27 | src_c/stem_ISO_8859_1_swedish.h 28 | src_c/stem_ISO_8859_2_romanian.c 29 | src_c/stem_ISO_8859_2_romanian.h 30 | src_c/stem_KOI8_R_russian.c 31 | src_c/stem_KOI8_R_russian.h 32 | src_c/stem_UTF_8_danish.c 33 | src_c/stem_UTF_8_danish.h 34 | src_c/stem_UTF_8_dutch.c 35 | src_c/stem_UTF_8_dutch.h 36 | src_c/stem_UTF_8_english.c 37 | src_c/stem_UTF_8_english.h 38 | src_c/stem_UTF_8_finnish.c 39 | src_c/stem_UTF_8_finnish.h 40 | src_c/stem_UTF_8_french.c 41 | src_c/stem_UTF_8_french.h 42 | src_c/stem_UTF_8_german.c 43 | src_c/stem_UTF_8_german.h 44 | src_c/stem_UTF_8_hungarian.c 45 | src_c/stem_UTF_8_hungarian.h 46 | src_c/stem_UTF_8_italian.c 47 | src_c/stem_UTF_8_italian.h 48 | src_c/stem_UTF_8_norwegian.c 49 | src_c/stem_UTF_8_norwegian.h 50 | src_c/stem_UTF_8_porter.c 51 | src_c/stem_UTF_8_porter.h 52 | src_c/stem_UTF_8_portuguese.c 53 | src_c/stem_UTF_8_portuguese.h 54 | src_c/stem_UTF_8_romanian.c 55 | src_c/stem_UTF_8_romanian.h 56 | src_c/stem_UTF_8_russian.c 57 | src_c/stem_UTF_8_russian.h 58 | src_c/stem_UTF_8_spanish.c 59 | src_c/stem_UTF_8_spanish.h 60 | src_c/stem_UTF_8_swedish.c 61 | src_c/stem_UTF_8_swedish.h 62 | src_c/stem_UTF_8_turkish.c 63 | src_c/stem_UTF_8_turkish.h 64 | runtime/api.c 65 | runtime/api.h 66 | runtime/header.h 67 | runtime/utilities.c 68 | libstemmer/libstemmer.c 69 | libstemmer/libstemmer_utf8.c 70 | libstemmer/modules.h 71 | libstemmer/modules_utf8.h 72 | include/libstemmer.h 73 | 
-------------------------------------------------------------------------------- /stopwords/hu.txt: -------------------------------------------------------------------------------- 1 | | Hungarian stop word list 2 | | prepared by Anna Tordai 3 | 4 | a 5 | ahogy 6 | ahol 7 | aki 8 | akik 9 | akkor 10 | alatt 11 | által 12 | általában 13 | amely 14 | amelyek 15 | amelyekben 16 | amelyeket 17 | amelyet 18 | amelynek 19 | ami 20 | amit 21 | amolyan 22 | amíg 23 | amikor 24 | át 25 | abban 26 | ahhoz 27 | annak 28 | arra 29 | arról 30 | az 31 | azok 32 | azon 33 | azt 34 | azzal 35 | azért 36 | aztán 37 | azután 38 | azonban 39 | bár 40 | be 41 | belül 42 | benne 43 | cikk 44 | cikkek 45 | cikkeket 46 | csak 47 | de 48 | e 49 | eddig 50 | egész 51 | egy 52 | egyes 53 | egyetlen 54 | egyéb 55 | egyik 56 | egyre 57 | ekkor 58 | el 59 | elég 60 | ellen 61 | elõ 62 | elõször 63 | elõtt 64 | elsõ 65 | én 66 | éppen 67 | ebben 68 | ehhez 69 | emilyen 70 | ennek 71 | erre 72 | ez 73 | ezt 74 | ezek 75 | ezen 76 | ezzel 77 | ezért 78 | és 79 | fel 80 | felé 81 | hanem 82 | hiszen 83 | hogy 84 | hogyan 85 | igen 86 | így 87 | illetve 88 | ill. 
89 | ill 90 | ilyen 91 | ilyenkor 92 | ison 93 | ismét 94 | itt 95 | jó 96 | jól 97 | jobban 98 | kell 99 | kellett 100 | keresztül 101 | keressünk 102 | ki 103 | kívül 104 | között 105 | közül 106 | legalább 107 | lehet 108 | lehetett 109 | legyen 110 | lenne 111 | lenni 112 | lesz 113 | lett 114 | maga 115 | magát 116 | majd 117 | majd 118 | már 119 | más 120 | másik 121 | meg 122 | még 123 | mellett 124 | mert 125 | mely 126 | melyek 127 | mi 128 | mit 129 | míg 130 | miért 131 | milyen 132 | mikor 133 | minden 134 | mindent 135 | mindenki 136 | mindig 137 | mint 138 | mintha 139 | mivel 140 | most 141 | nagy 142 | nagyobb 143 | nagyon 144 | ne 145 | néha 146 | nekem 147 | neki 148 | nem 149 | néhány 150 | nélkül 151 | nincs 152 | olyan 153 | ott 154 | össze 155 | õ 156 | õk 157 | õket 158 | pedig 159 | persze 160 | rá 161 | s 162 | saját 163 | sem 164 | semmi 165 | sok 166 | sokat 167 | sokkal 168 | számára 169 | szemben 170 | szerint 171 | szinte 172 | talán 173 | tehát 174 | teljes 175 | tovább 176 | továbbá 177 | több 178 | úgy 179 | ugyanis 180 | új 181 | újabb 182 | újra 183 | után 184 | utána 185 | utolsó 186 | vagy 187 | vagyis 188 | valaki 189 | valami 190 | valamint 191 | való 192 | vagyok 193 | van 194 | vannak 195 | volt 196 | voltam 197 | voltak 198 | voltunk 199 | vissza 200 | vele 201 | viszont 202 | volna 203 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/libstemmer.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "../include/libstemmer.h" 5 | #include "../runtime/api.h" 6 | #include "modules.h" 7 | 8 | struct sb_stemmer { 9 | struct SN_env * (*create)(void); 10 | void (*close)(struct SN_env *); 11 | int (*stem)(struct SN_env *); 12 | 13 | struct SN_env * env; 14 | }; 15 | 16 | extern const char ** 17 | sb_stemmer_list(void) 18 | { 19 | return algorithm_names; 20 | } 21 | 22 | static stemmer_encoding_t 23 | 
sb_getenc(const char * charenc) 24 | { 25 | struct stemmer_encoding * encoding; 26 | if (charenc == NULL) return ENC_UTF_8; 27 | for (encoding = encodings; encoding->name != 0; encoding++) { 28 | if (strcmp(encoding->name, charenc) == 0) break; 29 | } 30 | if (encoding->name == NULL) return ENC_UNKNOWN; 31 | return encoding->enc; 32 | } 33 | 34 | extern struct sb_stemmer * 35 | sb_stemmer_new(const char * algorithm, const char * charenc) 36 | { 37 | stemmer_encoding_t enc; 38 | struct stemmer_modules * module; 39 | struct sb_stemmer * stemmer; 40 | 41 | enc = sb_getenc(charenc); 42 | if (enc == ENC_UNKNOWN) return NULL; 43 | 44 | for (module = modules; module->name != 0; module++) { 45 | if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; 46 | } 47 | if (module->name == NULL) return NULL; 48 | 49 | stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); 50 | if (stemmer == NULL) return NULL; 51 | 52 | stemmer->create = module->create; 53 | stemmer->close = module->close; 54 | stemmer->stem = module->stem; 55 | 56 | stemmer->env = stemmer->create(); 57 | if (stemmer->env == NULL) 58 | { 59 | sb_stemmer_delete(stemmer); 60 | return NULL; 61 | } 62 | 63 | return stemmer; 64 | } 65 | 66 | void 67 | sb_stemmer_delete(struct sb_stemmer * stemmer) 68 | { 69 | if (stemmer == 0) return; 70 | if (stemmer->close == 0) return; 71 | stemmer->close(stemmer->env); 72 | stemmer->close = 0; 73 | free(stemmer); 74 | } 75 | 76 | const sb_symbol * 77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) 78 | { 79 | int ret; 80 | if (SN_set_current(stemmer->env, size, (const symbol *)(word))) 81 | { 82 | stemmer->env->l = 0; 83 | return NULL; 84 | } 85 | ret = stemmer->stem(stemmer->env); 86 | if (ret < 0) return NULL; 87 | stemmer->env->p[stemmer->env->l] = 0; 88 | return (const sb_symbol *)(stemmer->env->p); 89 | } 90 | 91 | int 92 | sb_stemmer_length(struct sb_stemmer * stemmer) 93 | { 94 | return stemmer->env->l; 95 | } 96 | 
-------------------------------------------------------------------------------- /libstemmer_c/libstemmer/libstemmer_c.in: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "../include/libstemmer.h" 5 | #include "../runtime/api.h" 6 | #include "@MODULES_H@" 7 | 8 | struct sb_stemmer { 9 | struct SN_env * (*create)(void); 10 | void (*close)(struct SN_env *); 11 | int (*stem)(struct SN_env *); 12 | 13 | struct SN_env * env; 14 | }; 15 | 16 | extern const char ** 17 | sb_stemmer_list(void) 18 | { 19 | return algorithm_names; 20 | } 21 | 22 | static stemmer_encoding_t 23 | sb_getenc(const char * charenc) 24 | { 25 | struct stemmer_encoding * encoding; 26 | if (charenc == NULL) return ENC_UTF_8; 27 | for (encoding = encodings; encoding->name != 0; encoding++) { 28 | if (strcmp(encoding->name, charenc) == 0) break; 29 | } 30 | if (encoding->name == NULL) return ENC_UNKNOWN; 31 | return encoding->enc; 32 | } 33 | 34 | extern struct sb_stemmer * 35 | sb_stemmer_new(const char * algorithm, const char * charenc) 36 | { 37 | stemmer_encoding_t enc; 38 | struct stemmer_modules * module; 39 | struct sb_stemmer * stemmer; 40 | 41 | enc = sb_getenc(charenc); 42 | if (enc == ENC_UNKNOWN) return NULL; 43 | 44 | for (module = modules; module->name != 0; module++) { 45 | if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; 46 | } 47 | if (module->name == NULL) return NULL; 48 | 49 | stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); 50 | if (stemmer == NULL) return NULL; 51 | 52 | stemmer->create = module->create; 53 | stemmer->close = module->close; 54 | stemmer->stem = module->stem; 55 | 56 | stemmer->env = stemmer->create(); 57 | if (stemmer->env == NULL) 58 | { 59 | sb_stemmer_delete(stemmer); 60 | return NULL; 61 | } 62 | 63 | return stemmer; 64 | } 65 | 66 | void 67 | sb_stemmer_delete(struct sb_stemmer * stemmer) 68 | { 69 | if (stemmer == 0) return; 70 | if 
(stemmer->close == 0) return; 71 | stemmer->close(stemmer->env); 72 | stemmer->close = 0; 73 | free(stemmer); 74 | } 75 | 76 | const sb_symbol * 77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) 78 | { 79 | int ret; 80 | if (SN_set_current(stemmer->env, size, (const symbol *)(word))) 81 | { 82 | stemmer->env->l = 0; 83 | return NULL; 84 | } 85 | ret = stemmer->stem(stemmer->env); 86 | if (ret < 0) return NULL; 87 | stemmer->env->p[stemmer->env->l] = 0; 88 | return (const sb_symbol *)(stemmer->env->p); 89 | } 90 | 91 | int 92 | sb_stemmer_length(struct sb_stemmer * stemmer) 93 | { 94 | return stemmer->env->l; 95 | } 96 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/libstemmer_utf8.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "../include/libstemmer.h" 5 | #include "../runtime/api.h" 6 | #include "modules_utf8.h" 7 | 8 | struct sb_stemmer { 9 | struct SN_env * (*create)(void); 10 | void (*close)(struct SN_env *); 11 | int (*stem)(struct SN_env *); 12 | 13 | struct SN_env * env; 14 | }; 15 | 16 | extern const char ** 17 | sb_stemmer_list(void) 18 | { 19 | return algorithm_names; 20 | } 21 | 22 | static stemmer_encoding_t 23 | sb_getenc(const char * charenc) 24 | { 25 | struct stemmer_encoding * encoding; 26 | if (charenc == NULL) return ENC_UTF_8; 27 | for (encoding = encodings; encoding->name != 0; encoding++) { 28 | if (strcmp(encoding->name, charenc) == 0) break; 29 | } 30 | if (encoding->name == NULL) return ENC_UNKNOWN; 31 | return encoding->enc; 32 | } 33 | 34 | extern struct sb_stemmer * 35 | sb_stemmer_new(const char * algorithm, const char * charenc) 36 | { 37 | stemmer_encoding_t enc; 38 | struct stemmer_modules * module; 39 | struct sb_stemmer * stemmer; 40 | 41 | enc = sb_getenc(charenc); 42 | if (enc == ENC_UNKNOWN) return NULL; 43 | 44 | for (module = modules; module->name != 0; 
module++) { 45 | if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; 46 | } 47 | if (module->name == NULL) return NULL; 48 | 49 | stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); 50 | if (stemmer == NULL) return NULL; 51 | 52 | stemmer->create = module->create; 53 | stemmer->close = module->close; 54 | stemmer->stem = module->stem; 55 | 56 | stemmer->env = stemmer->create(); 57 | if (stemmer->env == NULL) 58 | { 59 | sb_stemmer_delete(stemmer); 60 | return NULL; 61 | } 62 | 63 | return stemmer; 64 | } 65 | 66 | void 67 | sb_stemmer_delete(struct sb_stemmer * stemmer) 68 | { 69 | if (stemmer == 0) return; 70 | if (stemmer->close == 0) return; 71 | stemmer->close(stemmer->env); 72 | stemmer->close = 0; 73 | free(stemmer); 74 | } 75 | 76 | const sb_symbol * 77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) 78 | { 79 | int ret; 80 | if (SN_set_current(stemmer->env, size, (const symbol *)(word))) 81 | { 82 | stemmer->env->l = 0; 83 | return NULL; 84 | } 85 | ret = stemmer->stem(stemmer->env); 86 | if (ret < 0) return NULL; 87 | stemmer->env->p[stemmer->env->l] = 0; 88 | return (const sb_symbol *)(stemmer->env->p); 89 | } 90 | 91 | int 92 | sb_stemmer_length(struct sb_stemmer * stemmer) 93 | { 94 | return stemmer->env->l; 95 | } 96 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules_utf8.txt: -------------------------------------------------------------------------------- 1 | # This file contains a list of stemmers to include in the distribution. 2 | # The format is a set of space separated lines - on each line: 3 | # First item is name of stemmer. 4 | # Second item is comma separated list of character sets. 5 | # Third item is comma separated list of names to refer to the stemmer by. 6 | # 7 | # Lines starting with a #, or blank lines, are ignored. 8 | 9 | # List all the main algorithms for each language, in UTF-8. 
10 | 11 | danish UTF_8 danish,da,dan 12 | dutch UTF_8 dutch,nl,dut,nld 13 | english UTF_8 english,en,eng 14 | finnish UTF_8 finnish,fi,fin 15 | french UTF_8 french,fr,fre,fra 16 | german UTF_8 german,de,ger,deu 17 | hungarian UTF_8 hungarian,hu,hun 18 | italian UTF_8 italian,it,ita 19 | norwegian UTF_8 norwegian,no,nor 20 | portuguese UTF_8 portuguese,pt,por 21 | romanian UTF_8 romanian,ro,rum,ron 22 | russian UTF_8 russian,ru,rus 23 | spanish UTF_8 spanish,es,esl,spa 24 | swedish UTF_8 swedish,sv,swe 25 | turkish UTF_8 turkish,tr,tur 26 | 27 | # Also include the traditional porter algorithm for english. 28 | # The porter algorithm is included in the libstemmer distribution to assist 29 | # with backwards compatibility, but for new systems the english algorithm 30 | # should be used in preference. 31 | porter UTF_8 porter 32 | 33 | # Some other stemmers in the snowball project are not included in the standard 34 | # distribution. To compile a libstemmer with them in, add them to this list, 35 | # and regenerate the distribution. (You will need a full source checkout for 36 | # this.) They are included in the snowball website as curiosities, but are not 37 | # intended for general use, and use of them is not fully supported. These 38 | # algorithms are: 39 | # 40 | # german2 - This is a slight modification of the german stemmer. 41 | #german2 UTF_8 german2 42 | # 43 | # kraaij_pohlmann - This is a different dutch stemmer. 44 | #kraaij_pohlmann UTF_8 kraaij_pohlmann 45 | # 46 | # lovins - This is an english stemmer, but fairly outdated, and 47 | # only really applicable to a restricted type of input text 48 | # (keywords in academic publications). 
49 | #lovins UTF_8 lovins 50 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/header.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "api.h" 5 | 6 | #define MAXINT INT_MAX 7 | #define MININT INT_MIN 8 | 9 | #define HEAD 2*sizeof(int) 10 | 11 | #define SIZE(p) ((int *)(p))[-1] 12 | #define SET_SIZE(p, n) ((int *)(p))[-1] = n 13 | #define CAPACITY(p) ((int *)(p))[-2] 14 | 15 | struct among 16 | { int s_size; /* number of chars in string */ 17 | const symbol * s; /* search string */ 18 | int substring_i;/* index to longest matching substring */ 19 | int result; /* result of the lookup */ 20 | int (* function)(struct SN_env *); 21 | }; 22 | 23 | extern symbol * create_s(void); 24 | extern void lose_s(symbol * p); 25 | 26 | extern int skip_utf8(const symbol * p, int c, int lb, int l, int n); 27 | 28 | extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 29 | extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 30 | extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 31 | extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 32 | 33 | extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 34 | extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 35 | extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 36 | extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 37 | 38 | extern int eq_s(struct SN_env * z, int s_size, const symbol * s); 39 | extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); 40 | extern int eq_v(struct SN_env * z, const symbol * p); 41 | extern 
int eq_v_b(struct SN_env * z, const symbol * p); 42 | 43 | extern int find_among(struct SN_env * z, const struct among * v, int v_size); 44 | extern int find_among_b(struct SN_env * z, const struct among * v, int v_size); 45 | 46 | extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); 47 | extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s); 48 | extern int slice_from_v(struct SN_env * z, const symbol * p); 49 | extern int slice_del(struct SN_env * z); 50 | 51 | extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); 52 | extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); 53 | 54 | extern symbol * slice_to(struct SN_env * z, symbol * p); 55 | extern symbol * assign_to(struct SN_env * z, symbol * p); 56 | 57 | extern void debug(struct SN_env * z, int number, int line_count); 58 | 59 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules.txt: -------------------------------------------------------------------------------- 1 | # This file contains a list of stemmers to include in the distribution. 2 | # The format is a set of space separated lines - on each line: 3 | # First item is name of stemmer. 4 | # Second item is comma separated list of character sets. 5 | # Third item is comma separated list of names to refer to the stemmer by. 6 | # 7 | # Lines starting with a #, or blank lines, are ignored. 8 | 9 | # List all the main algorithms for each language, in UTF-8, and also with 10 | # the most commonly used encoding. 
11 | 12 | danish UTF_8,ISO_8859_1 danish,da,dan 13 | dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld 14 | english UTF_8,ISO_8859_1 english,en,eng 15 | finnish UTF_8,ISO_8859_1 finnish,fi,fin 16 | french UTF_8,ISO_8859_1 french,fr,fre,fra 17 | german UTF_8,ISO_8859_1 german,de,ger,deu 18 | hungarian UTF_8,ISO_8859_1 hungarian,hu,hun 19 | italian UTF_8,ISO_8859_1 italian,it,ita 20 | norwegian UTF_8,ISO_8859_1 norwegian,no,nor 21 | portuguese UTF_8,ISO_8859_1 portuguese,pt,por 22 | romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron 23 | russian UTF_8,KOI8_R russian,ru,rus 24 | spanish UTF_8,ISO_8859_1 spanish,es,esl,spa 25 | swedish UTF_8,ISO_8859_1 swedish,sv,swe 26 | turkish UTF_8 turkish,tr,tur 27 | 28 | # Also include the traditional porter algorithm for english. 29 | # The porter algorithm is included in the libstemmer distribution to assist 30 | # with backwards compatibility, but for new systems the english algorithm 31 | # should be used in preference. 32 | porter UTF_8,ISO_8859_1 porter 33 | 34 | # Some other stemmers in the snowball project are not included in the standard 35 | # distribution. To compile a libstemmer with them in, add them to this list, 36 | # and regenerate the distribution. (You will need a full source checkout for 37 | # this.) They are included in the snowball website as curiosities, but are not 38 | # intended for general use, and use of them is not fully supported. These 39 | # algorithms are: 40 | # 41 | # german2 - This is a slight modification of the german stemmer. 42 | #german2 UTF_8,ISO_8859_1 german2 43 | # 44 | # kraaij_pohlmann - This is a different dutch stemmer. 45 | #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann 46 | # 47 | # lovins - This is an english stemmer, but fairly outdated, and 48 | # only really applicable to a restricted type of input text 49 | # (keywords in academic publications). 
50 | #lovins UTF_8,ISO_8859_1 lovins 51 | -------------------------------------------------------------------------------- /stopwords/fi.txt: -------------------------------------------------------------------------------- 1 | | forms of BE 2 | 3 | olla 4 | olen 5 | olet 6 | on 7 | olemme 8 | olette 9 | ovat 10 | ole | negative form 11 | 12 | oli 13 | olisi 14 | olisit 15 | olisin 16 | olisimme 17 | olisitte 18 | olisivat 19 | olit 20 | olin 21 | olimme 22 | olitte 23 | olivat 24 | ollut 25 | olleet 26 | 27 | en | negation 28 | et 29 | ei 30 | emme 31 | ette 32 | eivät 33 | 34 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 35 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 36 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 37 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 38 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 39 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 40 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 41 | 42 | tämä tämän tätä tässä tästä tähän tällä tältä tälle tänä täksi | this 43 | tuo tuon tuota tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 44 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 45 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 46 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 47 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 48 | 49 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 50 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 51 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what 52 | mitkä | (pl) 53 | 54 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which 55 | jotka 
joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 56 | 57 | | conjunctions 58 | 59 | että | that 60 | ja | and 61 | jos | if 62 | koska | because 63 | kuin | than 64 | mutta | but 65 | niin | so 66 | sekä | and 67 | sillä | for 68 | tai | or 69 | vaan | but 70 | vai | or 71 | vaikka | although 72 | 73 | 74 | | prepositions 75 | 76 | kanssa | with 77 | mukaan | according to 78 | noin | about 79 | poikki | across 80 | yli | over, across 81 | 82 | | other 83 | 84 | kun | when 85 | niin | so 86 | nyt | now 87 | itse | self 88 | -------------------------------------------------------------------------------- /libstemmer_c/mkinc.mak: -------------------------------------------------------------------------------- 1 | # libstemmer/mkinc.mak: List of stemming module source files 2 | # 3 | # This file is generated by mkmodules.pl from a list of module names. 4 | # Do not edit manually. 5 | # 6 | # Modules included by this file are: danish, dutch, english, finnish, french, 7 | # german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | # russian, spanish, swedish, turkish 9 | 10 | snowball_sources= \ 11 | src_c/stem_ISO_8859_1_danish.c \ 12 | src_c/stem_UTF_8_danish.c \ 13 | src_c/stem_ISO_8859_1_dutch.c \ 14 | src_c/stem_UTF_8_dutch.c \ 15 | src_c/stem_ISO_8859_1_english.c \ 16 | src_c/stem_UTF_8_english.c \ 17 | src_c/stem_ISO_8859_1_finnish.c \ 18 | src_c/stem_UTF_8_finnish.c \ 19 | src_c/stem_ISO_8859_1_french.c \ 20 | src_c/stem_UTF_8_french.c \ 21 | src_c/stem_ISO_8859_1_german.c \ 22 | src_c/stem_UTF_8_german.c \ 23 | src_c/stem_ISO_8859_1_hungarian.c \ 24 | src_c/stem_UTF_8_hungarian.c \ 25 | src_c/stem_ISO_8859_1_italian.c \ 26 | src_c/stem_UTF_8_italian.c \ 27 | src_c/stem_ISO_8859_1_norwegian.c \ 28 | src_c/stem_UTF_8_norwegian.c \ 29 | src_c/stem_ISO_8859_1_porter.c \ 30 | src_c/stem_UTF_8_porter.c \ 31 | src_c/stem_ISO_8859_1_portuguese.c \ 32 | src_c/stem_UTF_8_portuguese.c \ 33 | src_c/stem_ISO_8859_2_romanian.c \ 34 | 
src_c/stem_UTF_8_romanian.c \ 35 | src_c/stem_KOI8_R_russian.c \ 36 | src_c/stem_UTF_8_russian.c \ 37 | src_c/stem_ISO_8859_1_spanish.c \ 38 | src_c/stem_UTF_8_spanish.c \ 39 | src_c/stem_ISO_8859_1_swedish.c \ 40 | src_c/stem_UTF_8_swedish.c \ 41 | src_c/stem_UTF_8_turkish.c \ 42 | runtime/api.c \ 43 | runtime/utilities.c \ 44 | libstemmer/libstemmer.c 45 | 46 | snowball_headers= \ 47 | src_c/stem_ISO_8859_1_danish.h \ 48 | src_c/stem_UTF_8_danish.h \ 49 | src_c/stem_ISO_8859_1_dutch.h \ 50 | src_c/stem_UTF_8_dutch.h \ 51 | src_c/stem_ISO_8859_1_english.h \ 52 | src_c/stem_UTF_8_english.h \ 53 | src_c/stem_ISO_8859_1_finnish.h \ 54 | src_c/stem_UTF_8_finnish.h \ 55 | src_c/stem_ISO_8859_1_french.h \ 56 | src_c/stem_UTF_8_french.h \ 57 | src_c/stem_ISO_8859_1_german.h \ 58 | src_c/stem_UTF_8_german.h \ 59 | src_c/stem_ISO_8859_1_hungarian.h \ 60 | src_c/stem_UTF_8_hungarian.h \ 61 | src_c/stem_ISO_8859_1_italian.h \ 62 | src_c/stem_UTF_8_italian.h \ 63 | src_c/stem_ISO_8859_1_norwegian.h \ 64 | src_c/stem_UTF_8_norwegian.h \ 65 | src_c/stem_ISO_8859_1_porter.h \ 66 | src_c/stem_UTF_8_porter.h \ 67 | src_c/stem_ISO_8859_1_portuguese.h \ 68 | src_c/stem_UTF_8_portuguese.h \ 69 | src_c/stem_ISO_8859_2_romanian.h \ 70 | src_c/stem_UTF_8_romanian.h \ 71 | src_c/stem_KOI8_R_russian.h \ 72 | src_c/stem_UTF_8_russian.h \ 73 | src_c/stem_ISO_8859_1_spanish.h \ 74 | src_c/stem_UTF_8_spanish.h \ 75 | src_c/stem_ISO_8859_1_swedish.h \ 76 | src_c/stem_UTF_8_swedish.h \ 77 | src_c/stem_UTF_8_turkish.h \ 78 | include/libstemmer.h \ 79 | libstemmer/modules.h \ 80 | runtime/api.h \ 81 | runtime/header.h 82 | 83 | -------------------------------------------------------------------------------- /libstemmer-ffi.lisp: -------------------------------------------------------------------------------- 1 | ;;; This file was automatically generated by SWIG (http://www.swig.org). 
;;; Version 2.0.4
;;;
;;; Do not make changes to this file unless you know what you are doing--modify
;;; the SWIG interface file instead.


;;;SWIG wrapper code starts here

;; Expands into a PROGN of DEFCONSTANT forms, one per element of ENUMS.
;; A bare symbol takes the running index (starting at 0); a list
;; (NAME VALUE) resets the running index to VALUE before defining NAME.
(cl:defmacro defanonenum (&body enums)
   "Converts anonymous enums to defconstants."
  `(cl:progn ,@(cl:loop for value in enums
                        for index = 0 then (cl:1+ index)
                        when (cl:listp value) do (cl:setf index (cl:second value)
                                                          value (cl:first value))
                        collect `(cl:defconstant ,value ,index))))

;; SWIG-LISPIFY is defined only if no function of that name exists yet.
;; It turns the C identifier string NAME into a symbol interned in PACKAGE:
;;   - letters are upcased and underscores become hyphens;
;;   - a hyphen is inserted before an upper-case letter that follows a
;;     lower-case letter or digit, and before a digit that follows a letter;
;;   - the name is wrapped in "+" when FLAG is CONSTANT or ENUMVALUE and in
;;     "*" when FLAG is VARIABLE; any other FLAG adds nothing;
;;   - any other character signals an error.
(cl:eval-when (:compile-toplevel :load-toplevel)
  (cl:unless (cl:fboundp 'swig-lispify)
    (cl:defun swig-lispify (name flag cl:&optional (package cl:*package*))
      ;; HELPER walks the character list LST, remembering the class of the
      ;; previous character in LAST and accumulating the output, in reverse
      ;; order, in REST.
      (cl:labels ((helper (lst last rest cl:&aux (c (cl:car lst)))
                    (cl:cond
                      ((cl:null lst)
                       rest)
                      ((cl:upper-case-p c)
                       (helper (cl:cdr lst) 'upper
                               (cl:case last
                                 ((lower digit) (cl:list* c #\- rest))
                                 (cl:t (cl:cons c rest)))))
                      ((cl:lower-case-p c)
                       (helper (cl:cdr lst) 'lower (cl:cons (cl:char-upcase c) rest)))
                      ((cl:digit-char-p c)
                       (helper (cl:cdr lst) 'digit
                               (cl:case last
                                 ((upper lower) (cl:list* c #\- rest))
                                 (cl:t (cl:cons c rest)))))
                      ((cl:char-equal c #\_)
                       (helper (cl:cdr lst) '_ (cl:cons #\- rest)))
                      (cl:t
                       (cl:error "Invalid character: ~A" c)))))
        (cl:let ((fix (cl:case flag
                        ((constant enumvalue) "+")
                        (variable "*")
                        (cl:t ""))))
          (cl:intern
           (cl:concatenate
            'cl:string
            fix
            ;; HELPER returns the characters reversed; restore their order.
            (cl:nreverse (helper (cl:concatenate 'cl:list name) cl:nil cl:nil))
            fix)
           package))))))

;;;SWIG wrapper code ends here


;; Foreign function declarations, one per entry point of libstemmer.h.
;; The Lisp names keep the C spelling, underscores included.

;; Returns a foreign pointer to the array of algorithm names.
(cffi:defcfun ("sb_stemmer_list" sb_stemmer_list) :pointer
  )

;; Returns a foreign pointer to a new stemmer.
(cffi:defcfun ("sb_stemmer_new" sb_stemmer_new) :pointer
  (algorithm :string)
  (charenc :string))

(cffi:defcfun ("sb_stemmer_delete" sb_stemmer_delete) :void
  (stemmer :pointer))

;; WORD is a raw foreign pointer and SIZE its length; the result is a
;; foreign pointer as well, so conversion to and from Lisp strings is left
;; to the caller.
(cffi:defcfun ("sb_stemmer_stem" sb_stemmer_stem) :pointer
  (stemmer :pointer)
  (word :pointer)
  (size :int))

(cffi:defcfun ("sb_stemmer_length" sb_stemmer_length) :int
  (stemmer :pointer))
/* Make header file work when included from C++ */
#ifdef __cplusplus
extern "C" {
#endif

/* Opaque stemmer handle; the structure is defined by the library's
 * implementation file, not by this header. */
struct sb_stemmer;
/* Unit of input and output text: one unsigned byte. */
typedef unsigned char sb_symbol;

/* FIXME - should be able to get a version number for each stemming
 * algorithm (which will be incremented each time the output changes). */

/** Returns an array of the names of the available stemming algorithms.
 *  Note that these are the canonical names - aliases (ie, other names for
 *  the same algorithm) will not be included in the list.
 *  The list is terminated with a null pointer.
 *
 *  The list must not be modified in any way.
 */
const char ** sb_stemmer_list(void);

/** Create a new stemmer object, using the specified algorithm, for the
 *  specified character encoding.
 *
 *  All algorithms will usually be available in UTF-8, but may also be
 *  available in other character encodings.
 *
 *  @param algorithm The algorithm name.  This is either the english
 *  name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
 *  language.  Note that case is significant in this parameter - the
 *  value should be supplied in lower case.
 *
 *  @param charenc The character encoding.  NULL may be passed as
 *  this value, in which case UTF-8 encoding will be assumed. Otherwise,
 *  the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
 *  "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian).  Note that
 *  case is significant in this parameter.
 *
 *  @return NULL if the specified algorithm is not recognised, or the
 *  algorithm is not available for the requested encoding.  Otherwise,
 *  returns a pointer to a newly created stemmer for the requested algorithm.
 *  The returned pointer must be deleted by calling sb_stemmer_delete().
 *
 *  @note NULL will also be returned if an out of memory error occurs.
 */
struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);

/** Delete a stemmer object.
 *
 *  This frees all resources allocated for the stemmer.  After calling
 *  this function, the supplied stemmer may no longer be used in any way.
 *
 *  It is safe to pass a null pointer to this function - this will have
 *  no effect.
 */
void                sb_stemmer_delete(struct sb_stemmer * stemmer);

/** Stem a word.
 *
 *  @param stemmer A stemmer obtained from sb_stemmer_new().
 *
 *  @param word The symbols of the word to stem, in the encoding the
 *  stemmer was created for.
 *
 *  @param size The number of symbols in word; it is forwarded together
 *  with word to the runtime when the word is loaded.
 *
 *  The return value is owned by the stemmer - it must not be freed or
 *  modified, and it will become invalid when the stemmer is called again,
 *  or if the stemmer is freed.
 *
 *  The length of the return value can be obtained using sb_stemmer_length().
 *  The implementation also writes a zero symbol directly after the last
 *  symbol of the result.
 *
 *  If an out-of-memory error occurs, this will return NULL.  NULL is also
 *  returned when the algorithm's stem routine reports a negative result.
 */
const sb_symbol *   sb_stemmer_stem(struct sb_stemmer * stemmer,
				    const sb_symbol * word, int size);

/** Get the length of the result of the last stemmed word.
 *  This should not be called before sb_stemmer_stem() has been called.
 *
 *  The value is the length currently stored in the stemmer's environment,
 *  counted in symbols.
 */
int                 sb_stemmer_length(struct sb_stemmer * stemmer);

#ifdef __cplusplus
}
#endif
3 | 4 | | This is a ranked list (commonest to rarest) of stopwords derived from 5 | | a large text sample. 6 | 7 | 8 | og | and 9 | i | in 10 | jeg | I 11 | det | that (dem. pronoun)/it (pers. pronoun) 12 | at | that (in front of a sentence)/to (with infinitive) 13 | en | a/an 14 | den | it (pers. pronoun)/that (dem. pronoun) 15 | til | to/at/for/until/against/by/of/into, more 16 | er | present tense of "to be" 17 | som | who, as 18 | på | on/upon/in/on/at/to/after/of/with/for, on 19 | de | they 20 | med | with/by/in, along 21 | han | he 22 | af | of/by/from/off/for/in/with/on, off 23 | for | at/for/to/from/by/of/ago, in front/before, because 24 | ikke | not 25 | der | who/which, there/those 26 | var | past tense of "to be" 27 | mig | me/myself 28 | sig | oneself/himself/herself/itself/themselves 29 | men | but 30 | et | a/an/one, one (number), someone/somebody/one 31 | har | present tense of "to have" 32 | om | round/about/for/in/a, about/around/down, if 33 | vi | we 34 | min | my 35 | havde | past tense of "to have" 36 | ham | him 37 | hun | she 38 | nu | now 39 | over | over/above/across/by/beyond/past/on/about, over/past 40 | da | then, when/as/since 41 | fra | from/off/since, off, since 42 | du | you 43 | ud | out 44 | sin | his/her/its/one's 45 | dem | them 46 | os | us/ourselves 47 | op | up 48 | man | you/one 49 | hans | his 50 | hvor | where 51 | eller | or 52 | hvad | what 53 | skal | must/shall etc. 54 | selv | myself/youself/herself/ourselves etc., even 55 | her | here 56 | alle | all/everyone/everybody etc. 
57 | vil | will (verb) 58 | blev | past tense of "to stay/to remain/to get/to become" 59 | kunne | could 60 | ind | in 61 | når | when 62 | være | present tense of "to be" 63 | dog | however/yet/after all 64 | noget | something 65 | ville | would 66 | jo | you know/you see (adv), yes 67 | deres | their/theirs 68 | efter | after/behind/according to/for/by/from, later/afterwards 69 | ned | down 70 | skulle | should 71 | denne | this 72 | end | than 73 | dette | this 74 | mit | my/mine 75 | også | also 76 | under | under/beneath/below/during, below/underneath 77 | have | have 78 | dig | you 79 | anden | other 80 | hende | her 81 | mine | my 82 | alt | everything 83 | meget | much/very, plenty of 84 | sit | his, her, its, one's 85 | sine | his, her, its, one's 86 | vor | our 87 | mod | against 88 | disse | these 89 | hvis | if 90 | din | your/yours 91 | nogle | some 92 | hos | by/at 93 | blive | be/become 94 | mange | many 95 | ad | by/through 96 | bliver | present tense of "to be/to become" 97 | hendes | her/hers 98 | været | be 99 | thi | for (conj) 100 | jer | you 101 | sådan | such, like this/like that 102 | -------------------------------------------------------------------------------- /stopwords/sv.txt: -------------------------------------------------------------------------------- 1 | | A Swedish stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | | This is a ranked list (commonest to rarest) of stopwords derived from 5 | | a large text sample. 6 | 7 | | Swedish stop words occasionally exhibit homonym clashes. For example 8 | | så = so, but also seed. These are indicated clearly below. 
9 | 10 | och | and 11 | det | it, this/that 12 | att | to (with infinitive) 13 | i | in, at 14 | en | a 15 | jag | I 16 | hon | she 17 | som | who, that 18 | han | he 19 | på | on 20 | den | it, this/that 21 | med | with 22 | var | where, each 23 | sig | him(self) etc 24 | för | for 25 | så | so (also: seed) 26 | till | to 27 | är | is 28 | men | but 29 | ett | a 30 | om | if; around, about 31 | hade | had 32 | de | they, these/those 33 | av | of 34 | icke | not, no 35 | mig | me 36 | du | you 37 | henne | her 38 | då | then, when 39 | sin | his 40 | nu | now 41 | har | have 42 | inte | inte någon = no one 43 | hans | his 44 | honom | him 45 | skulle | 'sake' 46 | hennes | her 47 | där | there 48 | min | my 49 | man | one (pronoun) 50 | ej | nor 51 | vid | at, by, on (also: vast) 52 | kunde | could 53 | något | some etc 54 | från | from, off 55 | ut | out 56 | när | when 57 | efter | after, behind 58 | upp | up 59 | vi | we 60 | dem | them 61 | vara | be 62 | vad | what 63 | över | over 64 | än | than 65 | dig | you 66 | kan | can 67 | sina | his 68 | här | here 69 | ha | have 70 | mot | towards 71 | alla | all 72 | under | under (also: wonder) 73 | någon | some etc 74 | eller | or (else) 75 | allt | all 76 | mycket | much 77 | sedan | since 78 | ju | why 79 | denna | this/that 80 | själv | myself, yourself etc 81 | detta | this/that 82 | åt | to 83 | utan | without 84 | varit | was 85 | hur | how 86 | ingen | no 87 | mitt | my 88 | ni | you 89 | bli | to be, become 90 | blev | from bli 91 | oss | us 92 | din | thy 93 | dessa | these/those 94 | några | some etc 95 | deras | their 96 | blir | from bli 97 | mina | my 98 | samma | (the) same 99 | vilken | who, that 100 | er | you, your 101 | sådan | such a 102 | vår | our 103 | blivit | from bli 104 | dess | its 105 | inom | within 106 | mellan | between 107 | sådant | such a 108 | varför | why 109 | varje | each 110 | vilka | who, that 111 | ditt | thy 112 | vem | who 113 | vilket | who, that 114 | sitta | his 115 | 
sådana | such a 116 | vart | each 117 | dina | thy 118 | vars | whose 119 | vårt | our 120 | våra | our 121 | ert | your 122 | era | your 123 | vilkas | whose 124 | -------------------------------------------------------------------------------- /stopwords/fr.txt: -------------------------------------------------------------------------------- 1 | | A French stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | au | a + le 5 | aux | a + les 6 | avec | with 7 | ce | this 8 | ces | these 9 | dans | in 10 | de | of 11 | des | de + les 12 | du | de + le 13 | elle | she 14 | en | `of them' etc 15 | et | and 16 | eux | them 17 | il | he 18 | je | I 19 | la | the 20 | le | the 21 | leur | their 22 | lui | him 23 | ma | my (fem) 24 | mais | but 25 | me | me 26 | même | same; as in moi-même (myself) etc 27 | mes | my (pl) 28 | moi | me 29 | mon | my (masc) 30 | ne | not 31 | nos | our (pl) 32 | notre | our 33 | nous | we 34 | on | one 35 | ou | or 36 | par | by 37 | pas | not 38 | pour | for 39 | qu | que before vowel 40 | que | that 41 | qui | who 42 | sa | his, her (fem) 43 | se | oneself 44 | ses | his (pl) 45 | son | his, her (masc) 46 | sur | on 47 | ta | thy (fem) 48 | te | thee 49 | tes | thy (pl) 50 | toi | thee 51 | ton | thy (masc) 52 | tu | thou 53 | un | a 54 | une | a 55 | vos | your (pl) 56 | votre | your 57 | vous | you 58 | 59 | | single letter forms 60 | 61 | c | c' 62 | d | d' 63 | j | j' 64 | l | l' 65 | à | to, at 66 | m | m' 67 | n | n' 68 | s | s' 69 | t | t' 70 | y | there 71 | 72 | | forms of être (not including the infinitive): 73 | été 74 | étée 75 | étées 76 | étés 77 | étant 78 | suis 79 | es 80 | est 81 | sommes 82 | êtes 83 | sont 84 | serai 85 | seras 86 | sera 87 | serons 88 | serez 89 | seront 90 | serais 91 | serait 92 | serions 93 | seriez 94 | seraient 95 | étais 96 | était 97 | étions 98 | étiez 99 | étaient 100 | fus 101 | fut 102 | fûmes 103 | fûtes 104 | furent 105 | sois 106 | 
soit 107 | soyons 108 | soyez 109 | soient 110 | fusse 111 | fusses 112 | fût 113 | fussions 114 | fussiez 115 | fussent 116 | 117 | | forms of avoir (not including the infinitive): 118 | ayant 119 | eu 120 | eue 121 | eues 122 | eus 123 | ai 124 | as 125 | avons 126 | avez 127 | ont 128 | aurai 129 | auras 130 | aura 131 | aurons 132 | aurez 133 | auront 134 | aurais 135 | aurait 136 | aurions 137 | auriez 138 | auraient 139 | avais 140 | avait 141 | avions 142 | aviez 143 | avaient 144 | eut 145 | eûmes 146 | eûtes 147 | eurent 148 | aie 149 | aies 150 | ait 151 | ayons 152 | ayez 153 | aient 154 | eusse 155 | eusses 156 | eût 157 | eussions 158 | eussiez 159 | eussent 160 | 161 | | Later additions (from Jean-Christophe Deschamps) 162 | ceci | this 163 | cela | that (added 11 Apr 2012. Omission reported by Adrien Grand) 164 | celà | that (incorrect, though common) 165 | cet | this 166 | cette | this 167 | ici | here 168 | ils | they 169 | les | the (pl) 170 | leurs | their (pl) 171 | quel | which 172 | quels | which 173 | quelle | which 174 | quelles | which 175 | sans | without 176 | soi | oneself 177 | -------------------------------------------------------------------------------- /stopwords/nl.txt: -------------------------------------------------------------------------------- 1 | | A Dutch stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | | This is a ranked list (commonest to rarest) of stopwords derived from 5 | | a large sample of Dutch text. 6 | 7 | | Dutch stop words frequently exhibit homonym clashes. These are indicated 8 | | clearly below. 
9 | 10 | de | the 11 | en | and 12 | van | of, from 13 | ik | I, the ego 14 | te | (1) chez, at etc, (2) to, (3) too 15 | dat | that, which 16 | die | that, those, who, which 17 | in | in, inside 18 | een | a, an, one 19 | hij | he 20 | het | the, it 21 | niet | not, nothing, naught 22 | zijn | (1) to be, being, (2) his, one's, its 23 | is | is 24 | was | (1) was, past tense of all persons sing. of 'zijn' (to be), (2) wax, (3) the washing, (4) rise of river 25 | op | on, upon, at, in, up, used up 26 | aan | on, upon, to (as dative) 27 | met | with, by 28 | als | like, such as, when 29 | voor | (1) before, in front of, (2) furrow 30 | had | had, past tense all persons sing. of 'hebben' (have) 31 | er | there 32 | maar | but, only 33 | om | round, about, for etc 34 | hem | him 35 | dan | then 36 | zou | should/would, past tense all persons sing. of 'zullen' 37 | of | or, whether, if 38 | wat | what, something, anything 39 | mijn | possessive and noun 'mine' 40 | men | people, 'one' 41 | dit | this 42 | zo | so, thus, in this way 43 | door | through, by 44 | over | over, across 45 | ze | she, her, they, them 46 | zich | oneself 47 | bij | (1) a bee, (2) by, near, at 48 | ook | also, too 49 | tot | till, until 50 | je | you 51 | mij | me 52 | uit | out of, from 53 | der | Old Dutch form of 'van der' still found in surnames 54 | daar | (1) there, (2) because 55 | haar | (1) her, their, them, (2) hair 56 | naar | (1) unpleasant, unwell etc, (2) towards, (3) as 57 | heb | present first person sing. of 'to have' 58 | hoe | how, why 59 | heeft | present third person sing. of 'to have' 60 | hebben | 'to have' and various parts thereof 61 | deze | this 62 | u | you 63 | want | (1) for, (2) mitten, (3) rigging 64 | nog | yet, still 65 | zal | 'shall', first and third person sing. 
of verb 'zullen' (will) 66 | me | me 67 | zij | she, they 68 | nu | now 69 | ge | 'thou', still used in Belgium and south Netherlands 70 | geen | none 71 | omdat | because 72 | iets | something, somewhat 73 | worden | to become, grow, get 74 | toch | yet, still 75 | al | all, every, each 76 | waren | (1) 'were', (2) to wander, (3) wares 77 | veel | much, many 78 | meer | (1) more, (2) lake 79 | doen | to do, to make 80 | toen | then, when 81 | moet | noun 'spot/mote' and present form of 'to must' 82 | ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' 83 | zonder | without 84 | kan | noun 'can' and present form of 'to be able' 85 | hun | their, them 86 | dus | so, consequently 87 | alles | all, everything, anything 88 | onder | under, beneath 89 | ja | yes, of course 90 | eens | once, one day 91 | hier | here 92 | wie | who 93 | werd | imperfect third person sing. of 'become' 94 | altijd | always 95 | doch | yet, but etc 96 | wordt | present third person sing. of 'become' 97 | wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans 98 | kunnen | to be able 99 | ons | us/our 100 | zelf | self 101 | tegen | against, towards, at 102 | na | after, near 103 | reeds | already 104 | wil | (1) present tense of 'want', (2) 'will', noun, (3) fender 105 | kon | could; past tense of 'to be able' 106 | niets | nothing 107 | uw | your 108 | iemand | somebody 109 | geweest | been; past participle of 'be' 110 | andere | other 111 | -------------------------------------------------------------------------------- /stopwords/nb.txt: -------------------------------------------------------------------------------- 1 | | A Norwegian stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | | This stop word list is for the dominant bokmål dialect. Words unique 5 | | to nynorsk are marked *. 
6 | 7 | | Revised by Jan Bruusgaard , Jan 2005 8 | 9 | og | and 10 | i | in 11 | jeg | I 12 | det | it/this/that 13 | at | to (w. inf.) 14 | en | a/an 15 | et | a/an 16 | den | it/this/that 17 | til | to 18 | er | is/am/are 19 | som | who/that 20 | på | on 21 | de | they / you(formal) 22 | med | with 23 | han | he 24 | av | of 25 | ikke | not 26 | ikkje | not * 27 | der | there 28 | så | so 29 | var | was/were 30 | meg | me 31 | seg | you 32 | men | but 33 | ett | one 34 | har | have 35 | om | about 36 | vi | we 37 | min | my 38 | mitt | my 39 | ha | have 40 | hadde | had 41 | hun | she 42 | nå | now 43 | over | over 44 | da | when/as 45 | ved | by/know 46 | fra | from 47 | du | you 48 | ut | out 49 | sin | your 50 | dem | them 51 | oss | us 52 | opp | up 53 | man | you/one 54 | kan | can 55 | hans | his 56 | hvor | where 57 | eller | or 58 | hva | what 59 | skal | shall/must 60 | selv | self (reflective) 61 | sjøl | self (reflective) 62 | her | here 63 | alle | all 64 | vil | will 65 | bli | become 66 | ble | became 67 | blei | became * 68 | blitt | have become 69 | kunne | could 70 | inn | in 71 | når | when 72 | være | be 73 | kom | come 74 | noen | some 75 | noe | some 76 | ville | would 77 | dere | you 78 | som | who/which/that 79 | deres | their/theirs 80 | kun | only/just 81 | ja | yes 82 | etter | after 83 | ned | down 84 | skulle | should 85 | denne | this 86 | for | for/because 87 | deg | you 88 | si | hers/his 89 | sine | hers/his 90 | sitt | hers/his 91 | mot | against 92 | å | to 93 | meget | much 94 | hvorfor | why 95 | dette | this 96 | disse | these/those 97 | uten | without 98 | hvordan | how 99 | ingen | none 100 | din | your 101 | ditt | your 102 | blir | become 103 | samme | same 104 | hvilken | which 105 | hvilke | which (plural) 106 | sånn | such a 107 | inni | inside/within 108 | mellom | between 109 | vår | our 110 | hver | each 111 | hvem | who 112 | vors | us/ours 113 | hvis | whose 114 | både | both 115 | bare | only/just 116 | enn | than 
117 | fordi | as/because 118 | før | before 119 | mange | many 120 | også | also 121 | slik | just 122 | vært | been 123 | være | to be 124 | båe | both * 125 | begge | both 126 | siden | since 127 | dykk | your * 128 | dykkar | yours * 129 | dei | they * 130 | deira | them * 131 | deires | theirs * 132 | deim | them * 133 | di | your (fem.) * 134 | då | as/when * 135 | eg | I * 136 | ein | a/an * 137 | eit | a/an * 138 | eitt | a/an * 139 | elles | or * 140 | honom | he * 141 | hjå | at * 142 | ho | she * 143 | hoe | she * 144 | henne | her 145 | hennar | her/hers 146 | hennes | hers 147 | hoss | how * 148 | hossen | how * 149 | ikkje | not * 150 | ingi | noone * 151 | inkje | noone * 152 | korleis | how * 153 | korso | how * 154 | kva | what/which * 155 | kvar | where * 156 | kvarhelst | where * 157 | kven | who/whom * 158 | kvi | why * 159 | kvifor | why * 160 | me | we * 161 | medan | while * 162 | mi | my * 163 | mine | my * 164 | mykje | much * 165 | no | now * 166 | nokon | some (masc./neut.) * 167 | noka | some (fem.) * 168 | nokor | some * 169 | noko | some * 170 | nokre | some * 171 | si | his/hers * 172 | sia | since * 173 | sidan | since * 174 | so | so * 175 | somt | some * 176 | somme | some * 177 | um | about* 178 | upp | up * 179 | vere | be * 180 | vore | was * 181 | verte | become * 182 | vort | become * 183 | varte | became * 184 | vart | became * 185 | -------------------------------------------------------------------------------- /stopwords/pt.txt: -------------------------------------------------------------------------------- 1 | | A Portuguese stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | 5 | | The following is a ranked list (commonest to rarest) of stopwords 6 | | deriving from a large sample of text. 7 | 8 | | Extra words have been added at the end. 
9 | 10 | de | of, from 11 | a | the; to, at; her 12 | o | the; him 13 | que | who, that 14 | e | and 15 | do | de + o 16 | da | de + a 17 | em | in 18 | um | a 19 | para | for 20 | | é from SER 21 | com | with 22 | não | not, no 23 | uma | a 24 | os | the; them 25 | no | em + o 26 | se | himself etc 27 | na | em + a 28 | por | for 29 | mais | more 30 | as | the; them 31 | dos | de + os 32 | como | as, like 33 | mas | but 34 | | foi from SER 35 | ao | a + o 36 | ele | he 37 | das | de + as 38 | | tem from TER 39 | à | a + a 40 | seu | his 41 | sua | her 42 | ou | or 43 | | ser from SER 44 | quando | when 45 | muito | much 46 | | há from HAV 47 | nos | em + os; us 48 | já | already, now 49 | | está from EST 50 | eu | I 51 | também | also 52 | só | only, just 53 | pelo | per + o 54 | pela | per + a 55 | até | up to 56 | isso | that 57 | ela | she 58 | entre | between 59 | | era from SER 60 | depois | after 61 | sem | without 62 | mesmo | same 63 | aos | a + os 64 | | ter from TER 65 | seus | his 66 | quem | whom 67 | nas | em + as 68 | me | me 69 | esse | that 70 | eles | they 71 | | estão from EST 72 | você | you 73 | | tinha from TER 74 | | foram from SER 75 | essa | that 76 | num | em + um 77 | nem | nor 78 | suas | her 79 | meu | my 80 | às | a + as 81 | minha | my 82 | | têm from TER 83 | numa | em + uma 84 | pelos | per + os 85 | elas | they 86 | | havia from HAV 87 | | seja from SER 88 | qual | which 89 | | será from SER 90 | nós | we 91 | | tenho from TER 92 | lhe | to him, her 93 | deles | of them 94 | essas | those 95 | esses | those 96 | pelas | per + as 97 | este | this 98 | | fosse from SER 99 | dele | of him 100 | 101 | | other words. There are many contractions such as naquele = em+aquele, 102 | | mo = me+o, but they are rare. 103 | | Indefinite article plural forms are also rare. 
104 | 105 | tu | thou 106 | te | thee 107 | vocês | you (plural) 108 | vos | you 109 | lhes | to them 110 | meus | my 111 | minhas 112 | teu | thy 113 | tua 114 | teus 115 | tuas 116 | nosso | our 117 | nossa 118 | nossos 119 | nossas 120 | 121 | dela | of her 122 | delas | of them 123 | 124 | esta | this 125 | estes | these 126 | estas | these 127 | aquele | that 128 | aquela | that 129 | aqueles | those 130 | aquelas | those 131 | isto | this 132 | aquilo | that 133 | 134 | | forms of estar, to be (not including the infinitive): 135 | estou 136 | está 137 | estamos 138 | estão 139 | estive 140 | esteve 141 | estivemos 142 | estiveram 143 | estava 144 | estávamos 145 | estavam 146 | estivera 147 | estivéramos 148 | esteja 149 | estejamos 150 | estejam 151 | estivesse 152 | estivéssemos 153 | estivessem 154 | estiver 155 | estivermos 156 | estiverem 157 | 158 | | forms of haver, to have (not including the infinitive): 159 | hei 160 | há 161 | havemos 162 | hão 163 | houve 164 | houvemos 165 | houveram 166 | houvera 167 | houvéramos 168 | haja 169 | hajamos 170 | hajam 171 | houvesse 172 | houvéssemos 173 | houvessem 174 | houver 175 | houvermos 176 | houverem 177 | houverei 178 | houverá 179 | houveremos 180 | houverão 181 | houveria 182 | houveríamos 183 | houveriam 184 | 185 | | forms of ser, to be (not including the infinitive): 186 | sou 187 | somos 188 | são 189 | era 190 | éramos 191 | eram 192 | fui 193 | foi 194 | fomos 195 | foram 196 | fora 197 | fôramos 198 | seja 199 | sejamos 200 | sejam 201 | fosse 202 | fôssemos 203 | fossem 204 | for 205 | formos 206 | forem 207 | serei 208 | será 209 | seremos 210 | serão 211 | seria 212 | seríamos 213 | seriam 214 | 215 | | forms of ter, to have (not including the infinitive): 216 | tenho 217 | tem 218 | temos 219 | tém 220 | tinha 221 | tínhamos 222 | tinham 223 | tive 224 | teve 225 | tivemos 226 | tiveram 227 | tivera 228 | tivéramos 229 | tenha 230 | tenhamos 231 | tenham 232 | tivesse 233 | tivéssemos 234 | 
tivessem 235 | tiver 236 | tivermos 237 | tiverem 238 | terei 239 | terá 240 | teremos 241 | terão 242 | teria 243 | teríamos 244 | teriam 245 | -------------------------------------------------------------------------------- /libstemmer_c/README: -------------------------------------------------------------------------------- 1 | libstemmer_c 2 | ============ 3 | 4 | This document pertains to the C version of the libstemmer distribution, 5 | available for download from: 6 | 7 | http://snowball.tartarus.org/dist/libstemmer_c.tgz 8 | 9 | 10 | Compiling the library 11 | ===================== 12 | 13 | A simple makefile is provided for Unix style systems. On such systems, it 14 | should be possible simply to run "make", and the file "libstemmer.o" 15 | and the example program "stemwords" will be generated. 16 | 17 | If this doesn't work on your system, you need to write your own build 18 | system (or call the compiler directly). The files to compile are 19 | all contained in the "libstemmer", "runtime" and "src_c" directories, 20 | and the public header file is contained in the "include" directory. 21 | 22 | The library comes in two flavours; UTF-8 only, and UTF-8 plus other character 23 | sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of 24 | "libstemmer.c". 25 | 26 | For convenience "mkinc.mak" is a makefile fragment listing the source files and 27 | header files used to compile the standard version of the library. 28 | "mkinc_utf8.mak" is a comparable makefile fragment listing just the source 29 | files for the UTF-8 only version of the library. 30 | 31 | 32 | Using the library 33 | ================= 34 | 35 | The library provides a simple C API. Essentially, a new stemmer can 36 | be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then 37 | used to stem a word, "sb_stemmer_length" returns the stemmed 38 | length of the last word processed, and "sb_stemmer_delete" is 39 | used to delete a stemmer. 
40 | 41 | Creating a stemmer is a relatively expensive operation - the expected 42 | usage pattern is that a new stemmer is created when needed, used 43 | to stem many words, and deleted after some time. 44 | 45 | Stemmers are re-entrant, but not threadsafe. In other words, if 46 | you wish to access the same stemmer object from multiple threads, 47 | you must ensure that all access is protected by a mutex or similar 48 | device. 49 | 50 | libstemmer does not currently incorporate any mechanism for caching the results 51 | of stemming operations. Such caching can greatly increase the performance of a 52 | stemmer under certain situations, so suitable patches will be considered for 53 | inclusion. 54 | 55 | The standard libstemmer sources contain an algorithm for each of the supported 56 | languages. The algorithm may be selected using the english name of the 57 | language, or using the 2 or 3 letter ISO 639 language codes. In addition, 58 | the traditional "Porter" stemming algorithm for english is included for 59 | backwards compatibility purposes, but we recommend use of the "English" 60 | stemmer in preference for new projects. 61 | 62 | (Some minor algorithms which are included only as curiosities in the snowball 63 | website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not 64 | included in the standard libstemmer sources. These are not really supported by 65 | the snowball project, but it would be possible to compile a modified libstemmer 66 | library containing these if desired.) 67 | 68 | 69 | The stemwords example 70 | ===================== 71 | 72 | The stemwords example program allows you to run any of the stemmers 73 | compiled into the libstemmer library on a sample vocabulary. For 74 | details on how to use it, run it with the "-h" command line option. 
75 | 76 | 77 | Using the library in a larger system 78 | ==================================== 79 | 80 | If you are incorporating the library into the build system of a larger 81 | program, I recommend copying the unpacked tarball without modification into 82 | a subdirectory of the sources of your program. Future versions of the 83 | library are intended to keep the same structure, so this will keep the 84 | work required to move to a new version of the library to a minimum. 85 | 86 | As an additional convenience, the list of source and header files used 87 | in the library is detailed in mkinc.mak - a file which is in a suitable 88 | format for inclusion by a Makefile. By including this file in your build 89 | system, you can link the snowball system into your program with a few 90 | extra rules. 91 | 92 | Using the library in a system using GNU autotools 93 | ================================================= 94 | 95 | The libstemmer_c library can be integrated into a larger system which uses the 96 | GNU autotool framework (and in particular, automake and autoconf) as follows: 97 | 98 | 1) Unpack libstemmer_c.tgz in the top level project directory so that there is 99 | a libstemmer_c subdirectory of the top level directory of the project. 100 | 101 | 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: 102 | 103 | noinst_LTLIBRARIES = libstemmer.la 104 | include $(srcdir)/mkinc.mak 105 | noinst_HEADERS = $(snowball_headers) 106 | libstemmer_la_SOURCES = $(snowball_sources) 107 | 108 | (You may also need to add other lines to this, for example, if you are using 109 | compiler options which are not compatible with compiling the libstemmer 110 | library.) 111 | 112 | 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's 113 | configure.ac file. 
114 | 115 | 4) Add to the top level makefile the following lines (or modify existing 116 | assignments to these variables appropriately): 117 | 118 | AUTOMAKE_OPTIONS = subdir-objects 119 | AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include 120 | SUBDIRS=libstemmer_c 121 | <name>_LIBADD = libstemmer_c/libstemmer.la 122 | 123 | (Where <name> is the name of the library or executable which links against 124 | libstemmer.) 125 | 126 | -------------------------------------------------------------------------------- /stopwords/de.txt: -------------------------------------------------------------------------------- 1 | | A German stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | | The number of forms in this list is reduced significantly by passing it 5 | | through the German stemmer. 6 | 7 | 8 | aber | but 9 | 10 | alle | all 11 | allem 12 | allen 13 | aller 14 | alles 15 | 16 | als | than, as 17 | also | so 18 | am | an + dem 19 | an | at 20 | 21 | ander | other 22 | andere 23 | anderem 24 | anderen 25 | anderer 26 | anderes 27 | anderm 28 | andern 29 | anderr 30 | anders 31 | 32 | auch | also 33 | auf | on 34 | aus | out of 35 | bei | by 36 | bin | am 37 | bis | until 38 | bist | art 39 | da | there 40 | damit | with it 41 | dann | then 42 | 43 | der | the 44 | den 45 | des 46 | dem 47 | die 48 | das 49 | 50 | daß | that 51 | 52 | derselbe | the same 53 | derselben 54 | denselben 55 | desselben 56 | demselben 57 | dieselbe 58 | dieselben 59 | dasselbe 60 | 61 | dazu | to that 62 | 63 | dein | thy 64 | deine 65 | deinem 66 | deinen 67 | deiner 68 | deines 69 | 70 | denn | because 71 | 72 | derer | of those 73 | dessen | of him 74 | 75 | dich | thee 76 | dir | to thee 77 | du | thou 78 | 79 | dies | this 80 | diese 81 | diesem 82 | diesen 83 | dieser 84 | dieses 85 | 86 | 87 | doch | (several meanings) 88 | dort | (over) there 89 | 90 | 91 | durch | through 92 | 93 | ein | a 94 | eine 95 | einem 96 | einen 97 | einer 98 | eines 
99 | 100 | einig | some 101 | einige 102 | einigem 103 | einigen 104 | einiger 105 | einiges 106 | 107 | einmal | once 108 | 109 | er | he 110 | ihn | him 111 | ihm | to him 112 | 113 | es | it 114 | etwas | something 115 | 116 | euer | your 117 | eure 118 | eurem 119 | euren 120 | eurer 121 | eures 122 | 123 | für | for 124 | gegen | towards 125 | gewesen | p.p. of sein 126 | hab | have 127 | habe | have 128 | haben | have 129 | hat | has 130 | hatte | had 131 | hatten | had 132 | hier | here 133 | hin | there 134 | hinter | behind 135 | 136 | ich | I 137 | mich | me 138 | mir | to me 139 | 140 | 141 | ihr | you, to her 142 | ihre 143 | ihrem 144 | ihren 145 | ihrer 146 | ihres 147 | euch | to you 148 | 149 | im | in + dem 150 | in | in 151 | indem | while 152 | ins | in + das 153 | ist | is 154 | 155 | jede | each, every 156 | jedem 157 | jeden 158 | jeder 159 | jedes 160 | 161 | jene | that 162 | jenem 163 | jenen 164 | jener 165 | jenes 166 | 167 | jetzt | now 168 | kann | can 169 | 170 | kein | no 171 | keine 172 | keinem 173 | keinen 174 | keiner 175 | keines 176 | 177 | können | can 178 | könnte | could 179 | machen | do 180 | man | one 181 | 182 | manche | some, many a 183 | manchem 184 | manchen 185 | mancher 186 | manches 187 | 188 | mein | my 189 | meine 190 | meinem 191 | meinen 192 | meiner 193 | meines 194 | 195 | mit | with 196 | muss | must 197 | musste | had to 198 | nach | to(wards) 199 | nicht | not 200 | nichts | nothing 201 | noch | still, yet 202 | nun | now 203 | nur | only 204 | ob | whether 205 | oder | or 206 | ohne | without 207 | sehr | very 208 | 209 | sein | his 210 | seine 211 | seinem 212 | seinen 213 | seiner 214 | seines 215 | 216 | selbst | self 217 | sich | herself 218 | 219 | sie | they, she 220 | ihnen | to them 221 | 222 | sind | are 223 | so | so 224 | 225 | solche | such 226 | solchem 227 | solchen 228 | solcher 229 | solches 230 | 231 | soll | shall 232 | sollte | should 233 | sondern | but 234 | sonst | else 235 | über | 
over 236 | um | about, around 237 | und | and 238 | 239 | uns | us 240 | unse 241 | unsem 242 | unsen 243 | unser 244 | unses 245 | 246 | unter | under 247 | viel | much 248 | vom | von + dem 249 | von | from 250 | vor | before 251 | während | while 252 | war | was 253 | waren | were 254 | warst | wast 255 | was | what 256 | weg | away, off 257 | weil | because 258 | weiter | further 259 | 260 | welche | which 261 | welchem 262 | welchen 263 | welcher 264 | welches 265 | 266 | wenn | when 267 | werde | will 268 | werden | will 269 | wie | how 270 | wieder | again 271 | will | want 272 | wir | we 273 | wird | will 274 | wirst | willst 275 | wo | where 276 | wollen | want 277 | wollte | wanted 278 | würde | would 279 | würden | would 280 | zu | to 281 | zum | zu + dem 282 | zur | zu + der 283 | zwar | indeed 284 | zwischen | between 285 | -------------------------------------------------------------------------------- /stopwords/it.txt: -------------------------------------------------------------------------------- 1 | | An Italian stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 
3 | 4 | ad | a (to) before vowel 5 | al | a + il 6 | allo | a + lo 7 | ai | a + i 8 | agli | a + gli 9 | all | a + l' 10 | agl | a + gl' 11 | alla | a + la 12 | alle | a + le 13 | con | with 14 | col | con + il 15 | coi | con + i (forms collo, cogli etc are now very rare) 16 | da | from 17 | dal | da + il 18 | dallo | da + lo 19 | dai | da + i 20 | dagli | da + gli 21 | dall | da + l' 22 | dagl | da + gl' 23 | dalla | da + la 24 | dalle | da + le 25 | di | of 26 | del | di + il 27 | dello | di + lo 28 | dei | di + i 29 | degli | di + gli 30 | dell | di + l' 31 | degl | di + gl' 32 | della | di + la 33 | delle | di + le 34 | in | in 35 | nel | in + il 36 | nello | in + lo 37 | nei | in + i 38 | negli | in + gli 39 | nell | in + l' 40 | negl | in + gl' 41 | nella | in + la 42 | nelle | in + le 43 | su | on 44 | sul | su + il 45 | sullo | su + lo 46 | sui | su + i 47 | sugli | su + gli 48 | sull | su + l' 49 | sugl | su + gl' 50 | sulla | su + la 51 | sulle | su + le 52 | per | through, by 53 | tra | among 54 | contro | against 55 | io | I 56 | tu | thou 57 | lui | he 58 | lei | she 59 | noi | we 60 | voi | you 61 | loro | they 62 | mio | my 63 | mia | 64 | miei | 65 | mie | 66 | tuo | 67 | tua | 68 | tuoi | thy 69 | tue | 70 | suo | 71 | sua | 72 | suoi | his, her 73 | sue | 74 | nostro | our 75 | nostra | 76 | nostri | 77 | nostre | 78 | vostro | your 79 | vostra | 80 | vostri | 81 | vostre | 82 | mi | me 83 | ti | thee 84 | ci | us, there 85 | vi | you, there 86 | lo | him, the 87 | la | her, the 88 | li | them 89 | le | them, the 90 | gli | to him, the 91 | ne | from there etc 92 | il | the 93 | un | a 94 | uno | a 95 | una | a 96 | ma | but 97 | ed | and 98 | se | if 99 | perché | why, because 100 | anche | also 101 | come | how 102 | dov | where (as dov') 103 | dove | where 104 | che | who, that 105 | chi | who 106 | cui | whom 107 | non | not 108 | più | more 109 | quale | who, that 110 | quanto | how much 111 | quanti | 112 | quanta | 113 | quante | 114 | 
quello | that 115 | quelli | 116 | quella | 117 | quelle | 118 | questo | this 119 | questi | 120 | questa | 121 | queste | 122 | si | yes 123 | tutto | all 124 | tutti | all 125 | 126 | | single letter forms: 127 | 128 | a | at 129 | c | as c' for ce or ci 130 | e | and 131 | i | the 132 | l | as l' 133 | o | or 134 | 135 | | forms of avere, to have (not including the infinitive): 136 | 137 | ho 138 | hai 139 | ha 140 | abbiamo 141 | avete 142 | hanno 143 | abbia 144 | abbiate 145 | abbiano 146 | avrò 147 | avrai 148 | avrà 149 | avremo 150 | avrete 151 | avranno 152 | avrei 153 | avresti 154 | avrebbe 155 | avremmo 156 | avreste 157 | avrebbero 158 | avevo 159 | avevi 160 | aveva 161 | avevamo 162 | avevate 163 | avevano 164 | ebbi 165 | avesti 166 | ebbe 167 | avemmo 168 | aveste 169 | ebbero 170 | avessi 171 | avesse 172 | avessimo 173 | avessero 174 | avendo 175 | avuto 176 | avuta 177 | avuti 178 | avute 179 | 180 | | forms of essere, to be (not including the infinitive): 181 | sono 182 | sei 183 | è 184 | siamo 185 | siete 186 | sia 187 | siate 188 | siano 189 | sarò 190 | sarai 191 | sarà 192 | saremo 193 | sarete 194 | saranno 195 | sarei 196 | saresti 197 | sarebbe 198 | saremmo 199 | sareste 200 | sarebbero 201 | ero 202 | eri 203 | era 204 | eravamo 205 | eravate 206 | erano 207 | fui 208 | fosti 209 | fu 210 | fummo 211 | foste 212 | furono 213 | fossi 214 | fosse 215 | fossimo 216 | fossero 217 | essendo 218 | 219 | | forms of fare, to do (not including the infinitive, fa, fat-): 220 | faccio 221 | fai 222 | facciamo 223 | fanno 224 | faccia 225 | facciate 226 | facciano 227 | farò 228 | farai 229 | farà 230 | faremo 231 | farete 232 | faranno 233 | farei 234 | faresti 235 | farebbe 236 | faremmo 237 | fareste 238 | farebbero 239 | facevo 240 | facevi 241 | faceva 242 | facevamo 243 | facevate 244 | facevano 245 | feci 246 | facesti 247 | fece 248 | facemmo 249 | faceste 250 | fecero 251 | facessi 252 | facesse 253 | facessimo 254 | facessero 255 | 
facendo 256 | 257 | | forms of stare, to be (not including the infinitive): 258 | sto 259 | stai 260 | sta 261 | stiamo 262 | stanno 263 | stia 264 | stiate 265 | stiano 266 | starò 267 | starai 268 | starà 269 | staremo 270 | starete 271 | staranno 272 | starei 273 | staresti 274 | starebbe 275 | staremmo 276 | stareste 277 | starebbero 278 | stavo 279 | stavi 280 | stava 281 | stavamo 282 | stavate 283 | stavano 284 | stetti 285 | stesti 286 | stette 287 | stemmo 288 | steste 289 | stettero 290 | stessi 291 | stesse 292 | stessimo 293 | stessero 294 | stando 295 | -------------------------------------------------------------------------------- /stopwords/en.txt: -------------------------------------------------------------------------------- 1 | | An English stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | | Many of the forms below are quite rare (e.g. "yourselves") but included for 5 | | completeness. 6 | 7 | | PRONOUNS FORMS 8 | | 1st person sing 9 | 10 | i | subject, always in upper case of course 11 | 12 | me | object 13 | my | possessive adjective 14 | | the possessive pronoun `mine' is best suppressed, because of the 15 | | sense of coal-mine etc. 16 | myself | reflexive 17 | | 1st person plural 18 | we | subject 19 | 20 | | us | object 21 | | care is required here because US = United States. It is usually 22 | | safe to remove it if it is in lower case. 
23 | our | possessive adjective 24 | ours | possessive pronoun 25 | ourselves | reflexive 26 | | second person (archaic `thou' forms not included) 27 | you | subject and object 28 | your | possessive adjective 29 | yours | possessive pronoun 30 | yourself | reflexive (singular) 31 | yourselves | reflexive (plural) 32 | | third person singular 33 | he | subject 34 | him | object 35 | his | possessive adjective and pronoun 36 | himself | reflexive 37 | 38 | she | subject 39 | her | object and possessive adjective 40 | hers | possessive pronoun 41 | herself | reflexive 42 | 43 | it | subject and object 44 | its | possessive adjective 45 | itself | reflexive 46 | | third person plural 47 | they | subject 48 | them | object 49 | their | possessive adjective 50 | theirs | possessive pronoun 51 | themselves | reflexive 52 | | other forms (demonstratives, interrogatives) 53 | what 54 | which 55 | who 56 | whom 57 | this 58 | that 59 | these 60 | those 61 | 62 | | VERB FORMS (using F.R. Palmer's nomenclature) 63 | | BE 64 | am | 1st person, present 65 | is | -s form (3rd person, present) 66 | are | present 67 | was | 1st person, past 68 | were | past 69 | be | infinitive 70 | been | past participle 71 | being | -ing form 72 | | HAVE 73 | have | simple 74 | has | -s form 75 | had | past 76 | having | -ing form 77 | | DO 78 | do | simple 79 | does | -s form 80 | did | past 81 | doing | -ing form 82 | 83 | | The forms below are, I believe, best omitted, because of the significant 84 | | homonym forms: 85 | 86 | | He made a WILL 87 | | old tin CAN 88 | | merry month of MAY 89 | | a smell of MUST 90 | | fight the good fight with all thy MIGHT 91 | 92 | | would, could, should, ought might however be included 93 | 94 | | | AUXILIARIES 95 | | | WILL 96 | |will 97 | 98 | would 99 | 100 | | | SHALL 101 | |shall 102 | 103 | should 104 | 105 | | | CAN 106 | |can 107 | 108 | could 109 | 110 | | | MAY 111 | |may 112 | |might 113 | | | MUST 114 | |must 115 | | | OUGHT 116 | 117 | ought 
118 | 119 | | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing 120 | | pronoun + verb 121 | 122 | i'm 123 | you're 124 | he's 125 | she's 126 | it's 127 | we're 128 | they're 129 | i've 130 | you've 131 | we've 132 | they've 133 | i'd 134 | you'd 135 | he'd 136 | she'd 137 | we'd 138 | they'd 139 | i'll 140 | you'll 141 | he'll 142 | she'll 143 | we'll 144 | they'll 145 | 146 | | verb + negation 147 | 148 | isn't 149 | aren't 150 | wasn't 151 | weren't 152 | hasn't 153 | haven't 154 | hadn't 155 | doesn't 156 | don't 157 | didn't 158 | 159 | | auxiliary + negation 160 | 161 | won't 162 | wouldn't 163 | shan't 164 | shouldn't 165 | can't 166 | cannot 167 | couldn't 168 | mustn't 169 | 170 | | miscellaneous forms 171 | 172 | let's 173 | that's 174 | who's 175 | what's 176 | here's 177 | there's 178 | when's 179 | where's 180 | why's 181 | how's 182 | 183 | | rarer forms 184 | 185 | | daren't needn't 186 | 187 | | doubtful forms 188 | 189 | | oughtn't mightn't 190 | 191 | | ARTICLES 192 | a 193 | an 194 | the 195 | 196 | | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so 197 | | high, that classification is pointless.) 
198 | and 199 | but 200 | if 201 | or 202 | because 203 | as 204 | until 205 | while 206 | 207 | of 208 | at 209 | by 210 | for 211 | with 212 | about 213 | against 214 | between 215 | into 216 | through 217 | during 218 | before 219 | after 220 | above 221 | below 222 | to 223 | from 224 | up 225 | down 226 | in 227 | out 228 | on 229 | off 230 | over 231 | under 232 | 233 | again 234 | further 235 | then 236 | once 237 | 238 | here 239 | there 240 | when 241 | where 242 | why 243 | how 244 | 245 | all 246 | any 247 | both 248 | each 249 | few 250 | more 251 | most 252 | other 253 | some 254 | such 255 | 256 | no 257 | nor 258 | not 259 | only 260 | own 261 | same 262 | so 263 | than 264 | too 265 | very 266 | 267 | | Just for the record, the following words are among the commonest in English 268 | 269 | | one 270 | | every 271 | | least 272 | | less 273 | | many 274 | | now 275 | | ever 276 | | never 277 | | say 278 | | says 279 | | said 280 | | also 281 | | get 282 | | go 283 | | goes 284 | | just 285 | | made 286 | | make 287 | | put 288 | | see 289 | | seen 290 | | whether 291 | | like 292 | | well 293 | | back 294 | | even 295 | | still 296 | | way 297 | | take 298 | | since 299 | | another 300 | | however 301 | | two 302 | | three 303 | | four 304 | | five 305 | | first 306 | | second 307 | | new 308 | | old 309 | | high 310 | | long 311 | -------------------------------------------------------------------------------- /libstemmer_c/examples/stemwords.c: -------------------------------------------------------------------------------- 1 | /* This is a simple program which uses libstemmer to provide a command 2 | * line interface for stemming using any of the algorithms provided. 
3 | */ 4 | 5 | #include 6 | #include /* for malloc, free */ 7 | #include /* for memmove */ 8 | #include /* for isupper, tolower */ 9 | 10 | #include "libstemmer.h" 11 | 12 | const char * progname; 13 | static int pretty = 1; 14 | 15 | static void 16 | stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) 17 | { 18 | #define INC 10 19 | int lim = INC; 20 | sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); 21 | 22 | while(1) { 23 | int ch = getc(f_in); 24 | if (ch == EOF) { 25 | free(b); return; 26 | } 27 | { 28 | int i = 0; 29 | int inlen = 0; 30 | while(1) { 31 | if (ch == '\n' || ch == EOF) break; 32 | if (i == lim) { 33 | sb_symbol * newb; 34 | newb = (sb_symbol *) 35 | realloc(b, (lim + INC) * sizeof(sb_symbol)); 36 | if (newb == 0) goto error; 37 | b = newb; 38 | lim = lim + INC; 39 | } 40 | /* Update count of utf-8 characters. */ 41 | if (ch < 0x80 || ch > 0xBF) inlen += 1; 42 | /* force lower case: */ 43 | if (isupper(ch)) ch = tolower(ch); 44 | 45 | b[i] = ch; 46 | i++; 47 | ch = getc(f_in); 48 | } 49 | 50 | { 51 | const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); 52 | if (stemmed == NULL) 53 | { 54 | fprintf(stderr, "Out of memory"); 55 | exit(1); 56 | } 57 | else 58 | { 59 | if (pretty == 1) { 60 | fwrite(b, i, 1, f_out); 61 | fputs(" -> ", f_out); 62 | } else if (pretty == 2) { 63 | fwrite(b, i, 1, f_out); 64 | if (sb_stemmer_length(stemmer) > 0) { 65 | int j; 66 | if (inlen < 30) { 67 | for (j = 30 - inlen; j > 0; j--) 68 | fputs(" ", f_out); 69 | } else { 70 | fputs("\n", f_out); 71 | for (j = 30; j > 0; j--) 72 | fputs(" ", f_out); 73 | } 74 | } 75 | } 76 | 77 | fputs((char *)stemmed, f_out); 78 | putc('\n', f_out); 79 | } 80 | } 81 | } 82 | } 83 | error: 84 | if (b != 0) free(b); 85 | return; 86 | } 87 | 88 | /** Display the command line syntax, and then exit. 89 | * @param n The value to exit with. 
90 | */ 91 | static void 92 | usage(int n) 93 | { 94 | printf("usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h]\n" 95 | "\n" 96 | "The input file consists of a list of words to be stemmed, one per\n" 97 | "line. Words should be in lower case, but (for English) A-Z letters\n" 98 | "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" 99 | "used.\n" 100 | "\n" 101 | "If -c is given, the argument is the character encoding of the input\n" 102 | "and output files. If it is omitted, the UTF-8 encoding is used.\n" 103 | "\n" 104 | "If -p is given the output file consists of each word of the input\n" 105 | "file followed by \"->\" followed by its stemmed equivalent.\n" 106 | "If -p2 is given the output file is a two column layout containing\n" 107 | "the input words in the first column and the stemmed eqivalents in\n" 108 | "the second column.\n" 109 | "Otherwise, the output file consists of the stemmed words, one per\n" 110 | "line.\n" 111 | "\n" 112 | "-h displays this help\n", 113 | progname); 114 | exit(n); 115 | } 116 | 117 | int 118 | main(int argc, char * argv[]) 119 | { 120 | char * in = 0; 121 | char * out = 0; 122 | FILE * f_in; 123 | FILE * f_out; 124 | struct sb_stemmer * stemmer; 125 | 126 | char * language = "english"; 127 | char * charenc = NULL; 128 | 129 | char * s; 130 | int i = 1; 131 | pretty = 0; 132 | 133 | progname = argv[0]; 134 | 135 | while(i < argc) { 136 | s = argv[i++]; 137 | if (s[0] == '-') { 138 | if (strcmp(s, "-o") == 0) { 139 | if (i >= argc) { 140 | fprintf(stderr, "%s requires an argument\n", s); 141 | exit(1); 142 | } 143 | out = argv[i++]; 144 | } else if (strcmp(s, "-i") == 0) { 145 | if (i >= argc) { 146 | fprintf(stderr, "%s requires an argument\n", s); 147 | exit(1); 148 | } 149 | in = argv[i++]; 150 | } else if (strcmp(s, "-l") == 0) { 151 | if (i >= argc) { 152 | fprintf(stderr, "%s requires an argument\n", s); 153 | exit(1); 154 | } 155 | language = argv[i++]; 156 | } else if (strcmp(s, "-c") == 0) { 157 | if (i >= 
argc) { 158 | fprintf(stderr, "%s requires an argument\n", s); 159 | exit(1); 160 | } 161 | charenc = argv[i++]; 162 | } else if (strcmp(s, "-p2") == 0) { 163 | pretty = 2; 164 | } else if (strcmp(s, "-p") == 0) { 165 | pretty = 1; 166 | } else if (strcmp(s, "-h") == 0) { 167 | usage(0); 168 | } else { 169 | fprintf(stderr, "option %s unknown\n", s); 170 | usage(1); 171 | } 172 | } else { 173 | fprintf(stderr, "unexpected parameter %s\n", s); 174 | usage(1); 175 | } 176 | } 177 | 178 | /* prepare the files */ 179 | f_in = (in == 0) ? stdin : fopen(in, "r"); 180 | if (f_in == 0) { 181 | fprintf(stderr, "file %s not found\n", in); 182 | exit(1); 183 | } 184 | f_out = (out == 0) ? stdout : fopen(out, "w"); 185 | if (f_out == 0) { 186 | fprintf(stderr, "file %s cannot be opened\n", out); 187 | exit(1); 188 | } 189 | 190 | /* do the stemming process: */ 191 | stemmer = sb_stemmer_new(language, charenc); 192 | if (stemmer == 0) { 193 | if (charenc == NULL) { 194 | fprintf(stderr, "language `%s' not available for stemming\n", language); 195 | exit(1); 196 | } else { 197 | fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); 198 | exit(1); 199 | } 200 | } 201 | stem_file(stemmer, f_in, f_out); 202 | sb_stemmer_delete(stemmer); 203 | 204 | if (in != 0) (void) fclose(f_in); 205 | if (out != 0) (void) fclose(f_out); 206 | 207 | return 0; 208 | } 209 | 210 | -------------------------------------------------------------------------------- /cl-libstemmer.lisp: -------------------------------------------------------------------------------- 1 | ;;;; cl-libstemmer.lisp 2 | 3 | (in-package #:cl-libstemmer) 4 | 5 | ;;; "cl-libstemmer" goes here. Hacks and glory await! 6 | 7 | (def libstemmer-path 8 | (asdf:system-relative-pathname :cl-libstemmer "libstemmer_c/") 9 | "Where are the libstemmer sources?") 10 | 11 | (defun wordp (x) 12 | "Sanity check for word-ness." 
(and (stringp x) (< (length x) 500)))

(deftype word ()
  "A string that is a word."
  '(and string (satisfies wordp)))

(cffi:define-foreign-library libstemmer
  (t (:default "libstemmer")))

(defun use-libstemmer ()
  "Try to load libstemmer; build it if possible.

If the first load attempt signals an error, run `make libstemmer.so' in
`libstemmer-path' and then attempt the load a second time.  Signals an
error if the build does not exit with status 0."
  (flet ((load-library ()
           (let ((cffi:*foreign-library-directories* (list libstemmer-path)))
             (cffi:use-foreign-library libstemmer))))
    (handler-case (load-library)
      (error ()
        (format t "~&Building libstemmer.so...~%")
        ;; The exit code is the THIRD value of `uiop:run-program'; the
        ;; first is the captured output.  :ignore-error-status lets us
        ;; report a failed build with our own message.
        (let ((status (nth-value 2
                                 (uiop:run-program
                                  '("make" "libstemmer.so")
                                  :directory libstemmer-path
                                  :ignore-error-status t))))
          (unless (eql status 0)
            (error "Could not build libstemmer.so.")))
        ;; Building is not enough: the freshly built library still has
        ;; to be loaded.
        (load-library)))))

(use-libstemmer)

(defparameter *default-encoding* :utf-8
  "The default encoding for use with stemmers.")

(def encodings
  '((:iso-8859-1 . "ISO_8859_1")
    (:iso-8859-2 . "ISO_8859_2")
    (:koi8-r . "KOI8_R")
    (:utf-8 . "UTF_8"))
  "Map keywords to encoding names libstemmer understands.")

(defun encoding->string (encoding)
  "Look up ENCODING in `encodings'."
  (assocdr encoding encodings))

(defstruct (%stemmer (:constructor %make-stemmer (pointer &aux (deleted nil))))
  "Wrap a C stemmer so we can track if it has been closed."
  (pointer (error "No pointer") :type cffi:foreign-pointer :read-only t)
  (deleted nil :type boolean))

(defun %close-stemmer (s)
  "Delete the C stemmer wrapped by S unless it is already marked deleted.
Returns S when this call performed the deletion, NIL otherwise."
  (unless (%stemmer-deleted s)
    (setf (%stemmer-deleted s) t)
    (sb_stemmer_delete (%stemmer-pointer s))
    s))

(defclass stemmer (synchronized)
  ((%stemmer :initarg :stemmer :type %stemmer :accessor %stemmer)
   (language :initarg :language
             :accessor language-of
             :accessor stemmer-language)
   (encoding :initarg :encoding
             :accessor encoding-of
             :accessor stemmer-encoding))
  (:documentation "Lisp wrapper for a C stemmer."))

(defmethod closed? ((self stemmer))
  "True if the C stemmer behind SELF has been deleted."
  (%stemmer-deleted (%stemmer self)))

(defmethod print-object ((self stemmer) stream)
  "Print SELF unreadably as its type followed by language/encoding."
  (print-unreadable-object (self stream :type t)
    (with-slots (language encoding) self
      ;; Write to STREAM, not to *standard-output*.
      (format stream "~a/~a" language encoding))))

(defcondition no-such-stemmer (error)
  ((language :initarg :language :accessor no-such-stemmer-language)
   (encoding :initarg :encoding :accessor no-such-stemmer-encoding))
  (:documentation "Error for an unsupported language/encoding combination.")
  (:report (lambda (c s)
             (with-slots (language encoding) c
               (format s "No such language/encoding pair: ~a/~a"
                       language encoding)))))

(defun check-open (stemmer)
  "Make sure STEMMER is not closed."
  (when (closed? stemmer)
    (error "~a is closed" stemmer)))

(defmethod initialize-instance :after ((self stemmer) &key language encoding)
  "Set up the C stemmer.
Signals `no-such-stemmer' when ENCODING is not listed in `encodings' or
when libstemmer returns a null pointer for the LANGUAGE/ENCODING pair."
  (let ((encoding-name (encoding->string encoding)))
    ;; An encoding keyword missing from `encodings' cannot name a stemmer;
    ;; report it as such rather than passing NIL on to CFFI.
    (unless encoding-name
      (error 'no-such-stemmer
             :language language
             :encoding encoding))
    (let ((ptr (cffi:with-foreign-strings ((c-language language)
                                           (c-encoding encoding-name))
                 (sb_stemmer_new c-language c-encoding))))
      (when (cffi:null-pointer-p ptr)
        (error 'no-such-stemmer
               :language language
               :encoding encoding))
      (let ((stemmer (%make-stemmer ptr)))
        (setf (%stemmer self) stemmer)
        ;; The finalizer refers only to the wrapper struct, never to SELF.
        (tg:finalize self
                     (lambda ()
                       (%close-stemmer stemmer)))))))

(defun close-stemmer (stemmer)
  "Close STEMMER and free the C-side stemmer."
  (synchronized (stemmer)
    (%close-stemmer (%stemmer stemmer))))

(defun load-stemmer (language &optional encoding)
  "Load a stemmer for LANGUAGE and ENCODING (which defaults to
`*default-encoding*`)."
  (make 'stemmer
        :language (string-downcase language)
        :encoding (or encoding *default-encoding*)))

(defmacro with-stemmer ((var language &optional encoding)
                        &body body)
  "Run BODY with VAR bound to a stemmer for LANGUAGE and ENCODING.
Expands into a call to `call/stemmer', which closes the stemmer on exit."
  (with-thunk (body var)
    `(call/stemmer ,language ,encoding #',body)))

(defun call/stemmer (lang enc fn)
  "Call FN on a freshly loaded stemmer for LANG and ENC.
The stemmer is closed when FN returns or unwinds."
  (let ((stemmer (load-stemmer lang enc)))
    (unwind-protect
         (funcall fn stemmer)
      (close-stemmer stemmer))))

;;; NB
;;; /** Stem a word.
;;;  *
;;;  *  The return value is owned by the stemmer - it must not be freed or
;;;  *  modified, and it will become invalid when the stemmer is called again,
;;;  *  or if the stemmer is freed.
;;;  *
;;;  *  The length of the return value can be obtained using sb_stemmer_length().
;;;  *
;;;  *  If an out-of-memory error occurs, this will return NULL.
;;;  */

(defun stem-word/no-lock (stemmer word)
  "Stem WORD with STEMMER without taking STEMMER's lock.
Anything that is not of type `word' is returned unchanged, as is a WORD
that `stop-word-p' reports as a stop word for the stemmer's language.
Otherwise the result of sb_stemmer_stem is converted to a Lisp string."
  (typecase word
    (word
     (when (stop-word-p word (stemmer-language stemmer))
       (return-from stem-word/no-lock word))
     (let ((encoding (encoding-of stemmer)))
       (cffi:with-foreign-string (fw word :encoding encoding)
         ;; TODO Octets by encoding.
         (let ((ptr (sb_stemmer_stem (%stemmer-pointer (%stemmer stemmer))
                                     fw
                                     (cffi::foreign-string-length fw :encoding encoding))))
           ;; Convert right away: the C buffer is owned by the stemmer
           ;; (see the NB comment above).
           (cffi:foreign-string-to-lisp ptr :encoding encoding)))))
    (t word)))

(defun stem (stemmer word)
  "Stem WORD with STEMMER while holding STEMMER's lock.
Signals an error if STEMMER has been closed."
  (synchronized (stemmer)
    ;; Check under the lock so the stemmer cannot be closed by another
    ;; thread between the check and the call into C.
    (check-open stemmer)
    (stem-word/no-lock stemmer word)))

(defun stem-all (list language &optional encoding)
  "Stem every item of LIST with a temporary stemmer for LANGUAGE and ENCODING.
Returns the list of results and T.  If no such stemmer exists, returns
LIST unchanged and NIL."
  (handler-case
      (values
       (with-stemmer (s language encoding)
         (synchronized (s)
           (loop for item in list
                 collect (stem-word/no-lock s item))))
       t)
    (no-such-stemmer ()
      (values list nil))))

-------------------------------------------------------------------------------- /stopwords/es.txt: -------------------------------------------------------------------------------- 1 | | A Spanish stop word list. Comments begin with vertical bar. Each stop 2 | | word is at the start of a line. 3 | 4 | 5 | | The following is a ranked list (commonest to rarest) of stopwords 6 | | deriving from a large sample of text. 7 | 8 | | Extra words have been added at the end. 
9 | 10 | de | from, of 11 | la | the, her 12 | que | who, that 13 | el | the 14 | en | in 15 | y | and 16 | a | to 17 | los | the, them 18 | del | de + el 19 | se | himself, from him etc 20 | las | the, them 21 | por | for, by, etc 22 | un | a 23 | para | for 24 | con | with 25 | no | no 26 | una | a 27 | su | his, her 28 | al | a + el 29 | | es from SER 30 | lo | him 31 | como | how 32 | más | more 33 | pero | but 34 | sus | su plural 35 | le | to him, her 36 | ya | already 37 | o | or 38 | | fue from SER 39 | este | this 40 | | ha from HABER 41 | sí | himself etc 42 | porque | because 43 | esta | this 44 | | son from SER 45 | entre | between 46 | | está from ESTAR 47 | cuando | when 48 | muy | very 49 | sin | without 50 | sobre | on 51 | | ser from SER 52 | | tiene from TENER 53 | también | also 54 | me | me 55 | hasta | until 56 | hay | there is/are 57 | donde | where 58 | | han from HABER 59 | quien | whom, that 60 | | están from ESTAR 61 | | estado from ESTAR 62 | desde | from 63 | todo | all 64 | nos | us 65 | durante | during 66 | | estados from ESTAR 67 | todos | all 68 | uno | a 69 | les | to them 70 | ni | nor 71 | contra | against 72 | otros | other 73 | | fueron from SER 74 | ese | that 75 | eso | that 76 | | había from HABER 77 | ante | before 78 | ellos | they 79 | e | and (variant of y) 80 | esto | this 81 | mí | me 82 | antes | before 83 | algunos | some 84 | qué | what? 
85 | unos | a 86 | yo | I 87 | otro | other 88 | otras | other 89 | otra | other 90 | él | he 91 | tanto | so much, many 92 | esa | that 93 | estos | these 94 | mucho | much, many 95 | quienes | who 96 | nada | nothing 97 | muchos | many 98 | cual | who 99 | | sea from SER 100 | poco | few 101 | ella | she 102 | estar | to be 103 | | haber from HABER 104 | estas | these 105 | | estaba from ESTAR 106 | | estamos from ESTAR 107 | algunas | some 108 | algo | something 109 | nosotros | we 110 | 111 | | other forms 112 | 113 | mi | me 114 | mis | mi plural 115 | tú | thou 116 | te | thee 117 | ti | thee 118 | tu | thy 119 | tus | tu plural 120 | ellas | they 121 | nosotras | we 122 | vosotros | you 123 | vosotras | you 124 | os | you 125 | mío | mine 126 | mía | 127 | míos | 128 | mías | 129 | tuyo | thine 130 | tuya | 131 | tuyos | 132 | tuyas | 133 | suyo | his, hers, theirs 134 | suya | 135 | suyos | 136 | suyas | 137 | nuestro | ours 138 | nuestra | 139 | nuestros | 140 | nuestras | 141 | vuestro | yours 142 | vuestra | 143 | vuestros | 144 | vuestras | 145 | esos | those 146 | esas | those 147 | 148 | | forms of estar, to be (not including the infinitive): 149 | estoy 150 | estás 151 | está 152 | estamos 153 | estáis 154 | están 155 | esté 156 | estés 157 | estemos 158 | estéis 159 | estén 160 | estaré 161 | estarás 162 | estará 163 | estaremos 164 | estaréis 165 | estarán 166 | estaría 167 | estarías 168 | estaríamos 169 | estaríais 170 | estarían 171 | estaba 172 | estabas 173 | estábamos 174 | estabais 175 | estaban 176 | estuve 177 | estuviste 178 | estuvo 179 | estuvimos 180 | estuvisteis 181 | estuvieron 182 | estuviera 183 | estuvieras 184 | estuviéramos 185 | estuvierais 186 | estuvieran 187 | estuviese 188 | estuvieses 189 | estuviésemos 190 | estuvieseis 191 | estuviesen 192 | estando 193 | estado 194 | estada 195 | estados 196 | estadas 197 | estad 198 | 199 | | forms of haber, to have (not including the infinitive): 200 | he 201 | has 202 | ha 203 | 
hemos 204 | habéis 205 | han 206 | haya 207 | hayas 208 | hayamos 209 | hayáis 210 | hayan 211 | habré 212 | habrás 213 | habrá 214 | habremos 215 | habréis 216 | habrán 217 | habría 218 | habrías 219 | habríamos 220 | habríais 221 | habrían 222 | había 223 | habías 224 | habíamos 225 | habíais 226 | habían 227 | hube 228 | hubiste 229 | hubo 230 | hubimos 231 | hubisteis 232 | hubieron 233 | hubiera 234 | hubieras 235 | hubiéramos 236 | hubierais 237 | hubieran 238 | hubiese 239 | hubieses 240 | hubiésemos 241 | hubieseis 242 | hubiesen 243 | habiendo 244 | habido 245 | habida 246 | habidos 247 | habidas 248 | 249 | | forms of ser, to be (not including the infinitive): 250 | soy 251 | eres 252 | es 253 | somos 254 | sois 255 | son 256 | sea 257 | seas 258 | seamos 259 | seáis 260 | sean 261 | seré 262 | serás 263 | será 264 | seremos 265 | seréis 266 | serán 267 | sería 268 | serías 269 | seríamos 270 | seríais 271 | serían 272 | era 273 | eras 274 | éramos 275 | erais 276 | eran 277 | fui 278 | fuiste 279 | fue 280 | fuimos 281 | fuisteis 282 | fueron 283 | fuera 284 | fueras 285 | fuéramos 286 | fuerais 287 | fueran 288 | fuese 289 | fueses 290 | fuésemos 291 | fueseis 292 | fuesen 293 | siendo 294 | sido 295 | | sed also means 'thirst' 296 | 297 | | forms of tener, to have (not including the infinitive): 298 | tengo 299 | tienes 300 | tiene 301 | tenemos 302 | tenéis 303 | tienen 304 | tenga 305 | tengas 306 | tengamos 307 | tengáis 308 | tengan 309 | tendré 310 | tendrás 311 | tendrá 312 | tendremos 313 | tendréis 314 | tendrán 315 | tendría 316 | tendrías 317 | tendríamos 318 | tendríais 319 | tendrían 320 | tenía 321 | tenías 322 | teníamos 323 | teníais 324 | tenían 325 | tuve 326 | tuviste 327 | tuvo 328 | tuvimos 329 | tuvisteis 330 | tuvieron 331 | tuviera 332 | tuvieras 333 | tuviéramos 334 | tuvierais 335 | tuvieran 336 | tuviese 337 | tuvieses 338 | tuviésemos 339 | tuvieseis 340 | tuviesen 341 | teniendo 342 | tenido 343 | tenida 344 | tenidos 345 | 
tenidas 346 | tened 347 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules_utf8.h: -------------------------------------------------------------------------------- 1 | /* libstemmer/modules_utf8.h: List of stemming modules. 2 | * 3 | * This file is generated by mkmodules.pl from a list of module names. 4 | * Do not edit manually. 5 | * 6 | * Modules included by this file are: danish, dutch, english, finnish, french, 7 | * german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | * russian, spanish, swedish, turkish 9 | */ 10 | 11 | #include "../src_c/stem_UTF_8_danish.h" 12 | #include "../src_c/stem_UTF_8_dutch.h" 13 | #include "../src_c/stem_UTF_8_english.h" 14 | #include "../src_c/stem_UTF_8_finnish.h" 15 | #include "../src_c/stem_UTF_8_french.h" 16 | #include "../src_c/stem_UTF_8_german.h" 17 | #include "../src_c/stem_UTF_8_hungarian.h" 18 | #include "../src_c/stem_UTF_8_italian.h" 19 | #include "../src_c/stem_UTF_8_norwegian.h" 20 | #include "../src_c/stem_UTF_8_porter.h" 21 | #include "../src_c/stem_UTF_8_portuguese.h" 22 | #include "../src_c/stem_UTF_8_romanian.h" 23 | #include "../src_c/stem_UTF_8_russian.h" 24 | #include "../src_c/stem_UTF_8_spanish.h" 25 | #include "../src_c/stem_UTF_8_swedish.h" 26 | #include "../src_c/stem_UTF_8_turkish.h" 27 | 28 | typedef enum { 29 | ENC_UNKNOWN=0, 30 | ENC_UTF_8 31 | } stemmer_encoding_t; 32 | 33 | struct stemmer_encoding { 34 | const char * name; 35 | stemmer_encoding_t enc; 36 | }; 37 | static struct stemmer_encoding encodings[] = { 38 | {"UTF_8", ENC_UTF_8}, 39 | {0,ENC_UNKNOWN} 40 | }; 41 | 42 | struct stemmer_modules { 43 | const char * name; 44 | stemmer_encoding_t enc; 45 | struct SN_env * (*create)(void); 46 | void (*close)(struct SN_env *); 47 | int (*stem)(struct SN_env *); 48 | }; 49 | static struct stemmer_modules modules[] = { 50 | {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 51 | 
{"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 52 | {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 53 | {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 54 | {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 55 | {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 56 | {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 57 | {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 58 | {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 59 | {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 60 | {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 61 | {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 62 | {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 63 | {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 64 | {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 65 | {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 66 | {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 67 | {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 68 | {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 69 | {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 70 | {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 71 | {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 72 | 
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 73 | {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 74 | {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 75 | {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 76 | {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 77 | {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 78 | {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 79 | {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 80 | {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 81 | {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 82 | {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 83 | {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, 84 | {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 85 | {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 86 | {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 87 | {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 88 | {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 89 | {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 90 | {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 91 | {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 92 | 
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 93 | {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 94 | {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 95 | {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 96 | {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 97 | {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 98 | {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 99 | {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 100 | {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 101 | {0,ENC_UNKNOWN,0,0,0} 102 | }; 103 | static const char * algorithm_names[] = { 104 | "danish", 105 | "dutch", 106 | "english", 107 | "finnish", 108 | "french", 109 | "german", 110 | "hungarian", 111 | "italian", 112 | "norwegian", 113 | "porter", 114 | "portuguese", 115 | "romanian", 116 | "russian", 117 | "spanish", 118 | "swedish", 119 | "turkish", 120 | 0 121 | }; 122 | -------------------------------------------------------------------------------- /stopwords/ru.txt: -------------------------------------------------------------------------------- 1 | | a russian stop word list. comments begin with vertical bar. each stop 2 | | word is at the start of a line. 3 | 4 | | this is a ranked list (commonest to rarest) of stopwords derived from 5 | | a large text sample. 6 | 7 | | letter `ё' is translated to `е'. 
8 | 9 | и | and 10 | в | in/into 11 | во | alternative form 12 | не | not 13 | что | what/that 14 | он | he 15 | на | on/onto 16 | я | i 17 | с | from 18 | со | alternative form 19 | как | how 20 | а | milder form of `no' (but) 21 | то | conjunction and form of `that' 22 | все | all 23 | она | she 24 | так | so, thus 25 | его | him 26 | но | but 27 | да | yes/and 28 | ты | thou 29 | к | towards, by 30 | у | around, chez 31 | же | intensifier particle 32 | вы | you 33 | за | beyond, behind 34 | бы | conditional/subj. particle 35 | по | up to, along 36 | только | only 37 | ее | her 38 | мне | to me 39 | было | it was 40 | вот | here is/are, particle 41 | от | away from 42 | меня | me 43 | еще | still, yet, more 44 | нет | no, there isnt/arent 45 | о | about 46 | из | out of 47 | ему | to him 48 | теперь | now 49 | когда | when 50 | даже | even 51 | ну | so, well 52 | вдруг | suddenly 53 | ли | interrogative particle 54 | если | if 55 | уже | already, but homonym of `narrower' 56 | или | or 57 | ни | neither 58 | быть | to be 59 | был | he was 60 | него | prepositional form of его 61 | до | up to 62 | вас | you accusative 63 | нибудь | indef. 
suffix preceded by hyphen 64 | опять | again 65 | уж | already, but homonym of `adder' 66 | вам | to you 67 | сказал | he said 68 | ведь | particle `after all' 69 | там | there 70 | потом | then 71 | себя | oneself 72 | ничего | nothing 73 | ей | to her 74 | может | usually with `быть' as `maybe' 75 | они | they 76 | тут | here 77 | где | where 78 | есть | there is/are 79 | надо | got to, must 80 | ней | prepositional form of ей 81 | для | for 82 | мы | we 83 | тебя | thee 84 | их | them, their 85 | чем | than 86 | была | she was 87 | сам | self 88 | чтоб | in order to 89 | без | without 90 | будто | as if 91 | человек | man, person, one 92 | чего | genitive form of `what' 93 | раз | once 94 | тоже | also 95 | себе | to oneself 96 | под | beneath 97 | жизнь | life 98 | будет | will be 99 | ж | short form of intensifer particle `же' 100 | тогда | then 101 | кто | who 102 | этот | this 103 | говорил | was saying 104 | того | genitive form of `that' 105 | потому | for that reason 106 | этого | genitive form of `this' 107 | какой | which 108 | совсем | altogether 109 | ним | prepositional form of `его', `они' 110 | здесь | here 111 | этом | prepositional form of `этот' 112 | один | one 113 | почти | almost 114 | мой | my 115 | тем | instrumental/dative plural of `тот', `то' 116 | чтобы | full form of `in order that' 117 | нее | her (acc.) 118 | кажется | it seems 119 | сейчас | now 120 | были | they were 121 | куда | where to 122 | зачем | why 123 | сказать | to say 124 | всех | all (acc., gen. preposn. plural) 125 | никогда | never 126 | сегодня | today 127 | можно | possible, one can 128 | при | by 129 | наконец | finally 130 | два | two 131 | об | alternative form of `о', about 132 | другой | another 133 | хоть | even 134 | после | after 135 | над | above 136 | больше | more 137 | тот | that one (masc.) 
138 | через | across, in 139 | эти | these 140 | нас | us 141 | про | about 142 | всего | in all, only, of all 143 | них | prepositional form of `они' (they) 144 | какая | which, feminine 145 | много | lots 146 | разве | interrogative particle 147 | сказала | she said 148 | три | three 149 | эту | this, acc. fem. sing. 150 | моя | my, feminine 151 | впрочем | moreover, besides 152 | хорошо | good 153 | свою | ones own, acc. fem. sing. 154 | этой | oblique form of `эта', fem. `this' 155 | перед | in front of 156 | иногда | sometimes 157 | лучше | better 158 | чуть | a little 159 | том | preposn. form of `that one' 160 | нельзя | one must not 161 | такой | such a one 162 | им | to them 163 | более | more 164 | всегда | always 165 | конечно | of course 166 | всю | acc. fem. sing of `all' 167 | между | between 168 | 169 | 170 | | b: some paradigms 171 | | 172 | | personal pronouns 173 | | 174 | | я меня мне мной [мною] 175 | | ты тебя тебе тобой [тобою] 176 | | он его ему им [него, нему, ним] 177 | | она ее ей ею [нее, ней, нею] 178 | | оно его ему им [него, нему, ним] 179 | | 180 | | мы нас нам нами 181 | | вы вас вам вами 182 | | они их им ими [них, ним, ними] 183 | | 184 | | себя себе собой [собою] 185 | | 186 | | demonstrative pronouns: этот (this), тот (that) 187 | | 188 | | этот эта это эти 189 | | этого эту это эти 190 | | этого этой этого этих 191 | | этому этой этому этим 192 | | этим этой этим [этою] этими 193 | | этом этой этом этих 194 | | 195 | | тот та то те 196 | | того ту то те 197 | | того той того тех 198 | | тому той тому тем 199 | | тем той тем [тою] теми 200 | | том той том тех 201 | | 202 | | determinative pronouns 203 | | 204 | | (a) весь (all) 205 | | 206 | | весь вся все все 207 | | всего всю все все 208 | | всего всей всего всех 209 | | всему всей всему всем 210 | | всем всей всем [всею] всеми 211 | | всем всей всем всех 212 | | 213 | | (b) сам (himself etc) 214 | | 215 | | сам сама само сами 216 | | самого саму само самих 217 | | самого самой 
самого самих 218 | | самому самой самому самим 219 | | самим самой самим [самою] самими 220 | | самом самой самом самих 221 | | 222 | | stems of verbs `to be', `to have', `to do' and modal 223 | | 224 | | быть бы буд быв есть суть 225 | | име 226 | | дел 227 | | мог мож мочь 228 | | уме 229 | | хоч хот 230 | | долж 231 | | можн 232 | | нужн 233 | | нельзя 234 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_other_suffix(struct SN_env * z); 14 | static int r_consonant_pair(struct SN_env * z); 15 | static int r_main_suffix(struct SN_env * z); 16 | static int r_mark_regions(struct SN_env * z); 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | 22 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void); 23 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z); 24 | 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | static const symbol s_0_0[1] = { 'a' }; 30 | static const symbol s_0_1[1] = { 'e' }; 31 | static const symbol s_0_2[3] = { 'e', 'd', 'e' }; 32 | static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' }; 33 | static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' }; 34 | static const symbol s_0_5[3] = { 'a', 'n', 'e' }; 35 | static const symbol s_0_6[3] = { 'e', 'n', 'e' }; 36 | static const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' }; 37 | static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' }; 38 | static const symbol s_0_9[2] = { 'e', 'n' }; 39 | static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' }; 40 | static const symbol s_0_11[2] = { 'a', 'r' }; 41 | 
static const symbol s_0_12[2] = { 'e', 'r' }; 42 | static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' }; 43 | static const symbol s_0_14[1] = { 's' }; 44 | static const symbol s_0_15[2] = { 'a', 's' }; 45 | static const symbol s_0_16[2] = { 'e', 's' }; 46 | static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' }; 47 | static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' }; 48 | static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' }; 49 | static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' }; 50 | static const symbol s_0_21[3] = { 'e', 'n', 's' }; 51 | static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 52 | static const symbol s_0_23[3] = { 'e', 'r', 's' }; 53 | static const symbol s_0_24[3] = { 'e', 't', 's' }; 54 | static const symbol s_0_25[2] = { 'e', 't' }; 55 | static const symbol s_0_26[3] = { 'h', 'e', 't' }; 56 | static const symbol s_0_27[3] = { 'e', 'r', 't' }; 57 | static const symbol s_0_28[3] = { 'a', 's', 't' }; 58 | 59 | static const struct among a_0[29] = 60 | { 61 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 62 | /* 1 */ { 1, s_0_1, -1, 1, 0}, 63 | /* 2 */ { 3, s_0_2, 1, 1, 0}, 64 | /* 3 */ { 4, s_0_3, 1, 1, 0}, 65 | /* 4 */ { 4, s_0_4, 1, 1, 0}, 66 | /* 5 */ { 3, s_0_5, 1, 1, 0}, 67 | /* 6 */ { 3, s_0_6, 1, 1, 0}, 68 | /* 7 */ { 6, s_0_7, 6, 1, 0}, 69 | /* 8 */ { 4, s_0_8, 1, 3, 0}, 70 | /* 9 */ { 2, s_0_9, -1, 1, 0}, 71 | /* 10 */ { 5, s_0_10, 9, 1, 0}, 72 | /* 11 */ { 2, s_0_11, -1, 1, 0}, 73 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 74 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 75 | /* 14 */ { 1, s_0_14, -1, 2, 0}, 76 | /* 15 */ { 2, s_0_15, 14, 1, 0}, 77 | /* 16 */ { 2, s_0_16, 14, 1, 0}, 78 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 79 | /* 18 */ { 5, s_0_18, 16, 1, 0}, 80 | /* 19 */ { 4, s_0_19, 16, 1, 0}, 81 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 82 | /* 21 */ { 3, s_0_21, 14, 1, 0}, 83 | /* 22 */ { 6, s_0_22, 21, 1, 0}, 84 | /* 23 */ { 3, s_0_23, 14, 1, 0}, 85 | /* 24 */ { 3, s_0_24, 14, 1, 0}, 86 | /* 25 */ { 2, s_0_25, -1, 1, 0}, 87 | 
/* 26 */ { 3, s_0_26, 25, 1, 0}, 88 | /* 27 */ { 3, s_0_27, -1, 3, 0}, 89 | /* 28 */ { 3, s_0_28, -1, 1, 0} 90 | }; 91 | 92 | static const symbol s_1_0[2] = { 'd', 't' }; 93 | static const symbol s_1_1[2] = { 'v', 't' }; 94 | 95 | static const struct among a_1[2] = 96 | { 97 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 98 | /* 1 */ { 2, s_1_1, -1, -1, 0} 99 | }; 100 | 101 | static const symbol s_2_0[3] = { 'l', 'e', 'g' }; 102 | static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' }; 103 | static const symbol s_2_2[2] = { 'i', 'g' }; 104 | static const symbol s_2_3[3] = { 'e', 'i', 'g' }; 105 | static const symbol s_2_4[3] = { 'l', 'i', 'g' }; 106 | static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' }; 107 | static const symbol s_2_6[3] = { 'e', 'l', 's' }; 108 | static const symbol s_2_7[3] = { 'l', 'o', 'v' }; 109 | static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' }; 110 | static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' }; 111 | static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' }; 112 | 113 | static const struct among a_2[11] = 114 | { 115 | /* 0 */ { 3, s_2_0, -1, 1, 0}, 116 | /* 1 */ { 4, s_2_1, 0, 1, 0}, 117 | /* 2 */ { 2, s_2_2, -1, 1, 0}, 118 | /* 3 */ { 3, s_2_3, 2, 1, 0}, 119 | /* 4 */ { 3, s_2_4, 2, 1, 0}, 120 | /* 5 */ { 4, s_2_5, 4, 1, 0}, 121 | /* 6 */ { 3, s_2_6, -1, 1, 0}, 122 | /* 7 */ { 3, s_2_7, -1, 1, 0}, 123 | /* 8 */ { 4, s_2_8, 7, 1, 0}, 124 | /* 9 */ { 4, s_2_9, 7, 1, 0}, 125 | /* 10 */ { 7, s_2_10, 9, 1, 0} 126 | }; 127 | 128 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 129 | 130 | static const unsigned char g_s_ending[] = { 119, 125, 149, 1 }; 131 | 132 | static const symbol s_0[] = { 'k' }; 133 | static const symbol s_1[] = { 'e', 'r' }; 134 | 135 | static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 137 | { int c_test = z->c; /* test, line 30 */ 138 | { int ret = z->c + 3; 139 | if (0 > ret || ret > z->l) return 0; 140 | z->c = ret; /* hop, line 30 
*/ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 30 */ 143 | z->c = c_test; 144 | } 145 | if (out_grouping(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */ 146 | { /* gopast */ /* non v, line 31 */ 147 | int ret = in_grouping(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 31 */ 152 | /* try, line 32 */ 153 | if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 38 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 38 */ 165 | mlimit = z->lb; z->lb = z->c; 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 38 */ 168 | if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 29); /* substring, line 38 */ 170 | if (!(among_var)) { z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 38 */ 172 | z->lb = mlimit; 173 | } 174 | switch(among_var) { 175 | case 0: return 0; 176 | case 1: 177 | { int ret = slice_del(z); /* delete, line 44 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | { int m2 = z->l - z->c; (void)m2; /* or, line 46 */ 183 | if (in_grouping_b(z, g_s_ending, 98, 122, 0)) goto lab1; 184 | goto lab0; 185 | lab1: 186 | z->c = z->l - m2; 187 | if (!(eq_s_b(z, 1, s_0))) return 0; 188 | if (out_grouping_b(z, g_v, 97, 248, 0)) return 0; 189 | } 190 | lab0: 191 | { int ret = slice_del(z); /* delete, line 46 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 3: 196 | { int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */ 197 | if (ret < 0) return ret; 198 | } 199 | break; 200 | } 201 | return 1; 202 | } 203 | 204 | static int r_consonant_pair(struct SN_env * z) { 205 | { int m_test = z->l - 
z->c; /* test, line 53 */ 206 | { int mlimit; /* setlimit, line 54 */ 207 | int m1 = z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 54 */ 210 | mlimit = z->lb; z->lb = z->c; 211 | z->c = z->l - m1; 212 | z->ket = z->c; /* [, line 54 */ 213 | if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */ 215 | z->bra = z->c; /* ], line 54 */ 216 | z->lb = mlimit; 217 | } 218 | z->c = z->l - m_test; 219 | } 220 | if (z->c <= z->lb) return 0; 221 | z->c--; /* next, line 59 */ 222 | z->bra = z->c; /* ], line 59 */ 223 | { int ret = slice_del(z); /* delete, line 59 */ 224 | if (ret < 0) return ret; 225 | } 226 | return 1; 227 | } 228 | 229 | static int r_other_suffix(struct SN_env * z) { 230 | int among_var; 231 | { int mlimit; /* setlimit, line 63 */ 232 | int m1 = z->l - z->c; (void)m1; 233 | if (z->c < z->I[0]) return 0; 234 | z->c = z->I[0]; /* tomark, line 63 */ 235 | mlimit = z->lb; z->lb = z->c; 236 | z->c = z->l - m1; 237 | z->ket = z->c; /* [, line 63 */ 238 | if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 239 | among_var = find_among_b(z, a_2, 11); /* substring, line 63 */ 240 | if (!(among_var)) { z->lb = mlimit; return 0; } 241 | z->bra = z->c; /* ], line 63 */ 242 | z->lb = mlimit; 243 | } 244 | switch(among_var) { 245 | case 0: return 0; 246 | case 1: 247 | { int ret = slice_del(z); /* delete, line 67 */ 248 | if (ret < 0) return ret; 249 | } 250 | break; 251 | } 252 | return 1; 253 | } 254 | 255 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z) { 256 | { int c1 = z->c; /* do, line 74 */ 257 | { int ret = r_mark_regions(z); 258 | if (ret == 0) goto lab0; /* call mark_regions, line 74 */ 259 | if (ret < 0) return ret; 260 | } 261 | lab0: 262 | z->c = c1; 263 | } 264 | z->lb = z->c; z->c = z->l; /* backwards, line 
/*
 * Create the environment used by norwegian_ISO_8859_1_stem() by forwarding
 * to SN_create_env(0, 2, 0).
 * NOTE(review): the 2 presumably reserves the two integer slots z->I[0] and
 * z->I[1] that the routines in this file read and write - confirm against
 * the runtime's SN_create_env().
 */
extern struct SN_env * norwegian_ISO_8859_1_create_env(void) { return SN_create_env(0, 2, 0); }

/*
 * Dispose of an environment obtained from norwegian_ISO_8859_1_create_env()
 * by forwarding to SN_close_env(z, 0).
 */
extern void norwegian_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 0); }
symbol s_0_1[1] = { 'e' }; 31 | static const symbol s_0_2[3] = { 'e', 'd', 'e' }; 32 | static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' }; 33 | static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' }; 34 | static const symbol s_0_5[3] = { 'a', 'n', 'e' }; 35 | static const symbol s_0_6[3] = { 'e', 'n', 'e' }; 36 | static const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' }; 37 | static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' }; 38 | static const symbol s_0_9[2] = { 'e', 'n' }; 39 | static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' }; 40 | static const symbol s_0_11[2] = { 'a', 'r' }; 41 | static const symbol s_0_12[2] = { 'e', 'r' }; 42 | static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' }; 43 | static const symbol s_0_14[1] = { 's' }; 44 | static const symbol s_0_15[2] = { 'a', 's' }; 45 | static const symbol s_0_16[2] = { 'e', 's' }; 46 | static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' }; 47 | static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' }; 48 | static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' }; 49 | static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' }; 50 | static const symbol s_0_21[3] = { 'e', 'n', 's' }; 51 | static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 52 | static const symbol s_0_23[3] = { 'e', 'r', 's' }; 53 | static const symbol s_0_24[3] = { 'e', 't', 's' }; 54 | static const symbol s_0_25[2] = { 'e', 't' }; 55 | static const symbol s_0_26[3] = { 'h', 'e', 't' }; 56 | static const symbol s_0_27[3] = { 'e', 'r', 't' }; 57 | static const symbol s_0_28[3] = { 'a', 's', 't' }; 58 | 59 | static const struct among a_0[29] = 60 | { 61 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 62 | /* 1 */ { 1, s_0_1, -1, 1, 0}, 63 | /* 2 */ { 3, s_0_2, 1, 1, 0}, 64 | /* 3 */ { 4, s_0_3, 1, 1, 0}, 65 | /* 4 */ { 4, s_0_4, 1, 1, 0}, 66 | /* 5 */ { 3, s_0_5, 1, 1, 0}, 67 | /* 6 */ { 3, s_0_6, 1, 1, 0}, 68 | /* 7 */ { 6, s_0_7, 6, 1, 0}, 69 | /* 8 */ { 4, s_0_8, 1, 3, 0}, 70 | /* 9 */ { 2, s_0_9, -1, 1, 
0}, 71 | /* 10 */ { 5, s_0_10, 9, 1, 0}, 72 | /* 11 */ { 2, s_0_11, -1, 1, 0}, 73 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 74 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 75 | /* 14 */ { 1, s_0_14, -1, 2, 0}, 76 | /* 15 */ { 2, s_0_15, 14, 1, 0}, 77 | /* 16 */ { 2, s_0_16, 14, 1, 0}, 78 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 79 | /* 18 */ { 5, s_0_18, 16, 1, 0}, 80 | /* 19 */ { 4, s_0_19, 16, 1, 0}, 81 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 82 | /* 21 */ { 3, s_0_21, 14, 1, 0}, 83 | /* 22 */ { 6, s_0_22, 21, 1, 0}, 84 | /* 23 */ { 3, s_0_23, 14, 1, 0}, 85 | /* 24 */ { 3, s_0_24, 14, 1, 0}, 86 | /* 25 */ { 2, s_0_25, -1, 1, 0}, 87 | /* 26 */ { 3, s_0_26, 25, 1, 0}, 88 | /* 27 */ { 3, s_0_27, -1, 3, 0}, 89 | /* 28 */ { 3, s_0_28, -1, 1, 0} 90 | }; 91 | 92 | static const symbol s_1_0[2] = { 'd', 't' }; 93 | static const symbol s_1_1[2] = { 'v', 't' }; 94 | 95 | static const struct among a_1[2] = 96 | { 97 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 98 | /* 1 */ { 2, s_1_1, -1, -1, 0} 99 | }; 100 | 101 | static const symbol s_2_0[3] = { 'l', 'e', 'g' }; 102 | static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' }; 103 | static const symbol s_2_2[2] = { 'i', 'g' }; 104 | static const symbol s_2_3[3] = { 'e', 'i', 'g' }; 105 | static const symbol s_2_4[3] = { 'l', 'i', 'g' }; 106 | static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' }; 107 | static const symbol s_2_6[3] = { 'e', 'l', 's' }; 108 | static const symbol s_2_7[3] = { 'l', 'o', 'v' }; 109 | static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' }; 110 | static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' }; 111 | static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' }; 112 | 113 | static const struct among a_2[11] = 114 | { 115 | /* 0 */ { 3, s_2_0, -1, 1, 0}, 116 | /* 1 */ { 4, s_2_1, 0, 1, 0}, 117 | /* 2 */ { 2, s_2_2, -1, 1, 0}, 118 | /* 3 */ { 3, s_2_3, 2, 1, 0}, 119 | /* 4 */ { 3, s_2_4, 2, 1, 0}, 120 | /* 5 */ { 4, s_2_5, 4, 1, 0}, 121 | /* 6 */ { 3, s_2_6, -1, 1, 0}, 122 | /* 7 */ { 3, s_2_7, -1, 1, 0}, 123 | /* 8 */ { 4, 
s_2_8, 7, 1, 0}, 124 | /* 9 */ { 4, s_2_9, 7, 1, 0}, 125 | /* 10 */ { 7, s_2_10, 9, 1, 0} 126 | }; 127 | 128 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 129 | 130 | static const unsigned char g_s_ending[] = { 119, 125, 149, 1 }; 131 | 132 | static const symbol s_0[] = { 'k' }; 133 | static const symbol s_1[] = { 'e', 'r' }; 134 | 135 | static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 137 | { int c_test = z->c; /* test, line 30 */ 138 | { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3); 139 | if (ret < 0) return 0; 140 | z->c = ret; /* hop, line 30 */ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 30 */ 143 | z->c = c_test; 144 | } 145 | if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */ 146 | { /* gopast */ /* non v, line 31 */ 147 | int ret = in_grouping_U(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 31 */ 152 | /* try, line 32 */ 153 | if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 38 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 38 */ 165 | mlimit = z->lb; z->lb = z->c; 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 38 */ 168 | if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 29); /* substring, line 38 */ 170 | if (!(among_var)) { z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 38 */ 172 | z->lb = mlimit; 173 | } 174 | switch(among_var) { 175 | case 0: return 0; 176 | case 1: 177 | { int ret = slice_del(z); /* delete, line 44 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | { int 
m2 = z->l - z->c; (void)m2; /* or, line 46 */ 183 | if (in_grouping_b_U(z, g_s_ending, 98, 122, 0)) goto lab1; 184 | goto lab0; 185 | lab1: 186 | z->c = z->l - m2; 187 | if (!(eq_s_b(z, 1, s_0))) return 0; 188 | if (out_grouping_b_U(z, g_v, 97, 248, 0)) return 0; 189 | } 190 | lab0: 191 | { int ret = slice_del(z); /* delete, line 46 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 3: 196 | { int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */ 197 | if (ret < 0) return ret; 198 | } 199 | break; 200 | } 201 | return 1; 202 | } 203 | 204 | static int r_consonant_pair(struct SN_env * z) { 205 | { int m_test = z->l - z->c; /* test, line 53 */ 206 | { int mlimit; /* setlimit, line 54 */ 207 | int m1 = z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 54 */ 210 | mlimit = z->lb; z->lb = z->c; 211 | z->c = z->l - m1; 212 | z->ket = z->c; /* [, line 54 */ 213 | if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */ 215 | z->bra = z->c; /* ], line 54 */ 216 | z->lb = mlimit; 217 | } 218 | z->c = z->l - m_test; 219 | } 220 | { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1); 221 | if (ret < 0) return 0; 222 | z->c = ret; /* next, line 59 */ 223 | } 224 | z->bra = z->c; /* ], line 59 */ 225 | { int ret = slice_del(z); /* delete, line 59 */ 226 | if (ret < 0) return ret; 227 | } 228 | return 1; 229 | } 230 | 231 | static int r_other_suffix(struct SN_env * z) { 232 | int among_var; 233 | { int mlimit; /* setlimit, line 63 */ 234 | int m1 = z->l - z->c; (void)m1; 235 | if (z->c < z->I[0]) return 0; 236 | z->c = z->I[0]; /* tomark, line 63 */ 237 | mlimit = z->lb; z->lb = z->c; 238 | z->c = z->l - m1; 239 | z->ket = z->c; /* [, line 63 */ 240 | if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 241 | among_var = find_among_b(z, 
a_2, 11); /* substring, line 63 */ 242 | if (!(among_var)) { z->lb = mlimit; return 0; } 243 | z->bra = z->c; /* ], line 63 */ 244 | z->lb = mlimit; 245 | } 246 | switch(among_var) { 247 | case 0: return 0; 248 | case 1: 249 | { int ret = slice_del(z); /* delete, line 67 */ 250 | if (ret < 0) return ret; 251 | } 252 | break; 253 | } 254 | return 1; 255 | } 256 | 257 | extern int norwegian_UTF_8_stem(struct SN_env * z) { 258 | { int c1 = z->c; /* do, line 74 */ 259 | { int ret = r_mark_regions(z); 260 | if (ret == 0) goto lab0; /* call mark_regions, line 74 */ 261 | if (ret < 0) return ret; 262 | } 263 | lab0: 264 | z->c = c1; 265 | } 266 | z->lb = z->c; z->c = z->l; /* backwards, line 75 */ 267 | 268 | { int m2 = z->l - z->c; (void)m2; /* do, line 76 */ 269 | { int ret = r_main_suffix(z); 270 | if (ret == 0) goto lab1; /* call main_suffix, line 76 */ 271 | if (ret < 0) return ret; 272 | } 273 | lab1: 274 | z->c = z->l - m2; 275 | } 276 | { int m3 = z->l - z->c; (void)m3; /* do, line 77 */ 277 | { int ret = r_consonant_pair(z); 278 | if (ret == 0) goto lab2; /* call consonant_pair, line 77 */ 279 | if (ret < 0) return ret; 280 | } 281 | lab2: 282 | z->c = z->l - m3; 283 | } 284 | { int m4 = z->l - z->c; (void)m4; /* do, line 78 */ 285 | { int ret = r_other_suffix(z); 286 | if (ret == 0) goto lab3; /* call other_suffix, line 78 */ 287 | if (ret < 0) return ret; 288 | } 289 | lab3: 290 | z->c = z->l - m4; 291 | } 292 | z->c = z->lb; 293 | return 1; 294 | } 295 | 296 | extern struct SN_env * norwegian_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); } 297 | 298 | extern void norwegian_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); } 299 | 300 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_swedish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include 
"../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int swedish_ISO_8859_1_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_other_suffix(struct SN_env * z); 14 | static int r_consonant_pair(struct SN_env * z); 15 | static int r_main_suffix(struct SN_env * z); 16 | static int r_mark_regions(struct SN_env * z); 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | 22 | extern struct SN_env * swedish_ISO_8859_1_create_env(void); 23 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z); 24 | 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | static const symbol s_0_0[1] = { 'a' }; 30 | static const symbol s_0_1[4] = { 'a', 'r', 'n', 'a' }; 31 | static const symbol s_0_2[4] = { 'e', 'r', 'n', 'a' }; 32 | static const symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' }; 33 | static const symbol s_0_4[4] = { 'o', 'r', 'n', 'a' }; 34 | static const symbol s_0_5[2] = { 'a', 'd' }; 35 | static const symbol s_0_6[1] = { 'e' }; 36 | static const symbol s_0_7[3] = { 'a', 'd', 'e' }; 37 | static const symbol s_0_8[4] = { 'a', 'n', 'd', 'e' }; 38 | static const symbol s_0_9[4] = { 'a', 'r', 'n', 'e' }; 39 | static const symbol s_0_10[3] = { 'a', 'r', 'e' }; 40 | static const symbol s_0_11[4] = { 'a', 's', 't', 'e' }; 41 | static const symbol s_0_12[2] = { 'e', 'n' }; 42 | static const symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' }; 43 | static const symbol s_0_14[4] = { 'a', 'r', 'e', 'n' }; 44 | static const symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' }; 45 | static const symbol s_0_16[3] = { 'e', 'r', 'n' }; 46 | static const symbol s_0_17[2] = { 'a', 'r' }; 47 | static const symbol s_0_18[2] = { 'e', 'r' }; 48 | static const symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' }; 49 | static const symbol s_0_20[2] = { 'o', 'r' }; 50 | static const symbol s_0_21[1] = { 's' }; 51 | static const symbol s_0_22[2] = { 'a', 's' }; 52 | static const symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' 
}; 53 | static const symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' }; 54 | static const symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' }; 55 | static const symbol s_0_26[2] = { 'e', 's' }; 56 | static const symbol s_0_27[4] = { 'a', 'd', 'e', 's' }; 57 | static const symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' }; 58 | static const symbol s_0_29[3] = { 'e', 'n', 's' }; 59 | static const symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' }; 60 | static const symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 61 | static const symbol s_0_32[4] = { 'e', 'r', 'n', 's' }; 62 | static const symbol s_0_33[2] = { 'a', 't' }; 63 | static const symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' }; 64 | static const symbol s_0_35[3] = { 'h', 'e', 't' }; 65 | static const symbol s_0_36[3] = { 'a', 's', 't' }; 66 | 67 | static const struct among a_0[37] = 68 | { 69 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 70 | /* 1 */ { 4, s_0_1, 0, 1, 0}, 71 | /* 2 */ { 4, s_0_2, 0, 1, 0}, 72 | /* 3 */ { 7, s_0_3, 2, 1, 0}, 73 | /* 4 */ { 4, s_0_4, 0, 1, 0}, 74 | /* 5 */ { 2, s_0_5, -1, 1, 0}, 75 | /* 6 */ { 1, s_0_6, -1, 1, 0}, 76 | /* 7 */ { 3, s_0_7, 6, 1, 0}, 77 | /* 8 */ { 4, s_0_8, 6, 1, 0}, 78 | /* 9 */ { 4, s_0_9, 6, 1, 0}, 79 | /* 10 */ { 3, s_0_10, 6, 1, 0}, 80 | /* 11 */ { 4, s_0_11, 6, 1, 0}, 81 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 82 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 83 | /* 14 */ { 4, s_0_14, 12, 1, 0}, 84 | /* 15 */ { 5, s_0_15, 12, 1, 0}, 85 | /* 16 */ { 3, s_0_16, -1, 1, 0}, 86 | /* 17 */ { 2, s_0_17, -1, 1, 0}, 87 | /* 18 */ { 2, s_0_18, -1, 1, 0}, 88 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 89 | /* 20 */ { 2, s_0_20, -1, 1, 0}, 90 | /* 21 */ { 1, s_0_21, -1, 2, 0}, 91 | /* 22 */ { 2, s_0_22, 21, 1, 0}, 92 | /* 23 */ { 5, s_0_23, 22, 1, 0}, 93 | /* 24 */ { 5, s_0_24, 22, 1, 0}, 94 | /* 25 */ { 5, s_0_25, 22, 1, 0}, 95 | /* 26 */ { 2, s_0_26, 21, 1, 0}, 96 | /* 27 */ { 4, s_0_27, 26, 1, 0}, 97 | /* 28 */ { 5, s_0_28, 26, 1, 0}, 98 | /* 29 */ { 3, s_0_29, 21, 1, 0}, 99 | /* 30 */ { 5, s_0_30, 29, 1, 0}, 
100 | /* 31 */ { 6, s_0_31, 29, 1, 0}, 101 | /* 32 */ { 4, s_0_32, 21, 1, 0}, 102 | /* 33 */ { 2, s_0_33, -1, 1, 0}, 103 | /* 34 */ { 5, s_0_34, -1, 1, 0}, 104 | /* 35 */ { 3, s_0_35, -1, 1, 0}, 105 | /* 36 */ { 3, s_0_36, -1, 1, 0} 106 | }; 107 | 108 | static const symbol s_1_0[2] = { 'd', 'd' }; 109 | static const symbol s_1_1[2] = { 'g', 'd' }; 110 | static const symbol s_1_2[2] = { 'n', 'n' }; 111 | static const symbol s_1_3[2] = { 'd', 't' }; 112 | static const symbol s_1_4[2] = { 'g', 't' }; 113 | static const symbol s_1_5[2] = { 'k', 't' }; 114 | static const symbol s_1_6[2] = { 't', 't' }; 115 | 116 | static const struct among a_1[7] = 117 | { 118 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 119 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 120 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 121 | /* 3 */ { 2, s_1_3, -1, -1, 0}, 122 | /* 4 */ { 2, s_1_4, -1, -1, 0}, 123 | /* 5 */ { 2, s_1_5, -1, -1, 0}, 124 | /* 6 */ { 2, s_1_6, -1, -1, 0} 125 | }; 126 | 127 | static const symbol s_2_0[2] = { 'i', 'g' }; 128 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 129 | static const symbol s_2_2[3] = { 'e', 'l', 's' }; 130 | static const symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' }; 131 | static const symbol s_2_4[4] = { 'l', 0xF6, 's', 't' }; 132 | 133 | static const struct among a_2[5] = 134 | { 135 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 136 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 137 | /* 2 */ { 3, s_2_2, -1, 1, 0}, 138 | /* 3 */ { 5, s_2_3, -1, 3, 0}, 139 | /* 4 */ { 4, s_2_4, -1, 2, 0} 140 | }; 141 | 142 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 }; 143 | 144 | static const unsigned char g_s_ending[] = { 119, 127, 149 }; 145 | 146 | static const symbol s_0[] = { 'l', 0xF6, 's' }; 147 | static const symbol s_1[] = { 'f', 'u', 'l', 'l' }; 148 | 149 | static int r_mark_regions(struct SN_env * z) { 150 | z->I[0] = z->l; 151 | { int c_test = z->c; /* test, line 29 */ 152 | { int ret = z->c + 3; 153 | if (0 > ret || ret > z->l) return 0; 154 | z->c = ret; 
/* hop, line 29 */ 155 | } 156 | z->I[1] = z->c; /* setmark x, line 29 */ 157 | z->c = c_test; 158 | } 159 | if (out_grouping(z, g_v, 97, 246, 1) < 0) return 0; /* goto */ /* grouping v, line 30 */ 160 | { /* gopast */ /* non v, line 30 */ 161 | int ret = in_grouping(z, g_v, 97, 246, 1); 162 | if (ret < 0) return 0; 163 | z->c += ret; 164 | } 165 | z->I[0] = z->c; /* setmark p1, line 30 */ 166 | /* try, line 31 */ 167 | if (!(z->I[0] < z->I[1])) goto lab0; 168 | z->I[0] = z->I[1]; 169 | lab0: 170 | return 1; 171 | } 172 | 173 | static int r_main_suffix(struct SN_env * z) { 174 | int among_var; 175 | { int mlimit; /* setlimit, line 37 */ 176 | int m1 = z->l - z->c; (void)m1; 177 | if (z->c < z->I[0]) return 0; 178 | z->c = z->I[0]; /* tomark, line 37 */ 179 | mlimit = z->lb; z->lb = z->c; 180 | z->c = z->l - m1; 181 | z->ket = z->c; /* [, line 37 */ 182 | if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851442 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 183 | among_var = find_among_b(z, a_0, 37); /* substring, line 37 */ 184 | if (!(among_var)) { z->lb = mlimit; return 0; } 185 | z->bra = z->c; /* ], line 37 */ 186 | z->lb = mlimit; 187 | } 188 | switch(among_var) { 189 | case 0: return 0; 190 | case 1: 191 | { int ret = slice_del(z); /* delete, line 44 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 2: 196 | if (in_grouping_b(z, g_s_ending, 98, 121, 0)) return 0; 197 | { int ret = slice_del(z); /* delete, line 46 */ 198 | if (ret < 0) return ret; 199 | } 200 | break; 201 | } 202 | return 1; 203 | } 204 | 205 | static int r_consonant_pair(struct SN_env * z) { 206 | { int mlimit; /* setlimit, line 50 */ 207 | int m1 = z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 50 */ 210 | mlimit = z->lb; z->lb = z->c; 211 | z->c = z->l - m1; 212 | { int m2 = z->l - z->c; (void)m2; /* and, line 52 */ 213 | if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1064976 >> (z->p[z->c - 1] & 
0x1f)) & 1)) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 7))) { z->lb = mlimit; return 0; } /* among, line 51 */ 215 | z->c = z->l - m2; 216 | z->ket = z->c; /* [, line 52 */ 217 | if (z->c <= z->lb) { z->lb = mlimit; return 0; } 218 | z->c--; /* next, line 52 */ 219 | z->bra = z->c; /* ], line 52 */ 220 | { int ret = slice_del(z); /* delete, line 52 */ 221 | if (ret < 0) return ret; 222 | } 223 | } 224 | z->lb = mlimit; 225 | } 226 | return 1; 227 | } 228 | 229 | static int r_other_suffix(struct SN_env * z) { 230 | int among_var; 231 | { int mlimit; /* setlimit, line 55 */ 232 | int m1 = z->l - z->c; (void)m1; 233 | if (z->c < z->I[0]) return 0; 234 | z->c = z->I[0]; /* tomark, line 55 */ 235 | mlimit = z->lb; z->lb = z->c; 236 | z->c = z->l - m1; 237 | z->ket = z->c; /* [, line 56 */ 238 | if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 239 | among_var = find_among_b(z, a_2, 5); /* substring, line 56 */ 240 | if (!(among_var)) { z->lb = mlimit; return 0; } 241 | z->bra = z->c; /* ], line 56 */ 242 | switch(among_var) { 243 | case 0: { z->lb = mlimit; return 0; } 244 | case 1: 245 | { int ret = slice_del(z); /* delete, line 57 */ 246 | if (ret < 0) return ret; 247 | } 248 | break; 249 | case 2: 250 | { int ret = slice_from_s(z, 3, s_0); /* <-, line 58 */ 251 | if (ret < 0) return ret; 252 | } 253 | break; 254 | case 3: 255 | { int ret = slice_from_s(z, 4, s_1); /* <-, line 59 */ 256 | if (ret < 0) return ret; 257 | } 258 | break; 259 | } 260 | z->lb = mlimit; 261 | } 262 | return 1; 263 | } 264 | 265 | extern int swedish_ISO_8859_1_stem(struct SN_env * z) { 266 | { int c1 = z->c; /* do, line 66 */ 267 | { int ret = r_mark_regions(z); 268 | if (ret == 0) goto lab0; /* call mark_regions, line 66 */ 269 | if (ret < 0) return ret; 270 | } 271 | lab0: 272 | z->c = c1; 273 | } 274 | z->lb = z->c; z->c = z->l; /* backwards, line 67 */ 275 | 276 | { int m2 = z->l - z->c; 
/*
 * Create the environment used by swedish_ISO_8859_1_stem() by forwarding to
 * SN_create_env(0, 2, 0).
 * NOTE(review): the 2 presumably reserves the two integer slots z->I[0] and
 * z->I[1] that the routines in this file read and write - confirm against
 * the runtime's SN_create_env().
 */
extern struct SN_env * swedish_ISO_8859_1_create_env(void) { return SN_create_env(0, 2, 0); }

/*
 * Dispose of an environment obtained from swedish_ISO_8859_1_create_env() by
 * forwarding to SN_close_env(z, 0).
 */
extern void swedish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 0); }
const symbol s_0_2[4] = { 'e', 'r', 'n', 'a' }; 32 | static const symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' }; 33 | static const symbol s_0_4[4] = { 'o', 'r', 'n', 'a' }; 34 | static const symbol s_0_5[2] = { 'a', 'd' }; 35 | static const symbol s_0_6[1] = { 'e' }; 36 | static const symbol s_0_7[3] = { 'a', 'd', 'e' }; 37 | static const symbol s_0_8[4] = { 'a', 'n', 'd', 'e' }; 38 | static const symbol s_0_9[4] = { 'a', 'r', 'n', 'e' }; 39 | static const symbol s_0_10[3] = { 'a', 'r', 'e' }; 40 | static const symbol s_0_11[4] = { 'a', 's', 't', 'e' }; 41 | static const symbol s_0_12[2] = { 'e', 'n' }; 42 | static const symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' }; 43 | static const symbol s_0_14[4] = { 'a', 'r', 'e', 'n' }; 44 | static const symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' }; 45 | static const symbol s_0_16[3] = { 'e', 'r', 'n' }; 46 | static const symbol s_0_17[2] = { 'a', 'r' }; 47 | static const symbol s_0_18[2] = { 'e', 'r' }; 48 | static const symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' }; 49 | static const symbol s_0_20[2] = { 'o', 'r' }; 50 | static const symbol s_0_21[1] = { 's' }; 51 | static const symbol s_0_22[2] = { 'a', 's' }; 52 | static const symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' }; 53 | static const symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' }; 54 | static const symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' }; 55 | static const symbol s_0_26[2] = { 'e', 's' }; 56 | static const symbol s_0_27[4] = { 'a', 'd', 'e', 's' }; 57 | static const symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' }; 58 | static const symbol s_0_29[3] = { 'e', 'n', 's' }; 59 | static const symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' }; 60 | static const symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 61 | static const symbol s_0_32[4] = { 'e', 'r', 'n', 's' }; 62 | static const symbol s_0_33[2] = { 'a', 't' }; 63 | static const symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' }; 64 | static const symbol s_0_35[3] = { 'h', 'e', 't' }; 65 | 
static const symbol s_0_36[3] = { 'a', 's', 't' }; 66 | 67 | static const struct among a_0[37] = 68 | { 69 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 70 | /* 1 */ { 4, s_0_1, 0, 1, 0}, 71 | /* 2 */ { 4, s_0_2, 0, 1, 0}, 72 | /* 3 */ { 7, s_0_3, 2, 1, 0}, 73 | /* 4 */ { 4, s_0_4, 0, 1, 0}, 74 | /* 5 */ { 2, s_0_5, -1, 1, 0}, 75 | /* 6 */ { 1, s_0_6, -1, 1, 0}, 76 | /* 7 */ { 3, s_0_7, 6, 1, 0}, 77 | /* 8 */ { 4, s_0_8, 6, 1, 0}, 78 | /* 9 */ { 4, s_0_9, 6, 1, 0}, 79 | /* 10 */ { 3, s_0_10, 6, 1, 0}, 80 | /* 11 */ { 4, s_0_11, 6, 1, 0}, 81 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 82 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 83 | /* 14 */ { 4, s_0_14, 12, 1, 0}, 84 | /* 15 */ { 5, s_0_15, 12, 1, 0}, 85 | /* 16 */ { 3, s_0_16, -1, 1, 0}, 86 | /* 17 */ { 2, s_0_17, -1, 1, 0}, 87 | /* 18 */ { 2, s_0_18, -1, 1, 0}, 88 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 89 | /* 20 */ { 2, s_0_20, -1, 1, 0}, 90 | /* 21 */ { 1, s_0_21, -1, 2, 0}, 91 | /* 22 */ { 2, s_0_22, 21, 1, 0}, 92 | /* 23 */ { 5, s_0_23, 22, 1, 0}, 93 | /* 24 */ { 5, s_0_24, 22, 1, 0}, 94 | /* 25 */ { 5, s_0_25, 22, 1, 0}, 95 | /* 26 */ { 2, s_0_26, 21, 1, 0}, 96 | /* 27 */ { 4, s_0_27, 26, 1, 0}, 97 | /* 28 */ { 5, s_0_28, 26, 1, 0}, 98 | /* 29 */ { 3, s_0_29, 21, 1, 0}, 99 | /* 30 */ { 5, s_0_30, 29, 1, 0}, 100 | /* 31 */ { 6, s_0_31, 29, 1, 0}, 101 | /* 32 */ { 4, s_0_32, 21, 1, 0}, 102 | /* 33 */ { 2, s_0_33, -1, 1, 0}, 103 | /* 34 */ { 5, s_0_34, -1, 1, 0}, 104 | /* 35 */ { 3, s_0_35, -1, 1, 0}, 105 | /* 36 */ { 3, s_0_36, -1, 1, 0} 106 | }; 107 | 108 | static const symbol s_1_0[2] = { 'd', 'd' }; 109 | static const symbol s_1_1[2] = { 'g', 'd' }; 110 | static const symbol s_1_2[2] = { 'n', 'n' }; 111 | static const symbol s_1_3[2] = { 'd', 't' }; 112 | static const symbol s_1_4[2] = { 'g', 't' }; 113 | static const symbol s_1_5[2] = { 'k', 't' }; 114 | static const symbol s_1_6[2] = { 't', 't' }; 115 | 116 | static const struct among a_1[7] = 117 | { 118 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 119 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 120 | /* 2 
*/ { 2, s_1_2, -1, -1, 0}, 121 | /* 3 */ { 2, s_1_3, -1, -1, 0}, 122 | /* 4 */ { 2, s_1_4, -1, -1, 0}, 123 | /* 5 */ { 2, s_1_5, -1, -1, 0}, 124 | /* 6 */ { 2, s_1_6, -1, -1, 0} 125 | }; 126 | 127 | static const symbol s_2_0[2] = { 'i', 'g' }; 128 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 129 | static const symbol s_2_2[3] = { 'e', 'l', 's' }; 130 | static const symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' }; 131 | static const symbol s_2_4[5] = { 'l', 0xC3, 0xB6, 's', 't' }; 132 | 133 | static const struct among a_2[5] = 134 | { 135 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 136 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 137 | /* 2 */ { 3, s_2_2, -1, 1, 0}, 138 | /* 3 */ { 5, s_2_3, -1, 3, 0}, 139 | /* 4 */ { 5, s_2_4, -1, 2, 0} 140 | }; 141 | 142 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 }; 143 | 144 | static const unsigned char g_s_ending[] = { 119, 127, 149 }; 145 | 146 | static const symbol s_0[] = { 'l', 0xC3, 0xB6, 's' }; 147 | static const symbol s_1[] = { 'f', 'u', 'l', 'l' }; 148 | 149 | static int r_mark_regions(struct SN_env * z) { 150 | z->I[0] = z->l; 151 | { int c_test = z->c; /* test, line 29 */ 152 | { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3); 153 | if (ret < 0) return 0; 154 | z->c = ret; /* hop, line 29 */ 155 | } 156 | z->I[1] = z->c; /* setmark x, line 29 */ 157 | z->c = c_test; 158 | } 159 | if (out_grouping_U(z, g_v, 97, 246, 1) < 0) return 0; /* goto */ /* grouping v, line 30 */ 160 | { /* gopast */ /* non v, line 30 */ 161 | int ret = in_grouping_U(z, g_v, 97, 246, 1); 162 | if (ret < 0) return 0; 163 | z->c += ret; 164 | } 165 | z->I[0] = z->c; /* setmark p1, line 30 */ 166 | /* try, line 31 */ 167 | if (!(z->I[0] < z->I[1])) goto lab0; 168 | z->I[0] = z->I[1]; 169 | lab0: 170 | return 1; 171 | } 172 | 173 | static int r_main_suffix(struct SN_env * z) { 174 | int among_var; 175 | { int mlimit; /* setlimit, line 37 */ 176 | int m1 = z->l - z->c; (void)m1; 177 | if (z->c < z->I[0]) return 0; 
178 | z->c = z->I[0]; /* tomark, line 37 */ 179 | mlimit = z->lb; z->lb = z->c; 180 | z->c = z->l - m1; 181 | z->ket = z->c; /* [, line 37 */ 182 | if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851442 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 183 | among_var = find_among_b(z, a_0, 37); /* substring, line 37 */ 184 | if (!(among_var)) { z->lb = mlimit; return 0; } 185 | z->bra = z->c; /* ], line 37 */ 186 | z->lb = mlimit; 187 | } 188 | switch(among_var) { 189 | case 0: return 0; 190 | case 1: 191 | { int ret = slice_del(z); /* delete, line 44 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 2: 196 | /* give up unless in_grouping_b_U over g_s_ending returns 0 */ if (in_grouping_b_U(z, g_s_ending, 98, 121, 0)) return 0; 197 | { int ret = slice_del(z); /* delete, line 46 */ 198 | if (ret < 0) return ret; 199 | } 200 | break; 201 | } 202 | return 1; 203 | } 204 | 205 | /* r_consonant_pair: with the backward limit z->lb temporarily set to the mark z->I[0], requires find_among_b over the 7-entry table a_1 (dd, gd, nn, dt, gt, kt, tt) to succeed; it then puts the cursor back where it was, steps it back once with skip_utf8 and calls slice_del on the slice between the new z->bra and z->ket. Returns 1 on success, 0 when the cursor is before the mark, nothing matches or skip_utf8 reports a negative value (z->lb is left as on entry), or the negative value from slice_del. NOTE(review): the slice_del error path leaves z->lb at the raised limit; presumably callers treat that as fatal - confirm. */ static int r_consonant_pair(struct SN_env * z) { 206 | { int mlimit; /* setlimit, line 50 */ 207 | int m1 = z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 50 */ 210 | mlimit = z->lb; z->lb = z->c; /* remember the old limit, then make the mark the new backward limit */ 211 | z->c = z->l - m1; 212 | { int m2 = z->l - z->c; (void)m2; /* and, line 52 */ 213 | /* quick reject before the table search: needs z->c - 1 > z->lb, and the byte before the cursor must be d, n or t (a value in 0x60..0x7f whose bit is set in mask 1064976 = 0x104010) */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1064976 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 7))) { z->lb = mlimit; return 0; } /* among, line 51 */ 215 | z->c = z->l - m2; /* cursor back to where it was before the table search */ 216 | z->ket = z->c; /* [, line 52 */ 217 | { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1); 218 | if (ret < 0) { z->lb = mlimit; return 0; } 219 | z->c = ret; /* next, line 52 */ 220 | } 221 | z->bra = z->c; /* ], line 52 */ 222 | { int ret = slice_del(z); /* delete, line 52 */ 223 | if (ret < 0) return ret; 224 | } 225 | } 226 | z->lb = mlimit; 227 | } 228 | return 1; 229 | } 230 | 231 | static int r_other_suffix(struct SN_env * z) { 232 | int among_var; 233 | { int mlimit; /* setlimit, line 55 */ 234 | int m1 = z->l - z->c; (void)m1; 235 | if (z->c <
z->I[0]) return 0; 236 | z->c = z->I[0]; /* tomark, line 55 */ 237 | mlimit = z->lb; z->lb = z->c; 238 | z->c = z->l - m1; 239 | z->ket = z->c; /* [, line 56 */ 240 | if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 241 | among_var = find_among_b(z, a_2, 5); /* substring, line 56 */ 242 | if (!(among_var)) { z->lb = mlimit; return 0; } 243 | z->bra = z->c; /* ], line 56 */ 244 | switch(among_var) { 245 | case 0: { z->lb = mlimit; return 0; } 246 | case 1: 247 | { int ret = slice_del(z); /* delete, line 57 */ 248 | if (ret < 0) return ret; 249 | } 250 | break; 251 | case 2: 252 | { int ret = slice_from_s(z, 4, s_0); /* <-, line 58 */ 253 | if (ret < 0) return ret; 254 | } 255 | break; 256 | case 3: 257 | { int ret = slice_from_s(z, 4, s_1); /* <-, line 59 */ 258 | if (ret < 0) return ret; 259 | } 260 | break; 261 | } 262 | z->lb = mlimit; 263 | } 264 | return 1; 265 | } 266 | 267 | extern int swedish_UTF_8_stem(struct SN_env * z) { 268 | { int c1 = z->c; /* do, line 66 */ 269 | { int ret = r_mark_regions(z); 270 | if (ret == 0) goto lab0; /* call mark_regions, line 66 */ 271 | if (ret < 0) return ret; 272 | } 273 | lab0: 274 | z->c = c1; 275 | } 276 | z->lb = z->c; z->c = z->l; /* backwards, line 67 */ 277 | 278 | { int m2 = z->l - z->c; (void)m2; /* do, line 68 */ 279 | { int ret = r_main_suffix(z); 280 | if (ret == 0) goto lab1; /* call main_suffix, line 68 */ 281 | if (ret < 0) return ret; 282 | } 283 | lab1: 284 | z->c = z->l - m2; 285 | } 286 | { int m3 = z->l - z->c; (void)m3; /* do, line 69 */ 287 | { int ret = r_consonant_pair(z); 288 | if (ret == 0) goto lab2; /* call consonant_pair, line 69 */ 289 | if (ret < 0) return ret; 290 | } 291 | lab2: 292 | z->c = z->l - m3; 293 | } 294 | { int m4 = z->l - z->c; (void)m4; /* do, line 70 */ 295 | { int ret = r_other_suffix(z); 296 | if (ret == 0) goto lab3; /* call other_suffix, line 70 */ 297 | if (ret < 0) return ret; 298 | } 299 | 
lab3: 300 | z->c = z->l - m4; 301 | } 302 | z->c = z->lb; 303 | return 1; 304 | } 305 | 306 | extern struct SN_env * swedish_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); } 307 | 308 | extern void swedish_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); } 309 | 310 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_danish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int danish_ISO_8859_1_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_undouble(struct SN_env * z); 14 | static int r_other_suffix(struct SN_env * z); 15 | static int r_consonant_pair(struct SN_env * z); 16 | static int r_main_suffix(struct SN_env * z); 17 | static int r_mark_regions(struct SN_env * z); 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | 23 | extern struct SN_env * danish_ISO_8859_1_create_env(void); 24 | extern void danish_ISO_8859_1_close_env(struct SN_env * z); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | static const symbol s_0_0[3] = { 'h', 'e', 'd' }; 31 | static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' }; 32 | static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' }; 33 | static const symbol s_0_3[1] = { 'e' }; 34 | static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' }; 35 | static const symbol s_0_5[4] = { 'e', 'n', 'd', 'e' }; 36 | static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' }; 37 | static const symbol s_0_7[3] = { 'e', 'n', 'e' }; 38 | static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' }; 39 | static const symbol s_0_9[3] = { 'e', 'r', 'e' }; 40 | static const symbol s_0_10[2] = { 'e', 'n' }; 41 | static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' }; 42 | static const 
symbol s_0_12[4] = { 'e', 'r', 'e', 'n' }; 43 | static const symbol s_0_13[2] = { 'e', 'r' }; 44 | static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' }; 45 | static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' }; 46 | static const symbol s_0_16[1] = { 's' }; 47 | static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' }; 48 | static const symbol s_0_18[2] = { 'e', 's' }; 49 | static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' }; 50 | static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' }; 51 | static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' }; 52 | static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' }; 53 | static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' }; 54 | static const symbol s_0_24[3] = { 'e', 'n', 's' }; 55 | static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' }; 56 | static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' }; 57 | static const symbol s_0_27[3] = { 'e', 'r', 's' }; 58 | static const symbol s_0_28[3] = { 'e', 't', 's' }; 59 | static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' }; 60 | static const symbol s_0_30[2] = { 'e', 't' }; 61 | static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' }; 62 | 63 | static const struct among a_0[32] = 64 | { 65 | /* 0 */ { 3, s_0_0, -1, 1, 0}, 66 | /* 1 */ { 5, s_0_1, 0, 1, 0}, 67 | /* 2 */ { 4, s_0_2, -1, 1, 0}, 68 | /* 3 */ { 1, s_0_3, -1, 1, 0}, 69 | /* 4 */ { 5, s_0_4, 3, 1, 0}, 70 | /* 5 */ { 4, s_0_5, 3, 1, 0}, 71 | /* 6 */ { 6, s_0_6, 5, 1, 0}, 72 | /* 7 */ { 3, s_0_7, 3, 1, 0}, 73 | /* 8 */ { 4, s_0_8, 3, 1, 0}, 74 | /* 9 */ { 3, s_0_9, 3, 1, 0}, 75 | /* 10 */ { 2, s_0_10, -1, 1, 0}, 76 | /* 11 */ { 5, s_0_11, 10, 1, 0}, 77 | /* 12 */ { 4, s_0_12, 10, 1, 0}, 78 | /* 13 */ { 2, s_0_13, -1, 1, 0}, 79 | /* 14 */ { 5, s_0_14, 13, 1, 0}, 80 | /* 15 */ { 4, s_0_15, 13, 1, 0}, 81 | /* 16 */ { 1, s_0_16, -1, 2, 0}, 82 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 83 | /* 18 */ { 2, s_0_18, 16, 1, 0}, 84 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 85 | /* 20 */ { 7, 
s_0_20, 19, 1, 0}, 86 | /* 21 */ { 4, s_0_21, 18, 1, 0}, 87 | /* 22 */ { 5, s_0_22, 18, 1, 0}, 88 | /* 23 */ { 4, s_0_23, 18, 1, 0}, 89 | /* 24 */ { 3, s_0_24, 16, 1, 0}, 90 | /* 25 */ { 6, s_0_25, 24, 1, 0}, 91 | /* 26 */ { 5, s_0_26, 24, 1, 0}, 92 | /* 27 */ { 3, s_0_27, 16, 1, 0}, 93 | /* 28 */ { 3, s_0_28, 16, 1, 0}, 94 | /* 29 */ { 5, s_0_29, 28, 1, 0}, 95 | /* 30 */ { 2, s_0_30, -1, 1, 0}, 96 | /* 31 */ { 4, s_0_31, 30, 1, 0} 97 | }; 98 | 99 | static const symbol s_1_0[2] = { 'g', 'd' }; 100 | static const symbol s_1_1[2] = { 'd', 't' }; 101 | static const symbol s_1_2[2] = { 'g', 't' }; 102 | static const symbol s_1_3[2] = { 'k', 't' }; 103 | 104 | static const struct among a_1[4] = 105 | { 106 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 107 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 108 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 109 | /* 3 */ { 2, s_1_3, -1, -1, 0} 110 | }; 111 | 112 | static const symbol s_2_0[2] = { 'i', 'g' }; 113 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 114 | static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' }; 115 | static const symbol s_2_3[3] = { 'e', 'l', 's' }; 116 | static const symbol s_2_4[4] = { 'l', 0xF8, 's', 't' }; 117 | 118 | static const struct among a_2[5] = 119 | { 120 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 121 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 122 | /* 2 */ { 4, s_2_2, 1, 1, 0}, 123 | /* 3 */ { 3, s_2_3, -1, 1, 0}, 124 | /* 4 */ { 4, s_2_4, -1, 2, 0} 125 | }; 126 | 127 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 128 | 129 | static const unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 }; 130 | 131 | static const symbol s_0[] = { 's', 't' }; 132 | static const symbol s_1[] = { 'i', 'g' }; 133 | static const symbol s_2[] = { 'l', 0xF8, 's' }; 134 | 135 | static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 137 | { int c_test = z->c; /* test, line 33 */ 138 | { int ret = z->c + 3; 139 | if (0 > ret || ret > z->l) return 0; 140 | z->c 
= ret; /* hop, line 33 */ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 33 */ 143 | z->c = c_test; 144 | } 145 | if (out_grouping(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 34 */ 146 | { /* gopast */ /* non v, line 34 */ 147 | int ret = in_grouping(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 34 */ 152 | /* try, line 35 */ 153 | if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | /* r_main_suffix: with the backward limit z->lb temporarily set to the mark z->I[0], calls find_among_b over the 32-entry table a_0 and records the match in z->ket / z->bra. Result 1 calls slice_del unconditionally; result 2 (in a_0 only the bare s entry, index 16, carries it) calls slice_del only when in_grouping_b over g_s_ending returns 0. Returns 1 after the switch, 0 when the cursor is before the mark, nothing matches or the grouping test fails, or the negative value from slice_del. */ static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 41 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 41 */ 165 | mlimit = z->lb; z->lb = z->c; /* remember the old limit, then make the mark the new backward limit */ 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 41 */ 168 | /* quick reject before the table search: needs z->c > z->lb, and the byte before the cursor must be d, e, n, r, s or t (a value in 0x60..0x7f whose bit is set in mask 1851440 = 0x1C4030) - the final letters of the a_0 entries */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851440 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 32); /* substring, line 41 */ 170 | if (!(among_var)) { z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 41 */ 172 | z->lb = mlimit; /* old limit is back in place before any edit is made */ 173 | } 174 | switch(among_var) { 175 | case 0: return 0; /* cannot be reached: a zero among_var already returned above */ 176 | case 1: 177 | { int ret = slice_del(z); /* delete, line 48 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | if (in_grouping_b(z, g_s_ending, 97, 229, 0)) return 0; 183 | { int ret = slice_del(z); /* delete, line 50 */ 184 | if (ret < 0) return ret; 185 | } 186 | break; 187 | } 188 | return 1; 189 | } 190 | 191 | static int r_consonant_pair(struct SN_env * z) { 192 | { int m_test = z->l - z->c; /* test, line 55 */ 193 | { int mlimit; /* setlimit, line 56 */ 194 | int m1 = z->l - z->c; (void)m1; 195 | if (z->c < z->I[0]) return 0; 196 | z->c = z->I[0]; /* tomark, line 56 */ 197 | mlimit = z->lb; z->lb = z->c; 198 | z->c = z->l - m1; 199 | z->ket = z->c; /* [, line 56 */ 200 | if (z->c - 1 <= z->lb || (z->p[z->c - 1] !=
100 && z->p[z->c - 1] != 116)) { z->lb = mlimit; return 0; } 201 | if (!(find_among_b(z, a_1, 4))) { z->lb = mlimit; return 0; } /* substring, line 56 */ 202 | z->bra = z->c; /* ], line 56 */ 203 | z->lb = mlimit; 204 | } 205 | z->c = z->l - m_test; 206 | } 207 | if (z->c <= z->lb) return 0; 208 | z->c--; /* next, line 62 */ 209 | z->bra = z->c; /* ], line 62 */ 210 | { int ret = slice_del(z); /* delete, line 62 */ 211 | if (ret < 0) return ret; 212 | } 213 | return 1; 214 | } 215 | 216 | static int r_other_suffix(struct SN_env * z) { 217 | int among_var; 218 | { int m1 = z->l - z->c; (void)m1; /* do, line 66 */ 219 | z->ket = z->c; /* [, line 66 */ 220 | if (!(eq_s_b(z, 2, s_0))) goto lab0; 221 | z->bra = z->c; /* ], line 66 */ 222 | if (!(eq_s_b(z, 2, s_1))) goto lab0; 223 | { int ret = slice_del(z); /* delete, line 66 */ 224 | if (ret < 0) return ret; 225 | } 226 | lab0: 227 | z->c = z->l - m1; 228 | } 229 | { int mlimit; /* setlimit, line 67 */ 230 | int m2 = z->l - z->c; (void)m2; 231 | if (z->c < z->I[0]) return 0; 232 | z->c = z->I[0]; /* tomark, line 67 */ 233 | mlimit = z->lb; z->lb = z->c; 234 | z->c = z->l - m2; 235 | z->ket = z->c; /* [, line 67 */ 236 | if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 237 | among_var = find_among_b(z, a_2, 5); /* substring, line 67 */ 238 | if (!(among_var)) { z->lb = mlimit; return 0; } 239 | z->bra = z->c; /* ], line 67 */ 240 | z->lb = mlimit; 241 | } 242 | switch(among_var) { 243 | case 0: return 0; 244 | case 1: 245 | { int ret = slice_del(z); /* delete, line 70 */ 246 | if (ret < 0) return ret; 247 | } 248 | { int m3 = z->l - z->c; (void)m3; /* do, line 70 */ 249 | { int ret = r_consonant_pair(z); 250 | if (ret == 0) goto lab1; /* call consonant_pair, line 70 */ 251 | if (ret < 0) return ret; 252 | } 253 | lab1: 254 | z->c = z->l - m3; 255 | } 256 | break; 257 | case 2: 258 | { int ret = slice_from_s(z, 3, s_2); /* <-, line 72 */ 259 | 
if (ret < 0) return ret; 260 | } 261 | break; 262 | } 263 | return 1; 264 | } 265 | 266 | /* r_undouble: with the backward limit z->lb temporarily set to the mark z->I[0], requires out_grouping_b over g_v to return 0 and then stores the slice between z->bra and z->ket in z->S[0] via slice_to. With the old limit restored it requires eq_v_b(z, z->S[0]) to hold and finally calls slice_del. Returns 1 on success, 0 when the cursor is before the mark or a check fails, -1 when slice_to returns a null pointer, or the negative value from slice_del. NOTE(review): presumably this drops one of two identical trailing non-vowel symbols (rule name undouble, grouping v); confirm against the Snowball source. NOTE(review): the -1 path returns without restoring z->lb. */ static int r_undouble(struct SN_env * z) { 267 | { int mlimit; /* setlimit, line 76 */ 268 | int m1 = z->l - z->c; (void)m1; 269 | if (z->c < z->I[0]) return 0; 270 | z->c = z->I[0]; /* tomark, line 76 */ 271 | mlimit = z->lb; z->lb = z->c; /* remember the old limit, then make the mark the new backward limit */ 272 | z->c = z->l - m1; 273 | z->ket = z->c; /* [, line 76 */ 274 | if (out_grouping_b(z, g_v, 97, 248, 0)) { z->lb = mlimit; return 0; } 275 | z->bra = z->c; /* ], line 76 */ 276 | z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */ 277 | if (z->S[0] == 0) return -1; /* -> ch, line 76 */ 278 | z->lb = mlimit; 279 | } 280 | if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */ 281 | { int ret = slice_del(z); /* delete, line 78 */ 282 | if (ret < 0) return ret; 283 | } 284 | return 1; 285 | } 286 | 287 | extern int danish_ISO_8859_1_stem(struct SN_env * z) { 288 | { int c1 = z->c; /* do, line 84 */ 289 | { int ret = r_mark_regions(z); 290 | if (ret == 0) goto lab0; /* call mark_regions, line 84 */ 291 | if (ret < 0) return ret; 292 | } 293 | lab0: 294 | z->c = c1; 295 | } 296 | z->lb = z->c; z->c = z->l; /* backwards, line 85 */ 297 | 298 | { int m2 = z->l - z->c; (void)m2; /* do, line 86 */ 299 | { int ret = r_main_suffix(z); 300 | if (ret == 0) goto lab1; /* call main_suffix, line 86 */ 301 | if (ret < 0) return ret; 302 | } 303 | lab1: 304 | z->c = z->l - m2; 305 | } 306 | { int m3 = z->l - z->c; (void)m3; /* do, line 87 */ 307 | { int ret = r_consonant_pair(z); 308 | if (ret == 0) goto lab2; /* call consonant_pair, line 87 */ 309 | if (ret < 0) return ret; 310 | } 311 | lab2: 312 | z->c = z->l - m3; 313 | } 314 | { int m4 = z->l - z->c; (void)m4; /* do, line 88 */ 315 | { int ret = r_other_suffix(z); 316 | if (ret == 0) goto lab3; /* call other_suffix, line 88 */ 317 | if (ret < 0) return ret; 318 | } 319 | lab3: 320 | z->c = z->l - m4; 321 | } 322 | { int m5 = z->l - z->c; (void)m5; /* do, line 89 */ 323 | { int ret =
r_undouble(z); 324 | if (ret == 0) goto lab4; /* call undouble, line 89 */ 325 | if (ret < 0) return ret; 326 | } 327 | lab4: 328 | z->c = z->l - m5; 329 | } 330 | z->c = z->lb; 331 | return 1; 332 | } 333 | 334 | extern struct SN_env * danish_ISO_8859_1_create_env(void) { return SN_create_env(1, 2, 0); } 335 | 336 | extern void danish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 1); } 337 | 338 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_danish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int danish_UTF_8_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_undouble(struct SN_env * z); 14 | static int r_other_suffix(struct SN_env * z); 15 | static int r_consonant_pair(struct SN_env * z); 16 | static int r_main_suffix(struct SN_env * z); 17 | static int r_mark_regions(struct SN_env * z); 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | 23 | extern struct SN_env * danish_UTF_8_create_env(void); 24 | extern void danish_UTF_8_close_env(struct SN_env * z); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | static const symbol s_0_0[3] = { 'h', 'e', 'd' }; 31 | static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' }; 32 | static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' }; 33 | static const symbol s_0_3[1] = { 'e' }; 34 | static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' }; 35 | static const symbol s_0_5[4] = { 'e', 'n', 'd', 'e' }; 36 | static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' }; 37 | static const symbol s_0_7[3] = { 'e', 'n', 'e' }; 38 | static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' }; 39 | static const symbol s_0_9[3] = { 'e', 'r', 'e' }; 40 | static const symbol 
s_0_10[2] = { 'e', 'n' }; 41 | static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' }; 42 | static const symbol s_0_12[4] = { 'e', 'r', 'e', 'n' }; 43 | static const symbol s_0_13[2] = { 'e', 'r' }; 44 | static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' }; 45 | static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' }; 46 | static const symbol s_0_16[1] = { 's' }; 47 | static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' }; 48 | static const symbol s_0_18[2] = { 'e', 's' }; 49 | static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' }; 50 | static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' }; 51 | static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' }; 52 | static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' }; 53 | static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' }; 54 | static const symbol s_0_24[3] = { 'e', 'n', 's' }; 55 | static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' }; 56 | static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' }; 57 | static const symbol s_0_27[3] = { 'e', 'r', 's' }; 58 | static const symbol s_0_28[3] = { 'e', 't', 's' }; 59 | static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' }; 60 | static const symbol s_0_30[2] = { 'e', 't' }; 61 | static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' }; 62 | 63 | static const struct among a_0[32] = 64 | { 65 | /* 0 */ { 3, s_0_0, -1, 1, 0}, 66 | /* 1 */ { 5, s_0_1, 0, 1, 0}, 67 | /* 2 */ { 4, s_0_2, -1, 1, 0}, 68 | /* 3 */ { 1, s_0_3, -1, 1, 0}, 69 | /* 4 */ { 5, s_0_4, 3, 1, 0}, 70 | /* 5 */ { 4, s_0_5, 3, 1, 0}, 71 | /* 6 */ { 6, s_0_6, 5, 1, 0}, 72 | /* 7 */ { 3, s_0_7, 3, 1, 0}, 73 | /* 8 */ { 4, s_0_8, 3, 1, 0}, 74 | /* 9 */ { 3, s_0_9, 3, 1, 0}, 75 | /* 10 */ { 2, s_0_10, -1, 1, 0}, 76 | /* 11 */ { 5, s_0_11, 10, 1, 0}, 77 | /* 12 */ { 4, s_0_12, 10, 1, 0}, 78 | /* 13 */ { 2, s_0_13, -1, 1, 0}, 79 | /* 14 */ { 5, s_0_14, 13, 1, 0}, 80 | /* 15 */ { 4, s_0_15, 13, 1, 0}, 81 | /* 16 */ { 1, s_0_16, -1, 2, 0}, 82 | /* 17 */ { 4, 
s_0_17, 16, 1, 0}, 83 | /* 18 */ { 2, s_0_18, 16, 1, 0}, 84 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 85 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 86 | /* 21 */ { 4, s_0_21, 18, 1, 0}, 87 | /* 22 */ { 5, s_0_22, 18, 1, 0}, 88 | /* 23 */ { 4, s_0_23, 18, 1, 0}, 89 | /* 24 */ { 3, s_0_24, 16, 1, 0}, 90 | /* 25 */ { 6, s_0_25, 24, 1, 0}, 91 | /* 26 */ { 5, s_0_26, 24, 1, 0}, 92 | /* 27 */ { 3, s_0_27, 16, 1, 0}, 93 | /* 28 */ { 3, s_0_28, 16, 1, 0}, 94 | /* 29 */ { 5, s_0_29, 28, 1, 0}, 95 | /* 30 */ { 2, s_0_30, -1, 1, 0}, 96 | /* 31 */ { 4, s_0_31, 30, 1, 0} 97 | }; 98 | 99 | static const symbol s_1_0[2] = { 'g', 'd' }; 100 | static const symbol s_1_1[2] = { 'd', 't' }; 101 | static const symbol s_1_2[2] = { 'g', 't' }; 102 | static const symbol s_1_3[2] = { 'k', 't' }; 103 | 104 | static const struct among a_1[4] = 105 | { 106 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 107 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 108 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 109 | /* 3 */ { 2, s_1_3, -1, -1, 0} 110 | }; 111 | 112 | static const symbol s_2_0[2] = { 'i', 'g' }; 113 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 114 | static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' }; 115 | static const symbol s_2_3[3] = { 'e', 'l', 's' }; 116 | static const symbol s_2_4[5] = { 'l', 0xC3, 0xB8, 's', 't' }; 117 | 118 | static const struct among a_2[5] = 119 | { 120 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 121 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 122 | /* 2 */ { 4, s_2_2, 1, 1, 0}, 123 | /* 3 */ { 3, s_2_3, -1, 1, 0}, 124 | /* 4 */ { 5, s_2_4, -1, 2, 0} 125 | }; 126 | 127 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 128 | 129 | static const unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 }; 130 | 131 | static const symbol s_0[] = { 's', 't' }; 132 | static const symbol s_1[] = { 'i', 'g' }; 133 | static const symbol s_2[] = { 'l', 0xC3, 0xB8, 's' }; 134 | 135 | static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 
137 | { int c_test = z->c; /* test, line 33 */ 138 | { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3); 139 | if (ret < 0) return 0; 140 | z->c = ret; /* hop, line 33 */ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 33 */ 143 | z->c = c_test; /* the hop was only a test: cursor back to where it started */ 144 | } 145 | if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 34 */ 146 | { /* gopast */ /* non v, line 34 */ 147 | int ret = in_grouping_U(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 34 */ 152 | /* try, line 35 */ 153 | /* keep I[0] unless it is smaller than I[1]; in that case raise it to I[1] */ if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | /* r_main_suffix (UTF-8 build): with the backward limit z->lb temporarily set to the mark z->I[0], calls find_among_b over the 32-entry table a_0 and records the match in z->ket / z->bra. Result 1 calls slice_del unconditionally; result 2 (in a_0 only the bare s entry, index 16, carries it) calls slice_del only when in_grouping_b_U over g_s_ending returns 0. Returns 1 after the switch, 0 when the cursor is before the mark, nothing matches or the grouping test fails, or the negative value from slice_del. */ static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 41 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 41 */ 165 | mlimit = z->lb; z->lb = z->c; /* remember the old limit, then make the mark the new backward limit */ 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 41 */ 168 | /* quick reject before the table search: needs z->c > z->lb, and the byte before the cursor must be d, e, n, r, s or t (a value in 0x60..0x7f whose bit is set in mask 1851440 = 0x1C4030) - the final letters of the a_0 entries */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851440 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 32); /* substring, line 41 */ 170 | if (!(among_var)) { z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 41 */ 172 | z->lb = mlimit; /* old limit is back in place before any edit is made */ 173 | } 174 | switch(among_var) { 175 | case 0: return 0; /* cannot be reached: a zero among_var already returned above */ 176 | case 1: 177 | { int ret = slice_del(z); /* delete, line 48 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | if (in_grouping_b_U(z, g_s_ending, 97, 229, 0)) return 0; 183 | { int ret = slice_del(z); /* delete, line 50 */ 184 | if (ret < 0) return ret; 185 | } 186 | break; 187 | } 188 | return 1; 189 | } 190 | 191 | static int r_consonant_pair(struct SN_env * z) { 192 | { int m_test = z->l - z->c; /* test, line 55 */ 193 | { int mlimit; /* setlimit, line 56 */ 194 | int m1 = z->l - z->c; (void)m1; 195 | if (z->c < z->I[0]) return 0; 196 | z->c = z->I[0]; /* tomark, line 56 */
197 | mlimit = z->lb; z->lb = z->c; 198 | z->c = z->l - m1; 199 | z->ket = z->c; /* [, line 56 */ 200 | if (z->c - 1 <= z->lb || (z->p[z->c - 1] != 100 && z->p[z->c - 1] != 116)) { z->lb = mlimit; return 0; } 201 | if (!(find_among_b(z, a_1, 4))) { z->lb = mlimit; return 0; } /* substring, line 56 */ 202 | z->bra = z->c; /* ], line 56 */ 203 | z->lb = mlimit; 204 | } 205 | z->c = z->l - m_test; 206 | } 207 | { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1); 208 | if (ret < 0) return 0; 209 | z->c = ret; /* next, line 62 */ 210 | } 211 | z->bra = z->c; /* ], line 62 */ 212 | { int ret = slice_del(z); /* delete, line 62 */ 213 | if (ret < 0) return ret; 214 | } 215 | return 1; 216 | } 217 | /* r_other_suffix: suffix step that works backwards from the cursor. Step 1 (cursor restored afterwards): set ket, match the 2-byte s_0 backwards, set bra, then require the 2-byte s_1; if both match, slice_del is called on the bracketed region. Step 2: with lb temporarily moved to z->I[0], search a_2 via find_among_b(z, a_2, 5); result 1 deletes the match and then tries r_consonant_pair, result 2 replaces it with the 4-byte s_2. Returns 0 if the cursor is before I[0] or nothing in a_2 matches, 1 otherwise, and passes on any negative value from slice_del, slice_from_s or r_consonant_pair. s_0, s_1, s_2 and a_2 are defined outside this view. */ 218 | static int r_other_suffix(struct SN_env * z) { 219 | int among_var; 220 | { int m1 = z->l - z->c; (void)m1; /* do, line 66 */ 221 | z->ket = z->c; /* [, line 66 */ 222 | if (!(eq_s_b(z, 2, s_0))) goto lab0; 223 | z->bra = z->c; /* ], line 66 */ 224 | if (!(eq_s_b(z, 2, s_1))) goto lab0; 225 | { int ret = slice_del(z); /* delete, line 66 */ 226 | if (ret < 0) return ret; 227 | } 228 | lab0: 229 | z->c = z->l - m1; /* cursor restored whether or not the deletion happened */ 230 | } 231 | { int mlimit; /* setlimit, line 67 */ 232 | int m2 = z->l - z->c; (void)m2; 233 | if (z->c < z->I[0]) return 0; /* cursor is already before the I[0] mark */ 234 | z->c = z->I[0]; /* tomark, line 67 */ 235 | mlimit = z->lb; z->lb = z->c; /* remember the old limit; lb stays at I[0] until restored from mlimit */ 236 | z->c = z->l - m2; 237 | z->ket = z->c; /* [, line 67 */ 238 | /* quick pre-check before the table search: at least two bytes must lie between lb and the cursor, and the byte before the cursor must be 0x67, 0x73 or 0x74 (g, s, t): its top three bits equal 3 and its low five bits select bit 7, 19 or 20 of 1572992 (0x180080) */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 239 | among_var = find_among_b(z, a_2, 5); /* substring, line 67 */ 240 | if (!(among_var)) { z->lb = mlimit; return 0; } 241 | z->bra = z->c; /* ], line 67 */ 242 | z->lb = mlimit; 243 | } 244 | switch(among_var) { 245 | case 0: return 0; 246 | case 1: 247 | { int ret = slice_del(z); /* delete, line 70 */ 248 | if (ret < 0) return ret; 249 | } 250 | { int m3 = z->l - z->c; (void)m3; /* do, line 70 */ 251 | { int ret = r_consonant_pair(z); 252 | if (ret == 0) goto lab1; /*
call consonant_pair, line 70 */ 253 | if (ret < 0) return ret; 254 | } 255 | lab1: 256 | z->c = z->l - m3; /* cursor restored after the optional consonant_pair call */ 257 | } 258 | break; 259 | case 2: 260 | { int ret = slice_from_s(z, 4, s_2); /* <-, line 72 */ 261 | if (ret < 0) return ret; 262 | } 263 | break; 264 | } 265 | return 1; 266 | } 267 | /* r_undouble: with lb temporarily moved to z->I[0], set ket, call out_grouping_b_U(z, g_v, 97, 248, 0) and give up if it returns nonzero (presumably it steps back over one character outside the g_v grouping; g_v and the helper are defined outside this view - confirm), set bra and copy the bracketed text into z->S[0] with slice_to. Then require that same text again backwards from the cursor (eq_v_b) and call slice_del on the bracketed region. Returns 1 on success, 0 if a check fails, -1 if slice_to returns 0, or a negative value from slice_del. */ 268 | static int r_undouble(struct SN_env * z) { 269 | { int mlimit; /* setlimit, line 76 */ 270 | int m1 = z->l - z->c; (void)m1; 271 | if (z->c < z->I[0]) return 0; 272 | z->c = z->I[0]; /* tomark, line 76 */ 273 | mlimit = z->lb; z->lb = z->c; 274 | z->c = z->l - m1; 275 | z->ket = z->c; /* [, line 76 */ 276 | if (out_grouping_b_U(z, g_v, 97, 248, 0)) { z->lb = mlimit; return 0; } 277 | z->bra = z->c; /* ], line 76 */ 278 | z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */ 279 | if (z->S[0] == 0) return -1; /* -> ch, line 76 */ /* NOTE(review): this error return happens before lb is restored from mlimit on the next line */ 280 | z->lb = mlimit; 281 | } 282 | if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */ 283 | { int ret = slice_del(z); /* delete, line 78 */ 284 | if (ret < 0) return ret; 285 | } 286 | return 1; 287 | } 288 | /* danish_UTF_8_stem: entry point of the stemmer. Runs r_mark_regions from the current cursor, then switches to backward processing (lb takes the cursor position, the cursor moves to z->l) and runs r_main_suffix, r_consonant_pair, r_other_suffix and r_undouble in that order. Each call sits in a do block: a 0 result is ignored, the cursor is restored afterwards, and a negative result is returned immediately. On completion the cursor is set to lb and 1 is returned. */ 289 | extern int danish_UTF_8_stem(struct SN_env * z) { 290 | { int c1 = z->c; /* do, line 84 */ 291 | { int ret = r_mark_regions(z); 292 | if (ret == 0) goto lab0; /* call mark_regions, line 84 */ 293 | if (ret < 0) return ret; 294 | } 295 | lab0: 296 | z->c = c1; /* cursor put back where it was before mark_regions */ 297 | } 298 | z->lb = z->c; z->c = z->l; /* backwards, line 85 */ 299 | 300 | { int m2 = z->l - z->c; (void)m2; /* do, line 86 */ 301 | { int ret = r_main_suffix(z); 302 | if (ret == 0) goto lab1; /* call main_suffix, line 86 */ 303 | if (ret < 0) return ret; 304 | } 305 | lab1: 306 | z->c = z->l - m2; 307 | } 308 | { int m3 = z->l - z->c; (void)m3; /* do, line 87 */ 309 | { int ret = r_consonant_pair(z); 310 | if (ret == 0) goto lab2; /* call consonant_pair, line 87 */ 311 | if (ret < 0) return ret; 312 | } 313 | lab2: 314 | z->c = z->l - m3; 315 | } 316 | { int m4 = z->l - z->c; (void)m4; /* do, line 88 */ 317 | { int ret = r_other_suffix(z); 318 | if (ret == 0) goto
lab3; /* call other_suffix, line 88 */ 319 | if (ret < 0) return ret; 320 | } 321 | lab3: 322 | z->c = z->l - m4; 323 | } 324 | { int m5 = z->l - z->c; (void)m5; /* do, line 89 */ 325 | { int ret = r_undouble(z); 326 | if (ret == 0) goto lab4; /* call undouble, line 89 */ 327 | if (ret < 0) return ret; 328 | } 329 | lab4: 330 | z->c = z->l - m5; 331 | } 332 | z->c = z->lb; /* finish with the cursor at lb */ 333 | return 1; 334 | } 335 | /* Create an environment for this stemmer by calling SN_create_env(1, 2, 0). NOTE(review): the arguments presumably size the S, I and B arrays (the code in view uses S[0] and I[0]) - confirm against runtime/api.h. */ 336 | extern struct SN_env * danish_UTF_8_create_env(void) { return SN_create_env(1, 2, 0); } 337 | /* Release an environment with SN_close_env(z, 1); the 1 equals the first argument passed to SN_create_env in danish_UTF_8_create_env. */ 338 | extern void danish_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 1); } 339 | 340 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules.h: -------------------------------------------------------------------------------- 1 | /* libstemmer/modules.h: List of stemming modules. 2 | * 3 | * This file is generated by mkmodules.pl from a list of module names. 4 | * Do not edit manually. 5 | * 6 | * Modules included by this file are: danish, dutch, english, finnish, french, 7 | * german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | * russian, spanish, swedish, turkish 9 | */ 10 | 11 | #include "../src_c/stem_ISO_8859_1_danish.h" 12 | #include "../src_c/stem_UTF_8_danish.h" 13 | #include "../src_c/stem_ISO_8859_1_dutch.h" 14 | #include "../src_c/stem_UTF_8_dutch.h" 15 | #include "../src_c/stem_ISO_8859_1_english.h" 16 | #include "../src_c/stem_UTF_8_english.h" 17 | #include "../src_c/stem_ISO_8859_1_finnish.h" 18 | #include "../src_c/stem_UTF_8_finnish.h" 19 | #include "../src_c/stem_ISO_8859_1_french.h" 20 | #include "../src_c/stem_UTF_8_french.h" 21 | #include "../src_c/stem_ISO_8859_1_german.h" 22 | #include "../src_c/stem_UTF_8_german.h" 23 | #include "../src_c/stem_ISO_8859_1_hungarian.h" 24 | #include "../src_c/stem_UTF_8_hungarian.h" 25 | #include "../src_c/stem_ISO_8859_1_italian.h" 26 | #include "../src_c/stem_UTF_8_italian.h" 27 | #include
"../src_c/stem_ISO_8859_1_norwegian.h" 28 | #include "../src_c/stem_UTF_8_norwegian.h" 29 | #include "../src_c/stem_ISO_8859_1_porter.h" 30 | #include "../src_c/stem_UTF_8_porter.h" 31 | #include "../src_c/stem_ISO_8859_1_portuguese.h" 32 | #include "../src_c/stem_UTF_8_portuguese.h" 33 | #include "../src_c/stem_ISO_8859_2_romanian.h" 34 | #include "../src_c/stem_UTF_8_romanian.h" 35 | #include "../src_c/stem_KOI8_R_russian.h" 36 | #include "../src_c/stem_UTF_8_russian.h" 37 | #include "../src_c/stem_ISO_8859_1_spanish.h" 38 | #include "../src_c/stem_UTF_8_spanish.h" 39 | #include "../src_c/stem_ISO_8859_1_swedish.h" 40 | #include "../src_c/stem_UTF_8_swedish.h" 41 | #include "../src_c/stem_UTF_8_turkish.h" 42 | /* Character encodings a stemmer can be registered for. ENC_UNKNOWN is 0 and is also the value stored in the last row of the encodings and modules tables. */ 43 | typedef enum { 44 | ENC_UNKNOWN=0, 45 | ENC_ISO_8859_1, 46 | ENC_ISO_8859_2, 47 | ENC_KOI8_R, 48 | ENC_UTF_8 49 | } stemmer_encoding_t; 50 | /* Pairs an encoding name string with its stemmer_encoding_t value. */ 51 | struct stemmer_encoding { 52 | const char * name; 53 | stemmer_encoding_t enc; 54 | }; 55 | /* Known encoding names; the last row has a null name and ENC_UNKNOWN, presumably as the end marker for lookups. */ static struct stemmer_encoding encodings[] = { 56 | {"ISO_8859_1", ENC_ISO_8859_1}, 57 | {"ISO_8859_2", ENC_ISO_8859_2}, 58 | {"KOI8_R", ENC_KOI8_R}, 59 | {"UTF_8", ENC_UTF_8}, 60 | {0,ENC_UNKNOWN} 61 | }; 62 | /* One registered stemmer: the name it is listed under, the encoding it handles, and pointers to its create, close and stem functions. */ 63 | struct stemmer_modules { 64 | const char * name; 65 | stemmer_encoding_t enc; 66 | struct SN_env * (*create)(void); 67 | void (*close)(struct SN_env *); 68 | int (*stem)(struct SN_env *); 69 | }; 70 | /* Registry of stemmers, one row per (name, encoding) pair. The same functions are listed under several names (for example da, dan and danish). The final row is all zeros with ENC_UNKNOWN, presumably the end marker. */ static struct stemmer_modules modules[] = { 71 | {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, 72 | {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 73 | {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, 74 | {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 75 | {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, 76 | {"danish", ENC_UTF_8,
danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 77 | {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 78 | {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 79 | {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 80 | {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 81 | {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 82 | {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 83 | {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 84 | {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 85 | {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, 86 | {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 87 | {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, 88 | {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 89 | {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, 90 | {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 91 | {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 92 | {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 93 | {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 94 | {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 95 | {"fi", ENC_ISO_8859_1,
finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, 96 | {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 97 | {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, 98 | {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 99 | {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, 100 | {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 101 | {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 102 | {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 103 | {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 104 | {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 105 | {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 106 | {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 107 | {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 108 | {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 109 | {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 110 | {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 111 | {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 112 | {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 113 | {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, 114
| {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 115 | {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, 116 | {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 117 | {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, 118 | {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 119 | {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, 120 | {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 121 | {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, 122 | {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 123 | {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, 124 | {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 125 | {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 126 | {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 127 | {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 128 | {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 129 | {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, 130 | {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 131 | {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, 132 | {"nor", ENC_UTF_8,
norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 133 | {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, 134 | {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 135 | {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, 136 | {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 137 | {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem}, 138 | {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, 139 | {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, 140 | {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 141 | {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, 142 | {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 143 | {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 144 | {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 145 | {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 146 | {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 147 | {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 148 | {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 149 | {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env,
russian_KOI8_R_stem}, 150 | {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 151 | {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 152 | {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 153 | {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, 154 | {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 155 | {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, 156 | {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 157 | {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 158 | {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 159 | {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 160 | {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 161 | {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, 162 | {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 163 | {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, 164 | {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 165 | {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, 166 | {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 167 | {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 168 | {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env,
turkish_UTF_8_stem}, 169 | {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 170 | {0,ENC_UNKNOWN,0,0,0} /* last row of modules: null name, ENC_UNKNOWN and null function pointers */ 171 | }; 172 | /* Full algorithm names, matching the module list in the header comment of modules.h; the array ends with a null pointer. */ static const char * algorithm_names[] = { 173 | "danish", 174 | "dutch", 175 | "english", 176 | "finnish", 177 | "french", 178 | "german", 179 | "hungarian", 180 | "italian", 181 | "norwegian", 182 | "porter", 183 | "portuguese", 184 | "romanian", 185 | "russian", 186 | "spanish", 187 | "swedish", 188 | "turkish", 189 | 0 190 | }; 191 | --------------------------------------------------------------------------------