├── libstemmer_c ├── Makefile ├── src_c │ ├── stem_UTF_8_dutch.h │ ├── stem_UTF_8_danish.h │ ├── stem_UTF_8_english.h │ ├── stem_UTF_8_finnish.h │ ├── stem_UTF_8_french.h │ ├── stem_UTF_8_german.h │ ├── stem_UTF_8_italian.h │ ├── stem_UTF_8_porter.h │ ├── stem_UTF_8_russian.h │ ├── stem_UTF_8_spanish.h │ ├── stem_UTF_8_swedish.h │ ├── stem_UTF_8_turkish.h │ ├── stem_KOI8_R_russian.h │ ├── stem_UTF_8_romanian.h │ ├── stem_UTF_8_hungarian.h │ ├── stem_UTF_8_norwegian.h │ ├── stem_ISO_8859_1_dutch.h │ ├── stem_UTF_8_portuguese.h │ ├── stem_ISO_8859_1_danish.h │ ├── stem_ISO_8859_1_english.h │ ├── stem_ISO_8859_1_finnish.h │ ├── stem_ISO_8859_1_french.h │ ├── stem_ISO_8859_1_german.h │ ├── stem_ISO_8859_1_italian.h │ ├── stem_ISO_8859_1_porter.h │ ├── stem_ISO_8859_1_spanish.h │ ├── stem_ISO_8859_1_swedish.h │ ├── stem_ISO_8859_2_romanian.h │ ├── stem_ISO_8859_1_hungarian.h │ ├── stem_ISO_8859_1_norwegian.h │ ├── stem_ISO_8859_1_portuguese.h │ ├── stem_ISO_8859_1_norwegian.c │ ├── stem_UTF_8_norwegian.c │ ├── stem_ISO_8859_1_swedish.c │ ├── stem_UTF_8_swedish.c │ ├── stem_ISO_8859_1_danish.c │ └── stem_UTF_8_danish.c ├── runtime │ ├── api.h │ ├── api_sq3.c │ ├── header.h │ └── utilities_sq3.c ├── mkinc_utf8.mak ├── MANIFEST ├── libstemmer │ ├── libstemmer.c │ ├── libstemmer_c.in │ ├── modules_utf8.txt │ ├── libstemmer_utf8.c │ ├── modules.txt │ ├── modules_utf8.h │ └── modules.h ├── mkinc.mak ├── include │ └── libstemmer.h ├── README └── examples │ └── stemwords.c ├── sqlite3_unicodesn_tokenizer.h ├── fts3_unicodesn.h ├── README ├── fts3Int.h ├── sqlite3_unicodesn_tokenizer.c ├── Makefile ├── extension.c ├── fts3_tokenizer.h ├── fts3_unicode2.c └── fts3_unicodesn.c /libstemmer_c/Makefile: -------------------------------------------------------------------------------- 1 | include mkinc.mak 2 | CFLAGS=-Iinclude 3 | all: libstemmer.o stemwords 4 | libstemmer.o: $(snowball_sources:.c=.o) 5 | $(AR) -cru $@ $^ 6 | stemwords: examples/stemwords.o libstemmer.o 7 | $(CC) -o $@ 
$^ 8 | clean: 9 | rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o 10 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_dutch.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * dutch_UTF_8_create_env(void); 9 | extern void dutch_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int dutch_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_danish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * danish_UTF_8_create_env(void); 9 | extern void danish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int danish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_english.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * english_UTF_8_create_env(void); 9 | extern void english_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int english_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_finnish.h: 
-------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * finnish_UTF_8_create_env(void); 9 | extern void finnish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int finnish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_french.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * french_UTF_8_create_env(void); 9 | extern void french_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int french_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_german.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * german_UTF_8_create_env(void); 9 | extern void german_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int german_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_italian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct 
SN_env * italian_UTF_8_create_env(void); 9 | extern void italian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int italian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_porter.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * porter_UTF_8_create_env(void); 9 | extern void porter_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int porter_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_russian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * russian_UTF_8_create_env(void); 9 | extern void russian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int russian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_spanish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * spanish_UTF_8_create_env(void); 9 | extern void spanish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int spanish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | 
-------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_swedish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * swedish_UTF_8_create_env(void); 9 | extern void swedish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int swedish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_turkish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * turkish_UTF_8_create_env(void); 9 | extern void turkish_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int turkish_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_KOI8_R_russian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * russian_KOI8_R_create_env(void); 9 | extern void russian_KOI8_R_close_env(struct SN_env * z); 10 | 11 | extern int russian_KOI8_R_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_romanian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was 
generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * romanian_UTF_8_create_env(void); 9 | extern void romanian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int romanian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_hungarian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * hungarian_UTF_8_create_env(void); 9 | extern void hungarian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int hungarian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_norwegian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * norwegian_UTF_8_create_env(void); 9 | extern void norwegian_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int norwegian_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_dutch.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * dutch_ISO_8859_1_create_env(void); 9 | extern void 
dutch_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int dutch_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_portuguese.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * portuguese_UTF_8_create_env(void); 9 | extern void portuguese_UTF_8_close_env(struct SN_env * z); 10 | 11 | extern int portuguese_UTF_8_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_danish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * danish_ISO_8859_1_create_env(void); 9 | extern void danish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int danish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_english.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * english_ISO_8859_1_create_env(void); 9 | extern void english_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int english_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | 
-------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_finnish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * finnish_ISO_8859_1_create_env(void); 9 | extern void finnish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int finnish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_french.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * french_ISO_8859_1_create_env(void); 9 | extern void french_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int french_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_german.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * german_ISO_8859_1_create_env(void); 9 | extern void german_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int german_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_italian.h: 
-------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * italian_ISO_8859_1_create_env(void); 9 | extern void italian_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int italian_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_porter.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * porter_ISO_8859_1_create_env(void); 9 | extern void porter_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int porter_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_spanish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * spanish_ISO_8859_1_create_env(void); 9 | extern void spanish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int spanish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_swedish.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef 
__cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * swedish_ISO_8859_1_create_env(void); 9 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int swedish_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_2_romanian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * romanian_ISO_8859_2_create_env(void); 9 | extern void romanian_ISO_8859_2_close_env(struct SN_env * z); 10 | 11 | extern int romanian_ISO_8859_2_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * hungarian_ISO_8859_1_create_env(void); 9 | extern void hungarian_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int hungarian_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void); 9 | extern void norwegian_ISO_8859_1_close_env(struct 
SN_env * z); 10 | 11 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | extern struct SN_env * portuguese_ISO_8859_1_create_env(void); 9 | extern void portuguese_ISO_8859_1_close_env(struct SN_env * z); 10 | 11 | extern int portuguese_ISO_8859_1_stem(struct SN_env * z); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /sqlite3_unicodesn_tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef _UNICODESN_TOKENIZER_H_ 2 | #define _UNICODESN_TOKENIZER_H_ 3 | 4 | /* 5 | ** Registers the Unicode Snowball tokenizer as "unicodesn", for use with SQLite's FTS3 or FTS4. 6 | ** This is for use when compiling the tokenizer directly into an application, instead of as a 7 | ** separate shared library. 
Example of usage: 8 | ** CREATE VIRTUAL TABLE fts USING fts3(text, tokenize=unicodesn "stemmer=russian"); 9 | */ 10 | int register_unicodesn_tokenizer(sqlite3 *db); 11 | 12 | #endif /* _UNICODESN_TOKENIZER_H_ */ 13 | -------------------------------------------------------------------------------- /fts3_unicodesn.h: -------------------------------------------------------------------------------- 1 | #ifndef _FTS3_UNICODE_SN_H 2 | #define _FTS3_UNICODE_SN_H 3 | 4 | #include "fts3_tokenizer.h" 5 | 6 | #define TOKENIZER_NAME "unicodesn" 7 | 8 | #ifdef _MSC_VER 9 | #define UNICODE0_DLL_EXPORTED __declspec(dllexport) 10 | #else 11 | #define UNICODE0_DLL_EXPORTED __attribute__((__visibility__("default"))) 12 | #endif 13 | 14 | struct sqlite3_api_routines; 15 | 16 | void sqlite3Fts3UnicodeSnTokenizer(sqlite3_tokenizer_module const **ppModule); 17 | 18 | UNICODE0_DLL_EXPORTED int sqlite3_extension_init( 19 | sqlite3 *db, /* The database connection */ 20 | char **pzErrMsg, /* Write error messages here */ 21 | const struct sqlite3_api_routines *pApi /* API methods */ 22 | ); 23 | 24 | 25 | #endif /* _FTS3_UNICODE0_H */ 26 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | SQLite3-unicodesn 2 | ============== 3 | 4 | SQLite "unicode" full-text-search tokenizer with Snowball stemming 5 | 6 | Installation 7 | ============ 8 | 9 | $ git clone git://github.com/littlesavage/sqlite3-unicodesn.git 10 | $ cd sqlite3-unicodesn 11 | $ make 12 | $ su 13 | # make install 14 | 15 | Usage 16 | ====== 17 | 18 | $ sqlite3 19 | sqlite> .load unicodesn.sqlext 20 | sqlite> CREATE VIRTUAL TABLE fts USING fts3(text, tokenize=unicodesn "stemmer=russian"); 21 | sqlite> INSERT INTO fts VALUES ("Пионэры! Идите в жопу!"); 22 | sqlite> SELECT * FROM fts WHERE text MATCH 'Жопа'; 23 | Пионэры! Идите в жопу! 
24 | 25 | License 26 | ======= 27 | 28 | Snowball files and stemmers are covered by the BSD license. 29 | 30 | SQLite is in the Public Domain. 31 | 32 | SQLite3-unicodesn code is in the Public Domain. 33 | 34 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/api.h: -------------------------------------------------------------------------------- 1 | 2 | typedef unsigned char symbol; 3 | 4 | /* Or replace 'char' above with 'short' for 16 bit characters. 5 | 6 | More precisely, replace 'char' with whatever type guarantees the 7 | character width you need. Note however that sizeof(symbol) should divide 8 | HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise 9 | there is an alignment problem. In the unlikely event of a problem here, 10 | consult Martin Porter. 11 | 12 | */ 13 | 14 | struct SN_env { 15 | symbol * p; 16 | int c; int l; int lb; int bra; int ket; 17 | symbol * * S; 18 | int * I; 19 | unsigned char * B; 20 | }; 21 | 22 | extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); 23 | extern void SN_close_env(struct SN_env * z, int S_size); 24 | 25 | extern int SN_set_current(struct SN_env * z, int size, const symbol * s); 26 | 27 | -------------------------------------------------------------------------------- /fts3Int.h: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2009 Nov 12 3 | ** 4 | ** The author disclaims copyright to this source code. In place of 5 | ** a legal notice, here is a blessing: 6 | ** 7 | ** May you do good and not evil. 8 | ** May you find forgiveness for yourself and forgive others. 9 | ** May you share freely, never taking more than you give. 
10 | ** 11 | ****************************************************************************** 12 | ** 13 | */ 14 | #ifndef _FTSINT_H 15 | #define _FTSINT_H 16 | 17 | #include "sqlite3.h" 18 | #include "fts3_tokenizer.h" 19 | 20 | typedef unsigned char u8; /* 1-byte (or larger) unsigned integer */ 21 | typedef short int i16; /* 2-byte (or larger) signed integer */ 22 | typedef unsigned int u32; /* 4-byte unsigned integer */ 23 | typedef sqlite3_uint64 u64; /* 8-byte unsigned integer */ 24 | typedef sqlite3_int64 i64; /* 8-byte signed integer */ 25 | 26 | #define UNUSED_PARAMETER(x) (void)(x) 27 | 28 | /* fts3_unicode2.c (functions generated by parsing unicode text files) */ 29 | #ifdef SQLITE_ENABLE_FTS4_UNICODE61 30 | int sqlite3FtsUnicodeFold(int, int); 31 | int sqlite3FtsUnicodeIsalnum(int); 32 | int sqlite3FtsUnicodeIsdiacritic(int); 33 | #endif 34 | 35 | #endif /* _FTSINT_H */ 36 | -------------------------------------------------------------------------------- /sqlite3_unicodesn_tokenizer.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2013 September 22 3 | ** 4 | ** The author disclaims copyright to this source code. In place of 5 | ** a legal notice, here is a blessing: 6 | ** 7 | ** May you do good and not evil. 8 | ** May you find forgiveness for yourself and forgive others. 9 | ** May you share freely, never taking more than you give. 10 | ** 11 | ****************************************************************************** 12 | ** 13 | */ 14 | #include 15 | 16 | #include "fts3_unicodesn.h" 17 | #include "sqlite3_unicodesn_tokenizer.h" 18 | 19 | /* 20 | ** Register the tokenizer with FTS3 or FTS4. For use when compiling the tokenizer directly into 21 | ** an application, instead of as a separate shared library. 
22 | */ 23 | int register_unicodesn_tokenizer( 24 | sqlite3 *db /* The database connection */ 25 | ) 26 | { 27 | const sqlite3_tokenizer_module *tokenizer; 28 | int rc; 29 | sqlite3_stmt *pStmt; 30 | const char *zSql = "SELECT fts3_tokenizer(?, ?)"; 31 | 32 | sqlite3Fts3UnicodeSnTokenizer(&tokenizer); 33 | 34 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); 35 | if( rc!=SQLITE_OK ){ 36 | return rc; 37 | } 38 | 39 | sqlite3_bind_text(pStmt, 1, TOKENIZER_NAME, -1, SQLITE_STATIC); 40 | sqlite3_bind_blob(pStmt, 2, &tokenizer, sizeof(tokenizer), SQLITE_TRANSIENT); 41 | rc = sqlite3_step(pStmt); 42 | if( rc!=SQLITE_OK && rc < SQLITE_ROW ){ 43 | return rc; 44 | } 45 | return sqlite3_finalize(pStmt); 46 | } 47 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC?=gcc 3 | #CFLAGS=-W -Wall -g -O0 4 | CFLAGS?= -Os -DNDEBUG -s 5 | 6 | DESTDIR?= /usr 7 | 8 | STEMMERS?= danish dutch english finnish french german hungarian \ 9 | italian norwegian porter portuguese romanian russian \ 10 | spanish swedish turkish 11 | 12 | CFLAGS+= \ 13 | -DSQLITE_ENABLE_FTS4 \ 14 | -DSQLITE_ENABLE_FTS4_UNICODE61 15 | 16 | SOURCES= \ 17 | fts3_unicode2.c \ 18 | fts3_unicodesn.c \ 19 | extension.c 20 | 21 | HEADERS= fts3_tokenizer.h 22 | 23 | INCLUDES= \ 24 | -Ilibstemmer_c/runtime \ 25 | -Ilibstemmer_c/src_c 26 | 27 | LIBRARIES= -lsqlite3 28 | 29 | SNOWBALL_SOURCES= \ 30 | libstemmer_c/runtime/api_sq3.c \ 31 | libstemmer_c/runtime/utilities_sq3.c 32 | 33 | SNOWBALL_HEADERS= \ 34 | libstemmer_c/include/libstemmer.h \ 35 | libstemmer_c/runtime/api.h \ 36 | libstemmer_c/runtime/header.h 37 | 38 | SNOWBALL_SOURCES+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).c) 39 | 40 | SNOWBALL_HEADERS+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).h) 41 | 42 | SNOWBALL_FLAGS+= $(foreach s, $(STEMMERS), -DWITH_STEMMER_$(s)) 43 | 44 | all: 
unicodesn.sqlext 45 | 46 | unicodesn.sqlext: $(HEADERS) $(SOURCES) $(SNOWBALL_HEADERS) $(SNOWBALL_SOURCES) 47 | $(CC) $(CFLAGS) $(SNOWBALL_FLAGS) $(INCLUDES) -fPIC -shared -fvisibility=hidden -o $@ \ 48 | $(SOURCES) $(SNOWBALL_SOURCES) $(LIBRARIES) 49 | 50 | clean: 51 | rm -f *.o unicodesn.sqlext 52 | 53 | install: unicodesn.sqlext 54 | mkdir -p ${DESTDIR}/lib 2> /dev/null 55 | install -D -o root -g root -m 644 unicodesn.sqlext ${DESTDIR}/lib 56 | 57 | .PHONY: clean install 58 | -------------------------------------------------------------------------------- /libstemmer_c/mkinc_utf8.mak: -------------------------------------------------------------------------------- 1 | # libstemmer/mkinc_utf8.mak: List of stemming module source files 2 | # 3 | # This file is generated by mkmodules.pl from a list of module names. 4 | # Do not edit manually. 5 | # 6 | # Modules included by this file are: danish, dutch, english, finnish, french, 7 | # german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | # russian, spanish, swedish, turkish 9 | 10 | snowball_sources= \ 11 | src_c/stem_UTF_8_danish.c \ 12 | src_c/stem_UTF_8_dutch.c \ 13 | src_c/stem_UTF_8_english.c \ 14 | src_c/stem_UTF_8_finnish.c \ 15 | src_c/stem_UTF_8_french.c \ 16 | src_c/stem_UTF_8_german.c \ 17 | src_c/stem_UTF_8_hungarian.c \ 18 | src_c/stem_UTF_8_italian.c \ 19 | src_c/stem_UTF_8_norwegian.c \ 20 | src_c/stem_UTF_8_porter.c \ 21 | src_c/stem_UTF_8_portuguese.c \ 22 | src_c/stem_UTF_8_romanian.c \ 23 | src_c/stem_UTF_8_russian.c \ 24 | src_c/stem_UTF_8_spanish.c \ 25 | src_c/stem_UTF_8_swedish.c \ 26 | src_c/stem_UTF_8_turkish.c \ 27 | runtime/api.c \ 28 | runtime/utilities.c \ 29 | libstemmer/libstemmer_utf8.c 30 | 31 | snowball_headers= \ 32 | src_c/stem_UTF_8_danish.h \ 33 | src_c/stem_UTF_8_dutch.h \ 34 | src_c/stem_UTF_8_english.h \ 35 | src_c/stem_UTF_8_finnish.h \ 36 | src_c/stem_UTF_8_french.h \ 37 | src_c/stem_UTF_8_german.h \ 38 | src_c/stem_UTF_8_hungarian.h \ 39 | 
src_c/stem_UTF_8_italian.h \ 40 | src_c/stem_UTF_8_norwegian.h \ 41 | src_c/stem_UTF_8_porter.h \ 42 | src_c/stem_UTF_8_portuguese.h \ 43 | src_c/stem_UTF_8_romanian.h \ 44 | src_c/stem_UTF_8_russian.h \ 45 | src_c/stem_UTF_8_spanish.h \ 46 | src_c/stem_UTF_8_swedish.h \ 47 | src_c/stem_UTF_8_turkish.h \ 48 | include/libstemmer.h \ 49 | libstemmer/modules_utf8.h \ 50 | runtime/api.h \ 51 | runtime/header.h 52 | 53 | -------------------------------------------------------------------------------- /extension.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2012 November 11 3 | ** 4 | ** The author disclaims copyright to this source code. In place of 5 | ** a legal notice, here is a blessing: 6 | ** 7 | ** May you do good and not evil. 8 | ** May you find forgiveness for yourself and forgive others. 9 | ** May you share freely, never taking more than you give. 10 | ** 11 | ****************************************************************************** 12 | ** 13 | */ 14 | #include 15 | #include 16 | 17 | #include "fts3_unicodesn.h" 18 | 19 | SQLITE_EXTENSION_INIT1 20 | 21 | /* 22 | ** Register a tokenizer implementation with FTS3 or FTS4. 23 | */ 24 | static int registerTokenizer( 25 | sqlite3 *db, 26 | char *zName, 27 | const sqlite3_tokenizer_module *p 28 | ){ 29 | int rc; 30 | sqlite3_stmt *pStmt; 31 | const char *zSql = "SELECT fts3_tokenizer(?, ?)"; 32 | 33 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); 34 | if( rc!=SQLITE_OK ){ 35 | return rc; 36 | } 37 | 38 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); 39 | sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); 40 | sqlite3_step(pStmt); 41 | 42 | return sqlite3_finalize(pStmt); 43 | } 44 | 45 | /* SQLite invokes this routine once when it loads the extension. 46 | ** Create new functions, collating sequences, and virtual table 47 | ** modules here. This is usually the only exported symbol in 48 | ** the shared library. 
49 | */ 50 | int sqlite3_extension_init( 51 | sqlite3 *db, /* The database connection */ 52 | char **pzErrMsg, /* Write error messages here */ 53 | const sqlite3_api_routines *pApi /* API methods */ 54 | ) 55 | { 56 | const sqlite3_tokenizer_module *tokenizer; 57 | 58 | SQLITE_EXTENSION_INIT2(pApi) 59 | 60 | sqlite3Fts3UnicodeSnTokenizer(&tokenizer); 61 | 62 | registerTokenizer(db, TOKENIZER_NAME, tokenizer); 63 | 64 | return 0; 65 | } 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/api_sq3.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include /* for calloc, free */ 5 | #include "header.h" 6 | 7 | static void *local_calloc(size_t nmemb, size_t size) { 8 | void *p = sqlite3_malloc((int)(nmemb*size)); 9 | if (p == NULL) 10 | return NULL; 11 | return memset(p, 0, nmemb*size); 12 | } 13 | 14 | extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) 15 | { 16 | struct SN_env * z = (struct SN_env *) local_calloc(1, sizeof(struct SN_env)); 17 | if (z == NULL) return NULL; 18 | z->p = create_s(); 19 | if (z->p == NULL) goto error; 20 | if (S_size) 21 | { 22 | int i; 23 | z->S = (symbol * *) local_calloc(S_size, sizeof(symbol *)); 24 | if (z->S == NULL) goto error; 25 | 26 | for (i = 0; i < S_size; i++) 27 | { 28 | z->S[i] = create_s(); 29 | if (z->S[i] == NULL) goto error; 30 | } 31 | } 32 | 33 | if (I_size) 34 | { 35 | z->I = (int *) local_calloc(I_size, sizeof(int)); 36 | if (z->I == NULL) goto error; 37 | } 38 | 39 | if (B_size) 40 | { 41 | z->B = (unsigned char *) local_calloc(B_size, sizeof(unsigned char)); 42 | if (z->B == NULL) goto error; 43 | } 44 | 45 | return z; 46 | error: 47 | SN_close_env(z, S_size); 48 | return NULL; 49 | } 50 | 51 | extern void SN_close_env(struct SN_env * z, int S_size) 52 | { 53 | if (z == NULL) return; 54 | if (S_size) 55 | { 56 | int i; 57 | for (i = 0; i < S_size; i++) 58 | 
{ 59 | lose_s(z->S[i]); 60 | } 61 | sqlite3_free(z->S); 62 | } 63 | sqlite3_free(z->I); 64 | sqlite3_free(z->B); 65 | if (z->p) lose_s(z->p); 66 | sqlite3_free(z); 67 | } 68 | 69 | extern int SN_set_current(struct SN_env * z, int size, const symbol * s) 70 | { 71 | int err = replace_s(z, 0, z->l, size, s, NULL); 72 | z->c = 0; 73 | return err; 74 | } 75 | 76 | -------------------------------------------------------------------------------- /libstemmer_c/MANIFEST: -------------------------------------------------------------------------------- 1 | README 2 | src_c/stem_ISO_8859_1_danish.c 3 | src_c/stem_ISO_8859_1_danish.h 4 | src_c/stem_ISO_8859_1_dutch.c 5 | src_c/stem_ISO_8859_1_dutch.h 6 | src_c/stem_ISO_8859_1_english.c 7 | src_c/stem_ISO_8859_1_english.h 8 | src_c/stem_ISO_8859_1_finnish.c 9 | src_c/stem_ISO_8859_1_finnish.h 10 | src_c/stem_ISO_8859_1_french.c 11 | src_c/stem_ISO_8859_1_french.h 12 | src_c/stem_ISO_8859_1_german.c 13 | src_c/stem_ISO_8859_1_german.h 14 | src_c/stem_ISO_8859_1_hungarian.c 15 | src_c/stem_ISO_8859_1_hungarian.h 16 | src_c/stem_ISO_8859_1_italian.c 17 | src_c/stem_ISO_8859_1_italian.h 18 | src_c/stem_ISO_8859_1_norwegian.c 19 | src_c/stem_ISO_8859_1_norwegian.h 20 | src_c/stem_ISO_8859_1_porter.c 21 | src_c/stem_ISO_8859_1_porter.h 22 | src_c/stem_ISO_8859_1_portuguese.c 23 | src_c/stem_ISO_8859_1_portuguese.h 24 | src_c/stem_ISO_8859_1_spanish.c 25 | src_c/stem_ISO_8859_1_spanish.h 26 | src_c/stem_ISO_8859_1_swedish.c 27 | src_c/stem_ISO_8859_1_swedish.h 28 | src_c/stem_ISO_8859_2_romanian.c 29 | src_c/stem_ISO_8859_2_romanian.h 30 | src_c/stem_KOI8_R_russian.c 31 | src_c/stem_KOI8_R_russian.h 32 | src_c/stem_UTF_8_danish.c 33 | src_c/stem_UTF_8_danish.h 34 | src_c/stem_UTF_8_dutch.c 35 | src_c/stem_UTF_8_dutch.h 36 | src_c/stem_UTF_8_english.c 37 | src_c/stem_UTF_8_english.h 38 | src_c/stem_UTF_8_finnish.c 39 | src_c/stem_UTF_8_finnish.h 40 | src_c/stem_UTF_8_french.c 41 | src_c/stem_UTF_8_french.h 42 | 
src_c/stem_UTF_8_german.c 43 | src_c/stem_UTF_8_german.h 44 | src_c/stem_UTF_8_hungarian.c 45 | src_c/stem_UTF_8_hungarian.h 46 | src_c/stem_UTF_8_italian.c 47 | src_c/stem_UTF_8_italian.h 48 | src_c/stem_UTF_8_norwegian.c 49 | src_c/stem_UTF_8_norwegian.h 50 | src_c/stem_UTF_8_porter.c 51 | src_c/stem_UTF_8_porter.h 52 | src_c/stem_UTF_8_portuguese.c 53 | src_c/stem_UTF_8_portuguese.h 54 | src_c/stem_UTF_8_romanian.c 55 | src_c/stem_UTF_8_romanian.h 56 | src_c/stem_UTF_8_russian.c 57 | src_c/stem_UTF_8_russian.h 58 | src_c/stem_UTF_8_spanish.c 59 | src_c/stem_UTF_8_spanish.h 60 | src_c/stem_UTF_8_swedish.c 61 | src_c/stem_UTF_8_swedish.h 62 | src_c/stem_UTF_8_turkish.c 63 | src_c/stem_UTF_8_turkish.h 64 | runtime/api.c 65 | runtime/api.h 66 | runtime/header.h 67 | runtime/utilities.c 68 | libstemmer/libstemmer.c 69 | libstemmer/libstemmer_utf8.c 70 | libstemmer/modules.h 71 | libstemmer/modules_utf8.h 72 | include/libstemmer.h 73 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/libstemmer.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "../include/libstemmer.h" 5 | #include "../runtime/api.h" 6 | #include "modules.h" 7 | 8 | struct sb_stemmer { 9 | struct SN_env * (*create)(void); 10 | void (*close)(struct SN_env *); 11 | int (*stem)(struct SN_env *); 12 | 13 | struct SN_env * env; 14 | }; 15 | 16 | extern const char ** 17 | sb_stemmer_list(void) 18 | { 19 | return algorithm_names; 20 | } 21 | 22 | static stemmer_encoding_t 23 | sb_getenc(const char * charenc) 24 | { 25 | struct stemmer_encoding * encoding; 26 | if (charenc == NULL) return ENC_UTF_8; 27 | for (encoding = encodings; encoding->name != 0; encoding++) { 28 | if (strcmp(encoding->name, charenc) == 0) break; 29 | } 30 | if (encoding->name == NULL) return ENC_UNKNOWN; 31 | return encoding->enc; 32 | } 33 | 34 | extern struct sb_stemmer * 35 | sb_stemmer_new(const char 
* algorithm, const char * charenc) 36 | { 37 | stemmer_encoding_t enc; 38 | struct stemmer_modules * module; 39 | struct sb_stemmer * stemmer; 40 | 41 | enc = sb_getenc(charenc); 42 | if (enc == ENC_UNKNOWN) return NULL; 43 | 44 | for (module = modules; module->name != 0; module++) { 45 | if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; 46 | } 47 | if (module->name == NULL) return NULL; 48 | 49 | stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); 50 | if (stemmer == NULL) return NULL; 51 | 52 | stemmer->create = module->create; 53 | stemmer->close = module->close; 54 | stemmer->stem = module->stem; 55 | 56 | stemmer->env = stemmer->create(); 57 | if (stemmer->env == NULL) 58 | { 59 | sb_stemmer_delete(stemmer); 60 | return NULL; 61 | } 62 | 63 | return stemmer; 64 | } 65 | 66 | void 67 | sb_stemmer_delete(struct sb_stemmer * stemmer) 68 | { 69 | if (stemmer == 0) return; 70 | if (stemmer->close == 0) return; 71 | stemmer->close(stemmer->env); 72 | stemmer->close = 0; 73 | free(stemmer); 74 | } 75 | 76 | const sb_symbol * 77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) 78 | { 79 | int ret; 80 | if (SN_set_current(stemmer->env, size, (const symbol *)(word))) 81 | { 82 | stemmer->env->l = 0; 83 | return NULL; 84 | } 85 | ret = stemmer->stem(stemmer->env); 86 | if (ret < 0) return NULL; 87 | stemmer->env->p[stemmer->env->l] = 0; 88 | return (const sb_symbol *)(stemmer->env->p); 89 | } 90 | 91 | int 92 | sb_stemmer_length(struct sb_stemmer * stemmer) 93 | { 94 | return stemmer->env->l; 95 | } 96 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/libstemmer_c.in: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "../include/libstemmer.h" 5 | #include "../runtime/api.h" 6 | #include "@MODULES_H@" 7 | 8 | struct sb_stemmer { 9 | struct SN_env * (*create)(void); 10 | void 
(*close)(struct SN_env *); 11 | int (*stem)(struct SN_env *); 12 | 13 | struct SN_env * env; 14 | }; 15 | 16 | extern const char ** 17 | sb_stemmer_list(void) 18 | { 19 | return algorithm_names; 20 | } 21 | 22 | static stemmer_encoding_t 23 | sb_getenc(const char * charenc) 24 | { 25 | struct stemmer_encoding * encoding; 26 | if (charenc == NULL) return ENC_UTF_8; 27 | for (encoding = encodings; encoding->name != 0; encoding++) { 28 | if (strcmp(encoding->name, charenc) == 0) break; 29 | } 30 | if (encoding->name == NULL) return ENC_UNKNOWN; 31 | return encoding->enc; 32 | } 33 | 34 | extern struct sb_stemmer * 35 | sb_stemmer_new(const char * algorithm, const char * charenc) 36 | { 37 | stemmer_encoding_t enc; 38 | struct stemmer_modules * module; 39 | struct sb_stemmer * stemmer; 40 | 41 | enc = sb_getenc(charenc); 42 | if (enc == ENC_UNKNOWN) return NULL; 43 | 44 | for (module = modules; module->name != 0; module++) { 45 | if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; 46 | } 47 | if (module->name == NULL) return NULL; 48 | 49 | stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); 50 | if (stemmer == NULL) return NULL; 51 | 52 | stemmer->create = module->create; 53 | stemmer->close = module->close; 54 | stemmer->stem = module->stem; 55 | 56 | stemmer->env = stemmer->create(); 57 | if (stemmer->env == NULL) 58 | { 59 | sb_stemmer_delete(stemmer); 60 | return NULL; 61 | } 62 | 63 | return stemmer; 64 | } 65 | 66 | void 67 | sb_stemmer_delete(struct sb_stemmer * stemmer) 68 | { 69 | if (stemmer == 0) return; 70 | if (stemmer->close == 0) return; 71 | stemmer->close(stemmer->env); 72 | stemmer->close = 0; 73 | free(stemmer); 74 | } 75 | 76 | const sb_symbol * 77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) 78 | { 79 | int ret; 80 | if (SN_set_current(stemmer->env, size, (const symbol *)(word))) 81 | { 82 | stemmer->env->l = 0; 83 | return NULL; 84 | } 85 | ret = stemmer->stem(stemmer->env); 86 
| if (ret < 0) return NULL; 87 | stemmer->env->p[stemmer->env->l] = 0; 88 | return (const sb_symbol *)(stemmer->env->p); 89 | } 90 | 91 | int 92 | sb_stemmer_length(struct sb_stemmer * stemmer) 93 | { 94 | return stemmer->env->l; 95 | } 96 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules_utf8.txt: -------------------------------------------------------------------------------- 1 | # This file contains a list of stemmers to include in the distribution. 2 | # The format is a set of space separated lines - on each line: 3 | # First item is name of stemmer. 4 | # Second item is comma separated list of character sets. 5 | # Third item is comma separated list of names to refer to the stemmer by. 6 | # 7 | # Lines starting with a #, or blank lines, are ignored. 8 | 9 | # List all the main algorithms for each language, in UTF-8. 10 | 11 | danish UTF_8 danish,da,dan 12 | dutch UTF_8 dutch,nl,dut,nld 13 | english UTF_8 english,en,eng 14 | finnish UTF_8 finnish,fi,fin 15 | french UTF_8 french,fr,fre,fra 16 | german UTF_8 german,de,ger,deu 17 | hungarian UTF_8 hungarian,hu,hun 18 | italian UTF_8 italian,it,ita 19 | norwegian UTF_8 norwegian,no,nor 20 | portuguese UTF_8 portuguese,pt,por 21 | romanian UTF_8 romanian,ro,rum,ron 22 | russian UTF_8 russian,ru,rus 23 | spanish UTF_8 spanish,es,esl,spa 24 | swedish UTF_8 swedish,sv,swe 25 | turkish UTF_8 turkish,tr,tur 26 | 27 | # Also include the traditional porter algorithm for english. 28 | # The porter algorithm is included in the libstemmer distribution to assist 29 | # with backwards compatibility, but for new systems the english algorithm 30 | # should be used in preference. 31 | porter UTF_8 porter 32 | 33 | # Some other stemmers in the snowball project are not included in the standard 34 | # distribution. To compile a libstemmer with them in, add them to this list, 35 | # and regenerate the distribution. 
(You will need a full 36 | # this.) They are included in the snowball website as curiosities, but are not 37 | # intended for general use, and use of them is not fully supported.
module++) { 45 | if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; 46 | } 47 | if (module->name == NULL) return NULL; 48 | 49 | stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); 50 | if (stemmer == NULL) return NULL; 51 | 52 | stemmer->create = module->create; 53 | stemmer->close = module->close; 54 | stemmer->stem = module->stem; 55 | 56 | stemmer->env = stemmer->create(); 57 | if (stemmer->env == NULL) 58 | { 59 | sb_stemmer_delete(stemmer); 60 | return NULL; 61 | } 62 | 63 | return stemmer; 64 | } 65 | 66 | void 67 | sb_stemmer_delete(struct sb_stemmer * stemmer) 68 | { 69 | if (stemmer == 0) return; 70 | if (stemmer->close != NULL) 71 | { 72 | stemmer->close(stemmer->env); 73 | stemmer->close = NULL; 74 | } 75 | free(stemmer); 76 | } 77 | 78 | const sb_symbol * 79 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) 80 | { 81 | int ret; 82 | if (SN_set_current(stemmer->env, size, (const symbol *)(word))) 83 | { 84 | stemmer->env->l = 0; 85 | return NULL; 86 | } 87 | ret = stemmer->stem(stemmer->env); 88 | if (ret < 0) return NULL; 89 | stemmer->env->p[stemmer->env->l] = 0; 90 | return (const sb_symbol *)(stemmer->env->p); 91 | } 92 | 93 | int 94 | sb_stemmer_length(struct sb_stemmer * stemmer) 95 | { 96 | return stemmer->env->l; 97 | } 98 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/header.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "api.h" 5 | 6 | #define MAXINT INT_MAX 7 | #define MININT INT_MIN 8 | 9 | #define HEAD 2*sizeof(int) 10 | 11 | #define SIZE(p) ((int *)(p))[-1] 12 | #define SET_SIZE(p, n) ((int *)(p))[-1] = n 13 | #define CAPACITY(p) ((int *)(p))[-2] 14 | 15 | struct among 16 | { int s_size; /* number of chars in string */ 17 | const symbol * s; /* search string */ 18 | int substring_i;/* index to longest matching substring */ 19 | int result; /* 
result of the lookup */ 20 | int (* function)(struct SN_env *); 21 | }; 22 | 23 | extern symbol * create_s(void); 24 | extern void lose_s(symbol * p); 25 | 26 | extern int skip_utf8(const symbol * p, int c, int lb, int l, int n); 27 | 28 | extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 29 | extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 30 | extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 31 | extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 32 | 33 | extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 34 | extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 35 | extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 36 | extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 37 | 38 | extern int eq_s(struct SN_env * z, int s_size, const symbol * s); 39 | extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); 40 | extern int eq_v(struct SN_env * z, const symbol * p); 41 | extern int eq_v_b(struct SN_env * z, const symbol * p); 42 | 43 | extern int find_among(struct SN_env * z, const struct among * v, int v_size); 44 | extern int find_among_b(struct SN_env * z, const struct among * v, int v_size); 45 | 46 | extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); 47 | extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s); 48 | extern int slice_from_v(struct SN_env * z, const symbol * p); 49 | extern int slice_del(struct SN_env * z); 50 | 51 | extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); 52 | extern int insert_v(struct SN_env * z, int bra, int ket, 
const symbol * p); 53 | 54 | extern symbol * slice_to(struct SN_env * z, symbol * p); 55 | extern symbol * assign_to(struct SN_env * z, symbol * p); 56 | 57 | extern void debug(struct SN_env * z, int number, int line_count); 58 | 59 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules.txt: -------------------------------------------------------------------------------- 1 | # This file contains a list of stemmers to include in the distribution. 2 | # The format is a set of space separated lines - on each line: 3 | # First item is name of stemmer. 4 | # Second item is comma separated list of character sets. 5 | # Third item is comma separated list of names to refer to the stemmer by. 6 | # 7 | # Lines starting with a #, or blank lines, are ignored. 8 | 9 | # List all the main algorithms for each language, in UTF-8, and also with 10 | # the most commonly used encoding. 11 | 12 | danish UTF_8,ISO_8859_1 danish,da,dan 13 | dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld 14 | english UTF_8,ISO_8859_1 english,en,eng 15 | finnish UTF_8,ISO_8859_1 finnish,fi,fin 16 | french UTF_8,ISO_8859_1 french,fr,fre,fra 17 | german UTF_8,ISO_8859_1 german,de,ger,deu 18 | hungarian UTF_8,ISO_8859_1 hungarian,hu,hun 19 | italian UTF_8,ISO_8859_1 italian,it,ita 20 | norwegian UTF_8,ISO_8859_1 norwegian,no,nor 21 | portuguese UTF_8,ISO_8859_1 portuguese,pt,por 22 | romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron 23 | russian UTF_8,KOI8_R russian,ru,rus 24 | spanish UTF_8,ISO_8859_1 spanish,es,esl,spa 25 | swedish UTF_8,ISO_8859_1 swedish,sv,swe 26 | turkish UTF_8 turkish,tr,tur 27 | 28 | # Also include the traditional porter algorithm for english. 29 | # The porter algorithm is included in the libstemmer distribution to assist 30 | # with backwards compatibility, but for new systems the english algorithm 31 | # should be used in preference. 
32 | porter UTF_8,ISO_8859_1 porter 33 | 34 | # Some other stemmers in the snowball project are not included in the standard 35 | # distribution. To compile a libstemmer with them in, add them to this list, 36 | # and regenerate the distribution. (You will need a full source checkout for 37 | # this.) They are included in the snowball website as curiosities, but are not 38 | # intended for general use, and use of them is not fully supported.
5 | # 6 | # Modules included by this file are: danish, dutch, english, finnish, french, 7 | # german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | # russian, spanish, swedish, turkish 9 | 10 | snowball_sources= \ 11 | src_c/stem_ISO_8859_1_danish.c \ 12 | src_c/stem_UTF_8_danish.c \ 13 | src_c/stem_ISO_8859_1_dutch.c \ 14 | src_c/stem_UTF_8_dutch.c \ 15 | src_c/stem_ISO_8859_1_english.c \ 16 | src_c/stem_UTF_8_english.c \ 17 | src_c/stem_ISO_8859_1_finnish.c \ 18 | src_c/stem_UTF_8_finnish.c \ 19 | src_c/stem_ISO_8859_1_french.c \ 20 | src_c/stem_UTF_8_french.c \ 21 | src_c/stem_ISO_8859_1_german.c \ 22 | src_c/stem_UTF_8_german.c \ 23 | src_c/stem_ISO_8859_1_hungarian.c \ 24 | src_c/stem_UTF_8_hungarian.c \ 25 | src_c/stem_ISO_8859_1_italian.c \ 26 | src_c/stem_UTF_8_italian.c \ 27 | src_c/stem_ISO_8859_1_norwegian.c \ 28 | src_c/stem_UTF_8_norwegian.c \ 29 | src_c/stem_ISO_8859_1_porter.c \ 30 | src_c/stem_UTF_8_porter.c \ 31 | src_c/stem_ISO_8859_1_portuguese.c \ 32 | src_c/stem_UTF_8_portuguese.c \ 33 | src_c/stem_ISO_8859_2_romanian.c \ 34 | src_c/stem_UTF_8_romanian.c \ 35 | src_c/stem_KOI8_R_russian.c \ 36 | src_c/stem_UTF_8_russian.c \ 37 | src_c/stem_ISO_8859_1_spanish.c \ 38 | src_c/stem_UTF_8_spanish.c \ 39 | src_c/stem_ISO_8859_1_swedish.c \ 40 | src_c/stem_UTF_8_swedish.c \ 41 | src_c/stem_UTF_8_turkish.c \ 42 | runtime/api.c \ 43 | runtime/utilities.c \ 44 | libstemmer/libstemmer.c 45 | 46 | snowball_headers= \ 47 | src_c/stem_ISO_8859_1_danish.h \ 48 | src_c/stem_UTF_8_danish.h \ 49 | src_c/stem_ISO_8859_1_dutch.h \ 50 | src_c/stem_UTF_8_dutch.h \ 51 | src_c/stem_ISO_8859_1_english.h \ 52 | src_c/stem_UTF_8_english.h \ 53 | src_c/stem_ISO_8859_1_finnish.h \ 54 | src_c/stem_UTF_8_finnish.h \ 55 | src_c/stem_ISO_8859_1_french.h \ 56 | src_c/stem_UTF_8_french.h \ 57 | src_c/stem_ISO_8859_1_german.h \ 58 | src_c/stem_UTF_8_german.h \ 59 | src_c/stem_ISO_8859_1_hungarian.h \ 60 | src_c/stem_UTF_8_hungarian.h \ 61 | 
src_c/stem_ISO_8859_1_italian.h \ 62 | src_c/stem_UTF_8_italian.h \ 63 | src_c/stem_ISO_8859_1_norwegian.h \ 64 | src_c/stem_UTF_8_norwegian.h \ 65 | src_c/stem_ISO_8859_1_porter.h \ 66 | src_c/stem_UTF_8_porter.h \ 67 | src_c/stem_ISO_8859_1_portuguese.h \ 68 | src_c/stem_UTF_8_portuguese.h \ 69 | src_c/stem_ISO_8859_2_romanian.h \ 70 | src_c/stem_UTF_8_romanian.h \ 71 | src_c/stem_KOI8_R_russian.h \ 72 | src_c/stem_UTF_8_russian.h \ 73 | src_c/stem_ISO_8859_1_spanish.h \ 74 | src_c/stem_UTF_8_spanish.h \ 75 | src_c/stem_ISO_8859_1_swedish.h \ 76 | src_c/stem_UTF_8_swedish.h \ 77 | src_c/stem_UTF_8_turkish.h \ 78 | include/libstemmer.h \ 79 | libstemmer/modules.h \ 80 | runtime/api.h \ 81 | runtime/header.h 82 | 83 | -------------------------------------------------------------------------------- /libstemmer_c/include/libstemmer.h: -------------------------------------------------------------------------------- 1 | 2 | /* Make header file work when included from C++ */ 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | struct sb_stemmer; 8 | typedef unsigned char sb_symbol; 9 | 10 | /* FIXME - should be able to get a version number for each stemming 11 | * algorithm (which will be incremented each time the output changes). */ 12 | 13 | /** Returns an array of the names of the available stemming algorithms. 14 | * Note that these are the canonical names - aliases (ie, other names for 15 | * the same algorithm) will not be included in the list. 16 | * The list is terminated with a null pointer. 17 | * 18 | * The list must not be modified in any way. 19 | */ 20 | const char ** sb_stemmer_list(void); 21 | 22 | /** Create a new stemmer object, using the specified algorithm, for the 23 | * specified character encoding. 24 | * 25 | * All algorithms will usually be available in UTF-8, but may also be 26 | * available in other character encodings. 27 | * 28 | * @param algorithm The algorithm name. 
This is either the english 29 | * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the 30 | * language. Note that case is significant in this parameter - the 31 | * value should be supplied in lower case. 32 | * 33 | * @param charenc The character encoding. NULL may be passed as 34 | * this value, in which case UTF-8 encoding will be assumed. Otherwise, 35 | * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1), 36 | * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that 37 | * case is significant in this parameter. 38 | * 39 | * @return NULL if the specified algorithm is not recognised, or the 40 | * algorithm is not available for the requested encoding. Otherwise, 41 | * returns a pointer to a newly created stemmer for the requested algorithm. 42 | * The returned pointer must be deleted by calling sb_stemmer_delete(). 43 | * 44 | * @note NULL will also be returned if an out of memory error occurs. 45 | */ 46 | struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); 47 | 48 | /** Delete a stemmer object. 49 | * 50 | * This frees all resources allocated for the stemmer. After calling 51 | * this function, the supplied stemmer may no longer be used in any way. 52 | * 53 | * It is safe to pass a null pointer to this function - this will have 54 | * no effect. 55 | */ 56 | void sb_stemmer_delete(struct sb_stemmer * stemmer); 57 | 58 | /** Stem a word. 59 | * 60 | * The return value is owned by the stemmer - it must not be freed or 61 | * modified, and it will become invalid when the stemmer is called again, 62 | * or if the stemmer is freed. 63 | * 64 | * The length of the return value can be obtained using sb_stemmer_length(). 65 | * 66 | * If an out-of-memory error occurs, this will return NULL. 67 | */ 68 | const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, 69 | const sb_symbol * word, int size); 70 | 71 | /** Get the length of the result of the last stemmed word. 
72 | * This should not be called before sb_stemmer_stem() has been called. 73 | */ 74 | int sb_stemmer_length(struct sb_stemmer * stemmer); 75 | 76 | #ifdef __cplusplus 77 | } 78 | #endif 79 | 80 | -------------------------------------------------------------------------------- /libstemmer_c/README: -------------------------------------------------------------------------------- 1 | libstemmer_c 2 | ============ 3 | 4 | This document pertains to the C version of the libstemmer distribution, 5 | available for download from: 6 | 7 | http://snowball.tartarus.org/dist/libstemmer_c.tgz 8 | 9 | 10 | Compiling the library 11 | ===================== 12 | 13 | A simple makefile is provided for Unix style systems. On such systems, it 14 | should be possible simply to run "make", and the file "libstemmer.o" 15 | and the example program "stemwords" will be generated. 16 | 17 | If this doesn't work on your system, you need to write your own build 18 | system (or call the compiler directly). The files to compile are 19 | all contained in the "libstemmer", "runtime" and "src_c" directories, 20 | and the public header file is contained in the "include" directory. 21 | 22 | The library comes in two flavours; UTF-8 only, and UTF-8 plus other character 23 | sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of 24 | "libstemmer.c". 25 | 26 | For convenience "mkinc.mak" is a makefile fragment listing the source files and 27 | header files used to compile the standard version of the library. 28 | "mkinc_utf8.mak" is a comparable makefile fragment listing just the source 29 | files for the UTF-8 only version of the library. 30 | 31 | 32 | Using the library 33 | ================= 34 | 35 | The library provides a simple C API. Essentially, a new stemmer can 36 | be obtained by using "sb_stemmer_new". 
"sb_stemmer_stem" is then 37 | used to stem a word, "sb_stemmer_length" returns the stemmed 38 | length of the last word processed, and "sb_stemmer_delete" is 39 | used to delete a stemmer. 40 | 41 | Creating a stemmer is a relatively expensive operation - the expected 42 | usage pattern is that a new stemmer is created when needed, used 43 | to stem many words, and deleted after some time. 44 | 45 | Stemmers are re-entrant, but not threadsafe. In other words, if 46 | you wish to access the same stemmer object from multiple threads, 47 | you must ensure that all access is protected by a mutex or similar 48 | device. 49 | 50 | libstemmer does not currently incorporate any mechanism for caching the results 51 | of stemming operations. Such caching can greatly increase the performance of a 52 | stemmer under certain situations, so suitable patches will be considered for 53 | inclusion. 54 | 55 | The standard libstemmer sources contain an algorithm for each of the supported 56 | languages. The algorithm may be selected using the english name of the 57 | language, or using the 2 or 3 letter ISO 639 language codes. In addition, 58 | the traditional "Porter" stemming algorithm for english is included for 59 | backwards compatibility purposes, but we recommend use of the "English" 60 | stemmer in preference for new projects. 61 | 62 | (Some minor algorithms which are included only as curiosities in the snowball 63 | website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not 64 | included in the standard libstemmer sources. These are not really supported by 65 | the snowball project, but it would be possible to compile a modified libstemmer 66 | library containing these if desired.) 67 | 68 | 69 | The stemwords example 70 | ===================== 71 | 72 | The stemwords example program allows you to run any of the stemmers 73 | compiled into the libstemmer library on a sample vocabulary. 
For 74 | details on how to use it, run it with the "-h" command line option. 75 | 76 | 77 | Using the library in a larger system 78 | ==================================== 79 | 80 | If you are incorporating the library into the build system of a larger 81 | program, I recommend copying the unpacked tarball without modification into 82 | a subdirectory of the sources of your program. Future versions of the 83 | library are intended to keep the same structure, so this will keep the 84 | work required to move to a new version of the library to a minimum. 85 | 86 | As an additional convenience, the list of source and header files used 87 | in the library is detailed in mkinc.mak - a file which is in a suitable 88 | format for inclusion by a Makefile. By including this file in your build 89 | system, you can link the snowball system into your program with a few 90 | extra rules. 91 | 92 | Using the library in a system using GNU autotools 93 | ================================================= 94 | 95 | The libstemmer_c library can be integrated into a larger system which uses the 96 | GNU autotool framework (and in particular, automake and autoconf) as follows: 97 | 98 | 1) Unpack libstemmer_c.tgz in the top level project directory so that there is 99 | a libstemmer_c subdirectory of the top level directory of the project. 100 | 101 | 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: 102 | 103 | noinst_LTLIBRARIES = libstemmer.la 104 | include $(srcdir)/mkinc.mak 105 | noinst_HEADERS = $(snowball_headers) 106 | libstemmer_la_SOURCES = $(snowball_sources) 107 | 108 | (You may also need to add other lines to this, for example, if you are using 109 | compiler options which are not compatible with compiling the libstemmer 110 | library.) 111 | 112 | 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's 113 | configure.ac file. 
114 | 115 | 4) Add to the top level makefile the following lines (or modify existing 116 | assignments to these variables appropriately): 117 | 118 | AUTOMAKE_OPTIONS = subdir-objects 119 | AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include 120 | SUBDIRS=libstemmer_c 121 | _LIBADD = libstemmer_c/libstemmer.la 122 | 123 | (Where is the name of the library or executable which links against 124 | libstemmer.) 125 | 126 | -------------------------------------------------------------------------------- /libstemmer_c/examples/stemwords.c: -------------------------------------------------------------------------------- 1 | /* This is a simple program which uses libstemmer to provide a command 2 | * line interface for stemming using any of the algorithms provided. 3 | */ 4 | 5 | #include 6 | #include /* for malloc, free */ 7 | #include /* for memmove */ 8 | #include /* for isupper, tolower */ 9 | 10 | #include "libstemmer.h" 11 | 12 | const char * progname; 13 | static int pretty = 1; 14 | 15 | static void 16 | stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) 17 | { 18 | #define INC 10 19 | int lim = INC; 20 | sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); 21 | 22 | while(1) { 23 | int ch = getc(f_in); 24 | if (ch == EOF) { 25 | free(b); return; 26 | } 27 | { 28 | int i = 0; 29 | int inlen = 0; 30 | while(1) { 31 | if (ch == '\n' || ch == EOF) break; 32 | if (i == lim) { 33 | sb_symbol * newb; 34 | newb = (sb_symbol *) 35 | realloc(b, (lim + INC) * sizeof(sb_symbol)); 36 | if (newb == 0) goto error; 37 | b = newb; 38 | lim = lim + INC; 39 | } 40 | /* Update count of utf-8 characters. 
*/ 41 | if (ch < 0x80 || ch > 0xBF) inlen += 1; 42 | /* force lower case: */ 43 | if (isupper(ch)) ch = tolower(ch); 44 | 45 | b[i] = ch; 46 | i++; 47 | ch = getc(f_in); 48 | } 49 | 50 | { 51 | const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); 52 | if (stemmed == NULL) 53 | { 54 | fprintf(stderr, "Out of memory"); 55 | exit(1); 56 | } 57 | else 58 | { 59 | if (pretty == 1) { 60 | fwrite(b, i, 1, f_out); 61 | fputs(" -> ", f_out); 62 | } else if (pretty == 2) { 63 | fwrite(b, i, 1, f_out); 64 | if (sb_stemmer_length(stemmer) > 0) { 65 | int j; 66 | if (inlen < 30) { 67 | for (j = 30 - inlen; j > 0; j--) 68 | fputs(" ", f_out); 69 | } else { 70 | fputs("\n", f_out); 71 | for (j = 30; j > 0; j--) 72 | fputs(" ", f_out); 73 | } 74 | } 75 | } 76 | 77 | fputs((char *)stemmed, f_out); 78 | putc('\n', f_out); 79 | } 80 | } 81 | } 82 | } 83 | error: 84 | if (b != 0) free(b); 85 | return; 86 | } 87 | 88 | /** Display the command line syntax, and then exit. 89 | * @param n The value to exit with. 90 | */ 91 | static void 92 | usage(int n) 93 | { 94 | printf("usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h]\n" 95 | "\n" 96 | "The input file consists of a list of words to be stemmed, one per\n" 97 | "line. Words should be in lower case, but (for English) A-Z letters\n" 98 | "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" 99 | "used.\n" 100 | "\n" 101 | "If -c is given, the argument is the character encoding of the input\n" 102 | "and output files. 
If it is omitted, the UTF-8 encoding is used.\n" 103 | "\n" 104 | "If -p is given the output file consists of each word of the input\n" 105 | "file followed by \"->\" followed by its stemmed equivalent.\n" 106 | "If -p2 is given the output file is a two column layout containing\n" 107 | "the input words in the first column and the stemmed eqivalents in\n" 108 | "the second column.\n" 109 | "Otherwise, the output file consists of the stemmed words, one per\n" 110 | "line.\n" 111 | "\n" 112 | "-h displays this help\n", 113 | progname); 114 | exit(n); 115 | } 116 | 117 | int 118 | main(int argc, char * argv[]) 119 | { 120 | char * in = 0; 121 | char * out = 0; 122 | FILE * f_in; 123 | FILE * f_out; 124 | struct sb_stemmer * stemmer; 125 | 126 | char * language = "english"; 127 | char * charenc = NULL; 128 | 129 | char * s; 130 | int i = 1; 131 | pretty = 0; 132 | 133 | progname = argv[0]; 134 | 135 | while(i < argc) { 136 | s = argv[i++]; 137 | if (s[0] == '-') { 138 | if (strcmp(s, "-o") == 0) { 139 | if (i >= argc) { 140 | fprintf(stderr, "%s requires an argument\n", s); 141 | exit(1); 142 | } 143 | out = argv[i++]; 144 | } else if (strcmp(s, "-i") == 0) { 145 | if (i >= argc) { 146 | fprintf(stderr, "%s requires an argument\n", s); 147 | exit(1); 148 | } 149 | in = argv[i++]; 150 | } else if (strcmp(s, "-l") == 0) { 151 | if (i >= argc) { 152 | fprintf(stderr, "%s requires an argument\n", s); 153 | exit(1); 154 | } 155 | language = argv[i++]; 156 | } else if (strcmp(s, "-c") == 0) { 157 | if (i >= argc) { 158 | fprintf(stderr, "%s requires an argument\n", s); 159 | exit(1); 160 | } 161 | charenc = argv[i++]; 162 | } else if (strcmp(s, "-p2") == 0) { 163 | pretty = 2; 164 | } else if (strcmp(s, "-p") == 0) { 165 | pretty = 1; 166 | } else if (strcmp(s, "-h") == 0) { 167 | usage(0); 168 | } else { 169 | fprintf(stderr, "option %s unknown\n", s); 170 | usage(1); 171 | } 172 | } else { 173 | fprintf(stderr, "unexpected parameter %s\n", s); 174 | usage(1); 175 | } 176 | 
} 177 | 178 | /* prepare the files */ 179 | f_in = (in == 0) ? stdin : fopen(in, "r"); 180 | if (f_in == 0) { 181 | fprintf(stderr, "file %s not found\n", in); 182 | exit(1); 183 | } 184 | f_out = (out == 0) ? stdout : fopen(out, "w"); 185 | if (f_out == 0) { 186 | fprintf(stderr, "file %s cannot be opened\n", out); 187 | exit(1); 188 | } 189 | 190 | /* do the stemming process: */ 191 | stemmer = sb_stemmer_new(language, charenc); 192 | if (stemmer == 0) { 193 | if (charenc == NULL) { 194 | fprintf(stderr, "language `%s' not available for stemming\n", language); 195 | exit(1); 196 | } else { 197 | fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); 198 | exit(1); 199 | } 200 | } 201 | stem_file(stemmer, f_in, f_out); 202 | sb_stemmer_delete(stemmer); 203 | 204 | if (in != 0) (void) fclose(f_in); 205 | if (out != 0) (void) fclose(f_out); 206 | 207 | return 0; 208 | } 209 | 210 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules_utf8.h: -------------------------------------------------------------------------------- 1 | /* libstemmer/modules_utf8.h: List of stemming modules. 2 | * 3 | * This file is generated by mkmodules.pl from a list of module names. 4 | * Do not edit manually. 
5 | * 6 | * Modules included by this file are: danish, dutch, english, finnish, french, 7 | * german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | * russian, spanish, swedish, turkish 9 | */ 10 | 11 | #include "../src_c/stem_UTF_8_danish.h" 12 | #include "../src_c/stem_UTF_8_dutch.h" 13 | #include "../src_c/stem_UTF_8_english.h" 14 | #include "../src_c/stem_UTF_8_finnish.h" 15 | #include "../src_c/stem_UTF_8_french.h" 16 | #include "../src_c/stem_UTF_8_german.h" 17 | #include "../src_c/stem_UTF_8_hungarian.h" 18 | #include "../src_c/stem_UTF_8_italian.h" 19 | #include "../src_c/stem_UTF_8_norwegian.h" 20 | #include "../src_c/stem_UTF_8_porter.h" 21 | #include "../src_c/stem_UTF_8_portuguese.h" 22 | #include "../src_c/stem_UTF_8_romanian.h" 23 | #include "../src_c/stem_UTF_8_russian.h" 24 | #include "../src_c/stem_UTF_8_spanish.h" 25 | #include "../src_c/stem_UTF_8_swedish.h" 26 | #include "../src_c/stem_UTF_8_turkish.h" 27 | 28 | typedef enum { 29 | ENC_UNKNOWN=0, 30 | ENC_UTF_8 31 | } stemmer_encoding_t; 32 | 33 | struct stemmer_encoding { 34 | const char * name; 35 | stemmer_encoding_t enc; 36 | }; 37 | static struct stemmer_encoding encodings[] = { 38 | {"UTF_8", ENC_UTF_8}, 39 | {0,ENC_UNKNOWN} 40 | }; 41 | 42 | struct stemmer_modules { 43 | const char * name; 44 | stemmer_encoding_t enc; 45 | struct SN_env * (*create)(void); 46 | void (*close)(struct SN_env *); 47 | int (*stem)(struct SN_env *); 48 | }; 49 | static struct stemmer_modules modules[] = { 50 | {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 51 | {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 52 | {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 53 | {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 54 | {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 55 | {"dut", ENC_UTF_8, 
dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 56 | {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 57 | {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 58 | {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 59 | {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 60 | {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 61 | {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 62 | {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 63 | {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 64 | {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 65 | {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 66 | {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 67 | {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 68 | {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 69 | {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 70 | {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 71 | {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 72 | {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 73 | {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 74 | {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 75 | {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 76 
| {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 77 | {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 78 | {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 79 | {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 80 | {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 81 | {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 82 | {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 83 | {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, 84 | {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 85 | {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 86 | {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 87 | {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 88 | {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 89 | {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 90 | {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 91 | {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 92 | {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 93 | {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 94 | {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 95 | {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 96 | {"swe", 
ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 97 | {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 98 | {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 99 | {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 100 | {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 101 | {0,ENC_UNKNOWN,0,0,0} 102 | }; 103 | static const char * algorithm_names[] = { 104 | "danish", 105 | "dutch", 106 | "english", 107 | "finnish", 108 | "french", 109 | "german", 110 | "hungarian", 111 | "italian", 112 | "norwegian", 113 | "porter", 114 | "portuguese", 115 | "romanian", 116 | "russian", 117 | "spanish", 118 | "swedish", 119 | "turkish", 120 | 0 121 | }; 122 | -------------------------------------------------------------------------------- /fts3_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2006 July 10 3 | ** 4 | ** The author disclaims copyright to this source code. 5 | ** 6 | ************************************************************************* 7 | ** Defines the interface to tokenizers used by fulltext-search. There 8 | ** are three basic components: 9 | ** 10 | ** sqlite3_tokenizer_module is a singleton defining the tokenizer 11 | ** interface functions. This is essentially the class structure for 12 | ** tokenizers. 13 | ** 14 | ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps 15 | ** including customization information defined at creation time. 16 | ** 17 | ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate 18 | ** tokens from a particular input. 19 | */ 20 | #ifndef _FTS3_TOKENIZER_H_ 21 | #define _FTS3_TOKENIZER_H_ 22 | 23 | /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. 
24 | ** If tokenizers are to be allowed to call sqlite3_*() functions, then 25 | ** we will need a way to register the API consistently. 26 | */ 27 | #include "sqlite3.h" 28 | 29 | /* 30 | ** Structures used by the tokenizer interface. When a new tokenizer 31 | ** implementation is registered, the caller provides a pointer to 32 | ** an sqlite3_tokenizer_module containing pointers to the callback 33 | ** functions that make up an implementation. 34 | ** 35 | ** When an fts3 table is created, it passes any arguments passed to 36 | ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the 37 | ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer 38 | ** implementation. The xCreate() function in turn returns an 39 | ** sqlite3_tokenizer structure representing the specific tokenizer to 40 | ** be used for the fts3 table (customized by the tokenizer clause arguments). 41 | ** 42 | ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen() 43 | ** method is called. It returns an sqlite3_tokenizer_cursor object 44 | ** that may be used to tokenize a specific input buffer based on 45 | ** the tokenization rules supplied by a specific sqlite3_tokenizer 46 | ** object. 47 | */ 48 | typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; 49 | typedef struct sqlite3_tokenizer sqlite3_tokenizer; 50 | typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; 51 | 52 | struct sqlite3_tokenizer_module { 53 | 54 | /* 55 | ** Structure version. Should always be set to 0 or 1. 56 | */ 57 | int iVersion; 58 | 59 | /* 60 | ** Create a new tokenizer. The values in the argv[] array are the 61 | ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL 62 | ** TABLE statement that created the fts3 table. For example, if 63 | ** the following SQL is executed: 64 | ** 65 | ** CREATE .. USING fts3( ... 
, tokenizer arg1 arg2) 66 | ** 67 | ** then argc is set to 2, and the argv[] array contains pointers 68 | ** to the strings "arg1" and "arg2". 69 | ** 70 | ** This method should return either SQLITE_OK (0), or an SQLite error 71 | ** code. If SQLITE_OK is returned, then *ppTokenizer should be set 72 | ** to point at the newly created tokenizer structure. The generic 73 | ** sqlite3_tokenizer.pModule variable should not be initialised by 74 | ** this callback. The caller will do so. 75 | */ 76 | int (*xCreate)( 77 | int argc, /* Size of argv array */ 78 | const char *const*argv, /* Tokenizer argument strings */ 79 | sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ 80 | ); 81 | 82 | /* 83 | ** Destroy an existing tokenizer. The fts3 module calls this method 84 | ** exactly once for each successful call to xCreate(). 85 | */ 86 | int (*xDestroy)(sqlite3_tokenizer *pTokenizer); 87 | 88 | /* 89 | ** Create a tokenizer cursor to tokenize an input buffer. The caller 90 | ** is responsible for ensuring that the input buffer remains valid 91 | ** until the cursor is closed (using the xClose() method). 92 | */ 93 | int (*xOpen)( 94 | sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ 95 | const char *pInput, int nBytes, /* Input buffer */ 96 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ 97 | ); 98 | 99 | /* 100 | ** Destroy an existing tokenizer cursor. The fts3 module calls this 101 | ** method exactly once for each successful call to xOpen(). 102 | */ 103 | int (*xClose)(sqlite3_tokenizer_cursor *pCursor); 104 | 105 | /* 106 | ** Retrieve the next token from the tokenizer cursor pCursor. This 107 | ** method should either return SQLITE_OK and set the values of the 108 | ** "OUT" variables identified below, or SQLITE_DONE to indicate that 109 | ** the end of the buffer has been reached, or an SQLite error code. 
110 | ** 111 | ** *ppToken should be set to point at a buffer containing the 112 | ** normalized version of the token (i.e. after any case-folding and/or 113 | ** stemming has been performed). *pnBytes should be set to the length 114 | ** of this buffer in bytes. The input text that generated the token is 115 | ** identified by the byte offsets returned in *piStartOffset and 116 | ** *piEndOffset. *piStartOffset should be set to the index of the first 117 | ** byte of the token in the input buffer. *piEndOffset should be set 118 | ** to the index of the first byte just past the end of the token in 119 | ** the input buffer. 120 | ** 121 | ** The buffer *ppToken is set to point at is managed by the tokenizer 122 | ** implementation. It is only required to be valid until the next call 123 | ** to xNext() or xClose(). 124 | */ 125 | /* TODO(shess) current implementation requires pInput to be 126 | ** nul-terminated. This should either be fixed, or pInput/nBytes 127 | ** should be converted to zInput. 128 | */ 129 | int (*xNext)( 130 | sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */ 131 | const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */ 132 | int *piStartOffset, /* OUT: Byte offset of token in input buffer */ 133 | int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ 134 | int *piPosition /* OUT: Number of tokens returned before this one */ 135 | ); 136 | 137 | /*********************************************************************** 138 | ** Methods below this point are only available if iVersion>=1. 139 | */ 140 | 141 | /* 142 | ** Configure the language id of a tokenizer cursor. 
143 | */ 144 | int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid); 145 | }; 146 | 147 | struct sqlite3_tokenizer { 148 | const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ 149 | /* Tokenizer implementations will typically add additional fields */ 150 | }; 151 | 152 | struct sqlite3_tokenizer_cursor { 153 | sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */ 154 | /* Tokenizer implementations will typically add additional fields */ 155 | }; 156 | 157 | int fts3_global_term_cnt(int iTerm, int iCol); 158 | int fts3_term_cnt(int iTerm, int iCol); 159 | 160 | 161 | #endif /* _FTS3_TOKENIZER_H_ */ 162 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_other_suffix(struct SN_env * z); 14 | static int r_consonant_pair(struct SN_env * z); 15 | static int r_main_suffix(struct SN_env * z); 16 | static int r_mark_regions(struct SN_env * z); 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | 22 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void); 23 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z); 24 | 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | static const symbol s_0_0[1] = { 'a' }; 30 | static const symbol s_0_1[1] = { 'e' }; 31 | static const symbol s_0_2[3] = { 'e', 'd', 'e' }; 32 | static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' }; 33 | static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' }; 34 | static const symbol s_0_5[3] = { 'a', 'n', 'e' }; 35 | static const symbol s_0_6[3] = { 'e', 'n', 'e' }; 36 | static 
const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' }; 37 | static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' }; 38 | static const symbol s_0_9[2] = { 'e', 'n' }; 39 | static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' }; 40 | static const symbol s_0_11[2] = { 'a', 'r' }; 41 | static const symbol s_0_12[2] = { 'e', 'r' }; 42 | static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' }; 43 | static const symbol s_0_14[1] = { 's' }; 44 | static const symbol s_0_15[2] = { 'a', 's' }; 45 | static const symbol s_0_16[2] = { 'e', 's' }; 46 | static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' }; 47 | static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' }; 48 | static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' }; 49 | static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' }; 50 | static const symbol s_0_21[3] = { 'e', 'n', 's' }; 51 | static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 52 | static const symbol s_0_23[3] = { 'e', 'r', 's' }; 53 | static const symbol s_0_24[3] = { 'e', 't', 's' }; 54 | static const symbol s_0_25[2] = { 'e', 't' }; 55 | static const symbol s_0_26[3] = { 'h', 'e', 't' }; 56 | static const symbol s_0_27[3] = { 'e', 'r', 't' }; 57 | static const symbol s_0_28[3] = { 'a', 's', 't' }; 58 | 59 | static const struct among a_0[29] = 60 | { 61 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 62 | /* 1 */ { 1, s_0_1, -1, 1, 0}, 63 | /* 2 */ { 3, s_0_2, 1, 1, 0}, 64 | /* 3 */ { 4, s_0_3, 1, 1, 0}, 65 | /* 4 */ { 4, s_0_4, 1, 1, 0}, 66 | /* 5 */ { 3, s_0_5, 1, 1, 0}, 67 | /* 6 */ { 3, s_0_6, 1, 1, 0}, 68 | /* 7 */ { 6, s_0_7, 6, 1, 0}, 69 | /* 8 */ { 4, s_0_8, 1, 3, 0}, 70 | /* 9 */ { 2, s_0_9, -1, 1, 0}, 71 | /* 10 */ { 5, s_0_10, 9, 1, 0}, 72 | /* 11 */ { 2, s_0_11, -1, 1, 0}, 73 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 74 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 75 | /* 14 */ { 1, s_0_14, -1, 2, 0}, 76 | /* 15 */ { 2, s_0_15, 14, 1, 0}, 77 | /* 16 */ { 2, s_0_16, 14, 1, 0}, 78 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 79 | /* 18 */ { 5, 
s_0_18, 16, 1, 0}, 80 | /* 19 */ { 4, s_0_19, 16, 1, 0}, 81 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 82 | /* 21 */ { 3, s_0_21, 14, 1, 0}, 83 | /* 22 */ { 6, s_0_22, 21, 1, 0}, 84 | /* 23 */ { 3, s_0_23, 14, 1, 0}, 85 | /* 24 */ { 3, s_0_24, 14, 1, 0}, 86 | /* 25 */ { 2, s_0_25, -1, 1, 0}, 87 | /* 26 */ { 3, s_0_26, 25, 1, 0}, 88 | /* 27 */ { 3, s_0_27, -1, 3, 0}, 89 | /* 28 */ { 3, s_0_28, -1, 1, 0} 90 | }; 91 | 92 | static const symbol s_1_0[2] = { 'd', 't' }; 93 | static const symbol s_1_1[2] = { 'v', 't' }; 94 | 95 | static const struct among a_1[2] = 96 | { 97 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 98 | /* 1 */ { 2, s_1_1, -1, -1, 0} 99 | }; 100 | 101 | static const symbol s_2_0[3] = { 'l', 'e', 'g' }; 102 | static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' }; 103 | static const symbol s_2_2[2] = { 'i', 'g' }; 104 | static const symbol s_2_3[3] = { 'e', 'i', 'g' }; 105 | static const symbol s_2_4[3] = { 'l', 'i', 'g' }; 106 | static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' }; 107 | static const symbol s_2_6[3] = { 'e', 'l', 's' }; 108 | static const symbol s_2_7[3] = { 'l', 'o', 'v' }; 109 | static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' }; 110 | static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' }; 111 | static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' }; 112 | 113 | static const struct among a_2[11] = 114 | { 115 | /* 0 */ { 3, s_2_0, -1, 1, 0}, 116 | /* 1 */ { 4, s_2_1, 0, 1, 0}, 117 | /* 2 */ { 2, s_2_2, -1, 1, 0}, 118 | /* 3 */ { 3, s_2_3, 2, 1, 0}, 119 | /* 4 */ { 3, s_2_4, 2, 1, 0}, 120 | /* 5 */ { 4, s_2_5, 4, 1, 0}, 121 | /* 6 */ { 3, s_2_6, -1, 1, 0}, 122 | /* 7 */ { 3, s_2_7, -1, 1, 0}, 123 | /* 8 */ { 4, s_2_8, 7, 1, 0}, 124 | /* 9 */ { 4, s_2_9, 7, 1, 0}, 125 | /* 10 */ { 7, s_2_10, 9, 1, 0} 126 | }; 127 | 128 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 129 | 130 | static const unsigned char g_s_ending[] = { 119, 125, 149, 1 }; 131 | 132 | static const symbol s_0[] = { 
'k' }; /* s_0: the single symbol compared by eq_s_b() in r_main_suffix, case 2 */
133 | static const symbol s_1[] = { 'e', 'r' }; /* replacement text written by slice_from_s() in r_main_suffix, case 3 */
134 | /* Compute the two marks the suffix routines rely on and store them in z->I:
 *   I[1] ("x")  - the position 3 symbols after the starting cursor;
 *   I[0] ("p1") - the position reached by the goto/gopast pair below over
 *                 grouping g_v (Snowball grouping "v", presumably the
 *                 vowels), raised to x when it would lie before x.
 * Returns 0 if the word is shorter than 3 symbols or either grouping
 * search reports failure (negative result); I[0] is then left at z->l.
 * Returns 1 otherwise.  The cursor is not restored here.
 */
135 | static int r_mark_regions(struct SN_env * z) {
136 |     z->I[0] = z->l; /* default p1: the end limit, i.e. an empty suffix region */
137 |     { int c_test = z->c; /* test, line 30 */
138 |         { int ret = z->c + 3;
139 |             if (0 > ret || ret > z->l) return 0; /* fewer than 3 symbols available */
140 |             z->c = ret; /* hop, line 30 */
141 |         }
142 |         z->I[1] = z->c; /* setmark x, line 30 */
143 |         z->c = c_test; /* "test" never moves the cursor */
144 |     }
145 |     if (out_grouping(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */
146 |     { /* gopast */ /* non v, line 31 */
147 |         int ret = in_grouping(z, g_v, 97, 248, 1);
148 |         if (ret < 0) return 0;
149 |         z->c += ret;
150 |     }
151 |     z->I[0] = z->c; /* setmark p1, line 31 */
152 |     /* try, line 32 */
153 |     if (!(z->I[0] < z->I[1])) goto lab0;
154 |     z->I[0] = z->I[1]; /* p1 < x: move p1 up to x */
155 | lab0:
156 |     return 1;
157 | }
158 | /* Handle the longest suffix from table a_0 found at the end of the word,
 * looking no further left than p1 (z->I[0]).  The action depends on the
 * class find_among_b() reports:
 *   1 - delete the suffix;
 *   2 - delete it only if the preceding symbol passes the g_s_ending test,
 *       or is 'k' (s_0) and then passes the out_grouping_b(g_v) test;
 *   3 - replace it with "er" (s_1).
 * Returns 0 when nothing applies, 1 after a successful edit, and passes on
 * a negative result from slice_del()/slice_from_s().
 */
159 | static int r_main_suffix(struct SN_env * z) {
160 |     int among_var;
161 |     { int mlimit; /* setlimit, line 38 */
162 |         int m1 = z->l - z->c; (void)m1;
163 |         if (z->c < z->I[0]) return 0;
164 |         z->c = z->I[0]; /* tomark, line 38 */
165 |         mlimit = z->lb; z->lb = z->c; /* temporarily make p1 the backward limit */
166 |         z->c = z->l - m1;
167 |         z->ket = z->c; /* [, line 38 */
168 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: the mask admits only a final a, e, n, r, s or t */
169 |         among_var = find_among_b(z, a_0, 29); /* substring, line 38 */
170 |         if (!(among_var)) { z->lb = mlimit; return 0; }
171 |         z->bra = z->c; /* ], line 38 */
172 |         z->lb = mlimit; /* restore the caller's backward limit */
173 |     }
174 |     switch(among_var) {
175 |         case 0: return 0;
176 |         case 1:
177 |             { int ret = slice_del(z); /* delete, line 44 */
178 |                 if (ret < 0) return ret;
179 |             }
180 |             break;
181 |         case 2:
182 |             { int m2 = z->l - z->c; (void)m2; /* or, line 46 */
183 |                 if (in_grouping_b(z, g_s_ending, 98, 122, 0)) goto lab1; /* non-zero: first alternative failed */
184 |                 goto lab0;
185 |             lab1:
186 |                 z->c = z->l - m2; /* rewind before trying the second alternative */
187 |                 if (!(eq_s_b(z, 1, s_0))) return 0;
188 |                 if (out_grouping_b(z, g_v, 97, 248, 0)) return 0;
189 |             }
190 |         lab0:
191 |             { int ret = slice_del(z); /* delete, line 46 */
192 |                 if (ret < 0) return ret;
193 |             }
194 |             break;
195 |         case 3:
196 |             { int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */
197 |                 if (ret < 0) return ret;
198 |             }
199 |             break;
200 |     }
201 |     return 1;
202 | }
203 | /* If the word (within the p1 region) ends in one of the a_1 pairs "dt" or
 * "vt", delete its last symbol.  The pair is only tested for: the cursor is
 * restored afterwards and the slice that gets deleted is the single symbol
 * before the end.  Returns 0 when no pair is present, 1 after the deletion,
 * or a negative result from slice_del().
 */
204 | static int r_consonant_pair(struct SN_env * z) {
205 |     { int m_test = z->l - z->c; /* test, line 53 */
206 |         { int mlimit; /* setlimit, line 54 */
207 |             int m1 = z->l - z->c; (void)m1;
208 |             if (z->c < z->I[0]) return 0;
209 |             z->c = z->I[0]; /* tomark, line 54 */
210 |             mlimit = z->lb; z->lb = z->c; /* temporarily make p1 the backward limit */
211 |             z->c = z->l - m1;
212 |             z->ket = z->c; /* [, line 54 */
213 |             if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; } /* quick reject: need two symbols, the last being 't' (116) */
214 |             if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */
215 |             z->bra = z->c; /* ], line 54 */
216 |             z->lb = mlimit;
217 |         }
218 |         z->c = z->l - m_test; /* undo cursor movement made by the test */
219 |     }
220 |     if (z->c <= z->lb) return 0;
221 |     z->c--; /* next, line 59 */
222 |     z->bra = z->c; /* ], line 59 */
223 |     { int ret = slice_del(z); /* delete, line 59 */
224 |         if (ret < 0) return ret;
225 |     }
226 |     return 1;
227 | }
228 | /* Delete the longest suffix from table a_2 found at the end of the word,
 * looking no further left than p1 (z->I[0]).  Returns 0 when none matches,
 * 1 after the deletion, or a negative result from slice_del().
 */
229 | static int r_other_suffix(struct SN_env * z) {
230 |     int among_var;
231 |     { int mlimit; /* setlimit, line 63 */
232 |         int m1 = z->l - z->c; (void)m1;
233 |         if (z->c < z->I[0]) return 0;
234 |         z->c = z->I[0]; /* tomark, line 63 */
235 |         mlimit = z->lb; z->lb = z->c; /* temporarily make p1 the backward limit */
236 |         z->c = z->l - m1;
237 |         z->ket = z->c; /* [, line 63 */
238 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: the mask admits only a final g, s or v */
239 |         among_var = find_among_b(z, a_2, 11); /* substring, line 63 */
240 |         if (!(among_var)) { z->lb = mlimit; return 0; }
241 |         z->bra = z->c; /* ], line 63 */
242 |         z->lb = mlimit;
243 |     }
244 |     switch(among_var) {
245 |         case 0: return 0;
246 |         case 1:
247 |             { int ret = slice_del(z); /* delete, line 67 */
248 |                 if (ret < 0) return ret;
249 |             }
250 |             break;
251 |     }
252 |     return 1;
253 | }
254 | /* Entry point: stem the word held in z in place.
 * Runs r_mark_regions forwards, then switches to backward mode (cursor at
 * the end, backward limit at the old cursor) and applies r_main_suffix,
 * r_consonant_pair and r_other_suffix in that order.  Each step is a
 * Snowball "do": a 0 result is ignored and the cursor is restored before
 * the next step.  Returns 1 on completion, or the negative value returned
 * by a step.
 */
255 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z) {
256 |     { int c1 = z->c; /* do, line 74 */
257 |         { int ret = r_mark_regions(z);
258 |             if (ret == 0) goto lab0; /* call mark_regions, line 74 */
259 |             if (ret < 0) return ret;
260 |         }
261 |     lab0:
262 |         z->c = c1;
263 |     }
264 |     z->lb = z->c; z->c = z->l; /* backwards, line 75 */
265 |
266 |     { int m2 = z->l - z->c; (void)m2; /* do, line 76 */
267 |         { int ret = r_main_suffix(z);
268 |             if (ret == 0) goto lab1; /* call main_suffix, line 76 */
269 |             if (ret < 0) return ret;
270 |         }
271 |     lab1:
272 |         z->c = z->l - m2; /* positions are kept relative to the end because the word may have shrunk */
273 |     }
274 |     { int m3 = z->l - z->c; (void)m3; /* do, line 77 */
275 |         { int ret = r_consonant_pair(z);
276 |             if (ret == 0) goto lab2; /* call consonant_pair, line 77 */
277 |             if (ret < 0) return ret;
278 |         }
279 |     lab2:
280 |         z->c = z->l - m3;
281 |     }
282 |     { int m4 = z->l - z->c; (void)m4; /* do, line 78 */
283 |         { int ret = r_other_suffix(z);
284 |             if (ret == 0) goto lab3; /* call other_suffix, line 78 */
285 |             if (ret < 0) return ret;
286 |         }
287 |     lab3:
288 |         z->c = z->l - m4;
289 |     }
290 |     z->c = z->lb; /* leave backward mode with the cursor back at the start */
291 |     return 1;
292 | }
293 | /* Allocate a stemmer environment.  NOTE(review): the middle argument 2 presumably sizes z->I (I[0] and I[1] are the only slots used above) - confirm against SN_create_env. */
294 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void) { return SN_create_env(0, 2, 0); }
295 | /* Release an environment obtained from norwegian_ISO_8859_1_create_env(). */
296 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 0); }
297 |
298 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_norwegian.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int norwegian_UTF_8_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_other_suffix(struct SN_env * z); 14 | static int r_consonant_pair(struct SN_env * z); 15 | static int r_main_suffix(struct SN_env * z); 16 | static int r_mark_regions(struct SN_env * z); 17 | #ifdef
__cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | 22 | extern struct SN_env * norwegian_UTF_8_create_env(void); 23 | extern void norwegian_UTF_8_close_env(struct SN_env * z); 24 | 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | static const symbol s_0_0[1] = { 'a' }; 30 | static const symbol s_0_1[1] = { 'e' }; 31 | static const symbol s_0_2[3] = { 'e', 'd', 'e' }; 32 | static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' }; 33 | static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' }; 34 | static const symbol s_0_5[3] = { 'a', 'n', 'e' }; 35 | static const symbol s_0_6[3] = { 'e', 'n', 'e' }; 36 | static const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' }; 37 | static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' }; 38 | static const symbol s_0_9[2] = { 'e', 'n' }; 39 | static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' }; 40 | static const symbol s_0_11[2] = { 'a', 'r' }; 41 | static const symbol s_0_12[2] = { 'e', 'r' }; 42 | static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' }; 43 | static const symbol s_0_14[1] = { 's' }; 44 | static const symbol s_0_15[2] = { 'a', 's' }; 45 | static const symbol s_0_16[2] = { 'e', 's' }; 46 | static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' }; 47 | static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' }; 48 | static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' }; 49 | static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' }; 50 | static const symbol s_0_21[3] = { 'e', 'n', 's' }; 51 | static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 52 | static const symbol s_0_23[3] = { 'e', 'r', 's' }; 53 | static const symbol s_0_24[3] = { 'e', 't', 's' }; 54 | static const symbol s_0_25[2] = { 'e', 't' }; 55 | static const symbol s_0_26[3] = { 'h', 'e', 't' }; 56 | static const symbol s_0_27[3] = { 'e', 'r', 't' }; 57 | static const symbol s_0_28[3] = { 'a', 's', 't' }; 58 | 59 | static const struct among a_0[29] = 60 | { 61 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 62 | /* 1 */ { 1, 
s_0_1, -1, 1, 0}, 63 | /* 2 */ { 3, s_0_2, 1, 1, 0}, 64 | /* 3 */ { 4, s_0_3, 1, 1, 0}, 65 | /* 4 */ { 4, s_0_4, 1, 1, 0}, 66 | /* 5 */ { 3, s_0_5, 1, 1, 0}, 67 | /* 6 */ { 3, s_0_6, 1, 1, 0}, 68 | /* 7 */ { 6, s_0_7, 6, 1, 0}, 69 | /* 8 */ { 4, s_0_8, 1, 3, 0}, 70 | /* 9 */ { 2, s_0_9, -1, 1, 0}, 71 | /* 10 */ { 5, s_0_10, 9, 1, 0}, 72 | /* 11 */ { 2, s_0_11, -1, 1, 0}, 73 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 74 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 75 | /* 14 */ { 1, s_0_14, -1, 2, 0}, 76 | /* 15 */ { 2, s_0_15, 14, 1, 0}, 77 | /* 16 */ { 2, s_0_16, 14, 1, 0}, 78 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 79 | /* 18 */ { 5, s_0_18, 16, 1, 0}, 80 | /* 19 */ { 4, s_0_19, 16, 1, 0}, 81 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 82 | /* 21 */ { 3, s_0_21, 14, 1, 0}, 83 | /* 22 */ { 6, s_0_22, 21, 1, 0}, 84 | /* 23 */ { 3, s_0_23, 14, 1, 0}, 85 | /* 24 */ { 3, s_0_24, 14, 1, 0}, 86 | /* 25 */ { 2, s_0_25, -1, 1, 0}, 87 | /* 26 */ { 3, s_0_26, 25, 1, 0}, 88 | /* 27 */ { 3, s_0_27, -1, 3, 0}, 89 | /* 28 */ { 3, s_0_28, -1, 1, 0} 90 | }; 91 | 92 | static const symbol s_1_0[2] = { 'd', 't' }; 93 | static const symbol s_1_1[2] = { 'v', 't' }; 94 | 95 | static const struct among a_1[2] = 96 | { 97 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 98 | /* 1 */ { 2, s_1_1, -1, -1, 0} 99 | }; 100 | 101 | static const symbol s_2_0[3] = { 'l', 'e', 'g' }; 102 | static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' }; 103 | static const symbol s_2_2[2] = { 'i', 'g' }; 104 | static const symbol s_2_3[3] = { 'e', 'i', 'g' }; 105 | static const symbol s_2_4[3] = { 'l', 'i', 'g' }; 106 | static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' }; 107 | static const symbol s_2_6[3] = { 'e', 'l', 's' }; 108 | static const symbol s_2_7[3] = { 'l', 'o', 'v' }; 109 | static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' }; 110 | static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' }; 111 | static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' }; 112 | 113 | static const struct among a_2[11] = 114 | { 115 | /* 0 */ { 3, 
s_2_0, -1, 1, 0}, 116 | /* 1 */ { 4, s_2_1, 0, 1, 0}, 117 | /* 2 */ { 2, s_2_2, -1, 1, 0}, 118 | /* 3 */ { 3, s_2_3, 2, 1, 0}, 119 | /* 4 */ { 3, s_2_4, 2, 1, 0}, 120 | /* 5 */ { 4, s_2_5, 4, 1, 0}, 121 | /* 6 */ { 3, s_2_6, -1, 1, 0}, 122 | /* 7 */ { 3, s_2_7, -1, 1, 0}, 123 | /* 8 */ { 4, s_2_8, 7, 1, 0}, 124 | /* 9 */ { 4, s_2_9, 7, 1, 0}, 125 | /* 10 */ { 7, s_2_10, 9, 1, 0} 126 | }; 127 | 128 | /* Bitmap handed to the *_grouping*_U helpers together with the code range 97..248; the generator comments below call this grouping "v". */ static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 129 | 130 | /* Bitmap used by r_main_suffix together with the code range 98..122. */ static const unsigned char g_s_ending[] = { 119, 125, 149, 1 }; 131 | 132 | /* Single symbol 'k', compared via eq_s_b in r_main_suffix. */ static const symbol s_0[] = { 'k' }; 133 | /* Two symbols "er", written via slice_from_s in r_main_suffix. */ static const symbol s_1[] = { 'e', 'r' }; 134 | 135 | /* Computes the two marks read by the suffix routines. z->I[1] ("x" in the generator comments) becomes the cursor reached by the hop of 3 from the entry position; z->I[0] ("p1") becomes the cursor after the "goto v" and "gopast non v" scans over g_v, and is then raised to z->I[1] if it is smaller. z->I[0] is preset to z->l, so it keeps that value on the early 0 returns. Returns 1 on success, 0 when the hop or either scan reports failure. The cursor z->c is left where the scans stopped. */ static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 137 | { int c_test = z->c; /* test, line 30 */ 138 | /* skip_utf8 with +3 and bounds 0..z->l: presumably advances three UTF-8 characters; a negative result means the hop is impossible. */ { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3); 139 | if (ret < 0) return 0; 140 | z->c = ret; /* hop, line 30 */ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 30 */ 143 | /* The hop was only a test: restore the entry cursor. */ z->c = c_test; 144 | } 145 | if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */ 146 | { /* gopast */ /* non v, line 31 */ 147 | int ret = in_grouping_U(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 31 */ 152 | /* try, line 32 */ 153 | /* Keep z->I[0] no smaller than z->I[1]. */ if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | /* Removes or rewrites one ending listed in table a_0. The search runs backwards from the cursor with the backward limit z->lb temporarily moved to z->I[0], so only text after that mark can match; the saved limit mlimit is put back on each failure path after the move. Returns 0 when the cursor is before the mark or nothing matches, 1 after an edit, and a negative slice_* result unchanged. */ static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 38 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 38 */ 165 | mlimit = z->lb; z->lb = z->c; 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 38 */ 168 | /* Cheap pre-filter before the table search: the region must be non-empty and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1851426 (bits 1, 5, 14, 18, 19, 20, i.e. a, e, n, r, s, t - the final letters of the a_0 entries). */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 29); /* substring, line 38 */ 170 | if (!(among_var)) { 
z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 38 */ 172 | z->lb = mlimit; 173 | } 174 | /* among_var is presumably the fourth field of the matched a_0 entry (1, 2 or 3 in that table) - confirm against find_among_b. */ switch(among_var) { 175 | case 0: return 0; 176 | case 1: 177 | /* Code 1: remove the marked slice unconditionally. */ { int ret = slice_del(z); /* delete, line 44 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | /* Code 2 (only the bare "s" entry carries it): the slice is deleted when in_grouping_b_U on g_s_ending returns 0, or otherwise when eq_s_b finds s_0 ('k') before the cursor and out_grouping_b_U on g_v then returns 0; any other outcome returns 0 without editing. NOTE(review): this reads a 0 result from the grouping helpers as "test passed" - confirm in the runtime. */ { int m2 = z->l - z->c; (void)m2; /* or, line 46 */ 183 | if (in_grouping_b_U(z, g_s_ending, 98, 122, 0)) goto lab1; 184 | goto lab0; 185 | lab1: 186 | /* Second alternative starts from the same cursor as the first. */ z->c = z->l - m2; 187 | if (!(eq_s_b(z, 1, s_0))) return 0; 188 | if (out_grouping_b_U(z, g_v, 97, 248, 0)) return 0; 189 | } 190 | lab0: 191 | { int ret = slice_del(z); /* delete, line 46 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 3: 196 | /* Code 3: replace the marked slice with the 2 symbols of s_1 ("er"). */ { int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */ 197 | if (ret < 0) return ret; 198 | } 199 | break; 200 | } 201 | return 1; 202 | } 203 | 204 | /* If the text before the cursor, restricted to the region after z->I[0], ends in one of the a_1 pairs ("dt" or "vt"), removes the last character of the pair. The match runs as a test: the cursor is put back to its entry position, moved back one position with skip_utf8, and the slice between that point and the ket set at the entry position is deleted. Returns 0 when the cursor is before the mark, no pair matches or the step back fails, 1 after the deletion, or a negative slice_del result. */ static int r_consonant_pair(struct SN_env * z) { 205 | { int m_test = z->l - z->c; /* test, line 53 */ 206 | { int mlimit; /* setlimit, line 54 */ 207 | int m1 = z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 54 */ 210 | mlimit = z->lb; z->lb = z->c; 211 | z->c = z->l - m1; 212 | z->ket = z->c; /* [, line 54 */ 213 | /* Pre-filter: at least two symbols must lie after the limit and the last one must be 116 ('t'), the final letter of both a_1 entries. */ if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */ 215 | z->bra = z->c; /* ], line 54 */ 216 | z->lb = mlimit; 217 | } 218 | /* End of the test: return to the entry cursor. */ z->c = z->l - m_test; 219 | } 220 | /* Step back one position (skip_utf8 with -1, bounded below by z->lb). */ { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1); 221 | if (ret < 0) return 0; 222 | z->c = ret; /* next, line 59 */ 223 | } 224 | z->bra = z->c; /* ], line 59 */ 225 | { int ret = slice_del(z); /* delete, line 59 */ 226 | if (ret < 0) return ret; 227 | } 228 | return 1; 229 | } 230 | 231 | /* Deletes one ending listed in table a_2 when it occurs before the cursor inside the region after z->I[0]; the backward limit is moved and restored in the same way as in r_main_suffix. Returns 0 when the cursor is before the mark or nothing matches, 1 after the deletion, or a negative slice_del result. */ static int r_other_suffix(struct SN_env * z) { 232 | int among_var; 233 | { int mlimit; /* setlimit, line 63 */ 234 | int m1 = z->l - z->c; (void)m1; 235 | if (z->c < z->I[0]) return 0; 236 | z->c = 
z->I[0]; /* tomark, line 63 */ 237 | mlimit = z->lb; z->lb = z->c; 238 | z->c = z->l - m1; 239 | z->ket = z->c; /* [, line 63 */ 240 | /* Pre-filter: at least two symbols must lie after the limit, and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 4718720 (bits 7, 19, 22, i.e. g, s, v - the final letters of the a_2 entries). */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 241 | among_var = find_among_b(z, a_2, 11); /* substring, line 63 */ 242 | if (!(among_var)) { z->lb = mlimit; return 0; } 243 | z->bra = z->c; /* ], line 63 */ 244 | z->lb = mlimit; 245 | } 246 | /* Every a_2 entry carries 1 in its fourth field, so a match presumably always reaches case 1. */ switch(among_var) { 247 | case 0: return 0; 248 | case 1: 249 | { int ret = slice_del(z); /* delete, line 67 */ 250 | if (ret < 0) return ret; 251 | } 252 | break; 253 | } 254 | return 1; 255 | } 256 | 257 | /* Entry point of the Norwegian UTF-8 stemmer. Runs r_mark_regions from the current cursor, then switches to backward processing (z->lb takes the cursor, the cursor moves to z->l) and runs r_main_suffix, r_consonant_pair and r_other_suffix in that order. Each step is optional: a 0 result is ignored and the cursor is restored after it, while a negative result is returned to the caller at once. On completion the cursor is set back to z->lb and 1 is returned. */ extern int norwegian_UTF_8_stem(struct SN_env * z) { 258 | { int c1 = z->c; /* do, line 74 */ 259 | { int ret = r_mark_regions(z); 260 | if (ret == 0) goto lab0; /* call mark_regions, line 74 */ 261 | if (ret < 0) return ret; 262 | } 263 | lab0: 264 | /* r_mark_regions moves the cursor while scanning; undo that. */ z->c = c1; 265 | } 266 | z->lb = z->c; z->c = z->l; /* backwards, line 75 */ 267 | 268 | /* m2, m3 and m4 store the cursor as a distance from z->l, so it can be restored even if a step changed z->l. */ { int m2 = z->l - z->c; (void)m2; /* do, line 76 */ 269 | { int ret = r_main_suffix(z); 270 | if (ret == 0) goto lab1; /* call main_suffix, line 76 */ 271 | if (ret < 0) return ret; 272 | } 273 | lab1: 274 | z->c = z->l - m2; 275 | } 276 | { int m3 = z->l - z->c; (void)m3; /* do, line 77 */ 277 | { int ret = r_consonant_pair(z); 278 | if (ret == 0) goto lab2; /* call consonant_pair, line 77 */ 279 | if (ret < 0) return ret; 280 | } 281 | lab2: 282 | z->c = z->l - m3; 283 | } 284 | { int m4 = z->l - z->c; (void)m4; /* do, line 78 */ 285 | { int ret = r_other_suffix(z); 286 | if (ret == 0) goto lab3; /* call other_suffix, line 78 */ 287 | if (ret < 0) return ret; 288 | } 289 | lab3: 290 | z->c = z->l - m4; 291 | } 292 | /* Put the cursor back at the backward limit recorded when the backward pass began. */ z->c = z->lb; 293 | return 1; 294 | } 295 | 296 | /* Creates the environment for this stemmer via SN_create_env(0, 2, 0). NOTE(review): the 2 presumably sizes the z->I array, of which I[0] and I[1] are used above - confirm against SN_create_env. */ extern struct SN_env * norwegian_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); } 297 | 298 | /* Releases an environment through SN_close_env; the 0 equals the first argument passed to SN_create_env above. */ extern void norwegian_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); } 299 | 300 | 
-------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_swedish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int swedish_ISO_8859_1_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_other_suffix(struct SN_env * z); 14 | static int r_consonant_pair(struct SN_env * z); 15 | static int r_main_suffix(struct SN_env * z); 16 | static int r_mark_regions(struct SN_env * z); 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | 22 | extern struct SN_env * swedish_ISO_8859_1_create_env(void); 23 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z); 24 | 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | static const symbol s_0_0[1] = { 'a' }; 30 | static const symbol s_0_1[4] = { 'a', 'r', 'n', 'a' }; 31 | static const symbol s_0_2[4] = { 'e', 'r', 'n', 'a' }; 32 | static const symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' }; 33 | static const symbol s_0_4[4] = { 'o', 'r', 'n', 'a' }; 34 | static const symbol s_0_5[2] = { 'a', 'd' }; 35 | static const symbol s_0_6[1] = { 'e' }; 36 | static const symbol s_0_7[3] = { 'a', 'd', 'e' }; 37 | static const symbol s_0_8[4] = { 'a', 'n', 'd', 'e' }; 38 | static const symbol s_0_9[4] = { 'a', 'r', 'n', 'e' }; 39 | static const symbol s_0_10[3] = { 'a', 'r', 'e' }; 40 | static const symbol s_0_11[4] = { 'a', 's', 't', 'e' }; 41 | static const symbol s_0_12[2] = { 'e', 'n' }; 42 | static const symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' }; 43 | static const symbol s_0_14[4] = { 'a', 'r', 'e', 'n' }; 44 | static const symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' }; 45 | static const symbol s_0_16[3] = { 'e', 'r', 'n' }; 46 | static const symbol s_0_17[2] = { 'a', 'r' }; 47 | static 
const symbol s_0_18[2] = { 'e', 'r' }; 48 | static const symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' }; 49 | static const symbol s_0_20[2] = { 'o', 'r' }; 50 | static const symbol s_0_21[1] = { 's' }; 51 | static const symbol s_0_22[2] = { 'a', 's' }; 52 | static const symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' }; 53 | static const symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' }; 54 | static const symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' }; 55 | static const symbol s_0_26[2] = { 'e', 's' }; 56 | static const symbol s_0_27[4] = { 'a', 'd', 'e', 's' }; 57 | static const symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' }; 58 | static const symbol s_0_29[3] = { 'e', 'n', 's' }; 59 | static const symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' }; 60 | static const symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 61 | static const symbol s_0_32[4] = { 'e', 'r', 'n', 's' }; 62 | static const symbol s_0_33[2] = { 'a', 't' }; 63 | static const symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' }; 64 | static const symbol s_0_35[3] = { 'h', 'e', 't' }; 65 | static const symbol s_0_36[3] = { 'a', 's', 't' }; 66 | 67 | static const struct among a_0[37] = 68 | { 69 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 70 | /* 1 */ { 4, s_0_1, 0, 1, 0}, 71 | /* 2 */ { 4, s_0_2, 0, 1, 0}, 72 | /* 3 */ { 7, s_0_3, 2, 1, 0}, 73 | /* 4 */ { 4, s_0_4, 0, 1, 0}, 74 | /* 5 */ { 2, s_0_5, -1, 1, 0}, 75 | /* 6 */ { 1, s_0_6, -1, 1, 0}, 76 | /* 7 */ { 3, s_0_7, 6, 1, 0}, 77 | /* 8 */ { 4, s_0_8, 6, 1, 0}, 78 | /* 9 */ { 4, s_0_9, 6, 1, 0}, 79 | /* 10 */ { 3, s_0_10, 6, 1, 0}, 80 | /* 11 */ { 4, s_0_11, 6, 1, 0}, 81 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 82 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 83 | /* 14 */ { 4, s_0_14, 12, 1, 0}, 84 | /* 15 */ { 5, s_0_15, 12, 1, 0}, 85 | /* 16 */ { 3, s_0_16, -1, 1, 0}, 86 | /* 17 */ { 2, s_0_17, -1, 1, 0}, 87 | /* 18 */ { 2, s_0_18, -1, 1, 0}, 88 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 89 | /* 20 */ { 2, s_0_20, -1, 1, 0}, 90 | /* 21 */ { 1, s_0_21, -1, 2, 0}, 91 | /* 22 */ { 2, s_0_22, 
21, 1, 0}, 92 | /* 23 */ { 5, s_0_23, 22, 1, 0}, 93 | /* 24 */ { 5, s_0_24, 22, 1, 0}, 94 | /* 25 */ { 5, s_0_25, 22, 1, 0}, 95 | /* 26 */ { 2, s_0_26, 21, 1, 0}, 96 | /* 27 */ { 4, s_0_27, 26, 1, 0}, 97 | /* 28 */ { 5, s_0_28, 26, 1, 0}, 98 | /* 29 */ { 3, s_0_29, 21, 1, 0}, 99 | /* 30 */ { 5, s_0_30, 29, 1, 0}, 100 | /* 31 */ { 6, s_0_31, 29, 1, 0}, 101 | /* 32 */ { 4, s_0_32, 21, 1, 0}, 102 | /* 33 */ { 2, s_0_33, -1, 1, 0}, 103 | /* 34 */ { 5, s_0_34, -1, 1, 0}, 104 | /* 35 */ { 3, s_0_35, -1, 1, 0}, 105 | /* 36 */ { 3, s_0_36, -1, 1, 0} 106 | }; 107 | 108 | static const symbol s_1_0[2] = { 'd', 'd' }; 109 | static const symbol s_1_1[2] = { 'g', 'd' }; 110 | static const symbol s_1_2[2] = { 'n', 'n' }; 111 | static const symbol s_1_3[2] = { 'd', 't' }; 112 | static const symbol s_1_4[2] = { 'g', 't' }; 113 | static const symbol s_1_5[2] = { 'k', 't' }; 114 | static const symbol s_1_6[2] = { 't', 't' }; 115 | 116 | static const struct among a_1[7] = 117 | { 118 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 119 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 120 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 121 | /* 3 */ { 2, s_1_3, -1, -1, 0}, 122 | /* 4 */ { 2, s_1_4, -1, -1, 0}, 123 | /* 5 */ { 2, s_1_5, -1, -1, 0}, 124 | /* 6 */ { 2, s_1_6, -1, -1, 0} 125 | }; 126 | 127 | static const symbol s_2_0[2] = { 'i', 'g' }; 128 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 129 | static const symbol s_2_2[3] = { 'e', 'l', 's' }; 130 | static const symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' }; 131 | static const symbol s_2_4[4] = { 'l', 0xF6, 's', 't' }; 132 | 133 | static const struct among a_2[5] = 134 | { 135 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 136 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 137 | /* 2 */ { 3, s_2_2, -1, 1, 0}, 138 | /* 3 */ { 5, s_2_3, -1, 3, 0}, 139 | /* 4 */ { 4, s_2_4, -1, 2, 0} 140 | }; 141 | 142 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 }; 143 | 144 | static const unsigned char g_s_ending[] = { 119, 127, 149 }; 145 | 146 | static 
const symbol s_0[] = { 'l', 0xF6, 's' }; /* three symbols 'l', 0xF6, 's' (0xF6 is o with diaeresis in ISO-8859-1); written via slice_from_s in r_other_suffix */ 147 | /* Four symbols "full", written via slice_from_s in r_other_suffix. */ static const symbol s_1[] = { 'f', 'u', 'l', 'l' }; 148 | 149 | /* Computes the two marks read by the suffix routines. z->I[1] ("x" in the generator comments) becomes the entry cursor plus 3; z->I[0] ("p1") becomes the cursor after the "goto v" and "gopast non v" scans over g_v, and is then raised to z->I[1] if it is smaller. z->I[0] is preset to z->l, so it keeps that value on the early 0 returns. Returns 1 on success, 0 when the hop leaves 0..z->l or either scan reports failure. The cursor z->c is left where the scans stopped. */ static int r_mark_regions(struct SN_env * z) { 150 | z->I[0] = z->l; 151 | { int c_test = z->c; /* test, line 29 */ 152 | /* Hop of three positions, done with plain index arithmetic and a bounds check. */ { int ret = z->c + 3; 153 | if (0 > ret || ret > z->l) return 0; 154 | z->c = ret; /* hop, line 29 */ 155 | } 156 | z->I[1] = z->c; /* setmark x, line 29 */ 157 | /* The hop was only a test: restore the entry cursor. */ z->c = c_test; 158 | } 159 | if (out_grouping(z, g_v, 97, 246, 1) < 0) return 0; /* goto */ /* grouping v, line 30 */ 160 | { /* gopast */ /* non v, line 30 */ 161 | int ret = in_grouping(z, g_v, 97, 246, 1); 162 | if (ret < 0) return 0; 163 | z->c += ret; 164 | } 165 | z->I[0] = z->c; /* setmark p1, line 30 */ 166 | /* try, line 31 */ 167 | /* Keep z->I[0] no smaller than z->I[1]. */ if (!(z->I[0] < z->I[1])) goto lab0; 168 | z->I[0] = z->I[1]; 169 | lab0: 170 | return 1; 171 | } 172 | 173 | /* Removes one ending listed in table a_0. The search runs backwards from the cursor with the backward limit z->lb temporarily moved to z->I[0], so only text after that mark can match; the saved limit mlimit is put back before the switch and on each failure path after the move. Returns 0 when the cursor is before the mark, nothing matches, or the code-2 condition rejects the match; 1 after a deletion; a negative slice_del result unchanged. */ static int r_main_suffix(struct SN_env * z) { 174 | int among_var; 175 | { int mlimit; /* setlimit, line 37 */ 176 | int m1 = z->l - z->c; (void)m1; 177 | if (z->c < z->I[0]) return 0; 178 | z->c = z->I[0]; /* tomark, line 37 */ 179 | mlimit = z->lb; z->lb = z->c; 180 | z->c = z->l - m1; 181 | z->ket = z->c; /* [, line 37 */ 182 | /* Cheap pre-filter before the table search: the region must be non-empty and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1851442 (bits 1, 4, 5, 14, 18, 19, 20, i.e. a, d, e, n, r, s, t - the final letters of the a_0 entries). */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851442 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 183 | among_var = find_among_b(z, a_0, 37); /* substring, line 37 */ 184 | if (!(among_var)) { z->lb = mlimit; return 0; } 185 | z->bra = z->c; /* ], line 37 */ 186 | z->lb = mlimit; 187 | } 188 | /* among_var is presumably the fourth field of the matched a_0 entry (1 or 2 in that table) - confirm against find_among_b. */ switch(among_var) { 189 | case 0: return 0; 190 | case 1: 191 | /* Code 1: remove the marked slice unconditionally. */ { int ret = slice_del(z); /* delete, line 44 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 2: 196 | /* Code 2 (only the bare "s" entry carries it): delete only when in_grouping_b on g_s_ending (range 98..121) returns 0; a non-zero result abandons the edit. */ if (in_grouping_b(z, g_s_ending, 98, 121, 0)) return 0; 197 | { int ret = slice_del(z); /* delete, line 46 */ 198 | if (ret < 0) return ret; 199 | } 200 | break; 201 | } 202 | return 1; 203 | } 204 | 205 | /* If the text before the cursor, restricted to the region after z->I[0], ends in one of the seven a_1 pairs, removes the last byte of the pair. Returns 0 when the cursor is before the mark or no pair matches, 1 after the deletion, or a negative slice_del result. */ static int r_consonant_pair(struct SN_env * z) { 206 | { int mlimit; /* setlimit, line 50 */ 207 | int m1
= z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 50 */ 210 | mlimit = z->lb; z->lb = z->c; 211 | z->c = z->l - m1; 212 | { int m2 = z->l - z->c; (void)m2; /* and, line 52 */ 213 | /* Pre-filter: at least two symbols must lie after the limit, and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1064976 (bits 4, 14, 20, i.e. d, n, t - the final letters of the a_1 entries). */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1064976 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 7))) { z->lb = mlimit; return 0; } /* among, line 51 */ 215 | /* The pair matched; go back to the position before the search and cut off exactly one byte there. */ z->c = z->l - m2; 216 | z->ket = z->c; /* [, line 52 */ 217 | if (z->c <= z->lb) { z->lb = mlimit; return 0; } 218 | z->c--; /* next, line 52 */ 219 | z->bra = z->c; /* ], line 52 */ 220 | /* NOTE: a negative slice_del result returns immediately, without putting mlimit back into z->lb. */ { int ret = slice_del(z); /* delete, line 52 */ 221 | if (ret < 0) return ret; 222 | } 223 | } 224 | z->lb = mlimit; 225 | } 226 | return 1; 227 | } 228 | 229 | /* Handles one ending listed in table a_2 when it occurs before the cursor inside the region after z->I[0]: entries with code 1 are deleted, code 2 is replaced by the 3 symbols of s_0 and code 3 by the 4 symbols of s_1. The backward limit z->lb is moved to z->I[0] for the whole routine and restored on the 0 and 1 returns; the negative-result returns leave it moved. Returns 0 when the cursor is before the mark or nothing matches, 1 after an edit, or a negative slice_* result. */ static int r_other_suffix(struct SN_env * z) { 230 | int among_var; 231 | { int mlimit; /* setlimit, line 55 */ 232 | int m1 = z->l - z->c; (void)m1; 233 | if (z->c < z->I[0]) return 0; 234 | z->c = z->I[0]; /* tomark, line 55 */ 235 | mlimit = z->lb; z->lb = z->c; 236 | z->c = z->l - m1; 237 | z->ket = z->c; /* [, line 56 */ 238 | /* Pre-filter: at least two symbols must lie after the limit, and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1572992 (bits 7, 19, 20, i.e. g, s, t - the final letters of the a_2 entries). */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 239 | among_var = find_among_b(z, a_2, 5); /* substring, line 56 */ 240 | if (!(among_var)) { z->lb = mlimit; return 0; } 241 | z->bra = z->c; /* ], line 56 */ 242 | /* among_var is presumably the fourth field of the matched a_2 entry (1, 2 or 3) - confirm against find_among_b. */ switch(among_var) { 243 | case 0: { z->lb = mlimit; return 0; } 244 | case 1: 245 | { int ret = slice_del(z); /* delete, line 57 */ 246 | if (ret < 0) return ret; 247 | } 248 | break; 249 | case 2: 250 | /* The only code-2 entry is 'l', 0xF6, 's', 't'; it is rewritten to s_0, i.e. the same text without the final 't'. */ { int ret = slice_from_s(z, 3, s_0); /* <-, line 58 */ 251 | if (ret < 0) return ret; 252 | } 253 | break; 254 | case 3: 255 | /* The only code-3 entry is "fullt"; it is rewritten to s_1 ("full"). */ { int ret = slice_from_s(z, 4, s_1); /* <-, line 59 */ 256 | if (ret < 0) return ret; 257 | } 258 | break; 259 | } 260 | z->lb = mlimit; 261 | } 262 | return 1; 263 | } 264 | 265 | /* Entry point of the Swedish ISO-8859-1 stemmer: runs r_mark_regions forwards, then r_main_suffix, r_consonant_pair and r_other_suffix backwards. A 0 result from a step is ignored, a negative result is returned at once, and 1 is returned on completion. */ extern int swedish_ISO_8859_1_stem(struct SN_env * z) {
266 | /* Step 1 (forward): compute the marks; the cursor is restored whether or not the step succeeded. */ { int c1 = z->c; /* do, line 66 */ 267 | { int ret = r_mark_regions(z); 268 | if (ret == 0) goto lab0; /* call mark_regions, line 66 */ 269 | if (ret < 0) return ret; 270 | } 271 | lab0: 272 | z->c = c1; 273 | } 274 | /* Switch to backward processing: the old cursor becomes the backward limit and the cursor moves to the end z->l. */ z->lb = z->c; z->c = z->l; /* backwards, line 67 */ 275 | 276 | /* m2, m3 and m4 store the cursor as a distance from z->l, so it can be restored even if a step changed z->l. */ { int m2 = z->l - z->c; (void)m2; /* do, line 68 */ 277 | { int ret = r_main_suffix(z); 278 | if (ret == 0) goto lab1; /* call main_suffix, line 68 */ 279 | if (ret < 0) return ret; 280 | } 281 | lab1: 282 | z->c = z->l - m2; 283 | } 284 | { int m3 = z->l - z->c; (void)m3; /* do, line 69 */ 285 | { int ret = r_consonant_pair(z); 286 | if (ret == 0) goto lab2; /* call consonant_pair, line 69 */ 287 | if (ret < 0) return ret; 288 | } 289 | lab2: 290 | z->c = z->l - m3; 291 | } 292 | { int m4 = z->l - z->c; (void)m4; /* do, line 70 */ 293 | { int ret = r_other_suffix(z); 294 | if (ret == 0) goto lab3; /* call other_suffix, line 70 */ 295 | if (ret < 0) return ret; 296 | } 297 | lab3: 298 | z->c = z->l - m4; 299 | } 300 | /* Put the cursor back at the backward limit recorded when the backward pass began. */ z->c = z->lb; 301 | return 1; 302 | } 303 | 304 | /* Creates the environment for this stemmer via SN_create_env(0, 2, 0). NOTE(review): the 2 presumably sizes the z->I array, of which I[0] and I[1] are used by the routines of this file - confirm against SN_create_env. */ extern struct SN_env * swedish_ISO_8859_1_create_env(void) { return SN_create_env(0, 2, 0); } 305 | 306 | /* Releases an environment through SN_close_env; the 0 equals the first argument passed to SN_create_env above. */ extern void swedish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 0); } 307 | 308 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_swedish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int swedish_UTF_8_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_other_suffix(struct SN_env * z); 14 | static int r_consonant_pair(struct SN_env * z); 15 | static int r_main_suffix(struct SN_env * z); 16 | static int r_mark_regions(struct SN_env * z); 17 | #ifdef __cplusplus 18 | extern "C" { 19 
| #endif 20 | 21 | 22 | extern struct SN_env * swedish_UTF_8_create_env(void); 23 | extern void swedish_UTF_8_close_env(struct SN_env * z); 24 | 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | static const symbol s_0_0[1] = { 'a' }; 30 | static const symbol s_0_1[4] = { 'a', 'r', 'n', 'a' }; 31 | static const symbol s_0_2[4] = { 'e', 'r', 'n', 'a' }; 32 | static const symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' }; 33 | static const symbol s_0_4[4] = { 'o', 'r', 'n', 'a' }; 34 | static const symbol s_0_5[2] = { 'a', 'd' }; 35 | static const symbol s_0_6[1] = { 'e' }; 36 | static const symbol s_0_7[3] = { 'a', 'd', 'e' }; 37 | static const symbol s_0_8[4] = { 'a', 'n', 'd', 'e' }; 38 | static const symbol s_0_9[4] = { 'a', 'r', 'n', 'e' }; 39 | static const symbol s_0_10[3] = { 'a', 'r', 'e' }; 40 | static const symbol s_0_11[4] = { 'a', 's', 't', 'e' }; 41 | static const symbol s_0_12[2] = { 'e', 'n' }; 42 | static const symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' }; 43 | static const symbol s_0_14[4] = { 'a', 'r', 'e', 'n' }; 44 | static const symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' }; 45 | static const symbol s_0_16[3] = { 'e', 'r', 'n' }; 46 | static const symbol s_0_17[2] = { 'a', 'r' }; 47 | static const symbol s_0_18[2] = { 'e', 'r' }; 48 | static const symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' }; 49 | static const symbol s_0_20[2] = { 'o', 'r' }; 50 | static const symbol s_0_21[1] = { 's' }; 51 | static const symbol s_0_22[2] = { 'a', 's' }; 52 | static const symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' }; 53 | static const symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' }; 54 | static const symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' }; 55 | static const symbol s_0_26[2] = { 'e', 's' }; 56 | static const symbol s_0_27[4] = { 'a', 'd', 'e', 's' }; 57 | static const symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' }; 58 | static const symbol s_0_29[3] = { 'e', 'n', 's' }; 59 | static const symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' }; 60 | 
static const symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' }; 61 | static const symbol s_0_32[4] = { 'e', 'r', 'n', 's' }; 62 | static const symbol s_0_33[2] = { 'a', 't' }; 63 | static const symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' }; 64 | static const symbol s_0_35[3] = { 'h', 'e', 't' }; 65 | static const symbol s_0_36[3] = { 'a', 's', 't' }; 66 | 67 | static const struct among a_0[37] = 68 | { 69 | /* 0 */ { 1, s_0_0, -1, 1, 0}, 70 | /* 1 */ { 4, s_0_1, 0, 1, 0}, 71 | /* 2 */ { 4, s_0_2, 0, 1, 0}, 72 | /* 3 */ { 7, s_0_3, 2, 1, 0}, 73 | /* 4 */ { 4, s_0_4, 0, 1, 0}, 74 | /* 5 */ { 2, s_0_5, -1, 1, 0}, 75 | /* 6 */ { 1, s_0_6, -1, 1, 0}, 76 | /* 7 */ { 3, s_0_7, 6, 1, 0}, 77 | /* 8 */ { 4, s_0_8, 6, 1, 0}, 78 | /* 9 */ { 4, s_0_9, 6, 1, 0}, 79 | /* 10 */ { 3, s_0_10, 6, 1, 0}, 80 | /* 11 */ { 4, s_0_11, 6, 1, 0}, 81 | /* 12 */ { 2, s_0_12, -1, 1, 0}, 82 | /* 13 */ { 5, s_0_13, 12, 1, 0}, 83 | /* 14 */ { 4, s_0_14, 12, 1, 0}, 84 | /* 15 */ { 5, s_0_15, 12, 1, 0}, 85 | /* 16 */ { 3, s_0_16, -1, 1, 0}, 86 | /* 17 */ { 2, s_0_17, -1, 1, 0}, 87 | /* 18 */ { 2, s_0_18, -1, 1, 0}, 88 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 89 | /* 20 */ { 2, s_0_20, -1, 1, 0}, 90 | /* 21 */ { 1, s_0_21, -1, 2, 0}, 91 | /* 22 */ { 2, s_0_22, 21, 1, 0}, 92 | /* 23 */ { 5, s_0_23, 22, 1, 0}, 93 | /* 24 */ { 5, s_0_24, 22, 1, 0}, 94 | /* 25 */ { 5, s_0_25, 22, 1, 0}, 95 | /* 26 */ { 2, s_0_26, 21, 1, 0}, 96 | /* 27 */ { 4, s_0_27, 26, 1, 0}, 97 | /* 28 */ { 5, s_0_28, 26, 1, 0}, 98 | /* 29 */ { 3, s_0_29, 21, 1, 0}, 99 | /* 30 */ { 5, s_0_30, 29, 1, 0}, 100 | /* 31 */ { 6, s_0_31, 29, 1, 0}, 101 | /* 32 */ { 4, s_0_32, 21, 1, 0}, 102 | /* 33 */ { 2, s_0_33, -1, 1, 0}, 103 | /* 34 */ { 5, s_0_34, -1, 1, 0}, 104 | /* 35 */ { 3, s_0_35, -1, 1, 0}, 105 | /* 36 */ { 3, s_0_36, -1, 1, 0} 106 | }; 107 | 108 | static const symbol s_1_0[2] = { 'd', 'd' }; 109 | static const symbol s_1_1[2] = { 'g', 'd' }; 110 | static const symbol s_1_2[2] = { 'n', 'n' }; 111 | static const symbol s_1_3[2] = { 
'd', 't' }; 112 | static const symbol s_1_4[2] = { 'g', 't' }; 113 | static const symbol s_1_5[2] = { 'k', 't' }; 114 | static const symbol s_1_6[2] = { 't', 't' }; 115 | 116 | /* Two-symbol pairs searched by r_consonant_pair; the fourth field of every entry is -1. */ static const struct among a_1[7] = 117 | { 118 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 119 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 120 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 121 | /* 3 */ { 2, s_1_3, -1, -1, 0}, 122 | /* 4 */ { 2, s_1_4, -1, -1, 0}, 123 | /* 5 */ { 2, s_1_5, -1, -1, 0}, 124 | /* 6 */ { 2, s_1_6, -1, -1, 0} 125 | }; 126 | 127 | static const symbol s_2_0[2] = { 'i', 'g' }; 128 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 129 | static const symbol s_2_2[3] = { 'e', 'l', 's' }; 130 | static const symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' }; 131 | /* 0xC3 0xB6 is the UTF-8 encoding of U+00F6 (o with diaeresis), so this entry is five bytes long. */ static const symbol s_2_4[5] = { 'l', 0xC3, 0xB6, 's', 't' }; 132 | 133 | /* Endings searched by r_other_suffix; the fourth field (1, 2 or 3) matches the case labels of the switch in that routine. */ static const struct among a_2[5] = 134 | { 135 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 136 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 137 | /* 2 */ { 3, s_2_2, -1, 1, 0}, 138 | /* 3 */ { 5, s_2_3, -1, 3, 0}, 139 | /* 4 */ { 5, s_2_4, -1, 2, 0} 140 | }; 141 | 142 | /* Bitmap handed to the *_grouping*_U helpers together with the code range 97..246; the generator comments below call this grouping "v". */ static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 }; 143 | 144 | /* Bitmap used by r_main_suffix together with the code range 98..121. */ static const unsigned char g_s_ending[] = { 119, 127, 149 }; 145 | 146 | /* Four bytes 'l', 0xC3, 0xB6, 's', written via slice_from_s in r_other_suffix. */ static const symbol s_0[] = { 'l', 0xC3, 0xB6, 's' }; 147 | /* Four bytes "full", written via slice_from_s in r_other_suffix. */ static const symbol s_1[] = { 'f', 'u', 'l', 'l' }; 148 | 149 | /* Computes the two marks read by the suffix routines. z->I[1] ("x" in the generator comments) becomes the cursor reached by the hop of 3 from the entry position; z->I[0] ("p1") becomes the cursor after the "goto v" and "gopast non v" scans over g_v, and is then raised to z->I[1] if it is smaller. z->I[0] is preset to z->l, so it keeps that value on the early 0 returns. Returns 1 on success, 0 when the hop or either scan reports failure. The cursor z->c is left where the scans stopped. */ static int r_mark_regions(struct SN_env * z) { 150 | z->I[0] = z->l; 151 | { int c_test = z->c; /* test, line 29 */ 152 | /* skip_utf8 with +3 and bounds 0..z->l: presumably advances three UTF-8 characters; a negative result means the hop is impossible. */ { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3); 153 | if (ret < 0) return 0; 154 | z->c = ret; /* hop, line 29 */ 155 | } 156 | z->I[1] = z->c; /* setmark x, line 29 */ 157 | /* The hop was only a test: restore the entry cursor. */ z->c = c_test; 158 | } 159 | if (out_grouping_U(z, g_v, 97, 246, 1) < 0) return 0; /* goto */ /* grouping v, line 30 */ 160 | { /* gopast */ /* non v, line 30 */ 161 | int ret = in_grouping_U(z, g_v, 97, 246, 1); 162 | if (ret < 0) return 0; 163 | z->c += ret; 164 | } 165 | z->I[0] = z->c; /* setmark p1, line 30 */ 166 | /* try, line
31 */ 167 | /* Keep z->I[0] no smaller than z->I[1]. */ if (!(z->I[0] < z->I[1])) goto lab0; 168 | z->I[0] = z->I[1]; 169 | lab0: 170 | return 1; 171 | } 172 | 173 | /* Removes one ending listed in table a_0. The search runs backwards from the cursor with the backward limit z->lb temporarily moved to z->I[0], so only text after that mark can match; the saved limit mlimit is put back before the switch and on each failure path after the move. Returns 0 when the cursor is before the mark, nothing matches, or the code-2 condition rejects the match; 1 after a deletion; a negative slice_del result unchanged. */ static int r_main_suffix(struct SN_env * z) { 174 | int among_var; 175 | { int mlimit; /* setlimit, line 37 */ 176 | int m1 = z->l - z->c; (void)m1; 177 | if (z->c < z->I[0]) return 0; 178 | z->c = z->I[0]; /* tomark, line 37 */ 179 | mlimit = z->lb; z->lb = z->c; 180 | z->c = z->l - m1; 181 | z->ket = z->c; /* [, line 37 */ 182 | /* Cheap pre-filter before the table search: the region must be non-empty and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1851442 (bits 1, 4, 5, 14, 18, 19, 20, i.e. a, d, e, n, r, s, t - the final letters of the a_0 entries). */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851442 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 183 | among_var = find_among_b(z, a_0, 37); /* substring, line 37 */ 184 | if (!(among_var)) { z->lb = mlimit; return 0; } 185 | z->bra = z->c; /* ], line 37 */ 186 | z->lb = mlimit; 187 | } 188 | /* among_var is presumably the fourth field of the matched a_0 entry (1 or 2 in that table) - confirm against find_among_b. */ switch(among_var) { 189 | case 0: return 0; 190 | case 1: 191 | /* Code 1: remove the marked slice unconditionally. */ { int ret = slice_del(z); /* delete, line 44 */ 192 | if (ret < 0) return ret; 193 | } 194 | break; 195 | case 2: 196 | /* Code 2 (only the bare "s" entry carries it): delete only when in_grouping_b_U on g_s_ending (range 98..121) returns 0; a non-zero result abandons the edit. */ if (in_grouping_b_U(z, g_s_ending, 98, 121, 0)) return 0; 197 | { int ret = slice_del(z); /* delete, line 46 */ 198 | if (ret < 0) return ret; 199 | } 200 | break; 201 | } 202 | return 1; 203 | } 204 | 205 | /* If the text before the cursor, restricted to the region after z->I[0], ends in one of the seven a_1 pairs, removes the last character of the pair. The backward limit z->lb is moved to z->I[0] for the whole routine and restored on the 0 and 1 returns; a negative slice_del result is returned with the limit still moved. Returns 0 when the cursor is before the mark, no pair matches or the step back fails, 1 after the deletion. */ static int r_consonant_pair(struct SN_env * z) { 206 | { int mlimit; /* setlimit, line 50 */ 207 | int m1 = z->l - z->c; (void)m1; 208 | if (z->c < z->I[0]) return 0; 209 | z->c = z->I[0]; /* tomark, line 50 */ 210 | mlimit = z->lb; z->lb = z->c; 211 | z->c = z->l - m1; 212 | { int m2 = z->l - z->c; (void)m2; /* and, line 52 */ 213 | /* Pre-filter: at least two symbols must lie after the limit, and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1064976 (bits 4, 14, 20, i.e. d, n, t - the final letters of the a_1 entries). */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1064976 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 214 | if (!(find_among_b(z, a_1, 7))) { z->lb = mlimit; return 0; } /* among, line 51 */ 215 | /* The pair matched; go back to the position before the search and cut off one position there. */ z->c = z->l - m2; 216 | z->ket = z->c; /* [, line 52 */ 217 | /* Step back one position (skip_utf8 with -1, bounded below by z->lb). */ { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1); 218 | if (ret < 0) { z->lb = mlimit; return 0; } 219 | z->c = ret; /* next, line 52 */ 220 | } 221 | z->bra = z->c; /* ], line 52 */ 222 | { int ret = 
slice_del(z); /* delete, line 52 */ 223 | /* NOTE: this return leaves z->lb at the temporary limit; mlimit is only restored on the paths below. */ if (ret < 0) return ret; 224 | } 225 | } 226 | z->lb = mlimit; 227 | } 228 | return 1; 229 | } 230 | 231 | /* Handles one ending listed in table a_2 when it occurs before the cursor inside the region after z->I[0]: entries with code 1 are deleted, code 2 is replaced by the 4 bytes of s_0 and code 3 by the 4 bytes of s_1. The backward limit z->lb is moved to z->I[0] for the whole routine and restored on the 0 and 1 returns; the negative-result returns leave it moved. Returns 0 when the cursor is before the mark or nothing matches, 1 after an edit, or a negative slice_* result. */ static int r_other_suffix(struct SN_env * z) { 232 | int among_var; 233 | { int mlimit; /* setlimit, line 55 */ 234 | int m1 = z->l - z->c; (void)m1; 235 | if (z->c < z->I[0]) return 0; 236 | z->c = z->I[0]; /* tomark, line 55 */ 237 | mlimit = z->lb; z->lb = z->c; 238 | z->c = z->l - m1; 239 | z->ket = z->c; /* [, line 56 */ 240 | /* Pre-filter: at least two symbols must lie after the limit, and the byte before the cursor must lie in 96..127 with its low five bits selecting a set bit of 1572992 (bits 7, 19, 20, i.e. g, s, t - the final letters of the a_2 entries). */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 241 | among_var = find_among_b(z, a_2, 5); /* substring, line 56 */ 242 | if (!(among_var)) { z->lb = mlimit; return 0; } 243 | z->bra = z->c; /* ], line 56 */ 244 | /* among_var is presumably the fourth field of the matched a_2 entry (1, 2 or 3) - confirm against find_among_b. */ switch(among_var) { 245 | case 0: { z->lb = mlimit; return 0; } 246 | case 1: 247 | { int ret = slice_del(z); /* delete, line 57 */ 248 | if (ret < 0) return ret; 249 | } 250 | break; 251 | case 2: 252 | /* The only code-2 entry is 'l', 0xC3, 0xB6, 's', 't'; it is rewritten to s_0, i.e. the same bytes without the final 't'. */ { int ret = slice_from_s(z, 4, s_0); /* <-, line 58 */ 253 | if (ret < 0) return ret; 254 | } 255 | break; 256 | case 3: 257 | /* The only code-3 entry is "fullt"; it is rewritten to s_1 ("full"). */ { int ret = slice_from_s(z, 4, s_1); /* <-, line 59 */ 258 | if (ret < 0) return ret; 259 | } 260 | break; 261 | } 262 | z->lb = mlimit; 263 | } 264 | return 1; 265 | } 266 | 267 | /* Entry point of the Swedish UTF-8 stemmer. Runs r_mark_regions from the current cursor, then switches to backward processing (z->lb takes the cursor, the cursor moves to z->l) and runs r_main_suffix, r_consonant_pair and r_other_suffix in that order. Each step is optional: a 0 result is ignored and the cursor is restored after it, while a negative result is returned to the caller at once. On completion the cursor is set back to z->lb and 1 is returned. */ extern int swedish_UTF_8_stem(struct SN_env * z) { 268 | { int c1 = z->c; /* do, line 66 */ 269 | { int ret = r_mark_regions(z); 270 | if (ret == 0) goto lab0; /* call mark_regions, line 66 */ 271 | if (ret < 0) return ret; 272 | } 273 | lab0: 274 | /* r_mark_regions moves the cursor while scanning; undo that. */ z->c = c1; 275 | } 276 | z->lb = z->c; z->c = z->l; /* backwards, line 67 */ 277 | 278 | /* m2 and m3 store the cursor as a distance from z->l, so it can be restored even if a step changed z->l. */ { int m2 = z->l - z->c; (void)m2; /* do, line 68 */ 279 | { int ret = r_main_suffix(z); 280 | if (ret == 0) goto lab1; /* call main_suffix, line 68 */ 281 | if (ret < 0) return ret; 282 | } 283 | lab1: 284 | z->c = z->l - m2; 285 | } 286 | { int m3 = z->l - z->c; (void)m3; /* do, line 69 */ 287 | { int ret = r_consonant_pair(z); 288 | if (ret == 0) goto lab2; /* call 
consonant_pair, line 69 */ 289 | if (ret < 0) return ret; 290 | } 291 | lab2: 292 | z->c = z->l - m3; 293 | } 294 | /* Last step: r_other_suffix, with the cursor restored afterwards like the steps before it. */ { int m4 = z->l - z->c; (void)m4; /* do, line 70 */ 295 | { int ret = r_other_suffix(z); 296 | if (ret == 0) goto lab3; /* call other_suffix, line 70 */ 297 | if (ret < 0) return ret; 298 | } 299 | lab3: 300 | z->c = z->l - m4; 301 | } 302 | /* Put the cursor back at the backward limit z->lb before reporting success. */ z->c = z->lb; 303 | return 1; 304 | } 305 | 306 | /* Creates the environment for this stemmer via SN_create_env(0, 2, 0). NOTE(review): the 2 presumably sizes the z->I array, of which I[0] and I[1] are used by the routines of this file - confirm against SN_create_env. */ extern struct SN_env * swedish_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); } 307 | 308 | /* Releases an environment through SN_close_env; the 0 equals the first argument passed to SN_create_env above. */ extern void swedish_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); } 309 | 310 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_ISO_8859_1_danish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int danish_ISO_8859_1_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_undouble(struct SN_env * z); 14 | static int r_other_suffix(struct SN_env * z); 15 | static int r_consonant_pair(struct SN_env * z); 16 | static int r_main_suffix(struct SN_env * z); 17 | static int r_mark_regions(struct SN_env * z); 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | 23 | extern struct SN_env * danish_ISO_8859_1_create_env(void); 24 | extern void danish_ISO_8859_1_close_env(struct SN_env * z); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | static const symbol s_0_0[3] = { 'h', 'e', 'd' }; 31 | static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' }; 32 | static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' }; 33 | static const symbol s_0_3[1] = { 'e' }; 34 | static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' }; 35 | static const symbol s_0_5[4] = { 'e', 'n', 'd', 'e' }; 36 | static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 
'd', 'e' }; 37 | static const symbol s_0_7[3] = { 'e', 'n', 'e' }; 38 | static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' }; 39 | static const symbol s_0_9[3] = { 'e', 'r', 'e' }; 40 | static const symbol s_0_10[2] = { 'e', 'n' }; 41 | static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' }; 42 | static const symbol s_0_12[4] = { 'e', 'r', 'e', 'n' }; 43 | static const symbol s_0_13[2] = { 'e', 'r' }; 44 | static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' }; 45 | static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' }; 46 | static const symbol s_0_16[1] = { 's' }; 47 | static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' }; 48 | static const symbol s_0_18[2] = { 'e', 's' }; 49 | static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' }; 50 | static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' }; 51 | static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' }; 52 | static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' }; 53 | static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' }; 54 | static const symbol s_0_24[3] = { 'e', 'n', 's' }; 55 | static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' }; 56 | static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' }; 57 | static const symbol s_0_27[3] = { 'e', 'r', 's' }; 58 | static const symbol s_0_28[3] = { 'e', 't', 's' }; 59 | static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' }; 60 | static const symbol s_0_30[2] = { 'e', 't' }; 61 | static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' }; 62 | 63 | static const struct among a_0[32] = 64 | { 65 | /* 0 */ { 3, s_0_0, -1, 1, 0}, 66 | /* 1 */ { 5, s_0_1, 0, 1, 0}, 67 | /* 2 */ { 4, s_0_2, -1, 1, 0}, 68 | /* 3 */ { 1, s_0_3, -1, 1, 0}, 69 | /* 4 */ { 5, s_0_4, 3, 1, 0}, 70 | /* 5 */ { 4, s_0_5, 3, 1, 0}, 71 | /* 6 */ { 6, s_0_6, 5, 1, 0}, 72 | /* 7 */ { 3, s_0_7, 3, 1, 0}, 73 | /* 8 */ { 4, s_0_8, 3, 1, 0}, 74 | /* 9 */ { 3, s_0_9, 3, 1, 0}, 75 | /* 10 */ { 2, s_0_10, -1, 1, 0}, 76 | /* 11 */ { 5, s_0_11, 10, 1, 0}, 77 | /* 
12 */ { 4, s_0_12, 10, 1, 0}, 78 | /* 13 */ { 2, s_0_13, -1, 1, 0}, 79 | /* 14 */ { 5, s_0_14, 13, 1, 0}, 80 | /* 15 */ { 4, s_0_15, 13, 1, 0}, 81 | /* 16 */ { 1, s_0_16, -1, 2, 0}, 82 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 83 | /* 18 */ { 2, s_0_18, 16, 1, 0}, 84 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 85 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 86 | /* 21 */ { 4, s_0_21, 18, 1, 0}, 87 | /* 22 */ { 5, s_0_22, 18, 1, 0}, 88 | /* 23 */ { 4, s_0_23, 18, 1, 0}, 89 | /* 24 */ { 3, s_0_24, 16, 1, 0}, 90 | /* 25 */ { 6, s_0_25, 24, 1, 0}, 91 | /* 26 */ { 5, s_0_26, 24, 1, 0}, 92 | /* 27 */ { 3, s_0_27, 16, 1, 0}, 93 | /* 28 */ { 3, s_0_28, 16, 1, 0}, 94 | /* 29 */ { 5, s_0_29, 28, 1, 0}, 95 | /* 30 */ { 2, s_0_30, -1, 1, 0}, 96 | /* 31 */ { 4, s_0_31, 30, 1, 0} 97 | }; 98 | 99 | static const symbol s_1_0[2] = { 'g', 'd' }; 100 | static const symbol s_1_1[2] = { 'd', 't' }; 101 | static const symbol s_1_2[2] = { 'g', 't' }; 102 | static const symbol s_1_3[2] = { 'k', 't' }; 103 | 104 | static const struct among a_1[4] = 105 | { 106 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 107 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 108 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 109 | /* 3 */ { 2, s_1_3, -1, -1, 0} 110 | }; 111 | 112 | static const symbol s_2_0[2] = { 'i', 'g' }; 113 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 114 | static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' }; 115 | static const symbol s_2_3[3] = { 'e', 'l', 's' }; 116 | static const symbol s_2_4[4] = { 'l', 0xF8, 's', 't' }; 117 | 118 | static const struct among a_2[5] = 119 | { 120 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 121 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 122 | /* 2 */ { 4, s_2_2, 1, 1, 0}, 123 | /* 3 */ { 3, s_2_3, -1, 1, 0}, 124 | /* 4 */ { 4, s_2_4, -1, 2, 0} 125 | }; 126 | 127 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 128 | 129 | static const unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 }; 130 | 131 | static const symbol s_0[] = { 's', 't' 
}; 132 | static const symbol s_1[] = { 'i', 'g' }; 133 | static const symbol s_2[] = { 'l', 0xF8, 's' }; 134 | 135 | /* r_mark_regions: computes the two marks used by the suffix routines. z->I[1] (x in the generator comments) becomes the cursor plus 3; the routine returns 0 if that position would lie past z->l. z->I[0] (p1) starts at z->l and is then set to the cursor reached by the goto v and gopast non v scans over g_v; a negative scan result returns 0 and leaves z->I[0] at z->l. Finally z->I[0] is raised to z->I[1] when it is smaller. Returns 1 on success. */ static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 137 | { int c_test = z->c; /* test, line 33 */ 138 | { int ret = z->c + 3; 139 | if (0 > ret || ret > z->l) return 0; 140 | z->c = ret; /* hop, line 33 */ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 33 */ 143 | z->c = c_test; 144 | } 145 | if (out_grouping(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 34 */ 146 | { /* gopast */ /* non v, line 34 */ 147 | int ret = in_grouping(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 34 */ 152 | /* try, line 35 */ 153 | if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | /* r_main_suffix: with z->lb temporarily raised to z->I[0], searches table a_0 backwards from the cursor via find_among_b and deletes the slice bracketed by z->ket and z->bra. Result 1 deletes unconditionally. Result 2 (apparently carried only by the single-symbol s entry of a_0) deletes only when in_grouping_b over g_s_ending returns 0. Returns 0 when the cursor is before z->I[0] or nothing matches, a negative slice_del result unchanged, otherwise 1. z->lb is put back to its saved value on every path after it has been changed. */ static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 41 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 41 */ 165 | mlimit = z->lb; z->lb = z->c; 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 41 */ 168 | /* quick reject before the table search: the previous byte must be in 96..127 and its bit (byte & 0x1f) must be set in 1851440, which selects d e n r s t */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851440 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 32); /* substring, line 41 */ 170 | if (!(among_var)) { z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 41 */ 172 | z->lb = mlimit; 173 | } 174 | switch(among_var) { 175 | case 0: return 0; 176 | case 1: 177 | { int ret = slice_del(z); /* delete, line 48 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | if (in_grouping_b(z, g_s_ending, 97, 229, 0)) return 0; 183 | { int ret = slice_del(z); /* delete, line 50 */ 184 | if (ret < 0) return ret; 185 | } 186 | break; 187 | } 188 | return 1; 189 | } 190 | 191 | /* r_consonant_pair: as a test (cursor restored afterwards), requires a backward match of table a_1 (gd, dt, gt, kt) inside the region limited by z->I[0]. It then steps the cursor back by one, sets z->bra there and calls slice_del; z->ket still holds the position set during the test, so presumably exactly the last symbol is removed (slice_del is not shown here). Returns 0 if the cursor is before z->I[0], the test fails, or the cursor is already at z->lb; a negative slice_del result is returned unchanged; otherwise 1. */ static int r_consonant_pair(struct SN_env * z) { 192 | { int m_test = z->l - z->c; /* test, line 55 
*/ 193 | { int mlimit; /* setlimit, line 56 */ 194 | int m1 = z->l - z->c; (void)m1; 195 | if (z->c < z->I[0]) return 0; 196 | z->c = z->I[0]; /* tomark, line 56 */ 197 | mlimit = z->lb; z->lb = z->c; 198 | z->c = z->l - m1; 199 | z->ket = z->c; /* [, line 56 */ 200 | /* 100 is d and 116 is t, the only final symbols in a_1; at least two symbols must remain above z->lb */ if (z->c - 1 <= z->lb || (z->p[z->c - 1] != 100 && z->p[z->c - 1] != 116)) { z->lb = mlimit; return 0; } 201 | if (!(find_among_b(z, a_1, 4))) { z->lb = mlimit; return 0; } /* substring, line 56 */ 202 | z->bra = z->c; /* ], line 56 */ 203 | z->lb = mlimit; 204 | } 205 | z->c = z->l - m_test; 206 | } 207 | if (z->c <= z->lb) return 0; 208 | z->c--; /* next, line 62 */ 209 | z->bra = z->c; /* ], line 62 */ 210 | { int ret = slice_del(z); /* delete, line 62 */ 211 | if (ret < 0) return ret; 212 | } 213 | return 1; 214 | } 215 | 216 | /* r_other_suffix: first, as an optional step with the cursor restored afterwards, sets z->ket, matches s_0 (st) backwards, sets z->bra, and calls slice_del only if s_1 (ig) also matches at that point; this presumably strips st after ig, assuming eq_s_b moves the cursor back over its match (helper not shown here). Then, limited by z->I[0], searches table a_2: result 1 deletes the slice and then tries r_consonant_pair (a 0 result from it is ignored and the cursor is restored); result 2 replaces the slice with the 3 symbols of s_2 via slice_from_s. Returns 0 when the cursor is before z->I[0] or a_2 does not match, a negative helper result unchanged, otherwise 1. */ static int r_other_suffix(struct SN_env * z) { 217 | int among_var; 218 | { int m1 = z->l - z->c; (void)m1; /* do, line 66 */ 219 | z->ket = z->c; /* [, line 66 */ 220 | if (!(eq_s_b(z, 2, s_0))) goto lab0; 221 | z->bra = z->c; /* ], line 66 */ 222 | if (!(eq_s_b(z, 2, s_1))) goto lab0; 223 | { int ret = slice_del(z); /* delete, line 66 */ 224 | if (ret < 0) return ret; 225 | } 226 | lab0: 227 | z->c = z->l - m1; 228 | } 229 | { int mlimit; /* setlimit, line 67 */ 230 | int m2 = z->l - z->c; (void)m2; 231 | if (z->c < z->I[0]) return 0; 232 | z->c = z->I[0]; /* tomark, line 67 */ 233 | mlimit = z->lb; z->lb = z->c; 234 | z->c = z->l - m2; 235 | z->ket = z->c; /* [, line 67 */ 236 | /* quick reject before the table search: the previous byte must be in 96..127 and its bit (byte & 0x1f) must be set in 1572992, which selects g s t */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 237 | among_var = find_among_b(z, a_2, 5); /* substring, line 67 */ 238 | if (!(among_var)) { z->lb = mlimit; return 0; } 239 | z->bra = z->c; /* ], line 67 */ 240 | z->lb = mlimit; 241 | } 242 | switch(among_var) { 243 | case 0: return 0; 244 | case 1: 245 | { int ret = slice_del(z); /* delete, line 70 */ 246 | if (ret < 0) return ret; 247 | } 248 | { int m3 = z->l - 
z->c; (void)m3; /* do, line 70 */ 249 | { int ret = r_consonant_pair(z); 250 | if (ret == 0) goto lab1; /* call consonant_pair, line 70 */ 251 | if (ret < 0) return ret; 252 | } 253 | lab1: 254 | z->c = z->l - m3; 255 | } 256 | break; 257 | case 2: 258 | { int ret = slice_from_s(z, 3, s_2); /* <-, line 72 */ 259 | if (ret < 0) return ret; 260 | } 261 | break; 262 | } 263 | return 1; 264 | } 265 | 266 | /* r_undouble: limited by z->I[0], sets z->ket, requires out_grouping_b(z, g_v, 97, 248, 0) to return 0, sets z->bra, and copies the bracketed slice into z->S[0] with slice_to. After the limit is lifted it calls slice_del only if eq_v_b matches z->S[0] at the cursor. Returns 0 when the cursor is before z->I[0] or a check fails, a negative slice_del result unchanged, otherwise 1. Returns -1 when slice_to yields a null pointer. NOTE(review): on that -1 path z->lb is not reset to the saved mlimit. */ static int r_undouble(struct SN_env * z) { 267 | { int mlimit; /* setlimit, line 76 */ 268 | int m1 = z->l - z->c; (void)m1; 269 | if (z->c < z->I[0]) return 0; 270 | z->c = z->I[0]; /* tomark, line 76 */ 271 | mlimit = z->lb; z->lb = z->c; 272 | z->c = z->l - m1; 273 | z->ket = z->c; /* [, line 76 */ 274 | if (out_grouping_b(z, g_v, 97, 248, 0)) { z->lb = mlimit; return 0; } 275 | z->bra = z->c; /* ], line 76 */ 276 | z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */ 277 | if (z->S[0] == 0) return -1; /* -> ch, line 76 */ 278 | z->lb = mlimit; 279 | } 280 | if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */ 281 | { int ret = slice_del(z); /* delete, line 78 */ 282 | if (ret < 0) return ret; 283 | } 284 | return 1; 285 | } 286 | 287 | /* danish_ISO_8859_1_stem: entry point. Runs r_mark_regions with the cursor restored afterwards, then sets z->lb to the cursor and the cursor to z->l (backwards, per the generator comment) and calls r_main_suffix, r_consonant_pair, r_other_suffix and r_undouble in that order, restoring the cursor after each. A 0 result from any routine is ignored; a negative result is returned immediately. On completion the cursor is set to z->lb and 1 is returned. */ extern int danish_ISO_8859_1_stem(struct SN_env * z) { 288 | { int c1 = z->c; /* do, line 84 */ 289 | { int ret = r_mark_regions(z); 290 | if (ret == 0) goto lab0; /* call mark_regions, line 84 */ 291 | if (ret < 0) return ret; 292 | } 293 | lab0: 294 | z->c = c1; 295 | } 296 | z->lb = z->c; z->c = z->l; /* backwards, line 85 */ 297 | 298 | { int m2 = z->l - z->c; (void)m2; /* do, line 86 */ 299 | { int ret = r_main_suffix(z); 300 | if (ret == 0) goto lab1; /* call main_suffix, line 86 */ 301 | if (ret < 0) return ret; 302 | } 303 | lab1: 304 | z->c = z->l - m2; 305 | } 306 | { int m3 = z->l - z->c; (void)m3; /* do, line 87 */ 307 | { int ret = r_consonant_pair(z); 308 | if (ret == 0) goto lab2; /* call consonant_pair, line 87 */ 309 | if (ret < 0) return ret; 310 | } 311 | lab2: 312 | z->c = z->l - m3; 313 | } 314 | { int 
m4 = z->l - z->c; (void)m4; /* do, line 88 */ 315 | { int ret = r_other_suffix(z); 316 | if (ret == 0) goto lab3; /* call other_suffix, line 88 */ 317 | if (ret < 0) return ret; 318 | } 319 | lab3: 320 | z->c = z->l - m4; 321 | } 322 | { int m5 = z->l - z->c; (void)m5; /* do, line 89 */ 323 | { int ret = r_undouble(z); 324 | if (ret == 0) goto lab4; /* call undouble, line 89 */ 325 | if (ret < 0) return ret; 326 | } 327 | lab4: 328 | z->c = z->l - m5; 329 | } 330 | z->c = z->lb; 331 | return 1; 332 | } 333 | 334 | extern struct SN_env * danish_ISO_8859_1_create_env(void) { return SN_create_env(1, 2, 0); } 335 | 336 | extern void danish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 1); } 337 | 338 | -------------------------------------------------------------------------------- /libstemmer_c/src_c/stem_UTF_8_danish.c: -------------------------------------------------------------------------------- 1 | 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */ 3 | 4 | #include "../runtime/header.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | extern int danish_UTF_8_stem(struct SN_env * z); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | static int r_undouble(struct SN_env * z); 14 | static int r_other_suffix(struct SN_env * z); 15 | static int r_consonant_pair(struct SN_env * z); 16 | static int r_main_suffix(struct SN_env * z); 17 | static int r_mark_regions(struct SN_env * z); 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | 23 | extern struct SN_env * danish_UTF_8_create_env(void); 24 | extern void danish_UTF_8_close_env(struct SN_env * z); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | static const symbol s_0_0[3] = { 'h', 'e', 'd' }; 31 | static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' }; 32 | static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' }; 33 | static const symbol s_0_3[1] = { 'e' }; 34 | static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' }; 35 | static const 
symbol s_0_5[4] = { 'e', 'n', 'd', 'e' }; 36 | static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' }; 37 | static const symbol s_0_7[3] = { 'e', 'n', 'e' }; 38 | static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' }; 39 | static const symbol s_0_9[3] = { 'e', 'r', 'e' }; 40 | static const symbol s_0_10[2] = { 'e', 'n' }; 41 | static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' }; 42 | static const symbol s_0_12[4] = { 'e', 'r', 'e', 'n' }; 43 | static const symbol s_0_13[2] = { 'e', 'r' }; 44 | static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' }; 45 | static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' }; 46 | static const symbol s_0_16[1] = { 's' }; 47 | static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' }; 48 | static const symbol s_0_18[2] = { 'e', 's' }; 49 | static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' }; 50 | static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' }; 51 | static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' }; 52 | static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' }; 53 | static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' }; 54 | static const symbol s_0_24[3] = { 'e', 'n', 's' }; 55 | static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' }; 56 | static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' }; 57 | static const symbol s_0_27[3] = { 'e', 'r', 's' }; 58 | static const symbol s_0_28[3] = { 'e', 't', 's' }; 59 | static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' }; 60 | static const symbol s_0_30[2] = { 'e', 't' }; 61 | static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' }; 62 | 63 | static const struct among a_0[32] = 64 | { 65 | /* 0 */ { 3, s_0_0, -1, 1, 0}, 66 | /* 1 */ { 5, s_0_1, 0, 1, 0}, 67 | /* 2 */ { 4, s_0_2, -1, 1, 0}, 68 | /* 3 */ { 1, s_0_3, -1, 1, 0}, 69 | /* 4 */ { 5, s_0_4, 3, 1, 0}, 70 | /* 5 */ { 4, s_0_5, 3, 1, 0}, 71 | /* 6 */ { 6, s_0_6, 5, 1, 0}, 72 | /* 7 */ { 3, s_0_7, 3, 1, 0}, 73 | /* 8 */ { 4, s_0_8, 3, 1, 0}, 74 | /* 9 */ { 3, 
s_0_9, 3, 1, 0}, 75 | /* 10 */ { 2, s_0_10, -1, 1, 0}, 76 | /* 11 */ { 5, s_0_11, 10, 1, 0}, 77 | /* 12 */ { 4, s_0_12, 10, 1, 0}, 78 | /* 13 */ { 2, s_0_13, -1, 1, 0}, 79 | /* 14 */ { 5, s_0_14, 13, 1, 0}, 80 | /* 15 */ { 4, s_0_15, 13, 1, 0}, 81 | /* 16 */ { 1, s_0_16, -1, 2, 0}, 82 | /* 17 */ { 4, s_0_17, 16, 1, 0}, 83 | /* 18 */ { 2, s_0_18, 16, 1, 0}, 84 | /* 19 */ { 5, s_0_19, 18, 1, 0}, 85 | /* 20 */ { 7, s_0_20, 19, 1, 0}, 86 | /* 21 */ { 4, s_0_21, 18, 1, 0}, 87 | /* 22 */ { 5, s_0_22, 18, 1, 0}, 88 | /* 23 */ { 4, s_0_23, 18, 1, 0}, 89 | /* 24 */ { 3, s_0_24, 16, 1, 0}, 90 | /* 25 */ { 6, s_0_25, 24, 1, 0}, 91 | /* 26 */ { 5, s_0_26, 24, 1, 0}, 92 | /* 27 */ { 3, s_0_27, 16, 1, 0}, 93 | /* 28 */ { 3, s_0_28, 16, 1, 0}, 94 | /* 29 */ { 5, s_0_29, 28, 1, 0}, 95 | /* 30 */ { 2, s_0_30, -1, 1, 0}, 96 | /* 31 */ { 4, s_0_31, 30, 1, 0} 97 | }; 98 | 99 | static const symbol s_1_0[2] = { 'g', 'd' }; 100 | static const symbol s_1_1[2] = { 'd', 't' }; 101 | static const symbol s_1_2[2] = { 'g', 't' }; 102 | static const symbol s_1_3[2] = { 'k', 't' }; 103 | 104 | static const struct among a_1[4] = 105 | { 106 | /* 0 */ { 2, s_1_0, -1, -1, 0}, 107 | /* 1 */ { 2, s_1_1, -1, -1, 0}, 108 | /* 2 */ { 2, s_1_2, -1, -1, 0}, 109 | /* 3 */ { 2, s_1_3, -1, -1, 0} 110 | }; 111 | 112 | static const symbol s_2_0[2] = { 'i', 'g' }; 113 | static const symbol s_2_1[3] = { 'l', 'i', 'g' }; 114 | static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' }; 115 | static const symbol s_2_3[3] = { 'e', 'l', 's' }; 116 | static const symbol s_2_4[5] = { 'l', 0xC3, 0xB8, 's', 't' }; 117 | 118 | static const struct among a_2[5] = 119 | { 120 | /* 0 */ { 2, s_2_0, -1, 1, 0}, 121 | /* 1 */ { 3, s_2_1, 0, 1, 0}, 122 | /* 2 */ { 4, s_2_2, 1, 1, 0}, 123 | /* 3 */ { 3, s_2_3, -1, 1, 0}, 124 | /* 4 */ { 5, s_2_4, -1, 2, 0} 125 | }; 126 | 127 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; 128 | 129 | static const unsigned char g_s_ending[] = { 
239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 }; 130 | 131 | static const symbol s_0[] = { 's', 't' }; 132 | static const symbol s_1[] = { 'i', 'g' }; 133 | /* s_2 is four bytes here because the middle character is stored as the two bytes 0xC3 0xB8 */ static const symbol s_2[] = { 'l', 0xC3, 0xB8, 's' }; 134 | 135 | /* r_mark_regions: computes the two marks used by the suffix routines. z->I[1] (x in the generator comments) becomes the position returned by skip_utf8 for a hop of 3 from the cursor; a negative skip_utf8 result returns 0. z->I[0] (p1) starts at z->l and is then set to the cursor reached by the goto v and gopast non v scans over g_v; a negative scan result returns 0 and leaves z->I[0] at z->l. Finally z->I[0] is raised to z->I[1] when it is smaller. Returns 1 on success. */ static int r_mark_regions(struct SN_env * z) { 136 | z->I[0] = z->l; 137 | { int c_test = z->c; /* test, line 33 */ 138 | { int ret = skip_utf8(z->p, z->c, 0, z->l, + 3); 139 | if (ret < 0) return 0; 140 | z->c = ret; /* hop, line 33 */ 141 | } 142 | z->I[1] = z->c; /* setmark x, line 33 */ 143 | z->c = c_test; 144 | } 145 | if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 34 */ 146 | { /* gopast */ /* non v, line 34 */ 147 | int ret = in_grouping_U(z, g_v, 97, 248, 1); 148 | if (ret < 0) return 0; 149 | z->c += ret; 150 | } 151 | z->I[0] = z->c; /* setmark p1, line 34 */ 152 | /* try, line 35 */ 153 | if (!(z->I[0] < z->I[1])) goto lab0; 154 | z->I[0] = z->I[1]; 155 | lab0: 156 | return 1; 157 | } 158 | 159 | /* r_main_suffix: with z->lb temporarily raised to z->I[0], searches table a_0 backwards from the cursor via find_among_b and deletes the slice bracketed by z->ket and z->bra. Result 1 deletes unconditionally. Result 2 (apparently carried only by the single-symbol s entry of a_0) deletes only when in_grouping_b_U over g_s_ending returns 0. Returns 0 when the cursor is before z->I[0] or nothing matches, a negative slice_del result unchanged, otherwise 1. z->lb is put back to its saved value on every path after it has been changed. */ static int r_main_suffix(struct SN_env * z) { 160 | int among_var; 161 | { int mlimit; /* setlimit, line 41 */ 162 | int m1 = z->l - z->c; (void)m1; 163 | if (z->c < z->I[0]) return 0; 164 | z->c = z->I[0]; /* tomark, line 41 */ 165 | mlimit = z->lb; z->lb = z->c; 166 | z->c = z->l - m1; 167 | z->ket = z->c; /* [, line 41 */ 168 | /* quick reject before the table search: the previous byte must be in 96..127 and its bit (byte & 0x1f) must be set in 1851440, which selects d e n r s t */ if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851440 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 169 | among_var = find_among_b(z, a_0, 32); /* substring, line 41 */ 170 | if (!(among_var)) { z->lb = mlimit; return 0; } 171 | z->bra = z->c; /* ], line 41 */ 172 | z->lb = mlimit; 173 | } 174 | switch(among_var) { 175 | case 0: return 0; 176 | case 1: 177 | { int ret = slice_del(z); /* delete, line 48 */ 178 | if (ret < 0) return ret; 179 | } 180 | break; 181 | case 2: 182 | if (in_grouping_b_U(z, g_s_ending, 97, 229, 0)) return 0; 183 | { int ret = slice_del(z); /* delete, line 50 */ 184 | if (ret < 0) return ret; 185 | } 186 | break; 187 | } 
188 | return 1; 189 | } 190 | 191 | /* r_consonant_pair: as a test (cursor restored afterwards), requires a backward match of table a_1 (gd, dt, gt, kt) inside the region limited by z->I[0]. It then moves the cursor to the position skip_utf8 returns for a step of -1 bounded by z->lb, sets z->bra there and calls slice_del; z->ket still holds the position set during the test, so presumably exactly the last character is removed (slice_del is not shown here). Returns 0 if the cursor is before z->I[0], the test fails, or skip_utf8 returns a negative value; a negative slice_del result is returned unchanged; otherwise 1. */ static int r_consonant_pair(struct SN_env * z) { 192 | { int m_test = z->l - z->c; /* test, line 55 */ 193 | { int mlimit; /* setlimit, line 56 */ 194 | int m1 = z->l - z->c; (void)m1; 195 | if (z->c < z->I[0]) return 0; 196 | z->c = z->I[0]; /* tomark, line 56 */ 197 | mlimit = z->lb; z->lb = z->c; 198 | z->c = z->l - m1; 199 | z->ket = z->c; /* [, line 56 */ 200 | /* 100 is d and 116 is t, the only final symbols in a_1; at least two bytes must remain above z->lb */ if (z->c - 1 <= z->lb || (z->p[z->c - 1] != 100 && z->p[z->c - 1] != 116)) { z->lb = mlimit; return 0; } 201 | if (!(find_among_b(z, a_1, 4))) { z->lb = mlimit; return 0; } /* substring, line 56 */ 202 | z->bra = z->c; /* ], line 56 */ 203 | z->lb = mlimit; 204 | } 205 | z->c = z->l - m_test; 206 | } 207 | { int ret = skip_utf8(z->p, z->c, z->lb, 0, -1); 208 | if (ret < 0) return 0; 209 | z->c = ret; /* next, line 62 */ 210 | } 211 | z->bra = z->c; /* ], line 62 */ 212 | { int ret = slice_del(z); /* delete, line 62 */ 213 | if (ret < 0) return ret; 214 | } 215 | return 1; 216 | } 217 | 218 | /* r_other_suffix: first, as an optional step with the cursor restored afterwards, sets z->ket, matches s_0 (st) backwards, sets z->bra, and calls slice_del only if s_1 (ig) also matches at that point; this presumably strips st after ig, assuming eq_s_b moves the cursor back over its match (helper not shown here). Then, limited by z->I[0], searches table a_2: result 1 deletes the slice and then tries r_consonant_pair (a 0 result from it is ignored and the cursor is restored); result 2 replaces the slice with the 4 bytes of s_2 via slice_from_s. Returns 0 when the cursor is before z->I[0] or a_2 does not match, a negative helper result unchanged, otherwise 1. */ static int r_other_suffix(struct SN_env * z) { 219 | int among_var; 220 | { int m1 = z->l - z->c; (void)m1; /* do, line 66 */ 221 | z->ket = z->c; /* [, line 66 */ 222 | if (!(eq_s_b(z, 2, s_0))) goto lab0; 223 | z->bra = z->c; /* ], line 66 */ 224 | if (!(eq_s_b(z, 2, s_1))) goto lab0; 225 | { int ret = slice_del(z); /* delete, line 66 */ 226 | if (ret < 0) return ret; 227 | } 228 | lab0: 229 | z->c = z->l - m1; 230 | } 231 | { int mlimit; /* setlimit, line 67 */ 232 | int m2 = z->l - z->c; (void)m2; 233 | if (z->c < z->I[0]) return 0; 234 | z->c = z->I[0]; /* tomark, line 67 */ 235 | mlimit = z->lb; z->lb = z->c; 236 | z->c = z->l - m2; 237 | z->ket = z->c; /* [, line 67 */ 238 | /* quick reject before the table search: the previous byte must be in 96..127 and its bit (byte & 0x1f) must be set in 1572992, which selects g s t */ if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } 239 | among_var = find_among_b(z, a_2, 5); /* substring, line 67 */ 240 | if (!(among_var)) { z->lb = mlimit; return 0; } 241 | z->bra = z->c; /* ], line 67 */ 242 | z->lb = 
mlimit; 243 | } 244 | switch(among_var) { 245 | case 0: return 0; 246 | case 1: 247 | { int ret = slice_del(z); /* delete, line 70 */ 248 | if (ret < 0) return ret; 249 | } 250 | { int m3 = z->l - z->c; (void)m3; /* do, line 70 */ 251 | { int ret = r_consonant_pair(z); 252 | if (ret == 0) goto lab1; /* call consonant_pair, line 70 */ 253 | if (ret < 0) return ret; 254 | } 255 | lab1: 256 | z->c = z->l - m3; 257 | } 258 | break; 259 | case 2: 260 | { int ret = slice_from_s(z, 4, s_2); /* <-, line 72 */ 261 | if (ret < 0) return ret; 262 | } 263 | break; 264 | } 265 | return 1; 266 | } 267 | 268 | /* r_undouble: limited by z->I[0], sets z->ket, requires out_grouping_b_U(z, g_v, 97, 248, 0) to return 0, sets z->bra, and copies the bracketed slice into z->S[0] with slice_to. After the limit is lifted it calls slice_del only if eq_v_b matches z->S[0] at the cursor. Returns 0 when the cursor is before z->I[0] or a check fails, a negative slice_del result unchanged, otherwise 1. Returns -1 when slice_to yields a null pointer. NOTE(review): on that -1 path z->lb is not reset to the saved mlimit. */ static int r_undouble(struct SN_env * z) { 269 | { int mlimit; /* setlimit, line 76 */ 270 | int m1 = z->l - z->c; (void)m1; 271 | if (z->c < z->I[0]) return 0; 272 | z->c = z->I[0]; /* tomark, line 76 */ 273 | mlimit = z->lb; z->lb = z->c; 274 | z->c = z->l - m1; 275 | z->ket = z->c; /* [, line 76 */ 276 | if (out_grouping_b_U(z, g_v, 97, 248, 0)) { z->lb = mlimit; return 0; } 277 | z->bra = z->c; /* ], line 76 */ 278 | z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */ 279 | if (z->S[0] == 0) return -1; /* -> ch, line 76 */ 280 | z->lb = mlimit; 281 | } 282 | if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */ 283 | { int ret = slice_del(z); /* delete, line 78 */ 284 | if (ret < 0) return ret; 285 | } 286 | return 1; 287 | } 288 | 289 | /* danish_UTF_8_stem: entry point. Runs r_mark_regions with the cursor restored afterwards, then sets z->lb to the cursor and the cursor to z->l (backwards, per the generator comment) and calls r_main_suffix, r_consonant_pair, r_other_suffix and r_undouble in that order, restoring the cursor after each. A 0 result from any routine is ignored; a negative result is returned immediately. On completion the cursor is set to z->lb and 1 is returned. */ extern int danish_UTF_8_stem(struct SN_env * z) { 290 | { int c1 = z->c; /* do, line 84 */ 291 | { int ret = r_mark_regions(z); 292 | if (ret == 0) goto lab0; /* call mark_regions, line 84 */ 293 | if (ret < 0) return ret; 294 | } 295 | lab0: 296 | z->c = c1; 297 | } 298 | z->lb = z->c; z->c = z->l; /* backwards, line 85 */ 299 | 300 | { int m2 = z->l - z->c; (void)m2; /* do, line 86 */ 301 | { int ret = r_main_suffix(z); 302 | if (ret == 0) goto lab1; /* call main_suffix, line 86 */ 303 | if (ret < 0) return ret; 304 | } 305 | lab1: 306 | z->c = z->l - m2; 307 | } 308 | { int m3 = z->l - z->c; (void)m3; /* do, line 87 */ 309 | { 
int ret = r_consonant_pair(z); 310 | if (ret == 0) goto lab2; /* call consonant_pair, line 87 */ 311 | if (ret < 0) return ret; 312 | } 313 | lab2: 314 | z->c = z->l - m3; 315 | } 316 | { int m4 = z->l - z->c; (void)m4; /* do, line 88 */ 317 | { int ret = r_other_suffix(z); 318 | if (ret == 0) goto lab3; /* call other_suffix, line 88 */ 319 | if (ret < 0) return ret; 320 | } 321 | lab3: 322 | z->c = z->l - m4; 323 | } 324 | { int m5 = z->l - z->c; (void)m5; /* do, line 89 */ 325 | { int ret = r_undouble(z); 326 | if (ret == 0) goto lab4; /* call undouble, line 89 */ 327 | if (ret < 0) return ret; 328 | } 329 | lab4: 330 | z->c = z->l - m5; 331 | } 332 | z->c = z->lb; 333 | return 1; 334 | } 335 | 336 | extern struct SN_env * danish_UTF_8_create_env(void) { return SN_create_env(1, 2, 0); } 337 | 338 | extern void danish_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 1); } 339 | 340 | -------------------------------------------------------------------------------- /libstemmer_c/libstemmer/modules.h: -------------------------------------------------------------------------------- 1 | /* libstemmer/modules.h: List of stemming modules. 2 | * 3 | * This file is generated by mkmodules.pl from a list of module names. 4 | * Do not edit manually. 
5 | * 6 | * Modules included by this file are: danish, dutch, english, finnish, french, 7 | * german, hungarian, italian, norwegian, porter, portuguese, romanian, 8 | * russian, spanish, swedish, turkish 9 | */ 10 | 11 | #include "../src_c/stem_ISO_8859_1_danish.h" 12 | #include "../src_c/stem_UTF_8_danish.h" 13 | #include "../src_c/stem_ISO_8859_1_dutch.h" 14 | #include "../src_c/stem_UTF_8_dutch.h" 15 | #include "../src_c/stem_ISO_8859_1_english.h" 16 | #include "../src_c/stem_UTF_8_english.h" 17 | #include "../src_c/stem_ISO_8859_1_finnish.h" 18 | #include "../src_c/stem_UTF_8_finnish.h" 19 | #include "../src_c/stem_ISO_8859_1_french.h" 20 | #include "../src_c/stem_UTF_8_french.h" 21 | #include "../src_c/stem_ISO_8859_1_german.h" 22 | #include "../src_c/stem_UTF_8_german.h" 23 | #include "../src_c/stem_ISO_8859_1_hungarian.h" 24 | #include "../src_c/stem_UTF_8_hungarian.h" 25 | #include "../src_c/stem_ISO_8859_1_italian.h" 26 | #include "../src_c/stem_UTF_8_italian.h" 27 | #include "../src_c/stem_ISO_8859_1_norwegian.h" 28 | #include "../src_c/stem_UTF_8_norwegian.h" 29 | #include "../src_c/stem_ISO_8859_1_porter.h" 30 | #include "../src_c/stem_UTF_8_porter.h" 31 | #include "../src_c/stem_ISO_8859_1_portuguese.h" 32 | #include "../src_c/stem_UTF_8_portuguese.h" 33 | #include "../src_c/stem_ISO_8859_2_romanian.h" 34 | #include "../src_c/stem_UTF_8_romanian.h" 35 | #include "../src_c/stem_KOI8_R_russian.h" 36 | #include "../src_c/stem_UTF_8_russian.h" 37 | #include "../src_c/stem_ISO_8859_1_spanish.h" 38 | #include "../src_c/stem_UTF_8_spanish.h" 39 | #include "../src_c/stem_ISO_8859_1_swedish.h" 40 | #include "../src_c/stem_UTF_8_swedish.h" 41 | #include "../src_c/stem_UTF_8_turkish.h" 42 | 43 | typedef enum { 44 | ENC_UNKNOWN=0, 45 | ENC_ISO_8859_1, 46 | ENC_ISO_8859_2, 47 | ENC_KOI8_R, 48 | ENC_UTF_8 49 | } stemmer_encoding_t; 50 | 51 | struct stemmer_encoding { 52 | const char * name; 53 | stemmer_encoding_t enc; 54 | }; 55 | static struct stemmer_encoding 
encodings[] = { 56 | {"ISO_8859_1", ENC_ISO_8859_1}, 57 | {"ISO_8859_2", ENC_ISO_8859_2}, 58 | {"KOI8_R", ENC_KOI8_R}, 59 | {"UTF_8", ENC_UTF_8}, 60 | {0,ENC_UNKNOWN} 61 | }; 62 | 63 | struct stemmer_modules { 64 | const char * name; 65 | stemmer_encoding_t enc; 66 | struct SN_env * (*create)(void); 67 | void (*close)(struct SN_env *); 68 | int (*stem)(struct SN_env *); 69 | }; 70 | static struct stemmer_modules modules[] = { 71 | {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, 72 | {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 73 | {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, 74 | {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 75 | {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, 76 | {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, 77 | {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 78 | {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 79 | {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 80 | {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 81 | {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 82 | {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 83 | {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 84 | {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 85 | {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, 86 | {"en", 
ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 87 | {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, 88 | {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 89 | {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, 90 | {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, 91 | {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 92 | {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 93 | {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 94 | {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 95 | {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, 96 | {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 97 | {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, 98 | {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 99 | {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, 100 | {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, 101 | {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 102 | {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 103 | {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 104 | {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 105 | 
{"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 106 | {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 107 | {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, 108 | {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, 109 | {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 110 | {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 111 | {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 112 | {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, 113 | {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, 114 | {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 115 | {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, 116 | {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 117 | {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, 118 | {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, 119 | {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, 120 | {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 121 | {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, 122 | {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 123 | {"italian", ENC_ISO_8859_1, 
italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, 124 | {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 125 | {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 126 | {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 127 | {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, 128 | {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, 129 | {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, 130 | {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 131 | {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, 132 | {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 133 | {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, 134 | {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, 135 | {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, 136 | {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 137 | {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem}, 138 | {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, 139 | {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, 140 | {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 141 | {"pt", ENC_ISO_8859_1, 
portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, 142 | {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, 143 | {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 144 | {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 145 | {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 146 | {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 147 | {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 148 | {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 149 | {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, 150 | {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 151 | {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, 152 | {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, 153 | {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, 154 | {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 155 | {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, 156 | {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, 157 | {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, 158 | {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 159 | {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, 
spanish_ISO_8859_1_stem}, 160 | {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, 161 | {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, 162 | {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 163 | {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, 164 | {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 165 | {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, 166 | {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, 167 | {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 168 | {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 169 | {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, 170 | {0,ENC_UNKNOWN,0,0,0} 171 | }; 172 | static const char * algorithm_names[] = { 173 | "danish", 174 | "dutch", 175 | "english", 176 | "finnish", 177 | "french", 178 | "german", 179 | "hungarian", 180 | "italian", 181 | "norwegian", 182 | "porter", 183 | "portuguese", 184 | "romanian", 185 | "russian", 186 | "spanish", 187 | "swedish", 188 | "turkish", 189 | 0 190 | }; 191 | -------------------------------------------------------------------------------- /libstemmer_c/runtime/utilities_sq3.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "header.h" 9 | 10 | #define unless(C) if(!(C)) 11 | 12 | #define CREATE_SIZE 1 13 | 14 | extern symbol * create_s(void) { 15 | symbol * p; 16 | void * mem = sqlite3_malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol)); 17 | if (mem == NULL) return NULL; 18 | p = (symbol *) 
(HEAD + (char *) mem); 19 | CAPACITY(p) = CREATE_SIZE; 20 | SET_SIZE(p, CREATE_SIZE); 21 | return p; 22 | } 23 | 24 | extern void lose_s(symbol * p) { 25 | if (p == NULL) return; 26 | sqlite3_free((char *) p - HEAD); 27 | } 28 | 29 | /* 30 | new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c 31 | if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new 32 | position, or 0 on failure. 33 | 34 | -- used to implement hop and next in the utf8 case. 35 | */ 36 | 37 | extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { 38 | int b; 39 | if (n >= 0) { 40 | for (; n > 0; n--) { 41 | if (c >= l) return -1; 42 | b = p[c++]; 43 | if (b >= 0xC0) { /* 1100 0000 */ 44 | while (c < l) { 45 | b = p[c]; 46 | if (b >= 0xC0 || b < 0x80) break; 47 | /* break unless b is 10------ */ 48 | c++; 49 | } 50 | } 51 | } 52 | } else { 53 | for (; n < 0; n++) { 54 | if (c <= lb) return -1; 55 | b = p[--c]; 56 | if (b >= 0x80) { /* 1000 0000 */ 57 | while (c > lb) { 58 | b = p[c]; 59 | if (b >= 0xC0) break; /* 1100 0000 */ 60 | c--; 61 | } 62 | } 63 | } 64 | } 65 | return c; 66 | } 67 | 68 | /* Code for character groupings: utf8 cases */ 69 | 70 | static int get_utf8(const symbol * p, int c, int l, int * slot) { 71 | int b0, b1; 72 | if (c >= l) return 0; 73 | b0 = p[c++]; 74 | if (b0 < 0xC0 || c == l) { /* 1100 0000 */ 75 | * slot = b0; return 1; 76 | } 77 | b1 = p[c++]; 78 | if (b0 < 0xE0 || c == l) { /* 1110 0000 */ 79 | * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; 80 | } 81 | * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3; 82 | } 83 | 84 | static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { 85 | int b0, b1; 86 | if (c <= lb) return 0; 87 | b0 = p[--c]; 88 | if (b0 < 0x80 || c == lb) { /* 1000 0000 */ 89 | * slot = b0; return 1; 90 | } 91 | b1 = p[--c]; 92 | if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */ 93 | * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2; 94 | } 95 | * slot = (p[c] & 
0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3; 96 | } 97 | 98 | extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 99 | do { 100 | int ch; 101 | int w = get_utf8(z->p, z->c, z->l, & ch); 102 | unless (w) return -1; 103 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 104 | return w; 105 | z->c += w; 106 | } while (repeat); 107 | return 0; 108 | } 109 | 110 | extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 111 | do { 112 | int ch; 113 | int w = get_b_utf8(z->p, z->c, z->lb, & ch); 114 | unless (w) return -1; 115 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 116 | return w; 117 | z->c -= w; 118 | } while (repeat); 119 | return 0; 120 | } 121 | 122 | extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 123 | do { 124 | int ch; 125 | int w = get_utf8(z->p, z->c, z->l, & ch); 126 | unless (w) return -1; 127 | unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 128 | return w; 129 | z->c += w; 130 | } while (repeat); 131 | return 0; 132 | } 133 | 134 | extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 135 | do { 136 | int ch; 137 | int w = get_b_utf8(z->p, z->c, z->lb, & ch); 138 | unless (w) return -1; 139 | unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 140 | return w; 141 | z->c -= w; 142 | } while (repeat); 143 | return 0; 144 | } 145 | 146 | /* Code for character groupings: non-utf8 cases */ 147 | 148 | extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 149 | do { 150 | int ch; 151 | if (z->c >= z->l) return -1; 152 | ch = z->p[z->c]; 153 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 154 | return 1; 155 | z->c++; 156 | } while (repeat); 157 | return 0; 158 
| } 159 | 160 | extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 161 | do { 162 | int ch; 163 | if (z->c <= z->lb) return -1; 164 | ch = z->p[z->c - 1]; 165 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 166 | return 1; 167 | z->c--; 168 | } while (repeat); 169 | return 0; 170 | } 171 | 172 | extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 173 | do { 174 | int ch; 175 | if (z->c >= z->l) return -1; 176 | ch = z->p[z->c]; 177 | unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 178 | return 1; 179 | z->c++; 180 | } while (repeat); 181 | return 0; 182 | } 183 | 184 | extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 185 | do { 186 | int ch; 187 | if (z->c <= z->lb) return -1; 188 | ch = z->p[z->c - 1]; 189 | unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 190 | return 1; 191 | z->c--; 192 | } while (repeat); 193 | return 0; 194 | } 195 | 196 | extern int eq_s(struct SN_env * z, int s_size, const symbol * s) { 197 | if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0; 198 | z->c += s_size; return 1; 199 | } 200 | 201 | extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) { 202 | if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0; 203 | z->c -= s_size; return 1; 204 | } 205 | 206 | extern int eq_v(struct SN_env * z, const symbol * p) { 207 | return eq_s(z, SIZE(p), p); 208 | } 209 | 210 | extern int eq_v_b(struct SN_env * z, const symbol * p) { 211 | return eq_s_b(z, SIZE(p), p); 212 | } 213 | 214 | extern int find_among(struct SN_env * z, const struct among * v, int v_size) { 215 | 216 | int i = 0; 217 | int j = v_size; 218 | 219 | int c = z->c; int l = z->l; 220 | symbol * q = z->p + c; 221 | 222 | const struct among * w; 
223 | 224 | int common_i = 0; 225 | int common_j = 0; 226 | 227 | int first_key_inspected = 0; 228 | 229 | while(1) { 230 | int k = i + ((j - i) >> 1); 231 | int diff = 0; 232 | int common = common_i < common_j ? common_i : common_j; /* smaller */ 233 | w = v + k; 234 | { 235 | int i2; for (i2 = common; i2 < w->s_size; i2++) { 236 | if (c + common == l) { diff = -1; break; } 237 | diff = q[common] - w->s[i2]; 238 | if (diff != 0) break; 239 | common++; 240 | } 241 | } 242 | if (diff < 0) { j = k; common_j = common; } 243 | else { i = k; common_i = common; } 244 | if (j - i <= 1) { 245 | if (i > 0) break; /* v->s has been inspected */ 246 | if (j == i) break; /* only one item in v */ 247 | 248 | /* - but now we need to go round once more to get 249 | v->s inspected. This looks messy, but is actually 250 | the optimal approach. */ 251 | 252 | if (first_key_inspected) break; 253 | first_key_inspected = 1; 254 | } 255 | } 256 | while(1) { 257 | w = v + i; 258 | if (common_i >= w->s_size) { 259 | z->c = c + w->s_size; 260 | if (w->function == 0) return w->result; 261 | { 262 | int res = w->function(z); 263 | z->c = c + w->s_size; 264 | if (res) return w->result; 265 | } 266 | } 267 | i = w->substring_i; 268 | if (i < 0) return 0; 269 | } 270 | } 271 | 272 | /* find_among_b is for backwards processing. Same comments apply */ 273 | 274 | extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { 275 | 276 | int i = 0; 277 | int j = v_size; 278 | 279 | int c = z->c; int lb = z->lb; 280 | symbol * q = z->p + c - 1; 281 | 282 | const struct among * w; 283 | 284 | int common_i = 0; 285 | int common_j = 0; 286 | 287 | int first_key_inspected = 0; 288 | 289 | while(1) { 290 | int k = i + ((j - i) >> 1); 291 | int diff = 0; 292 | int common = common_i < common_j ? 
common_i : common_j; 293 | w = v + k; 294 | { 295 | int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) { 296 | if (c - common == lb) { diff = -1; break; } 297 | diff = q[- common] - w->s[i2]; 298 | if (diff != 0) break; 299 | common++; 300 | } 301 | } 302 | if (diff < 0) { j = k; common_j = common; } 303 | else { i = k; common_i = common; } 304 | if (j - i <= 1) { 305 | if (i > 0) break; 306 | if (j == i) break; 307 | if (first_key_inspected) break; 308 | first_key_inspected = 1; 309 | } 310 | } 311 | while(1) { 312 | w = v + i; 313 | if (common_i >= w->s_size) { 314 | z->c = c - w->s_size; 315 | if (w->function == 0) return w->result; 316 | { 317 | int res = w->function(z); 318 | z->c = c - w->s_size; 319 | if (res) return w->result; 320 | } 321 | } 322 | i = w->substring_i; 323 | if (i < 0) return 0; 324 | } 325 | } 326 | 327 | 328 | /* Increase the size of the buffer pointed to by p to at least n symbols. 329 | * If insufficient memory, returns NULL and frees the old buffer. 330 | */ 331 | static symbol * increase_size(symbol * p, int n) { 332 | symbol * q; 333 | int new_size = n + 20; 334 | void * mem = sqlite3_realloc((char *) p - HEAD, 335 | HEAD + (new_size + 1) * sizeof(symbol)); 336 | if (mem == NULL) { 337 | lose_s(p); 338 | return NULL; 339 | } 340 | q = (symbol *) (HEAD + (char *)mem); 341 | CAPACITY(q) = new_size; 342 | return q; 343 | } 344 | 345 | /* to replace symbols between c_bra and c_ket in z->p by the 346 | s_size symbols at s. 347 | Returns 0 on success, -1 on error. 348 | Also, frees z->p (and sets it to NULL) on error. 
349 | */ 350 | extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr) 351 | { 352 | int adjustment; 353 | int len; 354 | if (z->p == NULL) { 355 | z->p = create_s(); 356 | if (z->p == NULL) return -1; 357 | } 358 | adjustment = s_size - (c_ket - c_bra); 359 | len = SIZE(z->p); 360 | if (adjustment != 0) { 361 | if (adjustment + len > CAPACITY(z->p)) { 362 | z->p = increase_size(z->p, adjustment + len); 363 | if (z->p == NULL) return -1; 364 | } 365 | memmove(z->p + c_ket + adjustment, 366 | z->p + c_ket, 367 | (len - c_ket) * sizeof(symbol)); 368 | SET_SIZE(z->p, adjustment + len); 369 | z->l += adjustment; 370 | if (z->c >= c_ket) 371 | z->c += adjustment; 372 | else 373 | if (z->c > c_bra) 374 | z->c = c_bra; 375 | } 376 | unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); 377 | if (adjptr != NULL) 378 | *adjptr = adjustment; 379 | return 0; 380 | } 381 | 382 | static int slice_check(struct SN_env * z) { 383 | 384 | if (z->bra < 0 || 385 | z->bra > z->ket || 386 | z->ket > z->l || 387 | z->p == NULL || 388 | z->l > SIZE(z->p)) /* this line could be removed */ 389 | { 390 | #if 0 391 | fprintf(stderr, "faulty slice operation:\n"); 392 | debug(z, -1, 0); 393 | #endif 394 | return -1; 395 | } 396 | return 0; 397 | } 398 | 399 | extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) { 400 | if (slice_check(z)) return -1; 401 | return replace_s(z, z->bra, z->ket, s_size, s, NULL); 402 | } 403 | 404 | extern int slice_from_v(struct SN_env * z, const symbol * p) { 405 | return slice_from_s(z, SIZE(p), p); 406 | } 407 | 408 | extern int slice_del(struct SN_env * z) { 409 | return slice_from_s(z, 0, 0); 410 | } 411 | 412 | extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) { 413 | int adjustment; 414 | if (replace_s(z, bra, ket, s_size, s, &adjustment)) 415 | return -1; 416 | if (bra <= z->bra) z->bra += adjustment; 417 | if (bra <= z->ket) z->ket 
+= adjustment; 418 | return 0; 419 | } 420 | 421 | extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { 422 | int adjustment; 423 | if (replace_s(z, bra, ket, SIZE(p), p, &adjustment)) 424 | return -1; 425 | if (bra <= z->bra) z->bra += adjustment; 426 | if (bra <= z->ket) z->ket += adjustment; 427 | return 0; 428 | } 429 | 430 | extern symbol * slice_to(struct SN_env * z, symbol * p) { 431 | if (slice_check(z)) { 432 | lose_s(p); 433 | return NULL; 434 | } 435 | { 436 | int len = z->ket - z->bra; 437 | if (CAPACITY(p) < len) { 438 | p = increase_size(p, len); 439 | if (p == NULL) 440 | return NULL; 441 | } 442 | memmove(p, z->p + z->bra, len * sizeof(symbol)); 443 | SET_SIZE(p, len); 444 | } 445 | return p; 446 | } 447 | 448 | extern symbol * assign_to(struct SN_env * z, symbol * p) { 449 | int len = z->l; 450 | if (CAPACITY(p) < len) { 451 | p = increase_size(p, len); 452 | if (p == NULL) 453 | return NULL; 454 | } 455 | memmove(p, z->p, len * sizeof(symbol)); 456 | SET_SIZE(p, len); 457 | return p; 458 | } 459 | 460 | #if 0 461 | extern void debug(struct SN_env * z, int number, int line_count) { 462 | int i; 463 | int limit = SIZE(z->p); 464 | /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/ 465 | if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); 466 | for (i = 0; i <= limit; i++) { 467 | if (z->lb == i) printf("{"); 468 | if (z->bra == i) printf("["); 469 | if (z->c == i) printf("|"); 470 | if (z->ket == i) printf("]"); 471 | if (z->l == i) printf("}"); 472 | if (i < limit) 473 | { int ch = z->p[i]; 474 | if (ch == 0) ch = '#'; 475 | printf("%c", ch); 476 | } 477 | } 478 | printf("'\n"); 479 | } 480 | #endif 481 | -------------------------------------------------------------------------------- /fts3_unicode2.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2012 May 25 3 | ** 4 | ** The author disclaims copyright to this source code. 
/*
** Return true (1) if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise false (0).
**
** Codepoints at or above (1<<22) cannot be represented in the lookup
** table and are reported as alphanumeric.
**
** Negative values are outside the contract of this function. They are
** kept away from the ASCII bitmap so that no out-of-bounds read can occur,
** but the value returned for them is not meaningful.
*/
int sqlite3FtsUnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };
  /* One bit per ASCII codepoint; a set bit means "not a letter or digit". */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  /* The unsigned comparison keeps negative values out of aAscii[]. */
  if( (unsigned int)c<128 ){
    /* Shift an unsigned 1: (1 << 31) on a signed int is undefined, and
    ** bit 31 is reached for c = 31, 63, 95 and 127. */
    return ( (aAscii[c >> 5] & (1u << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    /* Setting the low 10 bits makes key compare >= every entry whose range
    ** starts at or before c, whatever that entry's length field is. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    /* Binary search for the last entry that is <= key. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    /* c is alphanumeric iff it lies beyond the end of that range. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
/*
** Return non-zero if the argument interpreted as a unicode codepoint
** is a diacritical modifier character, or zero otherwise.
**
** Only codepoints 768..817 are considered. They are looked up in a 50-bit
** bitmap split across two words: bit (c-768) of mask0 for the first 32
** codepoints and bit (c-800) of mask1 for the remaining 18. The non-zero
** result is the selected mask bit itself, not necessarily 1.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  /* Shift an unsigned 1: for c==799 the shift count is 31, and (1 << 31)
  ** on a signed int is undefined behaviour. */
  return (c < 768+32) ?
      (mask0 & (1u << (c-768))) :
      (mask1 & (1u << (c-768-32)));
}
If a specific codepoint C does require folding, then its lower 244 | ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). 245 | ** 246 | ** The contents of this array are generated by parsing the CaseFolding.txt 247 | ** file distributed as part of the "Unicode Character Database". See 248 | ** http://www.unicode.org for details. 249 | */ 250 | static const struct TableEntry { 251 | unsigned short iCode; 252 | unsigned char flags; 253 | unsigned char nRange; 254 | } aEntry[] = { 255 | {65, 14, 26}, {181, 64, 1}, {192, 14, 23}, 256 | {216, 14, 7}, {256, 1, 48}, {306, 1, 6}, 257 | {313, 1, 16}, {330, 1, 46}, {376, 116, 1}, 258 | {377, 1, 6}, {383, 104, 1}, {385, 50, 1}, 259 | {386, 1, 4}, {390, 44, 1}, {391, 0, 1}, 260 | {393, 42, 2}, {395, 0, 1}, {398, 32, 1}, 261 | {399, 38, 1}, {400, 40, 1}, {401, 0, 1}, 262 | {403, 42, 1}, {404, 46, 1}, {406, 52, 1}, 263 | {407, 48, 1}, {408, 0, 1}, {412, 52, 1}, 264 | {413, 54, 1}, {415, 56, 1}, {416, 1, 6}, 265 | {422, 60, 1}, {423, 0, 1}, {425, 60, 1}, 266 | {428, 0, 1}, {430, 60, 1}, {431, 0, 1}, 267 | {433, 58, 2}, {435, 1, 4}, {439, 62, 1}, 268 | {440, 0, 1}, {444, 0, 1}, {452, 2, 1}, 269 | {453, 0, 1}, {455, 2, 1}, {456, 0, 1}, 270 | {458, 2, 1}, {459, 1, 18}, {478, 1, 18}, 271 | {497, 2, 1}, {498, 1, 4}, {502, 122, 1}, 272 | {503, 134, 1}, {504, 1, 40}, {544, 110, 1}, 273 | {546, 1, 18}, {570, 70, 1}, {571, 0, 1}, 274 | {573, 108, 1}, {574, 68, 1}, {577, 0, 1}, 275 | {579, 106, 1}, {580, 28, 1}, {581, 30, 1}, 276 | {582, 1, 10}, {837, 36, 1}, {880, 1, 4}, 277 | {886, 0, 1}, {902, 18, 1}, {904, 16, 3}, 278 | {908, 26, 1}, {910, 24, 2}, {913, 14, 17}, 279 | {931, 14, 9}, {962, 0, 1}, {975, 4, 1}, 280 | {976, 140, 1}, {977, 142, 1}, {981, 146, 1}, 281 | {982, 144, 1}, {984, 1, 24}, {1008, 136, 1}, 282 | {1009, 138, 1}, {1012, 130, 1}, {1013, 128, 1}, 283 | {1015, 0, 1}, {1017, 152, 1}, {1018, 0, 1}, 284 | {1021, 110, 3}, {1024, 34, 16}, {1040, 14, 32}, 285 | {1120, 1, 34}, {1162, 1, 54}, {1216, 6, 1}, 286 | {1217, 1, 14}, 
{1232, 1, 88}, {1329, 22, 38}, 287 | {4256, 66, 38}, {4295, 66, 1}, {4301, 66, 1}, 288 | {7680, 1, 150}, {7835, 132, 1}, {7838, 96, 1}, 289 | {7840, 1, 96}, {7944, 150, 8}, {7960, 150, 6}, 290 | {7976, 150, 8}, {7992, 150, 8}, {8008, 150, 6}, 291 | {8025, 151, 8}, {8040, 150, 8}, {8072, 150, 8}, 292 | {8088, 150, 8}, {8104, 150, 8}, {8120, 150, 2}, 293 | {8122, 126, 2}, {8124, 148, 1}, {8126, 100, 1}, 294 | {8136, 124, 4}, {8140, 148, 1}, {8152, 150, 2}, 295 | {8154, 120, 2}, {8168, 150, 2}, {8170, 118, 2}, 296 | {8172, 152, 1}, {8184, 112, 2}, {8186, 114, 2}, 297 | {8188, 148, 1}, {8486, 98, 1}, {8490, 92, 1}, 298 | {8491, 94, 1}, {8498, 12, 1}, {8544, 8, 16}, 299 | {8579, 0, 1}, {9398, 10, 26}, {11264, 22, 47}, 300 | {11360, 0, 1}, {11362, 88, 1}, {11363, 102, 1}, 301 | {11364, 90, 1}, {11367, 1, 6}, {11373, 84, 1}, 302 | {11374, 86, 1}, {11375, 80, 1}, {11376, 82, 1}, 303 | {11378, 0, 1}, {11381, 0, 1}, {11390, 78, 2}, 304 | {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1}, 305 | {42560, 1, 46}, {42624, 1, 24}, {42786, 1, 14}, 306 | {42802, 1, 62}, {42873, 1, 4}, {42877, 76, 1}, 307 | {42878, 1, 10}, {42891, 0, 1}, {42893, 74, 1}, 308 | {42896, 1, 4}, {42912, 1, 10}, {42922, 72, 1}, 309 | {65313, 14, 26}, 310 | }; 311 | static const unsigned short aiOff[] = { 312 | 1, 2, 8, 15, 16, 26, 28, 32, 313 | 37, 38, 40, 48, 63, 64, 69, 71, 314 | 79, 80, 116, 202, 203, 205, 206, 207, 315 | 209, 210, 211, 213, 214, 217, 218, 219, 316 | 775, 7264, 10792, 10795, 23228, 23256, 30204, 54721, 317 | 54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 318 | 57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 319 | 65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 320 | 65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 321 | 65514, 65521, 65527, 65528, 65529, 322 | }; 323 | 324 | int ret = c; 325 | 326 | assert( c>=0 ); 327 | assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); 328 | 329 | if( c<128 ){ 330 | if( c>='A' && c<='Z' ) ret = c + ('a' - 
'A'); 331 | }else if( c<65536 ){ 332 | int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; 333 | int iLo = 0; 334 | int iRes = -1; 335 | 336 | while( iHi>=iLo ){ 337 | int iTest = (iHi + iLo) / 2; 338 | int cmp = (c - aEntry[iTest].iCode); 339 | if( cmp>=0 ){ 340 | iRes = iTest; 341 | iLo = iTest+1; 342 | }else{ 343 | iHi = iTest-1; 344 | } 345 | } 346 | assert( iRes<0 || c>=aEntry[iRes].iCode ); 347 | 348 | if( iRes>=0 ){ 349 | const struct TableEntry *p = &aEntry[iRes]; 350 | if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ 351 | ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; 352 | assert( ret>0 ); 353 | } 354 | } 355 | 356 | if( bRemoveDiacritic ) ret = remove_diacritic(ret); 357 | } 358 | 359 | else if( c>=66560 && c<66600 ){ 360 | ret = c + 40; 361 | } 362 | 363 | return ret; 364 | } 365 | #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */ 366 | #endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */ 367 | -------------------------------------------------------------------------------- /fts3_unicodesn.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2012 Nov 11 3 | ** 4 | ** The author disclaims copyright to this source code. In place of 5 | ** a legal notice, here is a blessing: 6 | ** 7 | ** May you do good and not evil. 8 | ** May you find forgiveness for yourself and forgive others. 9 | ** May you share freely, never taking more than you give. 
**
******************************************************************************
**
** Implementation of the "unicode" full-text-search tokenizer with Snowball stemming
*/

#include "fts3_unicodesn.h"

/* Snowball stemmer */
#include "api.h"

#ifdef SQLITE_ENABLE_FTS4_UNICODE61

#include "fts3Int.h"
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "fts3_tokenizer.h"

#include "libstemmer_c/src_c/stem_UTF_8_danish.h"
#include "libstemmer_c/src_c/stem_UTF_8_dutch.h"
#include "libstemmer_c/src_c/stem_UTF_8_english.h"
#include "libstemmer_c/src_c/stem_UTF_8_finnish.h"
#include "libstemmer_c/src_c/stem_UTF_8_french.h"
#include "libstemmer_c/src_c/stem_UTF_8_german.h"
#include "libstemmer_c/src_c/stem_UTF_8_hungarian.h"
#include "libstemmer_c/src_c/stem_UTF_8_italian.h"
#include "libstemmer_c/src_c/stem_UTF_8_norwegian.h"
#include "libstemmer_c/src_c/stem_UTF_8_porter.h"
#include "libstemmer_c/src_c/stem_UTF_8_portuguese.h"
#include "libstemmer_c/src_c/stem_UTF_8_romanian.h"
#include "libstemmer_c/src_c/stem_UTF_8_russian.h"
#include "libstemmer_c/src_c/stem_UTF_8_spanish.h"
#include "libstemmer_c/src_c/stem_UTF_8_swedish.h"
#include "libstemmer_c/src_c/stem_UTF_8_turkish.h"


/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xc0) and
** holds the payload bits carried by that lead byte: 5 bits for 0xc0..0xdf,
** 4 bits for 0xe0..0xef, 3 bits for 0xf0..0xf7, and progressively fewer
** for the remaining values.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** READ_UTF8(zIn, zTerm, c): decode one codepoint starting at zIn into c and
** advance zIn past the bytes consumed. Continuation bytes (top bits 10) are
** accumulated until zTerm is reached or a non-continuation byte is seen.
** Multi-byte results below 0x80, in the range 0xD800..0xDFFF, or equal to
** 0xFFFE/0xFFFF are replaced by 0xFFFD.
**
** The expansion is a sequence of bare statements with no enclosing braces,
** so the macro must be used as a complete statement and never as the
** unbraced body of an if/else or loop.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }

/*
** WRITE_UTF8(zOut, c): append codepoint c to zOut using one to four bytes
** (thresholds 0x80, 0x800 and 0x10000) and advance zOut past the bytes
** written. The caller must guarantee at least four bytes of space.
**
** NOTE(review): the u8 type used in the casts is not defined in this file;
** presumably it comes from fts3Int.h - confirm.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (u8)(c&0xFF);                            \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }else{                                               \
    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

/*
** Entry points of one Snowball stemmer: create an environment, destroy it,
** and stem the word currently loaded into it.
*/
typedef struct {
    struct SN_env * (*create)(void);
    void (*close)(struct SN_env *);
    int (*stem)(struct SN_env *);
} stemmer_callbacks;

/*
** Tokenizer object. One is built by unicodeCreate() from the tokenizer
** arguments and released by unicodeDestroy().
*/
struct unicode_tokenizer {
  sqlite3_tokenizer base;
  int bRemoveDiacritic;           /* Passed to sqlite3FtsUnicodeFold() */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Codepoints whose alnum status is inverted;
                                  ** binary-searched by unicodeIsException() */
  /* Snowball stemmer */
  stemmer_callbacks stemmer;      /* create==NULL means no stemming */
};

/*
** Cursor object. One is built by unicodeOpen() for each string to be
** tokenized and released by unicodeClose().
*/
struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
  int iToken;                     /* Index of next token to be returned */
  char *zToken;                   /* storage for current token */
  int nAlloc;                     /* space allocated at zToken */
  struct SN_env *pStemmer;        /* Snowball stemmer */
};


/*
** Destroy a tokenizer allocated by unicodeCreate().
**
** Frees the exception array and then the tokenizer itself. A NULL
** pTokenizer is accepted and ignored. Always returns SQLITE_OK.
*/
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
  if( pTokenizer ){
    unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
    sqlite3_free(p->aiException);
    sqlite3_free(p);
  }
  return SQLITE_OK;
}

/*
** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
** statement has specified that the tokenizer for this table shall consider
** all characters in string zIn/nIn to be separators (if bAlnum==0) or
** token characters (if bAlnum==1).
**
** For each codepoint in the zIn/nIn string, this function checks if the
** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
** If so, no action is taken. Otherwise, the codepoint is added to the
** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
** codepoints in the aiException[] array.
**
** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
** It is not possible to change the behaviour of the tokenizer with respect
** to these codepoints.
162 | */ 163 | static int unicodeAddExceptions( 164 | unicode_tokenizer *p, /* Tokenizer to add exceptions to */ 165 | int bAlnum, /* Replace Isalnum() return value with this */ 166 | const char *zIn, /* Array of characters to make exceptions */ 167 | int nIn /* Length of z in bytes */ 168 | ){ 169 | const unsigned char *z = (const unsigned char *)zIn; 170 | const unsigned char *zTerm = &z[nIn]; 171 | int iCode; 172 | int nEntry = 0; 173 | 174 | assert( bAlnum==0 || bAlnum==1 ); 175 | 176 | while( zaiException, (p->nException+nEntry)*sizeof(int)); 191 | if( aNew==0 ) return SQLITE_NOMEM; 192 | nNew = p->nException; 193 | 194 | z = (const unsigned char *)zIn; 195 | while( zi; j--) aNew[j] = aNew[j-1]; 203 | aNew[i] = iCode; 204 | nNew++; 205 | } 206 | } 207 | p->aiException = aNew; 208 | p->nException = nNew; 209 | } 210 | 211 | return SQLITE_OK; 212 | } 213 | 214 | /* 215 | ** Return true if the p->aiException[] array contains the value iCode. 216 | */ 217 | static int unicodeIsException(unicode_tokenizer *p, int iCode){ 218 | if( p->nException>0 ){ 219 | int *a = p->aiException; 220 | int iLo = 0; 221 | int iHi = p->nException-1; 222 | 223 | while( iHi>=iLo ){ 224 | int iTest = (iHi + iLo) / 2; 225 | if( iCode==a[iTest] ){ 226 | return 1; 227 | }else if( iCode>a[iTest] ){ 228 | iLo = iTest+1; 229 | }else{ 230 | iHi = iTest-1; 231 | } 232 | } 233 | } 234 | 235 | return 0; 236 | } 237 | 238 | /* 239 | ** Return true if, for the purposes of tokenization, codepoint iCode is 240 | ** considered a token character (not a separator). 241 | */ 242 | static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){ 243 | assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); 244 | return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode); 245 | } 246 | 247 | /* Allow stemmers to be looked up by ISO-639 language code or by (English) name. 
*/ 248 | static struct {const char *shortName; const char *longName; stemmer_callbacks stemmer;} 249 | const stemmers[] = { 250 | {"da", "danish", {danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}}, 251 | {"nl", "dutch", {dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}}, 252 | {"en", "english", {english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}}, 253 | {"fi", "finnish", {finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}}, 254 | {"fr", "french", {french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}}, 255 | {"de", "german", {german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}}, 256 | {"hu", "hungarian", {hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}}, 257 | {"it", "italian", {italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}}, 258 | {"no", "norwegian", {norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}}, 259 | {"porter", "porter", {porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}}, 260 | {"pt", "portuguese", {portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}}, 261 | {"ro", "romanian", {romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}}, 262 | {"ru", "russian", {russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}}, 263 | {"es", "spanish", {spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}}, 264 | {"sv", "swedish", {swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}}, 265 | {"tr", "turkish", {turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}}, 266 | }; 267 | 268 | 269 | static int unicodeSetStemmer( 270 | unicode_tokenizer *p, 271 | const char *zIn, /* Stemmer name (not NUL-terminated) */ 272 | int nIn /* Length of z in bytes */ 273 | ) 274 | { 275 | for (int i = 0; i < sizeof(stemmers)/sizeof(stemmers[0]); i++) { 276 | const char 
*n1 = stemmers[i].shortName, *n2 = stemmers[i].longName; 277 | if ( (nIn==strlen(n1) && memcmp(n1, zIn, nIn)==0) || 278 | (nIn==strlen(n2) && memcmp(n2, zIn, nIn)==0) ) { 279 | p->stemmer = stemmers[i].stemmer; 280 | return SQLITE_OK; 281 | } 282 | } 283 | return SQLITE_ERROR; 284 | } 285 | 286 | /* 287 | ** Create a new tokenizer instance. 288 | */ 289 | static int unicodeCreate( 290 | int nArg, /* Size of array argv[] */ 291 | const char * const *azArg, /* Tokenizer creation arguments */ 292 | sqlite3_tokenizer **pp /* OUT: New tokenizer handle */ 293 | ){ 294 | unicode_tokenizer *pNew; /* New tokenizer object */ 295 | int i; 296 | int rc = SQLITE_OK; 297 | 298 | pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer)); 299 | if( pNew==NULL ) return SQLITE_NOMEM; 300 | memset(pNew, 0, sizeof(unicode_tokenizer)); 301 | pNew->bRemoveDiacritic = 1; 302 | pNew->stemmer.create = NULL; 303 | pNew->stemmer.close = NULL; 304 | pNew->stemmer.stem = NULL; 305 | 306 | for(i=0; rc==SQLITE_OK && ibRemoveDiacritic = 1; 312 | } 313 | else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){ 314 | pNew->bRemoveDiacritic = 0; 315 | } 316 | else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){ 317 | rc = unicodeAddExceptions(pNew, 1, &z[11], n-11); 318 | } 319 | else if( n>=11 && memcmp("separators=", z, 11)==0 ){ 320 | rc = unicodeAddExceptions(pNew, 0, &z[11], n-11); 321 | } 322 | else if( n>=8 && memcmp("stemmer=", z, 8)==0 ){ 323 | rc = unicodeSetStemmer(pNew, &z[8], n-8); 324 | } 325 | else{ 326 | /* Unrecognized argument */ 327 | rc = SQLITE_ERROR; 328 | } 329 | } 330 | 331 | if( rc!=SQLITE_OK ){ 332 | unicodeDestroy((sqlite3_tokenizer *)pNew); 333 | pNew = 0; 334 | } 335 | *pp = (sqlite3_tokenizer *)pNew; 336 | return rc; 337 | } 338 | 339 | /* 340 | ** Prepare to begin tokenizing a particular string. The input 341 | ** string to be tokenized is pInput[0..nBytes-1]. 
A cursor 342 | ** used to incrementally tokenize this string is returned in 343 | ** *ppCursor. 344 | */ 345 | static int unicodeOpen( 346 | sqlite3_tokenizer *p, /* The tokenizer */ 347 | const char *aInput, /* Input string */ 348 | int nInput, /* Size of string aInput in bytes */ 349 | sqlite3_tokenizer_cursor **pp /* OUT: New cursor object */ 350 | ){ 351 | unicode_tokenizer *pTokenizer; 352 | unicode_cursor *pCsr; 353 | 354 | pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor)); 355 | if( pCsr==0 ){ 356 | return SQLITE_NOMEM; 357 | } 358 | memset(pCsr, 0, sizeof(unicode_cursor)); 359 | 360 | pCsr->aInput = (const unsigned char *)aInput; 361 | if( aInput==0 ){ 362 | pCsr->nInput = 0; 363 | }else if( nInput<0 ){ 364 | pCsr->nInput = (int)strlen(aInput); 365 | }else{ 366 | pCsr->nInput = nInput; 367 | } 368 | 369 | pTokenizer = (unicode_tokenizer *)p; 370 | if ( pTokenizer->stemmer.create!=NULL ) { 371 | pCsr->pStemmer = pTokenizer->stemmer.create(); 372 | if ( pCsr->pStemmer==0 ) { 373 | sqlite3_free(p); 374 | return SQLITE_NOMEM; 375 | } 376 | }else { 377 | pCsr->pStemmer = NULL; 378 | } 379 | 380 | *pp = &pCsr->base; 381 | UNUSED_PARAMETER(p); 382 | return SQLITE_OK; 383 | } 384 | 385 | /* 386 | ** Close a tokenization cursor previously opened by a call to 387 | ** simpleOpen() above. 388 | */ 389 | static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){ 390 | unicode_cursor *pCsr = (unicode_cursor *) pCursor; 391 | if ( pCsr->pStemmer != NULL ) { 392 | unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer); 393 | p->stemmer.close(pCsr->pStemmer); 394 | } 395 | sqlite3_free(pCsr->zToken); 396 | sqlite3_free(pCsr); 397 | return SQLITE_OK; 398 | } 399 | 400 | /* 401 | ** Extract the next token from a tokenization cursor. The cursor must 402 | ** have been opened by a prior call to simpleOpen(). 
403 | */ 404 | static int unicodeNext( 405 | sqlite3_tokenizer_cursor *pC, /* Cursor returned by simpleOpen */ 406 | const char **paToken, /* OUT: Token text */ 407 | int *pnToken, /* OUT: Number of bytes at *paToken */ 408 | int *piStart, /* OUT: Starting offset of token */ 409 | int *piEnd, /* OUT: Ending offset of token */ 410 | int *piPos /* OUT: Position integer of token */ 411 | ){ 412 | unicode_cursor *pCsr = (unicode_cursor *)pC; 413 | unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer); 414 | int iCode = 0; 415 | char *zOut; 416 | const unsigned char *z = &pCsr->aInput[pCsr->iOff]; 417 | const unsigned char *zStart = z; 418 | const unsigned char *zEnd; 419 | const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput]; 420 | 421 | /* Scan past any delimiter characters before the start of the next token. 422 | ** Return SQLITE_DONE early if this takes us all the way to the end of 423 | ** the input. */ 424 | while( z=zTerm ) return SQLITE_DONE; 430 | 431 | zOut = pCsr->zToken; 432 | do { 433 | int iOut; 434 | 435 | /* Grow the output buffer if required. 
*/ 436 | if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){ 437 | char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64); 438 | if( !zNew ) return SQLITE_NOMEM; 439 | zOut = &zNew[zOut - pCsr->zToken]; 440 | pCsr->zToken = zNew; 441 | pCsr->nAlloc += 64; 442 | } 443 | 444 | /* Write the folded case of the last character read to the output */ 445 | zEnd = z; 446 | iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic); 447 | if( iOut ){ 448 | WRITE_UTF8(zOut, iOut); 449 | } 450 | 451 | /* If the cursor is not at EOF, read the next character */ 452 | if( z>=zTerm ) break; 453 | READ_UTF8(z, zTerm, iCode); 454 | }while( unicodeIsAlnum(p, iCode) 455 | || sqlite3FtsUnicodeIsdiacritic(iCode) 456 | ); 457 | 458 | if ( pCsr->pStemmer!=NULL ) { 459 | SN_set_current(pCsr->pStemmer, (int)(zOut - pCsr->zToken), (unsigned char *)pCsr->zToken); 460 | if ( p->stemmer.stem(pCsr->pStemmer)<0 ) { 461 | *paToken = pCsr->zToken; 462 | *pnToken = (int)(zOut - pCsr->zToken); 463 | }else { 464 | pCsr->pStemmer->p[pCsr->pStemmer->l] = '\0'; 465 | *paToken = (char *)pCsr->pStemmer->p; 466 | *pnToken = pCsr->pStemmer->l; 467 | } 468 | }else { 469 | *paToken = pCsr->zToken; 470 | *pnToken = (int)(zOut - pCsr->zToken); 471 | } 472 | 473 | /* Set the output variables and return. */ 474 | pCsr->iOff = (int)(z - pCsr->aInput); 475 | *piStart = (int)(zStart - pCsr->aInput); 476 | *piEnd = (int)(zEnd - pCsr->aInput); 477 | *piPos = pCsr->iToken++; 478 | return SQLITE_OK; 479 | } 480 | 481 | /* 482 | ** Set *ppModule to a pointer to the sqlite3_tokenizer_module 483 | ** structure for the unicode tokenizer. 
484 | */ 485 | void sqlite3Fts3UnicodeSnTokenizer(sqlite3_tokenizer_module const **ppModule){ 486 | static const sqlite3_tokenizer_module module = { 487 | 0, 488 | unicodeCreate, 489 | unicodeDestroy, 490 | unicodeOpen, 491 | unicodeClose, 492 | unicodeNext, 493 | 0, 494 | }; 495 | *ppModule = &module; 496 | } 497 | 498 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ 499 | #endif /* ifndef SQLITE_ENABLE_FTS4_UNICODE61 */ 500 | --------------------------------------------------------------------------------