├── libstemmer_c
    ├── Makefile
    ├── src_c
    │   ├── stem_UTF_8_dutch.h
    │   ├── stem_UTF_8_danish.h
    │   ├── stem_UTF_8_english.h
    │   ├── stem_UTF_8_finnish.h
    │   ├── stem_UTF_8_french.h
    │   ├── stem_UTF_8_german.h
    │   ├── stem_UTF_8_italian.h
    │   ├── stem_UTF_8_porter.h
    │   ├── stem_UTF_8_russian.h
    │   ├── stem_UTF_8_spanish.h
    │   ├── stem_UTF_8_swedish.h
    │   ├── stem_UTF_8_turkish.h
    │   ├── stem_KOI8_R_russian.h
    │   ├── stem_UTF_8_romanian.h
    │   ├── stem_UTF_8_hungarian.h
    │   ├── stem_UTF_8_norwegian.h
    │   ├── stem_ISO_8859_1_dutch.h
    │   ├── stem_UTF_8_portuguese.h
    │   ├── stem_ISO_8859_1_danish.h
    │   ├── stem_ISO_8859_1_english.h
    │   ├── stem_ISO_8859_1_finnish.h
    │   ├── stem_ISO_8859_1_french.h
    │   ├── stem_ISO_8859_1_german.h
    │   ├── stem_ISO_8859_1_italian.h
    │   ├── stem_ISO_8859_1_porter.h
    │   ├── stem_ISO_8859_1_spanish.h
    │   ├── stem_ISO_8859_1_swedish.h
    │   ├── stem_ISO_8859_2_romanian.h
    │   ├── stem_ISO_8859_1_hungarian.h
    │   ├── stem_ISO_8859_1_norwegian.h
    │   ├── stem_ISO_8859_1_portuguese.h
    │   ├── stem_ISO_8859_1_norwegian.c
    │   ├── stem_UTF_8_norwegian.c
    │   ├── stem_ISO_8859_1_swedish.c
    │   ├── stem_UTF_8_swedish.c
    │   ├── stem_ISO_8859_1_danish.c
    │   └── stem_UTF_8_danish.c
    ├── runtime
    │   ├── api.h
    │   ├── api_sq3.c
    │   ├── header.h
    │   └── utilities_sq3.c
    ├── mkinc_utf8.mak
    ├── MANIFEST
    ├── libstemmer
    │   ├── libstemmer.c
    │   ├── libstemmer_c.in
    │   ├── modules_utf8.txt
    │   ├── libstemmer_utf8.c
    │   ├── modules.txt
    │   ├── modules_utf8.h
    │   └── modules.h
    ├── mkinc.mak
    ├── include
    │   └── libstemmer.h
    ├── README
    └── examples
    │   └── stemwords.c
├── sqlite3_unicodesn_tokenizer.h
├── fts3_unicodesn.h
├── README
├── fts3Int.h
├── sqlite3_unicodesn_tokenizer.c
├── Makefile
├── extension.c
├── fts3_tokenizer.h
├── fts3_unicode2.c
└── fts3_unicodesn.c


/libstemmer_c/Makefile:
--------------------------------------------------------------------------------
 1 | include mkinc.mak
 2 | CFLAGS=-Iinclude
 3 | all: libstemmer.o stemwords  # first rule in this file: build the library and the example program
 4 | libstemmer.o: $(snowball_sources:.c=.o)  # despite the .o suffix, the recipe below creates an ar archive of the objects
 5 | 	$(AR) -cru $@ $^
 6 | stemwords: examples/stemwords.o libstemmer.o  # example program linked against the archive
 7 | 	$(CC) -o $@ $^
 8 | clean:  # remove the example program and every object file produced by the build
 9 | 	rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
10 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_dutch.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * dutch_UTF_8_create_env(void);
 9 | extern void dutch_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int dutch_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_danish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * danish_UTF_8_create_env(void);
 9 | extern void danish_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int danish_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_english.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * english_UTF_8_create_env(void);
 9 | extern void english_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int english_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_finnish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * finnish_UTF_8_create_env(void);
 9 | extern void finnish_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int finnish_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_french.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * french_UTF_8_create_env(void);
 9 | extern void french_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int french_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_german.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * german_UTF_8_create_env(void);
 9 | extern void german_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int german_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_italian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * italian_UTF_8_create_env(void);
 9 | extern void italian_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int italian_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_porter.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * porter_UTF_8_create_env(void);
 9 | extern void porter_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int porter_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_russian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * russian_UTF_8_create_env(void);
 9 | extern void russian_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int russian_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_spanish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * spanish_UTF_8_create_env(void);
 9 | extern void spanish_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int spanish_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_swedish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * swedish_UTF_8_create_env(void);
 9 | extern void swedish_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int swedish_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_turkish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * turkish_UTF_8_create_env(void);
 9 | extern void turkish_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int turkish_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_KOI8_R_russian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * russian_KOI8_R_create_env(void);
 9 | extern void russian_KOI8_R_close_env(struct SN_env * z);
10 | 
11 | extern int russian_KOI8_R_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_romanian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * romanian_UTF_8_create_env(void);
 9 | extern void romanian_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int romanian_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_hungarian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * hungarian_UTF_8_create_env(void);
 9 | extern void hungarian_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int hungarian_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_norwegian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * norwegian_UTF_8_create_env(void);
 9 | extern void norwegian_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int norwegian_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * dutch_ISO_8859_1_create_env(void);
 9 | extern void dutch_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int dutch_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_portuguese.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * portuguese_UTF_8_create_env(void);
 9 | extern void portuguese_UTF_8_close_env(struct SN_env * z);
10 | 
11 | extern int portuguese_UTF_8_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_danish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * danish_ISO_8859_1_create_env(void);
 9 | extern void danish_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int danish_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_english.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * english_ISO_8859_1_create_env(void);
 9 | extern void english_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int english_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * finnish_ISO_8859_1_create_env(void);
 9 | extern void finnish_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int finnish_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_french.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * french_ISO_8859_1_create_env(void);
 9 | extern void french_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int french_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_german.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * german_ISO_8859_1_create_env(void);
 9 | extern void german_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int german_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_italian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * italian_ISO_8859_1_create_env(void);
 9 | extern void italian_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int italian_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_porter.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * porter_ISO_8859_1_create_env(void);
 9 | extern void porter_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int porter_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * spanish_ISO_8859_1_create_env(void);
 9 | extern void spanish_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int spanish_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * swedish_ISO_8859_1_create_env(void);
 9 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int swedish_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * romanian_ISO_8859_2_create_env(void);
 9 | extern void romanian_ISO_8859_2_close_env(struct SN_env * z);
10 | 
11 | extern int romanian_ISO_8859_2_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * hungarian_ISO_8859_1_create_env(void);
 9 | extern void hungarian_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int hungarian_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void);
 9 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | extern struct SN_env * portuguese_ISO_8859_1_create_env(void);
 9 | extern void portuguese_ISO_8859_1_close_env(struct SN_env * z);
10 | 
11 | extern int portuguese_ISO_8859_1_stem(struct SN_env * z);
12 | 
13 | #ifdef __cplusplus
14 | }
15 | #endif
16 | 
17 | 


--------------------------------------------------------------------------------
/sqlite3_unicodesn_tokenizer.h:
--------------------------------------------------------------------------------
 1 | #ifndef _UNICODESN_TOKENIZER_H_
 2 | #define _UNICODESN_TOKENIZER_H_
 3 | 
 4 | /*
 5 |  ** Forward declaration of the connection type, so that this header can be included without
 6 |  ** first including sqlite3.h ("struct sqlite3" is the type behind SQLite's "sqlite3" typedef).
 7 |  */
 8 | struct sqlite3;
 9 | 
10 | /*
11 |  ** Registers the Unicode Snowball tokenizer as "unicodesn", for use with SQLite's FTS3 or FTS4.
12 |  ** This is for use when compiling the tokenizer directly into an application, instead of as a
13 |  ** separate shared library. Example of usage:
14 |  **   CREATE VIRTUAL TABLE fts USING fts3(text, tokenize=unicodesn "stemmer=russian");
15 |  **
16 |  ** db is the database connection to register the tokenizer on.
17 |  ** Returns an SQLite result code.
18 |  */
19 | int register_unicodesn_tokenizer(struct sqlite3 *db);
20 | 
21 | #endif /* _UNICODESN_TOKENIZER_H_ */
22 | 


--------------------------------------------------------------------------------
/fts3_unicodesn.h:
--------------------------------------------------------------------------------
 1 | #ifndef _FTS3_UNICODE_SN_H
 2 | #define _FTS3_UNICODE_SN_H
 3 | 
 4 | #include "fts3_tokenizer.h"
 5 | 
 6 | #define TOKENIZER_NAME	"unicodesn"
 7 | 
 8 | #ifdef _MSC_VER
 9 | #define UNICODE0_DLL_EXPORTED __declspec(dllexport)
10 | #else
11 | #define UNICODE0_DLL_EXPORTED __attribute__((__visibility__("default")))
12 | #endif
13 | 
14 | struct sqlite3_api_routines;
15 | 
16 | void sqlite3Fts3UnicodeSnTokenizer(sqlite3_tokenizer_module const **ppModule);
17 | 
18 | UNICODE0_DLL_EXPORTED int sqlite3_extension_init(
19 |       sqlite3 *db,          /* The database connection */
20 |       char **pzErrMsg,      /* Write error messages here */
21 |       const struct sqlite3_api_routines *pApi  /* API methods */
22 |       );
23 | 
24 | 
25 | #endif /* _FTS3_UNICODE_SN_H */
26 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | SQLite3-unicodesn
 2 | ==============
 3 | 
 4 | SQLite "unicode" full-text-search tokenizer with Snowball stemming
 5 | 
 6 | Installation
 7 | ============
 8 | 
 9 |    $ git clone git://github.com/littlesavage/sqlite3-unicodesn.git
10 |    $ cd sqlite3-unicodesn
11 |    $ make
12 |    $ su
13 |    # make install
14 | 
15 | Usage
16 | ======
17 | 
18 |     $ sqlite3
19 |     sqlite> .load unicodesn.sqlext
20 |     sqlite> CREATE VIRTUAL TABLE fts USING fts3(text, tokenize=unicodesn "stemmer=russian");
21 |     sqlite> INSERT INTO fts VALUES ("Пионэры! Идите в жопу!");
22 |     sqlite> SELECT * FROM fts WHERE text MATCH 'Жопа';
23 |     Пионэры! Идите в жопу!
24 | 
25 | License
26 | =======
27 | 
28 | Snowball files and stemmers are covered by the BSD license.
29 | 
30 | SQLite is in the Public Domain.
31 | 
32 | SQLite3-unicodesn code is in the Public Domain.
33 | 
34 | 


--------------------------------------------------------------------------------
/libstemmer_c/runtime/api.h:
--------------------------------------------------------------------------------
 1 | 
 2 | typedef unsigned char symbol;
 3 | 
 4 | /* Or replace 'char' above with 'short' for 16 bit characters.
 5 | 
 6 |    More precisely, replace 'char' with whatever type guarantees the
 7 |    character width you need. Note however that sizeof(symbol) should divide
 8 |    HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
 9 |    there is an alignment problem. In the unlikely event of a problem here,
10 |    consult Martin Porter.
11 | 
12 | */
13 | 
14 | struct SN_env {   /* state object returned by SN_create_env() and passed to SN_close_env() / SN_set_current() */
15 |     symbol * p;   /* NOTE(review): presumably the word being processed, as loaded by SN_set_current() -- confirm */
16 |     int c; int l; int lb; int bra; int ket;   /* NOTE(review): presumably offsets into p (cursor, limit, backward limit, slice start/end in Snowball terms) -- confirm */
17 |     symbol * * S;   /* NOTE(review): presumably the S_size string variables requested from SN_create_env() -- confirm */
18 |     int * I;   /* NOTE(review): presumably the I_size integer variables -- confirm */
19 |     unsigned char * B;   /* NOTE(review): presumably the B_size boolean flags -- confirm */
20 | };
21 | 
22 | extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
23 | extern void SN_close_env(struct SN_env * z, int S_size);
24 | 
25 | extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
26 | 
27 | 


--------------------------------------------------------------------------------
/fts3Int.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | ** 2009 Nov 12
 3 | **
 4 | ** The author disclaims copyright to this source code.  In place of
 5 | ** a legal notice, here is a blessing:
 6 | **
 7 | **    May you do good and not evil.
 8 | **    May you find forgiveness for yourself and forgive others.
 9 | **    May you share freely, never taking more than you give.
10 | **
11 | ******************************************************************************
12 | **
13 | */
14 | #ifndef _FTSINT_H
15 | #define _FTSINT_H
16 | 
17 | #include "sqlite3.h"
18 | #include "fts3_tokenizer.h"
19 | 
20 | typedef unsigned char u8;         /* 1-byte (or larger) unsigned integer */
21 | typedef short int i16;            /* 2-byte (or larger) signed integer */
22 | typedef unsigned int u32;         /* 4-byte unsigned integer */
23 | typedef sqlite3_uint64 u64;       /* 8-byte unsigned integer */
24 | typedef sqlite3_int64 i64;        /* 8-byte signed integer */
25 | 
26 | #define UNUSED_PARAMETER(x) (void)(x)
27 | 
28 | /* fts3_unicode2.c (functions generated by parsing unicode text files) */
29 | #ifdef SQLITE_ENABLE_FTS4_UNICODE61
30 | int sqlite3FtsUnicodeFold(int, int);
31 | int sqlite3FtsUnicodeIsalnum(int);
32 | int sqlite3FtsUnicodeIsdiacritic(int);
33 | #endif
34 | 
35 | #endif /* _FTSINT_H */
36 | 


--------------------------------------------------------------------------------
/sqlite3_unicodesn_tokenizer.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | ** 2013 September 22
 3 | **
 4 | ** The author disclaims copyright to this source code.  In place of
 5 | ** a legal notice, here is a blessing:
 6 | **
 7 | **    May you do good and not evil.
 8 | **    May you find forgiveness for yourself and forgive others.
 9 | **    May you share freely, never taking more than you give.
10 | **
11 | ******************************************************************************
12 | **
13 | */
14 | #include <sqlite3.h>
15 | 
16 | #include "fts3_unicodesn.h"
17 | #include "sqlite3_unicodesn_tokenizer.h"
18 | 
19 | /*
20 | ** Register the tokenizer with FTS3 or FTS4. For use when compiling the tokenizer directly into
21 | ** an application, instead of as a separate shared library.
22 | **
23 | ** Evaluates "SELECT fts3_tokenizer(?, ?)" on db, binding TOKENIZER_NAME and a blob that holds
24 | ** the tokenizer module pointer. Returns SQLITE_OK on success, otherwise the SQLite error code
25 | ** of the step that failed. Once the statement has been prepared it is finalized on every path.
26 | */
27 | int register_unicodesn_tokenizer(
28 |       sqlite3 *db          /* The database connection */
29 | )
30 | {
31 |   const sqlite3_tokenizer_module *tokenizer;
32 |   sqlite3_stmt *pStmt = 0;
33 |   const char *zSql = "SELECT fts3_tokenizer(?, ?)";
34 |   int rc;
35 | 
36 |   sqlite3Fts3UnicodeSnTokenizer(&tokenizer);
37 | 
38 |   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
39 |   if( rc!=SQLITE_OK ){
40 |     return rc;
41 |   }
42 | 
43 |   rc = sqlite3_bind_text(pStmt, 1, TOKENIZER_NAME, -1, SQLITE_STATIC);
44 |   if( rc==SQLITE_OK ){
45 |     /* The blob payload is the module pointer itself; SQLite copies it (SQLITE_TRANSIENT). */
46 |     rc = sqlite3_bind_blob(pStmt, 2, &tokenizer, sizeof(tokenizer), SQLITE_TRANSIENT);
47 |   }
48 |   if( rc!=SQLITE_OK ){
49 |     /* A bind failed: release the statement and report the bind error. */
50 |     sqlite3_finalize(pStmt);
51 |     return rc;
52 |   }
53 | 
54 |   /*
55 |   ** SQLITE_ROW and SQLITE_DONE from sqlite3_step() are not errors. sqlite3_finalize() returns
56 |   ** SQLITE_OK for them and the error code if the step failed, so it serves as the single exit
57 |   ** and the statement is never leaked.
58 |   */
59 |   sqlite3_step(pStmt);
60 |   return sqlite3_finalize(pStmt);
61 | }
62 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds unicodesn.sqlext: the SQLite "unicode" full-text-search tokenizer with
 2 | # Snowball stemming, packaged as a loadable extension.
 3 | 
 4 | CC?=gcc
 5 | #CFLAGS=-W  -Wall -g -O0
 6 | CFLAGS?= -Os -DNDEBUG -s
 7 | 
 8 | DESTDIR?= /usr
 9 | 
10 | # Each name adds libstemmer_c/src_c/stem_UTF_8_<name>.c and .h to the build and
11 | # defines -DWITH_STEMMER_<name>.
12 | STEMMERS?= danish dutch english finnish french german hungarian \
13 |   italian norwegian porter portuguese romanian russian \
14 |   spanish swedish turkish
15 | 
16 | CFLAGS+= \
17 |   -DSQLITE_ENABLE_FTS4 \
18 |   -DSQLITE_ENABLE_FTS4_UNICODE61
19 | 
20 | SOURCES= \
21 |   fts3_unicode2.c \
22 |   fts3_unicodesn.c \
23 |   extension.c
24 | 
25 | # Prerequisites only (never passed to the compiler): the project-local headers
26 | # next to $(SOURCES) are listed so that editing one of them rebuilds the extension.
27 | HEADERS=	fts3_tokenizer.h fts3Int.h fts3_unicodesn.h
28 | 
29 | INCLUDES= \
30 |   -Ilibstemmer_c/runtime \
31 |   -Ilibstemmer_c/src_c
32 | 
33 | LIBRARIES=	-lsqlite3
34 | 
35 | SNOWBALL_SOURCES= \
36 |   libstemmer_c/runtime/api_sq3.c \
37 |   libstemmer_c/runtime/utilities_sq3.c
38 | 
39 | SNOWBALL_HEADERS= \
40 |   libstemmer_c/include/libstemmer.h \
41 |   libstemmer_c/runtime/api.h \
42 |   libstemmer_c/runtime/header.h
43 | 
44 | SNOWBALL_SOURCES+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).c)
45 | 
46 | SNOWBALL_HEADERS+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).h)
47 | 
48 | SNOWBALL_FLAGS+= $(foreach s, $(STEMMERS), -DWITH_STEMMER_$(s))
49 | 
50 | all: unicodesn.sqlext
51 | 
52 | # Everything is compiled and linked in a single compiler invocation.
53 | unicodesn.sqlext: $(HEADERS) $(SOURCES) $(SNOWBALL_HEADERS) $(SNOWBALL_SOURCES)
54 | 	$(CC) $(CFLAGS) $(SNOWBALL_FLAGS) $(INCLUDES) -fPIC -shared -fvisibility=hidden -o $@ \
55 | 	   $(SOURCES) $(SNOWBALL_SOURCES) $(LIBRARIES)
56 | 
57 | clean:
58 | 	rm -f *.o unicodesn.sqlext
59 | 
60 | install: unicodesn.sqlext
61 | 	mkdir -p ${DESTDIR}/lib 2> /dev/null
62 | 	install -D -o root -g root -m 644 unicodesn.sqlext ${DESTDIR}/lib
63 | 
64 | # None of these targets names a real file.
65 | .PHONY: all clean install
66 | 


--------------------------------------------------------------------------------
/libstemmer_c/mkinc_utf8.mak:
--------------------------------------------------------------------------------
 1 | # libstemmer/mkinc_utf8.mak: List of stemming module source files
 2 | #
 3 | # This file is generated by mkmodules.pl from a list of module names.
 4 | # Do not edit manually.
 5 | #
 6 | # Modules included by this file are: danish, dutch, english, finnish, french,
 7 | # german, hungarian, italian, norwegian, porter, portuguese, romanian,
 8 | # russian, spanish, swedish, turkish
 9 | 
10 | snowball_sources= \
11 |   src_c/stem_UTF_8_danish.c \
12 |   src_c/stem_UTF_8_dutch.c \
13 |   src_c/stem_UTF_8_english.c \
14 |   src_c/stem_UTF_8_finnish.c \
15 |   src_c/stem_UTF_8_french.c \
16 |   src_c/stem_UTF_8_german.c \
17 |   src_c/stem_UTF_8_hungarian.c \
18 |   src_c/stem_UTF_8_italian.c \
19 |   src_c/stem_UTF_8_norwegian.c \
20 |   src_c/stem_UTF_8_porter.c \
21 |   src_c/stem_UTF_8_portuguese.c \
22 |   src_c/stem_UTF_8_romanian.c \
23 |   src_c/stem_UTF_8_russian.c \
24 |   src_c/stem_UTF_8_spanish.c \
25 |   src_c/stem_UTF_8_swedish.c \
26 |   src_c/stem_UTF_8_turkish.c \
27 |   runtime/api.c \
28 |   runtime/utilities.c \
29 |   libstemmer/libstemmer_utf8.c
30 | 
31 | snowball_headers= \
32 |   src_c/stem_UTF_8_danish.h \
33 |   src_c/stem_UTF_8_dutch.h \
34 |   src_c/stem_UTF_8_english.h \
35 |   src_c/stem_UTF_8_finnish.h \
36 |   src_c/stem_UTF_8_french.h \
37 |   src_c/stem_UTF_8_german.h \
38 |   src_c/stem_UTF_8_hungarian.h \
39 |   src_c/stem_UTF_8_italian.h \
40 |   src_c/stem_UTF_8_norwegian.h \
41 |   src_c/stem_UTF_8_porter.h \
42 |   src_c/stem_UTF_8_portuguese.h \
43 |   src_c/stem_UTF_8_romanian.h \
44 |   src_c/stem_UTF_8_russian.h \
45 |   src_c/stem_UTF_8_spanish.h \
46 |   src_c/stem_UTF_8_swedish.h \
47 |   src_c/stem_UTF_8_turkish.h \
48 |   include/libstemmer.h \
49 |   libstemmer/modules_utf8.h \
50 |   runtime/api.h \
51 |   runtime/header.h
52 | 
53 | 


--------------------------------------------------------------------------------
/extension.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | ** 2012 November 11
 3 | **
 4 | ** The author disclaims copyright to this source code.  In place of
 5 | ** a legal notice, here is a blessing:
 6 | **
 7 | **    May you do good and not evil.
 8 | **    May you find forgiveness for yourself and forgive others.
 9 | **    May you share freely, never taking more than you give.
10 | **
11 | ******************************************************************************
12 | **
13 | */
14 | #include <sqlite3.h>
15 | #include <sqlite3ext.h>
16 | 
17 | #include "fts3_unicodesn.h"
18 | 
19 | SQLITE_EXTENSION_INIT1
20 | 
21 | /*
22 | ** Make tokenizer module p available on db under the name zName by running
23 | ** "SELECT fts3_tokenizer(?, ?)".  Returns the sqlite3_prepare_v2() error if
24 | ** the statement cannot be compiled, else the sqlite3_finalize() result.
25 | */
26 | static int registerTokenizer(
27 |   sqlite3 *db,
28 |   char *zName,
29 |   const sqlite3_tokenizer_module *p
30 | ){
31 |   sqlite3_stmt *stmt;
32 |   int status = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?, ?)", -1, &stmt, 0);
33 | 
34 |   if( status==SQLITE_OK ){
35 |     /* Name goes in slot 1; slot 2 carries the pointer value p as a blob. */
36 |     sqlite3_bind_text(stmt, 1, zName, -1, SQLITE_STATIC);
37 |     sqlite3_bind_blob(stmt, 2, &p, sizeof(p), SQLITE_STATIC);
38 |     sqlite3_step(stmt);
39 |     /* Bind/step results are not inspected; only finalize's code is reported. */
40 |     status = sqlite3_finalize(stmt);
41 |   }
42 |   return status;
43 | }
44 | 
45 | /* SQLite invokes this routine once when it loads the extension.
46 | ** It fetches the tokenizer module via sqlite3Fts3UnicodeSnTokenizer()
47 | ** and registers it on db under TOKENIZER_NAME.  The return value is the
48 | ** result code of that registration, so a failure reaches the loader.
49 | */
50 | int sqlite3_extension_init(
51 |       sqlite3 *db,          /* The database connection */
52 |       char **pzErrMsg,      /* Write error messages here */
53 |       const sqlite3_api_routines *pApi  /* API methods */
54 |       )
55 | {
56 |    const sqlite3_tokenizer_module *tokenizer;
57 | 
58 |    SQLITE_EXTENSION_INIT2(pApi)
59 | 
60 |    (void)pzErrMsg;  /* no message text is produced; only the code is reported */
61 |    sqlite3Fts3UnicodeSnTokenizer(&tokenizer);
62 | 
63 |    /* Report a failed registration instead of claiming success. */
64 |    return registerTokenizer(db, TOKENIZER_NAME, tokenizer);
65 | }
66 | 
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/libstemmer_c/runtime/api_sq3.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <sqlite3.h> /* for sqlite3_malloc, sqlite3_free */
 5 | #include "header.h"
 6 | /* calloc()-style zeroed allocation from sqlite3_malloc(); NULL on failure or if nmemb*size > INT_MAX. */
 7 | static void *local_calloc(size_t nmemb, size_t size) {
 8 |    void *p;
 9 |    if (size != 0 && nmemb > (size_t)INT_MAX / size) return NULL;  /* sqlite3_malloc() takes an int */
10 |    p = sqlite3_malloc((int)(nmemb*size));
11 |    return p == NULL ? NULL : memset(p, 0, nmemb*size);
12 | }
13 | /* Build an SN_env with S_size symbol strings, I_size ints and B_size bytes; NULL if local_calloc() or create_s() fails. */
14 | extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
15 | {
16 |     int idx;
17 |     struct SN_env * env = (struct SN_env *) local_calloc(1, sizeof(struct SN_env));
18 | 
19 |     if (env == NULL) return NULL;
20 |     env->p = create_s();
21 |     if (env->p == NULL) goto error;
22 | 
23 |     if (S_size != 0)
24 |     {
25 |         env->S = (symbol * *) local_calloc(S_size, sizeof(symbol *));
26 |         if (env->S == NULL) goto error;
27 |         /* Every slot gets its own buffer from create_s(). */
28 |         for (idx = 0; idx < S_size; idx++)
29 |         {
30 |             if ((env->S[idx] = create_s()) == NULL) goto error;
31 |         }
32 |     }
33 | 
34 |     if (I_size != 0)
35 |     {
36 |         env->I = (int *) local_calloc(I_size, sizeof(int));
37 |         if (env->I == NULL) goto error;
38 |     }
39 | 
40 |     if (B_size != 0)
41 |     {
42 |         env->B = (unsigned char *) local_calloc(B_size, sizeof(unsigned char));
43 |         if (env->B == NULL) goto error;
44 |     }
45 |     return env;
46 | error:
47 |     SN_close_env(env, S_size);
48 |     return NULL;
49 | }
50 | /* Release everything owned by z (from SN_create_env); accepts NULL and an env whose S array was never allocated. */
51 | extern void SN_close_env(struct SN_env * z, int S_size)
52 | {
53 |     if (z == NULL) return;
54 |     /* SN_create_env() calls this with z->S still NULL when an early
55 |      * allocation fails, so check the pointer before indexing it. */
56 |     if (S_size && z->S != NULL)
57 |     {
58 |         int i;
59 |         for (i = 0; i < S_size; i++) lose_s(z->S[i]);
60 |         sqlite3_free(z->S);
61 |     }
62 |     /* sqlite3_free() accepts NULL, so I and B need no test. */
63 |     sqlite3_free(z->I);
64 |     sqlite3_free(z->B);
65 |     if (z->p) lose_s(z->p);
66 |     sqlite3_free(z);
67 | }
68 | /* Hand the size symbols at s to replace_s() for the span 0..z->l, reset z->c to 0, and return replace_s()'s result. */
69 | extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
70 | {
71 |     const int status = replace_s(z, 0, z->l, size, s, NULL);
72 |     z->c = 0;
73 |     return status;
74 | }
75 | 
76 | 


--------------------------------------------------------------------------------
/libstemmer_c/MANIFEST:
--------------------------------------------------------------------------------
 1 | README
 2 | src_c/stem_ISO_8859_1_danish.c
 3 | src_c/stem_ISO_8859_1_danish.h
 4 | src_c/stem_ISO_8859_1_dutch.c
 5 | src_c/stem_ISO_8859_1_dutch.h
 6 | src_c/stem_ISO_8859_1_english.c
 7 | src_c/stem_ISO_8859_1_english.h
 8 | src_c/stem_ISO_8859_1_finnish.c
 9 | src_c/stem_ISO_8859_1_finnish.h
10 | src_c/stem_ISO_8859_1_french.c
11 | src_c/stem_ISO_8859_1_french.h
12 | src_c/stem_ISO_8859_1_german.c
13 | src_c/stem_ISO_8859_1_german.h
14 | src_c/stem_ISO_8859_1_hungarian.c
15 | src_c/stem_ISO_8859_1_hungarian.h
16 | src_c/stem_ISO_8859_1_italian.c
17 | src_c/stem_ISO_8859_1_italian.h
18 | src_c/stem_ISO_8859_1_norwegian.c
19 | src_c/stem_ISO_8859_1_norwegian.h
20 | src_c/stem_ISO_8859_1_porter.c
21 | src_c/stem_ISO_8859_1_porter.h
22 | src_c/stem_ISO_8859_1_portuguese.c
23 | src_c/stem_ISO_8859_1_portuguese.h
24 | src_c/stem_ISO_8859_1_spanish.c
25 | src_c/stem_ISO_8859_1_spanish.h
26 | src_c/stem_ISO_8859_1_swedish.c
27 | src_c/stem_ISO_8859_1_swedish.h
28 | src_c/stem_ISO_8859_2_romanian.c
29 | src_c/stem_ISO_8859_2_romanian.h
30 | src_c/stem_KOI8_R_russian.c
31 | src_c/stem_KOI8_R_russian.h
32 | src_c/stem_UTF_8_danish.c
33 | src_c/stem_UTF_8_danish.h
34 | src_c/stem_UTF_8_dutch.c
35 | src_c/stem_UTF_8_dutch.h
36 | src_c/stem_UTF_8_english.c
37 | src_c/stem_UTF_8_english.h
38 | src_c/stem_UTF_8_finnish.c
39 | src_c/stem_UTF_8_finnish.h
40 | src_c/stem_UTF_8_french.c
41 | src_c/stem_UTF_8_french.h
42 | src_c/stem_UTF_8_german.c
43 | src_c/stem_UTF_8_german.h
44 | src_c/stem_UTF_8_hungarian.c
45 | src_c/stem_UTF_8_hungarian.h
46 | src_c/stem_UTF_8_italian.c
47 | src_c/stem_UTF_8_italian.h
48 | src_c/stem_UTF_8_norwegian.c
49 | src_c/stem_UTF_8_norwegian.h
50 | src_c/stem_UTF_8_porter.c
51 | src_c/stem_UTF_8_porter.h
52 | src_c/stem_UTF_8_portuguese.c
53 | src_c/stem_UTF_8_portuguese.h
54 | src_c/stem_UTF_8_romanian.c
55 | src_c/stem_UTF_8_romanian.h
56 | src_c/stem_UTF_8_russian.c
57 | src_c/stem_UTF_8_russian.h
58 | src_c/stem_UTF_8_spanish.c
59 | src_c/stem_UTF_8_spanish.h
60 | src_c/stem_UTF_8_swedish.c
61 | src_c/stem_UTF_8_swedish.h
62 | src_c/stem_UTF_8_turkish.c
63 | src_c/stem_UTF_8_turkish.h
64 | runtime/api.c
65 | runtime/api.h
66 | runtime/header.h
67 | runtime/utilities.c
68 | libstemmer/libstemmer.c
69 | libstemmer/libstemmer_utf8.c
70 | libstemmer/modules.h
71 | libstemmer/modules_utf8.h
72 | include/libstemmer.h
73 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/libstemmer.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "../include/libstemmer.h"
 5 | #include "../runtime/api.h"
 6 | #include "modules.h"
 7 | /* Per-instance stemmer state: three entry points copied from the chosen module, plus its environment. */
 8 | struct sb_stemmer {
 9 |     struct SN_env * (*create)(void); /* builds the environment; result stored in env */
10 |     void (*close)(struct SN_env *); /* called on env by sb_stemmer_delete() */
11 |     int (*stem)(struct SN_env *); /* run by sb_stemmer_stem(); a negative result is treated as failure */
12 | 
13 |     struct SN_env * env; /* environment returned by create() */
14 | };
15 | /* Return the algorithm_names array itself (not a copy). */
16 | extern const char **
17 | sb_stemmer_list(void)
18 | {
19 |     return algorithm_names;
20 | }
21 | /* Map an encoding name to its enum value: NULL means UTF-8, a name absent from encodings gives ENC_UNKNOWN. */
22 | static stemmer_encoding_t
23 | sb_getenc(const char * charenc)
24 | {
25 |     struct stemmer_encoding * entry = encodings;
26 | 
27 |     if (charenc == NULL) return ENC_UTF_8;
28 |     /* The table ends at the first entry whose name is NULL. */
29 |     while (entry->name != NULL && strcmp(entry->name, charenc) != 0) entry++;
30 | 
31 |     return (entry->name != NULL) ? entry->enc : ENC_UNKNOWN;
32 | }
33 | /* Create a stemmer for the named algorithm in encoding charenc (NULL selects UTF-8).  Returns NULL when
34 |  * algorithm is NULL, no module matches the name/encoding pair, or malloc()/create() fails. */
35 | extern struct sb_stemmer *
36 | sb_stemmer_new(const char * algorithm, const char * charenc)
37 | {
38 |     stemmer_encoding_t enc;
39 |     struct stemmer_modules * module;
40 |     struct sb_stemmer * stemmer;
41 | 
42 |     /* strcmp() in the lookup loop must never be handed a NULL name. */
43 |     if (algorithm == NULL) return NULL;
44 |     enc = sb_getenc(charenc);
45 |     if (enc == ENC_UNKNOWN) return NULL;
46 | 
47 |     for (module = modules; module->name != 0; module++) {
48 |         if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
49 |     }
50 |     if (module->name == NULL) return NULL;
51 | 
52 |     stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
53 |     if (stemmer == NULL) return NULL;
54 |     stemmer->create = module->create;
55 |     stemmer->close = module->close;
56 |     stemmer->stem = module->stem;
57 |     stemmer->env = stemmer->create();
58 |     if (stemmer->env == NULL)
59 |     {
60 |         sb_stemmer_delete(stemmer);
61 |         return NULL;
62 |     }
63 |     return stemmer;
64 | }
65 | /* Close the stemmer's environment (when a close hook is set) and free the stemmer itself; NULL is ignored. */
66 | void
67 | sb_stemmer_delete(struct sb_stemmer * stemmer)
68 | {
69 |     if (stemmer == 0) return;
70 |     /* A missing close hook must not skip the free() below. */
71 |     if (stemmer->close != NULL) stemmer->close(stemmer->env);
72 |     stemmer->close = NULL;
73 |     free(stemmer);
74 | }
75 | /* Stem the size-symbol word; returns the environment's zero-terminated buffer, or NULL if loading or stemming fails. */
76 | const sb_symbol *
77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
78 | {
79 |     struct SN_env * env = stemmer->env;
80 | 
81 |     if (SN_set_current(env, size, (const symbol *)(word)) != 0)
82 |     {
83 |         env->l = 0;   /* sb_stemmer_length() then reports 0 */
84 |         return NULL;
85 |     }
86 |     if (stemmer->stem(env) < 0) return NULL;
87 |     env->p[env->l] = 0;
88 |     return (const sb_symbol *)(env->p);
89 | }
90 | /* Return the environment's l field; sb_stemmer_stem() zeroes it when SN_set_current() fails. */
91 | int
92 | sb_stemmer_length(struct sb_stemmer * stemmer)
93 | {
94 |     return stemmer->env->l;
95 | }
96 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/libstemmer_c.in:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "../include/libstemmer.h"
 5 | #include "../runtime/api.h"
 6 | #include "@MODULES_H@"
 7 | /* Per-instance stemmer state: three entry points copied from the chosen module, plus its environment. */
 8 | struct sb_stemmer {
 9 |     struct SN_env * (*create)(void); /* builds the environment; result stored in env */
10 |     void (*close)(struct SN_env *); /* called on env by sb_stemmer_delete() */
11 |     int (*stem)(struct SN_env *); /* run by sb_stemmer_stem(); a negative result is treated as failure */
12 | 
13 |     struct SN_env * env; /* environment returned by create() */
14 | };
15 | /* Return the algorithm_names array itself (not a copy). */
16 | extern const char **
17 | sb_stemmer_list(void)
18 | {
19 |     return algorithm_names;
20 | }
21 | /* Map an encoding name to its enum value: NULL means UTF-8, a name absent from encodings gives ENC_UNKNOWN. */
22 | static stemmer_encoding_t
23 | sb_getenc(const char * charenc)
24 | {
25 |     struct stemmer_encoding * entry = encodings;
26 | 
27 |     if (charenc == NULL) return ENC_UTF_8;
28 |     /* The table ends at the first entry whose name is NULL. */
29 |     while (entry->name != NULL && strcmp(entry->name, charenc) != 0) entry++;
30 | 
31 |     return (entry->name != NULL) ? entry->enc : ENC_UNKNOWN;
32 | }
33 | /* Create a stemmer for the named algorithm in encoding charenc (NULL selects UTF-8).  Returns NULL when
34 |  * algorithm is NULL, no module matches the name/encoding pair, or malloc()/create() fails. */
35 | extern struct sb_stemmer *
36 | sb_stemmer_new(const char * algorithm, const char * charenc)
37 | {
38 |     stemmer_encoding_t enc;
39 |     struct stemmer_modules * module;
40 |     struct sb_stemmer * stemmer;
41 | 
42 |     /* strcmp() in the lookup loop must never be handed a NULL name. */
43 |     if (algorithm == NULL) return NULL;
44 |     enc = sb_getenc(charenc);
45 |     if (enc == ENC_UNKNOWN) return NULL;
46 | 
47 |     for (module = modules; module->name != 0; module++) {
48 |         if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
49 |     }
50 |     if (module->name == NULL) return NULL;
51 | 
52 |     stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
53 |     if (stemmer == NULL) return NULL;
54 |     stemmer->create = module->create;
55 |     stemmer->close = module->close;
56 |     stemmer->stem = module->stem;
57 |     stemmer->env = stemmer->create();
58 |     if (stemmer->env == NULL)
59 |     {
60 |         sb_stemmer_delete(stemmer);
61 |         return NULL;
62 |     }
63 |     return stemmer;
64 | }
65 | /* Close the stemmer's environment (when a close hook is set) and free the stemmer itself; NULL is ignored. */
66 | void
67 | sb_stemmer_delete(struct sb_stemmer * stemmer)
68 | {
69 |     if (stemmer == 0) return;
70 |     /* A missing close hook must not skip the free() below. */
71 |     if (stemmer->close != NULL) stemmer->close(stemmer->env);
72 |     stemmer->close = NULL;
73 |     free(stemmer);
74 | }
75 | /* Stem the size-symbol word; returns the environment's zero-terminated buffer, or NULL if loading or stemming fails. */
76 | const sb_symbol *
77 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
78 | {
79 |     struct SN_env * env = stemmer->env;
80 | 
81 |     if (SN_set_current(env, size, (const symbol *)(word)) != 0)
82 |     {
83 |         env->l = 0;   /* sb_stemmer_length() then reports 0 */
84 |         return NULL;
85 |     }
86 |     if (stemmer->stem(env) < 0) return NULL;
87 |     env->p[env->l] = 0;
88 |     return (const sb_symbol *)(env->p);
89 | }
90 | /* Return the environment's l field; sb_stemmer_stem() zeroes it when SN_set_current() fails. */
91 | int
92 | sb_stemmer_length(struct sb_stemmer * stemmer)
93 | {
94 |     return stemmer->env->l;
95 | }
96 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/modules_utf8.txt:
--------------------------------------------------------------------------------
 1 | # This file contains a list of stemmers to include in the distribution.
 2 | # The format is a set of space separated lines - on each line:
 3 | #  First item is name of stemmer.
 4 | #  Second item is comma separated list of character sets.
 5 | #  Third item is comma separated list of names to refer to the stemmer by.
 6 | #
 7 | # Lines starting with a #, or blank lines, are ignored.
 8 | 
 9 | # List all the main algorithms for each language, in UTF-8.
10 | 
11 | danish          UTF_8                   danish,da,dan
12 | dutch           UTF_8                   dutch,nl,dut,nld
13 | english         UTF_8                   english,en,eng
14 | finnish         UTF_8                   finnish,fi,fin
15 | french          UTF_8                   french,fr,fre,fra
16 | german          UTF_8                   german,de,ger,deu
17 | hungarian       UTF_8                   hungarian,hu,hun
18 | italian         UTF_8                   italian,it,ita
19 | norwegian       UTF_8                   norwegian,no,nor
20 | portuguese      UTF_8                   portuguese,pt,por
21 | romanian        UTF_8                   romanian,ro,rum,ron
22 | russian         UTF_8                   russian,ru,rus
23 | spanish         UTF_8                   spanish,es,esl,spa
24 | swedish         UTF_8                   swedish,sv,swe
25 | turkish         UTF_8                   turkish,tr,tur
26 | 
27 | # Also include the traditional porter algorithm for english.
28 | # The porter algorithm is included in the libstemmer distribution to assist
29 | # with backwards compatibility, but for new systems the english algorithm
30 | # should be used in preference.
31 | porter          UTF_8                   porter
32 | 
33 | # Some other stemmers in the snowball project are not included in the standard
34 | # distribution. To compile a libstemmer with them in, add them to this list,
35 | # and regenerate the distribution. (You will need a full source checkout for
36 | # this.) They are included in the snowball website as curiosities, but are not
37 | # intended for general use, and use of them is not fully supported.  These
38 | # algorithms are:
39 | #
40 | # german2          - This is a slight modification of the german stemmer.
41 | #german2          UTF_8                   german2
42 | #
43 | # kraaij_pohlmann  - This is a different dutch stemmer.
44 | #kraaij_pohlmann  UTF_8                   kraaij_pohlmann
45 | #
46 | # lovins           - This is an english stemmer, but fairly outdated, and
47 | #                    only really applicable to a restricted type of input text
48 | #                    (keywords in academic publications).
49 | #lovins           UTF_8                   lovins
50 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/libstemmer_utf8.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "../include/libstemmer.h"
 5 | #include "../runtime/api.h"
 6 | #include "modules_utf8.h"
 7 | /* Per-instance stemmer state: three entry points copied from the chosen module, plus its environment. */
 8 | struct sb_stemmer {
 9 |     struct SN_env * (*create)(void); /* builds the environment; result stored in env */
10 |     void (*close)(struct SN_env *); /* called on env by sb_stemmer_delete() when non-NULL */
11 |     int (*stem)(struct SN_env *); /* run by sb_stemmer_stem(); a negative result is treated as failure */
12 | 
13 |     struct SN_env * env; /* environment returned by create() */
14 | };
15 | /* Return the algorithm_names array itself (not a copy). */
16 | extern const char **
17 | sb_stemmer_list(void)
18 | {
19 |     return algorithm_names;
20 | }
21 | /* Map an encoding name to its enum value: NULL means UTF-8, a name absent from encodings gives ENC_UNKNOWN. */
22 | static stemmer_encoding_t
23 | sb_getenc(const char * charenc)
24 | {
25 |     struct stemmer_encoding * entry = encodings;
26 | 
27 |     if (charenc == NULL) return ENC_UTF_8;
28 |     /* The table ends at the first entry whose name is NULL. */
29 |     while (entry->name != NULL && strcmp(entry->name, charenc) != 0) entry++;
30 | 
31 |     return (entry->name != NULL) ? entry->enc : ENC_UNKNOWN;
32 | }
33 | /* Create a stemmer for the named algorithm in encoding charenc (NULL selects UTF-8).  Returns NULL when
34 |  * algorithm is NULL, no module matches the name/encoding pair, or malloc()/create() fails. */
35 | extern struct sb_stemmer *
36 | sb_stemmer_new(const char * algorithm, const char * charenc)
37 | {
38 |     stemmer_encoding_t enc;
39 |     struct stemmer_modules * module;
40 |     struct sb_stemmer * stemmer;
41 | 
42 |     /* strcmp() in the lookup loop must never be handed a NULL name. */
43 |     if (algorithm == NULL) return NULL;
44 |     enc = sb_getenc(charenc);
45 |     if (enc == ENC_UNKNOWN) return NULL;
46 | 
47 |     for (module = modules; module->name != 0; module++) {
48 |         if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
49 |     }
50 |     if (module->name == NULL) return NULL;
51 | 
52 |     stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
53 |     if (stemmer == NULL) return NULL;
54 |     stemmer->create = module->create;
55 |     stemmer->close = module->close;
56 |     stemmer->stem = module->stem;
57 |     stemmer->env = stemmer->create();
58 |     if (stemmer->env == NULL)
59 |     {
60 |         sb_stemmer_delete(stemmer);
61 |         return NULL;
62 |     }
63 |     return stemmer;
64 | }
65 | /* Dispose of a stemmer: run its close hook on env when one is set, then free the object.  NULL is a no-op. */
66 | void
67 | sb_stemmer_delete(struct sb_stemmer * stemmer)
68 | {
69 |     void (*close_hook)(struct SN_env *);
70 | 
71 |     if (stemmer == NULL) return;
72 |     close_hook = stemmer->close;
73 |     /* The hook is cleared once it has run, just before the object is freed. */
74 |     if (close_hook != NULL) { close_hook(stemmer->env); stemmer->close = NULL; }
75 |     free(stemmer);
76 | }
77 | /* Stem the size-symbol word; returns the environment's zero-terminated buffer, or NULL if loading or stemming fails. */
78 | const sb_symbol *
79 | sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
80 | {
81 |     struct SN_env * env = stemmer->env;
82 | 
83 |     if (SN_set_current(env, size, (const symbol *)(word)) != 0)
84 |     {
85 |         env->l = 0;   /* sb_stemmer_length() then reports 0 */
86 |         return NULL;
87 |     }
88 |     if (stemmer->stem(env) < 0) return NULL;
89 |     env->p[env->l] = 0;
90 |     return (const sb_symbol *)(env->p);
91 | }
92 | /* Return the environment's l field; sb_stemmer_stem() zeroes it when SN_set_current() fails. */
93 | int
94 | sb_stemmer_length(struct sb_stemmer * stemmer)
95 | {
96 |     return stemmer->env->l;
97 | }
98 | 


--------------------------------------------------------------------------------
/libstemmer_c/runtime/header.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <limits.h>
 3 | 
 4 | #include "api.h"
 5 | /* Aliases for the <limits.h> int limits. */
 6 | #define MAXINT INT_MAX
 7 | #define MININT INT_MIN
 8 | /* Presumably the size of the two ints that SIZE()/CAPACITY() read in front of p - confirm against create_s(). */
 9 | #define HEAD (2*sizeof(int))
10 | /* Accessors for the ints stored just before p; fully parenthesized so they compose inside larger expressions. */
11 | #define SIZE(p)        (((int *)(p))[-1])
12 | #define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
13 | #define CAPACITY(p)    (((int *)(p))[-2])
14 | /* One entry of the tables passed to find_among() / find_among_b(), which take an array of these and its length. */
15 | struct among
16 | {   int s_size;     /* number of chars in string */
17 |     const symbol * s;       /* search string */
18 |     int substring_i;/* index to longest matching substring */
19 |     int result;     /* result of the lookup */
20 |     int (* function)(struct SN_env *); /* routine taking the environment; NOTE(review): call conditions not visible here */
21 | };
22 | 
23 | extern symbol * create_s(void);
24 | extern void lose_s(symbol * p);
25 | 
26 | extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
27 | 
28 | extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
29 | extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
30 | extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
31 | extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
32 | 
33 | extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
34 | extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
35 | extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
36 | extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
37 | 
38 | extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
39 | extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
40 | extern int eq_v(struct SN_env * z, const symbol * p);
41 | extern int eq_v_b(struct SN_env * z, const symbol * p);
42 | 
43 | extern int find_among(struct SN_env * z, const struct among * v, int v_size);
44 | extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
45 | 
46 | extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
47 | extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
48 | extern int slice_from_v(struct SN_env * z, const symbol * p);
49 | extern int slice_del(struct SN_env * z);
50 | 
51 | extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
52 | extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
53 | 
54 | extern symbol * slice_to(struct SN_env * z, symbol * p);
55 | extern symbol * assign_to(struct SN_env * z, symbol * p);
56 | 
57 | extern void debug(struct SN_env * z, int number, int line_count);
58 | 
59 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/modules.txt:
--------------------------------------------------------------------------------
 1 | # This file contains a list of stemmers to include in the distribution.
 2 | # The format is a set of space separated lines - on each line:
 3 | #  First item is name of stemmer.
 4 | #  Second item is comma separated list of character sets.
 5 | #  Third item is comma separated list of names to refer to the stemmer by.
 6 | #
 7 | # Lines starting with a #, or blank lines, are ignored.
 8 | 
 9 | # List all the main algorithms for each language, in UTF-8, and also with
10 | # the most commonly used encoding.
11 | 
12 | danish          UTF_8,ISO_8859_1        danish,da,dan
13 | dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
14 | english         UTF_8,ISO_8859_1        english,en,eng
15 | finnish         UTF_8,ISO_8859_1        finnish,fi,fin
16 | french          UTF_8,ISO_8859_1        french,fr,fre,fra
17 | german          UTF_8,ISO_8859_1        german,de,ger,deu
18 | hungarian       UTF_8,ISO_8859_1        hungarian,hu,hun
19 | italian         UTF_8,ISO_8859_1        italian,it,ita
20 | norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
21 | portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
22 | romanian        UTF_8,ISO_8859_2        romanian,ro,rum,ron
23 | russian         UTF_8,KOI8_R            russian,ru,rus
24 | spanish         UTF_8,ISO_8859_1        spanish,es,esl,spa
25 | swedish         UTF_8,ISO_8859_1        swedish,sv,swe
26 | turkish         UTF_8                   turkish,tr,tur
27 | 
28 | # Also include the traditional porter algorithm for english.
29 | # The porter algorithm is included in the libstemmer distribution to assist
30 | # with backwards compatibility, but for new systems the english algorithm
31 | # should be used in preference.
32 | porter          UTF_8,ISO_8859_1        porter
33 | 
34 | # Some other stemmers in the snowball project are not included in the standard
35 | # distribution. To compile a libstemmer with them in, add them to this list,
36 | # and regenerate the distribution. (You will need a full source checkout for
37 | # this.) They are included in the snowball website as curiosities, but are not
38 | # intended for general use, and use of them is not fully supported.  These
39 | # algorithms are:
40 | #
41 | # german2          - This is a slight modification of the german stemmer.
42 | #german2          UTF_8,ISO_8859_1        german2
43 | #
44 | # kraaij_pohlmann  - This is a different dutch stemmer.
45 | #kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann
46 | #
47 | # lovins           - This is an english stemmer, but fairly outdated, and
48 | #                    only really applicable to a restricted type of input text
49 | #                    (keywords in academic publications).
50 | #lovins           UTF_8,ISO_8859_1        lovins
51 | 


--------------------------------------------------------------------------------
/libstemmer_c/mkinc.mak:
--------------------------------------------------------------------------------
 1 | # libstemmer/mkinc.mak: List of stemming module source files
 2 | #
 3 | # This file is generated by mkmodules.pl from a list of module names.
 4 | # Do not edit manually.
 5 | #
 6 | # Modules included by this file are: danish, dutch, english, finnish, french,
 7 | # german, hungarian, italian, norwegian, porter, portuguese, romanian,
 8 | # russian, spanish, swedish, turkish
 9 | 
10 | snowball_sources= \
11 |   src_c/stem_ISO_8859_1_danish.c \
12 |   src_c/stem_UTF_8_danish.c \
13 |   src_c/stem_ISO_8859_1_dutch.c \
14 |   src_c/stem_UTF_8_dutch.c \
15 |   src_c/stem_ISO_8859_1_english.c \
16 |   src_c/stem_UTF_8_english.c \
17 |   src_c/stem_ISO_8859_1_finnish.c \
18 |   src_c/stem_UTF_8_finnish.c \
19 |   src_c/stem_ISO_8859_1_french.c \
20 |   src_c/stem_UTF_8_french.c \
21 |   src_c/stem_ISO_8859_1_german.c \
22 |   src_c/stem_UTF_8_german.c \
23 |   src_c/stem_ISO_8859_1_hungarian.c \
24 |   src_c/stem_UTF_8_hungarian.c \
25 |   src_c/stem_ISO_8859_1_italian.c \
26 |   src_c/stem_UTF_8_italian.c \
27 |   src_c/stem_ISO_8859_1_norwegian.c \
28 |   src_c/stem_UTF_8_norwegian.c \
29 |   src_c/stem_ISO_8859_1_porter.c \
30 |   src_c/stem_UTF_8_porter.c \
31 |   src_c/stem_ISO_8859_1_portuguese.c \
32 |   src_c/stem_UTF_8_portuguese.c \
33 |   src_c/stem_ISO_8859_2_romanian.c \
34 |   src_c/stem_UTF_8_romanian.c \
35 |   src_c/stem_KOI8_R_russian.c \
36 |   src_c/stem_UTF_8_russian.c \
37 |   src_c/stem_ISO_8859_1_spanish.c \
38 |   src_c/stem_UTF_8_spanish.c \
39 |   src_c/stem_ISO_8859_1_swedish.c \
40 |   src_c/stem_UTF_8_swedish.c \
41 |   src_c/stem_UTF_8_turkish.c \
42 |   runtime/api.c \
43 |   runtime/utilities.c \
44 |   libstemmer/libstemmer.c
45 | 
46 | snowball_headers= \
47 |   src_c/stem_ISO_8859_1_danish.h \
48 |   src_c/stem_UTF_8_danish.h \
49 |   src_c/stem_ISO_8859_1_dutch.h \
50 |   src_c/stem_UTF_8_dutch.h \
51 |   src_c/stem_ISO_8859_1_english.h \
52 |   src_c/stem_UTF_8_english.h \
53 |   src_c/stem_ISO_8859_1_finnish.h \
54 |   src_c/stem_UTF_8_finnish.h \
55 |   src_c/stem_ISO_8859_1_french.h \
56 |   src_c/stem_UTF_8_french.h \
57 |   src_c/stem_ISO_8859_1_german.h \
58 |   src_c/stem_UTF_8_german.h \
59 |   src_c/stem_ISO_8859_1_hungarian.h \
60 |   src_c/stem_UTF_8_hungarian.h \
61 |   src_c/stem_ISO_8859_1_italian.h \
62 |   src_c/stem_UTF_8_italian.h \
63 |   src_c/stem_ISO_8859_1_norwegian.h \
64 |   src_c/stem_UTF_8_norwegian.h \
65 |   src_c/stem_ISO_8859_1_porter.h \
66 |   src_c/stem_UTF_8_porter.h \
67 |   src_c/stem_ISO_8859_1_portuguese.h \
68 |   src_c/stem_UTF_8_portuguese.h \
69 |   src_c/stem_ISO_8859_2_romanian.h \
70 |   src_c/stem_UTF_8_romanian.h \
71 |   src_c/stem_KOI8_R_russian.h \
72 |   src_c/stem_UTF_8_russian.h \
73 |   src_c/stem_ISO_8859_1_spanish.h \
74 |   src_c/stem_UTF_8_spanish.h \
75 |   src_c/stem_ISO_8859_1_swedish.h \
76 |   src_c/stem_UTF_8_swedish.h \
77 |   src_c/stem_UTF_8_turkish.h \
78 |   include/libstemmer.h \
79 |   libstemmer/modules.h \
80 |   runtime/api.h \
81 |   runtime/header.h
82 | 
83 | 


--------------------------------------------------------------------------------
/libstemmer_c/include/libstemmer.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /* Make header file work when included from C++ */
 3 | #ifdef __cplusplus
 4 | extern "C" {
 5 | #endif
 6 | 
 7 | struct sb_stemmer;
 8 | typedef unsigned char sb_symbol;
 9 | 
10 | /* FIXME - should be able to get a version number for each stemming
11 |  * algorithm (which will be incremented each time the output changes). */
12 | 
13 | /** Returns an array of the names of the available stemming algorithms.
14 |  *  Note that these are the canonical names - aliases (ie, other names for
15 |  *  the same algorithm) will not be included in the list.
16 |  *  The list is terminated with a null pointer.
17 |  *
18 |  *  The list must not be modified in any way.
19 |  */
20 | const char ** sb_stemmer_list(void);
21 | 
22 | /** Create a new stemmer object, using the specified algorithm, for the
23 |  *  specified character encoding.
24 |  *
25 |  *  All algorithms will usually be available in UTF-8, but may also be
26 |  *  available in other character encodings.
27 |  *
28 |  *  @param algorithm The algorithm name.  This is either the english
29 |  *  name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
30 |  *  language.  Note that case is significant in this parameter - the
31 |  *  value should be supplied in lower case.
32 |  *
33 |  *  @param charenc The character encoding.  NULL may be passed as
34 |  *  this value, in which case UTF-8 encoding will be assumed. Otherwise,
35 |  *  the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
36 |  *  "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian).  Note that
37 |  *  case is significant in this parameter.
38 |  *
39 |  *  @return NULL if the specified algorithm is not recognised, or the
40 |  *  algorithm is not available for the requested encoding.  Otherwise,
41 |  *  returns a pointer to a newly created stemmer for the requested algorithm.
42 |  *  The returned pointer must be deleted by calling sb_stemmer_delete().
43 |  *
44 |  *  @note NULL will also be returned if an out of memory error occurs.
45 |  */
46 | struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
47 | 
48 | /** Delete a stemmer object.
49 |  *
50 |  *  This frees all resources allocated for the stemmer.  After calling
51 |  *  this function, the supplied stemmer may no longer be used in any way.
52 |  *
53 |  *  It is safe to pass a null pointer to this function - this will have
54 |  *  no effect.
55 |  */
56 | void                sb_stemmer_delete(struct sb_stemmer * stemmer);
57 | 
58 | /** Stem a word.
59 |  *
60 |  *  The return value is owned by the stemmer - it must not be freed or
61 |  *  modified, and it will become invalid when the stemmer is called again,
62 |  *  or if the stemmer is freed.
63 |  *
64 |  *  The length of the return value can be obtained using sb_stemmer_length().
65 |  *
66 |  *  If an out-of-memory error occurs, this will return NULL.
67 |  */
68 | const sb_symbol *   sb_stemmer_stem(struct sb_stemmer * stemmer,
69 | 				    const sb_symbol * word, int size);
70 | 
71 | /** Get the length of the result of the last stemmed word.
72 |  *  This should not be called before sb_stemmer_stem() has been called.
73 |  */
74 | int                 sb_stemmer_length(struct sb_stemmer * stemmer);
75 | 
76 | #ifdef __cplusplus
77 | }
78 | #endif
79 | 
80 | 


--------------------------------------------------------------------------------
/libstemmer_c/README:
--------------------------------------------------------------------------------
  1 | libstemmer_c
  2 | ============
  3 | 
  4 | This document pertains to the C version of the libstemmer distribution,
  5 | available for download from:
  6 | 
  7 | http://snowball.tartarus.org/dist/libstemmer_c.tgz
  8 | 
  9 | 
 10 | Compiling the library
 11 | =====================
 12 | 
 13 | A simple makefile is provided for Unix style systems.  On such systems, it
 14 | should be possible simply to run "make", and the file "libstemmer.o"
 15 | and the example program "stemwords" will be generated.
 16 | 
 17 | If this doesn't work on your system, you need to write your own build
 18 | system (or call the compiler directly).  The files to compile are
 19 | all contained in the "libstemmer", "runtime" and "src_c" directories,
 20 | and the public header file is contained in the "include" directory.
 21 | 
The library comes in two flavours: UTF-8 only, and UTF-8 plus other character
sets.  To use the UTF-8 only flavour, compile "libstemmer_utf8.c" instead of
"libstemmer.c".
 25 | 
 26 | For convenience "mkinc.mak" is a makefile fragment listing the source files and
 27 | header files used to compile the standard version of the library.
 28 | "mkinc_utf8.mak" is a comparable makefile fragment listing just the source
 29 | files for the UTF-8 only version of the library.
 30 | 
 31 | 
 32 | Using the library
 33 | =================
 34 | 
 35 | The library provides a simple C API.  Essentially, a new stemmer can
 36 | be obtained by using "sb_stemmer_new".  "sb_stemmer_stem" is then
 37 | used to stem a word, "sb_stemmer_length" returns the stemmed
 38 | length of the last word processed, and "sb_stemmer_delete" is
 39 | used to delete a stemmer.
 40 | 
 41 | Creating a stemmer is a relatively expensive operation - the expected
 42 | usage pattern is that a new stemmer is created when needed, used
 43 | to stem many words, and deleted after some time.
 44 | 
 45 | Stemmers are re-entrant, but not threadsafe.  In other words, if
 46 | you wish to access the same stemmer object from multiple threads,
 47 | you must ensure that all access is protected by a mutex or similar
 48 | device.
 49 | 
 50 | libstemmer does not currently incorporate any mechanism for caching the results
 51 | of stemming operations.  Such caching can greatly increase the performance of a
 52 | stemmer under certain situations, so suitable patches will be considered for
 53 | inclusion.
 54 | 
The standard libstemmer sources contain an algorithm for each of the supported
languages.  The algorithm may be selected using the English name of the
language, or using the 2 or 3 letter ISO 639 language codes.  In addition,
the traditional "Porter" stemming algorithm for English is included for
backwards compatibility purposes, but we recommend use of the "English"
stemmer in preference for new projects.
 61 | 
 62 | (Some minor algorithms which are included only as curiosities in the snowball
 63 | website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
 64 | included in the standard libstemmer sources.  These are not really supported by
 65 | the snowball project, but it would be possible to compile a modified libstemmer
 66 | library containing these if desired.)
 67 | 
 68 | 
 69 | The stemwords example
 70 | =====================
 71 | 
 72 | The stemwords example program allows you to run any of the stemmers
 73 | compiled into the libstemmer library on a sample vocabulary.  For
 74 | details on how to use it, run it with the "-h" command line option.
 75 | 
 76 | 
 77 | Using the library in a larger system
 78 | ====================================
 79 | 
 80 | If you are incorporating the library into the build system of a larger
 81 | program, I recommend copying the unpacked tarball without modification into
 82 | a subdirectory of the sources of your program.  Future versions of the
 83 | library are intended to keep the same structure, so this will keep the
 84 | work required to move to a new version of the library to a minimum.
 85 | 
 86 | As an additional convenience, the list of source and header files used
 87 | in the library is detailed in mkinc.mak - a file which is in a suitable
 88 | format for inclusion by a Makefile.  By including this file in your build
 89 | system, you can link the snowball system into your program with a few
 90 | extra rules.
 91 | 
 92 | Using the library in a system using GNU autotools
 93 | =================================================
 94 | 
 95 | The libstemmer_c library can be integrated into a larger system which uses the
 96 | GNU autotool framework (and in particular, automake and autoconf) as follows:
 97 | 
 98 | 1) Unpack libstemmer_c.tgz in the top level project directory so that there is
 99 |    a libstemmer_c subdirectory of the top level directory of the project.
100 | 
101 | 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
102 |    
103 | noinst_LTLIBRARIES = libstemmer.la
104 | include $(srcdir)/mkinc.mak
105 | noinst_HEADERS = $(snowball_headers)
106 | libstemmer_la_SOURCES = $(snowball_sources) 
107 | 
108 | (You may also need to add other lines to this, for example, if you are using
109 | compiler options which are not compatible with compiling the libstemmer
110 | library.)
111 | 
3) Add libstemmer_c/Makefile to the AC_CONFIG_FILES declaration in the
   project's configure.ac file.
114 | 
115 | 4) Add to the top level makefile the following lines (or modify existing
116 |    assignments to these variables appropriately):
117 | 
118 | AUTOMAKE_OPTIONS = subdir-objects
119 | AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
120 | SUBDIRS=libstemmer_c
121 | <name>_LIBADD = libstemmer_c/libstemmer.la
122 | 
123 | (Where <name> is the name of the library or executable which links against
124 | libstemmer.) 
125 | 
126 | 


--------------------------------------------------------------------------------
/libstemmer_c/examples/stemwords.c:
--------------------------------------------------------------------------------
  1 | /* This is a simple program which uses libstemmer to provide a command
  2 |  * line interface for stemming using any of the algorithms provided.
  3 |  */
  4 | 
  5 | #include <stdio.h>
  6 | #include <stdlib.h> /* for malloc, free */
  7 | #include <string.h> /* for memmove */
  8 | #include <ctype.h>  /* for isupper, tolower */
  9 | 
 10 | #include "libstemmer.h"
 11 | 
 12 | const char * progname;
 13 | static int pretty = 1;
 14 | 
 15 | static void
 16 | stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
 17 | {
 18 | #define INC 10
 19 |     int lim = INC;
 20 |     sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
 21 | 
 22 |     while(1) {
 23 |         int ch = getc(f_in);
 24 |         if (ch == EOF) {
 25 |             free(b); return;
 26 |         }
 27 |         {
 28 |             int i = 0;
 29 | 	    int inlen = 0;
 30 |             while(1) {
 31 |                 if (ch == '\n' || ch == EOF) break;
 32 |                 if (i == lim) {
 33 |                     sb_symbol * newb;
 34 | 		    newb = (sb_symbol *)
 35 | 			    realloc(b, (lim + INC) * sizeof(sb_symbol));
 36 | 		    if (newb == 0) goto error;
 37 | 		    b = newb;
 38 |                     lim = lim + INC;
 39 |                 }
 40 | 		/* Update count of utf-8 characters. */
 41 | 		if (ch < 0x80 || ch > 0xBF) inlen += 1;
 42 |                 /* force lower case: */
 43 |                 if (isupper(ch)) ch = tolower(ch);
 44 | 
 45 |                 b[i] = ch;
 46 | 		i++;
 47 |                 ch = getc(f_in);
 48 |             }
 49 | 
 50 | 	    {
 51 | 		const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
 52 |                 if (stemmed == NULL)
 53 |                 {
 54 |                     fprintf(stderr, "Out of memory");
 55 |                     exit(1);
 56 |                 }
 57 |                 else
 58 | 		{
 59 | 		    if (pretty == 1) {
 60 | 			fwrite(b, i, 1, f_out);
 61 | 			fputs(" -> ", f_out);
 62 | 		    } else if (pretty == 2) {
 63 | 			fwrite(b, i, 1, f_out);
 64 | 			if (sb_stemmer_length(stemmer) > 0) {
 65 | 			    int j;
 66 | 			    if (inlen < 30) {
 67 | 				for (j = 30 - inlen; j > 0; j--)
 68 | 				    fputs(" ", f_out);
 69 | 			    } else {
 70 | 				fputs("\n", f_out);
 71 | 				for (j = 30; j > 0; j--)
 72 | 				    fputs(" ", f_out);
 73 | 			    }
 74 | 			}
 75 | 		    }
 76 | 
 77 | 		    fputs((char *)stemmed, f_out);
 78 | 		    putc('\n', f_out);
 79 | 		}
 80 |             }
 81 |         }
 82 |     }
 83 | error:
 84 |     if (b != 0) free(b);
 85 |     return;
 86 | }
 87 | 
 88 | /** Display the command line syntax, and then exit.
 89 |  *  @param n The value to exit with.
 90 |  */
 91 | static void
 92 | usage(int n)
 93 | {
 94 |     printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
 95 | 	  "\n"
 96 | 	  "The input file consists of a list of words to be stemmed, one per\n"
 97 | 	  "line. Words should be in lower case, but (for English) A-Z letters\n"
 98 | 	  "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
 99 | 	  "used.\n"
100 | 	  "\n"
101 | 	  "If -c is given, the argument is the character encoding of the input\n"
102 |           "and output files.  If it is omitted, the UTF-8 encoding is used.\n"
103 | 	  "\n"
104 | 	  "If -p is given the output file consists of each word of the input\n"
105 | 	  "file followed by \"->\" followed by its stemmed equivalent.\n"
106 | 	  "If -p2 is given the output file is a two column layout containing\n"
107 | 	  "the input words in the first column and the stemmed eqivalents in\n"
108 | 	  "the second column.\n"
109 | 	  "Otherwise, the output file consists of the stemmed words, one per\n"
110 | 	  "line.\n"
111 | 	  "\n"
112 | 	  "-h displays this help\n",
113 | 	  progname);
114 |     exit(n);
115 | }
116 | 
117 | int
118 | main(int argc, char * argv[])
119 | {
120 |     char * in = 0;
121 |     char * out = 0;
122 |     FILE * f_in;
123 |     FILE * f_out;
124 |     struct sb_stemmer * stemmer;
125 | 
126 |     char * language = "english";
127 |     char * charenc = NULL;
128 | 
129 |     char * s;
130 |     int i = 1;
131 |     pretty = 0;
132 | 
133 |     progname = argv[0];
134 | 
135 |     while(i < argc) {
136 | 	s = argv[i++];
137 | 	if (s[0] == '-') {
138 | 	    if (strcmp(s, "-o") == 0) {
139 | 		if (i >= argc) {
140 | 		    fprintf(stderr, "%s requires an argument\n", s);
141 | 		    exit(1);
142 | 		}
143 | 		out = argv[i++];
144 | 	    } else if (strcmp(s, "-i") == 0) {
145 | 		if (i >= argc) {
146 | 		    fprintf(stderr, "%s requires an argument\n", s);
147 | 		    exit(1);
148 | 		}
149 | 		in = argv[i++];
150 | 	    } else if (strcmp(s, "-l") == 0) {
151 | 		if (i >= argc) {
152 | 		    fprintf(stderr, "%s requires an argument\n", s);
153 | 		    exit(1);
154 | 		}
155 | 		language = argv[i++];
156 | 	    } else if (strcmp(s, "-c") == 0) {
157 | 		if (i >= argc) {
158 | 		    fprintf(stderr, "%s requires an argument\n", s);
159 | 		    exit(1);
160 | 		}
161 | 		charenc = argv[i++];
162 | 	    } else if (strcmp(s, "-p2") == 0) {
163 | 		pretty = 2;
164 | 	    } else if (strcmp(s, "-p") == 0) {
165 | 		pretty = 1;
166 | 	    } else if (strcmp(s, "-h") == 0) {
167 | 		usage(0);
168 | 	    } else {
169 | 		fprintf(stderr, "option %s unknown\n", s);
170 | 		usage(1);
171 | 	    }
172 | 	} else {
173 | 	    fprintf(stderr, "unexpected parameter %s\n", s);
174 | 	    usage(1);
175 | 	}
176 |     }
177 | 
178 |     /* prepare the files */
179 |     f_in = (in == 0) ? stdin : fopen(in, "r");
180 |     if (f_in == 0) {
181 | 	fprintf(stderr, "file %s not found\n", in);
182 | 	exit(1);
183 |     }
184 |     f_out = (out == 0) ? stdout : fopen(out, "w");
185 |     if (f_out == 0) {
186 | 	fprintf(stderr, "file %s cannot be opened\n", out);
187 | 	exit(1);
188 |     }
189 | 
190 |     /* do the stemming process: */
191 |     stemmer = sb_stemmer_new(language, charenc);
192 |     if (stemmer == 0) {
193 |         if (charenc == NULL) {
194 |             fprintf(stderr, "language `%s' not available for stemming\n", language);
195 |             exit(1);
196 |         } else {
197 |             fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
198 |             exit(1);
199 |         }
200 |     }
201 |     stem_file(stemmer, f_in, f_out);
202 |     sb_stemmer_delete(stemmer);
203 | 
204 |     if (in != 0) (void) fclose(f_in);
205 |     if (out != 0) (void) fclose(f_out);
206 | 
207 |     return 0;
208 | }
209 | 
210 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/modules_utf8.h:
--------------------------------------------------------------------------------
  1 | /* libstemmer/modules_utf8.h: List of stemming modules.
  2 |  *
  3 |  * This file is generated by mkmodules.pl from a list of module names.
  4 |  * Do not edit manually.
  5 |  *
  6 |  * Modules included by this file are: danish, dutch, english, finnish, french,
  7 |  * german, hungarian, italian, norwegian, porter, portuguese, romanian,
  8 |  * russian, spanish, swedish, turkish
  9 |  */
 10 | 
 11 | #include "../src_c/stem_UTF_8_danish.h"
 12 | #include "../src_c/stem_UTF_8_dutch.h"
 13 | #include "../src_c/stem_UTF_8_english.h"
 14 | #include "../src_c/stem_UTF_8_finnish.h"
 15 | #include "../src_c/stem_UTF_8_french.h"
 16 | #include "../src_c/stem_UTF_8_german.h"
 17 | #include "../src_c/stem_UTF_8_hungarian.h"
 18 | #include "../src_c/stem_UTF_8_italian.h"
 19 | #include "../src_c/stem_UTF_8_norwegian.h"
 20 | #include "../src_c/stem_UTF_8_porter.h"
 21 | #include "../src_c/stem_UTF_8_portuguese.h"
 22 | #include "../src_c/stem_UTF_8_romanian.h"
 23 | #include "../src_c/stem_UTF_8_russian.h"
 24 | #include "../src_c/stem_UTF_8_spanish.h"
 25 | #include "../src_c/stem_UTF_8_swedish.h"
 26 | #include "../src_c/stem_UTF_8_turkish.h"
 27 | 
/* Identifiers for the character encodings compiled into this build.
 * This UTF-8-only variant lists a single real encoding; ENC_UNKNOWN (0)
 * is also the value used in the terminating entries of the tables in
 * this file. */
typedef enum {
  ENC_UNKNOWN=0,
  ENC_UTF_8
} stemmer_encoding_t;

/* Pairs an encoding name string with its stemmer_encoding_t value. */
struct stemmer_encoding {
  const char * name;
  stemmer_encoding_t enc;
};
/* Table of known encoding names.  The last entry has a null name and
 * ENC_UNKNOWN, marking the end of the table. */
static struct stemmer_encoding encodings[] = {
  {"UTF_8", ENC_UTF_8},
  {0,ENC_UNKNOWN}
};
 41 | 
/* One selectable stemmer: the name it is requested by, the encoding it
 * handles, and the three entry points of the generated stemmer
 * (create an SN_env, close an SN_env, run the stemmer on an SN_env). */
struct stemmer_modules {
  const char * name;
  stemmer_encoding_t enc; 
  struct SN_env * (*create)(void);
  void (*close)(struct SN_env *);
  int (*stem)(struct SN_env *);
};
/* Table of every name a stemmer can be selected by.  Each language is
 * listed under several names (short codes and the full English name) that
 * all point at the same three functions.  Entries are in ascending name
 * order and the table ends with an all-zero entry. */
static struct stemmer_modules modules[] = {
  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
  {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
  {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
  {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
  {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
  {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
  {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
  {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
  {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
  {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
  {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
  {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
  {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
  {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
  {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
  {0,ENC_UNKNOWN,0,0,0}
};
/* Null-terminated list of the full algorithm names, one per module named
 * in the header comment of this file (no short-code aliases). */
static const char * algorithm_names[] = {
  "danish", 
  "dutch", 
  "english", 
  "finnish", 
  "french", 
  "german", 
  "hungarian", 
  "italian", 
  "norwegian", 
  "porter", 
  "portuguese", 
  "romanian", 
  "russian", 
  "spanish", 
  "swedish", 
  "turkish", 
  0
};
122 | 


--------------------------------------------------------------------------------
/fts3_tokenizer.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | ** 2006 July 10
  3 | **
  4 | ** The author disclaims copyright to this source code.
  5 | **
  6 | *************************************************************************
  7 | ** Defines the interface to tokenizers used by fulltext-search.  There
  8 | ** are three basic components:
  9 | **
 10 | ** sqlite3_tokenizer_module is a singleton defining the tokenizer
 11 | ** interface functions.  This is essentially the class structure for
 12 | ** tokenizers.
 13 | **
 14 | ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
 15 | ** including customization information defined at creation time.
 16 | **
 17 | ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
 18 | ** tokens from a particular input.
 19 | */
 20 | #ifndef _FTS3_TOKENIZER_H_
 21 | #define _FTS3_TOKENIZER_H_
 22 | 
 23 | /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
 24 | ** If tokenizers are to be allowed to call sqlite3_*() functions, then
 25 | ** we will need a way to register the API consistently.
 26 | */
 27 | #include "sqlite3.h"
 28 | 
 29 | /*
 30 | ** Structures used by the tokenizer interface. When a new tokenizer
 31 | ** implementation is registered, the caller provides a pointer to
 32 | ** an sqlite3_tokenizer_module containing pointers to the callback
 33 | ** functions that make up an implementation.
 34 | **
 35 | ** When an fts3 table is created, it passes any arguments passed to
 36 | ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
 37 | ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
 38 | ** implementation. The xCreate() function in turn returns an 
 39 | ** sqlite3_tokenizer structure representing the specific tokenizer to
 40 | ** be used for the fts3 table (customized by the tokenizer clause arguments).
 41 | **
 42 | ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
 43 | ** method is called. It returns an sqlite3_tokenizer_cursor object
 44 | ** that may be used to tokenize a specific input buffer based on
 45 | ** the tokenization rules supplied by a specific sqlite3_tokenizer
 46 | ** object.
 47 | */
 48 | typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
 49 | typedef struct sqlite3_tokenizer sqlite3_tokenizer;
 50 | typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
 51 | 
/*
** Table of callbacks that make up one tokenizer implementation: create
** and destroy a tokenizer, open and close a cursor over an input buffer,
** fetch the next token, and (for iVersion>=1) set a cursor's language id.
*/
struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0 or 1.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts3 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts3 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset. *piStartOffset should be set to the index of the first
  ** byte of the token in the input buffer. *piEndOffset should be set
  ** to the index of the first byte just past the end of the token in
  ** the input buffer.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );

  /***********************************************************************
  ** Methods below this point are only available if iVersion>=1.
  */

  /*
  ** Configure the language id of a tokenizer cursor.
  **
  ** NOTE(review): this header does not say what values iLangid may take
  ** or what the method returns; presumably SQLITE_OK or an SQLite error
  ** code like the other methods - confirm against the FTS3 sources.
  */
  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
};
146 | 
/*
** Generic part of a tokenizer instance. As described for xCreate(),
** pModule is filled in by the caller of xCreate(), not by the
** tokenizer implementation.
*/
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

/*
** Generic part of a tokenizer cursor, as produced by xOpen().
*/
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

/*
** NOTE(review): these two functions are only declared here; judging by
** their names and parameters they report a count for term iTerm in
** column iCol, but their definitions and exact semantics are not
** visible from this header - confirm before relying on them.
*/
int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);
159 | 
160 | 
161 | #endif /* _FTS3_TOKENIZER_H_ */
162 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
  3 | 
  4 | #include "../runtime/header.h"
  5 | 
  6 | #ifdef __cplusplus
  7 | extern "C" {
  8 | #endif
  9 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z);
 10 | #ifdef __cplusplus
 11 | }
 12 | #endif
 13 | static int r_other_suffix(struct SN_env * z);
 14 | static int r_consonant_pair(struct SN_env * z);
 15 | static int r_main_suffix(struct SN_env * z);
 16 | static int r_mark_regions(struct SN_env * z);
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | 
 22 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void);
 23 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z);
 24 | 
 25 | 
 26 | #ifdef __cplusplus
 27 | }
 28 | #endif
/* Character data for the 29 strings of the a_0 table.  Each array is sized
 * to the exact number of characters and carries no terminating NUL. */
static const symbol s_0_0[1] = { 'a' };
static const symbol s_0_1[1] = { 'e' };
static const symbol s_0_2[3] = { 'e', 'd', 'e' };
static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' };
static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' };
static const symbol s_0_5[3] = { 'a', 'n', 'e' };
static const symbol s_0_6[3] = { 'e', 'n', 'e' };
static const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' };
static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' };
static const symbol s_0_9[2] = { 'e', 'n' };
static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' };
static const symbol s_0_11[2] = { 'a', 'r' };
static const symbol s_0_12[2] = { 'e', 'r' };
static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' };
static const symbol s_0_14[1] = { 's' };
static const symbol s_0_15[2] = { 'a', 's' };
static const symbol s_0_16[2] = { 'e', 's' };
static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' };
static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' };
static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' };
static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' };
static const symbol s_0_21[3] = { 'e', 'n', 's' };
static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' };
static const symbol s_0_23[3] = { 'e', 'r', 's' };
static const symbol s_0_24[3] = { 'e', 't', 's' };
static const symbol s_0_25[2] = { 'e', 't' };
static const symbol s_0_26[3] = { 'h', 'e', 't' };
static const symbol s_0_27[3] = { 'e', 'r', 't' };
static const symbol s_0_28[3] = { 'a', 's', 't' };

/* Table of the strings above.  The first two fields of every entry are the
 * string's length and a pointer to its characters.
 * NOTE(review): struct among is declared in ../runtime/header.h, which is
 * not visible here.  From the data, the third field looks like the index of
 * a shorter entry that is a suffix of this one (-1 when there is none) and
 * the fourth like a result code (1, 2 or 3 here); the fifth is always 0.
 * Confirm the field meanings against header.h. */
static const struct among a_0[29] =
{
/*  0 */ { 1, s_0_0, -1, 1, 0},
/*  1 */ { 1, s_0_1, -1, 1, 0},
/*  2 */ { 3, s_0_2, 1, 1, 0},
/*  3 */ { 4, s_0_3, 1, 1, 0},
/*  4 */ { 4, s_0_4, 1, 1, 0},
/*  5 */ { 3, s_0_5, 1, 1, 0},
/*  6 */ { 3, s_0_6, 1, 1, 0},
/*  7 */ { 6, s_0_7, 6, 1, 0},
/*  8 */ { 4, s_0_8, 1, 3, 0},
/*  9 */ { 2, s_0_9, -1, 1, 0},
/* 10 */ { 5, s_0_10, 9, 1, 0},
/* 11 */ { 2, s_0_11, -1, 1, 0},
/* 12 */ { 2, s_0_12, -1, 1, 0},
/* 13 */ { 5, s_0_13, 12, 1, 0},
/* 14 */ { 1, s_0_14, -1, 2, 0},
/* 15 */ { 2, s_0_15, 14, 1, 0},
/* 16 */ { 2, s_0_16, 14, 1, 0},
/* 17 */ { 4, s_0_17, 16, 1, 0},
/* 18 */ { 5, s_0_18, 16, 1, 0},
/* 19 */ { 4, s_0_19, 16, 1, 0},
/* 20 */ { 7, s_0_20, 19, 1, 0},
/* 21 */ { 3, s_0_21, 14, 1, 0},
/* 22 */ { 6, s_0_22, 21, 1, 0},
/* 23 */ { 3, s_0_23, 14, 1, 0},
/* 24 */ { 3, s_0_24, 14, 1, 0},
/* 25 */ { 2, s_0_25, -1, 1, 0},
/* 26 */ { 3, s_0_26, 25, 1, 0},
/* 27 */ { 3, s_0_27, -1, 3, 0},
/* 28 */ { 3, s_0_28, -1, 1, 0}
};
 91 | 
/* Character data for the two strings of the a_1 table ("dt" and "vt"),
 * stored without a terminating NUL. */
static const symbol s_1_0[2] = { 'd', 't' };
static const symbol s_1_1[2] = { 'v', 't' };

/* Two-entry table of the strings above (length, then characters).
 * NOTE(review): both entries carry -1 in the third and fourth fields; the
 * meaning of those fields is defined by struct among in
 * ../runtime/header.h, which is not visible here. */
static const struct among a_1[2] =
{
/*  0 */ { 2, s_1_0, -1, -1, 0},
/*  1 */ { 2, s_1_1, -1, -1, 0}
};
100 | 
/* Character data for the 11 strings of the a_2 table, stored without a
 * terminating NUL. */
static const symbol s_2_0[3] = { 'l', 'e', 'g' };
static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' };
static const symbol s_2_2[2] = { 'i', 'g' };
static const symbol s_2_3[3] = { 'e', 'i', 'g' };
static const symbol s_2_4[3] = { 'l', 'i', 'g' };
static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' };
static const symbol s_2_6[3] = { 'e', 'l', 's' };
static const symbol s_2_7[3] = { 'l', 'o', 'v' };
static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' };
static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' };
static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' };

/* Table of the strings above (length, then characters).  Every entry has
 * 1 in the fourth field.
 * NOTE(review): as in the other among tables, the third field looks like
 * the index of a shorter entry that is a suffix of this one (-1 when there
 * is none); confirm against struct among in ../runtime/header.h. */
static const struct among a_2[11] =
{
/*  0 */ { 3, s_2_0, -1, 1, 0},
/*  1 */ { 4, s_2_1, 0, 1, 0},
/*  2 */ { 2, s_2_2, -1, 1, 0},
/*  3 */ { 3, s_2_3, 2, 1, 0},
/*  4 */ { 3, s_2_4, 2, 1, 0},
/*  5 */ { 4, s_2_5, 4, 1, 0},
/*  6 */ { 3, s_2_6, -1, 1, 0},
/*  7 */ { 3, s_2_7, -1, 1, 0},
/*  8 */ { 4, s_2_8, 7, 1, 0},
/*  9 */ { 4, s_2_9, 7, 1, 0},
/* 10 */ { 7, s_2_10, 9, 1, 0}
};
 127 | /* g_v: bitmap passed to the grouping helpers together with the range 97..248.
     |  * NOTE(review): assuming bit k of byte j stands for character code
     |  * 97 + 8*j + k, the set bits decode to a e i o u y 0xE5 0xE6 0xF8 (the vowel
     |  * set, in ISO-8859-1) -- confirm the layout against the runtime. */
 128 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
 129 | /* g_s_ending: bitmap used with the range 98..122 in r_main_suffix.
     |  * NOTE(review): under the same layout assumption (base 98) it decodes to
     |  * b c d f g h j l m n o p r t v y z. */
 130 | static const unsigned char g_s_ending[] = { 119, 125, 149, 1 };
 131 | /* Literals used by r_main_suffix: s_0 = "k" (eq_s_b test), s_1 = "er" (replacement). */
 132 | static const symbol s_0[] = { 'k' };
 133 | static const symbol s_1[] = { 'e', 'r' };
 134 | /* r_mark_regions: forward pass that records two positions in z->I.
     |  *   I[1] ("x" in the generated comments): the cursor plus 3; the routine
     |  *        returns 0 if that position falls outside 0..z->l.
     |  *   I[0] ("p1"): the cursor reached after the "goto grouping v" and
     |  *        "gopast non v" scans, raised to I[1] when it would be smaller.
     |  * I[0] is preset to z->l, so an early return 0 leaves p1 at the end of the
     |  * input. Returns 1 on success, 0 when the hop or either scan fails.
     |  * The cursor is NOT restored here; the caller does that.
     |  * NOTE(review): out_grouping/in_grouping are runtime helpers; the goto/gopast
     |  * reading is taken from the generator's own comments -- confirm there. */
 135 | static int r_mark_regions(struct SN_env * z) {
 136 |     z->I[0] = z->l;
 137 |     {   int c_test = z->c; /* test, line 30 */
 138 |         {   int ret = z->c + 3;
 139 |             if (0 > ret || ret > z->l) return 0;
 140 |             z->c = ret; /* hop, line 30 */
 141 |         }
 142 |         z->I[1] = z->c; /* setmark x, line 30 */
 143 |         z->c = c_test;
 144 |     }
 145 |     if (out_grouping(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */
 146 |     {    /* gopast */ /* non v, line 31 */
 147 |         int ret = in_grouping(z, g_v, 97, 248, 1);
 148 |         if (ret < 0) return 0;
 149 |         z->c += ret;
 150 |     }
 151 |     z->I[0] = z->c; /* setmark p1, line 31 */
 152 |      /* try, line 32 */
 153 |     if (!(z->I[0] < z->I[1])) goto lab0;
 154 |     z->I[0] = z->I[1];
 155 | lab0:
 156 |     return 1;
 157 | }
 158 | /* r_main_suffix: backward step that looks up an a_0 ending at the cursor and
     |  * then deletes or rewrites the slice marked by z->ket (set before the lookup)
     |  * and z->bra (set after it).
     |  *   - Returns 0 without editing if the cursor is left of I[0], if the
     |  *     last-byte filter rejects, or if find_among_b returns 0.
     |  *   - During the lookup z->lb is temporarily set to I[0]; the saved value
     |  *     (mlimit) is put back on every exit from that section.
     |  *   - Last-byte filter: the byte before the cursor must lie in 0x60..0x7F and
     |  *     its low five bits must be set in 1851426 (bits 1,5,14,18,19,20, i.e.
     |  *     the letters a e n r s t).
     |  *   - among_var 1: slice_del. among_var 3: slice_from_s with "er".
     |  *   - among_var 2: slice_del if in_grouping_b(g_s_ending) returns 0;
     |  *     otherwise the cursor is reset and slice_del runs only when eq_s_b
     |  *     matches "k" and out_grouping_b(g_v) returns 0 (else return 0).
     |  * Returns 1 after an edit, 0 when nothing applies, or the negative value
     |  * returned by slice_del / slice_from_s. */
 159 | static int r_main_suffix(struct SN_env * z) {
 160 |     int among_var;
 161 |     {   int mlimit; /* setlimit, line 38 */
 162 |         int m1 = z->l - z->c; (void)m1;
 163 |         if (z->c < z->I[0]) return 0;
 164 |         z->c = z->I[0]; /* tomark, line 38 */
 165 |         mlimit = z->lb; z->lb = z->c;
 166 |         z->c = z->l - m1;
 167 |         z->ket = z->c; /* [, line 38 */
 168 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 169 |         among_var = find_among_b(z, a_0, 29); /* substring, line 38 */
 170 |         if (!(among_var)) { z->lb = mlimit; return 0; }
 171 |         z->bra = z->c; /* ], line 38 */
 172 |         z->lb = mlimit;
 173 |     }
 174 |     switch(among_var) {
 175 |         case 0: return 0;
 176 |         case 1:
 177 |             {   int ret = slice_del(z); /* delete, line 44 */
 178 |                 if (ret < 0) return ret;
 179 |             }
 180 |             break;
 181 |         case 2:
 182 |             {   int m2 = z->l - z->c; (void)m2; /* or, line 46 */
 183 |                 if (in_grouping_b(z, g_s_ending, 98, 122, 0)) goto lab1;
 184 |                 goto lab0;
 185 |             lab1:
 186 |                 z->c = z->l - m2;
 187 |                 if (!(eq_s_b(z, 1, s_0))) return 0;
 188 |                 if (out_grouping_b(z, g_v, 97, 248, 0)) return 0;
 189 |             }
 190 |         lab0:
 191 |             {   int ret = slice_del(z); /* delete, line 46 */
 192 |                 if (ret < 0) return ret;
 193 |             }
 194 |             break;
 195 |         case 3:
 196 |             {   int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */
 197 |                 if (ret < 0) return ret;
 198 |             }
 199 |             break;
 200 |     }
 201 |     return 1;
 202 | }
 203 | /* r_consonant_pair: backward step for words ending in "dt" or "vt" (table a_1).
     |  * Phase 1 is a test (the cursor is restored afterwards): with z->lb
     |  * temporarily set to I[0] it records z->ket at the cursor, requires at least
     |  * two bytes between the limit and the cursor with the last one equal to
     |  * 116 ('t'), and requires find_among_b(a_1) to succeed. Any failure restores
     |  * z->lb and returns 0.
     |  * Phase 2 moves the cursor one byte back (return 0 if already at z->lb),
     |  * sets z->bra there and calls slice_del. z->ket still holds the position
     |  * recorded in phase 1, so the slice is presumably just the final 't'.
     |  * Returns 1 after the delete, or slice_del's negative value. */
 204 | static int r_consonant_pair(struct SN_env * z) {
 205 |     {   int m_test = z->l - z->c; /* test, line 53 */
 206 |         {   int mlimit; /* setlimit, line 54 */
 207 |             int m1 = z->l - z->c; (void)m1;
 208 |             if (z->c < z->I[0]) return 0;
 209 |             z->c = z->I[0]; /* tomark, line 54 */
 210 |             mlimit = z->lb; z->lb = z->c;
 211 |             z->c = z->l - m1;
 212 |             z->ket = z->c; /* [, line 54 */
 213 |             if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; }
 214 |             if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */
 215 |             z->bra = z->c; /* ], line 54 */
 216 |             z->lb = mlimit;
 217 |         }
 218 |         z->c = z->l - m_test;
 219 |     }
 220 |     if (z->c <= z->lb) return 0;
 221 |     z->c--; /* next, line 59 */
 222 |     z->bra = z->c; /* ], line 59 */
 223 |     {   int ret = slice_del(z); /* delete, line 59 */
 224 |         if (ret < 0) return ret;
 225 |     }
 226 |     return 1;
 227 | }
 228 | /* r_other_suffix: backward step that looks up an a_2 ending at the cursor and
     |  * deletes it.
     |  *   - Returns 0 if the cursor is left of I[0], if the last-byte filter
     |  *     rejects, or if find_among_b returns 0.
     |  *   - z->lb is temporarily set to I[0] for the lookup and restored on every
     |  *     exit from that section.
     |  *   - Last-byte filter: at least two bytes must lie between the limit and the
     |  *     cursor, the last byte must be in 0x60..0x7F and its low five bits must
     |  *     be set in 4718720 (bits 7,19,22, i.e. the letters g s v).
     |  *   - among_var 1 (the only value present in a_2): slice_del.
     |  * Returns 1 after the delete, 0 when nothing applies, or slice_del's
     |  * negative value. */
 229 | static int r_other_suffix(struct SN_env * z) {
 230 |     int among_var;
 231 |     {   int mlimit; /* setlimit, line 63 */
 232 |         int m1 = z->l - z->c; (void)m1;
 233 |         if (z->c < z->I[0]) return 0;
 234 |         z->c = z->I[0]; /* tomark, line 63 */
 235 |         mlimit = z->lb; z->lb = z->c;
 236 |         z->c = z->l - m1;
 237 |         z->ket = z->c; /* [, line 63 */
 238 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 239 |         among_var = find_among_b(z, a_2, 11); /* substring, line 63 */
 240 |         if (!(among_var)) { z->lb = mlimit; return 0; }
 241 |         z->bra = z->c; /* ], line 63 */
 242 |         z->lb = mlimit;
 243 |     }
 244 |     switch(among_var) {
 245 |         case 0: return 0;
 246 |         case 1:
 247 |             {   int ret = slice_del(z); /* delete, line 67 */
 248 |                 if (ret < 0) return ret;
 249 |             }
 250 |             break;
 251 |     }
 252 |     return 1;
 253 | }
 254 | /* norwegian_ISO_8859_1_stem: public entry point.
     |  *   1. Runs r_mark_regions forwards; a 0 result is ignored and the cursor is
     |  *      restored to where it started.
     |  *   2. Switches to backward mode: z->lb takes the cursor, the cursor moves
     |  *      to z->l.
     |  *   3. Calls r_main_suffix, r_consonant_pair and r_other_suffix in that
     |  *      order; a 0 result is ignored, and after each call the cursor is reset
     |  *      to the same distance from z->l it had before the call.
     |  *   4. Sets the cursor to z->lb and returns 1.
     |  * A negative return from any routine is passed straight back to the caller. */
 255 | extern int norwegian_ISO_8859_1_stem(struct SN_env * z) {
 256 |     {   int c1 = z->c; /* do, line 74 */
 257 |         {   int ret = r_mark_regions(z);
 258 |             if (ret == 0) goto lab0; /* call mark_regions, line 74 */
 259 |             if (ret < 0) return ret;
 260 |         }
 261 |     lab0:
 262 |         z->c = c1;
 263 |     }
 264 |     z->lb = z->c; z->c = z->l; /* backwards, line 75 */
 265 | 
 266 |     {   int m2 = z->l - z->c; (void)m2; /* do, line 76 */
 267 |         {   int ret = r_main_suffix(z);
 268 |             if (ret == 0) goto lab1; /* call main_suffix, line 76 */
 269 |             if (ret < 0) return ret;
 270 |         }
 271 |     lab1:
 272 |         z->c = z->l - m2;
 273 |     }
 274 |     {   int m3 = z->l - z->c; (void)m3; /* do, line 77 */
 275 |         {   int ret = r_consonant_pair(z);
 276 |             if (ret == 0) goto lab2; /* call consonant_pair, line 77 */
 277 |             if (ret < 0) return ret;
 278 |         }
 279 |     lab2:
 280 |         z->c = z->l - m3;
 281 |     }
 282 |     {   int m4 = z->l - z->c; (void)m4; /* do, line 78 */
 283 |         {   int ret = r_other_suffix(z);
 284 |             if (ret == 0) goto lab3; /* call other_suffix, line 78 */
 285 |             if (ret < 0) return ret;
 286 |         }
 287 |     lab3:
 288 |         z->c = z->l - m4;
 289 |     }
 290 |     z->c = z->lb;
 291 |     return 1;
 292 | }
 293 | /* Creates the stemmer environment by forwarding to SN_create_env(0, 2, 0).
     |  * NOTE(review): the 2 presumably sizes z->I (this file uses I[0] and I[1]);
     |  * confirm the argument order against the runtime API. */
 294 | extern struct SN_env * norwegian_ISO_8859_1_create_env(void) { return SN_create_env(0, 2, 0); }
 295 | /* Releases an environment by forwarding to SN_close_env(z, 0); the 0 mirrors the
     |  * first argument given to SN_create_env in the matching create function. */
 296 | extern void norwegian_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 0); }
297 | 
298 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_norwegian.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
  3 | 
  4 | #include "../runtime/header.h"
   5 | /* Declarations. The three public entry points (stem, create_env, close_env)
     |  * get C linkage when this file is compiled as C++; the four r_* routines are
     |  * static and therefore private to this translation unit. */
   6 | #ifdef __cplusplus
   7 | extern "C" {
   8 | #endif
   9 | extern int norwegian_UTF_8_stem(struct SN_env * z);
  10 | #ifdef __cplusplus
  11 | }
  12 | #endif
  13 | static int r_other_suffix(struct SN_env * z);
  14 | static int r_consonant_pair(struct SN_env * z);
  15 | static int r_main_suffix(struct SN_env * z);
  16 | static int r_mark_regions(struct SN_env * z);
  17 | #ifdef __cplusplus
  18 | extern "C" {
  19 | #endif
  20 | 
  21 | 
  22 | extern struct SN_env * norwegian_UTF_8_create_env(void);
  23 | extern void norwegian_UTF_8_close_env(struct SN_env * z);
  24 | 
  25 | 
  26 | #ifdef __cplusplus
  27 | }
  28 | #endif
  29 | static const symbol s_0_0[1] = { 'a' }; /* s_0_N: the 29 ending strings referenced by the rows of a_0 */
  30 | static const symbol s_0_1[1] = { 'e' };
  31 | static const symbol s_0_2[3] = { 'e', 'd', 'e' };
  32 | static const symbol s_0_3[4] = { 'a', 'n', 'd', 'e' };
  33 | static const symbol s_0_4[4] = { 'e', 'n', 'd', 'e' };
  34 | static const symbol s_0_5[3] = { 'a', 'n', 'e' };
  35 | static const symbol s_0_6[3] = { 'e', 'n', 'e' };
  36 | static const symbol s_0_7[6] = { 'h', 'e', 't', 'e', 'n', 'e' };
  37 | static const symbol s_0_8[4] = { 'e', 'r', 't', 'e' };
  38 | static const symbol s_0_9[2] = { 'e', 'n' };
  39 | static const symbol s_0_10[5] = { 'h', 'e', 't', 'e', 'n' };
  40 | static const symbol s_0_11[2] = { 'a', 'r' };
  41 | static const symbol s_0_12[2] = { 'e', 'r' };
  42 | static const symbol s_0_13[5] = { 'h', 'e', 't', 'e', 'r' };
  43 | static const symbol s_0_14[1] = { 's' };
  44 | static const symbol s_0_15[2] = { 'a', 's' };
  45 | static const symbol s_0_16[2] = { 'e', 's' };
  46 | static const symbol s_0_17[4] = { 'e', 'd', 'e', 's' };
  47 | static const symbol s_0_18[5] = { 'e', 'n', 'd', 'e', 's' };
  48 | static const symbol s_0_19[4] = { 'e', 'n', 'e', 's' };
  49 | static const symbol s_0_20[7] = { 'h', 'e', 't', 'e', 'n', 'e', 's' };
  50 | static const symbol s_0_21[3] = { 'e', 'n', 's' };
  51 | static const symbol s_0_22[6] = { 'h', 'e', 't', 'e', 'n', 's' };
  52 | static const symbol s_0_23[3] = { 'e', 'r', 's' };
  53 | static const symbol s_0_24[3] = { 'e', 't', 's' };
  54 | static const symbol s_0_25[2] = { 'e', 't' };
  55 | static const symbol s_0_26[3] = { 'h', 'e', 't' };
  56 | static const symbol s_0_27[3] = { 'e', 'r', 't' };
  57 | static const symbol s_0_28[3] = { 'a', 's', 't' };
  58 | /* a_0: among table handed to find_among_b(z, a_0, 29) by r_main_suffix.
     |  * Per row: field 1 is the length of the s_0_N string named in field 2;
     |  * field 3 is -1 or the index of a shorter row that is a suffix of this one
     |  * (e.g. row 7 "hetene" -> row 6 "ene"); field 4 is 1, 2 or 3, the values
     |  * r_main_suffix switches on (1 = delete, 2 = conditional delete for the bare
     |  * "s", 3 = replace "erte"/"ert" with "er"); field 5 is always 0.
     |  * NOTE(review): the field meanings are read off the data and its use here;
     |  * struct among itself is declared in ../runtime/header.h -- confirm there. */
  59 | static const struct among a_0[29] =
  60 | {
  61 | /*  0 */ { 1, s_0_0, -1, 1, 0},
  62 | /*  1 */ { 1, s_0_1, -1, 1, 0},
  63 | /*  2 */ { 3, s_0_2, 1, 1, 0},
  64 | /*  3 */ { 4, s_0_3, 1, 1, 0},
  65 | /*  4 */ { 4, s_0_4, 1, 1, 0},
  66 | /*  5 */ { 3, s_0_5, 1, 1, 0},
  67 | /*  6 */ { 3, s_0_6, 1, 1, 0},
  68 | /*  7 */ { 6, s_0_7, 6, 1, 0},
  69 | /*  8 */ { 4, s_0_8, 1, 3, 0},
  70 | /*  9 */ { 2, s_0_9, -1, 1, 0},
  71 | /* 10 */ { 5, s_0_10, 9, 1, 0},
  72 | /* 11 */ { 2, s_0_11, -1, 1, 0},
  73 | /* 12 */ { 2, s_0_12, -1, 1, 0},
  74 | /* 13 */ { 5, s_0_13, 12, 1, 0},
  75 | /* 14 */ { 1, s_0_14, -1, 2, 0},
  76 | /* 15 */ { 2, s_0_15, 14, 1, 0},
  77 | /* 16 */ { 2, s_0_16, 14, 1, 0},
  78 | /* 17 */ { 4, s_0_17, 16, 1, 0},
  79 | /* 18 */ { 5, s_0_18, 16, 1, 0},
  80 | /* 19 */ { 4, s_0_19, 16, 1, 0},
  81 | /* 20 */ { 7, s_0_20, 19, 1, 0},
  82 | /* 21 */ { 3, s_0_21, 14, 1, 0},
  83 | /* 22 */ { 6, s_0_22, 21, 1, 0},
  84 | /* 23 */ { 3, s_0_23, 14, 1, 0},
  85 | /* 24 */ { 3, s_0_24, 14, 1, 0},
  86 | /* 25 */ { 2, s_0_25, -1, 1, 0},
  87 | /* 26 */ { 3, s_0_26, 25, 1, 0},
  88 | /* 27 */ { 3, s_0_27, -1, 3, 0},
  89 | /* 28 */ { 3, s_0_28, -1, 1, 0}
  90 | };
  91 | /* s_1_N / a_1: the two-letter endings "dt" and "vt" looked up by
     |  * r_consonant_pair. Field 4 is -1 in both rows; r_consonant_pair only tests
     |  * whether find_among_b returns non-zero and never switches on the value. */
  92 | static const symbol s_1_0[2] = { 'd', 't' };
  93 | static const symbol s_1_1[2] = { 'v', 't' };
  94 | 
  95 | static const struct among a_1[2] =
  96 | {
  97 | /*  0 */ { 2, s_1_0, -1, -1, 0},
  98 | /*  1 */ { 2, s_1_1, -1, -1, 0}
  99 | };
 100 | /* s_2_N / a_2: endings looked up by r_other_suffix (find_among_b(z, a_2, 11)).
     |  * Every row carries 1 in field 4, and r_other_suffix deletes the slice for
     |  * that value. Field 3 is -1 or the index of a shorter row in this table that
     |  * is a suffix of the current one (e.g. row 10 "hetslov" -> row 9 "slov").
     |  * NOTE(review): field semantics come from struct among in
     |  * ../runtime/header.h -- confirm there. */
 101 | static const symbol s_2_0[3] = { 'l', 'e', 'g' };
 102 | static const symbol s_2_1[4] = { 'e', 'l', 'e', 'g' };
 103 | static const symbol s_2_2[2] = { 'i', 'g' };
 104 | static const symbol s_2_3[3] = { 'e', 'i', 'g' };
 105 | static const symbol s_2_4[3] = { 'l', 'i', 'g' };
 106 | static const symbol s_2_5[4] = { 'e', 'l', 'i', 'g' };
 107 | static const symbol s_2_6[3] = { 'e', 'l', 's' };
 108 | static const symbol s_2_7[3] = { 'l', 'o', 'v' };
 109 | static const symbol s_2_8[4] = { 'e', 'l', 'o', 'v' };
 110 | static const symbol s_2_9[4] = { 's', 'l', 'o', 'v' };
 111 | static const symbol s_2_10[7] = { 'h', 'e', 't', 's', 'l', 'o', 'v' };
 112 | 
 113 | static const struct among a_2[11] =
 114 | {
 115 | /*  0 */ { 3, s_2_0, -1, 1, 0},
 116 | /*  1 */ { 4, s_2_1, 0, 1, 0},
 117 | /*  2 */ { 2, s_2_2, -1, 1, 0},
 118 | /*  3 */ { 3, s_2_3, 2, 1, 0},
 119 | /*  4 */ { 3, s_2_4, 2, 1, 0},
 120 | /*  5 */ { 4, s_2_5, 4, 1, 0},
 121 | /*  6 */ { 3, s_2_6, -1, 1, 0},
 122 | /*  7 */ { 3, s_2_7, -1, 1, 0},
 123 | /*  8 */ { 4, s_2_8, 7, 1, 0},
 124 | /*  9 */ { 4, s_2_9, 7, 1, 0},
 125 | /* 10 */ { 7, s_2_10, 9, 1, 0}
 126 | };
 127 | /* g_v: bitmap passed to the _U grouping helpers together with the range 97..248.
     |  * NOTE(review): assuming bit k of byte j stands for code 97 + 8*j + k, the set
     |  * bits decode to a e i o u y 0xE5 0xE6 0xF8; in this UTF-8 build those are
     |  * presumably code points rather than bytes -- confirm against the runtime. */
 128 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
 129 | /* g_s_ending: bitmap used with the range 98..122 in r_main_suffix.
     |  * NOTE(review): under the same layout assumption (base 98) it decodes to
     |  * b c d f g h j l m n o p r t v y z. */
 130 | static const unsigned char g_s_ending[] = { 119, 125, 149, 1 };
 131 | /* Literals used by r_main_suffix: s_0 = "k" (eq_s_b test), s_1 = "er" (replacement). */
 132 | static const symbol s_0[] = { 'k' };
 133 | static const symbol s_1[] = { 'e', 'r' };
 134 | /* r_mark_regions: forward pass that records two positions in z->I.
     |  *   I[1] ("x" in the generated comments): the position skip_utf8 returns for
     |  *        a hop of +3 from the cursor, bounded by 0 and z->l; the routine
     |  *        returns 0 if skip_utf8 reports failure (negative result).
     |  *   I[0] ("p1"): the cursor reached after the "goto grouping v" and
     |  *        "gopast non v" scans, raised to I[1] when it would be smaller.
     |  * I[0] is preset to z->l, so an early return 0 leaves p1 at the end of the
     |  * input. Returns 1 on success, 0 when the hop or either scan fails.
     |  * The cursor is NOT restored here; the caller does that.
     |  * NOTE(review): skip_utf8 and the _U grouping helpers are runtime functions;
     |  * the hop/goto/gopast reading follows the generator's comments -- confirm. */
 135 | static int r_mark_regions(struct SN_env * z) {
 136 |     z->I[0] = z->l;
 137 |     {   int c_test = z->c; /* test, line 30 */
 138 |         {   int ret = skip_utf8(z->p, z->c, 0, z->l, + 3);
 139 |             if (ret < 0) return 0;
 140 |             z->c = ret; /* hop, line 30 */
 141 |         }
 142 |         z->I[1] = z->c; /* setmark x, line 30 */
 143 |         z->c = c_test;
 144 |     }
 145 |     if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 31 */
 146 |     {    /* gopast */ /* non v, line 31 */
 147 |         int ret = in_grouping_U(z, g_v, 97, 248, 1);
 148 |         if (ret < 0) return 0;
 149 |         z->c += ret;
 150 |     }
 151 |     z->I[0] = z->c; /* setmark p1, line 31 */
 152 |      /* try, line 32 */
 153 |     if (!(z->I[0] < z->I[1])) goto lab0;
 154 |     z->I[0] = z->I[1];
 155 | lab0:
 156 |     return 1;
 157 | }
 158 | /* r_main_suffix: backward step that looks up an a_0 ending at the cursor and
     |  * then deletes or rewrites the slice marked by z->ket (set before the lookup)
     |  * and z->bra (set after it).
     |  *   - Returns 0 without editing if the cursor is left of I[0], if the
     |  *     last-byte filter rejects, or if find_among_b returns 0.
     |  *   - During the lookup z->lb is temporarily set to I[0]; the saved value
     |  *     (mlimit) is put back on every exit from that section.
     |  *   - Last-byte filter: the byte before the cursor must lie in 0x60..0x7F and
     |  *     its low five bits must be set in 1851426 (bits 1,5,14,18,19,20, i.e.
     |  *     the final letters a e n r s t of the a_0 strings).
     |  *   - among_var 1: slice_del. among_var 3: slice_from_s with "er".
     |  *   - among_var 2: slice_del if in_grouping_b_U(g_s_ending) returns 0;
     |  *     otherwise the cursor is reset and slice_del runs only when eq_s_b
     |  *     matches "k" and out_grouping_b_U(g_v) returns 0 (else return 0).
     |  * Returns 1 after an edit, 0 when nothing applies, or the negative value
     |  * returned by slice_del / slice_from_s. */
 159 | static int r_main_suffix(struct SN_env * z) {
 160 |     int among_var;
 161 |     {   int mlimit; /* setlimit, line 38 */
 162 |         int m1 = z->l - z->c; (void)m1;
 163 |         if (z->c < z->I[0]) return 0;
 164 |         z->c = z->I[0]; /* tomark, line 38 */
 165 |         mlimit = z->lb; z->lb = z->c;
 166 |         z->c = z->l - m1;
 167 |         z->ket = z->c; /* [, line 38 */
 168 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851426 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 169 |         among_var = find_among_b(z, a_0, 29); /* substring, line 38 */
 170 |         if (!(among_var)) { z->lb = mlimit; return 0; }
 171 |         z->bra = z->c; /* ], line 38 */
 172 |         z->lb = mlimit;
 173 |     }
 174 |     switch(among_var) {
 175 |         case 0: return 0;
 176 |         case 1:
 177 |             {   int ret = slice_del(z); /* delete, line 44 */
 178 |                 if (ret < 0) return ret;
 179 |             }
 180 |             break;
 181 |         case 2:
 182 |             {   int m2 = z->l - z->c; (void)m2; /* or, line 46 */
 183 |                 if (in_grouping_b_U(z, g_s_ending, 98, 122, 0)) goto lab1;
 184 |                 goto lab0;
 185 |             lab1:
 186 |                 z->c = z->l - m2;
 187 |                 if (!(eq_s_b(z, 1, s_0))) return 0;
 188 |                 if (out_grouping_b_U(z, g_v, 97, 248, 0)) return 0;
 189 |             }
 190 |         lab0:
 191 |             {   int ret = slice_del(z); /* delete, line 46 */
 192 |                 if (ret < 0) return ret;
 193 |             }
 194 |             break;
 195 |         case 3:
 196 |             {   int ret = slice_from_s(z, 2, s_1); /* <-, line 48 */
 197 |                 if (ret < 0) return ret;
 198 |             }
 199 |             break;
 200 |     }
 201 |     return 1;
 202 | }
 203 | /* r_consonant_pair: backward step for words ending in "dt" or "vt" (table a_1).
     |  * Phase 1 is a test (the cursor is restored afterwards): with z->lb
     |  * temporarily set to I[0] it records z->ket at the cursor, requires at least
     |  * two bytes between the limit and the cursor with the last one equal to
     |  * 116 ('t'), and requires find_among_b(a_1) to succeed. Any failure restores
     |  * z->lb and returns 0.
     |  * Phase 2 asks skip_utf8 for the position one step (-1) back from the cursor,
     |  * bounded below by z->lb (return 0 on a negative result), sets z->bra there
     |  * and calls slice_del. z->ket still holds the position recorded in phase 1,
     |  * so the slice is presumably just the final 't'.
     |  * Returns 1 after the delete, or slice_del's negative value. */
 204 | static int r_consonant_pair(struct SN_env * z) {
 205 |     {   int m_test = z->l - z->c; /* test, line 53 */
 206 |         {   int mlimit; /* setlimit, line 54 */
 207 |             int m1 = z->l - z->c; (void)m1;
 208 |             if (z->c < z->I[0]) return 0;
 209 |             z->c = z->I[0]; /* tomark, line 54 */
 210 |             mlimit = z->lb; z->lb = z->c;
 211 |             z->c = z->l - m1;
 212 |             z->ket = z->c; /* [, line 54 */
 213 |             if (z->c - 1 <= z->lb || z->p[z->c - 1] != 116) { z->lb = mlimit; return 0; }
 214 |             if (!(find_among_b(z, a_1, 2))) { z->lb = mlimit; return 0; } /* substring, line 54 */
 215 |             z->bra = z->c; /* ], line 54 */
 216 |             z->lb = mlimit;
 217 |         }
 218 |         z->c = z->l - m_test;
 219 |     }
 220 |     {   int ret = skip_utf8(z->p, z->c, z->lb, 0, -1);
 221 |         if (ret < 0) return 0;
 222 |         z->c = ret; /* next, line 59 */
 223 |     }
 224 |     z->bra = z->c; /* ], line 59 */
 225 |     {   int ret = slice_del(z); /* delete, line 59 */
 226 |         if (ret < 0) return ret;
 227 |     }
 228 |     return 1;
 229 | }
 230 | /* r_other_suffix: backward step that looks up an a_2 ending at the cursor and
     |  * deletes it.
     |  *   - Returns 0 if the cursor is left of I[0], if the last-byte filter
     |  *     rejects, or if find_among_b returns 0.
     |  *   - z->lb is temporarily set to I[0] for the lookup and restored on every
     |  *     exit from that section.
     |  *   - Last-byte filter: at least two bytes must lie between the limit and the
     |  *     cursor, the last byte must be in 0x60..0x7F and its low five bits must
     |  *     be set in 4718720 (bits 7,19,22, i.e. the final letters g s v of the
     |  *     a_2 strings).
     |  *   - among_var 1 (the only value present in a_2): slice_del.
     |  * Returns 1 after the delete, 0 when nothing applies, or slice_del's
     |  * negative value. */
 231 | static int r_other_suffix(struct SN_env * z) {
 232 |     int among_var;
 233 |     {   int mlimit; /* setlimit, line 63 */
 234 |         int m1 = z->l - z->c; (void)m1;
 235 |         if (z->c < z->I[0]) return 0;
 236 |         z->c = z->I[0]; /* tomark, line 63 */
 237 |         mlimit = z->lb; z->lb = z->c;
 238 |         z->c = z->l - m1;
 239 |         z->ket = z->c; /* [, line 63 */
 240 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((4718720 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 241 |         among_var = find_among_b(z, a_2, 11); /* substring, line 63 */
 242 |         if (!(among_var)) { z->lb = mlimit; return 0; }
 243 |         z->bra = z->c; /* ], line 63 */
 244 |         z->lb = mlimit;
 245 |     }
 246 |     switch(among_var) {
 247 |         case 0: return 0;
 248 |         case 1:
 249 |             {   int ret = slice_del(z); /* delete, line 67 */
 250 |                 if (ret < 0) return ret;
 251 |             }
 252 |             break;
 253 |     }
 254 |     return 1;
 255 | }
 256 | /* norwegian_UTF_8_stem: public entry point.
     |  *   1. Runs r_mark_regions forwards; a 0 result is ignored and the cursor is
     |  *      restored to where it started.
     |  *   2. Switches to backward mode: z->lb takes the cursor, the cursor moves
     |  *      to z->l.
     |  *   3. Calls r_main_suffix, r_consonant_pair and r_other_suffix in that
     |  *      order; a 0 result is ignored, and after each call the cursor is reset
     |  *      to the same distance from z->l it had before the call.
     |  *   4. Sets the cursor to z->lb and returns 1.
     |  * A negative return from any routine is passed straight back to the caller. */
 257 | extern int norwegian_UTF_8_stem(struct SN_env * z) {
 258 |     {   int c1 = z->c; /* do, line 74 */
 259 |         {   int ret = r_mark_regions(z);
 260 |             if (ret == 0) goto lab0; /* call mark_regions, line 74 */
 261 |             if (ret < 0) return ret;
 262 |         }
 263 |     lab0:
 264 |         z->c = c1;
 265 |     }
 266 |     z->lb = z->c; z->c = z->l; /* backwards, line 75 */
 267 | 
 268 |     {   int m2 = z->l - z->c; (void)m2; /* do, line 76 */
 269 |         {   int ret = r_main_suffix(z);
 270 |             if (ret == 0) goto lab1; /* call main_suffix, line 76 */
 271 |             if (ret < 0) return ret;
 272 |         }
 273 |     lab1:
 274 |         z->c = z->l - m2;
 275 |     }
 276 |     {   int m3 = z->l - z->c; (void)m3; /* do, line 77 */
 277 |         {   int ret = r_consonant_pair(z);
 278 |             if (ret == 0) goto lab2; /* call consonant_pair, line 77 */
 279 |             if (ret < 0) return ret;
 280 |         }
 281 |     lab2:
 282 |         z->c = z->l - m3;
 283 |     }
 284 |     {   int m4 = z->l - z->c; (void)m4; /* do, line 78 */
 285 |         {   int ret = r_other_suffix(z);
 286 |             if (ret == 0) goto lab3; /* call other_suffix, line 78 */
 287 |             if (ret < 0) return ret;
 288 |         }
 289 |     lab3:
 290 |         z->c = z->l - m4;
 291 |     }
 292 |     z->c = z->lb;
 293 |     return 1;
 294 | }
 295 | /* Creates the stemmer environment by forwarding to SN_create_env(0, 2, 0).
     |  * NOTE(review): the 2 presumably sizes z->I (this file uses I[0] and I[1]);
     |  * confirm the argument order against the runtime API. */
 296 | extern struct SN_env * norwegian_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); }
 297 | /* Releases an environment by forwarding to SN_close_env(z, 0); the 0 mirrors the
     |  * first argument given to SN_create_env in the matching create function. */
 298 | extern void norwegian_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); }
299 | 
300 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
  3 | 
  4 | #include "../runtime/header.h"
   5 | /* Declarations. The three public entry points (stem, create_env, close_env)
     |  * get C linkage when this file is compiled as C++; the four r_* routines are
     |  * static and therefore private to this translation unit. */
   6 | #ifdef __cplusplus
   7 | extern "C" {
   8 | #endif
   9 | extern int swedish_ISO_8859_1_stem(struct SN_env * z);
  10 | #ifdef __cplusplus
  11 | }
  12 | #endif
  13 | static int r_other_suffix(struct SN_env * z);
  14 | static int r_consonant_pair(struct SN_env * z);
  15 | static int r_main_suffix(struct SN_env * z);
  16 | static int r_mark_regions(struct SN_env * z);
  17 | #ifdef __cplusplus
  18 | extern "C" {
  19 | #endif
  20 | 
  21 | 
  22 | extern struct SN_env * swedish_ISO_8859_1_create_env(void);
  23 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z);
  24 | 
  25 | 
  26 | #ifdef __cplusplus
  27 | }
  28 | #endif
  29 | static const symbol s_0_0[1] = { 'a' }; /* s_0_N: the 37 ending strings referenced by the rows of a_0 */
  30 | static const symbol s_0_1[4] = { 'a', 'r', 'n', 'a' };
  31 | static const symbol s_0_2[4] = { 'e', 'r', 'n', 'a' };
  32 | static const symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' };
  33 | static const symbol s_0_4[4] = { 'o', 'r', 'n', 'a' };
  34 | static const symbol s_0_5[2] = { 'a', 'd' };
  35 | static const symbol s_0_6[1] = { 'e' };
  36 | static const symbol s_0_7[3] = { 'a', 'd', 'e' };
  37 | static const symbol s_0_8[4] = { 'a', 'n', 'd', 'e' };
  38 | static const symbol s_0_9[4] = { 'a', 'r', 'n', 'e' };
  39 | static const symbol s_0_10[3] = { 'a', 'r', 'e' };
  40 | static const symbol s_0_11[4] = { 'a', 's', 't', 'e' };
  41 | static const symbol s_0_12[2] = { 'e', 'n' };
  42 | static const symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' };
  43 | static const symbol s_0_14[4] = { 'a', 'r', 'e', 'n' };
  44 | static const symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' };
  45 | static const symbol s_0_16[3] = { 'e', 'r', 'n' };
  46 | static const symbol s_0_17[2] = { 'a', 'r' };
  47 | static const symbol s_0_18[2] = { 'e', 'r' };
  48 | static const symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' };
  49 | static const symbol s_0_20[2] = { 'o', 'r' };
  50 | static const symbol s_0_21[1] = { 's' };
  51 | static const symbol s_0_22[2] = { 'a', 's' };
  52 | static const symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' };
  53 | static const symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' };
  54 | static const symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' };
  55 | static const symbol s_0_26[2] = { 'e', 's' };
  56 | static const symbol s_0_27[4] = { 'a', 'd', 'e', 's' };
  57 | static const symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' };
  58 | static const symbol s_0_29[3] = { 'e', 'n', 's' };
  59 | static const symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' };
  60 | static const symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' };
  61 | static const symbol s_0_32[4] = { 'e', 'r', 'n', 's' };
  62 | static const symbol s_0_33[2] = { 'a', 't' };
  63 | static const symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' };
  64 | static const symbol s_0_35[3] = { 'h', 'e', 't' };
  65 | static const symbol s_0_36[3] = { 'a', 's', 't' };
  66 | /* a_0: among table handed to find_among_b(z, a_0, 37) by r_main_suffix.
     |  * Per row: field 1 is the length of the s_0_N string named in field 2;
     |  * field 3 is -1 or the index of a shorter row that is a suffix of this one
     |  * (e.g. row 3 "heterna" -> row 2 "erna"); field 4 is 1 or 2, the values
     |  * r_main_suffix switches on (1 = delete, 2 = conditional delete, used only
     |  * by the bare "s" in row 21); field 5 is always 0.
     |  * NOTE(review): the field meanings are read off the data and its use here;
     |  * struct among itself is declared in ../runtime/header.h -- confirm there. */
  67 | static const struct among a_0[37] =
  68 | {
  69 | /*  0 */ { 1, s_0_0, -1, 1, 0},
  70 | /*  1 */ { 4, s_0_1, 0, 1, 0},
  71 | /*  2 */ { 4, s_0_2, 0, 1, 0},
  72 | /*  3 */ { 7, s_0_3, 2, 1, 0},
  73 | /*  4 */ { 4, s_0_4, 0, 1, 0},
  74 | /*  5 */ { 2, s_0_5, -1, 1, 0},
  75 | /*  6 */ { 1, s_0_6, -1, 1, 0},
  76 | /*  7 */ { 3, s_0_7, 6, 1, 0},
  77 | /*  8 */ { 4, s_0_8, 6, 1, 0},
  78 | /*  9 */ { 4, s_0_9, 6, 1, 0},
  79 | /* 10 */ { 3, s_0_10, 6, 1, 0},
  80 | /* 11 */ { 4, s_0_11, 6, 1, 0},
  81 | /* 12 */ { 2, s_0_12, -1, 1, 0},
  82 | /* 13 */ { 5, s_0_13, 12, 1, 0},
  83 | /* 14 */ { 4, s_0_14, 12, 1, 0},
  84 | /* 15 */ { 5, s_0_15, 12, 1, 0},
  85 | /* 16 */ { 3, s_0_16, -1, 1, 0},
  86 | /* 17 */ { 2, s_0_17, -1, 1, 0},
  87 | /* 18 */ { 2, s_0_18, -1, 1, 0},
  88 | /* 19 */ { 5, s_0_19, 18, 1, 0},
  89 | /* 20 */ { 2, s_0_20, -1, 1, 0},
  90 | /* 21 */ { 1, s_0_21, -1, 2, 0},
  91 | /* 22 */ { 2, s_0_22, 21, 1, 0},
  92 | /* 23 */ { 5, s_0_23, 22, 1, 0},
  93 | /* 24 */ { 5, s_0_24, 22, 1, 0},
  94 | /* 25 */ { 5, s_0_25, 22, 1, 0},
  95 | /* 26 */ { 2, s_0_26, 21, 1, 0},
  96 | /* 27 */ { 4, s_0_27, 26, 1, 0},
  97 | /* 28 */ { 5, s_0_28, 26, 1, 0},
  98 | /* 29 */ { 3, s_0_29, 21, 1, 0},
  99 | /* 30 */ { 5, s_0_30, 29, 1, 0},
 100 | /* 31 */ { 6, s_0_31, 29, 1, 0},
 101 | /* 32 */ { 4, s_0_32, 21, 1, 0},
 102 | /* 33 */ { 2, s_0_33, -1, 1, 0},
 103 | /* 34 */ { 5, s_0_34, -1, 1, 0},
 104 | /* 35 */ { 3, s_0_35, -1, 1, 0},
 105 | /* 36 */ { 3, s_0_36, -1, 1, 0}
 106 | };
 107 | /* s_1_N / a_1: the seven two-letter endings (dd gd nn dt gt kt tt) looked up
     |  * by r_consonant_pair. Field 4 is -1 in every row; r_consonant_pair only
     |  * tests whether find_among_b returns non-zero and never switches on it. */
 108 | static const symbol s_1_0[2] = { 'd', 'd' };
 109 | static const symbol s_1_1[2] = { 'g', 'd' };
 110 | static const symbol s_1_2[2] = { 'n', 'n' };
 111 | static const symbol s_1_3[2] = { 'd', 't' };
 112 | static const symbol s_1_4[2] = { 'g', 't' };
 113 | static const symbol s_1_5[2] = { 'k', 't' };
 114 | static const symbol s_1_6[2] = { 't', 't' };
 115 | 
 116 | static const struct among a_1[7] =
 117 | {
 118 | /*  0 */ { 2, s_1_0, -1, -1, 0},
 119 | /*  1 */ { 2, s_1_1, -1, -1, 0},
 120 | /*  2 */ { 2, s_1_2, -1, -1, 0},
 121 | /*  3 */ { 2, s_1_3, -1, -1, 0},
 122 | /*  4 */ { 2, s_1_4, -1, -1, 0},
 123 | /*  5 */ { 2, s_1_5, -1, -1, 0},
 124 | /*  6 */ { 2, s_1_6, -1, -1, 0}
 125 | };
 126 | /* s_2_N / a_2: endings looked up by r_other_suffix (find_among_b(z, a_2, 5)).
     |  * Field 4 selects the action in r_other_suffix: 1 = delete ("ig", "lig",
     |  * "els"), 2 = replace with s_0 (row 4, 'l' 0xF6 's' 't'), 3 = replace with
     |  * s_1 "full" (row 3, "fullt"). 0xF6 is a single ISO-8859-1 byte here.
     |  * NOTE(review): field semantics come from struct among in
     |  * ../runtime/header.h -- confirm there. */
 127 | static const symbol s_2_0[2] = { 'i', 'g' };
 128 | static const symbol s_2_1[3] = { 'l', 'i', 'g' };
 129 | static const symbol s_2_2[3] = { 'e', 'l', 's' };
 130 | static const symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' };
 131 | static const symbol s_2_4[4] = { 'l', 0xF6, 's', 't' };
 132 | 
 133 | static const struct among a_2[5] =
 134 | {
 135 | /*  0 */ { 2, s_2_0, -1, 1, 0},
 136 | /*  1 */ { 3, s_2_1, 0, 1, 0},
 137 | /*  2 */ { 3, s_2_2, -1, 1, 0},
 138 | /*  3 */ { 5, s_2_3, -1, 3, 0},
 139 | /*  4 */ { 4, s_2_4, -1, 2, 0}
 140 | };
 141 | /* g_v: bitmap passed to the grouping helpers together with the range 97..246.
     |  * NOTE(review): assuming bit k of byte j stands for character code
     |  * 97 + 8*j + k, the set bits decode to a e i o u y 0xE4 0xE5 0xF6 (the vowel
     |  * set, in ISO-8859-1) -- confirm the layout against the runtime. */
 142 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 };
 143 | /* g_s_ending: bitmap used with the range 98..121 in r_main_suffix.
     |  * NOTE(review): under the same layout assumption (base 98) it decodes to
     |  * b c d f g h j k l m n o p r t v y. */
 144 | static const unsigned char g_s_ending[] = { 119, 127, 149 };
 145 | /* Replacement literals used by r_other_suffix: s_0 = 'l' 0xF6 's', s_1 = "full". */
 146 | static const symbol s_0[] = { 'l', 0xF6, 's' };
 147 | static const symbol s_1[] = { 'f', 'u', 'l', 'l' };
 148 | /* r_mark_regions: forward pass that records two positions in z->I.
     |  *   I[1] ("x" in the generated comments): the cursor plus 3; the routine
     |  *        returns 0 if that position falls outside 0..z->l.
     |  *   I[0] ("p1"): the cursor reached after the "goto grouping v" and
     |  *        "gopast non v" scans, raised to I[1] when it would be smaller.
     |  * I[0] is preset to z->l, so an early return 0 leaves p1 at the end of the
     |  * input. Returns 1 on success, 0 when the hop or either scan fails.
     |  * The cursor is NOT restored here; the caller does that.
     |  * NOTE(review): out_grouping/in_grouping are runtime helpers; the goto/gopast
     |  * reading is taken from the generator's own comments -- confirm there. */
 149 | static int r_mark_regions(struct SN_env * z) {
 150 |     z->I[0] = z->l;
 151 |     {   int c_test = z->c; /* test, line 29 */
 152 |         {   int ret = z->c + 3;
 153 |             if (0 > ret || ret > z->l) return 0;
 154 |             z->c = ret; /* hop, line 29 */
 155 |         }
 156 |         z->I[1] = z->c; /* setmark x, line 29 */
 157 |         z->c = c_test;
 158 |     }
 159 |     if (out_grouping(z, g_v, 97, 246, 1) < 0) return 0; /* goto */ /* grouping v, line 30 */
 160 |     {    /* gopast */ /* non v, line 30 */
 161 |         int ret = in_grouping(z, g_v, 97, 246, 1);
 162 |         if (ret < 0) return 0;
 163 |         z->c += ret;
 164 |     }
 165 |     z->I[0] = z->c; /* setmark p1, line 30 */
 166 |      /* try, line 31 */
 167 |     if (!(z->I[0] < z->I[1])) goto lab0;
 168 |     z->I[0] = z->I[1];
 169 | lab0:
 170 |     return 1;
 171 | }
 172 | /* r_main_suffix: backward step that looks up an a_0 ending at the cursor and
     |  * deletes the slice marked by z->ket (set before the lookup) and z->bra (set
     |  * after it).
     |  *   - Returns 0 without editing if the cursor is left of I[0], if the
     |  *     last-byte filter rejects, or if find_among_b returns 0.
     |  *   - During the lookup z->lb is temporarily set to I[0]; the saved value
     |  *     (mlimit) is put back on every exit from that section.
     |  *   - Last-byte filter: the byte before the cursor must lie in 0x60..0x7F and
     |  *     its low five bits must be set in 1851442 (bits 1,4,5,14,18,19,20, i.e.
     |  *     the final letters a d e n r s t of the a_0 strings).
     |  *   - among_var 1: slice_del.
     |  *   - among_var 2: slice_del only if in_grouping_b(g_s_ending) returns 0;
     |  *     a non-zero result makes the routine return 0.
     |  * Returns 1 after a delete, 0 when nothing applies, or slice_del's negative
     |  * value. */
 173 | static int r_main_suffix(struct SN_env * z) {
 174 |     int among_var;
 175 |     {   int mlimit; /* setlimit, line 37 */
 176 |         int m1 = z->l - z->c; (void)m1;
 177 |         if (z->c < z->I[0]) return 0;
 178 |         z->c = z->I[0]; /* tomark, line 37 */
 179 |         mlimit = z->lb; z->lb = z->c;
 180 |         z->c = z->l - m1;
 181 |         z->ket = z->c; /* [, line 37 */
 182 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851442 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 183 |         among_var = find_among_b(z, a_0, 37); /* substring, line 37 */
 184 |         if (!(among_var)) { z->lb = mlimit; return 0; }
 185 |         z->bra = z->c; /* ], line 37 */
 186 |         z->lb = mlimit;
 187 |     }
 188 |     switch(among_var) {
 189 |         case 0: return 0;
 190 |         case 1:
 191 |             {   int ret = slice_del(z); /* delete, line 44 */
 192 |                 if (ret < 0) return ret;
 193 |             }
 194 |             break;
 195 |         case 2:
 196 |             if (in_grouping_b(z, g_s_ending, 98, 121, 0)) return 0;
 197 |             {   int ret = slice_del(z); /* delete, line 46 */
 198 |                 if (ret < 0) return ret;
 199 |             }
 200 |             break;
 201 |     }
 202 |     return 1;
 203 | }
 204 | /* r_consonant_pair: backward step that drops the last letter of a word ending
     |  * in one of the a_1 pairs (dd gd nn dt gt kt tt). Everything runs with z->lb
     |  * temporarily set to I[0].
     |  *   - Returns 0 if the cursor is left of I[0], if the last-byte filter
     |  *     rejects (fewer than two bytes available, byte outside 0x60..0x7F, or low
     |  *     five bits not set in 1064976 = bits 4,14,20, i.e. d n t), or if
     |  *     find_among_b(a_1) returns 0.
     |  *   - On a match the cursor is put back where it was (the "and" in the
     |  *     generated comment), z->ket is set there, the cursor moves one byte
     |  *     back, z->bra is set, and slice_del is called.
     |  * Returns 1 after the delete, or slice_del's negative value.
     |  * NOTE(review): when slice_del returns a negative value the routine returns
     |  * before the z->lb = mlimit restore below, leaving z->lb at I[0]. */
 205 | static int r_consonant_pair(struct SN_env * z) {
 206 |     {   int mlimit; /* setlimit, line 50 */
 207 |         int m1 = z->l - z->c; (void)m1;
 208 |         if (z->c < z->I[0]) return 0;
 209 |         z->c = z->I[0]; /* tomark, line 50 */
 210 |         mlimit = z->lb; z->lb = z->c;
 211 |         z->c = z->l - m1;
 212 |         {   int m2 = z->l - z->c; (void)m2; /* and, line 52 */
 213 |             if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1064976 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 214 |             if (!(find_among_b(z, a_1, 7))) { z->lb = mlimit; return 0; } /* among, line 51 */
 215 |             z->c = z->l - m2;
 216 |             z->ket = z->c; /* [, line 52 */
 217 |             if (z->c <= z->lb) { z->lb = mlimit; return 0; }
 218 |             z->c--; /* next, line 52 */
 219 |             z->bra = z->c; /* ], line 52 */
 220 |             {   int ret = slice_del(z); /* delete, line 52 */
 221 |                 if (ret < 0) return ret;
 222 |             }
 223 |         }
 224 |         z->lb = mlimit;
 225 |     }
 226 |     return 1;
 227 | }
 228 | /* r_other_suffix: backward step that looks up an a_2 ending at the cursor and
     |  * deletes or rewrites it. Everything, including the edit, runs with z->lb
     |  * temporarily set to I[0].
     |  *   - Returns 0 if the cursor is left of I[0], if the last-byte filter
     |  *     rejects (fewer than two bytes available, byte outside 0x60..0x7F, or low
     |  *     five bits not set in 1572992 = bits 7,19,20, i.e. g s t), or if
     |  *     find_among_b returns 0.
     |  *   - among_var 1: slice_del.
     |  *   - among_var 2: slice_from_s with the 3-symbol s_0 ('l' 0xF6 's').
     |  *   - among_var 3: slice_from_s with the 4-symbol s_1 ("full").
     |  * Returns 1 after an edit, 0 when nothing applies, or the negative value
     |  * from slice_del / slice_from_s.
     |  * NOTE(review): on such a negative return the routine exits before the final
     |  * z->lb = mlimit restore, leaving z->lb at I[0]. */
 229 | static int r_other_suffix(struct SN_env * z) {
 230 |     int among_var;
 231 |     {   int mlimit; /* setlimit, line 55 */
 232 |         int m1 = z->l - z->c; (void)m1;
 233 |         if (z->c < z->I[0]) return 0;
 234 |         z->c = z->I[0]; /* tomark, line 55 */
 235 |         mlimit = z->lb; z->lb = z->c;
 236 |         z->c = z->l - m1;
 237 |         z->ket = z->c; /* [, line 56 */
 238 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
 239 |         among_var = find_among_b(z, a_2, 5); /* substring, line 56 */
 240 |         if (!(among_var)) { z->lb = mlimit; return 0; }
 241 |         z->bra = z->c; /* ], line 56 */
 242 |         switch(among_var) {
 243 |             case 0: { z->lb = mlimit; return 0; }
 244 |             case 1:
 245 |                 {   int ret = slice_del(z); /* delete, line 57 */
 246 |                     if (ret < 0) return ret;
 247 |                 }
 248 |                 break;
 249 |             case 2:
 250 |                 {   int ret = slice_from_s(z, 3, s_0); /* <-, line 58 */
 251 |                     if (ret < 0) return ret;
 252 |                 }
 253 |                 break;
 254 |             case 3:
 255 |                 {   int ret = slice_from_s(z, 4, s_1); /* <-, line 59 */
 256 |                     if (ret < 0) return ret;
 257 |                 }
 258 |                 break;
 259 |         }
 260 |         z->lb = mlimit;
 261 |     }
 262 |     return 1;
 263 | }
 264 | /* swedish_ISO_8859_1_stem: public entry point.
     |  *   1. Runs r_mark_regions forwards; a 0 result is ignored and the cursor is
     |  *      restored to where it started.
     |  *   2. Switches to backward mode: z->lb takes the cursor, the cursor moves
     |  *      to z->l.
     |  *   3. Calls r_main_suffix, r_consonant_pair and r_other_suffix in that
     |  *      order; a 0 result is ignored, and after each call the cursor is reset
     |  *      to the same distance from z->l it had before the call.
     |  *   4. Sets the cursor to z->lb and returns 1.
     |  * A negative return from any routine is passed straight back to the caller. */
 265 | extern int swedish_ISO_8859_1_stem(struct SN_env * z) {
 266 |     {   int c1 = z->c; /* do, line 66 */
 267 |         {   int ret = r_mark_regions(z);
 268 |             if (ret == 0) goto lab0; /* call mark_regions, line 66 */
 269 |             if (ret < 0) return ret;
 270 |         }
 271 |     lab0:
 272 |         z->c = c1;
 273 |     }
 274 |     z->lb = z->c; z->c = z->l; /* backwards, line 67 */
 275 | 
 276 |     {   int m2 = z->l - z->c; (void)m2; /* do, line 68 */
 277 |         {   int ret = r_main_suffix(z);
 278 |             if (ret == 0) goto lab1; /* call main_suffix, line 68 */
 279 |             if (ret < 0) return ret;
 280 |         }
 281 |     lab1:
 282 |         z->c = z->l - m2;
 283 |     }
 284 |     {   int m3 = z->l - z->c; (void)m3; /* do, line 69 */
 285 |         {   int ret = r_consonant_pair(z);
 286 |             if (ret == 0) goto lab2; /* call consonant_pair, line 69 */
 287 |             if (ret < 0) return ret;
 288 |         }
 289 |     lab2:
 290 |         z->c = z->l - m3;
 291 |     }
 292 |     {   int m4 = z->l - z->c; (void)m4; /* do, line 70 */
 293 |         {   int ret = r_other_suffix(z);
 294 |             if (ret == 0) goto lab3; /* call other_suffix, line 70 */
 295 |             if (ret < 0) return ret;
 296 |         }
 297 |     lab3:
 298 |         z->c = z->l - m4;
 299 |     }
 300 |     z->c = z->lb;
 301 |     return 1;
 302 | }
303 | 
304 | extern struct SN_env * swedish_ISO_8859_1_create_env(void) { return SN_create_env(0, 2, 0); } /* Returns whatever SN_create_env(0, 2, 0) returns; NOTE(review): the three counts are interpreted by the runtime, not visible here -- confirm in runtime/api.h */
305 | 
306 | extern void swedish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 0); } /* Hands z to SN_close_env; the 0 matches the first argument given to SN_create_env above */
307 | 
308 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_swedish.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
  3 | 
  4 | #include "../runtime/header.h"
  5 | 
  6 | #ifdef __cplusplus
  7 | extern "C" {
  8 | #endif
  9 | extern int swedish_UTF_8_stem(struct SN_env * z);
 10 | #ifdef __cplusplus
 11 | }
 12 | #endif
 13 | static int r_other_suffix(struct SN_env * z);
 14 | static int r_consonant_pair(struct SN_env * z);
 15 | static int r_main_suffix(struct SN_env * z);
 16 | static int r_mark_regions(struct SN_env * z);
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | 
 22 | extern struct SN_env * swedish_UTF_8_create_env(void);
 23 | extern void swedish_UTF_8_close_env(struct SN_env * z);
 24 | 
 25 | 
 26 | #ifdef __cplusplus
 27 | }
 28 | #endif
 29 | static const symbol s_0_0[1] = { 'a' }; /* s_0_N: byte strings of the main-suffix endings; s_0_N is referenced by entry N of a_0 below */
 30 | static const symbol s_0_1[4] = { 'a', 'r', 'n', 'a' };
 31 | static const symbol s_0_2[4] = { 'e', 'r', 'n', 'a' };
 32 | static const symbol s_0_3[7] = { 'h', 'e', 't', 'e', 'r', 'n', 'a' };
 33 | static const symbol s_0_4[4] = { 'o', 'r', 'n', 'a' };
 34 | static const symbol s_0_5[2] = { 'a', 'd' };
 35 | static const symbol s_0_6[1] = { 'e' };
 36 | static const symbol s_0_7[3] = { 'a', 'd', 'e' };
 37 | static const symbol s_0_8[4] = { 'a', 'n', 'd', 'e' };
 38 | static const symbol s_0_9[4] = { 'a', 'r', 'n', 'e' };
 39 | static const symbol s_0_10[3] = { 'a', 'r', 'e' };
 40 | static const symbol s_0_11[4] = { 'a', 's', 't', 'e' };
 41 | static const symbol s_0_12[2] = { 'e', 'n' };
 42 | static const symbol s_0_13[5] = { 'a', 'n', 'd', 'e', 'n' };
 43 | static const symbol s_0_14[4] = { 'a', 'r', 'e', 'n' };
 44 | static const symbol s_0_15[5] = { 'h', 'e', 't', 'e', 'n' };
 45 | static const symbol s_0_16[3] = { 'e', 'r', 'n' };
 46 | static const symbol s_0_17[2] = { 'a', 'r' };
 47 | static const symbol s_0_18[2] = { 'e', 'r' };
 48 | static const symbol s_0_19[5] = { 'h', 'e', 't', 'e', 'r' };
 49 | static const symbol s_0_20[2] = { 'o', 'r' };
 50 | static const symbol s_0_21[1] = { 's' };
 51 | static const symbol s_0_22[2] = { 'a', 's' };
 52 | static const symbol s_0_23[5] = { 'a', 'r', 'n', 'a', 's' };
 53 | static const symbol s_0_24[5] = { 'e', 'r', 'n', 'a', 's' };
 54 | static const symbol s_0_25[5] = { 'o', 'r', 'n', 'a', 's' };
 55 | static const symbol s_0_26[2] = { 'e', 's' };
 56 | static const symbol s_0_27[4] = { 'a', 'd', 'e', 's' };
 57 | static const symbol s_0_28[5] = { 'a', 'n', 'd', 'e', 's' };
 58 | static const symbol s_0_29[3] = { 'e', 'n', 's' };
 59 | static const symbol s_0_30[5] = { 'a', 'r', 'e', 'n', 's' };
 60 | static const symbol s_0_31[6] = { 'h', 'e', 't', 'e', 'n', 's' };
 61 | static const symbol s_0_32[4] = { 'e', 'r', 'n', 's' };
 62 | static const symbol s_0_33[2] = { 'a', 't' };
 63 | static const symbol s_0_34[5] = { 'a', 'n', 'd', 'e', 't' };
 64 | static const symbol s_0_35[3] = { 'h', 'e', 't' };
 65 | static const symbol s_0_36[3] = { 'a', 's', 't' };
 66 | 
 67 | static const struct among a_0[37] = /* Table searched by find_among_b in r_main_suffix. Per entry: string length, string, then NOTE(review): presumably the index of a shorter entry that is a suffix of this one (or -1), then the code r_main_suffix switches on, then a field that is 0 throughout -- confirm field meanings in runtime/header.h */
 68 | {
 69 | /*  0 */ { 1, s_0_0, -1, 1, 0},
 70 | /*  1 */ { 4, s_0_1, 0, 1, 0},
 71 | /*  2 */ { 4, s_0_2, 0, 1, 0},
 72 | /*  3 */ { 7, s_0_3, 2, 1, 0},
 73 | /*  4 */ { 4, s_0_4, 0, 1, 0},
 74 | /*  5 */ { 2, s_0_5, -1, 1, 0},
 75 | /*  6 */ { 1, s_0_6, -1, 1, 0},
 76 | /*  7 */ { 3, s_0_7, 6, 1, 0},
 77 | /*  8 */ { 4, s_0_8, 6, 1, 0},
 78 | /*  9 */ { 4, s_0_9, 6, 1, 0},
 79 | /* 10 */ { 3, s_0_10, 6, 1, 0},
 80 | /* 11 */ { 4, s_0_11, 6, 1, 0},
 81 | /* 12 */ { 2, s_0_12, -1, 1, 0},
 82 | /* 13 */ { 5, s_0_13, 12, 1, 0},
 83 | /* 14 */ { 4, s_0_14, 12, 1, 0},
 84 | /* 15 */ { 5, s_0_15, 12, 1, 0},
 85 | /* 16 */ { 3, s_0_16, -1, 1, 0},
 86 | /* 17 */ { 2, s_0_17, -1, 1, 0},
 87 | /* 18 */ { 2, s_0_18, -1, 1, 0},
 88 | /* 19 */ { 5, s_0_19, 18, 1, 0},
 89 | /* 20 */ { 2, s_0_20, -1, 1, 0},
 90 | /* 21 */ { 1, s_0_21, -1, 2, 0}, /* the only entry with code 2: a bare 's' goes through the extra g_s_ending check in r_main_suffix */
 91 | /* 22 */ { 2, s_0_22, 21, 1, 0},
 92 | /* 23 */ { 5, s_0_23, 22, 1, 0},
 93 | /* 24 */ { 5, s_0_24, 22, 1, 0},
 94 | /* 25 */ { 5, s_0_25, 22, 1, 0},
 95 | /* 26 */ { 2, s_0_26, 21, 1, 0},
 96 | /* 27 */ { 4, s_0_27, 26, 1, 0},
 97 | /* 28 */ { 5, s_0_28, 26, 1, 0},
 98 | /* 29 */ { 3, s_0_29, 21, 1, 0},
 99 | /* 30 */ { 5, s_0_30, 29, 1, 0},
100 | /* 31 */ { 6, s_0_31, 29, 1, 0},
101 | /* 32 */ { 4, s_0_32, 21, 1, 0},
102 | /* 33 */ { 2, s_0_33, -1, 1, 0},
103 | /* 34 */ { 5, s_0_34, -1, 1, 0},
104 | /* 35 */ { 3, s_0_35, -1, 1, 0},
105 | /* 36 */ { 3, s_0_36, -1, 1, 0}
106 | };
107 | 
108 | static const symbol s_1_0[2] = { 'd', 'd' }; /* s_1_N: the two-letter consonant pairs tested by r_consonant_pair */
109 | static const symbol s_1_1[2] = { 'g', 'd' };
110 | static const symbol s_1_2[2] = { 'n', 'n' };
111 | static const symbol s_1_3[2] = { 'd', 't' };
112 | static const symbol s_1_4[2] = { 'g', 't' };
113 | static const symbol s_1_5[2] = { 'k', 't' };
114 | static const symbol s_1_6[2] = { 't', 't' };
115 | 
116 | static const struct among a_1[7] = /* Searched by find_among_b in r_consonant_pair, which only tests the result for zero/non-zero; every entry carries -1 in the fourth field */
117 | {
118 | /*  0 */ { 2, s_1_0, -1, -1, 0},
119 | /*  1 */ { 2, s_1_1, -1, -1, 0},
120 | /*  2 */ { 2, s_1_2, -1, -1, 0},
121 | /*  3 */ { 2, s_1_3, -1, -1, 0},
122 | /*  4 */ { 2, s_1_4, -1, -1, 0},
123 | /*  5 */ { 2, s_1_5, -1, -1, 0},
124 | /*  6 */ { 2, s_1_6, -1, -1, 0}
125 | };
126 | 
127 | static const symbol s_2_0[2] = { 'i', 'g' }; /* s_2_N: endings handled by r_other_suffix */
128 | static const symbol s_2_1[3] = { 'l', 'i', 'g' };
129 | static const symbol s_2_2[3] = { 'e', 'l', 's' };
130 | static const symbol s_2_3[5] = { 'f', 'u', 'l', 'l', 't' };
131 | static const symbol s_2_4[5] = { 'l', 0xC3, 0xB6, 's', 't' }; /* 0xC3 0xB6 is the UTF-8 encoding of U+00F6, so this is "l(o-umlaut)st" in 5 bytes */
132 | 
133 | static const struct among a_2[5] = /* Searched by find_among_b in r_other_suffix; the fourth field (1, 2 or 3) matches the switch cases there: 1 = delete, 2 = replace with s_0, 3 = replace with s_1 */
134 | {
135 | /*  0 */ { 2, s_2_0, -1, 1, 0},
136 | /*  1 */ { 3, s_2_1, 0, 1, 0},
137 | /*  2 */ { 3, s_2_2, -1, 1, 0},
138 | /*  3 */ { 5, s_2_3, -1, 3, 0},
139 | /*  4 */ { 5, s_2_4, -1, 2, 0}
140 | };
141 | 
142 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 }; /* vowel grouping, used with the range 97..246. NOTE(review): assuming bit k of the table stands for code 97+k, the set is a e i o u y plus codes 228, 229, 246 -- confirm against the grouping helpers */
143 | 
144 | static const unsigned char g_s_ending[] = { 119, 127, 149 }; /* grouping used with the range 98..121 in r_main_suffix. NOTE(review): under the same bit-layout assumption this is b c d f g h j k l m n o p r t v y */
145 | 
146 | static const symbol s_0[] = { 'l', 0xC3, 0xB6, 's' }; /* 4-byte replacement used by r_other_suffix case 2 (s_2_4 without its final 't') */
147 | static const symbol s_1[] = { 'f', 'u', 'l', 'l' }; /* 4-byte replacement used by r_other_suffix case 3 (s_2_3 without its final 't') */
148 | 
149 | static int r_mark_regions(struct SN_env * z) { /* Sets the marks z->I[1] (x) and z->I[0] (p1); returns 1 on success, 0 when skip_utf8 or either grouping scan reports a negative result. */
150 |     z->I[0] = z->l; /* default p1 = end limit; this value stays in place on every early return 0 below */
151 |     {   int c_test = z->c; /* test, line 29: the hop is only a lookahead, the cursor is restored on line 157 */
152 |         {   int ret = skip_utf8(z->p, z->c, 0, z->l, + 3);
153 |             if (ret < 0) return 0;
154 |             z->c = ret; /* hop, line 29 */
155 |         }
156 |         z->I[1] = z->c; /* setmark x, line 29 */
157 |         z->c = c_test;
158 |     }
159 |     if (out_grouping_U(z, g_v, 97, 246, 1) < 0) return 0; /* goto */ /* grouping v, line 30 */
160 |     {    /* gopast */ /* non v, line 30 */
161 |         int ret = in_grouping_U(z, g_v, 97, 246, 1);
162 |         if (ret < 0) return 0;
163 |         z->c += ret; /* the helper's non-negative result is added to the cursor */
164 |     }
165 |     z->I[0] = z->c; /* setmark p1, line 30 */
166 |      /* try, line 31 */
167 |     if (!(z->I[0] < z->I[1])) goto lab0; /* p1 already at or beyond x: keep it */
168 |     z->I[0] = z->I[1]; /* otherwise raise p1 to x */
169 | lab0:
170 |     return 1;
171 | }
172 | 
173 | static int r_main_suffix(struct SN_env * z) { /* Looks for an a_0 ending before the cursor, with the backward limit temporarily set to p1 (z->I[0]), and deletes it; returns 1 after a deletion, 0 if nothing applies, negative if slice_del fails. */
174 |     int among_var;
175 |     {   int mlimit; /* setlimit, line 37 */
176 |         int m1 = z->l - z->c; (void)m1;
177 |         if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
178 |         z->c = z->I[0]; /* tomark, line 37 */
179 |         mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
180 |         z->c = z->l - m1; /* cursor back to where it was on entry */
181 |         z->ket = z->c; /* [, line 37 */
182 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851442 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: needs at least one byte, and that byte must be one of a d e n r s t (bitmask over bytes 96..127), the last letters occurring in a_0 */
183 |         among_var = find_among_b(z, a_0, 37); /* substring, line 37 */
184 |         if (!(among_var)) { z->lb = mlimit; return 0; }
185 |         z->bra = z->c; /* ], line 37 */
186 |         z->lb = mlimit; /* restore the caller's backward limit */
187 |     }
188 |     switch(among_var) {
189 |         case 0: return 0;
190 |         case 1:
191 |             {   int ret = slice_del(z); /* delete, line 44 */
192 |                 if (ret < 0) return ret;
193 |             }
194 |             break;
195 |         case 2:
196 |             if (in_grouping_b_U(z, g_s_ending, 98, 121, 0)) return 0; /* a non-zero result from the g_s_ending check means: do not delete */
197 |             {   int ret = slice_del(z); /* delete, line 46 */
198 |                 if (ret < 0) return ret;
199 |             }
200 |             break;
201 |     }
202 |     return 1;
203 | }
204 | 
205 | static int r_consonant_pair(struct SN_env * z) { /* If one of the a_1 pairs ends just before the cursor (backward limit temporarily p1), deletes the slice from the cursor back to the position returned by skip_utf8; returns 1 on deletion, 0 if nothing applies, negative if slice_del fails. */
206 |     {   int mlimit; /* setlimit, line 50 */
207 |         int m1 = z->l - z->c; (void)m1;
208 |         if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
209 |         z->c = z->I[0]; /* tomark, line 50 */
210 |         mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
211 |         z->c = z->l - m1;
212 |         {   int m2 = z->l - z->c; (void)m2; /* and, line 52: the among test must not move the cursor, it is restored on line 215 */
213 |             if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1064976 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: needs at least two bytes, the last being one of d n t (the last letters occurring in a_1) */
214 |             if (!(find_among_b(z, a_1, 7))) { z->lb = mlimit; return 0; } /* among, line 51 */
215 |             z->c = z->l - m2;
216 |             z->ket = z->c; /* [, line 52 */
217 |             {   int ret = skip_utf8(z->p, z->c, z->lb, 0, -1);
218 |                 if (ret < 0) { z->lb = mlimit; return 0; }
219 |                 z->c = ret; /* next, line 52 */
220 |             }
221 |             z->bra = z->c; /* ], line 52 */
222 |             {   int ret = slice_del(z); /* delete, line 52 */
223 |                 if (ret < 0) return ret; /* error path: returns without restoring z->lb */
224 |             }
225 |         }
226 |         z->lb = mlimit;
227 |     }
228 |     return 1;
229 | }
230 | 
231 | static int r_other_suffix(struct SN_env * z) { /* Looks for an a_2 ending before the cursor (backward limit temporarily p1) and deletes or replaces it; returns 1 when an ending was handled, 0 if nothing applies, negative if a slice operation fails. */
232 |     int among_var;
233 |     {   int mlimit; /* setlimit, line 55 */
234 |         int m1 = z->l - z->c; (void)m1;
235 |         if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
236 |         z->c = z->I[0]; /* tomark, line 55 */
237 |         mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
238 |         z->c = z->l - m1;
239 |         z->ket = z->c; /* [, line 56 */
240 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: needs at least two bytes, the last being one of g s t (the last letters occurring in a_2) */
241 |         among_var = find_among_b(z, a_2, 5); /* substring, line 56 */
242 |         if (!(among_var)) { z->lb = mlimit; return 0; }
243 |         z->bra = z->c; /* ], line 56 */
244 |         switch(among_var) {
245 |             case 0: { z->lb = mlimit; return 0; }
246 |             case 1:
247 |                 {   int ret = slice_del(z); /* delete, line 57 */
248 |                     if (ret < 0) return ret; /* error path: returns without restoring z->lb */
249 |                 }
250 |                 break;
251 |             case 2:
252 |                 {   int ret = slice_from_s(z, 4, s_0); /* <-, line 58: replace the slice with the 4 bytes of s_0 */
253 |                     if (ret < 0) return ret;
254 |                 }
255 |                 break;
256 |             case 3:
257 |                 {   int ret = slice_from_s(z, 4, s_1); /* <-, line 59: replace the slice with the 4 bytes of s_1 */
258 |                     if (ret < 0) return ret;
259 |                 }
260 |                 break;
261 |         }
262 |         z->lb = mlimit; /* restore the caller's backward limit */
263 |     }
264 |     return 1;
265 | }
266 | 
267 | extern int swedish_UTF_8_stem(struct SN_env * z) { /* Entry point: runs r_mark_regions forwards, then r_main_suffix, r_consonant_pair and r_other_suffix backwards; returns 1, or a negative result propagated from a routine. */
268 |     {   int c1 = z->c; /* do, line 66: remember the cursor so it can be put back afterwards */
269 |         {   int ret = r_mark_regions(z);
270 |             if (ret == 0) goto lab0; /* call mark_regions, line 66; a 0 result is ignored */
271 |             if (ret < 0) return ret; /* a negative result aborts the whole call */
272 |         }
273 |     lab0:
274 |         z->c = c1;
275 |     }
276 |     z->lb = z->c; z->c = z->l; /* backwards, line 67: backward limit = current cursor, cursor = end limit */
277 | 
278 |     {   int m2 = z->l - z->c; (void)m2; /* do, line 68: cursor saved as a distance from z->l, so it stays valid if the routine changes z->l */
279 |         {   int ret = r_main_suffix(z);
280 |             if (ret == 0) goto lab1; /* call main_suffix, line 68; a 0 result is ignored */
281 |             if (ret < 0) return ret;
282 |         }
283 |     lab1:
284 |         z->c = z->l - m2;
285 |     }
286 |     {   int m3 = z->l - z->c; (void)m3; /* do, line 69 */
287 |         {   int ret = r_consonant_pair(z);
288 |             if (ret == 0) goto lab2; /* call consonant_pair, line 69; a 0 result is ignored */
289 |             if (ret < 0) return ret;
290 |         }
291 |     lab2:
292 |         z->c = z->l - m3;
293 |     }
294 |     {   int m4 = z->l - z->c; (void)m4; /* do, line 70 */
295 |         {   int ret = r_other_suffix(z);
296 |             if (ret == 0) goto lab3; /* call other_suffix, line 70; a 0 result is ignored */
297 |             if (ret < 0) return ret;
298 |         }
299 |     lab3:
300 |         z->c = z->l - m4;
301 |     }
302 |     z->c = z->lb; /* leave backwards mode: cursor goes back to the position saved in z->lb on line 276 */
303 |     return 1;
304 | }
305 | 
306 | extern struct SN_env * swedish_UTF_8_create_env(void) { return SN_create_env(0, 2, 0); } /* Returns whatever SN_create_env(0, 2, 0) returns; NOTE(review): the 2 presumably sizes z->I (this file uses I[0] and I[1] only) -- confirm in runtime/api.h */
307 | 
308 | extern void swedish_UTF_8_close_env(struct SN_env * z) { SN_close_env(z, 0); } /* Hands z to SN_close_env; the 0 matches the first argument given to SN_create_env above */
309 | 
310 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_ISO_8859_1_danish.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
  3 | 
  4 | #include "../runtime/header.h"
  5 | 
  6 | #ifdef __cplusplus
  7 | extern "C" {
  8 | #endif
  9 | extern int danish_ISO_8859_1_stem(struct SN_env * z);
 10 | #ifdef __cplusplus
 11 | }
 12 | #endif
 13 | static int r_undouble(struct SN_env * z);
 14 | static int r_other_suffix(struct SN_env * z);
 15 | static int r_consonant_pair(struct SN_env * z);
 16 | static int r_main_suffix(struct SN_env * z);
 17 | static int r_mark_regions(struct SN_env * z);
 18 | #ifdef __cplusplus
 19 | extern "C" {
 20 | #endif
 21 | 
 22 | 
 23 | extern struct SN_env * danish_ISO_8859_1_create_env(void);
 24 | extern void danish_ISO_8859_1_close_env(struct SN_env * z);
 25 | 
 26 | 
 27 | #ifdef __cplusplus
 28 | }
 29 | #endif
 30 | static const symbol s_0_0[3] = { 'h', 'e', 'd' }; /* s_0_N: byte strings of the main-suffix endings; s_0_N is referenced by entry N of a_0 below */
 31 | static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' };
 32 | static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' };
 33 | static const symbol s_0_3[1] = { 'e' };
 34 | static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' };
 35 | static const symbol s_0_5[4] = { 'e', 'n', 'd', 'e' };
 36 | static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' };
 37 | static const symbol s_0_7[3] = { 'e', 'n', 'e' };
 38 | static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' };
 39 | static const symbol s_0_9[3] = { 'e', 'r', 'e' };
 40 | static const symbol s_0_10[2] = { 'e', 'n' };
 41 | static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' };
 42 | static const symbol s_0_12[4] = { 'e', 'r', 'e', 'n' };
 43 | static const symbol s_0_13[2] = { 'e', 'r' };
 44 | static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' };
 45 | static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' };
 46 | static const symbol s_0_16[1] = { 's' };
 47 | static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' };
 48 | static const symbol s_0_18[2] = { 'e', 's' };
 49 | static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' };
 50 | static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' };
 51 | static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' };
 52 | static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' };
 53 | static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' };
 54 | static const symbol s_0_24[3] = { 'e', 'n', 's' };
 55 | static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' };
 56 | static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' };
 57 | static const symbol s_0_27[3] = { 'e', 'r', 's' };
 58 | static const symbol s_0_28[3] = { 'e', 't', 's' };
 59 | static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' };
 60 | static const symbol s_0_30[2] = { 'e', 't' };
 61 | static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' };
 62 | 
 63 | static const struct among a_0[32] = /* Table searched by find_among_b in r_main_suffix. Per entry: string length, string, then NOTE(review): presumably the index of a shorter entry that is a suffix of this one (or -1), then the code r_main_suffix switches on, then a field that is 0 throughout -- confirm field meanings in runtime/header.h */
 64 | {
 65 | /*  0 */ { 3, s_0_0, -1, 1, 0},
 66 | /*  1 */ { 5, s_0_1, 0, 1, 0},
 67 | /*  2 */ { 4, s_0_2, -1, 1, 0},
 68 | /*  3 */ { 1, s_0_3, -1, 1, 0},
 69 | /*  4 */ { 5, s_0_4, 3, 1, 0},
 70 | /*  5 */ { 4, s_0_5, 3, 1, 0},
 71 | /*  6 */ { 6, s_0_6, 5, 1, 0},
 72 | /*  7 */ { 3, s_0_7, 3, 1, 0},
 73 | /*  8 */ { 4, s_0_8, 3, 1, 0},
 74 | /*  9 */ { 3, s_0_9, 3, 1, 0},
 75 | /* 10 */ { 2, s_0_10, -1, 1, 0},
 76 | /* 11 */ { 5, s_0_11, 10, 1, 0},
 77 | /* 12 */ { 4, s_0_12, 10, 1, 0},
 78 | /* 13 */ { 2, s_0_13, -1, 1, 0},
 79 | /* 14 */ { 5, s_0_14, 13, 1, 0},
 80 | /* 15 */ { 4, s_0_15, 13, 1, 0},
 81 | /* 16 */ { 1, s_0_16, -1, 2, 0}, /* the only entry with code 2: a bare 's' goes through the extra g_s_ending check in r_main_suffix */
 82 | /* 17 */ { 4, s_0_17, 16, 1, 0},
 83 | /* 18 */ { 2, s_0_18, 16, 1, 0},
 84 | /* 19 */ { 5, s_0_19, 18, 1, 0},
 85 | /* 20 */ { 7, s_0_20, 19, 1, 0},
 86 | /* 21 */ { 4, s_0_21, 18, 1, 0},
 87 | /* 22 */ { 5, s_0_22, 18, 1, 0},
 88 | /* 23 */ { 4, s_0_23, 18, 1, 0},
 89 | /* 24 */ { 3, s_0_24, 16, 1, 0},
 90 | /* 25 */ { 6, s_0_25, 24, 1, 0},
 91 | /* 26 */ { 5, s_0_26, 24, 1, 0},
 92 | /* 27 */ { 3, s_0_27, 16, 1, 0},
 93 | /* 28 */ { 3, s_0_28, 16, 1, 0},
 94 | /* 29 */ { 5, s_0_29, 28, 1, 0},
 95 | /* 30 */ { 2, s_0_30, -1, 1, 0},
 96 | /* 31 */ { 4, s_0_31, 30, 1, 0}
 97 | };
 98 | 
 99 | static const symbol s_1_0[2] = { 'g', 'd' }; /* s_1_N: the two-letter consonant pairs tested by r_consonant_pair */
100 | static const symbol s_1_1[2] = { 'd', 't' };
101 | static const symbol s_1_2[2] = { 'g', 't' };
102 | static const symbol s_1_3[2] = { 'k', 't' };
103 | 
104 | static const struct among a_1[4] = /* Searched by find_among_b in r_consonant_pair, which only tests the result for zero/non-zero; every entry carries -1 in the fourth field */
105 | {
106 | /*  0 */ { 2, s_1_0, -1, -1, 0},
107 | /*  1 */ { 2, s_1_1, -1, -1, 0},
108 | /*  2 */ { 2, s_1_2, -1, -1, 0},
109 | /*  3 */ { 2, s_1_3, -1, -1, 0}
110 | };
111 | 
112 | static const symbol s_2_0[2] = { 'i', 'g' }; /* s_2_N: endings handled by the second part of r_other_suffix */
113 | static const symbol s_2_1[3] = { 'l', 'i', 'g' };
114 | static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' };
115 | static const symbol s_2_3[3] = { 'e', 'l', 's' };
116 | static const symbol s_2_4[4] = { 'l', 0xF8, 's', 't' }; /* 0xF8 is the ISO-8859-1 byte for o-with-stroke, so this is "l(o-stroke)st" */
117 | 
118 | static const struct among a_2[5] = /* Searched by find_among_b in r_other_suffix; the fourth field (1 or 2) matches the switch cases there: 1 = delete then try consonant_pair, 2 = replace with s_2 */
119 | {
120 | /*  0 */ { 2, s_2_0, -1, 1, 0},
121 | /*  1 */ { 3, s_2_1, 0, 1, 0},
122 | /*  2 */ { 4, s_2_2, 1, 1, 0},
123 | /*  3 */ { 3, s_2_3, -1, 1, 0},
124 | /*  4 */ { 4, s_2_4, -1, 2, 0}
125 | };
126 | 
127 | static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; /* vowel grouping, used with the range 97..248. NOTE(review): assuming bit k of the table stands for code 97+k, the set is a e i o u y plus codes 229, 230, 248 -- confirm against the grouping helpers */
128 | 
129 | static const unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 }; /* grouping used with the range 97..229 in r_main_suffix. NOTE(review): under the same bit-layout assumption this is a b c d f g h j k l m n o p r t v y z plus code 229 */
130 | 
131 | static const symbol s_0[] = { 's', 't' }; /* first string matched by eq_s_b at the start of r_other_suffix */
132 | static const symbol s_1[] = { 'i', 'g' }; /* second string matched by eq_s_b at the start of r_other_suffix */
133 | static const symbol s_2[] = { 'l', 0xF8, 's' }; /* 3-byte replacement used by r_other_suffix case 2 (s_2_4 without its final 't') */
134 | 
135 | static int r_mark_regions(struct SN_env * z) { /* Sets the marks z->I[1] (x) and z->I[0] (p1); returns 1 on success, 0 when the 3-byte hop does not fit or either grouping scan reports a negative result. */
136 |     z->I[0] = z->l; /* default p1 = end limit; this value stays in place on every early return 0 below */
137 |     {   int c_test = z->c; /* test, line 33: the hop is only a lookahead, the cursor is restored on line 143 */
138 |         {   int ret = z->c + 3; /* single-byte encoding: hopping 3 characters is plain arithmetic */
139 |             if (0 > ret || ret > z->l) return 0;
140 |             z->c = ret; /* hop, line 33 */
141 |         }
142 |         z->I[1] = z->c; /* setmark x, line 33 */
143 |         z->c = c_test;
144 |     }
145 |     if (out_grouping(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 34 */
146 |     {    /* gopast */ /* non v, line 34 */
147 |         int ret = in_grouping(z, g_v, 97, 248, 1);
148 |         if (ret < 0) return 0;
149 |         z->c += ret; /* the helper's non-negative result is added to the cursor */
150 |     }
151 |     z->I[0] = z->c; /* setmark p1, line 34 */
152 |      /* try, line 35 */
153 |     if (!(z->I[0] < z->I[1])) goto lab0; /* p1 already at or beyond x: keep it */
154 |     z->I[0] = z->I[1]; /* otherwise raise p1 to x */
155 | lab0:
156 |     return 1;
157 | }
158 | 
159 | static int r_main_suffix(struct SN_env * z) { /* Looks for an a_0 ending before the cursor, with the backward limit temporarily set to p1 (z->I[0]), and deletes it; returns 1 after a deletion, 0 if nothing applies, negative if slice_del fails. */
160 |     int among_var;
161 |     {   int mlimit; /* setlimit, line 41 */
162 |         int m1 = z->l - z->c; (void)m1;
163 |         if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
164 |         z->c = z->I[0]; /* tomark, line 41 */
165 |         mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
166 |         z->c = z->l - m1; /* cursor back to where it was on entry */
167 |         z->ket = z->c; /* [, line 41 */
168 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851440 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: needs at least one byte, and that byte must be one of d e n r s t (bitmask over bytes 96..127), the last letters occurring in a_0 */
169 |         among_var = find_among_b(z, a_0, 32); /* substring, line 41 */
170 |         if (!(among_var)) { z->lb = mlimit; return 0; }
171 |         z->bra = z->c; /* ], line 41 */
172 |         z->lb = mlimit; /* restore the caller's backward limit */
173 |     }
174 |     switch(among_var) {
175 |         case 0: return 0;
176 |         case 1:
177 |             {   int ret = slice_del(z); /* delete, line 48 */
178 |                 if (ret < 0) return ret;
179 |             }
180 |             break;
181 |         case 2:
182 |             if (in_grouping_b(z, g_s_ending, 97, 229, 0)) return 0; /* a non-zero result from the g_s_ending check means: do not delete */
183 |             {   int ret = slice_del(z); /* delete, line 50 */
184 |                 if (ret < 0) return ret;
185 |             }
186 |             break;
187 |     }
188 |     return 1;
189 | }
190 | 
191 | static int r_consonant_pair(struct SN_env * z) { /* If one of the a_1 pairs ends just before the cursor (backward limit temporarily p1), deletes the single byte before the cursor; returns 1 on deletion, 0 if nothing applies, negative if slice_del fails. */
192 |     {   int m_test = z->l - z->c; /* test, line 55: the among match is only a lookahead, the cursor is restored on line 205 */
193 |         {   int mlimit; /* setlimit, line 56 */
194 |             int m1 = z->l - z->c; (void)m1;
195 |             if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
196 |             z->c = z->I[0]; /* tomark, line 56 */
197 |             mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
198 |             z->c = z->l - m1;
199 |             z->ket = z->c; /* [, line 56 */
200 |             if (z->c - 1 <= z->lb || (z->p[z->c - 1] != 100 && z->p[z->c - 1] != 116)) { z->lb = mlimit; return 0; } /* quick reject: needs at least two bytes, the last being 'd' (100) or 't' (116), the last letters occurring in a_1 */
201 |             if (!(find_among_b(z, a_1, 4))) { z->lb = mlimit; return 0; } /* substring, line 56 */
202 |             z->bra = z->c; /* ], line 56 */
203 |             z->lb = mlimit; /* restore the caller's backward limit */
204 |         }
205 |         z->c = z->l - m_test;
206 |     }
207 |     if (z->c <= z->lb) return 0; /* no byte left to step over (checked against the restored limit) */
208 |     z->c--; /* next, line 62 */
209 |     z->bra = z->c; /* ], line 62: overrides the bra set on line 202, so the slice is now just the last byte */
210 |     {   int ret = slice_del(z); /* delete, line 62 */
211 |         if (ret < 0) return ret;
212 |     }
213 |     return 1;
214 | }
215 | 
216 | static int r_other_suffix(struct SN_env * z) { /* Two steps: first an optional deletion of s_0 when s_1 matches right after it in the backward scan, then a search for an a_2 ending (backward limit temporarily p1) which is deleted or replaced; returns 1 when an a_2 ending was handled, 0 otherwise, negative on a slice failure. */
217 |     int among_var;
218 |     {   int m1 = z->l - z->c; (void)m1; /* do, line 66: failure of this step is ignored and the cursor restored on line 227 */
219 |         z->ket = z->c; /* [, line 66 */
220 |         if (!(eq_s_b(z, 2, s_0))) goto lab0; /* NOTE(review): eq_s_b presumably matches the 2 bytes of s_0 ending at the cursor and steps back over them -- confirm in runtime */
221 |         z->bra = z->c; /* ], line 66 */
222 |         if (!(eq_s_b(z, 2, s_1))) goto lab0; /* s_1 is only tested here; the slice marked by ket/bra was fixed before this call */
223 |         {   int ret = slice_del(z); /* delete, line 66 */
224 |             if (ret < 0) return ret;
225 |         }
226 |     lab0:
227 |         z->c = z->l - m1;
228 |     }
229 |     {   int mlimit; /* setlimit, line 67 */
230 |         int m2 = z->l - z->c; (void)m2;
231 |         if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
232 |         z->c = z->I[0]; /* tomark, line 67 */
233 |         mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
234 |         z->c = z->l - m2;
235 |         z->ket = z->c; /* [, line 67 */
236 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; } /* quick reject: needs at least two bytes, the last being one of g s t (the last letters occurring in a_2) */
237 |         among_var = find_among_b(z, a_2, 5); /* substring, line 67 */
238 |         if (!(among_var)) { z->lb = mlimit; return 0; }
239 |         z->bra = z->c; /* ], line 67 */
240 |         z->lb = mlimit; /* restore the caller's backward limit */
241 |     }
242 |     switch(among_var) {
243 |         case 0: return 0;
244 |         case 1:
245 |             {   int ret = slice_del(z); /* delete, line 70 */
246 |                 if (ret < 0) return ret;
247 |             }
248 |             {   int m3 = z->l - z->c; (void)m3; /* do, line 70: after the deletion, try consonant_pair; a 0 result is ignored */
249 |                 {   int ret = r_consonant_pair(z);
250 |                     if (ret == 0) goto lab1; /* call consonant_pair, line 70 */
251 |                     if (ret < 0) return ret;
252 |                 }
253 |             lab1:
254 |                 z->c = z->l - m3;
255 |             }
256 |             break;
257 |         case 2:
258 |             {   int ret = slice_from_s(z, 3, s_2); /* <-, line 72: replace the slice with the 3 bytes of s_2 */
259 |                 if (ret < 0) return ret;
260 |             }
261 |             break;
262 |     }
263 |     return 1;
264 | }
265 | 
266 | static int r_undouble(struct SN_env * z) { /* Stores the slice accepted by out_grouping_b (backward limit temporarily p1) in z->S[0]; if eq_v_b then matches z->S[0] again, deletes the slice. Returns 1 on deletion, 0 if nothing applies, -1 if slice_to fails, or slice_del's negative result. */
267 |     {   int mlimit; /* setlimit, line 76 */
268 |         int m1 = z->l - z->c; (void)m1;
269 |         if (z->c < z->I[0]) return 0; /* cursor lies before p1: nothing to do */
270 |         z->c = z->I[0]; /* tomark, line 76 */
271 |         mlimit = z->lb; z->lb = z->c; /* save the old backward limit and use p1 instead */
272 |         z->c = z->l - m1;
273 |         z->ket = z->c; /* [, line 76 */
274 |         if (out_grouping_b(z, g_v, 97, 248, 0)) { z->lb = mlimit; return 0; } /* a non-zero result from the non-vowel check means: give up */
275 |         z->bra = z->c; /* ], line 76 */
276 |         z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */
277 |         if (z->S[0] == 0) return -1; /* -> ch, line 76; error path: returns without restoring z->lb */
278 |         z->lb = mlimit; /* restore the caller's backward limit */
279 |     }
280 |     if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */
281 |     {   int ret = slice_del(z); /* delete, line 78: removes the slice marked on lines 273-275, not the text matched on line 280 */
282 |         if (ret < 0) return ret;
283 |     }
284 |     return 1;
285 | }
286 | 
287 | extern int danish_ISO_8859_1_stem(struct SN_env * z) { /* Entry point: runs r_mark_regions forwards, then r_main_suffix, r_consonant_pair, r_other_suffix and r_undouble backwards; returns 1, or a negative result propagated from a routine. */
288 |     {   int c1 = z->c; /* do, line 84: remember the cursor so it can be put back afterwards */
289 |         {   int ret = r_mark_regions(z);
290 |             if (ret == 0) goto lab0; /* call mark_regions, line 84; a 0 result is ignored */
291 |             if (ret < 0) return ret; /* a negative result aborts the whole call */
292 |         }
293 |     lab0:
294 |         z->c = c1;
295 |     }
296 |     z->lb = z->c; z->c = z->l; /* backwards, line 85: backward limit = current cursor, cursor = end limit */
297 | 
298 |     {   int m2 = z->l - z->c; (void)m2; /* do, line 86: cursor saved as a distance from z->l, so it stays valid if the routine changes z->l */
299 |         {   int ret = r_main_suffix(z);
300 |             if (ret == 0) goto lab1; /* call main_suffix, line 86; a 0 result is ignored */
301 |             if (ret < 0) return ret;
302 |         }
303 |     lab1:
304 |         z->c = z->l - m2;
305 |     }
306 |     {   int m3 = z->l - z->c; (void)m3; /* do, line 87 */
307 |         {   int ret = r_consonant_pair(z);
308 |             if (ret == 0) goto lab2; /* call consonant_pair, line 87; a 0 result is ignored */
309 |             if (ret < 0) return ret;
310 |         }
311 |     lab2:
312 |         z->c = z->l - m3;
313 |     }
314 |     {   int m4 = z->l - z->c; (void)m4; /* do, line 88 */
315 |         {   int ret = r_other_suffix(z);
316 |             if (ret == 0) goto lab3; /* call other_suffix, line 88; a 0 result is ignored */
317 |             if (ret < 0) return ret;
318 |         }
319 |     lab3:
320 |         z->c = z->l - m4;
321 |     }
322 |     {   int m5 = z->l - z->c; (void)m5; /* do, line 89 */
323 |         {   int ret = r_undouble(z);
324 |             if (ret == 0) goto lab4; /* call undouble, line 89; a 0 result is ignored */
325 |             if (ret < 0) return ret;
326 |         }
327 |     lab4:
328 |         z->c = z->l - m5;
329 |     }
330 |     z->c = z->lb; /* leave backwards mode: cursor goes back to the position saved in z->lb on line 296 */
331 |     return 1;
332 | }
333 | 
334 | extern struct SN_env * danish_ISO_8859_1_create_env(void) { return SN_create_env(1, 2, 0); } /* Returns whatever SN_create_env(1, 2, 0) returns; NOTE(review): the 1 presumably reserves z->S[0] (used by r_undouble) and the 2 sizes z->I (I[0], I[1]) -- confirm in runtime/api.h */
335 | 
336 | extern void danish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z, 1); } /* Hands z to SN_close_env; the 1 matches the first argument given to SN_create_env above */
337 | 
338 | 


--------------------------------------------------------------------------------
/libstemmer_c/src_c/stem_UTF_8_danish.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* This file was generated automatically by the Snowball to ANSI C compiler */
  3 | 
  4 | #include "../runtime/header.h"
  5 | 
  6 | #ifdef __cplusplus
  7 | extern "C" {
  8 | #endif
  9 | extern int danish_UTF_8_stem(struct SN_env * z);
 10 | #ifdef __cplusplus
 11 | }
 12 | #endif
 13 | static int r_undouble(struct SN_env * z);
 14 | static int r_other_suffix(struct SN_env * z);
 15 | static int r_consonant_pair(struct SN_env * z);
 16 | static int r_main_suffix(struct SN_env * z);
 17 | static int r_mark_regions(struct SN_env * z);
 18 | #ifdef __cplusplus
 19 | extern "C" {
 20 | #endif
 21 | 
 22 | 
 23 | extern struct SN_env * danish_UTF_8_create_env(void);
 24 | extern void danish_UTF_8_close_env(struct SN_env * z);
 25 | 
 26 | 
 27 | #ifdef __cplusplus
 28 | }
 29 | #endif
 30 | static const symbol s_0_0[3] = { 'h', 'e', 'd' }; /* s_0_N: byte strings of the main-suffix endings; s_0_N is referenced by entry N of a_0 below */
 31 | static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' };
 32 | static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' };
 33 | static const symbol s_0_3[1] = { 'e' };
 34 | static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' };
 35 | static const symbol s_0_5[4] = { 'e', 'n', 'd', 'e' };
 36 | static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' };
 37 | static const symbol s_0_7[3] = { 'e', 'n', 'e' };
 38 | static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' };
 39 | static const symbol s_0_9[3] = { 'e', 'r', 'e' };
 40 | static const symbol s_0_10[2] = { 'e', 'n' };
 41 | static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' };
 42 | static const symbol s_0_12[4] = { 'e', 'r', 'e', 'n' };
 43 | static const symbol s_0_13[2] = { 'e', 'r' };
 44 | static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' };
 45 | static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' };
 46 | static const symbol s_0_16[1] = { 's' };
 47 | static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' };
 48 | static const symbol s_0_18[2] = { 'e', 's' };
 49 | static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' };
 50 | static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' };
 51 | static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' };
 52 | static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' };
 53 | static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' };
 54 | static const symbol s_0_24[3] = { 'e', 'n', 's' };
 55 | static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' };
 56 | static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' };
 57 | static const symbol s_0_27[3] = { 'e', 'r', 's' };
 58 | static const symbol s_0_28[3] = { 'e', 't', 's' };
 59 | static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' };
 60 | static const symbol s_0_30[2] = { 'e', 't' };
 61 | static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' };
 62 | 
 63 | static const struct among a_0[32] = /* Table searched by find_among_b in r_main_suffix. Per entry: string length, string, then NOTE(review): presumably the index of a shorter entry that is a suffix of this one (or -1), then a result code (1 everywhere except entry 16, which has 2), then a field that is 0 throughout -- confirm field meanings in runtime/header.h */
 64 | {
 65 | /*  0 */ { 3, s_0_0, -1, 1, 0},
 66 | /*  1 */ { 5, s_0_1, 0, 1, 0},
 67 | /*  2 */ { 4, s_0_2, -1, 1, 0},
 68 | /*  3 */ { 1, s_0_3, -1, 1, 0},
 69 | /*  4 */ { 5, s_0_4, 3, 1, 0},
 70 | /*  5 */ { 4, s_0_5, 3, 1, 0},
 71 | /*  6 */ { 6, s_0_6, 5, 1, 0},
 72 | /*  7 */ { 3, s_0_7, 3, 1, 0},
 73 | /*  8 */ { 4, s_0_8, 3, 1, 0},
 74 | /*  9 */ { 3, s_0_9, 3, 1, 0},
 75 | /* 10 */ { 2, s_0_10, -1, 1, 0},
 76 | /* 11 */ { 5, s_0_11, 10, 1, 0},
 77 | /* 12 */ { 4, s_0_12, 10, 1, 0},
 78 | /* 13 */ { 2, s_0_13, -1, 1, 0},
 79 | /* 14 */ { 5, s_0_14, 13, 1, 0},
 80 | /* 15 */ { 4, s_0_15, 13, 1, 0},
 81 | /* 16 */ { 1, s_0_16, -1, 2, 0},
 82 | /* 17 */ { 4, s_0_17, 16, 1, 0},
 83 | /* 18 */ { 2, s_0_18, 16, 1, 0},
 84 | /* 19 */ { 5, s_0_19, 18, 1, 0},
 85 | /* 20 */ { 7, s_0_20, 19, 1, 0},
 86 | /* 21 */ { 4, s_0_21, 18, 1, 0},
 87 | /* 22 */ { 5, s_0_22, 18, 1, 0},
 88 | /* 23 */ { 4, s_0_23, 18, 1, 0},
 89 | /* 24 */ { 3, s_0_24, 16, 1, 0},
 90 | /* 25 */ { 6, s_0_25, 24, 1, 0},
 91 | /* 26 */ { 5, s_0_26, 24, 1, 0},
 92 | /* 27 */ { 3, s_0_27, 16, 1, 0},
 93 | /* 28 */ { 3, s_0_28, 16, 1, 0},
 94 | /* 29 */ { 5, s_0_29, 28, 1, 0},
 95 | /* 30 */ { 2, s_0_30, -1, 1, 0},
 96 | /* 31 */ { 4, s_0_31, 30, 1, 0}
 97 | };
 98 | 
 99 | static const symbol s_1_0[2] = { 'g', 'd' }; /* s_1_N: two-letter consonant pairs referenced by a_1 below */
100 | static const symbol s_1_1[2] = { 'd', 't' };
101 | static const symbol s_1_2[2] = { 'g', 't' };
102 | static const symbol s_1_3[2] = { 'k', 't' };
103 | 
104 | static const struct among a_1[4] = /* every entry has length 2 and carries -1 in the fourth field; NOTE(review): its consumer (presumably r_consonant_pair) lies outside this excerpt */
105 | {
106 | /*  0 */ { 2, s_1_0, -1, -1, 0},
107 | /*  1 */ { 2, s_1_1, -1, -1, 0},
108 | /*  2 */ { 2, s_1_2, -1, -1, 0},
109 | /*  3 */ { 2, s_1_3, -1, -1, 0}
110 | };
111 | 
112 | static const symbol s_2_0[2] = { 'i', 'g' }; /* s_2_N: endings referenced by a_2 below */
113 | static const symbol s_2_1[3] = { 'l', 'i', 'g' };
114 | static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' };
115 | static const symbol s_2_3[3] = { 'e', 'l', 's' };
116 | static const symbol s_2_4[5] = { 'l', 0xC3, 0xB8, 's', 't' }; /* 0xC3 0xB8 is the UTF-8 encoding of U+00F8, so this is "l(o-stroke)st" in 5 bytes */
117 | 
118 | static const struct among a_2[5] = /* fourth field is 1 for entries 0-3 and 2 for entry 4; NOTE(review): its consumer (presumably r_other_suffix) lies outside this excerpt */
119 | {
120 | /*  0 */ { 2, s_2_0, -1, 1, 0},
121 | /*  1 */ { 3, s_2_1, 0, 1, 0},
122 | /*  2 */ { 4, s_2_2, 1, 1, 0},
123 | /*  3 */ { 3, s_2_3, -1, 1, 0},
124 | /*  4 */ { 5, s_2_4, -1, 2, 0}
125 | };
126 | 
/* Bitmap passed to the *_grouping*_U helpers together with min 97 and
 * max 248; bit (ch - 97) is set when character ch belongs to the grouping.
 * The Snowball source comments on the callers refer to this grouping as "v". */
static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };

/* Bitmap used by r_main_suffix with min 97 and max 229 (grouping "s_ending"). */
static const unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 };

/* Literal strings used by r_other_suffix: s_0 and s_1 are compared with
 * eq_s_b, s_2 (4 symbols, containing the two-byte UTF-8 sequence C3 B8) is
 * the replacement text passed to slice_from_s. */
static const symbol s_0[] = { 's', 't' };
static const symbol s_1[] = { 'i', 'g' };
static const symbol s_2[] = { 'l', 0xC3, 0xB8, 's' };
134 | 
135 | static int r_mark_regions(struct SN_env * z) {
136 |     z->I[0] = z->l;
137 |     {   int c_test = z->c; /* test, line 33 */
138 |         {   int ret = skip_utf8(z->p, z->c, 0, z->l, + 3);
139 |             if (ret < 0) return 0;
140 |             z->c = ret; /* hop, line 33 */
141 |         }
142 |         z->I[1] = z->c; /* setmark x, line 33 */
143 |         z->c = c_test;
144 |     }
145 |     if (out_grouping_U(z, g_v, 97, 248, 1) < 0) return 0; /* goto */ /* grouping v, line 34 */
146 |     {    /* gopast */ /* non v, line 34 */
147 |         int ret = in_grouping_U(z, g_v, 97, 248, 1);
148 |         if (ret < 0) return 0;
149 |         z->c += ret;
150 |     }
151 |     z->I[0] = z->c; /* setmark p1, line 34 */
152 |      /* try, line 35 */
153 |     if (!(z->I[0] < z->I[1])) goto lab0;
154 |     z->I[0] = z->I[1];
155 | lab0:
156 |     return 1;
157 | }
158 | 
159 | static int r_main_suffix(struct SN_env * z) {
160 |     int among_var;
161 |     {   int mlimit; /* setlimit, line 41 */
162 |         int m1 = z->l - z->c; (void)m1;
163 |         if (z->c < z->I[0]) return 0;
164 |         z->c = z->I[0]; /* tomark, line 41 */
165 |         mlimit = z->lb; z->lb = z->c;
166 |         z->c = z->l - m1;
167 |         z->ket = z->c; /* [, line 41 */
168 |         if (z->c <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1851440 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
169 |         among_var = find_among_b(z, a_0, 32); /* substring, line 41 */
170 |         if (!(among_var)) { z->lb = mlimit; return 0; }
171 |         z->bra = z->c; /* ], line 41 */
172 |         z->lb = mlimit;
173 |     }
174 |     switch(among_var) {
175 |         case 0: return 0;
176 |         case 1:
177 |             {   int ret = slice_del(z); /* delete, line 48 */
178 |                 if (ret < 0) return ret;
179 |             }
180 |             break;
181 |         case 2:
182 |             if (in_grouping_b_U(z, g_s_ending, 97, 229, 0)) return 0;
183 |             {   int ret = slice_del(z); /* delete, line 50 */
184 |                 if (ret < 0) return ret;
185 |             }
186 |             break;
187 |     }
188 |     return 1;
189 | }
190 | 
191 | static int r_consonant_pair(struct SN_env * z) {
192 |     {   int m_test = z->l - z->c; /* test, line 55 */
193 |         {   int mlimit; /* setlimit, line 56 */
194 |             int m1 = z->l - z->c; (void)m1;
195 |             if (z->c < z->I[0]) return 0;
196 |             z->c = z->I[0]; /* tomark, line 56 */
197 |             mlimit = z->lb; z->lb = z->c;
198 |             z->c = z->l - m1;
199 |             z->ket = z->c; /* [, line 56 */
200 |             if (z->c - 1 <= z->lb || (z->p[z->c - 1] != 100 && z->p[z->c - 1] != 116)) { z->lb = mlimit; return 0; }
201 |             if (!(find_among_b(z, a_1, 4))) { z->lb = mlimit; return 0; } /* substring, line 56 */
202 |             z->bra = z->c; /* ], line 56 */
203 |             z->lb = mlimit;
204 |         }
205 |         z->c = z->l - m_test;
206 |     }
207 |     {   int ret = skip_utf8(z->p, z->c, z->lb, 0, -1);
208 |         if (ret < 0) return 0;
209 |         z->c = ret; /* next, line 62 */
210 |     }
211 |     z->bra = z->c; /* ], line 62 */
212 |     {   int ret = slice_del(z); /* delete, line 62 */
213 |         if (ret < 0) return ret;
214 |     }
215 |     return 1;
216 | }
217 | 
218 | static int r_other_suffix(struct SN_env * z) {
219 |     int among_var;
220 |     {   int m1 = z->l - z->c; (void)m1; /* do, line 66 */
221 |         z->ket = z->c; /* [, line 66 */
222 |         if (!(eq_s_b(z, 2, s_0))) goto lab0;
223 |         z->bra = z->c; /* ], line 66 */
224 |         if (!(eq_s_b(z, 2, s_1))) goto lab0;
225 |         {   int ret = slice_del(z); /* delete, line 66 */
226 |             if (ret < 0) return ret;
227 |         }
228 |     lab0:
229 |         z->c = z->l - m1;
230 |     }
231 |     {   int mlimit; /* setlimit, line 67 */
232 |         int m2 = z->l - z->c; (void)m2;
233 |         if (z->c < z->I[0]) return 0;
234 |         z->c = z->I[0]; /* tomark, line 67 */
235 |         mlimit = z->lb; z->lb = z->c;
236 |         z->c = z->l - m2;
237 |         z->ket = z->c; /* [, line 67 */
238 |         if (z->c - 1 <= z->lb || z->p[z->c - 1] >> 5 != 3 || !((1572992 >> (z->p[z->c - 1] & 0x1f)) & 1)) { z->lb = mlimit; return 0; }
239 |         among_var = find_among_b(z, a_2, 5); /* substring, line 67 */
240 |         if (!(among_var)) { z->lb = mlimit; return 0; }
241 |         z->bra = z->c; /* ], line 67 */
242 |         z->lb = mlimit;
243 |     }
244 |     switch(among_var) {
245 |         case 0: return 0;
246 |         case 1:
247 |             {   int ret = slice_del(z); /* delete, line 70 */
248 |                 if (ret < 0) return ret;
249 |             }
250 |             {   int m3 = z->l - z->c; (void)m3; /* do, line 70 */
251 |                 {   int ret = r_consonant_pair(z);
252 |                     if (ret == 0) goto lab1; /* call consonant_pair, line 70 */
253 |                     if (ret < 0) return ret;
254 |                 }
255 |             lab1:
256 |                 z->c = z->l - m3;
257 |             }
258 |             break;
259 |         case 2:
260 |             {   int ret = slice_from_s(z, 4, s_2); /* <-, line 72 */
261 |                 if (ret < 0) return ret;
262 |             }
263 |             break;
264 |     }
265 |     return 1;
266 | }
267 | 
268 | static int r_undouble(struct SN_env * z) {
269 |     {   int mlimit; /* setlimit, line 76 */
270 |         int m1 = z->l - z->c; (void)m1;
271 |         if (z->c < z->I[0]) return 0;
272 |         z->c = z->I[0]; /* tomark, line 76 */
273 |         mlimit = z->lb; z->lb = z->c;
274 |         z->c = z->l - m1;
275 |         z->ket = z->c; /* [, line 76 */
276 |         if (out_grouping_b_U(z, g_v, 97, 248, 0)) { z->lb = mlimit; return 0; }
277 |         z->bra = z->c; /* ], line 76 */
278 |         z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */
279 |         if (z->S[0] == 0) return -1; /* -> ch, line 76 */
280 |         z->lb = mlimit;
281 |     }
282 |     if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */
283 |     {   int ret = slice_del(z); /* delete, line 78 */
284 |         if (ret < 0) return ret;
285 |     }
286 |     return 1;
287 | }
288 | 
289 | extern int danish_UTF_8_stem(struct SN_env * z) {
290 |     {   int c1 = z->c; /* do, line 84 */
291 |         {   int ret = r_mark_regions(z);
292 |             if (ret == 0) goto lab0; /* call mark_regions, line 84 */
293 |             if (ret < 0) return ret;
294 |         }
295 |     lab0:
296 |         z->c = c1;
297 |     }
298 |     z->lb = z->c; z->c = z->l; /* backwards, line 85 */
299 | 
300 |     {   int m2 = z->l - z->c; (void)m2; /* do, line 86 */
301 |         {   int ret = r_main_suffix(z);
302 |             if (ret == 0) goto lab1; /* call main_suffix, line 86 */
303 |             if (ret < 0) return ret;
304 |         }
305 |     lab1:
306 |         z->c = z->l - m2;
307 |     }
308 |     {   int m3 = z->l - z->c; (void)m3; /* do, line 87 */
309 |         {   int ret = r_consonant_pair(z);
310 |             if (ret == 0) goto lab2; /* call consonant_pair, line 87 */
311 |             if (ret < 0) return ret;
312 |         }
313 |     lab2:
314 |         z->c = z->l - m3;
315 |     }
316 |     {   int m4 = z->l - z->c; (void)m4; /* do, line 88 */
317 |         {   int ret = r_other_suffix(z);
318 |             if (ret == 0) goto lab3; /* call other_suffix, line 88 */
319 |             if (ret < 0) return ret;
320 |         }
321 |     lab3:
322 |         z->c = z->l - m4;
323 |     }
324 |     {   int m5 = z->l - z->c; (void)m5; /* do, line 89 */
325 |         {   int ret = r_undouble(z);
326 |             if (ret == 0) goto lab4; /* call undouble, line 89 */
327 |             if (ret < 0) return ret;
328 |         }
329 |     lab4:
330 |         z->c = z->l - m5;
331 |     }
332 |     z->c = z->lb;
333 |     return 1;
334 | }
335 | 
/* Allocate the environment for this stemmer through SN_create_env(1, 2, 0).
 * The stemmer uses S[0], I[0] and I[1], so the first two arguments are
 * presumably the string and integer slot counts -- TODO confirm in api.h. */
extern struct SN_env * danish_UTF_8_create_env(void) {
    struct SN_env * env = SN_create_env(1, 2, 0);
    return env;
}
337 | 
/* Release an environment obtained from danish_UTF_8_create_env; the second
 * argument of SN_close_env matches the first one given to SN_create_env. */
extern void danish_UTF_8_close_env(struct SN_env * z) {
    SN_close_env(z, 1);
}
339 | 
340 | 


--------------------------------------------------------------------------------
/libstemmer_c/libstemmer/modules.h:
--------------------------------------------------------------------------------
  1 | /* libstemmer/modules.h: List of stemming modules.
  2 |  *
  3 |  * This file is generated by mkmodules.pl from a list of module names.
  4 |  * Do not edit manually.
  5 |  *
  6 |  * Modules included by this file are: danish, dutch, english, finnish, french,
  7 |  * german, hungarian, italian, norwegian, porter, portuguese, romanian,
  8 |  * russian, spanish, swedish, turkish
  9 |  */
 10 | 
 11 | #include "../src_c/stem_ISO_8859_1_danish.h"
 12 | #include "../src_c/stem_UTF_8_danish.h"
 13 | #include "../src_c/stem_ISO_8859_1_dutch.h"
 14 | #include "../src_c/stem_UTF_8_dutch.h"
 15 | #include "../src_c/stem_ISO_8859_1_english.h"
 16 | #include "../src_c/stem_UTF_8_english.h"
 17 | #include "../src_c/stem_ISO_8859_1_finnish.h"
 18 | #include "../src_c/stem_UTF_8_finnish.h"
 19 | #include "../src_c/stem_ISO_8859_1_french.h"
 20 | #include "../src_c/stem_UTF_8_french.h"
 21 | #include "../src_c/stem_ISO_8859_1_german.h"
 22 | #include "../src_c/stem_UTF_8_german.h"
 23 | #include "../src_c/stem_ISO_8859_1_hungarian.h"
 24 | #include "../src_c/stem_UTF_8_hungarian.h"
 25 | #include "../src_c/stem_ISO_8859_1_italian.h"
 26 | #include "../src_c/stem_UTF_8_italian.h"
 27 | #include "../src_c/stem_ISO_8859_1_norwegian.h"
 28 | #include "../src_c/stem_UTF_8_norwegian.h"
 29 | #include "../src_c/stem_ISO_8859_1_porter.h"
 30 | #include "../src_c/stem_UTF_8_porter.h"
 31 | #include "../src_c/stem_ISO_8859_1_portuguese.h"
 32 | #include "../src_c/stem_UTF_8_portuguese.h"
 33 | #include "../src_c/stem_ISO_8859_2_romanian.h"
 34 | #include "../src_c/stem_UTF_8_romanian.h"
 35 | #include "../src_c/stem_KOI8_R_russian.h"
 36 | #include "../src_c/stem_UTF_8_russian.h"
 37 | #include "../src_c/stem_ISO_8859_1_spanish.h"
 38 | #include "../src_c/stem_UTF_8_spanish.h"
 39 | #include "../src_c/stem_ISO_8859_1_swedish.h"
 40 | #include "../src_c/stem_UTF_8_swedish.h"
 41 | #include "../src_c/stem_UTF_8_turkish.h"
 42 | 
/* Character encodings a stemmer module can be built for.  ENC_UNKNOWN is 0
 * and doubles as the value stored in the terminating rows of the encodings
 * and modules tables below. */
typedef enum {
  ENC_UNKNOWN=0,
  ENC_ISO_8859_1,
  ENC_ISO_8859_2,
  ENC_KOI8_R,
  ENC_UTF_8
} stemmer_encoding_t;
 50 | 
/* Pairs an encoding name string with its stemmer_encoding_t value. */
struct stemmer_encoding {
  const char * name;
  stemmer_encoding_t enc;
};
/* Name-to-encoding table; the row with a null name marks the end. */
static struct stemmer_encoding encodings[] = {
  {"ISO_8859_1", ENC_ISO_8859_1},
  {"ISO_8859_2", ENC_ISO_8859_2},
  {"KOI8_R", ENC_KOI8_R},
  {"UTF_8", ENC_UTF_8},
  {0,ENC_UNKNOWN}
};
 62 | 
/* One row of the module table: a language name or code, the encoding the
 * row applies to, and the three entry points of the matching stemmer. */
struct stemmer_modules {
  const char * name;
  stemmer_encoding_t enc; 
  struct SN_env * (*create)(void);  /* allocates the stemmer environment */
  void (*close)(struct SN_env *);   /* releases an environment from create */
  int (*stem)(struct SN_env *);     /* runs the stemmer on the environment */
};
/* Every (name, encoding) combination that is available.  A language appears
 * under several names (short codes and the full algorithm name), each
 * pointing at the same functions.  Rows are in ascending order of name; the
 * row with a null name marks the end. */
static struct stemmer_modules modules[] = {
  {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
  {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
  {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
  {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
  {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
  {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
  {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
  {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
  {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
  {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
  {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
  {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
  {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
  {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
  {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
  {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
  {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
  {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
  {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
  {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
  {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
  {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
  {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
  {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
  {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
  {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
  {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
  {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
  {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
  {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
  {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
  {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
  {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
  {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
  {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
  {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
  {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
  {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
  {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
  {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
  {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
  {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
  {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
  {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
  {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
  {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
  {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
  {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
  {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
  {0,ENC_UNKNOWN,0,0,0}
};
/* Full names of the stemming algorithms included in this build, one per
 * language, terminated by a null pointer. */
static const char * algorithm_names[] = {
  "danish", 
  "dutch", 
  "english", 
  "finnish", 
  "french", 
  "german", 
  "hungarian", 
  "italian", 
  "norwegian", 
  "porter", 
  "portuguese", 
  "romanian", 
  "russian", 
  "spanish", 
  "swedish", 
  "turkish", 
  0
};
191 | 


--------------------------------------------------------------------------------
/libstemmer_c/runtime/utilities_sq3.c:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | 
  6 | #include <sqlite3.h>
  7 | 
  8 | #include "header.h"
  9 | 
 10 | #define unless(C) if(!(C))
 11 | 
 12 | #define CREATE_SIZE 1
 13 | 
 14 | extern symbol * create_s(void) {
 15 |     symbol * p;
 16 |     void * mem = sqlite3_malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
 17 |     if (mem == NULL) return NULL;
 18 |     p = (symbol *) (HEAD + (char *) mem);
 19 |     CAPACITY(p) = CREATE_SIZE;
 20 |     SET_SIZE(p, CREATE_SIZE);
 21 |     return p;
 22 | }
 23 | 
 24 | extern void lose_s(symbol * p) {
 25 |     if (p == NULL) return;
 26 |     sqlite3_free((char *) p - HEAD);
 27 | }
 28 | 
 29 | /*
 30 |    new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
 31 |    if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
 32 |    position, or 0 on failure.
 33 | 
 34 |    -- used to implement hop and next in the utf8 case.
 35 | */
 36 | 
 37 | extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
 38 |     int b;
 39 |     if (n >= 0) {
 40 |         for (; n > 0; n--) {
 41 |             if (c >= l) return -1;
 42 |             b = p[c++];
 43 |             if (b >= 0xC0) {   /* 1100 0000 */
 44 |                 while (c < l) {
 45 |                     b = p[c];
 46 |                     if (b >= 0xC0 || b < 0x80) break;
 47 |                     /* break unless b is 10------ */
 48 |                     c++;
 49 |                 }
 50 |             }
 51 |         }
 52 |     } else {
 53 |         for (; n < 0; n++) {
 54 |             if (c <= lb) return -1;
 55 |             b = p[--c];
 56 |             if (b >= 0x80) {   /* 1000 0000 */
 57 |                 while (c > lb) {
 58 |                     b = p[c];
 59 |                     if (b >= 0xC0) break; /* 1100 0000 */
 60 |                     c--;
 61 |                 }
 62 |             }
 63 |         }
 64 |     }
 65 |     return c;
 66 | }
 67 | 
 68 | /* Code for character groupings: utf8 cases */
 69 | 
 70 | static int get_utf8(const symbol * p, int c, int l, int * slot) {
 71 |     int b0, b1;
 72 |     if (c >= l) return 0;
 73 |     b0 = p[c++];
 74 |     if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
 75 |         * slot = b0; return 1;
 76 |     }
 77 |     b1 = p[c++];
 78 |     if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
 79 |         * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
 80 |     }
 81 |     * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
 82 | }
 83 | 
 84 | static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
 85 |     int b0, b1;
 86 |     if (c <= lb) return 0;
 87 |     b0 = p[--c];
 88 |     if (b0 < 0x80 || c == lb) {   /* 1000 0000 */
 89 |         * slot = b0; return 1;
 90 |     }
 91 |     b1 = p[--c];
 92 |     if (b1 >= 0xC0 || c == lb) {   /* 1100 0000 */
 93 |         * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
 94 |     }
 95 |     * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
 96 | }
 97 | 
 98 | extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
 99 |     do {
100 | 	int ch;
101 | 	int w = get_utf8(z->p, z->c, z->l, & ch);
102 | 	unless (w) return -1;
103 | 	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
104 | 	    return w;
105 | 	z->c += w;
106 |     } while (repeat);
107 |     return 0;
108 | }
109 | 
110 | extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
111 |     do {
112 | 	int ch;
113 | 	int w = get_b_utf8(z->p, z->c, z->lb, & ch);
114 | 	unless (w) return -1;
115 | 	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
116 | 	    return w;
117 | 	z->c -= w;
118 |     } while (repeat);
119 |     return 0;
120 | }
121 | 
122 | extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
123 |     do {
124 | 	int ch;
125 | 	int w = get_utf8(z->p, z->c, z->l, & ch);
126 | 	unless (w) return -1;
127 | 	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
128 | 	    return w;
129 | 	z->c += w;
130 |     } while (repeat);
131 |     return 0;
132 | }
133 | 
134 | extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
135 |     do {
136 | 	int ch;
137 | 	int w = get_b_utf8(z->p, z->c, z->lb, & ch);
138 | 	unless (w) return -1;
139 | 	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
140 | 	    return w;
141 | 	z->c -= w;
142 |     } while (repeat);
143 |     return 0;
144 | }
145 | 
146 | /* Code for character groupings: non-utf8 cases */
147 | 
148 | extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
149 |     do {
150 | 	int ch;
151 | 	if (z->c >= z->l) return -1;
152 | 	ch = z->p[z->c];
153 | 	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
154 | 	    return 1;
155 | 	z->c++;
156 |     } while (repeat);
157 |     return 0;
158 | }
159 | 
160 | extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
161 |     do {
162 | 	int ch;
163 | 	if (z->c <= z->lb) return -1;
164 | 	ch = z->p[z->c - 1];
165 | 	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
166 | 	    return 1;
167 | 	z->c--;
168 |     } while (repeat);
169 |     return 0;
170 | }
171 | 
172 | extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
173 |     do {
174 | 	int ch;
175 | 	if (z->c >= z->l) return -1;
176 | 	ch = z->p[z->c];
177 | 	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
178 | 	    return 1;
179 | 	z->c++;
180 |     } while (repeat);
181 |     return 0;
182 | }
183 | 
184 | extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
185 |     do {
186 | 	int ch;
187 | 	if (z->c <= z->lb) return -1;
188 | 	ch = z->p[z->c - 1];
189 | 	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
190 | 	    return 1;
191 | 	z->c--;
192 |     } while (repeat);
193 |     return 0;
194 | }
195 | 
196 | extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
197 |     if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
198 |     z->c += s_size; return 1;
199 | }
200 | 
201 | extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
202 |     if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
203 |     z->c -= s_size; return 1;
204 | }
205 | 
206 | extern int eq_v(struct SN_env * z, const symbol * p) {
207 |     return eq_s(z, SIZE(p), p);
208 | }
209 | 
210 | extern int eq_v_b(struct SN_env * z, const symbol * p) {
211 |     return eq_s_b(z, SIZE(p), p);
212 | }
213 | 
/* Look up the text at the cursor, reading forwards, in the table v of
 * v_size entries.
 *
 * A binary search narrows [i, j) by comparing the text at the cursor with
 * each probed entry's string (presumably v is sorted on those strings --
 * TODO confirm with the table generator).  The entry at i is then accepted
 * if all of its s_size symbols matched; otherwise the substring_i links are
 * followed to shorter candidates.  On acceptance the cursor is moved past
 * the matched string and, if the entry has a function, that function must
 * also return non-zero.
 *
 * Returns the result field of the accepted entry, or 0 when the chain of
 * candidates is exhausted.
 */
extern int find_among(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int l = z->l;
    symbol * q = z->p + c;

    const struct among * w;

    /* number of leading symbols known to match at the lower (i) and upper
       (j) ends of the search interval */
    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while(1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j; /* smaller */
        w = v + k;
        {
            /* compare from the first symbol not already known to match */
            int i2; for (i2 = common; i2 < w->s_size; i2++) {
                if (c + common == l) { diff = -1; break; } /* text ran out */
                diff = q[common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) { j = k; common_j = common; }
                 else { i = k; common_i = common; }
        if (j - i <= 1) {
            if (i > 0) break; /* v->s has been inspected */
            if (j == i) break; /* only one item in v */

            /* - but now we need to go round once more to get
               v->s inspected. This looks messy, but is actually
               the optimal approach.  */

            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while(1) {
        w = v + i;
        if (common_i >= w->s_size) {
            /* the whole string of entry i matched */
            z->c = c + w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                /* put the cursor back after the match, whatever the
                   function did with it */
                z->c = c + w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}
271 | /* find_among_b is for backwards processing. Same comments apply, except
272 |    that the text is compared right to left, from z->c - 1 down to the
273 |    backward limit z->lb, and a match moves z->c back by w->s_size. */
274 | extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
275 | 
276 |     int i = 0;                      /* low end of the search interval */
277 |     int j = v_size;                 /* high end of the search interval */
278 | 
279 |     int c = z->c; int lb = z->lb;   /* cursor and backward limit on entry */
280 |     symbol * q = z->p + c - 1;      /* symbol just before the cursor */
281 | 
282 |     const struct among * w;         /* entry under examination */
283 | 
284 |     int common_i = 0;               /* symbols matched when v[i] was compared */
285 |     int common_j = 0;               /* symbols matched when v[j] was compared */
286 | 
287 |     int first_key_inspected = 0;    /* set once the extra pass for v[0] is used */
288 | 
289 |     while(1) {
290 |         int k = i + ((j - i) >> 1);   /* midpoint of i and j */
291 |         int diff = 0;
292 |         int common = common_i < common_j ? common_i : common_j;   /* comparison resumes here */
293 |         w = v + k;
294 |         {
295 |             int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {   /* walk w->s from its end */
296 |                 if (c - common == lb) { diff = -1; break; }   /* text ran out at lb */
297 |                 diff = q[- common] - w->s[i2];
298 |                 if (diff != 0) break;
299 |                 common++;
300 |             }
301 |         }
302 |         if (diff < 0) { j = k; common_j = common; }
303 |                  else { i = k; common_i = common; }
304 |         if (j - i <= 1) {
305 |             if (i > 0) break;                 /* v[0] is not the candidate */
306 |             if (j == i) break;                /* interval is empty */
307 |             if (first_key_inspected) break;   /* extra pass already taken */
308 |             first_key_inspected = 1;
309 |         }
310 |     }
311 |     while(1) {   /* try v[i], then the entries chained through substring_i */
312 |         w = v + i;
313 |         if (common_i >= w->s_size) {   /* at least s_size symbols were matched */
314 |             z->c = c - w->s_size;
315 |             if (w->function == 0) return w->result;
316 |             {
317 |                 int res = w->function(z);
318 |                 z->c = c - w->s_size;   /* set again after the call */
319 |                 if (res) return w->result;
320 |             }
321 |         }
322 |         i = w->substring_i;
323 |         if (i < 0) return 0;   /* chain exhausted: nothing matched */
324 |     }
325 | }
326 | 
327 | 
328 | /* Grow the buffer behind p so that it can hold at least n symbols (some
329 |  * slack is added).  Returns the possibly relocated buffer, or NULL when
330 |  * memory is short, in which case the old buffer has been released. */
331 | static symbol * increase_size(symbol * p, int n) {
332 |     const int grown = n + 20;   /* requested size plus a little slack */
333 |     symbol * resized;
334 |     char * base = sqlite3_realloc((char *) p - HEAD,
335 |                       HEAD + (grown + 1) * sizeof(symbol));
336 |     if (!base) {                /* reallocation failed: give up the old buffer */
337 |         lose_s(p);
338 |         return NULL;
339 |     }
340 |     resized = (symbol *) (HEAD + base);   /* symbols start HEAD bytes in */
341 |     CAPACITY(resized) = grown;
342 |     return resized;
343 | }
344 | 
345 | /* Replace the symbols of z->p lying between c_bra and c_ket with the
346 |    s_size symbols found at s, growing or shrinking the text as needed.
347 |    Returns 0 on success, -1 on error; on error z->p has been freed and
348 |    set to NULL.  When adjptr is not NULL it receives the change in length.
349 | */
350 | extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
351 | {
352 |     int delta;    /* change in text length caused by the replacement */
353 |     int old_len;  /* length of z->p before the replacement */
354 |     /* Create the buffer on first use. */
355 |     if (z->p == NULL) {
356 |         z->p = create_s();
357 |         if (z->p == NULL) return -1;
358 |     }
359 |     old_len = SIZE(z->p);
360 |     delta = s_size - (c_ket - c_bra);
361 |     if (delta != 0) {
362 |         const int new_len = delta + old_len;
363 |         if (new_len > CAPACITY(z->p)) {
364 |             z->p = increase_size(z->p, new_len);
365 |             if (z->p == NULL) return -1;
366 |         }
367 |         /* Slide everything after c_ket to its place in the new text. */
368 |         memmove(z->p + c_ket + delta, z->p + c_ket,
369 |                 (old_len - c_ket) * sizeof(symbol));
370 |         SET_SIZE(z->p, new_len);
371 |         z->l += delta;
372 |         /* A cursor at or past c_ket moves with the tail; a cursor strictly
373 |            inside the replaced region is pulled back to c_bra. */
374 |         if (z->c >= c_ket) z->c += delta;
375 |         else if (z->c > c_bra) z->c = c_bra;
376 |     }
377 |     unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
378 |     if (adjptr != NULL) *adjptr = delta;
379 |     return 0;
380 | }
381 | /* Validate the slice markers of z.  Returns 0 when
382 |    0 <= bra <= ket <= l, the text buffer exists and it holds at least
383 |    l symbols; returns -1 otherwise.  The tests run in that order, so
384 |    SIZE(z->p) is only read once z->p is known to be non-NULL. */
385 | static int slice_check(struct SN_env * z) {
386 |     const int well_formed = z->bra >= 0 &&
387 |                             z->bra <= z->ket &&
388 |                             z->ket <= z->l &&
389 |                             z->p != NULL &&
390 |                             z->l <= SIZE(z->p); /* this line could be removed */
391 |     if (well_formed) return 0;
392 | #if 0
393 |     fprintf(stderr, "faulty slice operation:\n");
394 |     debug(z, -1, 0);
395 | #endif
396 |     return -1;
397 | }
398 | /* Replace the slice between z->bra and z->ket with the s_size symbols at s. */
399 | extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
400 |     /* A slice rejected by slice_check is reported as -1 and left untouched. */
401 |     return slice_check(z) ? -1 : replace_s(z, z->bra, z->ket, s_size, s, NULL);
402 | }
403 | /* Replace the current slice with the SIZE(p) symbols of the buffer p. */
404 | extern int slice_from_v(struct SN_env * z, const symbol * p) {
405 |     return slice_from_s(z, SIZE(p), p);
406 | }
407 | /* Delete the current slice: replace it with zero symbols. */
408 | extern int slice_del(struct SN_env * z) {
409 |     return slice_from_s(z, 0, 0);   /* s_size 0, no source text */
410 | }
411 | /* Replace bra..ket with the s_size symbols at s and keep z->bra / z->ket in step. */
412 | extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
413 |     int shift = 0;   /* length change reported by replace_s */
414 |     if (replace_s(z, bra, ket, s_size, s, &shift) != 0)
415 |         return -1;
416 |     if (z->bra >= bra) z->bra += shift;   /* marker at or after the edit point */
417 |     if (z->ket >= bra) z->ket += shift;
418 |     return 0;
419 | }
420 | /* Replace the symbols between bra and ket with the SIZE(p) symbols of
421 |    the buffer p, then shift z->bra and z->ket exactly as insert_s does.
422 |    This delegates to insert_s so the marker-adjustment logic lives in a
423 |    single place, the same way slice_from_v delegates to slice_from_s.
424 |    Returns 0 on success, -1 when the underlying replacement fails (the
425 |    markers are left unchanged in that case). */
426 | extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
427 |     return insert_s(z, bra, ket, SIZE(p), p);
428 | }
429 | /* Copy the slice between z->bra and z->ket into p, growing p if needed.
430 |    Returns p (possibly relocated), or NULL if the slice is invalid (p is
431 |    then released with lose_s) or if p could not be enlarged. */
432 | extern symbol * slice_to(struct SN_env * z, symbol * p) {
433 |     int n;   /* number of symbols in the slice */
434 |     if (slice_check(z) != 0) {
435 |         lose_s(p);
436 |         return NULL;
437 |     }
438 |     n = z->ket - z->bra;
439 |     if (n > CAPACITY(p)) {
440 |         p = increase_size(p, n);
441 |         if (p == NULL) return NULL;
442 |     }
443 |     memmove(p, z->p + z->bra, n * sizeof(symbol));
444 |     SET_SIZE(p, n);
445 |     return p;
446 | }
447 | /* Copy the first z->l symbols of z->p into p, growing p if needed.
448 |    Returns p (possibly relocated), or NULL if p could not be enlarged. */
449 | extern symbol * assign_to(struct SN_env * z, symbol * p) {
450 |     const int n = z->l;   /* number of symbols to copy */
451 |     if (n > CAPACITY(p)) {
452 |         p = increase_size(p, n);
453 |         if (p == NULL) return NULL;
454 |     }
455 |     memmove(p, z->p, n * sizeof(symbol));
456 |     SET_SIZE(p, n);
457 |     return p;
458 | }
459 | /* Debugging aid, compiled out by #if 0: prints z->p with position markers. */
460 | #if 0
461 | extern void debug(struct SN_env * z, int number, int line_count) {
462 |     int i;
463 |     int limit = SIZE(z->p);
464 |     /* When number >= 0, prefix the dump with number, line_count and the size. */
465 |     if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
466 |     for (i = 0; i <= limit; i++) {   /* i == limit prints trailing markers only */
467 |         if (z->lb == i) printf("{");    /* backward limit */
468 |         if (z->bra == i) printf("[");   /* slice start */
469 |         if (z->c == i) printf("|");     /* cursor */
470 |         if (z->ket == i) printf("]");   /* slice end */
471 |         if (z->l == i) printf("}");     /* forward limit */
472 |         if (i < limit)
473 |         {   int ch = z->p[i];
474 |             if (ch == 0) ch = '#';      /* zero symbols are shown as '#' */
475 |             printf("%c", ch);
476 |         }
477 |     }
478 |     printf("'\n");
479 | }
480 | #endif
481 | 


--------------------------------------------------------------------------------
/fts3_unicode2.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | ** 2012 May 25
  3 | **
  4 | ** The author disclaims copyright to this source code.  In place of
  5 | ** a legal notice, here is a blessing:
  6 | **
  7 | **    May you do good and not evil.
  8 | **    May you find forgiveness for yourself and forgive others.
  9 | **    May you share freely, never taking more than you give.
 10 | **
 11 | ******************************************************************************
 12 | */
 13 | 
 14 | /*
 15 | ** DO NOT EDIT THIS MACHINE GENERATED FILE.
 16 | */
 17 | 
 18 | #if defined(SQLITE_ENABLE_FTS4_UNICODE61)
 19 | #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
 20 | 
 21 | #include <assert.h>
 22 | #include "fts3Int.h"
 23 | 
 24 | /*
 25 | ** Return true if the argument corresponds to a unicode codepoint
 26 | ** classified as either a letter or a number. Otherwise false.
 27 | **
 28 | ** The results are undefined if the value passed to this function
 29 | ** is less than zero.
 30 | */
 31 | int sqlite3FtsUnicodeIsalnum(int c){
 32 |   /* Each unsigned integer in the following array corresponds to a contiguous
 33 |   ** range of unicode codepoints that are not either letters or numbers (i.e.
 34 |   ** codepoints for which this function should return 0).
 35 |   **
 36 |   ** The most significant 22 bits in each 32-bit value contain the first
 37 |   ** codepoint in the range. The least significant 10 bits are used to store
 38 |   ** the size of the range (always at least 1). In other words, the value
 39 |   ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
 40 |   ** C. It is not possible to represent a range larger than 1023 codepoints
 41 |   ** using this format.
 42 |   */
 43 |   static const unsigned int aEntry[] = {
 44 |     0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
 45 |     0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
 46 |     0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
 47 |     0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
 48 |     0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
 49 |     0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
 50 |     0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
 51 |     0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
 52 |     0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
 53 |     0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
 54 |     0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
 55 |     0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
 56 |     0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
 57 |     0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
 58 |     0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
 59 |     0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
 60 |     0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
 61 |     0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
 62 |     0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
 63 |     0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
 64 |     0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
 65 |     0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
 66 |     0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
 67 |     0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
 68 |     0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
 69 |     0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
 70 |     0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
 71 |     0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
 72 |     0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
 73 |     0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
 74 |     0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
 75 |     0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
 76 |     0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
 77 |     0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
 78 |     0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
 79 |     0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
 80 |     0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
 81 |     0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
 82 |     0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
 83 |     0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
 84 |     0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
 85 |     0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
 86 |     0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
 87 |     0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
 88 |     0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
 89 |     0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
 90 |     0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
 91 |     0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
 92 |     0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
 93 |     0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
 94 |     0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
 95 |     0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
 96 |     0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
 97 |     0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
 98 |     0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
 99 |     0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
100 |     0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
101 |     0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
102 |     0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
103 |     0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
104 |     0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
105 |     0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
106 |     0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
107 |     0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
108 |     0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
109 |     0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
110 |     0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
111 |     0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
112 |     0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
113 |     0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
114 |     0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
115 |     0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
116 |     0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
117 |     0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
118 |     0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
119 |     0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
120 |     0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
121 |     0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
122 |     0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
123 |     0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
124 |     0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
125 |     0x380400F0,
126 |   };
127 |   static const unsigned int aAscii[4] = {
128 |     0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
129 |   };
130 |   /* ASCII: bit (c&31) of aAscii[c>>5] is set for non-alphanumerics. */
131 |   if( c<128 ){
132 |     return ( (aAscii[c >> 5] & (1u << (c & 0x001F)))==0 );  /* 1u: a shift by 31 stays defined */
133 |   }else if( c<(1<<22) ){
134 |     unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;  /* >= every entry whose range starts at or before c */
135 |     int iRes = 0;   /* initialised so it is never read unset */
136 |     int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
137 |     int iLo = 0;
138 |     while( iHi>=iLo ){   /* binary search: last entry with aEntry[i] <= key */
139 |       int iTest = (iHi + iLo) / 2;
140 |       if( key >= aEntry[iTest] ){
141 |         iRes = iTest;
142 |         iLo = iTest+1;
143 |       }else{
144 |         iHi = iTest-1;
145 |       }
146 |     }
147 |     assert( aEntry[0]<key );
148 |     assert( key>=aEntry[iRes] );
149 |     return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));  /* true when c lies past that range */
150 |   }
151 |   return 1;   /* c >= 1<<22: not covered by aEntry[] */
152 | }
153 | 
154 | 
155 | /*
156 | ** If the argument is a codepoint corresponding to a lowercase letter
157 | ** in the ASCII range with a diacritic added, return the codepoint
158 | ** of the ASCII letter only. For example, if passed 235 - "LATIN
159 | ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
160 | ** E"). The results of passing a codepoint that corresponds to an
161 | ** uppercase letter are undefined.
162 | */
163 | static int remove_diacritic(int c){
164 |   static const unsigned short aDia[] = {
165 |         0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
166 |      2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
167 |      2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
168 |      2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
169 |      3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
170 |      3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
171 |      4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
172 |      6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
173 |     61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
174 |     61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
175 |     62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
176 |     62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
177 |     62924, 63050, 63082, 63274, 63390, 
178 |   };
179 |   static const char aChar[] = {
180 |     '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
181 |     'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
182 |     's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
183 |     'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
184 |     'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
185 |     '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
186 |     'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
187 |     'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
188 |     'e',  'i',  'o',  'u',  'y',  
189 |   };
190 |   /* aDia[i] = (first<<3 | extra): first..first+extra all map to aChar[i]. */
191 |   unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
192 |   int iRes = 0;
193 |   int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
194 |   int iLo = 0;
195 |   while( iHi>=iLo ){   /* binary search: last entry with aDia[i] <= key */
196 |     int iTest = (iHi + iLo) / 2;
197 |     if( key >= aDia[iTest] ){
198 |       iRes = iTest;
199 |       iLo = iTest+1;
200 |     }else{
201 |       iHi = iTest-1;
202 |     }
203 |   }
204 |   assert( key>=aDia[iRes] );
205 |   return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);  /* past the range: c is returned as is */
206 | }
207 | 
208 | 
209 | /*
210 | ** Return non-zero if the argument interpreted as a unicode codepoint is
211 | ** a diacritical modifier character. Only 768..817 are ever reported.
212 | */
213 | int sqlite3FtsUnicodeIsdiacritic(int c){
214 |   unsigned int mask0 = 0x08029FDF;   /* bit n set: codepoint 768+n is a diacritic */
215 |   unsigned int mask1 = 0x000361F8;   /* bit n set: codepoint 800+n is a diacritic */
216 |   if( c<768 || c>817 ) return 0;
217 |   return (c < 768+32) ?              /* unsigned 1 keeps a shift by 31 (c==799) defined */
218 |       (mask0 & ((unsigned int)1 << (c-768))) :
219 |       (mask1 & ((unsigned int)1 << (c-768-32)));
220 | }
221 | 
222 | 
223 | /*
224 | ** Interpret the argument as a unicode codepoint. If the codepoint
225 | ** is an upper case character that has a lower case equivalent,
226 | ** return the codepoint corresponding to the lower case version.
227 | ** Otherwise, return a copy of the argument. If bRemoveDiacritic is true,
228 | ** results for codepoints 128..65535 also pass through remove_diacritic().
229 | ** The results are undefined if the value passed to this function
230 | ** is less than zero.
231 | */
232 | int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
233 |   /* Each entry in the following array defines a rule for folding a range
234 |   ** of codepoints to lower case. The rule applies to a range of nRange
235 |   ** codepoints starting at codepoint iCode.
236 |   **
237 |   ** If the least significant bit in flags is clear, then the rule applies
238 |   ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
239 |   ** need to be folded). Or, if it is set, then the rule only applies to
240 |   ** every second codepoint in the range, starting with codepoint iCode.
241 |   **
242 |   ** The 7 most significant bits in flags are an index into the aiOff[]
243 |   ** array. If a specific codepoint C does require folding, then its lower
244 |   ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
245 |   **
246 |   ** The contents of this array are generated by parsing the CaseFolding.txt
247 |   ** file distributed as part of the "Unicode Character Database". See
248 |   ** http://www.unicode.org for details.
249 |   */
250 |   static const struct TableEntry {
251 |     unsigned short iCode;
252 |     unsigned char flags;
253 |     unsigned char nRange;
254 |   } aEntry[] = {
255 |     {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
256 |     {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
257 |     {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
258 |     {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
259 |     {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
260 |     {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
261 |     {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
262 |     {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
263 |     {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
264 |     {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
265 |     {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
266 |     {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
267 |     {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
268 |     {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
269 |     {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
270 |     {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
271 |     {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
272 |     {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
273 |     {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
274 |     {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
275 |     {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
276 |     {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
277 |     {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
278 |     {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
279 |     {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
280 |     {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
281 |     {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
282 |     {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
283 |     {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
284 |     {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
285 |     {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
286 |     {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
287 |     {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
288 |     {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
289 |     {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
290 |     {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
291 |     {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
292 |     {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
293 |     {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
294 |     {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
295 |     {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
296 |     {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
297 |     {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
298 |     {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
299 |     {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
300 |     {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
301 |     {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
302 |     {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
303 |     {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
304 |     {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
305 |     {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
306 |     {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
307 |     {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
308 |     {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
309 |     {65313, 14, 26},       
310 |   };
311 |   static const unsigned short aiOff[] = {
312 |    1,     2,     8,     15,    16,    26,    28,    32,    
313 |    37,    38,    40,    48,    63,    64,    69,    71,    
314 |    79,    80,    116,   202,   203,   205,   206,   207,   
315 |    209,   210,   211,   213,   214,   217,   218,   219,   
316 |    775,   7264,  10792, 10795, 23228, 23256, 30204, 54721, 
317 |    54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 
318 |    57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 
319 |    65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 
320 |    65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 
321 |    65514, 65521, 65527, 65528, 65529, 
322 |   };
323 | 
324 |   int ret = c;   /* default: the codepoint is returned unchanged */
325 | 
326 |   assert( c>=0 );
327 |   assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
328 | 
329 |   if( c<128 ){   /* ASCII: only 'A'..'Z' fold; no diacritic removal here */
330 |     if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
331 |   }else if( c<65536 ){
332 |     int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
333 |     int iLo = 0;
334 |     int iRes = -1;   /* stays -1 when c precedes every rule */
335 | 
336 |     while( iHi>=iLo ){   /* binary search: last rule with iCode <= c */
337 |       int iTest = (iHi + iLo) / 2;
338 |       int cmp = (c - aEntry[iTest].iCode);
339 |       if( cmp>=0 ){
340 |         iRes = iTest;
341 |         iLo = iTest+1;
342 |       }else{
343 |         iHi = iTest-1;
344 |       }
345 |     }
346 |     assert( iRes<0 || c>=aEntry[iRes].iCode );
347 | 
348 |     if( iRes>=0 ){
349 |       const struct TableEntry *p = &aEntry[iRes];
350 |       if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){   /* in range; with flag bit 0 set, only codepoints of iCode's parity */
351 |         ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
352 |         assert( ret>0 );
353 |       }
354 |     }
355 | 
356 |     if( bRemoveDiacritic ) ret = remove_diacritic(ret);   /* applied to the folded value */
357 |   }
358 |   /* Codepoints 66560..66599 (U+10400..U+10427) fold by adding 40. */
359 |   else if( c>=66560 && c<66600 ){
360 |     ret = c + 40;
361 |   }
362 | 
363 |   return ret;
364 | }
365 | #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
366 | #endif /* defined(SQLITE_ENABLE_FTS4_UNICODE61) */
367 | 


--------------------------------------------------------------------------------
/fts3_unicodesn.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | ** 2012 Nov 11
  3 | **
  4 | ** The author disclaims copyright to this source code.  In place of
  5 | ** a legal notice, here is a blessing:
  6 | **
  7 | **    May you do good and not evil.
  8 | **    May you find forgiveness for yourself and forgive others.
  9 | **    May you share freely, never taking more than you give.
 10 | **
 11 | ******************************************************************************
 12 | **
 13 | ** Implementation of the "unicode" full-text-search tokenizer with Snowball stemming
 14 | */
 15 | 
 16 | #include "fts3_unicodesn.h"
 17 | 
 18 | /* Snowball stemmer */
 19 | #include "api.h"
 20 | 
 21 | #ifdef SQLITE_ENABLE_FTS4_UNICODE61
 22 | 
 23 | #include "fts3Int.h"
 24 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
 25 | 
 26 | #include <assert.h>
 27 | #include <stdlib.h>
 28 | #include <stdio.h>
 29 | #include <string.h>
 30 | 
 31 | #include "fts3_tokenizer.h"
 32 | 
 33 | #include "libstemmer_c/src_c/stem_UTF_8_danish.h"
 34 | #include "libstemmer_c/src_c/stem_UTF_8_dutch.h"
 35 | #include "libstemmer_c/src_c/stem_UTF_8_english.h"
 36 | #include "libstemmer_c/src_c/stem_UTF_8_finnish.h"
 37 | #include "libstemmer_c/src_c/stem_UTF_8_french.h"
 38 | #include "libstemmer_c/src_c/stem_UTF_8_german.h"
 39 | #include "libstemmer_c/src_c/stem_UTF_8_hungarian.h"
 40 | #include "libstemmer_c/src_c/stem_UTF_8_italian.h"
 41 | #include "libstemmer_c/src_c/stem_UTF_8_norwegian.h"
 42 | #include "libstemmer_c/src_c/stem_UTF_8_porter.h"
 43 | #include "libstemmer_c/src_c/stem_UTF_8_portuguese.h"
 44 | #include "libstemmer_c/src_c/stem_UTF_8_romanian.h"
 45 | #include "libstemmer_c/src_c/stem_UTF_8_russian.h"
 46 | #include "libstemmer_c/src_c/stem_UTF_8_spanish.h"
 47 | #include "libstemmer_c/src_c/stem_UTF_8_swedish.h"
 48 | #include "libstemmer_c/src_c/stem_UTF_8_turkish.h"
 49 | 
 50 | 
 51 | /*
 52 | ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
 53 | ** from the sqlite3 source file utf.c. If this file is compiled as part
 54 | ** of the amalgamation, they are not required.
 55 | */
 56 | #ifndef SQLITE_AMALGAMATION
 57 | /* READ_UTF8 indexes this table by (lead byte - 0xc0) to get its starting value. */
 58 | static const unsigned char sqlite3Utf8Trans1[] = {
 59 |   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 60 |   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 61 |   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
 62 |   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
 63 |   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 64 |   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 65 |   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 66 |   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
 67 | };
 68 | /* Decode one character from zIn (bounded by zTerm) into c, advancing zIn; rejected values become 0xFFFD. */
 69 | #define READ_UTF8(zIn, zTerm, c)                           \
 70 |   c = *(zIn++);                                            \
 71 |   if( c>=0xc0 ){                                           \
 72 |     c = sqlite3Utf8Trans1[c-0xc0];                         \
 73 |     while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
 74 |       c = (c<<6) + (0x3f & *(zIn++));                      \
 75 |     }                                                      \
 76 |     if( c<0x80                                             \
 77 |         || (c&0xFFFFF800)==0xD800                          \
 78 |         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
 79 |   }
 80 | /* Append codepoint c to zOut as 1 to 4 bytes, chosen by magnitude, advancing zOut. */
 81 | #define WRITE_UTF8(zOut, c) {                          \
 82 |   if( c<0x00080 ){                                     \
 83 |     *zOut++ = (u8)(c&0xFF);                            \
 84 |   }                                                    \
 85 |   else if( c<0x00800 ){                                \
 86 |     *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
 87 |     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
 88 |   }                                                    \
 89 |   else if( c<0x10000 ){                                \
 90 |     *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
 91 |     *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
 92 |     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
 93 |   }else{                                               \
 94 |     *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
 95 |     *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
 96 |     *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
 97 |     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
 98 |   }                                                    \
 99 | }
100 | 
101 | #endif /* ifndef SQLITE_AMALGAMATION */
102 | 
103 | typedef struct unicode_tokenizer unicode_tokenizer;
104 | typedef struct unicode_cursor unicode_cursor;
105 | /* Entry points of one Snowball stemmer (presumably one set per language header above; confirm). */
106 | typedef struct {
107 |     struct SN_env * (*create)(void);   /* returns a new stemmer environment */
108 |     void (*close)(struct SN_env *);    /* presumably releases an environment from create - confirm */
109 |     int (*stem)(struct SN_env *);      /* presumably stems the word held in the environment - confirm */
110 | } stemmer_callbacks;
111 | /* Tokenizer object; unicodeDestroy() frees aiException and then the struct itself. */
112 | struct unicode_tokenizer {
113 |   sqlite3_tokenizer base;         /* first member: the struct is cast to and from sqlite3_tokenizer* */
114 |   int bRemoveDiacritic;           /* NOTE(review): presumably passed to sqlite3FtsUnicodeFold() - confirm */
115 |   int nException;                 /* presumably the number of entries in aiException[] - confirm */
116 |   int *aiException;               /* codepoints whose sqlite3FtsUnicodeIsalnum() result is inverted */
117 |   /* Snowball stemmer */
118 |   stemmer_callbacks stemmer;
119 | };
120 | /* Cursor state: position within one input buffer plus storage for the current token. */
121 | struct unicode_cursor {
122 |   sqlite3_tokenizer_cursor base;
123 |   const unsigned char *aInput;    /* Input text being tokenized */
124 |   int nInput;                     /* Size of aInput[] in bytes */
125 |   int iOff;                       /* Current offset within aInput[] */
126 |   int iToken;                     /* Index of next token to be returned */
127 |   char *zToken;                   /* storage for current token */
128 |   int nAlloc;                     /* space allocated at zToken */
129 |   struct SN_env *pStemmer;         /* Snowball stemmer */
130 | };
131 | 
132 | 
133 | /*
134 | ** Release a tokenizer created by unicodeCreate(). A NULL argument is a no-op.
135 | */
136 | static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
137 |   unicode_tokenizer *pTok = (unicode_tokenizer *)pTokenizer;
138 |   if( pTok==NULL ) return SQLITE_OK;   /* nothing to release */
139 | 
140 |   sqlite3_free(pTok->aiException);
141 |   sqlite3_free(pTok);
142 |   return SQLITE_OK;
143 | }
144 | 
145 | /*
146 | ** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
147 | ** statement has specified that the tokenizer for this table shall consider
148 | ** all characters in string zIn/nIn to be separators (if bAlnum==0) or
149 | ** token characters (if bAlnum==1).
150 | **
151 | ** For each codepoint in the zIn/nIn string, this function checks if the
152 | ** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
153 | ** If so, no action is taken. Otherwise, the codepoint is added to the 
154 | ** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
155 | ** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
156 | ** codepoints in the aiException[] array.
157 | **
158 | ** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
159 | ** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
160 | ** It is not possible to change the behaviour of the tokenizer with respect
161 | ** to these codepoints.
162 | */
163 | static int unicodeAddExceptions(
164 |   unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
165 |   int bAlnum,                     /* Replace Isalnum() return value with this */
166 |   const char *zIn,                /* Array of characters to make exceptions */
167 |   int nIn                         /* Length of z in bytes */
168 | ){
169 |   const unsigned char *z = (const unsigned char *)zIn;
170 |   const unsigned char *zTerm = &z[nIn];
171 |   int iCode;
172 |   int nEntry = 0;
173 | 
174 |   assert( bAlnum==0 || bAlnum==1 );
175 | 
176 |   while( z<zTerm ){
177 |     READ_UTF8(z, zTerm, iCode);
178 |     assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
179 |     if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum 
180 |      && sqlite3FtsUnicodeIsdiacritic(iCode)==0 
181 |     ){
182 |       nEntry++;
183 |     }
184 |   }
185 | 
186 |   if( nEntry ){
187 |     int *aNew;                    /* New aiException[] array */
188 |     int nNew;                     /* Number of valid entries in array aNew[] */
189 | 
190 |     aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
191 |     if( aNew==0 ) return SQLITE_NOMEM;
192 |     nNew = p->nException;
193 | 
194 |     z = (const unsigned char *)zIn;
195 |     while( z<zTerm ){
196 |       READ_UTF8(z, zTerm, iCode);
197 |       if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum 
198 |        && sqlite3FtsUnicodeIsdiacritic(iCode)==0
199 |       ){
200 |         int i, j;
201 |         for(i=0; i<nNew && aNew[i]<iCode; i++);
202 |         for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
203 |         aNew[i] = iCode;
204 |         nNew++;
205 |       }
206 |     }
207 |     p->aiException = aNew;
208 |     p->nException = nNew;
209 |   }
210 | 
211 |   return SQLITE_OK;
212 | }
213 | 
214 | /*
215 | ** Return true if the p->aiException[] array contains the value iCode.
216 | */
217 | static int unicodeIsException(unicode_tokenizer *p, int iCode){
218 |   if( p->nException>0 ){
219 |     int *a = p->aiException;
220 |     int iLo = 0;
221 |     int iHi = p->nException-1;
222 | 
223 |     while( iHi>=iLo ){
224 |       int iTest = (iHi + iLo) / 2;
225 |       if( iCode==a[iTest] ){
226 |         return 1;
227 |       }else if( iCode>a[iTest] ){
228 |         iLo = iTest+1;
229 |       }else{
230 |         iHi = iTest-1;
231 |       }
232 |     }
233 |   }
234 | 
235 |   return 0;
236 | }
237 | 
238 | /*
239 | ** Return true if, for the purposes of tokenization, codepoint iCode is
240 | ** considered a token character (not a separator).
241 | */
242 | static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
243 |   assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
244 |   return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
245 | }
246 | 
/*
** Table of the available Snowball stemmers. Each row gives two names
** under which unicodeSetStemmer() accepts the stemmer - a short
** language code and the (English) language name - followed by the
** stemmer's create/close/stem entry points. The "porter" row uses the
** same string for both names.
*/
static struct {const char *shortName; const char *longName; stemmer_callbacks stemmer;}
const stemmers[] = {
    {"da", "danish",     {danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}},
    {"nl", "dutch",      {dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}},
    {"en", "english",    {english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}},
    {"fi", "finnish",    {finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}},
    {"fr", "french",     {french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}},
    {"de", "german",     {german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}},
    {"hu", "hungarian",  {hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}},
    {"it", "italian",    {italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}},
    {"no", "norwegian",  {norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}},
    {"porter", "porter", {porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}},
    {"pt", "portuguese", {portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}},
    {"ro", "romanian",   {romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}},
    {"ru", "russian",    {russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}},
    {"es", "spanish",    {spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}},
    {"sv", "swedish",    {swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}},
    {"tr", "turkish",    {turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}},
};
267 | 
268 | 
269 | static int unicodeSetStemmer(
270 |   unicode_tokenizer *p,
271 |   const char *zIn,                /* Stemmer name (not NUL-terminated) */
272 |   int nIn                         /* Length of z in bytes */
273 | )
274 | {
275 |   for (int i = 0; i < sizeof(stemmers)/sizeof(stemmers[0]); i++) {
276 |     const char *n1 = stemmers[i].shortName, *n2 = stemmers[i].longName;
277 |     if ( (nIn==strlen(n1) &&  memcmp(n1, zIn, nIn)==0) ||
278 |          (nIn==strlen(n2) &&  memcmp(n2, zIn, nIn)==0) ) {
279 |       p->stemmer = stemmers[i].stemmer;
280 |       return SQLITE_OK;
281 |     }
282 |   }
283 |   return SQLITE_ERROR;
284 | }
285 | 
286 | /*
287 | ** Create a new tokenizer instance.
288 | */
289 | static int unicodeCreate(
290 |   int nArg,                       /* Size of array argv[] */
291 |   const char * const *azArg,      /* Tokenizer creation arguments */
292 |   sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
293 | ){
294 |   unicode_tokenizer *pNew;        /* New tokenizer object */
295 |   int i;
296 |   int rc = SQLITE_OK;
297 | 
298 |   pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
299 |   if( pNew==NULL ) return SQLITE_NOMEM;
300 |   memset(pNew, 0, sizeof(unicode_tokenizer));
301 |   pNew->bRemoveDiacritic = 1;
302 |   pNew->stemmer.create = NULL;
303 |   pNew->stemmer.close = NULL;
304 |   pNew->stemmer.stem = NULL;
305 | 
306 |   for(i=0; rc==SQLITE_OK && i<nArg; i++){
307 |     const char *z = azArg[i];
308 |     int n = (int)strlen(z);
309 | 
310 |     if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
311 |       pNew->bRemoveDiacritic = 1;
312 |     }
313 |     else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
314 |       pNew->bRemoveDiacritic = 0;
315 |     }
316 |     else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
317 |       rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
318 |     }
319 |     else if( n>=11 && memcmp("separators=", z, 11)==0 ){
320 |       rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
321 |     }
322 |     else if( n>=8 && memcmp("stemmer=", z, 8)==0 ){
323 |       rc = unicodeSetStemmer(pNew, &z[8], n-8);
324 |     }
325 |     else{
326 |       /* Unrecognized argument */
327 |       rc  = SQLITE_ERROR;
328 |     }
329 |   }
330 | 
331 |   if( rc!=SQLITE_OK ){
332 |     unicodeDestroy((sqlite3_tokenizer *)pNew);
333 |     pNew = 0;
334 |   }
335 |   *pp = (sqlite3_tokenizer *)pNew;
336 |   return rc;
337 | }
338 | 
339 | /*
340 | ** Prepare to begin tokenizing a particular string.  The input
341 | ** string to be tokenized is pInput[0..nBytes-1].  A cursor
342 | ** used to incrementally tokenize this string is returned in 
343 | ** *ppCursor.
344 | */
345 | static int unicodeOpen(
346 |   sqlite3_tokenizer *p,           /* The tokenizer */
347 |   const char *aInput,             /* Input string */
348 |   int nInput,                     /* Size of string aInput in bytes */
349 |   sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
350 | ){
351 |   unicode_tokenizer *pTokenizer;
352 |   unicode_cursor *pCsr;
353 | 
354 |   pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
355 |   if( pCsr==0 ){
356 |     return SQLITE_NOMEM;
357 |   }
358 |   memset(pCsr, 0, sizeof(unicode_cursor));
359 | 
360 |   pCsr->aInput = (const unsigned char *)aInput;
361 |   if( aInput==0 ){
362 |     pCsr->nInput = 0;
363 |   }else if( nInput<0 ){
364 |     pCsr->nInput = (int)strlen(aInput);
365 |   }else{
366 |     pCsr->nInput = nInput;
367 |   }
368 | 
369 |   pTokenizer = (unicode_tokenizer *)p;
370 |   if ( pTokenizer->stemmer.create!=NULL ) {
371 |      pCsr->pStemmer = pTokenizer->stemmer.create();
372 |      if ( pCsr->pStemmer==0 ) {
373 | 	sqlite3_free(p);
374 | 	return SQLITE_NOMEM;
375 |      }
376 |   }else {
377 |      pCsr->pStemmer = NULL;
378 |   }
379 | 
380 |   *pp = &pCsr->base;
381 |   UNUSED_PARAMETER(p);
382 |   return SQLITE_OK;
383 | }
384 | 
385 | /*
386 | ** Close a tokenization cursor previously opened by a call to
387 | ** simpleOpen() above.
388 | */
389 | static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
390 |   unicode_cursor *pCsr = (unicode_cursor *) pCursor;
391 |   if ( pCsr->pStemmer != NULL ) {
392 |      unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
393 |      p->stemmer.close(pCsr->pStemmer);
394 |   }
395 |   sqlite3_free(pCsr->zToken);
396 |   sqlite3_free(pCsr);
397 |   return SQLITE_OK;
398 | }
399 | 
400 | /*
401 | ** Extract the next token from a tokenization cursor.  The cursor must
402 | ** have been opened by a prior call to simpleOpen().
403 | */
404 | static int unicodeNext(
405 |   sqlite3_tokenizer_cursor *pC,   /* Cursor returned by simpleOpen */
406 |   const char **paToken,           /* OUT: Token text */
407 |   int *pnToken,                   /* OUT: Number of bytes at *paToken */
408 |   int *piStart,                   /* OUT: Starting offset of token */
409 |   int *piEnd,                     /* OUT: Ending offset of token */
410 |   int *piPos                      /* OUT: Position integer of token */
411 | ){
412 |   unicode_cursor *pCsr = (unicode_cursor *)pC;
413 |   unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
414 |   int iCode = 0;
415 |   char *zOut;
416 |   const unsigned char *z = &pCsr->aInput[pCsr->iOff];
417 |   const unsigned char *zStart = z;
418 |   const unsigned char *zEnd;
419 |   const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
420 | 
421 |   /* Scan past any delimiter characters before the start of the next token.
422 |   ** Return SQLITE_DONE early if this takes us all the way to the end of 
423 |   ** the input.  */
424 |   while( z<zTerm ){
425 |     READ_UTF8(z, zTerm, iCode);
426 |     if( unicodeIsAlnum(p, iCode) ) break;
427 |     zStart = z;
428 |   }
429 |   if( zStart>=zTerm ) return SQLITE_DONE;
430 | 
431 |   zOut = pCsr->zToken;
432 |   do {
433 |     int iOut;
434 | 
435 |     /* Grow the output buffer if required. */
436 |     if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
437 |       char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
438 |       if( !zNew ) return SQLITE_NOMEM;
439 |       zOut = &zNew[zOut - pCsr->zToken];
440 |       pCsr->zToken = zNew;
441 |       pCsr->nAlloc += 64;
442 |     }
443 | 
444 |     /* Write the folded case of the last character read to the output */
445 |     zEnd = z;
446 |     iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
447 |     if( iOut ){
448 |       WRITE_UTF8(zOut, iOut);
449 |     }
450 | 
451 |     /* If the cursor is not at EOF, read the next character */
452 |     if( z>=zTerm ) break;
453 |     READ_UTF8(z, zTerm, iCode);
454 |   }while( unicodeIsAlnum(p, iCode) 
455 |        || sqlite3FtsUnicodeIsdiacritic(iCode)
456 |   );
457 | 
458 |   if ( pCsr->pStemmer!=NULL ) {
459 |      SN_set_current(pCsr->pStemmer, (int)(zOut - pCsr->zToken), (unsigned char *)pCsr->zToken);
460 |      if ( p->stemmer.stem(pCsr->pStemmer)<0 ) {
461 | 	*paToken = pCsr->zToken;
462 | 	*pnToken = (int)(zOut - pCsr->zToken);
463 |      }else {
464 | 	pCsr->pStemmer->p[pCsr->pStemmer->l] = '\0';
465 | 	*paToken = (char *)pCsr->pStemmer->p;
466 | 	*pnToken = pCsr->pStemmer->l;
467 |      }
468 |   }else {
469 |      *paToken = pCsr->zToken;
470 |      *pnToken = (int)(zOut - pCsr->zToken);
471 |   }
472 | 
473 |   /* Set the output variables and return. */
474 |   pCsr->iOff = (int)(z - pCsr->aInput);
475 |   *piStart = (int)(zStart - pCsr->aInput);
476 |   *piEnd = (int)(zEnd - pCsr->aInput);
477 |   *piPos = pCsr->iToken++;
478 |   return SQLITE_OK;
479 | }
480 | 
481 | /*
482 | ** Set *ppModule to a pointer to the sqlite3_tokenizer_module 
483 | ** structure for the unicode tokenizer.
484 | */
485 | void sqlite3Fts3UnicodeSnTokenizer(sqlite3_tokenizer_module const **ppModule){
486 |   static const sqlite3_tokenizer_module module = {
487 |     0,
488 |     unicodeCreate,
489 |     unicodeDestroy,
490 |     unicodeOpen,
491 |     unicodeClose,
492 |     unicodeNext,
493 |     0,
494 |   };
495 |   *ppModule = &module;
496 | }
497 | 
498 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
499 | #endif /* ifndef SQLITE_ENABLE_FTS4_UNICODE61 */
500 | 


--------------------------------------------------------------------------------