├── .gitignore ├── Makefile ├── expected └── tsexample.out ├── sql └── tsexample.sql ├── tsexample--1.0.sql ├── tsexample.c └── tsexample.control /.gitignore: -------------------------------------------------------------------------------- 1 | /log/ 2 | /results/ 3 | /tmp_check/ 4 | /regression.diffs 5 | /regression.out 6 | *.so 7 | *.o 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # contrib/tsexample/Makefile 2 | 3 | MODULES = tsexample 4 | 5 | EXTENSION = tsexample 6 | DATA = tsexample--1.0.sql 7 | PGFILEDESC = "tsexample - example of custom postgresql full text search parser, dictionaries and configuration" 8 | 9 | REGRESS = tsexample 10 | 11 | ifdef USE_PGXS 12 | PG_CONFIG = pg_config 13 | PGXS := $(shell $(PG_CONFIG) --pgxs) 14 | include $(PGXS) 15 | else 16 | subdir = contrib/tsexample 17 | top_builddir = ../.. 18 | include $(top_builddir)/src/Makefile.global 19 | include $(top_srcdir)/contrib/contrib-global.mk 20 | endif 21 | -------------------------------------------------------------------------------- /expected/tsexample.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION tsexample; 2 | SELECT * FROM ts_parse('sample_parser', 'abc def 123 1xx yy3'); 3 | tokid | token 4 | -------+------- 5 | 1 | abc 6 | 1 | def 7 | 2 | 123 8 | 1 | 1xx 9 | 1 | yy3 10 | (5 rows) 11 | 12 | SELECT * FROM ts_parse('sample_parser', 'x-z p; qwe pg_config'); 13 | tokid | token 14 | -------+----------- 15 | 1 | x 16 | 1 | z 17 | 1 | p 18 | 1 | qwe 19 | 1 | pg_config 20 | (5 rows) 21 | 22 | SELECT to_tsvector('sample', 'abcdef 12345678 xyz'); 23 | to_tsvector 24 | ------------------------------------------------- 25 | '12345678':2 'abc':1 'abcdef':1 'def':1 'xyz':3 26 | (1 row) 27 | 28 | SELECT plainto_tsquery('sample', 'abcdef 12345678 xyz'); 29 | plainto_tsquery 30 | 
--------------------------------------------------- 31 | ( 'abcdef' | 'abc' & 'def' ) & '12345678' & 'xyz' 32 | (1 row) 33 | 34 | SELECT * FROM ts_debug('sample', 'abcdef 12345678 xyz'); 35 | alias | description | token | dictionaries | dictionary | lexemes 36 | --------+-----------------------------------+----------+--------------+------------+------------------ 37 | word | Word, all alphanumeric characters | abcdef | {cut3} | cut3 | {abcdef,abc,def} 38 | number | Number, all digits | 12345678 | {simple} | simple | {12345678} 39 | word | Word, all alphanumeric characters | xyz | {cut3} | cut3 | {xyz} 40 | (3 rows) 41 | 42 | -------------------------------------------------------------------------------- /sql/tsexample.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION tsexample; 2 | 3 | SELECT * FROM ts_parse('sample_parser', 'abc def 123 1xx yy3'); 4 | SELECT * FROM ts_parse('sample_parser', 'x-z p; qwe pg_config'); 5 | SELECT to_tsvector('sample', 'abcdef 12345678 xyz'); 6 | SELECT plainto_tsquery('sample', 'abcdef 12345678 xyz'); 7 | SELECT * FROM ts_debug('sample', 'abcdef 12345678 xyz'); 8 | -------------------------------------------------------------------------------- /tsexample--1.0.sql: -------------------------------------------------------------------------------- 1 | /* contrib/tsexample/tsexample--1.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 4 | \echo Use "CREATE EXTENSION tsexample;" to load this file. 
\quit

--
-- Text search parser: support functions and parser definition.
--
-- Plain CREATE FUNCTION is used throughout (not CREATE OR REPLACE) so that
-- installing the extension fails cleanly instead of taking over a
-- pre-existing function that has the same name and argument types.
--

-- Start callback: creates the parser state for a piece of text.
CREATE FUNCTION sparser_start(internal, integer)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;

-- Token callback: returns the next token and its type.
CREATE FUNCTION sparser_nexttoken(internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;

-- End callback: releases the parser state.
CREATE FUNCTION sparser_end(internal)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;

-- Lextype callback: describes the token types the parser can produce.
CREATE FUNCTION sparser_lextype(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;

CREATE TEXT SEARCH PARSER sample_parser (
    START    = sparser_start,
    GETTOKEN = sparser_nexttoken,
    END      = sparser_end,
    LEXTYPES = sparser_lextype
);
COMMENT ON TEXT SEARCH PARSER sample_parser IS 'sample word parser';

--
-- Text search template "cutdict" and a dictionary built from it.
--

-- Init callback: reads the nbegin / nend dictionary options.
CREATE FUNCTION cutdict_init(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;

-- Lexize callback: turns a token into its lexemes.
CREATE FUNCTION cutdict_lexize(internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;

CREATE TEXT SEARCH TEMPLATE cutdict (
    INIT   = cutdict_init,
    LEXIZE = cutdict_lexize
);
COMMENT ON TEXT SEARCH TEMPLATE cutdict IS 'cut dictionary: lowercase and keep only beginning and ending of long words';

CREATE TEXT SEARCH DICTIONARY cut3 (
    TEMPLATE = cutdict,
    nbegin = 3,
    nend = 3
);
COMMENT ON TEXT SEARCH DICTIONARY cut3 IS 'cut dictionary with nbegin = nend = 3';

--
-- Configuration tying the sample parser to dictionaries: "word" tokens go
-- through cut3, "number" tokens through the built-in simple dictionary.
--
CREATE TEXT SEARCH CONFIGURATION sample (
    PARSER = "sample_parser"
);
ALTER TEXT SEARCH CONFIGURATION sample ADD MAPPING FOR word WITH cut3;
ALTER TEXT SEARCH CONFIGURATION sample ADD MAPPING FOR number WITH simple;
COMMENT ON TEXT SEARCH CONFIGURATION sample IS 'sample configuration';
-------------------------------------------------------------------------------- /tsexample.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * tsexample.c 4 | * Example of custom full text search parser and dictionaries 5 | * 6 | * Copyright (c) 2016, Postgres Professional 7 | * Author: Alexander Korotkov 8 | * 9 | * IDENTIFICATION 10 | * contrib/tsexample/tsexample.c 11 | * 12 | *------------------------------------------------------------------------- 13 | */ 14 | #include "postgres.h" 15 | #include "commands/defrem.h" 16 | #include "tsearch/ts_public.h" 17 | #include "tsearch/ts_locale.h" 18 | 19 | typedef struct 20 | { 21 | char *begin; 22 | char *end; 23 | char *p; 24 | } SParserStatus; 25 | 26 | /* Output token categories */ 27 | 28 | #define WORD_TOKEN 1 29 | #define NUMBER_TOKEN 2 30 | 31 | #define LAST_TOKEN_NUM 2 32 | 33 | static const char *const tok_alias[] = { 34 | "", 35 | "word", 36 | "number" 37 | }; 38 | 39 | static const char *const lex_descr[] = { 40 | "", 41 | "Word, all alphanumeric characters", 42 | "Number, all digits" 43 | }; 44 | 45 | PG_MODULE_MAGIC; 46 | 47 | PG_FUNCTION_INFO_V1(sparser_start); 48 | PG_FUNCTION_INFO_V1(sparser_nexttoken); 49 | PG_FUNCTION_INFO_V1(sparser_end); 50 | PG_FUNCTION_INFO_V1(sparser_lextype); 51 | PG_FUNCTION_INFO_V1(cutdict_init); 52 | PG_FUNCTION_INFO_V1(cutdict_lexize); 53 | 54 | Datum 55 | sparser_start(PG_FUNCTION_ARGS) 56 | { 57 | SParserStatus *status = (SParserStatus *) palloc0(sizeof(SParserStatus)); 58 | 59 | status->begin = (char *) PG_GETARG_POINTER(0); 60 | status->end = status->begin + PG_GETARG_INT32(1); 61 | status->p = status->begin; 62 | 63 | PG_RETURN_POINTER(status); 64 | } 65 | 66 | Datum 67 | sparser_nexttoken(PG_FUNCTION_ARGS) 68 | { 69 | SParserStatus *status = (SParserStatus *) PG_GETARG_POINTER(0); 70 | char **t = (char **) PG_GETARG_POINTER(1); 71 | int *tlen = 
(int *) PG_GETARG_POINTER(2); 72 | bool found = false, 73 | has_nondigit = false; 74 | 75 | while (status->p < status->end) 76 | { 77 | int p_len = pg_mblen(status->p); 78 | 79 | if (t_isalpha(status->p) || t_isdigit(status->p) || 80 | (p_len == 1 && *status->p == '_')) 81 | { 82 | if (!t_isdigit(status->p)) 83 | has_nondigit = true; 84 | if (!found) 85 | { 86 | *t = status->p; 87 | found = true; 88 | } 89 | } 90 | else 91 | { 92 | if (found) 93 | break; 94 | } 95 | status->p += p_len; 96 | } 97 | 98 | if (found) 99 | { 100 | *tlen = status->p - *t; 101 | if (has_nondigit) 102 | PG_RETURN_INT32(WORD_TOKEN); 103 | else 104 | PG_RETURN_INT32(NUMBER_TOKEN); 105 | } 106 | else 107 | { 108 | PG_RETURN_INT32(0); 109 | } 110 | } 111 | 112 | Datum 113 | sparser_end(PG_FUNCTION_ARGS) 114 | { 115 | SParserStatus *status = (SParserStatus *) PG_GETARG_POINTER(0); 116 | 117 | pfree(status); 118 | PG_RETURN_VOID(); 119 | } 120 | 121 | 122 | Datum 123 | sparser_lextype(PG_FUNCTION_ARGS) 124 | { 125 | LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LAST_TOKEN_NUM + 1)); 126 | int i; 127 | 128 | for (i = 1; i <= LAST_TOKEN_NUM; i++) 129 | { 130 | descr[i - 1].lexid = i; 131 | descr[i - 1].alias = pstrdup(tok_alias[i]); 132 | descr[i - 1].descr = pstrdup(lex_descr[i]); 133 | } 134 | 135 | descr[LAST_TOKEN_NUM].lexid = 0; 136 | 137 | PG_RETURN_POINTER(descr); 138 | } 139 | 140 | typedef struct 141 | { 142 | int nbegin; 143 | int nend; 144 | } CutDict; 145 | 146 | Datum 147 | cutdict_init(PG_FUNCTION_ARGS) 148 | { 149 | List *dictoptions = (List *) PG_GETARG_POINTER(0); 150 | CutDict *d = (CutDict *) palloc0(sizeof(CutDict)); 151 | bool nbegin_loaded = false, 152 | nend_loaded = false; 153 | ListCell *l; 154 | 155 | foreach(l, dictoptions) 156 | { 157 | DefElem *defel = (DefElem *) lfirst(l); 158 | 159 | if (pg_strcasecmp("nbegin", defel->defname) == 0) 160 | { 161 | if (nbegin_loaded) 162 | ereport(ERROR, 163 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 164 | 
errmsg("multiple nbegin parameters"))); 165 | d->nbegin = atoi(defGetString(defel)); 166 | nbegin_loaded = true; 167 | } 168 | else if (pg_strcasecmp("nend", defel->defname) == 0) 169 | { 170 | if (nend_loaded) 171 | ereport(ERROR, 172 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 173 | errmsg("multiple nend parameters"))); 174 | d->nend = atoi(defGetString(defel)); 175 | nend_loaded = true; 176 | } 177 | else 178 | { 179 | ereport(ERROR, 180 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 181 | errmsg("unrecognized cut dictionary parameter: \"%s\"", 182 | defel->defname))); 183 | } 184 | } 185 | 186 | if (!nbegin_loaded || !nend_loaded) 187 | { 188 | ereport(ERROR, 189 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 190 | errmsg("both nbegin and nend parameters of cut dictionary must be specified"))); 191 | } 192 | 193 | PG_RETURN_POINTER(d); 194 | } 195 | 196 | Datum 197 | cutdict_lexize(PG_FUNCTION_ARGS) 198 | { 199 | CutDict *d = (CutDict *) PG_GETARG_POINTER(0); 200 | char *in = (char *) PG_GETARG_POINTER(1); 201 | int32 len = PG_GETARG_INT32(2); 202 | char *txt; 203 | int strlen; 204 | TSLexeme *res; 205 | int residx = 0; 206 | uint16 nvariant = 1; 207 | 208 | res = palloc0(sizeof(TSLexeme) * 4); 209 | txt = lowerstr_with_len(in, len); 210 | strlen = pg_mbstrlen(txt); 211 | 212 | if (strlen <= d->nbegin + d->nend) 213 | { 214 | res[residx].nvariant = nvariant; 215 | res[residx++].lexeme = txt; 216 | nvariant++; 217 | } 218 | 219 | 220 | if (strlen > d->nbegin) 221 | { 222 | int i; 223 | char *p = txt; 224 | 225 | for (i = 0; i < d->nbegin; i++) 226 | p += pg_mblen(p); 227 | res[residx].nvariant = nvariant; 228 | res[residx++].lexeme = pnstrdup(txt, p - txt); 229 | } 230 | 231 | if (strlen > d->nend) 232 | { 233 | int i; 234 | char *p = txt; 235 | 236 | for (i = 0; i < strlen - d->nend; i++) 237 | p += pg_mblen(p); 238 | res[residx].nvariant = nvariant; 239 | res[residx++].lexeme = pstrdup(p); 240 | } 241 | 242 | PG_RETURN_POINTER(res); 243 | } 244 | 
-------------------------------------------------------------------------------- /tsexample.control: -------------------------------------------------------------------------------- 1 | # tsexample extension 2 | comment = 'tsexample - example of custom postgresql full text search parser, dictionaries and configuration' 3 | default_version = '1.0' 4 | module_pathname = '$libdir/tsexample' 5 | relocatable = true 6 | --------------------------------------------------------------------------------