├── .github └── workflows │ └── main.yml ├── CHANGELOG ├── COPYRIGHT ├── META.json ├── Makefile ├── README.md ├── check-alpine.sh ├── check-debian.sh ├── dict.utf8.xdb ├── dict_extra.txt ├── docker ├── alpine │ └── 16 │ │ └── Dockerfile ├── bookworm │ ├── 15 │ │ └── Dockerfile │ └── 16 │ │ └── Dockerfile └── bullseye │ ├── 15 │ └── Dockerfile │ └── 16 │ └── Dockerfile ├── expected ├── zhparser-alpine.out └── zhparser-debian.out ├── rules.utf8.ini ├── sql └── zhparser.sql ├── zhparser--1.0--2.0.sql ├── zhparser--1.0.sql ├── zhparser--2.0--2.1.sql ├── zhparser--2.0.sql ├── zhparser--2.1--2.2.sql ├── zhparser--2.1.sql ├── zhparser--2.2.sql ├── zhparser--2.3.sql ├── zhparser--unpackaged--1.0.sql ├── zhparser-backup-custom-dict.sh ├── zhparser.c ├── zhparser.control └── zhparser.h /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | branches: [ master ] 10 | pull_request: 11 | branches: [ master ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 17 | jobs: 18 | # This workflow contains a single job called "build" 19 | build: 20 | # The type of runner that the job will run on 21 | runs-on: ubuntu-latest 22 | 23 | # Steps represent a sequence of tasks that will be executed as part of the job 24 | steps: 25 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 26 | - uses: actions/checkout@v2 27 | 28 | # Runs a single command using the runners shell 29 | - name: Run a one-line script 30 | run: echo Hello, world! 31 | 32 | # Runs a set of commands using the runners shell 33 | - name: Run a multi-line script 34 | run: | 35 | sudo apt-get -y install postgresql-server-dev-16 36 | wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.3.tar.bz2 | tar jxf - 37 | cd scws-1.2.3 ; ./configure ; sudo make install 38 | cd $GITHUB_WORKSPACE 39 | ls 40 | env 41 | type pg_config 42 | cd $GITHUB_WORKSPACE 43 | pg_config 44 | export PG_CONFIG=/usr/bin/pg_config ; make && sudo make install 45 | echo test, and deploy your project. 46 | 47 | freebsd-test: 48 | runs-on: ubuntu-latest 49 | name: A job to run test in FreeBSD 50 | env: 51 | MYTOKEN : ${{ secrets.MYTOKEN }} 52 | MYTOKEN2: "value2" 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: Test in FreeBSD 56 | id: test 57 | uses: vmactions/freebsd-vm@v1 58 | with: 59 | envs: 'MYTOKEN MYTOKEN2' 60 | usesh: true 61 | prepare: | 62 | pkg install -y scws 63 | pkg install -y postgresql16-server 64 | pkg install -y gmake 65 | pkg install -y git 66 | 67 | run: | 68 | env 69 | freebsd-version 70 | git clone https://github.com/amutu/zhparser.git 71 | cd zhparser ; gmake 72 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2.2 (2021-11-08) 2 | -- move custom word from /base/${DATABASE_ID}/zhprs_dict_${DATABASE_NAME}.txt to /base/zhprs_dict_${DATABASE_NAME}.txt(data don't have /base/${DATABASE_ID} when tablespace is setted) 3 | 4 | 2.1 (2019-04-23) 5 | -- custom word store in DataDir. 
6 | 7 | 2.0 (2019-03-05) 8 | -- support custom words in the table zhparser.zhprs_custom_word 9 | 10 | 0.2.0 (2017-05-28) 11 | -- fix regression test 12 | -- make pg_config configurable 13 | -- optimize doc 14 | 15 | 0.1.5 (2017-05-24) 16 | -- optimize memory usage for zhparser 17 | -- support loading multiple dicts 18 | -- export SCWS settings as GUCs 19 | -- make dict_in_memory and extra_dicts backend-level settings, the others userset 20 | -- fix a bug where an unknown type index went out of range 21 | -- detect dict file type from the file extension 22 | -- add a check script to run the regression tests 23 | -- update scws to version 1.2.3 in the install doc 24 | 25 | 0.1.4 (2013-11-03) 26 | -- fix pgxn meta 27 | 28 | 0.1.3 (2013-11-03) 29 | -- resolve the compile warning for pstrdup 30 | -- add the Chinese dict for SCWS 31 | -- fix the rpath flag which emitted an error on Mac OS 32 | -- update install doc to note the use of gmake on *BSD 33 | -- update install doc to download SCWS from the git url 34 | 35 | 0.1.2 (2013-10-23) 36 | -- resolve the pstrdup name conflict for pg 9.3 37 | 38 | 0.1.1 (2013-02-05) 39 | -- add CHANGELOG 40 | -- format the doc 41 | 42 | 0.1.0 (2013-02-04) 43 | -- initial release on PGXN 44 | -- parse Chinese text encoded in UTF-8 45 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | zhparser 2 | 3 | Portions Copyright (c) 2012-2013, Jov(amutu@amutu.com) 4 | 5 | Permission to use, copy, modify, and distribute this software and its 6 | documentation for any purpose, without fee, and without a written agreement 7 | is hereby granted, provided that the above copyright notice and this 8 | paragraph and the following two paragraphs appear in all copies. 9 | 10 | IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR 11 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 12 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 13 | DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE 14 | POSSIBILITY OF SUCH DAMAGE. 15 | 16 | THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 17 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 18 | AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 19 | ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO 20 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
21 | -------------------------------------------------------------------------------- /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "zhparser", 3 | "abstract": "a parser for full-text search of Chinese", 4 | "description": "Zhparser is a PostgreSQL extension for full-text search of Chinese. It implements a Chinese parser based on the Simple Chinese Word Segmentation (SCWS) library", 5 | "version": "0.2.0", 6 | "maintainer": [ 7 | "Jov <amutu@amutu.com>" 8 | ], 9 | "license": "postgresql", 10 | "prereqs": { 11 | "runtime": { 12 | "requires": { 13 | "PostgreSQL": "9.2.0" 14 | }, 15 | "recommends": { 16 | "PostgreSQL": "9.6.0" 17 | } 18 | } 19 | }, 20 | "provides": { 21 | "zhparser": { 22 | "abstract": "a parser for full-text search of Chinese", 23 | "file": "zhparser--1.0.sql", 24 | "docfile": "README.md", 25 | "version": "0.2.0" 26 | } 27 | }, 28 | "resources": { 29 | "homepage": "http://amutu.com/blog/zhparser/", 30 | "bugtracker": { 31 | "web": "http://github.com/amutu/zhparser/issues/" 32 | }, 33 | "repository": { 34 | "url": "git://github.com/amutu/zhparser.git", 35 | "web": "http://github.com/amutu/zhparser/", 36 | "type": "git" 37 | } 38 | }, 39 | "generated_by": "Jov", 40 | "meta-spec": { 41 | "version": "1.0.0", 42 | "url": "http://pgxn.org/meta/spec.txt" 43 | }, 44 | "release_status": "stable", 45 | "tags": [ 46 | "parser", 47 | "full text search", 48 | "Chinese", 49 | "dictionary" 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # contrib/zhparser/Makefile 2 | 3 | MODULE_big = zhparser 4 | OBJS = zhparser.o 5 | 6 | EXTENSION = zhparser 7 | DATA = zhparser--1.0.sql zhparser--unpackaged--1.0.sql \ 8 | zhparser--1.0--2.0.sql zhparser--2.0.sql \ 9 | zhparser--2.0--2.1.sql zhparser--2.1.sql zhparser--2.1--2.2.sql \ 10 | zhparser--2.2.sql zhparser--2.3.sql 11 | DATA_TSEARCH = dict.utf8.xdb rules.utf8.ini 12 | 13 | REGRESS = zhparser 14 | 15 | SCWS_HOME ?= /usr/local 16 | PG_CPPFLAGS = -I$(SCWS_HOME)/include/scws 17 | SHLIB_LINK = -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib 18 | 19 | PG_CONFIG ?= pg_config 20 | PGXS := $(shell $(PG_CONFIG) --pgxs) 21 | include $(PGXS) 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Zhparser 2 | ======== 3 | 4 | Zhparser is a PostgreSQL extension for full-text search of the Chinese language (Mandarin Chinese). It implements a Chinese language parser based on 5 | the [Simple Chinese Word Segmentation (SCWS)](https://github.com/hightman/scws). 6 | 7 | Project home page: http://blog.amutu.com/zhparser/ 8 | 9 | **注意**：对于分词结果不满意的，或者需要调试分词结果的，可以在这个页面调试：http://www.xunsearch.com/scws/demo/v48.php 10 | 11 | Docker快速体验 12 | ------- 13 | run the container: 14 | > docker run --name pgzhparser -d -e POSTGRES_PASSWORD=somepassword zhparser/zhparser:bookworm-16 15 | 16 | log in to the postgres database as user postgres: 17 | > docker exec -it pgzhparser psql postgres postgres 18 | 19 | create the extension and use it: 20 | > CREATE EXTENSION zhparser; 21 | > CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 22 | > ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 23 | > SELECT * FROM ts_parse('zhparser', 'hello world!
2010年保障房建设在全国范围内获全面启动'); 24 | 25 | you will get: 26 | tokid | token 27 | -------+------- 28 | 101 | hello 29 | 101 | world 30 | 117 | ! 31 | 101 | 2010 32 | 113 | 年 33 | 118 | 保障 34 | 110 | 房建 35 | 118 | 设在 36 | 110 | 全国 37 | 110 | 范围 38 | 102 | 内 39 | 118 | 获 40 | 97 | 全面 41 | 118 | 启动 42 | (14 行记录) 43 | 44 | 更多docker镜像信息,访问这里:[zhparser的dockerub](https://hub.docker.com/r/zhparser/zhparser) 45 | zhparser的docker镜像基于PostgreSQL的docker官方镜像构建,更多的用法参见:https://hub.docker.com/_/postgres 46 | 47 | INSTALL 48 | ------- 49 | 0.前置条件 50 | 51 | zhparser支持PostgreSQL 9.2及以上版本,请确保你的PG版本符合要求。 52 | 对于REDHAT/CentOS Linux系统,请确保安装了相关的库和头文件,一般它们在postgresql-devel软件包中。 53 | 54 | 1.安装SCWS 55 | 56 | ``` 57 | wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.3.tar.bz2 | tar xf - 58 | 59 | cd scws-1.2.3 ; ./configure ; make install 60 | 61 | 注意:在FreeBSD release 10及以上版本上运行configure时,需要增加--with-pic选项。 62 | 63 | 如果是从github上下载的scws源码需要先运行以下命令生成configure文件: 64 | 65 | touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing 66 | 67 | ``` 68 | 2.下载zhparser源码 69 | 70 | ``` 71 | git clone https://github.com/amutu/zhparser.git 72 | 73 | ``` 74 | 3.编译和安装zhparser 75 | 76 | ``` 77 | make && make install 78 | 79 | ``` 80 | 81 | 如果scws的路径不在默认的 /usr/local 下,可以设置SCWS_HOME 例如: ` SCWS_HOME=/usr make && make install ` 82 | 83 | 如果你同时安装了多个版本的PostgreSQL, 可以通过指定 PG\_CONFIG 来为指定的版本编译扩展: 84 | 85 | ``` 86 | PG_CONFIG=/usr/lib/postgresql/9.5/bin/pg_config make && make install 87 | 88 | ``` 89 | 90 | 注意:在*BSD上编译安装时,使用gmake代替make 91 | 92 | 4.创建extension 93 | 94 | ``` 95 | psql dbname superuser -c 'CREATE EXTENSION zhparser' 96 | 97 | ``` 98 | 99 | CONFIGURATION 100 | ------- 101 | 以下配置在PG9.2及以上版本使用,这些选项是用来控制字典加载行为和分词行为的,这些选项都不是必须的,默认都为false(即如果没有在配置文件中设置这些选项,则zhparser的行为与将下面的选项设置为false一致)。 102 | 103 | 忽略所有的标点等特殊符号: 104 | zhparser.punctuation_ignore = f 105 | 106 | 闲散文字自动以二字分词法聚合: 107 | zhparser.seg_with_duality = f 108 | 109 | 将词典全部加载到内存里: 110 | zhparser.dict_in_memory = f 111 | 112 | 短词复合: 113 | zhparser.multi_short = f 114 | 115 | 散字二元复合: 116 | zhparser.multi_duality = f 117 | 118 | 重要单字复合: 119 | zhparser.multi_zmain = f 120 | 121 | 全部单字复合: 122 | zhparser.multi_zall = f 123 | 124 | 除了zhparser自带的词典,用户可以增加自定义词典,自定义词典的优先级高于自带的词典。自定义词典的文件必须放在share/tsearch_data目录中,zhparser根据文件扩展名确定词典的格式类型,.txt扩展名表示词典是文本格式,.xdb扩展名表示这个词典是xdb格式,多个文件使用逗号分隔,词典的分词优先级由低到高,如: 125 | 126 | zhparser.extra_dicts = 'dict_extra.txt,mydict.xdb' 127 | 128 | 注意:zhparser.extra_dicts和zhparser.dict_in_memory两个选项需要在backend启动前设置(可以在配置文件中修改然后reload,之后新建连接会生效),其他选项可以随时在session中设置生效。zhparser的选项与scws相关的选项对应,关于这些选项的含义,可以参考scws的文档:http://www.xunsearch.com/scws/docs.php#libscws 129 | 130 | EXAMPLE 131 | ------- 132 | ``` 133 | -- create the extension 134 | 135 | CREATE EXTENSION zhparser; 136 | 137 | -- make test configuration using parser 138 | 139 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 140 | 141 | -- add token mapping 142 | 143 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 144 | 145 | -- ts_parse 146 | 147 | SELECT * FROM ts_parse('zhparser', 'hello world! 
2010年保障房建设在全国范围内获全面启动，从中央到地方纷纷加大 了保障房的建设和投入力度 。2011年，保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示，要继续推进保障性安居工程建设。'); 148 | 149 | -- test to_tsvector 150 | 151 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调，但实际的年度在建规模以及竣工规模会超以往年份，相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来，与2011年相比，2012年的保障房建设在资金配套上的压力将更为严峻。'); 152 | 153 | -- test to_tsquery 154 | 155 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 156 | ``` 157 | 158 | 自定义词库 159 | ------- 160 | **详解 TXT 词库的写法 (TXT词库目前已兼容 cli/scws_gen_dict 所用的文本词库)** 161 | 162 | 1) 每行一条记录，以 # 或分号开头的相当于注释，忽略跳过 163 | 164 | 2) 每行由4个字段组成，依次为“词语”(由中文字或3个以下的字母合成)、“TF”、“IDF”、“词性”，字段使用空格或制表符分开，数量不限，可自行对齐以美化 165 | 166 | 3) 除“词语”外，其它字段可忽略不写。若忽略，TF和IDF默认值为 1.0 而词性为 “@” 167 | 168 | 4) 由于 TXT 库动态加载(内部监测文件修改时间自动转换成 xdb 存于系统临时目录)，故建议TXT词库不要过大 169 | 170 | 5) 删除词做法，请将词性设为“!”，则表示该词设为无效，即使在其它核心库中存在该词也视为无效 171 | 172 | 注意：1.自定义词典的格式可以是文本TXT，也可以是二进制的XDB格式。XDB格式效率更高，适合大辞典使用。可以使用scws自带的工具scws-gen-dict将文本词典转换为XDB格式；2.zhparser默认的词典是简体中文，如果需要繁体中文，可以在[这里](http://www.xunsearch.com/scws/download.php)下载已经生成好的XDB格式词典。3.自定义词典的例子可以参考[dict_extra.txt](https://github.com/amutu/zhparser/blob/master/dict_extra.txt)。更多信息参见[SCWS官方文档](http://www.xunsearch.com/scws/docs.php#utilscws)。 173 | 174 | 自定义词库 2.1 175 | ------- 176 | **自定义词库 2.1 增加自定义词库的易用性，并兼容 1.0 提供的功能** 177 | 178 | 179 | 自定义词库需要superuser权限，自定义词库是数据库级别的(不是实例级)，每个数据库拥有自己的自定义分词，并存储在data目录下base/数据库ID下(2.0 版本存储在share/tsearch_data下) 180 | 181 | 生产环境版本升级(新环境直接安装就可以): 182 | alter extension zhparser update; 183 | ``` 184 | test=# SELECT * FROM ts_parse('zhparser', '保障房资金压力'); 185 | tokid | token 186 | -------+------- 187 | 118 | 保障 188 | 110 | 房 189 | 110 | 资金 190 | 110 | 压力 191 | 192 | test=# insert into zhparser.zhprs_custom_word values('资金压力'); 193 | --删除词: insert into zhparser.zhprs_custom_word(word, attr) values('word', '!'); 194 | --\d zhparser.zhprs_custom_word 查看其表结构，支持 TF、IDF 195 | test=# select sync_zhprs_custom_word(); 196 | sync_zhprs_custom_word 197 | ------------------------ 198 | 199 | (1 row) 200 | 201 | test=# \q --sync 后重新建立连接 202 | [lzzhang@lzzhang-pc bin]$ ./psql -U lzzhang -d test -p 1600 203 | test=# SELECT * FROM ts_parse('zhparser', '保障房资金压力'); 204 | tokid | token 205 | -------+---------- 206 | 118 | 保障 207 | 110 | 房 208 | 120 | 资金压力 209 | ``` 210 | 211 | 212 | COPYRIGHT 213 | -------- 214 | 215 | zhparser 216 | 217 | Portions Copyright (c) 2012-2017, Jov(amutu@amutu.com) 218 | 219 | Permission to use, copy, modify, and distribute this software and its documentation 220 | for any purpose, without fee, and without a written agreement is hereby granted, 221 | provided that the above copyright notice and this paragraph and the following 222 | two paragraphs appear in all copies. 223 | 224 | IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR 225 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 226 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 227 | DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE 228 | POSSIBILITY OF SUCH DAMAGE. 229 | 230 | THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 231 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 232 | AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 233 | ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO 234 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
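Supplementary usage sketch: the snippet below assumes the `testzhcfg` configuration created in the EXAMPLE section above; the table `docs`, its sample row, the index name and the session-level GUC values are illustrative only and are not shipped with zhparser.

```sql
-- session-level tuning: userset options described in CONFIGURATION
-- (e.g. punctuation_ignore, multi_short) can be changed per session
SET zhparser.punctuation_ignore = on;
SET zhparser.multi_short = on;

-- a typical full-text search setup on top of the testzhcfg configuration
CREATE TABLE docs (id serial PRIMARY KEY, body text);
CREATE INDEX docs_body_fts_idx ON docs USING gin (to_tsvector('testzhcfg', body));

INSERT INTO docs (body) VALUES ('2011年，保障房进入了更大规模的建设阶段。');

-- the query side uses the same configuration, so segmentation matches the index
SELECT id, body
  FROM docs
 WHERE to_tsvector('testzhcfg', body) @@ to_tsquery('testzhcfg', '保障房');
```

Because `to_tsvector('testzhcfg', body)` names the configuration explicitly, the expression is immutable and the GIN expression index above can serve this query.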
235 | 236 | [![Powered by DartNode](https://dartnode.com/branding/DN-Open-Source-sm.png)](https://dartnode.com "Powered by DartNode - Free VPS for Open Source") 237 | -------------------------------------------------------------------------------- /check-alpine.sh: -------------------------------------------------------------------------------- 1 | pid=$$ 2 | docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@alpine zhparser/zhparser:alpine-16 3 | sleep 5 4 | export PGPASSWORD=somepassword@alpine 5 | psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-alpine.out - 6 | 7 | if [ $? -eq 0 ] 8 | then 9 | echo "pass!" 10 | else 11 | echo "do not pass!" 12 | fi 13 | docker stop testpgzhparser-$pid 14 | -------------------------------------------------------------------------------- /check-debian.sh: -------------------------------------------------------------------------------- 1 | pid=$$ 2 | docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@debian-16 zhparser/zhparser:bookworm-16 3 | sleep 5 4 | export PGPASSWORD=somepassword@debian-16 5 | psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-debian.out - 6 | 7 | if [ $? -eq 0 ] 8 | then 9 | echo "pass!" 10 | else 11 | echo "do not pass!" 12 | fi 13 | docker stop testpgzhparser-$pid 14 | -------------------------------------------------------------------------------- /dict.utf8.xdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amutu/zhparser/8b04302dc2011c12ef87211a0527bbf41830e97e/dict.utf8.xdb -------------------------------------------------------------------------------- /dict_extra.txt: -------------------------------------------------------------------------------- 1 | ; dict_extra.txt 2 | 我是新增词 2.0 3 | 再试一个 1.0 1.0 @ 4 | ; 以下词为删除项 5 | 删除 1.0 1.0 ! 
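; format note (comment only, not loaded as an entry): each line is 词语 TF IDF 词性; a hypothetical noun entry would read: 云计算 5.0 4.0 n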
6 | -------------------------------------------------------------------------------- /docker/alpine/16/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=16 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-alpine as builder 3 | 4 | RUN set -ex \ 5 | && apk --no-cache add git build-base linux-headers make postgresql-dev automake libtool autoconf m4 6 | 7 | RUN set -ex \ 8 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 9 | && cd scws \ 10 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 11 | && ./configure \ 12 | && make install 13 | 14 | RUN set -ex \ 15 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 16 | && cd zhparser \ 17 | && make install 18 | 19 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-alpine 20 | ENV LANG zh_CN.UTF-8 21 | 22 | COPY --from=builder /usr/local/lib/postgresql/zhparser.so /usr/local/lib/postgresql/ 23 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 24 | COPY --from=builder /usr/local/share/postgresql/extension/zhparser* /usr/local/share/postgresql/extension/ 25 | COPY --from=builder /usr/local/lib/postgresql/bitcode/zhparser* /usr/local/lib/postgresql/bitcode/ 26 | COPY --from=builder /usr/local/share/postgresql/tsearch_data/*.utf8.* /usr/local/share/postgresql/tsearch_data/ 27 | -------------------------------------------------------------------------------- /docker/bookworm/15/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=15 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/dict.utf8.xdb /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /docker/bookworm/16/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=16 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && 
apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/*.utf8.* /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /docker/bullseye/15/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=15 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/*.utf8.* /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /docker/bullseye/16/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=16 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils 
automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/*.utf8.* /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /expected/zhparser-alpine.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION zhparser; 2 | -- make test configuration using parser 3 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 4 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 5 | -- ts_parse 6 | SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); 7 | tokid | token 8 | -------+---------- 9 | 101 | hello 10 | 101 | world 11 | 117 | ! 
12 | 101 | 2010 13 | 113 | 年 14 | 118 | 保障 15 | 110 | 房建 16 | 118 | 设在 17 | 110 | 全国 18 | 110 | 范围 19 | 102 | 内 20 | 118 | 获 21 | 97 | 全面 22 | 118 | 启动 23 | 117 | , 24 | 110 | 从中 25 | 118 | 央 26 | 118 | 到 27 | 110 | 地方 28 | 100 | 纷纷 29 | 118 | 加大 30 | 118 | 了 31 | 118 | 保 32 | 110 | 障 33 | 110 | 房 34 | 117 | 的 35 | 118 | 建 36 | 118 | 设 37 | 99 | 和 38 | 118 | 投 39 | 118 | 入 40 | 110 | 力 41 | 107 | 度 42 | 117 | 。 43 | 101 | 2011 44 | 113 | 年 45 | 117 | , 46 | 118 | 保障 47 | 110 | 房 48 | 118 | 进入 49 | 118 | 了 50 | 100 | 更 51 | 110 | 大规模 52 | 117 | 的 53 | 118 | 建设 54 | 110 | 阶段 55 | 117 | 。 56 | 110 | 住房 57 | 110 | 城乡建设 58 | 110 | 部党组 59 | 110 | 书记 60 | 117 | 、 61 | 110 | 部长 62 | 110 | 姜 63 | 110 | 伟 64 | 97 | 新 65 | 116 | 去年底 66 | 112 | 在 67 | 110 | 全国 68 | 110 | 住房 69 | 110 | 城乡建设 70 | 118 | 工作 71 | 110 | 会议 72 | 110 | 上表 73 | 118 | 示 74 | 117 | , 75 | 118 | 要 76 | 118 | 继续 77 | 118 | 推进 78 | 110 | 保障性 79 | 118 | 安居 80 | 110 | 工程建设 81 | 117 | 。 82 | (73 rows) 83 | 84 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); 85 | to_tsvector 86 | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 87 | '2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '对应':17 '年份':16 '年度':9 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19 88 | (1 row) 89 | 90 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 91 | to_tsquery 92 | --------------------------------------- 93 | '保障' <-> '房' <-> '资金' <-> '压力' 94 | (1 row) 95 | 96 | -- clean extension 97 | DROP EXTENSION zhparser CASCADE; 98 | -------------------------------------------------------------------------------- /expected/zhparser-debian.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION zhparser; 2 | -- make test configuration using parser 3 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 4 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 5 | -- ts_parse 6 | SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); 7 | tokid | token 8 | -------+---------- 9 | 101 | hello 10 | 101 | world 11 | 117 | ! 
12 | 101 | 2010 13 | 113 | 年 14 | 118 | 保障 15 | 110 | 房建 16 | 118 | 设在 17 | 110 | 全国 18 | 110 | 范围 19 | 102 | 内 20 | 118 | 获 21 | 97 | 全面 22 | 118 | 启动 23 | 117 | , 24 | 112 | 从 25 | 110 | 中央 26 | 118 | 到 27 | 110 | 地方 28 | 100 | 纷纷 29 | 118 | 加大 30 | 118 | 了 31 | 118 | 保 32 | 110 | 障 33 | 110 | 房 34 | 117 | 的 35 | 118 | 建 36 | 118 | 设 37 | 99 | 和 38 | 118 | 投 39 | 118 | 入 40 | 110 | 力 41 | 107 | 度 42 | 117 | 。 43 | 101 | 2011 44 | 113 | 年 45 | 117 | , 46 | 118 | 保障 47 | 110 | 房 48 | 118 | 进入 49 | 118 | 了 50 | 100 | 更 51 | 110 | 大规模 52 | 117 | 的 53 | 118 | 建设 54 | 110 | 阶段 55 | 117 | 。 56 | 110 | 住房 57 | 110 | 城乡建设 58 | 110 | 部党组 59 | 110 | 书记 60 | 117 | 、 61 | 110 | 部长 62 | 110 | 姜 63 | 110 | 伟 64 | 97 | 新 65 | 116 | 去年底 66 | 112 | 在 67 | 110 | 全国 68 | 110 | 住房 69 | 110 | 城乡建设 70 | 118 | 工作 71 | 110 | 会议 72 | 118 | 上 73 | 118 | 表示 74 | 117 | , 75 | 118 | 要 76 | 118 | 继续 77 | 118 | 推进 78 | 110 | 保障性 79 | 118 | 安居 80 | 110 | 工程建设 81 | 117 | 。 82 | (73 rows) 83 | 84 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); 85 | to_tsvector 86 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 87 | '2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '年份':16 '年度':9 '应':17 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19 88 | (1 row) 89 | 90 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 91 | to_tsquery 92 | --------------------------------------- 93 | '保障' <-> '房' <-> '资金' <-> '压力' 94 | (1 row) 95 | 96 | -- clean extension 97 | DROP EXTENSION zhparser CASCADE; 98 | -------------------------------------------------------------------------------- /rules.utf8.ini: -------------------------------------------------------------------------------- 1 | ; 2 | ; auto regular(utf-8) 3 | ; $Id$ 4 | ; 5 | ; special word, 特殊词汇 6 | ; 7 | 8 | [special] 9 | C++ 10 | C# 11 | R&B 12 | P&G 13 | J++ 14 | J# 15 | UTF-8 16 | PS/2 17 | 18 | ; 19 | ; nostats 20 | ; 21 | [nostats] 22 | about 23 | all 24 | also 25 | an 26 | and 27 | any 28 | are 29 | as 30 | at 31 | be 32 | but 33 | by 34 | both 35 | can 36 | for 37 | from 38 | have 39 | here 40 | if 41 | in 42 | is 43 | it 44 | no 45 | not 46 | of 47 | on 48 | or 49 | our 50 | out 51 | that 52 | the 53 | this 54 | to 55 | up 56 | us 57 | 58 | ; 59 | ; 词性语法规则表 60 | ; 61 | [attrs] 62 | ; c 是连词 63 | n + f(1) = 300 64 | n + m(1) = 500 65 | n(1) + v = 100 66 | n + v(1) = 10 67 | r + n(1) = 1000 68 | r(1) + n = 100 69 | d(1) + r = 100 70 | d(1) + v = 100 71 | v(1) + r = 100 72 | n + m(1) = 500 73 | v + f(1) = 30 74 | v(1) + m = 100 75 | v(1) + n = 3 76 | a + u(1) = 5 77 | v + n(1) = 5 78 | u(1) + a = 2 79 | c(1) + * = 50 80 | * + c(1) = 50 81 | 82 | ; 83 | ; 名字停用词表 84 | ; 85 | [noname] 86 | :line = no 87 | 给的说对在和是被最所那这有将 88 | 你会与他为不没很了啊哦呵把去 89 | 90 | ; 91 | ; 双字节符号 92 | ; 93 | [symbol] 94 | :type = none 95 | :line = no 96 | `-=[]、‘;/。,|?》《:“{}+—)(*…%¥#·!~ 97 | ’”〕〈〉「」『』〖〗【】<> 98 | 99 | ; 100 | ; 姓和外文名共同部分 101 | ; 102 | [pubname] 103 | :type = prefix 104 | :line = no 105 | :exclude = noname,symbol,alpha,chnum2 106 | :znum = 1,2 107 | :tf = 5.0 108 | :idf = 3.5 109 | :attr = nr 110 
| 艾安贝卜戴费福盖戈古赫华霍吉贾金柯赖劳雷黎利林卢 111 | 鲁伦罗洛马麦米莫穆齐乔冉萨沙史斯温谢尤詹诸 112 | 113 | 114 | [pubname2] 115 | :type = prefix 116 | :line = no 117 | :exclude = noname,symbol,alpha,chnum2 118 | :tf = 5.0 119 | :idf = 3.5 120 | :attr = nr 121 | 伍陆 122 | 123 | [pubname3] 124 | :type = prefix 125 | :line = no 126 | :exclude = noname,symbol,alpha,chnum2 127 | :tf = 5.0 128 | :idf = 3.5 129 | :attr = nr 130 | 万章 131 | 132 | ; 133 | ; 单姓 134 | ; 135 | [surname] 136 | :type = prefix 137 | :line = no 138 | :exclude = noname,symbol,alpha,chnum2 139 | :tf = 5.0 140 | :idf = 3.5 141 | :attr = nr 142 | :znum = 1,2 143 | 144 | 敖白班包宝保鲍毕边卞柏蔡曹岑柴昌常车陈成程迟池褚 145 | 楚储淳崔刀邓狄刁丁董窦杜端段樊范方房斐丰封冯凤伏 146 | 傅甘高耿龚宫勾苟辜谷顾官关管桂郭韩杭郝禾何贺衡洪 147 | 侯胡花黄稽姬纪季简翦姜江蒋焦晋靳荆居康空孔匡邝况 148 | 蓝郎朗乐冷李理厉励连廉练良梁廖凌刘柳隆龙楼娄吕路 149 | 骆麻满茅毛梅孟苗缪闵明牟倪聂牛钮农潘庞裴彭皮朴平 150 | 蒲溥浦戚祁钱强秦丘邱仇裘屈瞿权饶任荣容阮瑞芮赛单 151 | 商邵佘申沈盛石寿舒宋苏孙邰谭谈汤唐陶滕田佟仝屠涂 152 | 汪王危韦魏卫蔚闻翁巫邬武吴奚习夏鲜席冼项萧解辛邢 153 | 幸熊徐许宣薛荀颜阎言严彦晏燕杨阳姚叶蚁易殷银尹应 154 | 英游于於鱼虞俞余禹喻郁尉袁岳云臧曾查翟湛张赵甄郑 155 | 钟周朱竺祝庄卓宗邹祖左肖 156 | 157 | ; 158 | ; 复姓 159 | ; 160 | [surname2] 161 | :type = prefix 162 | :line = yes 163 | :exclude = noname,symbol,alpha,chnum2 164 | :tf = 5.0 165 | :idf = 3.5 166 | :attr = nr 167 | :znum = 1, 2 168 | 东郭 169 | 公孙 170 | 皇甫 171 | 慕容 172 | 欧阳 173 | 单于 174 | 司空 175 | 司马 176 | 司徒 177 | 澹台 178 | 诸葛 179 | 180 | ; 181 | ; 地点名称 182 | ; 183 | [areaname] 184 | :type = suffix 185 | :znum = 2 186 | :exclude = noname,symbol,alpha,chnum2 187 | :tf = 4.5 188 | :idf = 3.0 189 | :attr = ns 190 | :line = no 191 | 192 | 县市镇村乡区 193 | 194 | ; 195 | ; 双字地点名称 196 | ; 197 | [areaname2] 198 | :type = suffix 199 | :znum = 2 200 | :exclude = noname,symbol,alpha,chnum2 201 | :tf = 4.5 202 | :idf = 3.0 203 | :attr = ns 204 | :line = yes 205 | 东路 206 | 西路 207 | 支路 208 | 街道 209 | 南路 210 | 北路 211 | 212 | 213 | [munit] 214 | :type = none 215 | :line = no 216 | 萬亿零年点分秒回节名个多届次集 217 | 218 | [chnum0] 219 | :type = prefix 220 | :line = no 221 | :tf = 2.5 222 | :idf = 1.0 223 | :attr = mt 224 | :include = chnum2,chnum3,munit,pubname3 225 | 0 226 | 227 | [chnum1] 228 | :type = prefix 229 | :include = chnum0,chnum1,munit,pubname3 230 | :tf = 3.0 231 | :idf = 1.0 232 | :attr = mt 233 | :line = no 234 | 一二三四五六七八九十百千 235 | 236 | [chnum2] 237 | :type = prefix 238 | :line = no 239 | :tf = 3.0 240 | :idf = 1.0 241 | :attr = mt 242 | :include = chnum0,chnum2,chnum3,munit,pubname3 243 | 123456789 244 | 245 | [chnum3] 246 | :type = none 247 | :line = no 248 | . 
249 | 250 | [chnum4] 251 | :type = prefix 252 | :line = no 253 | :tf = 3.0 254 | :idf = 1.0 255 | :attr = mt 256 | :include = chnum4,munit,pubname2,pubname3 257 | 258 | 壹贰叁肆柒捌玖拾佰仟 259 | 260 | [chnum5] 261 | :type = prefix 262 | :line = no 263 | :tf = 3.5 264 | :idf = 2.0 265 | :attr = nz 266 | :include = chnum1,munit,pubname3,chnum2 267 | 268 | 第每 269 | 270 | [alpha] 271 | :type = prefix 272 | :line = no 273 | :tf = 2.5 274 | :idf = 1.0 275 | :attr = en 276 | :include = alpha 277 | 278 | abcdefghijklmnopqrstuvwxyz 279 | ABCDEFGHIJKLMNOPQRSTUVWXYZ' 280 | 281 | [foregin] 282 | :type = prefix 283 | :line = no 284 | :tf = 4.0 285 | :idf = 3.0 286 | :attr = nr 287 | :include = foregin,pubname,pubname2,pubname3 288 | 阿克拉加内亚巴尔姆爱兰西伊杰纳布可夫勒特坦芬尼根登都 289 | 伯泰胥俄科索沃森奥瓦茨普蒂塞维大莱德冈墨哥弗库澳哈兹 290 | 乌奇切诺里基延达塔卡雅来波迈蓬什比摩曼乃休合娜迪凯帕 291 | 桑佩蒙博托格泽及希匹印埃努烈累法图喀土腓耶逊宾 292 | -------------------------------------------------------------------------------- /sql/zhparser.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION zhparser; 2 | 3 | -- make test configuration using parser 4 | 5 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 6 | 7 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 8 | 9 | -- ts_parse 10 | 11 | SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); 12 | 13 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); 14 | 15 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 16 | 17 | -- clean extension 18 | 19 | DROP EXTENSION zhparser CASCADE; 20 | -------------------------------------------------------------------------------- /zhparser--1.0--2.0.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_getsharepath() 2 | RETURNS text 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE SCHEMA zhparser; 7 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 8 | 9 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 10 | $$ 11 | declare 12 | dict_path text; 13 | time_tag_path text; 14 | query text; 15 | begin 16 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.txt' into dict_path; 17 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.tag' into time_tag_path; 18 | 19 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 20 | execute query; 21 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 22 | execute query; 23 | end; 24 | $$; 25 | 26 | select sync_zhprs_custom_word(); 27 | -------------------------------------------------------------------------------- /zhparser--1.0.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 
'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | -------------------------------------------------------------------------------- /zhparser--2.0--2.1.sql: -------------------------------------------------------------------------------- 1 | drop function zhprs_getsharepath(); 2 | 3 | CREATE or REPLACE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 4 | $$ 5 | declare 6 | database_oid text; 7 | data_dir text; 8 | dict_path text; 9 | time_tag_path text; 10 | query text; 11 | begin 12 | select setting from pg_settings where name='data_directory' into data_dir; 13 | select oid from pg_database where datname=current_database() into database_oid; 14 | 15 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 16 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 17 | 18 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 19 | execute query; 20 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 21 | execute query; 22 | end; 23 | $$; 24 | 25 | select sync_zhprs_custom_word(); 26 | -------------------------------------------------------------------------------- /zhparser--2.0.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | CREATE FUNCTION zhprs_getsharepath() 30 | RETURNS text 31 | AS 'MODULE_PATHNAME' 32 | LANGUAGE C STRICT; 33 | 34 | CREATE SCHEMA zhparser; 35 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 36 | 37 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 38 | $$ 39 | declare 40 | dict_path text; 41 | time_tag_path text; 42 | query text; 43 | begin 44 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.txt' into dict_path; 45 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.tag' into time_tag_path; 46 | 47 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 48 | execute query; 49 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 50 | execute query; 51 | end; 52 | $$; 53 | 54 | select 
sync_zhprs_custom_word(); 55 | -------------------------------------------------------------------------------- /zhparser--2.1--2.2.sql: -------------------------------------------------------------------------------- 1 | CREATE or REPLACE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 2 | $$ 3 | declare 4 | data_dir text; 5 | dict_path text; 6 | time_tag_path text; 7 | query text; 8 | begin 9 | select setting from pg_settings where name='data_directory' into data_dir; 10 | 11 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 12 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 13 | 14 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 15 | execute query; 16 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 17 | execute query; 18 | end; 19 | $$; 20 | 21 | -- do not created custom dict files when fresh installed 22 | -- select sync_zhprs_custom_word(); 23 | -------------------------------------------------------------------------------- /zhparser--2.1.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | 30 | CREATE SCHEMA zhparser; 31 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 32 | 33 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 34 | $$ 35 | declare 36 | database_oid text; 37 | data_dir text; 38 | dict_path text; 39 | time_tag_path text; 40 | query text; 41 | begin 42 | select setting from pg_settings where name='data_directory' into data_dir; 43 | select oid from pg_database where datname=current_database() into database_oid; 44 | 45 | 46 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 47 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 48 | 49 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 50 | execute query; 51 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 52 | execute query; 53 | end; 54 | $$; 55 | 56 | select sync_zhprs_custom_word(); 57 | -------------------------------------------------------------------------------- /zhparser--2.2.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION 
zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | 30 | CREATE SCHEMA zhparser; 31 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 32 | 33 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 34 | $$ 35 | declare 36 | data_dir text; 37 | dict_path text; 38 | time_tag_path text; 39 | query text; 40 | begin 41 | select setting from pg_settings where name='data_directory' into data_dir; 42 | 43 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 44 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 45 | 46 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 47 | execute query; 48 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 49 | execute query; 50 | end; 51 | $$; 52 | 53 | -- do not created custom dict files when fresh installed 54 | -- select sync_zhprs_custom_word(); 55 | -------------------------------------------------------------------------------- /zhparser--2.3.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | 30 | CREATE SCHEMA zhparser; 31 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', 32 | idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 33 | 34 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 35 | $$ 36 | declare 37 | data_dir text; 38 | dict_path text; 39 | time_tag_path text; 40 | query text; 41 | begin 42 | select setting from pg_settings where name='data_directory' into data_dir; 43 | 44 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 45 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 46 | 47 | query = $q$copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to '$q$ || dict_path || $q$' encoding 'utf8' $q$; 48 | execute query; 49 | query = $q$copy (select now()) to '$q$ || time_tag_path || $q$'$q$; 50 | execute query; 51 | end; 52 | $$; 53 | 54 | -- do 
not create custom dict files on a fresh install 55 | -- select sync_zhprs_custom_word(); 56 | -------------------------------------------------------------------------------- /zhparser--unpackaged--1.0.sql: -------------------------------------------------------------------------------- 1 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 2 | \echo Use "CREATE EXTENSION zhparser" to load this file. \quit 3 | 4 | ALTER EXTENSION zhparser ADD function zhprs_start(internal,integer); 5 | ALTER EXTENSION zhparser ADD function zhprs_getlexeme(internal,internal,internal); 6 | ALTER EXTENSION zhparser ADD function zhprs_end(internal); 7 | ALTER EXTENSION zhparser ADD function zhprs_lextype(internal); 8 | ALTER EXTENSION zhparser ADD text search parser zhparser; 9 | -------------------------------------------------------------------------------- /zhparser-backup-custom-dict.sh: -------------------------------------------------------------------------------- 1 | if [ $# -lt 2 ];then 2 | echo "usage: $0 <backup|delete|restore> <pgdata_dir> [restore_from_dir]" 3 | echo "warning: delete is a dangerous cmd, it will delete your custom dict files from pgdata_dir." 4 | echo "!!! you should run the backup cmd first, then run the delete cmd !!!" 5 | exit 2 6 | fi 7 | cmd=$1 8 | pgdata=$2 9 | restore_from_dir=$3 10 | 11 | if [ $cmd = 'backup' ];then 12 | backup_dir=zhparser-backup-custom-dict-$(date +'%F:%T') 13 | mkdir ./$backup_dir 14 | echo "will backup $pgdata/base/zhprs_dict_* to $backup_dir/" 15 | cp -a $pgdata/base/zhprs_dict_* $backup_dir/ 16 | if [ "$?" -ne 0 ]; 17 | then 18 | echo "backup error!" 19 | exit 1 20 | else 21 | echo "backup ok!" 22 | fi 23 | fi 24 | 25 | if [ $cmd = 'delete' ];then 26 | echo "will delete $pgdata/base/zhprs_dict_*" 27 | rm $pgdata/base/zhprs_dict_* 28 | if [ "$?" -ne 0 ]; 29 | then 30 | echo "delete error!" 31 | exit 1 32 | else 33 | echo "delete ok!" 34 | fi 35 | fi 36 | 37 | if [ $cmd = 'restore' ];then 38 | echo "will restore $restore_from_dir/zhprs_dict_* to $pgdata/base/" 39 | cp -a $restore_from_dir/zhprs_dict_* $pgdata/base/ 40 | if [ "$?" -ne 0 ]; 41 | then 42 | echo "restore error!" 43 | exit 1 44 | else 45 | echo "restore ok!"
46 | fi 47 | fi 48 | -------------------------------------------------------------------------------- /zhparser.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * zhparser.c 4 | * a text search parser for Chinese 5 | * 6 | *------------------------------------------------------------------------- 7 | */ 8 | #include "zhparser.h" 9 | 10 | #include "postgres.h" 11 | #include "miscadmin.h" 12 | #include "fmgr.h" 13 | #include "utils/guc.h" 14 | #include "utils/builtins.h" 15 | 16 | #if PG_VERSION_NUM >= 100000 17 | #include "utils/varlena.h" 18 | #endif 19 | 20 | #include "commands/dbcommands.h" 21 | 22 | /* dict file extension */ 23 | #define TXT_EXT ".txt" 24 | #define XDB_EXT ".xdb" 25 | /* length of file extension */ 26 | #define EXT_LEN 4 27 | 28 | PG_MODULE_MAGIC; 29 | /* 30 | * types 31 | */ 32 | 33 | /* self-defined type */ 34 | typedef struct 35 | { 36 | char *buffer; /* text to parse */ 37 | int len; /* length of the text in buffer */ 38 | int pos; /* position of the parser */ 39 | scws_t scws; 40 | scws_res_t head; 41 | scws_res_t curr; 42 | } ParserState; 43 | 44 | /* copy-paste from wparser.h of tsearch2 */ 45 | typedef struct 46 | { 47 | int lexid; 48 | char *alias; 49 | char *descr; 50 | } LexDescr; 51 | 52 | static void init(); 53 | 54 | static void init_type(LexDescr descr[]); 55 | 56 | /* 57 | * prototypes 58 | */ 59 | PG_FUNCTION_INFO_V1(zhprs_start); 60 | Datum zhprs_start(PG_FUNCTION_ARGS); 61 | 62 | PG_FUNCTION_INFO_V1(zhprs_getlexeme); 63 | Datum zhprs_getlexeme(PG_FUNCTION_ARGS); 64 | 65 | PG_FUNCTION_INFO_V1(zhprs_end); 66 | Datum zhprs_end(PG_FUNCTION_ARGS); 67 | 68 | PG_FUNCTION_INFO_V1(zhprs_lextype); 69 | Datum zhprs_lextype(PG_FUNCTION_ARGS); 70 | 71 | static scws_t scws = NULL; 72 | static ParserState parser_state; 73 | 74 | /* config */ 75 | static bool dict_in_memory = false; 76 | static char * extra_dicts = NULL; 77 | 78 | static bool punctuation_ignore = false; 79 | static bool seg_with_duality = false; 80 | static bool multi_short = false; 81 | static bool multi_duality = false; 82 | static bool multi_zmain = false; 83 | static bool multi_zall = false; 84 | 85 | static void init(){ 86 | char sharepath[MAXPGPATH]; 87 | char dict_path[MAXPGPATH]; 88 | char rule_path[MAXPGPATH]; 89 | int load_dict_mem_mode = 0x0; 90 | 91 | List *elemlist; 92 | ListCell *l; 93 | 94 | if (!(scws = scws_new())) { 95 | ereport(ERROR, 96 | (errcode(ERRCODE_INTERNAL_ERROR), 97 | errmsg("Failed to init Chinese Parser Lib SCWS!\"%s\"","" 98 | ))); 99 | } 100 | 101 | DefineCustomBoolVariable( 102 | "zhparser.dict_in_memory", 103 | "load dicts into memory", 104 | "load dicts into memory", 105 | &dict_in_memory, 106 | false, 107 | PGC_BACKEND, 108 | 0, 109 | NULL, 110 | NULL, 111 | NULL 112 | ); 113 | DefineCustomStringVariable( 114 | "zhparser.extra_dicts", 115 | "extra dicts files to load", 116 | "extra dicts files to load", 117 | &extra_dicts, 118 | NULL, 119 | PGC_BACKEND, 120 | 0, 121 | NULL, 122 | NULL, 123 | NULL 124 | ); 125 | DefineCustomBoolVariable( 126 | "zhparser.punctuation_ignore", 127 | "set if zhparser ignores the puncuation", 128 | "set if zhparser ignores the puncuation,except \\r and \\n", 129 | &punctuation_ignore, 130 | false, 131 | PGC_USERSET, 132 | 0, 133 | NULL, 134 | NULL, 135 | NULL 136 | ); 137 | 138 | DefineCustomBoolVariable( 139 | "zhparser.seg_with_duality", 140 | "segment words with duality", 141 | "segment words with duality", 142 
| &seg_with_duality, 143 | false, 144 | PGC_USERSET, 145 | 0, 146 | NULL, 147 | NULL, 148 | NULL 149 | ); 150 | DefineCustomBoolVariable( 151 | "zhparser.multi_short", 152 | "prefer short words", 153 | "prefer short words", 154 | &multi_short, 155 | false, 156 | PGC_USERSET, 157 | 0, 158 | NULL, 159 | NULL, 160 | NULL 161 | ); 162 | DefineCustomBoolVariable( 163 | "zhparser.multi_duality", 164 | "prefer duality", 165 | "prefer duality", 166 | &multi_duality, 167 | false, 168 | PGC_USERSET, 169 | 0, 170 | NULL, 171 | NULL, 172 | NULL 173 | ); 174 | DefineCustomBoolVariable( 175 | "zhparser.multi_zmain", 176 | "prefer most important element", 177 | "prefer most important element", 178 | &multi_zmain, 179 | false, 180 | PGC_USERSET, 181 | 0, 182 | NULL, 183 | NULL, 184 | NULL 185 | ); 186 | DefineCustomBoolVariable( 187 | "zhparser.multi_zall", 188 | "prefer all element", 189 | "prefer all element", 190 | &multi_zall, 191 | false, 192 | PGC_USERSET, 193 | 0, 194 | NULL, 195 | NULL, 196 | NULL 197 | ); 198 | 199 | get_share_path(my_exec_path, sharepath); 200 | snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s.%s", 201 | sharepath, "dict.utf8", "xdb"); 202 | scws_set_charset(scws, "utf-8"); 203 | 204 | if(dict_in_memory) 205 | load_dict_mem_mode = SCWS_XDICT_MEM; 206 | 207 | /* ignore error,default dict is xdb */ 208 | if( scws_set_dict(scws,dict_path,load_dict_mem_mode | SCWS_XDICT_XDB ) != 0){ 209 | ereport(NOTICE, 210 | (errcode(ERRCODE_INTERNAL_ERROR), 211 | errmsg("zhparser set dict : \"%s\" failed!",dict_path 212 | ))); 213 | } 214 | 215 | snprintf(dict_path, MAXPGPATH, "%s/base/zhprs_dict_%s.txt", 216 | DataDir, get_database_name(MyDatabaseId)); 217 | if(scws_add_dict(scws, dict_path, load_dict_mem_mode | SCWS_XDICT_TXT) != 0 ){ 218 | ereport(LOG, 219 | (errcode(ERRCODE_INTERNAL_ERROR), 220 | errmsg("zhparser add dict : \"%s\" failed! May not config custom dict, omit this",dict_path 221 | ))); 222 | } 223 | 224 | if(extra_dicts != NULL){ 225 | if(!SplitIdentifierString(extra_dicts,',',&elemlist)){ 226 | scws_free(scws); 227 | list_free(elemlist); 228 | scws = NULL; 229 | ereport(ERROR, 230 | (errcode(ERRCODE_INTERNAL_ERROR), 231 | errmsg("zhparser.extra_dicts syntax error! extra_dicts is \"%s\"",extra_dicts 232 | ))); 233 | } 234 | 235 | foreach(l,elemlist){ 236 | int load_dict_mode = load_dict_mem_mode; 237 | char * ext = strrchr((char*)lfirst(l),'.'); 238 | if(ext != NULL && strlen(ext) == EXT_LEN){ 239 | if(strncmp(ext,TXT_EXT,EXT_LEN) == 0){ 240 | load_dict_mode |= SCWS_XDICT_TXT; 241 | } 242 | else if(strncmp(ext,XDB_EXT,EXT_LEN) == 0){ 243 | load_dict_mode |= SCWS_XDICT_XDB; 244 | } 245 | } 246 | 247 | if(((load_dict_mode & SCWS_XDICT_TXT) == 0) && 248 | ((load_dict_mode & SCWS_XDICT_XDB) == 0)){ 249 | scws_free(scws); 250 | list_free(elemlist); 251 | scws = NULL; 252 | ereport(ERROR, 253 | (errcode(ERRCODE_INTERNAL_ERROR), 254 | errmsg("zhparser.extra_dicts setting error,the file name must end with .txt or .xdb! error file name is \"%s\"",(char*)lfirst(l) 255 | ))); 256 | 257 | } 258 | 259 | snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s", 260 | sharepath, (char*)lfirst(l)); 261 | /* ignore error*/ 262 | if( scws_add_dict(scws,dict_path,load_dict_mode) != 0 ){ 263 | ereport(LOG, 264 | (errcode(ERRCODE_INTERNAL_ERROR), 265 | errmsg("zhparser add dict : \"%s\" failed for extra dict! 
omit",dict_path 266 | ))); 267 | } 268 | } 269 | list_free(elemlist); 270 | } 271 | 272 | snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/%s.%s", 273 | sharepath, "rules.utf8", "ini"); 274 | scws_set_rule(scws ,rule_path); 275 | } 276 | 277 | /* 278 | * functions 279 | */ 280 | 281 | Datum 282 | zhprs_start(PG_FUNCTION_ARGS) 283 | { 284 | ParserState *pst = &parser_state; 285 | int multi_mode = 0x0; 286 | 287 | if(scws == NULL) 288 | init(); 289 | pst -> scws = scws; 290 | pst -> buffer = (char *) PG_GETARG_POINTER(0); 291 | pst -> len = PG_GETARG_INT32(1); 292 | pst -> pos = 0; 293 | 294 | scws_set_ignore(scws, (int)punctuation_ignore); 295 | scws_set_duality(scws,(int)seg_with_duality); 296 | 297 | if(multi_short){ 298 | multi_mode |= SCWS_MULTI_SHORT; 299 | } 300 | 301 | if(multi_duality){ 302 | multi_mode |= SCWS_MULTI_DUALITY; 303 | } 304 | 305 | if(multi_zmain){ 306 | multi_mode |= SCWS_MULTI_ZMAIN; 307 | } 308 | 309 | if(multi_zall){ 310 | multi_mode |= SCWS_MULTI_ZALL; 311 | } 312 | 313 | scws_set_multi(scws,multi_mode); 314 | 315 | scws_send_text(pst -> scws, pst -> buffer, pst -> len); 316 | 317 | (pst -> head) = (pst -> curr) = scws_get_result(pst -> scws); 318 | 319 | PG_RETURN_POINTER(pst); 320 | } 321 | 322 | Datum 323 | zhprs_getlexeme(PG_FUNCTION_ARGS) 324 | { 325 | ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); 326 | char **t = (char **) PG_GETARG_POINTER(1); 327 | int *tlen = (int *) PG_GETARG_POINTER(2); 328 | int type = -1; 329 | 330 | if((pst -> head) == NULL ) /* already done the work,or no sentence */ 331 | { 332 | *tlen = 0; 333 | type = 0; 334 | } 335 | /* have results */ 336 | else if(pst -> curr != NULL) 337 | { 338 | scws_res_t curr = pst -> curr; 339 | 340 | /* 341 | * check the first char to determine the lextype 342 | * if out of [0,25],then set to 'x',mean unknown type 343 | * so for Ag,Dg,Ng,Tg,Vg,the type will be unknown 344 | * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr 345 | */ 346 | type = (int)(curr -> attr)[0]; 347 | if(type > (int)'x' || type < (int)'a') 348 | type = (int)'x'; 349 | *tlen = curr -> len; 350 | *t = pst -> buffer + curr -> off; 351 | 352 | pst -> curr = curr -> next; 353 | 354 | /* fetch the next sentence */ 355 | if(pst -> curr == NULL ){ 356 | scws_free_result(pst -> head); 357 | (pst -> head) = (pst -> curr) = scws_get_result(pst -> scws); 358 | } 359 | } 360 | 361 | PG_RETURN_INT32(type); 362 | } 363 | 364 | Datum 365 | zhprs_end(PG_FUNCTION_ARGS) 366 | { 367 | PG_RETURN_VOID(); 368 | } 369 | 370 | Datum 371 | zhprs_lextype(PG_FUNCTION_ARGS) 372 | { 373 | LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (26 + 1)); 374 | init_type(descr); 375 | 376 | PG_RETURN_POINTER(descr); 377 | } 378 | 379 | static void init_type(LexDescr descr[]){ 380 | /* 381 | * there are 26 types in this parser,alias from a to z 382 | * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr 383 | */ 384 | descr[0].lexid = 97; 385 | descr[0].alias = pstrdup("a"); 386 | descr[0].descr = pstrdup("adjective,形容词"); 387 | descr[1].lexid = 98; 388 | descr[1].alias = pstrdup("b"); 389 | descr[1].descr = pstrdup("differentiation,区别词"); 390 | descr[2].lexid = 99; 391 | descr[2].alias = pstrdup("c"); 392 | descr[2].descr = pstrdup("conjunction,连词"); 393 | descr[3].lexid = 100; 394 | descr[3].alias = pstrdup("d"); 395 | descr[3].descr = pstrdup("adverb,副词"); 396 | descr[4].lexid = 101; 397 | descr[4].alias = pstrdup("e"); 398 | descr[4].descr = pstrdup("exclamation,感叹词"); 399 | descr[5].lexid = 102; 
400 |     descr[5].alias = pstrdup("f");
401 |     descr[5].descr = pstrdup("position,方位词");
402 |     descr[6].lexid = 103;
403 |     descr[6].alias = pstrdup("g");
404 |     descr[6].descr = pstrdup("root,词根");
405 |     descr[7].lexid = 104;
406 |     descr[7].alias = pstrdup("h");
407 |     descr[7].descr = pstrdup("head,前连接成分");
408 |     descr[8].lexid = 105;
409 |     descr[8].alias = pstrdup("i");
410 |     descr[8].descr = pstrdup("idiom,成语");
411 |     descr[9].lexid = 106;
412 |     descr[9].alias = pstrdup("j");
413 |     descr[9].descr = pstrdup("abbreviation,简称");
414 |     descr[10].lexid = 107;
415 |     descr[10].alias = pstrdup("k");
416 |     descr[10].descr = pstrdup("tail,后连接成分");
417 |     descr[11].lexid = 108;
418 |     descr[11].alias = pstrdup("l");
419 |     descr[11].descr = pstrdup("tmp,习用语");
420 |     descr[12].lexid = 109;
421 |     descr[12].alias = pstrdup("m");
422 |     descr[12].descr = pstrdup("numeral,数词");
423 |     descr[13].lexid = 110;
424 |     descr[13].alias = pstrdup("n");
425 |     descr[13].descr = pstrdup("noun,名词");
426 |     descr[14].lexid = 111;
427 |     descr[14].alias = pstrdup("o");
428 |     descr[14].descr = pstrdup("onomatopoeia,拟声词");
429 |     descr[15].lexid = 112;
430 |     descr[15].alias = pstrdup("p");
431 |     descr[15].descr = pstrdup("prepositional,介词");
432 |     descr[16].lexid = 113;
433 |     descr[16].alias = pstrdup("q");
434 |     descr[16].descr = pstrdup("quantity,量词");
435 |     descr[17].lexid = 114;
436 |     descr[17].alias = pstrdup("r");
437 |     descr[17].descr = pstrdup("pronoun,代词");
438 |     descr[18].lexid = 115;
439 |     descr[18].alias = pstrdup("s");
440 |     descr[18].descr = pstrdup("space,处所词");
441 |     descr[19].lexid = 116;
442 |     descr[19].alias = pstrdup("t");
443 |     descr[19].descr = pstrdup("time,时语素");
444 |     descr[20].lexid = 117;
445 |     descr[20].alias = pstrdup("u");
446 |     descr[20].descr = pstrdup("auxiliary,助词");
447 |     descr[21].lexid = 118;
448 |     descr[21].alias = pstrdup("v");
449 |     descr[21].descr = pstrdup("verb,动词");
450 |     descr[22].lexid = 119;
451 |     descr[22].alias = pstrdup("w");
452 |     descr[22].descr = pstrdup("punctuation,标点符号");
453 |     descr[23].lexid = 120;
454 |     descr[23].alias = pstrdup("x");
455 |     descr[23].descr = pstrdup("unknown,未知词");
456 |     descr[24].lexid = 121;
457 |     descr[24].alias = pstrdup("y");
458 |     descr[24].descr = pstrdup("modal,语气词");
459 |     descr[25].lexid = 122;
460 |     descr[25].alias = pstrdup("z");
461 |     descr[25].descr = pstrdup("status,状态词");
462 |     descr[26].lexid = 0;
463 | }
464 | // TODO: headline function
465 | 
--------------------------------------------------------------------------------
/zhparser.control:
--------------------------------------------------------------------------------
1 | comment = 'a parser for full-text search of Chinese'
2 | default_version = '2.3'
3 | module_pathname = '$libdir/zhparser'
4 | relocatable = true
5 | 
--------------------------------------------------------------------------------
/zhparser.h:
--------------------------------------------------------------------------------
1 | #ifndef ZHPARSER_H
2 | #define ZHPARSER_H
3 | 
4 | #ifndef pstrdup
5 | #define pstrdup scws_pstrdup
6 | #endif
7 | 
8 | #include "scws.h"
9 | 
10 | #undef pstrdup
11 | 
12 | #endif
13 | 
--------------------------------------------------------------------------------
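Usage sketch: zhprs_start, zhprs_getlexeme, zhprs_end and zhprs_lextype above are the four callbacks PostgreSQL expects from a text search parser, and the extension's SQL install scripts register them as a parser named zhparser. The snippet below only illustrates how the parser is typically exercised; the configuration name testzhcfg, the chosen token mappings and the sample sentence are placeholders, not objects defined in this repository:

    -- create the extension, then build a configuration on top of the zhparser parser
    CREATE EXTENSION zhparser;
    CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser);
    -- map a subset of the token types defined in init_type() to the simple dictionary
    ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple;
    -- list the 26 token types (lexids 97..122, aliases a..z) and run a quick parse
    SELECT * FROM ts_token_type('zhparser');
    SELECT * FROM ts_parse('zhparser', '南京市长江大桥');       -- sample text, illustrative only
    SELECT to_tsvector('testzhcfg', '南京市长江大桥');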
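Configuration sketch: the GUCs registered in init() control segmentation at run time. The PGC_USERSET booleans can be flipped per session with SET, while zhparser.dict_in_memory and zhparser.extra_dicts are PGC_BACKEND and therefore have to be set in postgresql.conf before the parser is first used in a backend; extra dictionaries are looked up under the share directory's tsearch_data and must end in .txt or .xdb, as the code above enforces. The values below are examples only, assuming the extension is installed:

    -- per-session switches (PGC_USERSET)
    SET zhparser.punctuation_ignore = on;   -- ignore punctuation, except \r and \n
    SET zhparser.multi_short = on;          -- prefer short words (SCWS_MULTI_SHORT)
    SET zhparser.seg_with_duality = off;
    -- backend-level settings (PGC_BACKEND); put these in postgresql.conf, not in SET:
    --   zhparser.dict_in_memory = on
    --   zhparser.extra_dicts = 'dict_extra.txt'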