├── .github └── workflows │ └── main.yml ├── CHANGELOG ├── COPYRIGHT ├── META.json ├── Makefile ├── README.md ├── check-alpine.sh ├── check-debian.sh ├── dict.utf8.xdb ├── dict_extra.txt ├── docker ├── alpine │ └── 16 │ │ └── Dockerfile ├── bookworm │ ├── 15 │ │ └── Dockerfile │ └── 16 │ │ └── Dockerfile └── bullseye │ ├── 15 │ └── Dockerfile │ └── 16 │ └── Dockerfile ├── expected ├── zhparser-alpine.out └── zhparser-debian.out ├── rules.utf8.ini ├── sql └── zhparser.sql ├── zhparser--1.0--2.0.sql ├── zhparser--1.0.sql ├── zhparser--2.0--2.1.sql ├── zhparser--2.0.sql ├── zhparser--2.1--2.2.sql ├── zhparser--2.1.sql ├── zhparser--2.2.sql ├── zhparser--2.3.sql ├── zhparser--unpackaged--1.0.sql ├── zhparser-backup-custom-dict.sh ├── zhparser.c ├── zhparser.control └── zhparser.h /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | branches: [ master ] 10 | pull_request: 11 | branches: [ master ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 17 | jobs: 18 | # This workflow contains a single job called "build" 19 | build: 20 | # The type of runner that the job will run on 21 | runs-on: ubuntu-latest 22 | 23 | # Steps represent a sequence of tasks that will be executed as part of the job 24 | steps: 25 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 26 | - uses: actions/checkout@v2 27 | 28 | # Runs a single command using the runners shell 29 | - name: Run a one-line script 30 | run: echo Hello, world! 31 | 32 | # Runs a set of commands using the runners shell 33 | - name: Run a multi-line script 34 | run: | 35 | sudo apt-get -y install postgresql-server-dev-16 36 | wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.3.tar.bz2 | tar jxf - 37 | cd scws-1.2.3 ; ./configure ; sudo make install 38 | cd $GITHUB_WORKSPACE 39 | ls 40 | env 41 | type pg_config 42 | cd $GITHUB_WORKSPACE 43 | pg_config 44 | export PG_CONFIG=/usr/bin/pg_config ; make && sudo make install 45 | echo test, and deploy your project. 46 | 47 | freebsd-test: 48 | runs-on: ubuntu-latest 49 | name: A job to run test in FreeBSD 50 | env: 51 | MYTOKEN : ${{ secrets.MYTOKEN }} 52 | MYTOKEN2: "value2" 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: Test in FreeBSD 56 | id: test 57 | uses: vmactions/freebsd-vm@v1 58 | with: 59 | envs: 'MYTOKEN MYTOKEN2' 60 | usesh: true 61 | prepare: | 62 | pkg install -y scws 63 | pkg install -y postgresql16-server 64 | pkg install -y gmake 65 | pkg install -y git 66 | 67 | run: | 68 | env 69 | freebsd-version 70 | git clone https://github.com/amutu/zhparser.git 71 | cd zhparser ; gmake 72 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2.2 (2021-11-08) 2 | -- move custom word from /base/${DATABASE_ID}/zhprs_dict_${DATABASE_NAME}.txt to /base/zhprs_dict_${DATABASE_NAME}.txt(data don't have /base/${DATABASE_ID} when tablespace is setted) 3 | 4 | 2.1 (2019-04-23) 5 | -- custom word store in DataDir. 
6 | 7 | 2.0 (2019-03-05) 8 | -- support custom words in the table zhparser.zhprs_custom_word 9 | 10 | 0.2.0 (2017-05-28) 11 | -- fix regression test 12 | -- make pg_config configurable 13 | -- optimize doc 14 | 15 | 0.1.5 (2017-05-24) 16 | -- optimize memory usage for zhparser 17 | -- support loading multiple dicts 18 | -- export SCWS settings as GUCs 19 | -- make dict_in_memory and extra_dicts backend-level settings, the others userset 20 | -- fix a bug where an unknown type index went out of range 21 | -- detect dict file type from the file extension 22 | -- add a check script to run the regression tests 23 | -- update scws to version 1.2.3 in the install doc 24 | 25 | 0.1.4 (2013-11-03) 26 | -- fix pgxn meta 27 | 28 | 0.1.3 (2013-11-03) 29 | -- resolve the compile warning for pstrdup 30 | -- add the Chinese dict for SCWS 31 | -- fix the rpath flag which emitted an error on Mac OS 32 | -- update install doc to note the use of gmake on *BSD 33 | -- update install doc to download SCWS from the git url 34 | 35 | 0.1.2 (2013-10-23) 36 | -- resolve the pstrdup name conflict for pg 9.3 37 | 38 | 0.1.1 (2013-02-05) 39 | -- add CHANGELOG 40 | -- format the doc 41 | 42 | 0.1.0 (2013-02-04) 43 | -- initial release on PGXN 44 | -- parse Chinese text encoded in UTF-8 45 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | zhparser 2 | 3 | Portions Copyright (c) 2012-2013, Jov(amutu@amutu.com) 4 | 5 | Permission to use, copy, modify, and distribute this software and its 6 | documentation for any purpose, without fee, and without a written agreement 7 | is hereby granted, provided that the above copyright notice and this 8 | paragraph and the following two paragraphs appear in all copies. 9 | 10 | IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR 11 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 12 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 13 | DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE 14 | POSSIBILITY OF SUCH DAMAGE. 15 | 16 | THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 17 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 18 | AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 19 | ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO 20 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
21 | -------------------------------------------------------------------------------- /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "zhparser", 3 | "abstract": "a parser for full-text search of Chinese", 4 | "description": "Zhparser is a PostgreSQL extension for full-text search of Chinese. It implements a Chinese parser based on the Simple Chinese Word Segmentation (SCWS) library", 5 | "version": "0.2.0", 6 | "maintainer": [ 7 | "Jov <amutu@amutu.com>" 8 | ], 9 | "license": "postgresql", 10 | "prereqs": { 11 | "runtime": { 12 | "requires": { 13 | "PostgreSQL": "9.2.0" 14 | }, 15 | "recommends": { 16 | "PostgreSQL": "9.6.0" 17 | } 18 | } 19 | }, 20 | "provides": { 21 | "zhparser": { 22 | "abstract": "a parser for full-text search of Chinese", 23 | "file": "zhparser--1.0.sql", 24 | "docfile": "README.md", 25 | "version": "0.2.0" 26 | } 27 | }, 28 | "resources": { 29 | "homepage": "http://amutu.com/blog/zhparser/", 30 | "bugtracker": { 31 | "web": "http://github.com/amutu/zhparser/issues/" 32 | }, 33 | "repository": { 34 | "url": "git://github.com/amutu/zhparser.git", 35 | "web": "http://github.com/amutu/zhparser/", 36 | "type": "git" 37 | } 38 | }, 39 | "generated_by": "Jov", 40 | "meta-spec": { 41 | "version": "1.0.0", 42 | "url": "http://pgxn.org/meta/spec.txt" 43 | }, 44 | "release_status": "stable", 45 | "tags": [ 46 | "parser", 47 | "full text search", 48 | "Chinese", 49 | "dictionary" 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # contrib/zhparser/Makefile 2 | 3 | MODULE_big = zhparser 4 | OBJS = zhparser.o 5 | 6 | EXTENSION = zhparser 7 | DATA = zhparser--1.0.sql zhparser--unpackaged--1.0.sql \ 8 | zhparser--1.0--2.0.sql zhparser--2.0.sql \ 9 | zhparser--2.0--2.1.sql zhparser--2.1.sql zhparser--2.1--2.2.sql \ 10 | zhparser--2.2.sql zhparser--2.3.sql 11 | DATA_TSEARCH = dict.utf8.xdb rules.utf8.ini 12 | 13 | REGRESS = zhparser 14 | 15 | SCWS_HOME ?= /usr/local 16 | PG_CPPFLAGS = -I$(SCWS_HOME)/include/scws 17 | SHLIB_LINK = -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib 18 | 19 | PG_CONFIG ?= pg_config 20 | PGXS := $(shell $(PG_CONFIG) --pgxs) 21 | include $(PGXS) 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Zhparser 2 | ======== 3 | 4 | Zhparser is a PostgreSQL extension for full-text search of the Chinese language (Mandarin Chinese). It implements a Chinese language parser based on 5 | the [Simple Chinese Word Segmentation (SCWS)](https://github.com/hightman/scws). 6 | 7 | Project home page: http://blog.amutu.com/zhparser/ 8 | 9 | **注意**：对于分词结果不满意的，或者需要调试分词结果的，可以在这个页面调试：http://www.xunsearch.com/scws/demo/v48.php 10 | 11 | Docker快速体验 12 | ------- 13 | run the container: 14 | > docker run --name pgzhparser -d -e POSTGRES_PASSWORD=somepassword zhparser/zhparser:bookworm-16 15 | 16 | log in to the postgres database as user postgres: 17 | > docker exec -it pgzhparser psql postgres postgres 18 | 19 | create the extension and use it: 20 | > CREATE EXTENSION zhparser; 21 | > CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 22 | > ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 23 | > SELECT * FROM ts_parse('zhparser', 'hello world!
2010年保障房建设在全国范围内获全面启动'); 24 | 25 | you will get: 26 | tokid | token 27 | -------+------- 28 | 101 | hello 29 | 101 | world 30 | 117 | ! 31 | 101 | 2010 32 | 113 | 年 33 | 118 | 保障 34 | 110 | 房建 35 | 118 | 设在 36 | 110 | 全国 37 | 110 | 范围 38 | 102 | 内 39 | 118 | 获 40 | 97 | 全面 41 | 118 | 启动 42 | (14 行记录) 43 | 44 | 更多docker镜像信息,访问这里:[zhparser的dockerub](https://hub.docker.com/r/zhparser/zhparser) 45 | zhparser的docker镜像基于PostgreSQL的docker官方镜像构建,更多的用法参见:https://hub.docker.com/_/postgres 46 | 47 | INSTALL 48 | ------- 49 | 0.前置条件 50 | 51 | zhparser支持PostgreSQL 9.2及以上版本,请确保你的PG版本符合要求。 52 | 对于REDHAT/CentOS Linux系统,请确保安装了相关的库和头文件,一般它们在postgresql-devel软件包中。 53 | 54 | 1.安装SCWS 55 | 56 | ``` 57 | wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.3.tar.bz2 | tar xf - 58 | 59 | cd scws-1.2.3 ; ./configure ; make install 60 | 61 | 注意:在FreeBSD release 10及以上版本上运行configure时,需要增加--with-pic选项。 62 | 63 | 如果是从github上下载的scws源码需要先运行以下命令生成configure文件: 64 | 65 | touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing 66 | 67 | ``` 68 | 2.下载zhparser源码 69 | 70 | ``` 71 | git clone https://github.com/amutu/zhparser.git 72 | 73 | ``` 74 | 3.编译和安装zhparser 75 | 76 | ``` 77 | make && make install 78 | 79 | ``` 80 | 81 | 如果scws的路径不在默认的 /usr/local 下,可以设置SCWS_HOME 例如: ` SCWS_HOME=/usr make && make install ` 82 | 83 | 如果你同时安装了多个版本的PostgreSQL, 可以通过指定 PG\_CONFIG 来为指定的版本编译扩展: 84 | 85 | ``` 86 | PG_CONFIG=/usr/lib/postgresql/9.5/bin/pg_config make && make install 87 | 88 | ``` 89 | 90 | 注意:在*BSD上编译安装时,使用gmake代替make 91 | 92 | 4.创建extension 93 | 94 | ``` 95 | psql dbname superuser -c 'CREATE EXTENSION zhparser' 96 | 97 | ``` 98 | 99 | CONFIGURATION 100 | ------- 101 | 以下配置在PG9.2及以上版本使用,这些选项是用来控制字典加载行为和分词行为的,这些选项都不是必须的,默认都为false(即如果没有在配置文件中设置这些选项,则zhparser的行为与将下面的选项设置为false一致)。 102 | 103 | 忽略所有的标点等特殊符号: 104 | zhparser.punctuation_ignore = f 105 | 106 | 闲散文字自动以二字分词法聚合: 107 | zhparser.seg_with_duality = f 108 | 109 | 将词典全部加载到内存里: 110 | zhparser.dict_in_memory = f 111 | 112 | 短词复合: 113 | zhparser.multi_short = f 114 | 115 | 散字二元复合: 116 | zhparser.multi_duality = f 117 | 118 | 重要单字复合: 119 | zhparser.multi_zmain = f 120 | 121 | 全部单字复合: 122 | zhparser.multi_zall = f 123 | 124 | 除了zhparser自带的词典,用户可以增加自定义词典,自定义词典的优先级高于自带的词典。自定义词典的文件必须放在share/tsearch_data目录中,zhparser根据文件扩展名确定词典的格式类型,.txt扩展名表示词典是文本格式,.xdb扩展名表示这个词典是xdb格式,多个文件使用逗号分隔,词典的分词优先级由低到高,如: 125 | 126 | zhparser.extra_dicts = 'dict_extra.txt,mydict.xdb' 127 | 128 | 注意:zhparser.extra_dicts和zhparser.dict_in_memory两个选项需要在backend启动前设置(可以在配置文件中修改然后reload,之后新建连接会生效),其他选项可以随时在session中设置生效。zhparser的选项与scws相关的选项对应,关于这些选项的含义,可以参考scws的文档:http://www.xunsearch.com/scws/docs.php#libscws 129 | 130 | EXAMPLE 131 | ------- 132 | ``` 133 | -- create the extension 134 | 135 | CREATE EXTENSION zhparser; 136 | 137 | -- make test configuration using parser 138 | 139 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 140 | 141 | -- add token mapping 142 | 143 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 144 | 145 | -- ts_parse 146 | 147 | SELECT * FROM ts_parse('zhparser', 'hello world! 
2010年保障房建设在全国范围内获全面启动，从中央到地方纷纷加大 了保障房的建设和投入力度 。2011年，保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示，要继续推进保障性安居工程建设。'); 148 | 149 | -- test to_tsvector 150 | 151 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调，但实际的年度在建规模以及竣工规模会超以往年份，相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来，与2011年相比，2012年的保障房建设在资金配套上的压力将更为严峻。'); 152 | 153 | -- test to_tsquery 154 | 155 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 156 | ``` 157 | 158 | 自定义词库 159 | ------- 160 | **详解 TXT 词库的写法 (TXT词库目前已兼容 cli/scws_gen_dict 所用的文本词库)** 161 | 162 | 1) 每行一条记录，以 # 或分号开头的相当于注释，忽略跳过 163 | 164 | 2) 每行由4个字段组成，依次为“词语”(由中文字或3个以下的字母合成)、“TF”、“IDF”、“词性”，字段使用空格或制表符分开，数量不限，可自行对齐以美化 165 | 166 | 3) 除“词语”外，其它字段可忽略不写。若忽略，TF和IDF默认值为 1.0 而词性为 “@” 167 | 168 | 4) 由于 TXT 库动态加载(内部监测文件修改时间自动转换成 xdb 存于系统临时目录)，故建议TXT词库不要过大 169 | 170 | 5) 删除词做法，请将词性设为“!”，则表示该词设为无效，即使在其它核心库中存在该词也视为无效 171 | 172 | 注意：1.自定义词典的格式可以是文本TXT，也可以是二进制的XDB格式。XDB格式效率更高，适合大辞典使用。可以使用scws自带的工具scws-gen-dict将文本词典转换为XDB格式；2.zhparser默认的词典是简体中文，如果需要繁体中文，可以在[这里](http://www.xunsearch.com/scws/download.php)下载已经生成好的XDB格式词典。3.自定义词典的例子可以参考[dict_extra.txt](https://github.com/amutu/zhparser/blob/master/dict_extra.txt)。更多信息参见[SCWS官方文档](http://www.xunsearch.com/scws/docs.php#utilscws)。 173 | 174 | 自定义词库 2.1 175 | ------- 176 | **自定义词库 2.1 增加自定义词库的易用性，并兼容 1.0 提供的功能** 177 | 178 | 179 | 自定义词库需要superuser权限，自定义词库是数据库级别的(不是实例级)，每个数据库拥有自己的自定义分词，并存储在data目录下base/数据库ID下(2.0 版本存储在share/tsearch_data下) 180 | 181 | 生产环境版本升级(新环境直接安装就可以): 182 | alter extension zhparser update; 183 | ``` 184 | test=# SELECT * FROM ts_parse('zhparser', '保障房资金压力'); 185 | tokid | token 186 | -------+------- 187 | 118 | 保障 188 | 110 | 房 189 | 110 | 资金 190 | 110 | 压力 191 | 192 | test=# insert into zhparser.zhprs_custom_word values('资金压力'); 193 | --删除词: insert into zhparser.zhprs_custom_word(word, attr) values('word', '!'); 194 | --\d zhparser.zhprs_custom_word 查看其表结构，支持 TF、IDF 195 | test=# select sync_zhprs_custom_word(); 196 | sync_zhprs_custom_word 197 | ------------------------ 198 | 199 | (1 row) 200 | 201 | test=# \q --sync 后重新建立连接 202 | [lzzhang@lzzhang-pc bin]$ ./psql -U lzzhang -d test -p 1600 203 | test=# SELECT * FROM ts_parse('zhparser', '保障房资金压力'); 204 | tokid | token 205 | -------+---------- 206 | 118 | 保障 207 | 110 | 房 208 | 120 | 资金压力 209 | ``` 210 | 211 | 212 | COPYRIGHT 213 | -------- 214 | 215 | zhparser 216 | 217 | Portions Copyright (c) 2012-2017, Jov(amutu@amutu.com) 218 | 219 | Permission to use, copy, modify, and distribute this software and its documentation 220 | for any purpose, without fee, and without a written agreement is hereby granted, 221 | provided that the above copyright notice and this paragraph and the following 222 | two paragraphs appear in all copies. 223 | 224 | IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR 225 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 226 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 227 | DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE 228 | POSSIBILITY OF SUCH DAMAGE. 229 | 230 | THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 231 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 232 | AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 233 | ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO 234 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
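Supplementary usage sketch: the snippet below assumes the `testzhcfg` configuration created in the EXAMPLE section above; the table `docs`, its sample row, the index name and the session-level GUC values are illustrative only and are not shipped with zhparser.

```sql
-- session-level tuning: userset options described in CONFIGURATION
-- (e.g. punctuation_ignore, multi_short) can be changed per session
SET zhparser.punctuation_ignore = on;
SET zhparser.multi_short = on;

-- a typical full-text search setup on top of the testzhcfg configuration
CREATE TABLE docs (id serial PRIMARY KEY, body text);
CREATE INDEX docs_body_fts_idx ON docs USING gin (to_tsvector('testzhcfg', body));

INSERT INTO docs (body) VALUES ('2011年，保障房进入了更大规模的建设阶段。');

-- the query side uses the same configuration, so segmentation matches the index
SELECT id, body
  FROM docs
 WHERE to_tsvector('testzhcfg', body) @@ to_tsquery('testzhcfg', '保障房');
```

Because `to_tsvector('testzhcfg', body)` names the configuration explicitly, the expression is immutable and the GIN expression index above can serve this query.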
235 | 236 | [![Powered by DartNode](https://dartnode.com/branding/DN-Open-Source-sm.png)](https://dartnode.com "Powered by DartNode - Free VPS for Open Source") 237 | -------------------------------------------------------------------------------- /check-alpine.sh: -------------------------------------------------------------------------------- 1 | pid=$$ 2 | docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@alpine zhparser/zhparser:alpine-16 3 | sleep 5 4 | export PGPASSWORD=somepassword@alpine 5 | psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-alpine.out - 6 | 7 | if [ $? -eq 0 ] 8 | then 9 | echo "pass!" 10 | else 11 | echo "do not pass!" 12 | fi 13 | docker stop testpgzhparser-$pid 14 | -------------------------------------------------------------------------------- /check-debian.sh: -------------------------------------------------------------------------------- 1 | pid=$$ 2 | docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@debian-16 zhparser/zhparser:bookworm-16 3 | sleep 5 4 | export PGPASSWORD=somepassword@debian-16 5 | psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-debian.out - 6 | 7 | if [ $? -eq 0 ] 8 | then 9 | echo "pass!" 10 | else 11 | echo "do not pass!" 12 | fi 13 | docker stop testpgzhparser-$pid 14 | -------------------------------------------------------------------------------- /dict.utf8.xdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amutu/zhparser/8b04302dc2011c12ef87211a0527bbf41830e97e/dict.utf8.xdb -------------------------------------------------------------------------------- /dict_extra.txt: -------------------------------------------------------------------------------- 1 | ; dict_extra.txt 2 | 我是新增词 2.0 3 | 再试一个 1.0 1.0 @ 4 | ; 以下词为删除项 5 | 删除 1.0 1.0 ! 
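; format note (comment only, not loaded as an entry): each line is 词语 TF IDF 词性; a hypothetical noun entry would read: 云计算 5.0 4.0 n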
6 | -------------------------------------------------------------------------------- /docker/alpine/16/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=16 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-alpine as builder 3 | 4 | RUN set -ex \ 5 | && apk --no-cache add git build-base linux-headers make postgresql-dev automake libtool autoconf m4 6 | 7 | RUN set -ex \ 8 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 9 | && cd scws \ 10 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 11 | && ./configure \ 12 | && make install 13 | 14 | RUN set -ex \ 15 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 16 | && cd zhparser \ 17 | && make install 18 | 19 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-alpine 20 | ENV LANG zh_CN.UTF-8 21 | 22 | COPY --from=builder /usr/local/lib/postgresql/zhparser.so /usr/local/lib/postgresql/ 23 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 24 | COPY --from=builder /usr/local/share/postgresql/extension/zhparser* /usr/local/share/postgresql/extension/ 25 | COPY --from=builder /usr/local/lib/postgresql/bitcode/zhparser* /usr/local/lib/postgresql/bitcode/ 26 | COPY --from=builder /usr/local/share/postgresql/tsearch_data/*.utf8.* /usr/local/share/postgresql/tsearch_data/ 27 | -------------------------------------------------------------------------------- /docker/bookworm/15/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=15 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/dict.utf8.xdb /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /docker/bookworm/16/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=16 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && 
apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bookworm 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/*.utf8.* /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /docker/bullseye/15/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=15 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/*.utf8.* /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /docker/bullseye/16/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_CONTAINER_VERSION=16 2 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye as builder 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get install -y build-essential git postgresql-server-dev-${PG_MAJOR} pkg-config binutils 
automake libtool \ 8 | && apt-get clean 9 | 10 | RUN set -ex \ 11 | && git clone --branch 1.2.3 --single-branch --depth 1 https://github.com/hightman/scws.git \ 12 | && cd scws \ 13 | && touch README;aclocal;autoconf;autoheader;libtoolize;automake --add-missing \ 14 | && ./configure \ 15 | && make install 16 | 17 | RUN set -ex \ 18 | && git clone --branch master --single-branch --depth 1 https://github.com/amutu/zhparser.git \ 19 | && cd zhparser \ 20 | && make install 21 | 22 | FROM docker.io/library/postgres:${PG_CONTAINER_VERSION}-bullseye 23 | RUN localedef -i zh_CN -c -f UTF-8 -A /usr/share/locale/locale.alias zh_CN.UTF-8 24 | ENV LANG zh_CN.UTF-8 25 | 26 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/zhparser.so /usr/lib/postgresql/${PG_MAJOR}/lib/ 27 | COPY --from=builder /usr/local/lib/libscws.* /usr/local/lib/ 28 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/extension/zhparser* /usr/share/postgresql/${PG_MAJOR}/extension/ 29 | COPY --from=builder /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/zhparser* /usr/lib/postgresql/${PG_MAJOR}/lib/bitcode/ 30 | COPY --from=builder /usr/share/postgresql/${PG_MAJOR}/tsearch_data/*.utf8.* /usr/share/postgresql/${PG_MAJOR}/tsearch_data/ 31 | -------------------------------------------------------------------------------- /expected/zhparser-alpine.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION zhparser; 2 | -- make test configuration using parser 3 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 4 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 5 | -- ts_parse 6 | SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); 7 | tokid | token 8 | -------+---------- 9 | 101 | hello 10 | 101 | world 11 | 117 | ! 
12 | 101 | 2010 13 | 113 | 年 14 | 118 | 保障 15 | 110 | 房建 16 | 118 | 设在 17 | 110 | 全国 18 | 110 | 范围 19 | 102 | 内 20 | 118 | 获 21 | 97 | 全面 22 | 118 | 启动 23 | 117 | , 24 | 110 | 从中 25 | 118 | 央 26 | 118 | 到 27 | 110 | 地方 28 | 100 | 纷纷 29 | 118 | 加大 30 | 118 | 了 31 | 118 | 保 32 | 110 | 障 33 | 110 | 房 34 | 117 | 的 35 | 118 | 建 36 | 118 | 设 37 | 99 | 和 38 | 118 | 投 39 | 118 | 入 40 | 110 | 力 41 | 107 | 度 42 | 117 | 。 43 | 101 | 2011 44 | 113 | 年 45 | 117 | , 46 | 118 | 保障 47 | 110 | 房 48 | 118 | 进入 49 | 118 | 了 50 | 100 | 更 51 | 110 | 大规模 52 | 117 | 的 53 | 118 | 建设 54 | 110 | 阶段 55 | 117 | 。 56 | 110 | 住房 57 | 110 | 城乡建设 58 | 110 | 部党组 59 | 110 | 书记 60 | 117 | 、 61 | 110 | 部长 62 | 110 | 姜 63 | 110 | 伟 64 | 97 | 新 65 | 116 | 去年底 66 | 112 | 在 67 | 110 | 全国 68 | 110 | 住房 69 | 110 | 城乡建设 70 | 118 | 工作 71 | 110 | 会议 72 | 110 | 上表 73 | 118 | 示 74 | 117 | , 75 | 118 | 要 76 | 118 | 继续 77 | 118 | 推进 78 | 110 | 保障性 79 | 118 | 安居 80 | 110 | 工程建设 81 | 117 | 。 82 | (73 rows) 83 | 84 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); 85 | to_tsvector 86 | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 87 | '2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '对应':17 '年份':16 '年度':9 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19 88 | (1 row) 89 | 90 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 91 | to_tsquery 92 | --------------------------------------- 93 | '保障' <-> '房' <-> '资金' <-> '压力' 94 | (1 row) 95 | 96 | -- clean extension 97 | DROP EXTENSION zhparser CASCADE; 98 | -------------------------------------------------------------------------------- /expected/zhparser-debian.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION zhparser; 2 | -- make test configuration using parser 3 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 4 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 5 | -- ts_parse 6 | SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); 7 | tokid | token 8 | -------+---------- 9 | 101 | hello 10 | 101 | world 11 | 117 | ! 
12 | 101 | 2010 13 | 113 | 年 14 | 118 | 保障 15 | 110 | 房建 16 | 118 | 设在 17 | 110 | 全国 18 | 110 | 范围 19 | 102 | 内 20 | 118 | 获 21 | 97 | 全面 22 | 118 | 启动 23 | 117 | , 24 | 112 | 从 25 | 110 | 中央 26 | 118 | 到 27 | 110 | 地方 28 | 100 | 纷纷 29 | 118 | 加大 30 | 118 | 了 31 | 118 | 保 32 | 110 | 障 33 | 110 | 房 34 | 117 | 的 35 | 118 | 建 36 | 118 | 设 37 | 99 | 和 38 | 118 | 投 39 | 118 | 入 40 | 110 | 力 41 | 107 | 度 42 | 117 | 。 43 | 101 | 2011 44 | 113 | 年 45 | 117 | , 46 | 118 | 保障 47 | 110 | 房 48 | 118 | 进入 49 | 118 | 了 50 | 100 | 更 51 | 110 | 大规模 52 | 117 | 的 53 | 118 | 建设 54 | 110 | 阶段 55 | 117 | 。 56 | 110 | 住房 57 | 110 | 城乡建设 58 | 110 | 部党组 59 | 110 | 书记 60 | 117 | 、 61 | 110 | 部长 62 | 110 | 姜 63 | 110 | 伟 64 | 97 | 新 65 | 116 | 去年底 66 | 112 | 在 67 | 110 | 全国 68 | 110 | 住房 69 | 110 | 城乡建设 70 | 118 | 工作 71 | 110 | 会议 72 | 118 | 上 73 | 118 | 表示 74 | 117 | , 75 | 118 | 要 76 | 118 | 继续 77 | 118 | 推进 78 | 110 | 保障性 79 | 118 | 安居 80 | 110 | 工程建设 81 | 117 | 。 82 | (73 rows) 83 | 84 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); 85 | to_tsvector 86 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 87 | '2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '年份':16 '年度':9 '应':17 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19 88 | (1 row) 89 | 90 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 91 | to_tsquery 92 | --------------------------------------- 93 | '保障' <-> '房' <-> '资金' <-> '压力' 94 | (1 row) 95 | 96 | -- clean extension 97 | DROP EXTENSION zhparser CASCADE; 98 | -------------------------------------------------------------------------------- /rules.utf8.ini: -------------------------------------------------------------------------------- 1 | ; 2 | ; auto regular(utf-8) 3 | ; $Id$ 4 | ; 5 | ; special word, 特殊词汇 6 | ; 7 | 8 | [special] 9 | C++ 10 | C# 11 | R&B 12 | P&G 13 | J++ 14 | J# 15 | UTF-8 16 | PS/2 17 | 18 | ; 19 | ; nostats 20 | ; 21 | [nostats] 22 | about 23 | all 24 | also 25 | an 26 | and 27 | any 28 | are 29 | as 30 | at 31 | be 32 | but 33 | by 34 | both 35 | can 36 | for 37 | from 38 | have 39 | here 40 | if 41 | in 42 | is 43 | it 44 | no 45 | not 46 | of 47 | on 48 | or 49 | our 50 | out 51 | that 52 | the 53 | this 54 | to 55 | up 56 | us 57 | 58 | ; 59 | ; 词性语法规则表 60 | ; 61 | [attrs] 62 | ; c 是连词 63 | n + f(1) = 300 64 | n + m(1) = 500 65 | n(1) + v = 100 66 | n + v(1) = 10 67 | r + n(1) = 1000 68 | r(1) + n = 100 69 | d(1) + r = 100 70 | d(1) + v = 100 71 | v(1) + r = 100 72 | n + m(1) = 500 73 | v + f(1) = 30 74 | v(1) + m = 100 75 | v(1) + n = 3 76 | a + u(1) = 5 77 | v + n(1) = 5 78 | u(1) + a = 2 79 | c(1) + * = 50 80 | * + c(1) = 50 81 | 82 | ; 83 | ; 名字停用词表 84 | ; 85 | [noname] 86 | :line = no 87 | 给的说对在和是被最所那这有将 88 | 你会与他为不没很了啊哦呵把去 89 | 90 | ; 91 | ; 双字节符号 92 | ; 93 | [symbol] 94 | :type = none 95 | :line = no 96 | `-=[]、‘;/。,|?》《:“{}+—)(*…%¥#·!~ 97 | ’”〕〈〉「」『』〖〗【】<> 98 | 99 | ; 100 | ; 姓和外文名共同部分 101 | ; 102 | [pubname] 103 | :type = prefix 104 | :line = no 105 | :exclude = noname,symbol,alpha,chnum2 106 | :znum = 1,2 107 | :tf = 5.0 108 | :idf = 3.5 109 | :attr = nr 110 
| 艾安贝卜戴费福盖戈古赫华霍吉贾金柯赖劳雷黎利林卢 111 | 鲁伦罗洛马麦米莫穆齐乔冉萨沙史斯温谢尤詹诸 112 | 113 | 114 | [pubname2] 115 | :type = prefix 116 | :line = no 117 | :exclude = noname,symbol,alpha,chnum2 118 | :tf = 5.0 119 | :idf = 3.5 120 | :attr = nr 121 | 伍陆 122 | 123 | [pubname3] 124 | :type = prefix 125 | :line = no 126 | :exclude = noname,symbol,alpha,chnum2 127 | :tf = 5.0 128 | :idf = 3.5 129 | :attr = nr 130 | 万章 131 | 132 | ; 133 | ; 单姓 134 | ; 135 | [surname] 136 | :type = prefix 137 | :line = no 138 | :exclude = noname,symbol,alpha,chnum2 139 | :tf = 5.0 140 | :idf = 3.5 141 | :attr = nr 142 | :znum = 1,2 143 | 144 | 敖白班包宝保鲍毕边卞柏蔡曹岑柴昌常车陈成程迟池褚 145 | 楚储淳崔刀邓狄刁丁董窦杜端段樊范方房斐丰封冯凤伏 146 | 傅甘高耿龚宫勾苟辜谷顾官关管桂郭韩杭郝禾何贺衡洪 147 | 侯胡花黄稽姬纪季简翦姜江蒋焦晋靳荆居康空孔匡邝况 148 | 蓝郎朗乐冷李理厉励连廉练良梁廖凌刘柳隆龙楼娄吕路 149 | 骆麻满茅毛梅孟苗缪闵明牟倪聂牛钮农潘庞裴彭皮朴平 150 | 蒲溥浦戚祁钱强秦丘邱仇裘屈瞿权饶任荣容阮瑞芮赛单 151 | 商邵佘申沈盛石寿舒宋苏孙邰谭谈汤唐陶滕田佟仝屠涂 152 | 汪王危韦魏卫蔚闻翁巫邬武吴奚习夏鲜席冼项萧解辛邢 153 | 幸熊徐许宣薛荀颜阎言严彦晏燕杨阳姚叶蚁易殷银尹应 154 | 英游于於鱼虞俞余禹喻郁尉袁岳云臧曾查翟湛张赵甄郑 155 | 钟周朱竺祝庄卓宗邹祖左肖 156 | 157 | ; 158 | ; 复姓 159 | ; 160 | [surname2] 161 | :type = prefix 162 | :line = yes 163 | :exclude = noname,symbol,alpha,chnum2 164 | :tf = 5.0 165 | :idf = 3.5 166 | :attr = nr 167 | :znum = 1, 2 168 | 东郭 169 | 公孙 170 | 皇甫 171 | 慕容 172 | 欧阳 173 | 单于 174 | 司空 175 | 司马 176 | 司徒 177 | 澹台 178 | 诸葛 179 | 180 | ; 181 | ; 地点名称 182 | ; 183 | [areaname] 184 | :type = suffix 185 | :znum = 2 186 | :exclude = noname,symbol,alpha,chnum2 187 | :tf = 4.5 188 | :idf = 3.0 189 | :attr = ns 190 | :line = no 191 | 192 | 县市镇村乡区 193 | 194 | ; 195 | ; 双字地点名称 196 | ; 197 | [areaname2] 198 | :type = suffix 199 | :znum = 2 200 | :exclude = noname,symbol,alpha,chnum2 201 | :tf = 4.5 202 | :idf = 3.0 203 | :attr = ns 204 | :line = yes 205 | 东路 206 | 西路 207 | 支路 208 | 街道 209 | 南路 210 | 北路 211 | 212 | 213 | [munit] 214 | :type = none 215 | :line = no 216 | 萬亿零年点分秒回节名个多届次集 217 | 218 | [chnum0] 219 | :type = prefix 220 | :line = no 221 | :tf = 2.5 222 | :idf = 1.0 223 | :attr = mt 224 | :include = chnum2,chnum3,munit,pubname3 225 | 0 226 | 227 | [chnum1] 228 | :type = prefix 229 | :include = chnum0,chnum1,munit,pubname3 230 | :tf = 3.0 231 | :idf = 1.0 232 | :attr = mt 233 | :line = no 234 | 一二三四五六七八九十百千 235 | 236 | [chnum2] 237 | :type = prefix 238 | :line = no 239 | :tf = 3.0 240 | :idf = 1.0 241 | :attr = mt 242 | :include = chnum0,chnum2,chnum3,munit,pubname3 243 | 123456789 244 | 245 | [chnum3] 246 | :type = none 247 | :line = no 248 | . 
249 | 250 | [chnum4] 251 | :type = prefix 252 | :line = no 253 | :tf = 3.0 254 | :idf = 1.0 255 | :attr = mt 256 | :include = chnum4,munit,pubname2,pubname3 257 | 258 | 壹贰叁肆柒捌玖拾佰仟 259 | 260 | [chnum5] 261 | :type = prefix 262 | :line = no 263 | :tf = 3.5 264 | :idf = 2.0 265 | :attr = nz 266 | :include = chnum1,munit,pubname3,chnum2 267 | 268 | 第每 269 | 270 | [alpha] 271 | :type = prefix 272 | :line = no 273 | :tf = 2.5 274 | :idf = 1.0 275 | :attr = en 276 | :include = alpha 277 | 278 | abcdefghijklmnopqrstuvwxyz 279 | ABCDEFGHIJKLMNOPQRSTUVWXYZ' 280 | 281 | [foregin] 282 | :type = prefix 283 | :line = no 284 | :tf = 4.0 285 | :idf = 3.0 286 | :attr = nr 287 | :include = foregin,pubname,pubname2,pubname3 288 | 阿克拉加内亚巴尔姆爱兰西伊杰纳布可夫勒特坦芬尼根登都 289 | 伯泰胥俄科索沃森奥瓦茨普蒂塞维大莱德冈墨哥弗库澳哈兹 290 | 乌奇切诺里基延达塔卡雅来波迈蓬什比摩曼乃休合娜迪凯帕 291 | 桑佩蒙博托格泽及希匹印埃努烈累法图喀土腓耶逊宾 292 | -------------------------------------------------------------------------------- /sql/zhparser.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION zhparser; 2 | 3 | -- make test configuration using parser 4 | 5 | CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); 6 | 7 | ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; 8 | 9 | -- ts_parse 10 | 11 | SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); 12 | 13 | SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); 14 | 15 | SELECT to_tsquery('testzhcfg', '保障房资金压力'); 16 | 17 | -- clean extension 18 | 19 | DROP EXTENSION zhparser CASCADE; 20 | -------------------------------------------------------------------------------- /zhparser--1.0--2.0.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_getsharepath() 2 | RETURNS text 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE SCHEMA zhparser; 7 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 8 | 9 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 10 | $$ 11 | declare 12 | dict_path text; 13 | time_tag_path text; 14 | query text; 15 | begin 16 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.txt' into dict_path; 17 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.tag' into time_tag_path; 18 | 19 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 20 | execute query; 21 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 22 | execute query; 23 | end; 24 | $$; 25 | 26 | select sync_zhprs_custom_word(); 27 | -------------------------------------------------------------------------------- /zhparser--1.0.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 
'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | -------------------------------------------------------------------------------- /zhparser--2.0--2.1.sql: -------------------------------------------------------------------------------- 1 | drop function zhprs_getsharepath(); 2 | 3 | CREATE or REPLACE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 4 | $$ 5 | declare 6 | database_oid text; 7 | data_dir text; 8 | dict_path text; 9 | time_tag_path text; 10 | query text; 11 | begin 12 | select setting from pg_settings where name='data_directory' into data_dir; 13 | select oid from pg_database where datname=current_database() into database_oid; 14 | 15 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 16 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 17 | 18 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 19 | execute query; 20 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 21 | execute query; 22 | end; 23 | $$; 24 | 25 | select sync_zhprs_custom_word(); 26 | -------------------------------------------------------------------------------- /zhparser--2.0.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | CREATE FUNCTION zhprs_getsharepath() 30 | RETURNS text 31 | AS 'MODULE_PATHNAME' 32 | LANGUAGE C STRICT; 33 | 34 | CREATE SCHEMA zhparser; 35 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 36 | 37 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 38 | $$ 39 | declare 40 | dict_path text; 41 | time_tag_path text; 42 | query text; 43 | begin 44 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.txt' into dict_path; 45 | select zhprs_getsharepath() || '/tsearch_data/qc_dict_' || current_database() || '.tag' into time_tag_path; 46 | 47 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 48 | execute query; 49 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 50 | execute query; 51 | end; 52 | $$; 53 | 54 | select 
sync_zhprs_custom_word(); 55 | -------------------------------------------------------------------------------- /zhparser--2.1--2.2.sql: -------------------------------------------------------------------------------- 1 | CREATE or REPLACE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 2 | $$ 3 | declare 4 | data_dir text; 5 | dict_path text; 6 | time_tag_path text; 7 | query text; 8 | begin 9 | select setting from pg_settings where name='data_directory' into data_dir; 10 | 11 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 12 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 13 | 14 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 15 | execute query; 16 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 17 | execute query; 18 | end; 19 | $$; 20 | 21 | -- do not created custom dict files when fresh installed 22 | -- select sync_zhprs_custom_word(); 23 | -------------------------------------------------------------------------------- /zhparser--2.1.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | 30 | CREATE SCHEMA zhparser; 31 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 32 | 33 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 34 | $$ 35 | declare 36 | database_oid text; 37 | data_dir text; 38 | dict_path text; 39 | time_tag_path text; 40 | query text; 41 | begin 42 | select setting from pg_settings where name='data_directory' into data_dir; 43 | select oid from pg_database where datname=current_database() into database_oid; 44 | 45 | 46 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 47 | select data_dir || '/base/' || database_oid || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 48 | 49 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 50 | execute query; 51 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 52 | execute query; 53 | end; 54 | $$; 55 | 56 | select sync_zhprs_custom_word(); 57 | -------------------------------------------------------------------------------- /zhparser--2.2.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION 
zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | 30 | CREATE SCHEMA zhparser; 31 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 32 | 33 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 34 | $$ 35 | declare 36 | data_dir text; 37 | dict_path text; 38 | time_tag_path text; 39 | query text; 40 | begin 41 | select setting from pg_settings where name='data_directory' into data_dir; 42 | 43 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 44 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 45 | 46 | query = 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to ' || chr(39) || dict_path || chr(39) || ' encoding ' || chr(39) || 'utf8' || chr(39) ; 47 | execute query; 48 | query = 'copy (select now()) to ' || chr(39) || time_tag_path || chr(39) ; 49 | execute query; 50 | end; 51 | $$; 52 | 53 | -- do not created custom dict files when fresh installed 54 | -- select sync_zhprs_custom_word(); 55 | -------------------------------------------------------------------------------- /zhparser--2.3.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION zhprs_start(internal, int4) 2 | RETURNS internal 3 | AS 'MODULE_PATHNAME' 4 | LANGUAGE C STRICT; 5 | 6 | CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) 7 | RETURNS internal 8 | AS 'MODULE_PATHNAME' 9 | LANGUAGE C STRICT; 10 | 11 | CREATE FUNCTION zhprs_end(internal) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME' 14 | LANGUAGE C STRICT; 15 | 16 | CREATE FUNCTION zhprs_lextype(internal) 17 | RETURNS internal 18 | AS 'MODULE_PATHNAME' 19 | LANGUAGE C STRICT; 20 | 21 | CREATE TEXT SEARCH PARSER zhparser ( 22 | START = zhprs_start, 23 | GETTOKEN = zhprs_getlexeme, 24 | END = zhprs_end, 25 | HEADLINE = pg_catalog.prsd_headline, 26 | LEXTYPES = zhprs_lextype 27 | ); 28 | 29 | 30 | CREATE SCHEMA zhparser; 31 | CREATE TABLE zhparser.zhprs_custom_word(word text primary key, tf float default '1.0', 32 | idf float default '1.0', attr char default '@', check(attr = '@' or attr = '!')); 33 | 34 | CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS 35 | $$ 36 | declare 37 | data_dir text; 38 | dict_path text; 39 | time_tag_path text; 40 | query text; 41 | begin 42 | select setting from pg_settings where name='data_directory' into data_dir; 43 | 44 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.txt' into dict_path; 45 | select data_dir || '/base' || '/zhprs_dict_' || current_database() || '.tag' into time_tag_path; 46 | 47 | query = $q$copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to '$q$ || dict_path || $q$' encoding 'utf8' $q$; 48 | execute query; 49 | query = $q$copy (select now()) to '$q$ || time_tag_path || $q$'$q$; 50 | execute query; 51 | end; 52 | $$; 53 | 54 | -- do 
not create custom dict files on a fresh install 55 | -- select sync_zhprs_custom_word(); 56 | -------------------------------------------------------------------------------- /zhparser--unpackaged--1.0.sql: -------------------------------------------------------------------------------- 1 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 2 | \echo Use "CREATE EXTENSION zhparser" to load this file. \quit 3 | 4 | ALTER EXTENSION zhparser ADD function zhprs_start(internal,integer); 5 | ALTER EXTENSION zhparser ADD function zhprs_getlexeme(internal,internal,internal); 6 | ALTER EXTENSION zhparser ADD function zhprs_end(internal); 7 | ALTER EXTENSION zhparser ADD function zhprs_lextype(internal); 8 | ALTER EXTENSION zhparser ADD text search parser zhparser; 9 | -------------------------------------------------------------------------------- /zhparser-backup-custom-dict.sh: -------------------------------------------------------------------------------- 1 | if [ $# -lt 2 ];then 2 | echo "usage: $0 <backup|delete|restore> <pgdata_dir> [restore_from_dir]" 3 | echo "warning: delete is a dangerous cmd, it will delete your custom dict files from pgdata_dir." 4 | echo "!!! you should run the backup cmd first, then run the delete cmd !!!" 5 | exit 2 6 | fi 7 | cmd=$1 8 | pgdata=$2 9 | restore_from_dir=$3 10 | 11 | if [ $cmd = 'backup' ];then 12 | backup_dir=zhparser-backup-custom-dict-$(date +'%F:%T') 13 | mkdir ./$backup_dir 14 | echo "will backup $pgdata/base/zhprs_dict_* to $backup_dir/" 15 | cp -a $pgdata/base/zhprs_dict_* $backup_dir/ 16 | if [ "$?" -ne 0 ]; 17 | then 18 | echo "backup error!" 19 | exit 1 20 | else 21 | echo "backup ok!" 22 | fi 23 | fi 24 | 25 | if [ $cmd = 'delete' ];then 26 | echo "will delete $pgdata/base/zhprs_dict_*" 27 | rm $pgdata/base/zhprs_dict_* 28 | if [ "$?" -ne 0 ]; 29 | then 30 | echo "delete error!" 31 | exit 1 32 | else 33 | echo "delete ok!" 34 | fi 35 | fi 36 | 37 | if [ $cmd = 'restore' ];then 38 | echo "will restore $restore_from_dir/zhprs_dict_* to $pgdata/base/" 39 | cp -a $restore_from_dir/zhprs_dict_* $pgdata/base/ 40 | if [ "$?" -ne 0 ]; 41 | then 42 | echo "restore error!" 43 | exit 1 44 | else 45 | echo "restore ok!"
46 | fi 47 | fi 48 | -------------------------------------------------------------------------------- /zhparser.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * zhparser.c 4 | * a text search parser for Chinese 5 | * 6 | *------------------------------------------------------------------------- 7 | */ 8 | #include "zhparser.h" 9 | 10 | #include "postgres.h" 11 | #include "miscadmin.h" 12 | #include "fmgr.h" 13 | #include "utils/guc.h" 14 | #include "utils/builtins.h" 15 | 16 | #if PG_VERSION_NUM >= 100000 17 | #include "utils/varlena.h" 18 | #endif 19 | 20 | #include "commands/dbcommands.h" 21 | 22 | /* dict file extension */ 23 | #define TXT_EXT ".txt" 24 | #define XDB_EXT ".xdb" 25 | /* length of file extension */ 26 | #define EXT_LEN 4 27 | 28 | PG_MODULE_MAGIC; 29 | /* 30 | * types 31 | */ 32 | 33 | /* self-defined type */ 34 | typedef struct 35 | { 36 | char *buffer; /* text to parse */ 37 | int len; /* length of the text in buffer */ 38 | int pos; /* position of the parser */ 39 | scws_t scws; 40 | scws_res_t head; 41 | scws_res_t curr; 42 | } ParserState; 43 | 44 | /* copy-paste from wparser.h of tsearch2 */ 45 | typedef struct 46 | { 47 | int lexid; 48 | char *alias; 49 | char *descr; 50 | } LexDescr; 51 | 52 | static void init(); 53 | 54 | static void init_type(LexDescr descr[]); 55 | 56 | /* 57 | * prototypes 58 | */ 59 | PG_FUNCTION_INFO_V1(zhprs_start); 60 | Datum zhprs_start(PG_FUNCTION_ARGS); 61 | 62 | PG_FUNCTION_INFO_V1(zhprs_getlexeme); 63 | Datum zhprs_getlexeme(PG_FUNCTION_ARGS); 64 | 65 | PG_FUNCTION_INFO_V1(zhprs_end); 66 | Datum zhprs_end(PG_FUNCTION_ARGS); 67 | 68 | PG_FUNCTION_INFO_V1(zhprs_lextype); 69 | Datum zhprs_lextype(PG_FUNCTION_ARGS); 70 | 71 | static scws_t scws = NULL; 72 | static ParserState parser_state; 73 | 74 | /* config */ 75 | static bool dict_in_memory = false; 76 | static char * extra_dicts = NULL; 77 | 78 | static bool punctuation_ignore = false; 79 | static bool seg_with_duality = false; 80 | static bool multi_short = false; 81 | static bool multi_duality = false; 82 | static bool multi_zmain = false; 83 | static bool multi_zall = false; 84 | 85 | static void init(){ 86 | char sharepath[MAXPGPATH]; 87 | char dict_path[MAXPGPATH]; 88 | char rule_path[MAXPGPATH]; 89 | int load_dict_mem_mode = 0x0; 90 | 91 | List *elemlist; 92 | ListCell *l; 93 | 94 | if (!(scws = scws_new())) { 95 | ereport(ERROR, 96 | (errcode(ERRCODE_INTERNAL_ERROR), 97 | errmsg("Failed to init Chinese Parser Lib SCWS!\"%s\"","" 98 | ))); 99 | } 100 | 101 | DefineCustomBoolVariable( 102 | "zhparser.dict_in_memory", 103 | "load dicts into memory", 104 | "load dicts into memory", 105 | &dict_in_memory, 106 | false, 107 | PGC_BACKEND, 108 | 0, 109 | NULL, 110 | NULL, 111 | NULL 112 | ); 113 | DefineCustomStringVariable( 114 | "zhparser.extra_dicts", 115 | "extra dicts files to load", 116 | "extra dicts files to load", 117 | &extra_dicts, 118 | NULL, 119 | PGC_BACKEND, 120 | 0, 121 | NULL, 122 | NULL, 123 | NULL 124 | ); 125 | DefineCustomBoolVariable( 126 | "zhparser.punctuation_ignore", 127 | "set if zhparser ignores the puncuation", 128 | "set if zhparser ignores the puncuation,except \\r and \\n", 129 | &punctuation_ignore, 130 | false, 131 | PGC_USERSET, 132 | 0, 133 | NULL, 134 | NULL, 135 | NULL 136 | ); 137 | 138 | DefineCustomBoolVariable( 139 | "zhparser.seg_with_duality", 140 | "segment words with duality", 141 | "segment words with duality", 142 
| &seg_with_duality, 143 | false, 144 | PGC_USERSET, 145 | 0, 146 | NULL, 147 | NULL, 148 | NULL 149 | ); 150 | DefineCustomBoolVariable( 151 | "zhparser.multi_short", 152 | "prefer short words", 153 | "prefer short words", 154 | &multi_short, 155 | false, 156 | PGC_USERSET, 157 | 0, 158 | NULL, 159 | NULL, 160 | NULL 161 | ); 162 | DefineCustomBoolVariable( 163 | "zhparser.multi_duality", 164 | "prefer duality", 165 | "prefer duality", 166 | &multi_duality, 167 | false, 168 | PGC_USERSET, 169 | 0, 170 | NULL, 171 | NULL, 172 | NULL 173 | ); 174 | DefineCustomBoolVariable( 175 | "zhparser.multi_zmain", 176 | "prefer most important element", 177 | "prefer most important element", 178 | &multi_zmain, 179 | false, 180 | PGC_USERSET, 181 | 0, 182 | NULL, 183 | NULL, 184 | NULL 185 | ); 186 | DefineCustomBoolVariable( 187 | "zhparser.multi_zall", 188 | "prefer all element", 189 | "prefer all element", 190 | &multi_zall, 191 | false, 192 | PGC_USERSET, 193 | 0, 194 | NULL, 195 | NULL, 196 | NULL 197 | ); 198 | 199 | get_share_path(my_exec_path, sharepath); 200 | snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s.%s", 201 | sharepath, "dict.utf8", "xdb"); 202 | scws_set_charset(scws, "utf-8"); 203 | 204 | if(dict_in_memory) 205 | load_dict_mem_mode = SCWS_XDICT_MEM; 206 | 207 | /* ignore error,default dict is xdb */ 208 | if( scws_set_dict(scws,dict_path,load_dict_mem_mode | SCWS_XDICT_XDB ) != 0){ 209 | ereport(NOTICE, 210 | (errcode(ERRCODE_INTERNAL_ERROR), 211 | errmsg("zhparser set dict : \"%s\" failed!",dict_path 212 | ))); 213 | } 214 | 215 | snprintf(dict_path, MAXPGPATH, "%s/base/zhprs_dict_%s.txt", 216 | DataDir, get_database_name(MyDatabaseId)); 217 | if(scws_add_dict(scws, dict_path, load_dict_mem_mode | SCWS_XDICT_TXT) != 0 ){ 218 | ereport(LOG, 219 | (errcode(ERRCODE_INTERNAL_ERROR), 220 | errmsg("zhparser add dict : \"%s\" failed! May not config custom dict, omit this",dict_path 221 | ))); 222 | } 223 | 224 | if(extra_dicts != NULL){ 225 | if(!SplitIdentifierString(extra_dicts,',',&elemlist)){ 226 | scws_free(scws); 227 | list_free(elemlist); 228 | scws = NULL; 229 | ereport(ERROR, 230 | (errcode(ERRCODE_INTERNAL_ERROR), 231 | errmsg("zhparser.extra_dicts syntax error! extra_dicts is \"%s\"",extra_dicts 232 | ))); 233 | } 234 | 235 | foreach(l,elemlist){ 236 | int load_dict_mode = load_dict_mem_mode; 237 | char * ext = strrchr((char*)lfirst(l),'.'); 238 | if(ext != NULL && strlen(ext) == EXT_LEN){ 239 | if(strncmp(ext,TXT_EXT,EXT_LEN) == 0){ 240 | load_dict_mode |= SCWS_XDICT_TXT; 241 | } 242 | else if(strncmp(ext,XDB_EXT,EXT_LEN) == 0){ 243 | load_dict_mode |= SCWS_XDICT_XDB; 244 | } 245 | } 246 | 247 | if(((load_dict_mode & SCWS_XDICT_TXT) == 0) && 248 | ((load_dict_mode & SCWS_XDICT_XDB) == 0)){ 249 | scws_free(scws); 250 | list_free(elemlist); 251 | scws = NULL; 252 | ereport(ERROR, 253 | (errcode(ERRCODE_INTERNAL_ERROR), 254 | errmsg("zhparser.extra_dicts setting error,the file name must end with .txt or .xdb! error file name is \"%s\"",(char*)lfirst(l) 255 | ))); 256 | 257 | } 258 | 259 | snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s", 260 | sharepath, (char*)lfirst(l)); 261 | /* ignore error*/ 262 | if( scws_add_dict(scws,dict_path,load_dict_mode) != 0 ){ 263 | ereport(LOG, 264 | (errcode(ERRCODE_INTERNAL_ERROR), 265 | errmsg("zhparser add dict : \"%s\" failed for extra dict! 
omit",dict_path 266 | ))); 267 | } 268 | } 269 | list_free(elemlist); 270 | } 271 | 272 | snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/%s.%s", 273 | sharepath, "rules.utf8", "ini"); 274 | scws_set_rule(scws ,rule_path); 275 | } 276 | 277 | /* 278 | * functions 279 | */ 280 | 281 | Datum 282 | zhprs_start(PG_FUNCTION_ARGS) 283 | { 284 | ParserState *pst = &parser_state; 285 | int multi_mode = 0x0; 286 | 287 | if(scws == NULL) 288 | init(); 289 | pst -> scws = scws; 290 | pst -> buffer = (char *) PG_GETARG_POINTER(0); 291 | pst -> len = PG_GETARG_INT32(1); 292 | pst -> pos = 0; 293 | 294 | scws_set_ignore(scws, (int)punctuation_ignore); 295 | scws_set_duality(scws,(int)seg_with_duality); 296 | 297 | if(multi_short){ 298 | multi_mode |= SCWS_MULTI_SHORT; 299 | } 300 | 301 | if(multi_duality){ 302 | multi_mode |= SCWS_MULTI_DUALITY; 303 | } 304 | 305 | if(multi_zmain){ 306 | multi_mode |= SCWS_MULTI_ZMAIN; 307 | } 308 | 309 | if(multi_zall){ 310 | multi_mode |= SCWS_MULTI_ZALL; 311 | } 312 | 313 | scws_set_multi(scws,multi_mode); 314 | 315 | scws_send_text(pst -> scws, pst -> buffer, pst -> len); 316 | 317 | (pst -> head) = (pst -> curr) = scws_get_result(pst -> scws); 318 | 319 | PG_RETURN_POINTER(pst); 320 | } 321 | 322 | Datum 323 | zhprs_getlexeme(PG_FUNCTION_ARGS) 324 | { 325 | ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); 326 | char **t = (char **) PG_GETARG_POINTER(1); 327 | int *tlen = (int *) PG_GETARG_POINTER(2); 328 | int type = -1; 329 | 330 | if((pst -> head) == NULL ) /* already done the work,or no sentence */ 331 | { 332 | *tlen = 0; 333 | type = 0; 334 | } 335 | /* have results */ 336 | else if(pst -> curr != NULL) 337 | { 338 | scws_res_t curr = pst -> curr; 339 | 340 | /* 341 | * check the first char to determine the lextype 342 | * if out of [0,25],then set to 'x',mean unknown type 343 | * so for Ag,Dg,Ng,Tg,Vg,the type will be unknown 344 | * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr 345 | */ 346 | type = (int)(curr -> attr)[0]; 347 | if(type > (int)'x' || type < (int)'a') 348 | type = (int)'x'; 349 | *tlen = curr -> len; 350 | *t = pst -> buffer + curr -> off; 351 | 352 | pst -> curr = curr -> next; 353 | 354 | /* fetch the next sentence */ 355 | if(pst -> curr == NULL ){ 356 | scws_free_result(pst -> head); 357 | (pst -> head) = (pst -> curr) = scws_get_result(pst -> scws); 358 | } 359 | } 360 | 361 | PG_RETURN_INT32(type); 362 | } 363 | 364 | Datum 365 | zhprs_end(PG_FUNCTION_ARGS) 366 | { 367 | PG_RETURN_VOID(); 368 | } 369 | 370 | Datum 371 | zhprs_lextype(PG_FUNCTION_ARGS) 372 | { 373 | LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (26 + 1)); 374 | init_type(descr); 375 | 376 | PG_RETURN_POINTER(descr); 377 | } 378 | 379 | static void init_type(LexDescr descr[]){ 380 | /* 381 | * there are 26 types in this parser,alias from a to z 382 | * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr 383 | */ 384 | descr[0].lexid = 97; 385 | descr[0].alias = pstrdup("a"); 386 | descr[0].descr = pstrdup("adjective,形容词"); 387 | descr[1].lexid = 98; 388 | descr[1].alias = pstrdup("b"); 389 | descr[1].descr = pstrdup("differentiation,区别词"); 390 | descr[2].lexid = 99; 391 | descr[2].alias = pstrdup("c"); 392 | descr[2].descr = pstrdup("conjunction,连词"); 393 | descr[3].lexid = 100; 394 | descr[3].alias = pstrdup("d"); 395 | descr[3].descr = pstrdup("adverb,副词"); 396 | descr[4].lexid = 101; 397 | descr[4].alias = pstrdup("e"); 398 | descr[4].descr = pstrdup("exclamation,感叹词"); 399 | descr[5].lexid = 102; 
400 |     descr[5].alias = pstrdup("f");
401 |     descr[5].descr = pstrdup("position,方位词");
402 |     descr[6].lexid = 103;
403 |     descr[6].alias = pstrdup("g");
404 |     descr[6].descr = pstrdup("root,词根");
405 |     descr[7].lexid = 104;
406 |     descr[7].alias = pstrdup("h");
407 |     descr[7].descr = pstrdup("head,前连接成分");
408 |     descr[8].lexid = 105;
409 |     descr[8].alias = pstrdup("i");
410 |     descr[8].descr = pstrdup("idiom,成语");
411 |     descr[9].lexid = 106;
412 |     descr[9].alias = pstrdup("j");
413 |     descr[9].descr = pstrdup("abbreviation,简称");
414 |     descr[10].lexid = 107;
415 |     descr[10].alias = pstrdup("k");
416 |     descr[10].descr = pstrdup("tail,后连接成分");
417 |     descr[11].lexid = 108;
418 |     descr[11].alias = pstrdup("l");
419 |     descr[11].descr = pstrdup("tmp,习用语");
420 |     descr[12].lexid = 109;
421 |     descr[12].alias = pstrdup("m");
422 |     descr[12].descr = pstrdup("numeral,数词");
423 |     descr[13].lexid = 110;
424 |     descr[13].alias = pstrdup("n");
425 |     descr[13].descr = pstrdup("noun,名词");
426 |     descr[14].lexid = 111;
427 |     descr[14].alias = pstrdup("o");
428 |     descr[14].descr = pstrdup("onomatopoeia,拟声词");
429 |     descr[15].lexid = 112;
430 |     descr[15].alias = pstrdup("p");
431 |     descr[15].descr = pstrdup("prepositional,介词");
432 |     descr[16].lexid = 113;
433 |     descr[16].alias = pstrdup("q");
434 |     descr[16].descr = pstrdup("quantity,量词");
435 |     descr[17].lexid = 114;
436 |     descr[17].alias = pstrdup("r");
437 |     descr[17].descr = pstrdup("pronoun,代词");
438 |     descr[18].lexid = 115;
439 |     descr[18].alias = pstrdup("s");
440 |     descr[18].descr = pstrdup("space,处所词");
441 |     descr[19].lexid = 116;
442 |     descr[19].alias = pstrdup("t");
443 |     descr[19].descr = pstrdup("time,时语素");
444 |     descr[20].lexid = 117;
445 |     descr[20].alias = pstrdup("u");
446 |     descr[20].descr = pstrdup("auxiliary,助词");
447 |     descr[21].lexid = 118;
448 |     descr[21].alias = pstrdup("v");
449 |     descr[21].descr = pstrdup("verb,动词");
450 |     descr[22].lexid = 119;
451 |     descr[22].alias = pstrdup("w");
452 |     descr[22].descr = pstrdup("punctuation,标点符号");
453 |     descr[23].lexid = 120;
454 |     descr[23].alias = pstrdup("x");
455 |     descr[23].descr = pstrdup("unknown,未知词");
456 |     descr[24].lexid = 121;
457 |     descr[24].alias = pstrdup("y");
458 |     descr[24].descr = pstrdup("modal,语气词");
459 |     descr[25].lexid = 122;
460 |     descr[25].alias = pstrdup("z");
461 |     descr[25].descr = pstrdup("status,状态词");
462 |     descr[26].lexid = 0;
463 | }
464 | // TODO: headline function
465 | 
--------------------------------------------------------------------------------
/zhparser.control:
--------------------------------------------------------------------------------
1 | comment = 'a parser for full-text search of Chinese'
2 | default_version = '2.3'
3 | module_pathname = '$libdir/zhparser'
4 | relocatable = true
5 | 
--------------------------------------------------------------------------------
/zhparser.h:
--------------------------------------------------------------------------------
1 | #ifndef ZHPARSER_H
2 | #define ZHPARSER_H
3 | 
4 | #ifndef pstrdup
5 | #define pstrdup scws_pstrdup
6 | #endif
7 | 
8 | #include "scws.h"
9 | 
10 | #undef pstrdup
11 | 
12 | #endif
13 | 
--------------------------------------------------------------------------------
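Usage sketch: zhprs_start, zhprs_getlexeme, zhprs_end and zhprs_lextype above are the four callbacks PostgreSQL expects from a text search parser, and the extension's SQL install scripts register them as a parser named zhparser. The snippet below only illustrates how the parser is typically exercised; the configuration name testzhcfg, the chosen token mappings and the sample sentence are placeholders, not objects defined in this repository:

    -- create the extension, then build a configuration on top of the zhparser parser
    CREATE EXTENSION zhparser;
    CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser);
    -- map a subset of the token types defined in init_type() to the simple dictionary
    ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple;
    -- list the 26 token types (lexids 97..122, aliases a..z) and run a quick parse
    SELECT * FROM ts_token_type('zhparser');
    SELECT * FROM ts_parse('zhparser', '南京市长江大桥');       -- sample text, illustrative only
    SELECT to_tsvector('testzhcfg', '南京市长江大桥');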
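Configuration sketch: the GUCs registered in init() control segmentation at run time. The PGC_USERSET booleans can be flipped per session with SET, while zhparser.dict_in_memory and zhparser.extra_dicts are PGC_BACKEND and therefore have to be set in postgresql.conf before the parser is first used in a backend; extra dictionaries are looked up under the share directory's tsearch_data and must end in .txt or .xdb, as the code above enforces. The values below are examples only, assuming the extension is installed:

    -- per-session switches (PGC_USERSET)
    SET zhparser.punctuation_ignore = on;   -- ignore punctuation, except \r and \n
    SET zhparser.multi_short = on;          -- prefer short words (SCWS_MULTI_SHORT)
    SET zhparser.seg_with_duality = off;
    -- backend-level settings (PGC_BACKEND); put these in postgresql.conf, not in SET:
    --   zhparser.dict_in_memory = on
    --   zhparser.extra_dicts = 'dict_extra.txt'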