├── .gitignore
├── .styleci.yml
├── LICENSE
├── README.md
├── composer.json
├── composer.lock
├── nbproject
├── project.properties
└── project.xml
├── src
├── Common.php
├── Dictionary.php
└── Seeker.php
├── tests
├── benchmark
│ ├── build_dict.php
│ ├── dict.txt
│ ├── packed.bin
│ └── seek.php
├── bootstrap.php
├── configuration.xml
└── src
│ └── DictionaryTest.php
└── utmake.yaml
/.gitignore:
--------------------------------------------------------------------------------
1 | /nbproject/private/
2 | /vendor/
--------------------------------------------------------------------------------
/.styleci.yml:
--------------------------------------------------------------------------------
1 | preset: laravel
2 |
3 | risky: false
4 |
5 | enabled:
6 |
7 | disabled:
8 | - braces
9 |
10 | finder:
11 | exclude:
12 | - "tests"
13 | name:
14 | - "*.php"
15 | # not-name:
16 | # - "*Stub.php"
17 | # contains:
18 | # - "Foo"
19 | # not-contains:
20 | # - "config"
21 | # path:
22 | # - "foo/Stuff"
23 | # not-path:
24 | # - "libraries"
25 | # depth:
26 | # - "< 3"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Andares Merigold
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Adarts 使用说明
2 |
3 | 基于静态Darts实现了AC自动机的字符串匹配类库,支持UTF-8编码,目前在项目中用于敏感词功能。
4 |
5 | **该库只支持PHP7及以上版本**,之前版本因为有C扩展实现了类似功能,也不需要这个。
6 |
7 | 因为并不精通算法,实现上更偏向于业务代码风格,较多地利用了php数组双向链表的特性。算法实现上若有不对,或是有更好的优化方案,**请狠狠给个merge request!**
8 |
9 | ## 更新与路线图
10 |
11 | |功能描述|实现版本|
12 | |---|---|
13 | |修正在双数组模式下导入同路径词条引发的问题| 1.6 |
14 | |调整部分方法及属性命名| 1.5 |
15 | |搜索时增加```$limit```和```$skip```参数| 1.4 |
16 | |增加批量查找功能,可一次获取所有命中词| 1.3 |
17 | |修正查找失败时的指针偏移bug| 1.3 |
18 |
19 | ## 更新内容详述
20 |
21 | ### 1.6版本改动
22 |
23 | 在此库已稳定将近9个月后居然在工作中发现了```重要BUG```,运营在字典中添加了一个单个字的词条后,居然查找不到这个词。
24 |
25 | 经过排查,发现这是压缩为```Double Array```后的正常现象。双数组结构可以获取更高的执行效率,同时也让算法实现的功能更为单一:拿一个字典与匹配字串进行碰撞,判断字串是否有包含字典中的词条。
26 |
27 | 因此,经过双数组压缩过后的字典中的每个节点,是不能同时为过路节点又为末节点的。
28 |
29 | 也就是说,如果往字典中添加```小猪```和```小猪狗```这两个词条,执行的结果是无法检查到```小猪```这个词条的。
30 |
31 | 经过取舍,为了保证该库和算法的设计本意,对字典的构建算法做了更改,当同时添加上述两个词条时,只有```小猪```会被加入。因为后者已经包含前者,所以并不会对“是否包含字典中的词条”这一检查目的造成影响。
32 |
33 | ### 1.5版本改动
34 |
35 | > * 变更```getWordsByState()```方法更名为```getWordByState()```,原方法名做兼容性留存
36 |
37 | ## 安装
38 |
39 | 一个简单的composer库而已:
40 |
41 | ```
42 | composer require andares/adarts
43 | ```
44 |
45 | 序列化功能用到了msgpack,这个东西我很喜欢,推荐使用:
46 |
47 | ```
48 | pecl install msgpack-2.0.1
49 | ```
50 |
51 | 安装完后在php.ini或是conf.d里加上```extension=msgpack.so```即可,线上环境莫忘重启fpm
52 |
53 | ## 创建字典
54 |
55 | ```
56 | $dict = new \Adarts\Dictionary();
57 | $dict->add('word1')
58 | ->add('word2')
59 | ->add('word3')
60 | ->add('word4')
61 | ->confirm();
62 | ```
63 |
64 | * add() 向字典中添加一个词条,返回$this,可像上面那样重复调用。
65 | * confirm() 当词条添加完后生成darts树及失败指针。
66 |
67 | 当然我知道大家实际中一般都会这么用:
68 |
69 | ```
70 | $dict = new \Adarts\Dictionary();
71 | foreach ($words as $word) {
72 |     $dict->add($word);
79 | }
80 | $dict->confirm();
81 | ```
82 |
83 | > 请注意,由于```confirm()```行为会对数据做压缩处理,因此一个新建的字典对象应只能执行一次```confirm()```操作,重复执行将导致报错。如果有需要重新生成字典,可以重新new一个新的字典对象进行创建。因为压缩后的darts树本身就算不上一种可逆操作,所以这个库本身并不能同时作为存放原始字典的容器使用。这个库字典设计一切目标是为了更快地搜索匹配,词库更适合放在数据库、文件系统等其他地方。
84 |
85 | ## 搜索匹配
86 |
87 | 字典创建完后就可以用于搜索,例如:
88 |
89 | ```
90 | $result = $dict->seek('get out! asshole!')->current();
91 | if ($result) {
92 | throw new \LogicException('you could not say it');
93 | }
94 | ```
95 |
96 | * seek() 搜索一个字串,看是否有字典中匹配的词。返回一个生成器对象,所以请使用```current()```方法获取第一个匹配词。
97 |
98 | 在上面的例子中,我们假设```asshole```在字典中,那么```current()```方法即可得到一个不为0的整数。
99 |
100 | > 事实上,$result即Darts中的**叶子节点state**
101 |
102 | **如果传入的字串中未包含字典中的内容,由于迭代器特性,则会返回一个null值,这点需要注意!**
103 |
104 | ### 限制搜索范围
105 |
106 | 搜索时传入```$limit```参数,可限制只把前几个字**作为开头**进行搜索,例如:
107 |
108 | ```{php}
109 | // 假设“违法”和“犯规”两字在字典中
110 | $limit = 1;
111 | $result = $dict->seek('违法犯规', $limit)->current();
112 |
113 | // 这时$result结果只有违法
114 | ```
115 |
116 | 搜索时传入第三个参数```$skip```,可跳过几个字符开始搜索,例如:
117 |
118 | ```{php}
119 | $limit = 1;
120 | $skip = 2;
121 | $result = $dict->seek('违法犯规', $limit, $skip)->current();
122 |
123 | // 这时$result结果是犯规
124 | ```
125 |
126 | 这里有几个要点需要理解:
127 |
128 | 1. limit限制的数字,是指查找起始点,而非结束点。所以这里limit=1会从第一个字"违"查起,但只要能一直匹配到成功,不会在“违”字结束匹配。
129 | 2. 传入的数字是UTF-8字符数,不是字节数。
130 | 3. skip和limit均不支持负数。
131 |
132 |
133 | ## 根据state获取匹配词
134 |
135 | 稍稍改进一下上面的代码:
136 |
137 | ```
138 | $result = $dict->seek('get out! asshole!')->current();
139 | if ($result) {
140 | throw new \LogicException('you could not say ' . $dict->getWordByState($result));
141 | }
142 | ```
143 |
144 | * getWordByState() 根据**叶子节点state**获取找到的匹配词,如果没意外上面取到的是asshole
145 |
146 | ## 查找多个命中词
147 |
148 | ```
149 | foreach ($dict->seek('get out! asshole!') as $result) {
150 | echo 'you could not say ' . $dict->getWordByState($result);
151 | }
152 | ```
153 |
154 | 利用迭代器特性,foreach返回的生成器对象即可获取所有命中词条。
155 |
156 | ### 关于找到的位置
157 |
158 | 因为支持失败指针,所以state的转换不是线性的,当通过失败指针跳到其他词条(的某个节点)时,还没找到能(有效率地)逆推到起始节点的好办法。
159 |
160 | 因此```seek()```只能告诉你是否有找到,最多带一个找到了什么,如果需要实现知道位置的功能,可以使用找到的词条另外调php方法去处理。在已经明确结果的情况下,单词条的查询效率不会有什么问题。
161 |
162 | ## 序列化
163 |
164 | Trie树的策略就是提前对字典做分析,在搜索的时候以最少的步数进行匹配搜索。所以每次搜索时都实时建立字典显然有违初衷,我们可以通过在输入词条列表时创建字典,并调```confirm()```生成分析后的Darts数据,然后对Dictionary进行序列化后保存(用你最爱的持久化方案,MySQL、mongodb、redis、memcache、leveldb等等等等)。
165 |
166 | 这样当需要搜索时,只需要读出字典对象直接搜索就行了。
167 |
168 | Adarts使用msgpack进行字典序列化,以获得比php序列化或json更好的I/O性能。
169 |
170 | ```
171 | // 这是创建$dict并添加词条并confirm()的1000行代码
172 |
173 | $packed = serialize($dict);
174 |
175 | // 这是把序列化后的数据持久化的1000行代码
176 | // 嗯,也许可能长这样
177 | redis()->set('dict', $packed);
178 | ```
179 |
180 | 现在我们要用了,那么:
181 |
182 | ```
183 | // 可能长成这样的读出序列化数据代码
184 | $packed = redis()->get('dict');
185 | $dict = unserialize($packed);
186 |
187 | $result = $dict->seek($str)->current();
188 | // ...搜索后的1000行业务代码
189 | ```
190 |
191 | ### 精简字典对象
192 |
193 | 显然,上面的做法还没有让速度达到最快,因为字典对象在创建的过程中会产生大量中间数据。这其中一部分是在其他一些场景(非搜索)中有用的,有一些则目前看起来没什么用处。
194 |
195 | 所以如果确认这个字典对象的序列化数据只用在搜索这一场景中(判断有无),那么可以这样打包:
196 |
197 | ```
198 | $packed = serialize($dict->simplify());
199 | ```
200 |
201 | 这样做会只留搜索必要的数据,让序列化后的数据更小,读出和反序列化更快。
202 |
203 | > 注意```simplify()```之后的字典对象去掉了**词条states索引**,这会导致```getWordByState()```方法不可用(总是返回空串)。所以对于有两种需求的情况来说,推荐精简与完整的字典序列化各存一份,以适用不同的场景。
204 |
205 | ## 感谢
206 |
207 | 感谢网上被复制了无数份的Trie PHP实现那个教程文档,虽然Trie树部分的代码完全没有参考价值,但我确实复制粘贴了原作者写的UTF-8拆分代码,并得到了指点。
208 |
209 | 还有[双数组Trie树(DART)详解](http://www.cnblogs.com/ooon/p/4883159.html)此文,参考的内容是最多的,刚看第一眼的时候其实并没有想读懂>_<
210 |
211 | 嗯,暂时就先写到这里,如果有坑请发issue给我,先谢过大家帮忙调试了!
212 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "andares/adarts",
3 | "description": "a darts class",
4 | "keywords": ["darts", "dart", "trie", "tree"],
5 | "license": "MIT",
6 | "authors": [
7 | {
8 | "name": "Andares Merigold",
9 | "email": "andares@outlook.com"
10 | }
11 | ],
12 | "require": {
13 | "php": "^7.1.0"
14 | },
15 | "require-dev": {
16 | "tracy/tracy": "^2.4"
17 | },
18 | "suggest": {
19 | "ext-msgpack": "Allow serialize dictionary by msgpack, it is compact & faster"
20 | },
21 | "autoload": {
22 | "classmap": ["src/"]
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/composer.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_readme": [
3 | "This file locks the dependencies of your project to a known state",
4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
5 | "This file is @generated automatically"
6 | ],
7 | "hash": "f878303cc5d2a3b42474cbe01d3af385",
8 | "content-hash": "381682c49cf0734c14792f514f637d12",
9 | "packages": [],
10 | "packages-dev": [
11 | {
12 | "name": "tracy/tracy",
13 | "version": "v2.4.8",
14 | "source": {
15 | "type": "git",
16 | "url": "https://github.com/nette/tracy.git",
17 | "reference": "4538cbe7f11d03d78afd34a168ca1c966792a3d6"
18 | },
19 | "dist": {
20 | "type": "zip",
21 | "url": "https://files.phpcomposer.com/files/nette/tracy/4538cbe7f11d03d78afd34a168ca1c966792a3d6.zip",
22 | "reference": "4538cbe7f11d03d78afd34a168ca1c966792a3d6",
23 | "shasum": ""
24 | },
25 | "require": {
26 | "ext-json": "*",
27 | "ext-session": "*",
28 | "php": ">=5.4.4"
29 | },
30 | "require-dev": {
31 | "nette/di": "~2.3",
32 | "nette/tester": "~2.0"
33 | },
34 | "suggest": {
35 | "https://nette.org/donate": "Please support Tracy via a donation"
36 | },
37 | "type": "library",
38 | "extra": {
39 | "branch-alias": {
40 | "dev-master": "2.4-dev"
41 | }
42 | },
43 | "autoload": {
44 | "classmap": [
45 | "src"
46 | ],
47 | "files": [
48 | "src/shortcuts.php"
49 | ]
50 | },
51 | "notification-url": "https://packagist.org/downloads/",
52 | "license": [
53 | "BSD-3-Clause",
54 | "GPL-2.0",
55 | "GPL-3.0"
56 | ],
57 | "authors": [
58 | {
59 | "name": "David Grudl",
60 | "homepage": "https://davidgrudl.com"
61 | },
62 | {
63 | "name": "Nette Community",
64 | "homepage": "https://nette.org/contributors"
65 | }
66 | ],
67 | "description": "😎 Tracy: the addictive tool to ease debugging PHP code for cool developers. Friendly design, logging, profiler, advanced features like debugging AJAX calls or CLI support. You will love it.",
68 | "homepage": "https://tracy.nette.org",
69 | "keywords": [
70 | "Xdebug",
71 | "debug",
72 | "debugger",
73 | "nette",
74 | "profiler"
75 | ],
76 | "time": "2017-07-14 06:38:56"
77 | }
78 | ],
79 | "aliases": [],
80 | "minimum-stability": "stable",
81 | "stability-flags": [],
82 | "prefer-stable": false,
83 | "prefer-lowest": false,
84 | "platform": {
85 | "php": "^7.1.0"
86 | },
87 | "platform-dev": []
88 | }
89 |
--------------------------------------------------------------------------------
/nbproject/project.properties:
--------------------------------------------------------------------------------
1 | auxiliary.org-netbeans-modules-php-phpunit.bootstrap_2e_create_2e_tests=true
2 | auxiliary.org-netbeans-modules-php-phpunit.bootstrap_2e_enabled=true
3 | auxiliary.org-netbeans-modules-php-phpunit.bootstrap_2e_path=tests/bootstrap.php
4 | auxiliary.org-netbeans-modules-php-phpunit.configuration_2e_enabled=true
5 | auxiliary.org-netbeans-modules-php-phpunit.configuration_2e_path=tests/configuration.xml
6 | auxiliary.org-netbeans-modules-php-phpunit.customSuite_2e_enabled=false
7 | auxiliary.org-netbeans-modules-php-phpunit.customSuite_2e_path=
8 | auxiliary.org-netbeans-modules-php-phpunit.phpUnit_2e_enabled=false
9 | auxiliary.org-netbeans-modules-php-phpunit.phpUnit_2e_path=
10 | auxiliary.org-netbeans-modules-php-phpunit.test_2e_groups_2e_ask=false
11 | auxiliary.org-netbeans-modules-php-phpunit.test_2e_run_2e_all=false
12 | auxiliary.org-netbeans-modules-php-phpunit.test_2e_run_2e_phpunit_2e_only=false
13 | file.reference.adarts-tests=tests
14 | include.path=${php.global.include.path}
15 | php.version=PHP_70
16 | source.encoding=UTF-8
17 | src.dir=.
18 | tags.asp=false
19 | tags.short=false
20 | test.src.dir=${file.reference.adarts-tests}
21 | testing.providers=PhpUnit
22 | web.root=.
23 |
--------------------------------------------------------------------------------
/nbproject/project.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | org.netbeans.modules.php.project
4 |
5 |
6 | adarts
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/Common.php:
--------------------------------------------------------------------------------
1 | check[0] = 0;
30 | $this->base[0] = 1;
31 | }
32 |
33 | /**
34 | * Encode $sample into character codes and scan it for dictionary words; $limit and $skip are passed to the Seeker unchanged.
35 | * @param string $sample text to search
36 | * @return \Generator the generator obtained by invoking the Seeker
37 | */
38 | public function seek(string $sample, int $limit = 0, int $skip = 0): \Generator {
39 | // Build the code sequence to search first; characters missing from the index become code 0
40 | $haystack = [];
41 | foreach ($this->split($sample) as $char) {
42 | $haystack[] = $this->index[$char] ?? 0;
43 | }
44 |
45 | $seeker = new Seeker($this->check, $this->base, $this->fail_states);
46 | return $seeker($haystack, $limit, $skip);
47 | }
48 |
49 | /**
50 | * Slim down the dictionary object, keeping only the data needed for searching.
51 | * @return self this dictionary, with index_code, tmp_tree, word_states and begin_used emptied and index_count reset to 0
52 | */
53 | public function simplify(): self {
54 | $this->index_code =
55 | $this->tmp_tree =
56 | $this->word_states =
57 | $this->begin_used = [];
58 | $this->index_count = 0;
59 | return $this;
60 | }
61 |
62 | /**
63 | * Add a word to the dictionary (inserted into the temporary trie).
64 | * @param string $word
65 | * @return self
66 | */
67 | public function add(string $word): self {
68 | // Walk the temporary trie from its root, one character at a time
69 | $tmp_tree = &$this->tmp_tree;
70 | foreach ($this->split($word) as $char) {
71 | // Register the character in the index and get its code
72 | $code = $this->index($char);
73 |
74 | // Descend into the child node for this code (putNode() creates it when missing)
75 | $tmp_tree = &$this->putNode($code, $tmp_tree);
76 |
77 | // putNode() hands back the root when an existing leaf already sits on this path: drop this useless word
78 | if ($tmp_tree == $this->tmp_tree) {
79 | break;
80 | }
81 | }
82 |
83 | // Prune the trie: if the final node still has children, empty it so this word becomes the leaf
84 | $tmp_tree && $tmp_tree != $this->tmp_tree && $tmp_tree = [];
85 |
86 | return $this;
87 | }
88 |
89 | /**
90 | * Look up the word recorded for a leaf-node state.
91 | * @param int $state
92 | * @return string the word, or '' when word_states has no entry for $state
93 | */
94 | public function getWordByState($state): string {
95 | return $this->word_states[$state] ?? '';
96 | }
97 |
98 | /**
99 | * Backward-compatibility alias that forwards to getWordByState(); obsolete.
100 | * @deprecated since version 1.5
101 | * @param int $state
102 | * @return string
103 | */
104 | public function getWordsByState($state) {
105 | return $this->getWordByState($state);
106 | }
107 |
108 | /**
109 | * Turn a list of character codes back into text via the reverse index; codes with no entry contribute nothing.
110 | * @param array $haystack list of character codes
111 | * @return string
112 | */
113 | public function translate(array $haystack): string {
114 | $this->indexCode();
115 |
116 | $result = '';
117 | foreach ($haystack as $code) {
118 | $result .= $this->index_code[$code] ?? ''; // e.g. code 0, which seek() uses for characters missing from the index
119 | }
120 | return $result;
121 | }
122 |
123 | /**
124 | * Fill index_code, the reverse (code => character) view of index.
125 | * @param bool $force rebuild even when index_code already has entries
126 | */
127 | private function indexCode($force = false) {
128 | if ($this->index_code && !$force) { return; }
129 |
130 | foreach ($this->index as $char => $char_code) {
131 | $this->index_code[$char_code] = $char;
132 | }
133 | }
134 |
135 | /**
136 | * Finish adding words: runs compress() and then makeFailStates().
137 | * @return self
138 | */
139 | public function confirm(): self {
140 | return $this->compress()->makeFailStates();
141 | }
142 |
143 | /**
144 | * Build the fail pointers by traversing tmp_tree with a Seeker that has no fail states of its own.
145 | * @return self
146 | */
147 | private function makeFailStates(): self {
148 | $seeker = new Seeker($this->check, $this->base, []);
149 | $this->traverseTreeForMakeFailCursor($this->tmp_tree, [], $seeker);
150 | return $this;
151 | }
152 |
153 | /**
154 | * Traverse tmp_tree recursively and create a fail pointer for each node.
155 | * @param array $tree subtree being visited
156 | * @param array $haystack codes on the path leading to $tree
157 | * @param \Adarts\Seeker $seeker
158 | * @param int $code code of the node owning $tree; 0 (the default) for the root call
159 | */
160 | private function traverseTreeForMakeFailCursor(array &$tree,
161 | array $haystack, Seeker $seeker, int $code = 0) {
162 | // Extend the path with this node's code before visiting its children
163 | $code && $haystack[] = $code;
164 | foreach ($tree as $code => $children_tree) {
165 | $this->gainFailCursor($haystack, $seeker, $code);
166 | if ($children_tree) {
167 | $this->traverseTreeForMakeFailCursor($children_tree,
168 | $haystack, $seeker, $code);
169 | }
170 | }
171 | }
172 |
173 | /**
174 | * Find the fail pointer for the node reached by $haystack followed by $code, and record it in fail_states.
175 | * @param array $haystack codes on the path to the parent node; nothing is done when it is empty
176 | * @param \Adarts\Seeker $seeker
177 | * @param int $code
178 | * @return void
179 | */
180 | private function gainFailCursor(array $haystack,
181 | Seeker $seeker, int $code) {
182 |
183 | if (!$haystack) {
184 | return;
185 | }
186 |
187 | $haystack[] = $code;
188 | $self = $seeker->forFail($haystack); // state of the node itself
189 | array_shift($haystack);
190 | do { // try successively shorter suffixes of the path until one resolves to a state
191 | $state = $seeker->forFail($haystack);
192 | if ($state) {
193 | $this->fail_states[$self] = $state;
194 | break;
195 | }
196 | array_shift($haystack);
197 | } while ($haystack);
198 | }
199 |
200 | /**
201 | * Compress the temporary trie into the Darts check/base arrays.
202 | * @return self
203 | */
204 | private function compress(): self {
205 | $base = $this->base[0];
206 | $this->beginUse($base);
207 | // The reverse index must exist first: traverseTreeForCompress() reads index_code to spell out words
208 | $this->indexCode();
209 | $this->traverseTreeForCompress($this->tmp_tree, $base);
210 | return $this;
211 | }
212 |
213 | /**
214 | * Traverse the trie and generate the check and base arrays; $prefix is the text spelled by the path so far.
215 | * @param array $tree
216 | * @param int $base
217 | */
218 | private function traverseTreeForCompress(array &$tree, int $base,
219 | string $prefix = '') {
220 |
221 | // Handle the current level first
222 | foreach ($tree as $code => $children_tree) {
223 | $state = $this->getState($base, $code);
224 | $this->check[$state] = $base;
225 | }
226 |
227 | // Then handle the children
228 | foreach ($tree as $code => $children_tree) {
229 | $state = $this->getState($base, $code);
230 | $word = $prefix . $this->index_code[$code];
231 |
232 | // Compute the check value of the child level, i.e. the base of the current node
233 | if ($children_tree) {
234 | $next_base = $this->findBegin($children_tree);
235 | $this->beginUse($next_base);
236 | $this->base[$state] = $next_base;
237 |
238 | $this->traverseTreeForCompress($children_tree, $next_base, $word);
239 | } else {
240 | // Leaf node: its base is stored as the negated check value
241 | $this->base[$state] = -$this->check[$state];
242 |
243 | // Create the state => word index entry
244 | $this->word_states[$state] = $word;
245 | }
246 | }
247 | }
248 |
249 | /**
250 | * Search for a usable base by stepping upward one value at a time (a simple approach for now).
251 | * @return int first value from 2 upward that is not in begin_used and whose child states are all absent from check
252 | */
253 | private function findBegin(array &$tree): int {
254 | // Candidate bases start at 2 and grow by one per attempt.
255 | for ($candidate = 2; ; $candidate++) {
256 | // Skip values already claimed as a base.
257 | if (isset($this->begin_used[$candidate])) {
258 | continue;
259 | }
260 |
261 | // Reject the candidate when any child would land on an occupied state.
262 | $collides = false;
263 | foreach ($tree as $child_code => $unused_subtree) {
264 | if (isset($this->check[$this->getState($candidate, $child_code)])) {
265 | $collides = true;
266 | break;
267 | }
268 | }
269 |
270 | if (!$collides) {
271 | return $candidate;
272 | }
273 | }
274 | // not reached: the loop only exits through the return above
275 | }
276 |
277 | /**
278 | * Mark $base as taken in begin_used (findBegin() skips marked values).
279 | * @param int $base
280 | * @return self
281 | */
282 | private function beginUse(int $base): self {
283 | $this->begin_used[$base] = 1;
284 | return $this;
285 | }
286 |
287 | /**
288 | * Add a node to the temporary tree and return it by reference.
289 | * @param int $code
290 | * @param array $tmp_tree node under which the child is looked up or created
291 | * @return array the child node for $code, or the root tmp_tree when that child already exists and is empty
292 | */
293 | private function &putNode(int $code, array &$tmp_tree): array {
294 | if (isset($tmp_tree[$code]) && $tmp_tree[$code] == []) {
295 | return $this->tmp_tree; // add() compares against the root to detect this case
296 | }
297 |
298 | !isset($tmp_tree[$code]) && $tmp_tree[$code] = [];
299 | return $tmp_tree[$code];
300 | }
301 |
302 | /**
303 | * Return the code for a character, assigning the next sequential code on first sight.
304 | * @param string $char
305 | * @return int
306 | */
307 | private function index(string $char): int {
308 | // Unseen character: hand out the next sequential code.
309 | if (!isset($this->index[$char])) {
310 | $this->index_count++;
311 | $this->index[$char] = $this->index_count;
312 | }
313 |
314 | return $this->index[$char];
315 | }
316 |
317 | /**
318 | * Split a string into UTF-8 characters by inspecting lead bytes; a generator yielding one string per character.
319 | * @param string $str bytes matching no branch, and lead bytes too close to the end of $str, yield nothing
320 | */
321 | private function split(string $str) {
322 | $len = strlen($str);
323 | for ($i = 0; $i < $len; $i++) {
324 | $c = $str[$i];
325 | $n = ord($c);
326 | if (($n >> 7) == 0) {
327 | // 0xxx xxxx: ASCII, single byte
328 | yield $c;
329 | } elseif (($n >> 4) == 15) { // 1111 xxxx: first byte of a four-byte character
330 | if ($i < $len - 3) {
331 | yield $c . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];
332 | $i += 3;
333 | }
334 | } elseif (($n >> 5) == 7) {
335 | // 111x xxxx: first byte of a three-byte character (1111 xxxx was handled above)
336 | if ($i < $len - 2) {
337 | yield $c . $str[$i + 1] . $str[$i + 2];
338 | $i += 2;
339 | }
340 | } elseif (($n >> 6) == 3) {
341 | // 11xx xxxx: first byte of a two-byte character
342 | if ($i < $len - 1) {
343 | yield $c . $str[$i + 1];
344 | $i++;
345 | }
346 | }
347 | }
348 | }
349 |
350 | /**
351 | * Serialize: pack all nine dictionary properties into one msgpack string.
352 | * @return string
353 | */
354 | public function serialize(): string {
355 | $data['check'] = $this->check;
356 | $data['base'] = $this->base;
357 | $data['index'] = $this->index;
358 | $data['fail_states'] = $this->fail_states;
359 | $data['index_count'] = $this->index_count;
360 | $data['index_code'] = $this->index_code;
361 | $data['tmp_tree'] = $this->tmp_tree;
362 | $data['word_states'] = $this->word_states;
363 | $data['begin_used'] = $this->begin_used;
364 | return msgpack_pack($data);
365 | }
366 |
367 | /**
368 | * Unserialize: restore the dictionary properties from a msgpack string.
369 | * @param string $serialized
370 | */
371 | public function unserialize($serialized) {
372 | $data = msgpack_unpack($serialized);
373 | $this->check = $data['check'];
374 | $this->base = $data['base'];
375 | $this->index = $data['index'];
376 | $this->fail_states = $data['fail_states'];
377 | $this->index_count = $data['index_count'];
378 | $this->index_code = $data['index_code'];
379 | $this->tmp_tree = $data['tmp_tree'];
380 | $this->word_states = $data['word_states'] ?? $data['words_states']; // falls back to the 'words_states' key when 'word_states' is absent
381 | $this->begin_used = $data['begin_used'];
382 | }
383 |
384 | }
385 |
--------------------------------------------------------------------------------
/src/Seeker.php:
--------------------------------------------------------------------------------
1 | check = $check;
19 | $this->base = $base;
20 | $this->fail_states = $fail_states;
21 | }
22 |
23 | public function forFail(array $haystack): int {
24 | // Follow $haystack (a list of character codes) from the root; returns the state reached, or 0 when a transition is missing.
25 | $base = $this->base[0];
26 | // Read position in $haystack; $state starts at 0 so an empty $haystack returns 0 rather than an undefined variable.
27 | $cursor = $state = 0;
28 |
29 | while (isset($haystack[$cursor])) {
30 | $state = $this->getState($base, $haystack[$cursor]);
31 |
32 | // The transition is valid only if the state has a base entry and its check value equals the current base
33 | if (!isset($this->base[$state]) || $this->check[$state] != $base) {
34 | return 0;
35 | }
36 | $base = $this->base[$state];
37 | $cursor++;
38 | }
39 | return $state;
40 | }
41 |
42 | public function __invoke(array $haystack,
43 | int $limit = 0, int $skip = 0): \Generator {
44 | // Makes the Seeker callable: forwards all arguments to process() and returns its generator
45 | $it = $this->process($haystack, $limit, $skip);
46 | return $it;
47 | }
48 |
49 | private function process(array $haystack, int $limit = 0, int $skip = 0) {
50 | // Current base
51 | $base = $this->base[0];
52 | // Start pointer: where the current match attempt begins
53 | $start = $skip;
54 | // Verify pointer: how far the current attempt has advanced past $start
55 | $verify = 0;
56 | // Precomputed actual read position
57 | $cursor = $start + $verify;
58 | // Initially the previous state is 0
59 | $pre_state = 0;
60 |
61 | // Begin searching; a non-zero $limit is offset by $skip and then bounds $start
62 | $limit && $limit += $skip;
63 | while (isset($haystack[$cursor]) && (!$limit || $start < $limit)) {
64 | // Compute the state from the current base and the code at the read position
65 | // (-1 is meant for a missing code) NOTE(review): that branch looks unreachable, the loop condition already requires isset($haystack[$cursor])
66 | $state = isset($haystack[$cursor]) ?
67 | $this->getState($base, $haystack[$cursor]) : -1;
68 |
69 | // Check whether this state has a base entry
70 | // and use its check value to verify that the parent base matches
71 | if (isset($this->base[$state]) && $this->check[$state] == $base) {
72 | // A positive base means the node is not a leaf
73 | if ($this->base[$state] > 0) {
74 | // Non-leaf node: its base becomes the check value of the next level
75 | $base = $this->base[$state];
76 | // Advance the verify pointer
77 | $verify++;
78 | // Remember the state for a possible fail-pointer lookup
79 | $pre_state = $state;
80 |
81 | } else {
82 | // Leaf node reached: match succeeded
83 | yield $state;
84 |
85 | // Reset the search position
86 | $base = $this->base[0];
87 | $start++;
88 | $verify = 0;
89 | $pre_state = 0;
90 | }
91 | } else {
92 | // State check failed
93 | if (isset($this->fail_states[$pre_state])) {
94 | // A fail pointer exists: set base from the fail state's check value and step back one position
95 | $base = $this->check[$this->fail_states[$pre_state]];
96 | $start += $verify - 1;
97 |
98 | } else {
99 | // No fail pointer: reset base to the root
100 | $base = $this->base[0];
101 | $verify ? ($start += $verify) : $start++;
102 | }
103 | // Reset the verify pointer and pre state
104 | $verify = 0;
105 | $pre_state = 0;
106 | }
107 |
108 | // Compute the new (or unchanged) read position
109 | $cursor = $start + $verify;
110 | // du("$cursor = $start + $verify", '$cursor = $start + $verify');
111 | }
112 |
113 | // Search finished
114 | return 0;
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/tests/benchmark/build_dict.php:
--------------------------------------------------------------------------------
1 | add($word);
21 | }
22 | fclose($handle);
23 |
24 | $dict->confirm();
25 |
26 | echo "dict build timecost: ".round((microtime(true) - $t) * 1000)."ms\n";
27 |
28 | // 不使用 simplify() 方法可获得反向翻译功能,但字典容量会大大增加。
29 | $packed = serialize($dict->simplify());
30 | file_put_contents(__DIR__.'/packed.bin', $packed);
31 |
32 | echo "done.\n";
33 |
--------------------------------------------------------------------------------
/tests/benchmark/dict.txt:
--------------------------------------------------------------------------------
1 | 考前答案
2 | 万科
3 | 家宝
4 |
--------------------------------------------------------------------------------
/tests/benchmark/packed.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andares/adarts/44e5faff7a78270f128189f7a50a32a38087c047/tests/benchmark/packed.bin
--------------------------------------------------------------------------------
/tests/benchmark/seek.php:
--------------------------------------------------------------------------------
1 | seek('上上下下左左右右A家宝BBA')->current(), "\n";
--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
1 |
2 |
3 |
7 |
--------------------------------------------------------------------------------
/tests/src/DictionaryTest.php:
--------------------------------------------------------------------------------
1 | object = new Dictionary;
21 | }
22 |
23 | /**
24 | * Tears down the fixture, for example, closes a network connection.
25 | * This method is called after a test is executed.
26 | */
27 | protected function tearDown() {
28 | // intentionally empty
29 | }
30 |
31 | public function testForBug() {
32 | $dictionary = new \Adarts\Dictionary();
33 | $dictionary->add('钱');
34 | $dictionary->add('测试');
35 | $dictionary->confirm();
36 | $keywords = [];
37 | foreach ($dictionary->seek('钱很多测试') as $result) {
38 | $keywords[] = $dictionary->getWordByState($result);
39 | }
40 |
41 | // NOTE(review): this test makes no assertions; it only exercises the calls
42 | $dictionary->add('走起');
43 | // After the confirm() above, adding again and calling confirm() again produces
44 | // PHP Notice: Undefined offset: 7
45 | $dictionary->confirm();
46 | $keywords = [];
47 | foreach ($dictionary->seek('钱很多测试走起拉') as $result) {
48 | $keywords[] = $dictionary->getWordByState($result);
49 | }
50 | /*
51 | $this->object
52 | // ->add("妓女")
53 | // ->add("妓")
54 | ->add("妓飞飞")
55 | ->add("飞机")
56 | // ->add("走开")
57 | ->confirm();
58 | du($this->object);
59 | $result = $this->object->seek('天下大妓同')->current();
60 | du($result);
61 | du($this->object->getWordByState($result));
62 | du($this->object);
63 | */
64 | }
65 |
66 | /**
67 | * @covers Adarts\Dictionary::add
68 | * @todo Implement testAdd().
69 | */
70 | public function testAdd() {
71 | // die();
72 | $this->object
73 | ->add("毛 abcfd")
74 | ->add("bcev")
75 | ->add("毛 主 席")
76 | ->add("主 导")
77 | ->add("习boss威武") // this entry is invalid
78 | ->add("习boss")
79 | ->confirm();
80 | du($this->object);
81 |
82 | $packed = serialize($this->object);
83 | // var_dump($packed);
84 | // du(strlen($packed));
85 | $this->object = unserialize($packed);
86 |
87 | // Test: find multiple matches in one pass
88 | $result_list = [
89 | '毛 abcfd' => 1,
90 | '主 导' => 1,
91 | '习boss' => 1,
92 | ];
93 | foreach ($this->object->seek('abd毛 主毛 abcfd 毛 主 导习bossk') as $result) {
94 | $word = $this->object->getWordByState($result);
95 | $this->assertTrue(isset($result_list[$word]));
96 | unset($result_list[$word]);
97 | }
98 | $this->assertEquals(0, count($result_list));
99 |
100 | // Test: deep fallback
101 | $result = $this->object->seek('123毛 abcfwr')->current();
102 | $this->assertNull($result);
103 | $this->assertEquals('', $this->object->getWordByState($result));
104 |
105 | // Test: fail pointer
106 | $result = $this->object->seek('abd毛 主d 毛 主 导k')->current();
107 | $this->assertEquals(26, $result);
108 | $this->assertEquals('主 导', $this->object->getWordByState($result));
109 |
110 | // Simplify
111 | $packed = serialize($this->object->simplify());
112 | // var_dump($packed);
113 | // du(strlen($packed));
114 | $this->object = unserialize($packed);
115 |
116 | // Test: not found
117 | $result = $this->object->seek('abd毛习')->current();
118 | $this->assertNull($result);
119 | $this->assertEquals('', $this->object->getWordByState($result));
120 |
121 | // Test: found
122 | $result = $this->object->seek('abd习bosseee')->current();
123 | $this->assertEquals(33, $result);
124 |
125 | // Test: limit
126 | foreach ($this->object->seek('主 导 习boss bcev', 3) as $result) {}
127 | $this->assertEquals(26, $result);
128 |
129 | // Reset $result
130 | $result = 0;
131 | foreach ($this->object->seek('主 导 习boss bcev', 2, 3) as $result) {}
132 | $this->assertEquals(33, $result);
133 |
134 | }
135 |
136 | /**
137 | * @covers Adarts\Dictionary::prepare
138 | * @todo Implement testPrepare().
139 | */
140 | public function testPrepare() {
141 | // Placeholder that reports the test as incomplete. Remove the following lines when you implement this test.
142 | $this->markTestIncomplete(
143 | 'This test has not been implemented yet.'
144 | );
145 | }
146 |
147 | /**
148 | * @covers Adarts\Dictionary::compress
149 | * @todo Implement testCompress().
150 | */
151 | public function testCompress() {
152 | // Placeholder that reports the test as incomplete. Remove the following lines when you implement this test.
153 | $this->markTestIncomplete(
154 | 'This test has not been implemented yet.'
155 | );
156 | }
157 |
158 | }
159 |
--------------------------------------------------------------------------------
/utmake.yaml:
--------------------------------------------------------------------------------
1 | class_prefix: Adarts
2 | class_dir: src/classes
3 | tests_dir: tests/src
4 | bootstrap: tests/bootstrap.php
5 | configuration: tests/configuration.xml
6 |
--------------------------------------------------------------------------------