├── .gitignore ├── .styleci.yml ├── LICENSE ├── README.md ├── composer.json ├── composer.lock ├── nbproject ├── project.properties └── project.xml ├── src ├── Common.php ├── Dictionary.php └── Seeker.php ├── tests ├── benchmark │ ├── build_dict.php │ ├── dict.txt │ ├── packed.bin │ └── seek.php ├── bootstrap.php ├── configuration.xml └── src │ └── DictionaryTest.php └── utmake.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | /nbproject/private/ 2 | /vendor/ -------------------------------------------------------------------------------- /.styleci.yml: -------------------------------------------------------------------------------- 1 | preset: laravel 2 | 3 | risky: false 4 | 5 | enabled: 6 | 7 | disabled: 8 | - braces 9 | 10 | finder: 11 | exclude: 12 | - "tests" 13 | name: 14 | - "*.php" 15 | # not-name: 16 | # - "*Stub.php" 17 | # contains: 18 | # - "Foo" 19 | # not-contains: 20 | # - "config" 21 | # path: 22 | # - "foo/Stuff" 23 | # not-path: 24 | # - "libraries" 25 | # depth: 26 | # - "< 3" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Andares Merigold 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Adarts 使用说明 2 | 3 | 基于静态Darts实现了AC自动机的字符串匹配类库,支持UTF-8编码,目前在项目中用于敏感词功能。 4 | 5 | **该库只支持PHP7及以上版本**,之前版本因为有C扩展实现了类似功能,也不需要这个。 6 | 7 | 因为并不精通算法,实现上更偏向于业务代码风格,较多地利用了php数组双向链表的特性。算法实现上若有不对,或是有更好的优化方案,**请狠狠给个merge request!** 8 | 9 | ## 更新与路线图 10 | 11 | |功能描述|实现版本| 12 | |---|---| 13 | |修正在双数组模式下导入同路径词条引发的问题| 1.6 | 14 | |调整部分方法及属性命名| 1.5 | 15 | |搜索时增加```$limit```和```$skip```参数| 1.4 | 16 | |增加批量查找功能,可一次获取所有命中词| 1.3 | 17 | |修正查找失败时的指针偏移bug| 1.3 | 18 | 19 | ## 更新内容详述 20 | 21 | ### 1.6版本改动 22 | 23 | 在此库已稳定将近9个月后居然在工作中发现了```重要BUG```,运营在字典中添加了一个单个字的词条后,居然查找不到这个词。 24 | 25 | 经过排查,发现这是压缩为```Double Array```后的正常现象。双数组结构可以获取更高的执行效率,同时也让算法实现的功能更为单一:拿一个字典与匹配字串进行碰撞,判断字串是否有包含字典中的词条。 26 | 27 | 因此,经过双数组压缩过后的字典中的每个节点,是不能同时为过路节点又为末节点的。 28 | 29 | 也就是说,如果往字典中添加```小猪```和```小猪狗```这两个词条,执行的结果是无法检查到```小猪```这个词条的。 30 | 31 | 经过取舍,为了保证该库和算法的设计本意,对字典的构建算法做了更改,当同时添加上述两个词条时,只有```小猪```会被加入。因为后者已经包含前者,所以并不会对“是否包含字典中的词条”这一检查目的造成影响。 32 | 33 | ### 1.5版本改动 34 | 35 | > * ```getWordsByState()```方法更名为```getWordByState()```,原方法名做兼容性留存 36 | 37 | ## 安装 38 | 39 | 一个简单的composer库而已: 40 | 41 | ``` 42 | composer require andares/adarts 43 | ``` 44 | 45 | 序列化功能用到了msgpack,这个东西我很喜欢,推荐使用: 46 | 47 | ``` 48 | pecl install msgpack-2.0.1 49 | ``` 50 | 51 | 安装完后在php.ini或是conf.d里加上```extension=msgpack.so```即可,线上环境莫忘重启fpm 52 | 53 | ## 创建字典 54 | 55 | ``` 56 | 
$dict = new \Adarts\Dictionary(); 57 | $dict->add('word1') 58 | ->add('word2') 59 | ->add('word3') 60 | ->add('word4') 61 | ->confirm(); 62 | ``` 63 | 64 | * add() 向字典中添加一个词条,返回$this,可像上面那样重复调用。 65 | * confirm() 当词条添加完后生成darts树及失败指针。 66 | 67 | 当然我知道大家实际中一般都会这么用: 68 | 69 | ``` 70 | add($word); 79 | } 80 | $dict->confirm(); 81 | ``` 82 | 83 | > 请注意,由于```confirm()```行为会对数据做压缩处理,因此一个新建的字典对象应只能执行一次```confirm()```操作,重复执行将导致报错。如果有需要重新生成字典,可以重新new一个新的字典对象进行创建。因为压缩后的darts树本身就算不上一种可逆操作,所以这个库本身并不能同时作为存放原始字典的容器使用。这个库的字典设计的一切目标是为了更快地搜索匹配,词库更适合放在数据库、文件系统等其他地方。 84 | 85 | ## 搜索匹配 86 | 87 | 字典创建完后就可以用于搜索,例如: 88 | 89 | ``` 90 | $result = $dict->seek('get out! asshole!')->current(); 91 | if ($result) { 92 | throw new \LogicException('you could not say it'); 93 | } 94 | ``` 95 | 96 | * seek() 搜索一个字串,看是否有字典中匹配的词。返回一个生成器对象,所以请使用```current()```方法获取第一个匹配词。 97 | 98 | 在上面的例子中,我们假设```asshole```在字典中,那么```current()```方法即可得到一个不为0的整数。 99 | 100 | > 事实上,$result即Darts中的**叶子节点state** 101 | 102 | **如果传入的字串中未包含字典中的内容,由于迭代器特性,则会返回一个null值,这点需要注意!** 103 | 104 | ### 限制搜索范围 105 | 106 | 搜索时传入```$limit```参数,可限制只搜索到第几个字**作为开头**,例如: 107 | 108 | ```{php} 109 | // 假设“违法”和“犯规”两词在字典中 110 | $limit = 1; 111 | $result = $dict->seek('违法犯规', $limit)->current(); 112 | 113 | // 这时$result结果只有违法 114 | ``` 115 | 116 | 搜索时传入第三个参数```$skip```,可跳过几个字符开始搜索,例如: 117 | 118 | ```{php} 119 | $limit = 1; 120 | $skip = 2; 121 | $result = $dict->seek('违法犯规', $limit, $skip)->current(); 122 | 123 | // 这时$result结果是犯规 124 | ``` 125 | 126 | 这里有几个要点需要理解: 127 | 128 | 1. limit限制的数字,是指查找起始点,而非结束点。所以这里limit=1会从第一个字“违”查起,但只要能一直匹配到成功,不会在“违”字结束匹配。 129 | 2. 传入的数字是UTF-8字符数,不是字节数。 130 | 3. skip和limit均不支持负数。 131 | 132 | 133 | ## 根据state获取匹配词 134 | 135 | 稍稍改进一下上面的代码: 136 | 137 | ``` 138 | $result = $dict->seek('get out! asshole!')->current(); 139 | if ($result) { 140 | throw new \LogicException('you could not say ' . 
$dict->getWordByState($result)); 141 | } 142 | ``` 143 | 144 | * getWordByState() 根据**叶子节点state**获取找到的匹配词,如果没意外上面取到的是asshole 145 | 146 | ## 查找多个命中词 147 | 148 | ``` 149 | foreach ($dict->seek('get out! asshole!') as $result) { 150 | echo 'you could not say ' . $dict->getWordByState($result); 151 | } 152 | ``` 153 | 154 | 利用迭代器特性,foreach遍历返回的生成器对象即可获取所有命中词条。 155 | 156 | ### 关于找到的位置 157 | 158 | 因为支持失败指针,所以state的转换不是线性的,当通过失败指针跳到其他词条(的某个节点)时,还没找到(有效率地)逆推到起始节点的好办法。 159 | 160 | 因此```seek()```只能告诉你是否有找到,最多带一个找到了什么,如果需要实现知道位置的功能,可以使用找到的词条另外调php方法去处理。在已经明确结果的情况下,单词条的查询效率不会有什么问题。 161 | 162 | ## 序列化 163 | 164 | Trie树的策略就是提前对字典做分析,在搜索的时候以最少的步数进行匹配搜索。所以每次搜索时都实时建立字典显然有违初衷,我们可以通过在输入词条列表时创建字典,并调```confirm()```生成分析后的Darts数据,然后对Dictionary进行序列化后保存(用你最爱的持久化方案,MySQL、mongodb、redis、memcache、leveldb等等等等)。 165 | 166 | 这样当需要搜索时,只需要读出字典对象直接搜索就行了。 167 | 168 | Adarts使用msgpack进行字典序列化,以获得比php序列化或json更好的I/O性能。 169 | 170 | ``` 171 | // 这是创建$dict并添加词条并confirm()的1000行代码 172 | 173 | $packed = serialize($dict); 174 | 175 | // 这是把序列化后的数据持久化的1000行代码 176 | // 嗯,也许可能长这样 177 | redis()->set('dict', $packed); 178 | ``` 179 | 180 | 现在我们要用了,那么: 181 | 182 | ``` 183 | // 可能长成这样的读出序列化数据代码 184 | $packed = redis()->get('dict'); 185 | $dict = unserialize($packed); 186 | 187 | $result = $dict->seek($str)->current(); 188 | // ...搜索后的1000行业务代码 189 | ``` 190 | 191 | ### 精简字典对象 192 | 193 | 显然,上面的做法还没有让速度达到最快,因为字典对象在创建的过程中会产生大量中间数据。这其中一部分是在其他一些场景(非搜索)中有用的,有一些则目前看起来没什么用处。 194 | 195 | 所以如果确认这个字典对象的序列化数据只用在搜索这一场景中(判断有无),那么可以这样打包: 196 | 197 | ``` 198 | $packed = serialize($dict->simplify()); 199 | ``` 200 | 201 | 这样做会只留搜索必要的数据,让序列化后的数据更小,读出和反序列化更快。 202 | 203 | > 注意```simplify()```之后的字典对象去掉了**词条states索引**,这会导致```getWordByState()```方法不可用(总是返回空串)。所以对于有两种需求的情况来说,推荐精简与完整的字典序列化各存一份,以适用不同的场景。 204 | 205 | ## 感谢 206 | 207 | 感谢网上被复制了无数份的Trie PHP实现那个教程文档,虽然Trie树部分的代码完全没有参考价值,但我确实复制粘贴了原作者写的UTF-8拆分代码,并得到了指点。 208 | 209 | 还有[双数组Trie树(DART)详解](http://www.cnblogs.com/ooon/p/4883159.html)此文,参考的内容是最多的,刚看第一眼的时候其实并没有想读懂>_< 210 | 211 | 
嗯,暂时就先写到这里,如果有坑请发issue给我,先谢过大家帮忙调试了! 212 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "andares/adarts", 3 | "description": "a darts class", 4 | "keywords": ["darts", "dart", "trie", "tree"], 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "Andares Merigold", 9 | "email": "andares@outlook.com" 10 | } 11 | ], 12 | "require": { 13 | "php": "^7.1.0" 14 | }, 15 | "require-dev": { 16 | "tracy/tracy": "^2.4" 17 | }, 18 | "suggest": { 19 | "ext-msgpack": "Allow serialize dictionary by msgpack, it is compact & faster" 20 | }, 21 | "autoload": { 22 | "classmap": ["src/"] 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /composer.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", 5 | "This file is @generated automatically" 6 | ], 7 | "hash": "f878303cc5d2a3b42474cbe01d3af385", 8 | "content-hash": "381682c49cf0734c14792f514f637d12", 9 | "packages": [], 10 | "packages-dev": [ 11 | { 12 | "name": "tracy/tracy", 13 | "version": "v2.4.8", 14 | "source": { 15 | "type": "git", 16 | "url": "https://github.com/nette/tracy.git", 17 | "reference": "4538cbe7f11d03d78afd34a168ca1c966792a3d6" 18 | }, 19 | "dist": { 20 | "type": "zip", 21 | "url": "https://files.phpcomposer.com/files/nette/tracy/4538cbe7f11d03d78afd34a168ca1c966792a3d6.zip", 22 | "reference": "4538cbe7f11d03d78afd34a168ca1c966792a3d6", 23 | "shasum": "" 24 | }, 25 | "require": { 26 | "ext-json": "*", 27 | "ext-session": "*", 28 | "php": ">=5.4.4" 29 | }, 30 | "require-dev": { 31 | "nette/di": "~2.3", 32 | "nette/tester": "~2.0" 33 | }, 34 | "suggest": { 35 | 
"https://nette.org/donate": "Please support Tracy via a donation" 36 | }, 37 | "type": "library", 38 | "extra": { 39 | "branch-alias": { 40 | "dev-master": "2.4-dev" 41 | } 42 | }, 43 | "autoload": { 44 | "classmap": [ 45 | "src" 46 | ], 47 | "files": [ 48 | "src/shortcuts.php" 49 | ] 50 | }, 51 | "notification-url": "https://packagist.org/downloads/", 52 | "license": [ 53 | "BSD-3-Clause", 54 | "GPL-2.0", 55 | "GPL-3.0" 56 | ], 57 | "authors": [ 58 | { 59 | "name": "David Grudl", 60 | "homepage": "https://davidgrudl.com" 61 | }, 62 | { 63 | "name": "Nette Community", 64 | "homepage": "https://nette.org/contributors" 65 | } 66 | ], 67 | "description": "😎 Tracy: the addictive tool to ease debugging PHP code for cool developers. Friendly design, logging, profiler, advanced features like debugging AJAX calls or CLI support. You will love it.", 68 | "homepage": "https://tracy.nette.org", 69 | "keywords": [ 70 | "Xdebug", 71 | "debug", 72 | "debugger", 73 | "nette", 74 | "profiler" 75 | ], 76 | "time": "2017-07-14 06:38:56" 77 | } 78 | ], 79 | "aliases": [], 80 | "minimum-stability": "stable", 81 | "stability-flags": [], 82 | "prefer-stable": false, 83 | "prefer-lowest": false, 84 | "platform": { 85 | "php": "^7.1.0" 86 | }, 87 | "platform-dev": [] 88 | } 89 | -------------------------------------------------------------------------------- /nbproject/project.properties: -------------------------------------------------------------------------------- 1 | auxiliary.org-netbeans-modules-php-phpunit.bootstrap_2e_create_2e_tests=true 2 | auxiliary.org-netbeans-modules-php-phpunit.bootstrap_2e_enabled=true 3 | auxiliary.org-netbeans-modules-php-phpunit.bootstrap_2e_path=tests/bootstrap.php 4 | auxiliary.org-netbeans-modules-php-phpunit.configuration_2e_enabled=true 5 | auxiliary.org-netbeans-modules-php-phpunit.configuration_2e_path=tests/configuration.xml 6 | auxiliary.org-netbeans-modules-php-phpunit.customSuite_2e_enabled=false 7 | 
auxiliary.org-netbeans-modules-php-phpunit.customSuite_2e_path= 8 | auxiliary.org-netbeans-modules-php-phpunit.phpUnit_2e_enabled=false 9 | auxiliary.org-netbeans-modules-php-phpunit.phpUnit_2e_path= 10 | auxiliary.org-netbeans-modules-php-phpunit.test_2e_groups_2e_ask=false 11 | auxiliary.org-netbeans-modules-php-phpunit.test_2e_run_2e_all=false 12 | auxiliary.org-netbeans-modules-php-phpunit.test_2e_run_2e_phpunit_2e_only=false 13 | file.reference.adarts-tests=tests 14 | include.path=${php.global.include.path} 15 | php.version=PHP_70 16 | source.encoding=UTF-8 17 | src.dir=. 18 | tags.asp=false 19 | tags.short=false 20 | test.src.dir=${file.reference.adarts-tests} 21 | testing.providers=PhpUnit 22 | web.root=. 23 | -------------------------------------------------------------------------------- /nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.php.project 4 | 5 | 6 | adarts 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/Common.php: -------------------------------------------------------------------------------- 1 | check[0] = 0; 30 | $this->base[0] = 1; 31 | } 32 | 33 | /** 34 | * 35 | * @param string $sample 36 | * @return \Generator 37 | */ 38 | public function seek(string $sample, int $limit = 0, int $skip = 0): \Generator { 39 | // 先生成用于搜索的转义串 40 | $haystack = []; 41 | foreach ($this->split($sample) as $char) { 42 | $haystack[] = $this->index[$char] ?? 
0; 43 | } 44 | 45 | $seeker = new Seeker($this->check, $this->base, $this->fail_states); 46 | return $seeker($haystack, $limit, $skip); 47 | } 48 | 49 | /** 50 | * 精简词典对像,仅保留搜索必要的数据 51 | * @return self 52 | */ 53 | public function simplify(): self { 54 | $this->index_code = 55 | $this->tmp_tree = 56 | $this->word_states = 57 | $this->begin_used = []; 58 | $this->index_count = 0; 59 | return $this; 60 | } 61 | 62 | /** 63 | * 添加词进字典 64 | * @param string $word 65 | * @return self 66 | */ 67 | public function add(string $word): self { 68 | // 构建临时树 69 | $tmp_tree = &$this->tmp_tree; 70 | foreach ($this->split($word) as $char) { 71 | // 加入索引 72 | $code = $this->index($char); 73 | 74 | // 插树 75 | $tmp_tree = &$this->putNode($code, $tmp_tree); 76 | 77 | // 抛弃无用词条 78 | if ($tmp_tree == $this->tmp_tree) { 79 | break; 80 | } 81 | } 82 | 83 | // 修剪trie树,抛弃无用词条 84 | $tmp_tree && $tmp_tree != $this->tmp_tree && $tmp_tree = []; 85 | 86 | return $this; 87 | } 88 | 89 | /** 90 | * 根据叶子节点 state 拿word 91 | * @param int $state 92 | * @return string 93 | */ 94 | public function getWordByState($state): string { 95 | return $this->word_states[$state] ?? 
''; 96 | } 97 | 98 | /** 99 | * 版本兼容性方法,已作废 100 | * @deprecated since version 1.5 101 | * @param int $state 102 | * @return string 103 | */ 104 | public function getWordsByState($state) { 105 | return $this->getWordByState($state); 106 | } 107 | 108 | /** 109 | * 110 | * @param array $haystack 111 | * @return string 112 | */ 113 | public function translate(array $haystack): string { 114 | $this->indexCode(); 115 | 116 | $result = ''; 117 | foreach ($haystack as $code) { 118 | $result .= $this->index_code[$code]; 119 | } 120 | return $result; 121 | } 122 | 123 | /** 124 | * 为 index 创建 code 反向索引 125 | * @param bool $force 126 | */ 127 | private function indexCode($force = false) { 128 | if (!$this->index_code || $force) { 129 | foreach ($this->index as $word => $code) { 130 | $this->index_code[$code] = $word; 131 | } 132 | } 133 | } 134 | 135 | /** 136 | * 添加完毕 137 | * @return self 138 | */ 139 | public function confirm(): self { 140 | return $this->compress()->makeFailStates(); 141 | } 142 | 143 | /** 144 | * 创建失败指针 145 | * @return self 146 | */ 147 | private function makeFailStates(): self { 148 | $seeker = new Seeker($this->check, $this->base, []); 149 | $this->traverseTreeForMakeFailCursor($this->tmp_tree, [], $seeker); 150 | return $this; 151 | } 152 | 153 | /** 154 | * 遍历tmp_tree创建失败指针 155 | * @param array $tree 156 | * @param array $haystack 157 | * @param \Adarts\Seeker $seeker 158 | * @param int $code 159 | */ 160 | private function traverseTreeForMakeFailCursor(array &$tree, 161 | array $haystack, Seeker $seeker, int $code = 0) { 162 | 163 | $code && $haystack[] = $code; 164 | foreach ($tree as $code => $children_tree) { 165 | $this->gainFailCursor($haystack, $seeker, $code); 166 | if ($children_tree) { 167 | $this->traverseTreeForMakeFailCursor($children_tree, 168 | $haystack, $seeker, $code); 169 | } 170 | } 171 | } 172 | 173 | /** 174 | * 搜索失败指针 175 | * @param array $haystack 176 | * @param \Adarts\Seeker $seeker 177 | * @param int $code 178 | * @return 
void 179 | */ 180 | private function gainFailCursor(array $haystack, 181 | Seeker $seeker, int $code) { 182 | 183 | if (!$haystack) { 184 | return; 185 | } 186 | 187 | $haystack[] = $code; 188 | $self = $seeker->forFail($haystack); 189 | array_shift($haystack); 190 | do { 191 | $state = $seeker->forFail($haystack); 192 | if ($state) { 193 | $this->fail_states[$self] = $state; 194 | break; 195 | } 196 | array_shift($haystack); 197 | } while ($haystack); 198 | } 199 | 200 | /** 201 | * 将临时Trie树压缩成Darts 202 | * @return self 203 | */ 204 | private function compress(): self { 205 | $base = $this->base[0]; 206 | $this->beginUse($base); 207 | 208 | $this->indexCode(); 209 | $this->traverseTreeForCompress($this->tmp_tree, $base); 210 | return $this; 211 | } 212 | 213 | /** 214 | * 遍历Trie树,生成check与base 215 | * @param array $tree 216 | * @param int $base 217 | */ 218 | private function traverseTreeForCompress(array &$tree, int $base, 219 | string $prefix = '') { 220 | 221 | // 先处理当前层级 222 | foreach ($tree as $code => $children_tree) { 223 | $state = $this->getState($base, $code); 224 | $this->check[$state] = $base; 225 | } 226 | 227 | // 再处理子级 228 | foreach ($tree as $code => $children_tree) { 229 | $state = $this->getState($base, $code); 230 | $word = $prefix . 
$this->index_code[$code]; 231 | 232 | // 计算此子级的check 即当前节点的base 233 | if ($children_tree) { 234 | $next_base = $this->findBegin($children_tree); 235 | $this->beginUse($next_base); 236 | $this->base[$state] = $next_base; 237 | 238 | $this->traverseTreeForCompress($children_tree, $next_base, $word); 239 | } else { 240 | // 叶节点 241 | $this->base[$state] = -$this->check[$state]; 242 | 243 | // 创建 word 的 state 索引 244 | $this->word_states[$state] = $word; 245 | } 246 | } 247 | } 248 | 249 | /** 250 | * 暂时用步进法搜索 251 | * @return int 252 | */ 253 | private function findBegin(array &$tree): int { 254 | $base = 1; 255 | $found = false; 256 | while (!$found) { 257 | // 步进 258 | $base++; 259 | 260 | // 如有使用跳过 261 | if (isset($this->begin_used[$base])) { 262 | continue; 263 | } 264 | 265 | // 查找是否符合条件 266 | foreach ($tree as $child_code => $child_tree) { 267 | if (isset($this->check[$this->getState($base, $child_code)])) { 268 | continue 2; 269 | } 270 | } 271 | 272 | $found = true; 273 | } 274 | return $base; 275 | } 276 | 277 | /** 278 | * 279 | * @param int $base 280 | * @return self 281 | */ 282 | private function beginUse(int $base): self { 283 | $this->begin_used[$base] = 1; 284 | return $this; 285 | } 286 | 287 | /** 288 | * 往临时树里添加一个节点 289 | * @param int $code 290 | * @param array $tmp_tree 291 | * @return array 292 | */ 293 | private function &putNode(int $code, array &$tmp_tree): array { 294 | if (isset($tmp_tree[$code]) && $tmp_tree[$code] == []) { 295 | return $this->tmp_tree; 296 | } 297 | 298 | !isset($tmp_tree[$code]) && $tmp_tree[$code] = []; 299 | return $tmp_tree[$code]; 300 | } 301 | 302 | /** 303 | * 索引字符并返回code 304 | * @param string $char 305 | * @return int 306 | */ 307 | private function index(string $char): int { 308 | if (isset($this->index[$char])) { 309 | return $this->index[$char]; 310 | } 311 | 312 | $this->index_count++; 313 | $this->index[$char] = $this->index_count; 314 | return $this->index_count; 315 | } 316 | 317 | /** 318 | * utf8拆字 319 | * 
@param string $str 320 | */ 321 | private function split(string $str) { 322 | $len = strlen($str); 323 | for ($i = 0; $i < $len; $i++) { 324 | $c = $str[$i]; 325 | $n = ord($c); 326 | if (($n >> 7) == 0) { 327 | //0xxx xxxx, asci, single 328 | yield $c; 329 | } elseif (($n >> 4) == 15) { //1111 xxxx, first in four char 330 | if ($i < $len - 3) { 331 | yield $c . $str[$i + 1] . $str[$i + 2] . $str[$i + 3]; 332 | $i += 3; 333 | } 334 | } elseif (($n >> 5) == 7) { 335 | //111x xxxx, first in three char 336 | if ($i < $len - 2) { 337 | yield $c . $str[$i + 1] . $str[$i + 2]; 338 | $i += 2; 339 | } 340 | } elseif (($n >> 6) == 3) { 341 | //11xx xxxx, first in two char 342 | if ($i < $len - 1) { 343 | yield $c . $str[$i + 1]; 344 | $i++; 345 | } 346 | } 347 | } 348 | } 349 | 350 | /** 351 | * 序列化 352 | * @return string 353 | */ 354 | public function serialize(): string { 355 | $data['check'] = $this->check; 356 | $data['base'] = $this->base; 357 | $data['index'] = $this->index; 358 | $data['fail_states'] = $this->fail_states; 359 | $data['index_count'] = $this->index_count; 360 | $data['index_code'] = $this->index_code; 361 | $data['tmp_tree'] = $this->tmp_tree; 362 | $data['word_states'] = $this->word_states; 363 | $data['begin_used'] = $this->begin_used; 364 | return msgpack_pack($data); 365 | } 366 | 367 | /** 368 | * 反序列化 369 | * @param string $serialized 370 | */ 371 | public function unserialize($serialized) { 372 | $data = msgpack_unpack($serialized); 373 | $this->check = $data['check']; 374 | $this->base = $data['base']; 375 | $this->index = $data['index']; 376 | $this->fail_states = $data['fail_states']; 377 | $this->index_count = $data['index_count']; 378 | $this->index_code = $data['index_code']; 379 | $this->tmp_tree = $data['tmp_tree']; 380 | $this->word_states = $data['word_states'] ?? 
$data['words_states']; 381 | $this->begin_used = $data['begin_used']; 382 | } 383 | 384 | } 385 | -------------------------------------------------------------------------------- /src/Seeker.php: -------------------------------------------------------------------------------- 1 | check = $check; 19 | $this->base = $base; 20 | $this->fail_states = $fail_states; 21 | } 22 | 23 | public function forFail(array $haystack): int { 24 | // 当前base 25 | $base = $this->base[0]; 26 | // 预计算实际匹配指针 27 | $cursor = 0; 28 | 29 | while (isset($haystack[$cursor])) { 30 | $state = $this->getState($base, $haystack[$cursor]); 31 | 32 | // 根据state取出base,查找下一个state 33 | if (!isset($this->base[$state]) || $this->check[$state] != $base) { 34 | return 0; 35 | } 36 | $base = $this->base[$state]; 37 | $cursor++; 38 | } 39 | return $state; 40 | } 41 | 42 | public function __invoke(array $haystack, 43 | int $limit = 0, int $skip = 0): \Generator { 44 | 45 | $it = $this->process($haystack, $limit, $skip); 46 | return $it; 47 | } 48 | 49 | private function process(array $haystack, int $limit = 0, int $skip = 0) { 50 | // 当前base 51 | $base = $this->base[0]; 52 | // 开始位指针 53 | $start = $skip; 54 | // 检测位指针 55 | $verify = 0; 56 | // 预计算实际匹配指针 57 | $cursor = $start + $verify; 58 | // 初始置state为0 59 | $pre_state = 0; 60 | 61 | // 开始搜索 62 | $limit && $limit += $skip; 63 | while (isset($haystack[$cursor]) && (!$limit || $start < $limit)) { 64 | // 根据当前 base 与匹配指针位计算出 state 65 | // 未进入索引取不到 code 的 state = -1 66 | $state = isset($haystack[$cursor]) ? 
67 | $this->getState($base, $haystack[$cursor]) : -1; 68 | 69 | // 根据 state 查找是否有 base 70 | // 并且使用 check 位校验父节点 base 是否匹配 71 | if (isset($this->base[$state]) && $this->check[$state] == $base) { 72 | // 根据 base 位不为负检查是否为叶子节点 73 | if ($this->base[$state] > 0) { 74 | // 非叶子节点 base 置为下层节点 check 75 | $base = $this->base[$state]; 76 | // 检验位推进 77 | $verify++; 78 | // 设置 pre state 用于调用失败指针 79 | $pre_state = $state; 80 | 81 | } else { 82 | // 遇到叶子节点,匹配成功 83 | yield $state; 84 | 85 | // 重置搜索位 86 | $base = $this->base[0]; 87 | $start++; 88 | $verify = 0; 89 | $pre_state = 0; 90 | } 91 | } else { 92 | // state 检查失败 93 | if (isset($this->fail_states[$pre_state])) { 94 | // 如果有 fail 指针,重置 base 到失败指针,退一步继续匹配 95 | $base = $this->check[$this->fail_states[$pre_state]]; 96 | $start += $verify - 1; 97 | 98 | } else { 99 | // 无 fail 指针,重置 base 到 root 100 | $base = $this->base[0]; 101 | $verify ? ($start += $verify) : $start++; 102 | } 103 | // 重置检测位 pre state 104 | $verify = 0; 105 | $pre_state = 0; 106 | } 107 | 108 | // 计算出新的(或相同的)检测指针 109 | $cursor = $start + $verify; 110 | // du("$cursor = $start + $verify", '$cursor = $start + $verify'); 111 | } 112 | 113 | // 搜索结束 114 | return 0; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /tests/benchmark/build_dict.php: -------------------------------------------------------------------------------- 1 | add($word); 21 | } 22 | fclose($handle); 23 | 24 | $dict->confirm(); 25 | 26 | echo "dict build timecost: ".round((microtime(true) - $t) * 1000)."ms\n"; 27 | 28 | // 不使用 simplitfy() 方法可获得反向翻译功能,但字典容量会大大增加。 29 | $packed = serialize($dict->simplify()); 30 | file_put_contents(__DIR__.'/packed.bin', $packed); 31 | 32 | echo "done.\n"; 33 | -------------------------------------------------------------------------------- /tests/benchmark/dict.txt: -------------------------------------------------------------------------------- 1 | 考前答案 2 | 万科 3 | 家宝 4 | 
-------------------------------------------------------------------------------- /tests/benchmark/packed.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andares/adarts/44e5faff7a78270f128189f7a50a32a38087c047/tests/benchmark/packed.bin -------------------------------------------------------------------------------- /tests/benchmark/seek.php: -------------------------------------------------------------------------------- 1 | seek('上上下下左左右右A家宝BBA')->current(), "\n"; -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /tests/src/DictionaryTest.php: -------------------------------------------------------------------------------- 1 | object = new Dictionary; 21 | } 22 | 23 | /** 24 | * Tears down the fixture, for example, closes a network connection. 25 | * This method is called after a test is executed. 
26 | */ 27 | protected function tearDown() { 28 | 29 | } 30 | 31 | public function testForBug() { 32 | $dictionary = new \Adarts\Dictionary(); 33 | $dictionary->add('钱'); 34 | $dictionary->add('测试'); 35 | $dictionary->confirm(); 36 | $keywords = []; 37 | foreach ($dictionary->seek('钱很多测试') as $result) { 38 | $keywords[] = $dictionary->getWordByState($result); 39 | } 40 | 41 | 42 | $dictionary->add('走起'); 43 | // 当上面confirm后,再添加再confirm会出现 44 | // PHP Notice: Undefined offset: 7 45 | $dictionary->confirm(); 46 | $keywords = []; 47 | foreach ($dictionary->seek('钱很多测试走起拉') as $result) { 48 | $keywords[] = $dictionary->getWordByState($result); 49 | } 50 | /* 51 | $this->object 52 | // ->add("妓女") 53 | // ->add("妓") 54 | ->add("妓飞飞") 55 | ->add("飞机") 56 | // ->add("走开") 57 | ->confirm(); 58 | du($this->object); 59 | $result = $this->object->seek('天下大妓同')->current(); 60 | du($result); 61 | du($this->object->getWordByState($result)); 62 | du($this->object); 63 | */ 64 | } 65 | 66 | /** 67 | * @covers Adarts\Dictionary::add 68 | * @todo Implement testAdd(). 
69 | */ 70 | public function testAdd() { 71 | // die(); 72 | $this->object 73 | ->add("毛 abcfd") 74 | ->add("bcev") 75 | ->add("毛 主 席") 76 | ->add("主 导") 77 | ->add("习boss威武") // 这是无效词条 78 | ->add("习boss") 79 | ->confirm(); 80 | du($this->object); 81 | 82 | $packed = serialize($this->object); 83 | // var_dump($packed); 84 | // du(strlen($packed)); 85 | $this->object = unserialize($packed); 86 | 87 | // 测试找批量 88 | $result_list = [ 89 | '毛 abcfd' => 1, 90 | '主 导' => 1, 91 | '习boss' => 1, 92 | ]; 93 | foreach ($this->object->seek('abd毛 主毛 abcfd 毛 主 导习bossk') as $result) { 94 | $word = $this->object->getWordByState($result); 95 | $this->assertTrue(isset($result_list[$word])); 96 | unset($result_list[$word]); 97 | } 98 | $this->assertEquals(0, count($result_list)); 99 | 100 | // 测试深回归 101 | $result = $this->object->seek('123毛 abcfwr')->current(); 102 | $this->assertNull($result); 103 | $this->assertEquals('', $this->object->getWordByState($result)); 104 | 105 | // 测试失败指针 106 | $result = $this->object->seek('abd毛 主d 毛 主 导k')->current(); 107 | $this->assertEquals(26, $result); 108 | $this->assertEquals('主 导', $this->object->getWordByState($result)); 109 | 110 | // 简化 111 | $packed = serialize($this->object->simplify()); 112 | // var_dump($packed); 113 | // du(strlen($packed)); 114 | $this->object = unserialize($packed); 115 | 116 | // 测试未找到 117 | $result = $this->object->seek('abd毛习')->current(); 118 | $this->assertNull($result); 119 | $this->assertEquals('', $this->object->getWordByState($result)); 120 | 121 | // 测试找到 122 | $result = $this->object->seek('abd习bosseee')->current(); 123 | $this->assertEquals(33, $result); 124 | 125 | // 测试限制 126 | foreach ($this->object->seek('主 导 习boss bcev', 3) as $result) {} 127 | $this->assertEquals(26, $result); 128 | 129 | // $result归位 130 | $result = 0; 131 | foreach ($this->object->seek('主 导 习boss bcev', 2, 3) as $result) {} 132 | $this->assertEquals(33, $result); 133 | 134 | } 135 | 136 | /** 137 | * @covers 
Adarts\Dictionary::prepare 138 | * @todo Implement testPrepare(). 139 | */ 140 | public function testPrepare() { 141 | // Remove the following lines when you implement this test. 142 | $this->markTestIncomplete( 143 | 'This test has not been implemented yet.' 144 | ); 145 | } 146 | 147 | /** 148 | * @covers Adarts\Dictionary::compress 149 | * @todo Implement testCompress(). 150 | */ 151 | public function testCompress() { 152 | // Remove the following lines when you implement this test. 153 | $this->markTestIncomplete( 154 | 'This test has not been implemented yet.' 155 | ); 156 | } 157 | 158 | } 159 | -------------------------------------------------------------------------------- /utmake.yaml: -------------------------------------------------------------------------------- 1 | class_prefix: Adarts 2 | class_dir: src/classes 3 | tests_dir: tests/src 4 | bootstrap: tests/bootstrap.php 5 | configuration: tests/configuration.xml 6 | --------------------------------------------------------------------------------