├── .editorconfig
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── composer.json
├── phpunit.php
├── phpunit.xml
├── src
├── Analysis
│ ├── Addition.php
│ ├── Analysis.php
│ ├── AnalysisInterface.php
│ ├── ChineseAnalysis.php
│ ├── ChineseAnalysisInterface.php
│ ├── Config.php
│ ├── Loader.php
│ ├── StringTool.php
│ └── dict
│ │ ├── base_dic_full.dic
│ │ ├── not-build
│ │ └── base_dic_full.txt
│ │ └── words_addons.dic
├── Dict
│ └── DictGenerator.php
└── Split
│ ├── Config.php
│ ├── Split.php
│ └── SplitInterface.php
├── test.php
└── tests
├── Analysis
└── LoaderTest.php
├── analysisTest.php
└── initTest.php
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | # Change these settings to your own preference
7 | indent_style = space
8 | indent_size = 4
9 |
10 | # We recommend you to keep these unchanged
11 | end_of_line = lf
12 | charset = utf-8
13 | trim_trailing_whitespace = true
14 | insert_final_newline = true
15 |
16 | [*.md]
17 | trim_trailing_whitespace = false
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # gitignore for most project
2 | #
3 | # Created: 2015-7
4 | # Updated:
5 | # Maintainer: call@woola.net
6 |
7 | # -----------------------------
8 |
9 | # Numerous always-ignore extensions
10 |
11 | *.diff
12 | *.err
13 | *.orig
14 | *.log
15 | *.rej
16 | *.swo
17 | *.swp
18 | *.vi
19 |
20 | ### SVN ###
21 | .svn/
22 |
23 |
24 | ### Mercurial ###
25 | /.hg/*
26 | */.hg/*
27 | .hgignore
28 |
29 |
30 | ### CVS ###
31 | /CVS/*
32 | */CVS/*
33 | .cvsignore
34 | */.cvsignore
35 |
36 | ### Archives ###
37 | # It's better to unpack these files and commit the raw source because
38 | # git has its own built in compression methods.
39 | *.7z
40 | *.jar
41 | *.rar
42 | *.zip
43 | *.gz
44 | *.bzip
45 | *.bz2
46 | *.xz
47 | *.lzma
48 | *.cab
49 |
50 | #packing-only formats
51 | *.iso
52 | *.tar
53 |
54 | #package management formats
55 | *.dmg
56 | *.xpi
57 | *.gem
58 | *.egg
59 | *.deb
60 | *.rpm
61 | *.msi
62 | *.msm
63 | *.msp
64 |
65 | # -----------------------------
66 |
67 | # OS or Editor
68 |
69 | ### OSX ###
70 | .DS_Store
71 | .AppleDouble
72 | .LSOverride
73 |
74 | # Icon must end with two \r
75 | Icon
76 |
77 |
78 | # Thumbnails
79 | ._*
80 |
81 | # Files that might appear on external disk
82 | .Spotlight-V100
83 | .Trashes
84 |
85 | # Directories potentially created on remote AFP share
86 | .AppleDB
87 | .AppleDesktop
88 | Network Trash Folder
89 | Temporary Items
90 | .apdisk
91 |
92 |
93 | ### Linux ###
94 | *~
95 |
96 | # KDE directory preferences
97 | .directory
98 |
99 |
100 | ### Windows ###
101 | # Windows image file caches
102 | Thumbs.db
103 | ehthumbs.db
104 |
105 | # Folder config file
106 | Desktop.ini
107 |
108 | # Recycle Bin used on file shares
109 | $RECYCLE.BIN/
110 |
111 | # Windows shortcuts
112 | *.lnk
113 |
114 | # -----------------------------
115 |
116 | ### JetBrains ###
117 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
118 |
119 | ## Directory-based project format:
120 | .idea/
121 | # if you remove the above rule, at least ignore the following:
122 |
123 | # User-specific stuff:
124 | # .idea/workspace.xml
125 | # .idea/tasks.xml
126 | # .idea/dictionaries
127 |
128 | # Sensitive or high-churn files:
129 | # .idea/dataSources.ids
130 | # .idea/dataSources.xml
131 | # .idea/sqlDataSources.xml
132 | # .idea/dynamic.xml
133 | # .idea/uiDesigner.xml
134 |
135 | # Gradle:
136 | # .idea/gradle.xml
137 | # .idea/libraries
138 |
139 | # Mongo Explorer plugin:
140 | # .idea/mongoSettings.xml
141 |
142 | ## File-based project format:
143 | *.ipr
144 | *.iws
145 |
146 | ## Plugin-specific files:
147 |
148 | # IntelliJ
149 | out/
150 |
151 | # mpeltonen/sbt-idea plugin
152 | .idea_modules/
153 |
154 | # JIRA plugin
155 | atlassian-ide-plugin.xml
156 |
157 | # Crashlytics plugin (for Android Studio and IntelliJ)
158 | com_crashlytics_export_strings.xml
159 | crashlytics.properties
160 | crashlytics-build.properties
161 |
162 |
163 | ### Eclipse ###
164 | *.pydevproject
165 | .metadata
166 | .gradle
167 | tmp/
168 | *.tmp
169 | *.bak
170 | *~.nib
171 | local.properties
172 | .settings/
173 | .loadpath
174 |
175 | # External tool builders
176 | .externalToolBuilders/
177 |
178 | # Locally stored "Eclipse launch configurations"
179 | *.launch
180 |
181 | # CDT-specific
182 | .cproject
183 |
184 | # PDT-specific
185 | .buildpath
186 |
187 | # sbteclipse plugin
188 | .target
189 |
190 | # TeXlipse plugin
191 | .texlipse
192 |
193 |
194 | ### SublimeText ###
195 | # cache files for sublime text
196 | *.tmlanguage.cache
197 | *.tmPreferences.cache
198 | *.stTheme.cache
199 |
200 | # workspace files are user-specific
201 | *.sublime-workspace
202 |
203 | # project files should be checked into the repository, unless a significant
204 | # proportion of contributors will probably not be using SublimeText
205 | *.sublime-project
206 |
207 | # sftp configuration file
208 | sftp-config.json
209 |
210 |
211 | ### Dreamweaver ###
212 | # DW Dreamweaver added files
213 | _notes
214 | dwsync.xml
215 |
216 |
217 | ### vim ###
218 | [._]*.s[a-w][a-z]
219 | [._]s[a-w][a-z]
220 | *.un~
221 | Session.vim
222 | .netrwhist
223 |
224 | # -----------------------------
225 |
226 | # PHP
227 |
228 | ### Composer ###
229 | composer.phar
230 | vendor/
231 |
232 | # Commit your application's lock file http://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file
233 | # You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file
234 | # composer.lock
235 |
236 |
237 | ### Symfony ###
238 | # Cache and logs (Symfony2)
239 | /app/cache/*
240 | /app/logs/*
241 | !app/cache/.gitkeep
242 | !app/logs/.gitkeep
243 |
244 | # Cache and logs (Symfony3)
245 | /var/cache/*
246 | /var/logs/*
247 | !var/cache/.gitkeep
248 | !var/logs/.gitkeep
249 |
250 | # Parameters
251 | /app/config/parameters.yml
252 | /app/config/parameters.ini
253 |
254 | # Managed by Composer
255 | /app/bootstrap.php.cache
256 | /var/bootstrap.php.cache
257 | /bin/*
258 | !bin/console
259 | !bin/symfony_requirements
260 |
261 | # Assets and user uploads
262 | /web/bundles/
263 | /web/uploads/
264 |
265 | # Build data
266 | /build/
267 |
268 | # Composer PHAR
269 | /composer.phar
270 |
271 |
272 | ### Laravel ###
273 | /bootstrap/compiled.php
274 | .env.*.php
275 | .env.php
276 |
277 |
278 | ### Sass ###
279 | .sass-cache
280 | *.css.map
281 |
282 | # -----------------------------
283 |
284 | ### python ###
285 | *.pyc
286 |
287 | ### docs ###
288 | /docs/build/*
289 |
290 | /composer.lock
291 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: php
2 |
3 | php:
4 | - 5.4
5 | - 5.5
6 | - 5.6
7 | - 7.0
8 | - hhvm
9 |
10 | before_install:
11 | - travis_retry composer self-update
12 | install:
13 | - travis_retry composer install --no-interaction --prefer-source
14 |
15 | script:
16 | - vendor/bin/phpunit --verbose --coverage-text
17 |
18 | matrix:
19 | allow_failures:
20 | - php: hhvm
21 | - php: 7.0
22 | fast_finish: true
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | phpSplit php中文分词库
2 | ============================
3 | [![Build Status](https://travis-ci.org/Callwoola/php-split.svg?branch=master)](https://travis-ci.org/Callwoola/php-split)
4 |
5 |
6 | ### phpSplit 是一个基于php开发的中文分词库
7 |
8 | 基于Unicode编码词典的php分词器
9 | * 只适用于php5,必要函数 iconv
10 | * 本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法
11 | * 简单操作流程: SetSource -> StartAnalysis -> GetResult
12 | * 对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
13 |
14 |
15 | ### 使用
16 |
17 | * 首先 确保使用php为5.4+
18 | * 安装composer
19 |
20 | ```
21 | composer install
22 | ```
23 |
24 | ```php
25 | require __DIR__ .'/vendor/autoload.php';
26 | $split = new \phpSplit\Split\Split();
27 | var_dump($split->simple("您好 phpSplit"));
28 | ```
29 |
30 |
31 | ```php
32 | array(3) {
33 | [0] =>
34 | string(0) ""
35 | [1] =>
36 | string(6) "您好"
37 | [2] =>
38 | string(8) "phpSplit"
39 | }
40 |
41 | ```
42 |
43 |
44 | ### 分词结果后缀说明
45 | ```php
46 | 名词n、
47 | 时间词t、
48 | 处所词s、
49 | 方位词f、
50 | 数词m、
51 | 量词q、
52 | 区别词b、
53 | 代词r、
54 | 动词v、
55 | 形容词a、
56 | 状态词z、
57 | 副词d、
58 | 介词p、
59 | 连词c、
60 | 助词u、
61 | 语气词y、
62 | 叹词e、
63 | 拟声词o、
64 | 成语i、
65 | 习用语l、
66 | 简称j、
67 | 前接成分h、
68 | 后接成分k、
69 | 语素g、
70 | 非语素字x、
71 | 标点符号w
72 | ```
73 |
74 | 同时增加了以下3类标记
75 | * 专有名词的分类标记,即人名nr,地名ns,团体机关单位名称nt,其他专有名词nz;
76 | * 语素的子类标记,即名语素Ng,动语素Vg,形容语素Ag,时语素Tg,副语素Dg等;
77 | * 动词和形容词的子类标记,即名动词vn(具有名词特性的动词),名形词an(具有名词特性的形容词),副动词vd(具有副词特性的动词),副形词ad(具有副词特性的形容词)
78 |
79 | 合计约40个左右。
80 |
81 | 欢迎大家完善
82 |
83 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "callwoola/php-split",
3 | "description": "php ZH",
4 | "authors": [
5 | {
6 | "name": "Neo",
7 | "email": "call@woola.net"
8 | }
9 | ],
10 | "require": {
11 | "php": ">=5.3.0",
12 | "illuminate/support": "~4.0"
13 | },
14 | "require-dev": {
15 | "phpunit/phpunit": "3.7.*"
16 | },
17 | "autoload": {
18 | "psr-4": {
19 | "phpSplit\\": "src/"
20 | }
21 | },
22 | "extra": {
23 | "branch-alias": {
24 | "dev-master": "0.1.x-dev"
25 | }
26 | },
27 | "minimum-stability": "dev",
28 | "repositories": {
29 | "packagist": {
30 | "type": "composer",
31 | "url": "https://packagist.laravel-china.org"
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/phpunit.php:
--------------------------------------------------------------------------------
1 |
2 |
15 |
16 |
17 | tests
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/src/Analysis/Addition.php:
--------------------------------------------------------------------------------
1 | '专用机,6,n...'
11 | */
/**
 * Build the textual form of the configured extra words.
 *
 * Every entry of $this->additionDict is emitted as "\n<word>,100,nx";
 * the pieces are concatenated in iteration order.
 *
 * @return string Concatenated entries, or an empty string when there are none.
 */
public function getAdditionDict()
{
    $lines = [];
    foreach ($this->additionDict as $word) {
        $lines[] = "\n" . $word . ',100,nx';
    }

    return implode('', $lines);
}
21 |
/**
 * Replace the list of extra words held by this object.
 *
 * @param array $arr Words to store; defaults to an empty list.
 *
 * @return void
 */
public function setAdditionDict($arr = [])
{
    $this->additionDict = $arr;
}
29 | }
30 |
--------------------------------------------------------------------------------
/src/Analysis/Analysis.php:
--------------------------------------------------------------------------------
1 | differMax = false;
23 | $this->unitWord = false;
24 |
25 | $this->setSource($source);
26 |
27 | $load_all = true;
28 | $this->isLoadAll = $load_all;
29 |
30 | list($mainDicHand, $mainDic, $additionDict, $loadTime) = $this->getLoadDict();
31 |
32 | $this->mainDicHand = $mainDicHand;
33 | $this->mainDic = $mainDic;
34 | $this->addonDic = $additionDict;
35 | $this->loadTime = $loadTime;
36 |
37 | $this->analysis = new Analysis();
38 | }
39 |
/**
 * Destructor: closes the main dictionary file handle if one is still open.
 */
public function __destruct()
{
    // is_resource() is false for null, false and already-closed handles, so
    // fclose() is only ever called on a live stream and no error suppression
    // is needed (on PHP 8 a non-resource argument would throw a TypeError
    // that "@" cannot silence).
    if (is_resource($this->mainDicHand)) {
        fclose($this->mainDicHand);
    }
}
49 |
/**
 * Look up a word in the main dictionary file.
 *
 * The key is hashed to a bucket number. The bucket's 8-byte index record
 * (offset, length, count) is read from position bucket * 8 of the file, and
 * the serialized bucket it points to is loaded and cached in
 * $this->mainDicInfos so each bucket is read from disk at most once.
 *
 * @param string $key  Word to look up, in the same byte encoding as the
 *                     dictionary keys.
 * @param string $type 'word' returns the entry stored for $key; any other
 *                     value (e.g. 'key_groups') returns the whole bucket.
 *
 * @return mixed|false The entry (or bucket), or false when the dictionary
 *                     cannot be read or the key is not present.
 */
public function getWordInfos($key, $type = 'word')
{
    // TODO simplify the algorithm
    if (!$this->mainDicHand) {
        $this->mainDicHand = fopen($this->mainDicFile, 'r');
        if ($this->mainDicHand === false) {
            // Dictionary file could not be opened: nothing can be looked up.
            return false;
        }
    }

    // Hash the key bytes (last byte first) into a 31-bit value.
    $l = strlen($key);
    $h = 0x238f13af;
    while ($l--) {
        $h += ($h << 5);
        $h ^= ord($key[$l]);
        $h &= 0x7fffffff;
    }

    $keynum = ($h % $this->mask_value);

    if (isset($this->mainDicInfos[$keynum])) {
        $data = $this->mainDicInfos[$keynum];
    } else {
        // Each bucket owns an 8-byte index record at offset bucket * 8.
        fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
        $dat = fread($this->mainDicHand, 8);
        if (!is_string($dat) || strlen($dat) < 8) {
            // Truncated or unreadable index: treat as "not found".
            return false;
        }

        $arr = unpack('I1s/n1l/n1c', $dat);
        if ($arr === false || $arr['l'] == 0) {
            return false;
        }

        fseek($this->mainDicHand, $arr['s'], SEEK_SET);
        $raw = fread($this->mainDicHand, $arr['l']);
        // A corrupt bucket is deliberately treated as empty rather than
        // surfaced as a notice; the is_array() check below rejects it.
        $data = is_string($raw) ? @unserialize($raw) : false;
        $this->mainDicInfos[$keynum] = $data;
    }

    if (!is_array($data) || !isset($data[$key])) {
        return false;
    }

    return ($type == 'word' ? $data[$key] : $data);
}
96 |
97 |
/**
 * Replace the add-on dictionary used by this object.
 *
 * @param array $addonDic New add-on dictionary; defaults to an empty array.
 *
 * @return void
 */
public function setAttach($addonDic = [])
{
    $this->addonDic = $addonDic;
}
105 |
/**
 * Set the text to be segmented and reset all previous results.
 *
 * Source and target charsets are fixed to 'utf-8' inside this method. The
 * text is converted to the UCS2 encoding and stored in $this->sourceString.
 *
 * @param string $source Text to analyse.
 *
 * @return bool True when the source was accepted, false when it is empty
 *              (or the source charset is not one of utf / gb / big5).
 */
public function setSource($source)
{
    $sourceEncoding = 'utf-8';
    $targetEncoding = 'utf-8';

    $this->sourceCharSet = strtolower($sourceEncoding);
    $this->targetCharSet = strtolower($targetEncoding);
    $this->simpleResult = [];
    $this->finallyResult = [];
    $this->finallyIndex = [];

    if ($source == '') {
        return false;
    }

    // Bring the input to UTF-8 first, then convert once to UCS2.
    if (preg_match("/^utf/", $sourceEncoding)) {
        $asUtf8 = $source;
    } elseif (preg_match("/^gb/", $sourceEncoding)) {
        $asUtf8 = iconv('gb18030', 'utf-8', $source);
    } elseif (preg_match("/^big/", $sourceEncoding)) {
        $asUtf8 = iconv('big5', 'utf-8', $source);
    } else {
        return false;
    }

    $this->sourceString = iconv('utf-8', UCS2, $asUtf8);

    return true;
}
142 |
/**
 * Set the result type (only relevant when reading finallyResult).
 *
 * @param int $rstype 1 = everything, 2 = special symbols removed.
 *
 * @return void
 */
public function setResultType($rstype)
{
    $this->resultType = $rstype;
}
153 |
154 |
/**
 * Check whether a word exists in the main dictionary.
 *
 * @param string $word Word to test, passed unchanged to getWordInfos().
 *
 * @return bool True when getWordInfos() returns anything other than false.
 */
public function isWord($word)
{
    return $this->getWordInfos($word) !== false;
}
163 |
/**
 * Get the part-of-speech / frequency suffix of a word.
 *
 * @param string $word Unicode-encoded word.
 *
 * @return string "/" followed by element 1 and element 0 of the dictionary
 *                entry, or "/s" when the word is shorter than 4 bytes or
 *                has no such entry.
 */
public function getWordProperty($word)
{
    $fallback = '/s';

    if (strlen($word) < 4) {
        return $fallback;
    }

    $entry = $this->getWordInfos($word);
    if (!isset($entry[1])) {
        return $fallback;
    }

    return '/' . $entry[1] . $entry[0];
}
177 |
/**
 * Register part-of-speech information for a word (usually a new word).
 *
 * Words shorter than 4 bytes are ignored. If the word is already known in
 * $this->mainDicInfos, its counters are incremented; otherwise it is
 * recorded with the given $infos and a count of 1 in $this->newWords.
 *
 * @param string $word  Unicode-encoded word.
 * @param array  $infos array('c' => frequency, 'm' => part of speech).
 *
 * @return void
 */
public function setWordInfos($word, $infos)
{
    if (strlen($word) < 4) {
        return;
    }

    if (!isset($this->mainDicInfos[$word])) {
        $this->newWords[$word] = 1;
        $this->mainDicInfos[$word] = $infos;

        return;
    }

    // Either counter may be missing for an already-known word; start it from
    // zero instead of incrementing an undefined index (which raises a warning).
    $this->newWords[$word] = isset($this->newWords[$word])
        ? $this->newWords[$word] + 1
        : 1;
    $this->mainDicInfos[$word]['c'] = isset($this->mainDicInfos[$word]['c'])
        ? $this->mainDicInfos[$word]['c'] + 1
        : 1;
}
197 |
/**
 * Run the segmentation over the string loaded with setSource().
 *
 * Pass 1 walks $this->sourceString two bytes at a time (each pair is read as
 * one big-endian 16-bit code unit), groups consecutive units of the same
 * class into coarse tokens in $this->simpleResult and hands them to
 * _deep_analysis(). Pass 2 merges simpleResult with the refinements that
 * _deep_analysis() stored in $this->finallyResult into one flat list, which
 * replaces $this->finallyResult.
 *
 * Token type codes ('t'):
 *   1  Chinese / Korean / Japanese text
 *   2  English / digits / symbols ('.', '@', '#', '+')
 *   3  ANSI symbol
 *   4  pure number
 *   5  non-ANSI symbol or unsupported character
 *   13 book title found between 《 and 》
 *   20 word produced by deep analysis
 *   21 copy of a book title queued for further splitting (dropped from the
 *      final list unless deep analysis produced words for it)
 *
 * @param bool $optimize Whether to optimise the result; forwarded to
 *                       _deep_analysis().
 *
 * @return void
 */
public function startAnalysis($optimize = true)
{
    $this->simpleResult = $this->finallyResult = [];
    // Append a trailing space unit so the loop flushes the last pending token.
    $this->sourceString .= chr(0) . chr(32);
    $slen = strlen($this->sourceString);

    // Full-width to half-width table: 0xFF00..0xFF5E map onto 0x20..0x7E.
    $sbcArr = [];
    $j = 0;
    for ($i = 0xFF00; $i < 0xFF5F; $i++) {
        $sbcArr[$i] = 0x20 + $j;
        $j++;
    }

    // Coarse split of the string.
    $onstr = '';   // token currently being accumulated
    $lastc = 1;    // type code of the accumulated token (see docblock)
    $s = 0;        // next free index in simpleResult
    $ansiWordPattern = '/[0-9a-z@#%\+\.-]/i';
    $notNumberPattern = '/[a-z@#%\+]/i';

    for ($i = 0; $i < $slen; $i++) {
        $c = $this->sourceString[$i] . $this->sourceString[++$i];
        $cn = hexdec(bin2hex($c));
        $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;

        // ANSI character
        if ($cn < 0x80) {
            if (preg_match($ansiWordPattern, chr($cn))) {
                if ($lastc != 2 && $onstr != '') {
                    $this->simpleResult[$s]['w'] = $onstr;
                    $this->simpleResult[$s]['t'] = $lastc;
                    $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                    $s++;
                    $onstr = '';
                }
                $lastc = 2;
                $onstr .= chr(0) . chr($cn);
            } else {
                if ($onstr != '') {
                    $this->simpleResult[$s]['w'] = $onstr;
                    // An "English" token without letters/symbols is a pure number.
                    if ($lastc == 2 && !preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                        $lastc = 4;
                    }
                    $this->simpleResult[$s]['t'] = $lastc;
                    if ($lastc != 4) {
                        $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                    }
                    $s++;
                }
                $onstr = '';
                $lastc = 3;
                if ($cn < 31) {
                    continue;
                }
                $this->simpleResult[$s]['w'] = chr(0) . chr($cn);
                $this->simpleResult[$s]['t'] = 3;
                $s++;
            }
            continue;
        }

        // Ordinary text (code-point ranges treated as type 1)
        if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D)
            || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)
        ) {
            if ($lastc != 1 && $onstr != '') {
                $this->simpleResult[$s]['w'] = $onstr;
                if ($lastc == 2 && !preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                    $lastc = 4;
                }
                $this->simpleResult[$s]['t'] = $lastc;
                if ($lastc != 4) {
                    $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                }
                $s++;
                $onstr = '';
            }
            $lastc = 1;
            $onstr .= $c;
            continue;
        }

        // Special (two-byte) symbol: flush whatever is pending first.
        if ($onstr != '') {
            $this->simpleResult[$s]['w'] = $onstr;
            if ($lastc == 2 && !preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                $lastc = 4;
            }
            $this->simpleResult[$s]['t'] = $lastc;
            if ($lastc != 4) {
                $this->_deep_analysis($onstr, $lastc, $s, $optimize);
            }
            $s++;
        }

        // Book-title detection: 《 (0x300A) ... 》 (0x300B)
        if ($cn == 0x300A) {
            $tmpw = '';
            $n = 1;
            $isok = false;
            $ew = chr(0x30) . chr(0x0B);
            while (true) {
                if (!isset($this->sourceString[$i + $n + 1])) {
                    break;
                }
                $w = $this->sourceString[$i + $n] . $this->sourceString[$i + $n + 1];
                if ($w == $ew) {
                    $this->simpleResult[$s]['w'] = $c;
                    $this->simpleResult[$s]['t'] = 5;
                    $s++;

                    $this->simpleResult[$s]['w'] = $tmpw;
                    // Test BEFORE marking the title as seen; marking it first
                    // made this branch unreachable, so titles were never
                    // reported in foundWordStr nor registered as new words.
                    if (!isset($this->newWords[$tmpw])) {
                        $this->foundWordStr .= StringTool::encoding($tmpw, $this->targetCharSet) . '/nb, ';
                        $this->setWordInfos($tmpw, ['c' => 1, 'm' => 'nb']);
                    }
                    $this->newWords[$tmpw] = 1;
                    $this->simpleResult[$s]['t'] = 13;

                    $s++;

                    // In max-split mode the title itself is segmented further.
                    if ($this->differMax) {
                        $this->simpleResult[$s]['w'] = $tmpw;
                        $this->simpleResult[$s]['t'] = 21;
                        $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                        $s++;
                    }

                    $this->simpleResult[$s]['w'] = $ew;
                    $this->simpleResult[$s]['t'] = 5;
                    $s++;

                    // Skip past the closing mark.
                    $i = $i + $n + 1;
                    $isok = true;
                    $onstr = '';
                    $lastc = 5;
                    break;
                }

                $n = $n + 2;
                $tmpw .= $w;
                // Give up once the candidate title exceeds 60 bytes.
                if (strlen($tmpw) > 60) {
                    break;
                }
            }
            if (!$isok) {
                // No closing mark found: emit the opening mark as a plain symbol.
                $this->simpleResult[$s]['w'] = $c;
                $this->simpleResult[$s]['t'] = 5;
                $s++;
                $onstr = '';
                $lastc = 5;
            }
            continue;
        }

        $onstr = '';
        $lastc = 5;
        // 0x3000 (ideographic space) is dropped; every other symbol is kept.
        if ($cn == 0x3000) {
            continue;
        }
        $this->simpleResult[$s]['w'] = $c;
        $this->simpleResult[$s]['t'] = 5;
        $s++;
    }

    // Flatten the coarse tokens and their refinements into the final result.
    $newarr = [];
    $i = 0;
    foreach ($this->simpleResult as $k => $v) {
        if (empty($v['w'])) {
            continue;
        }
        if (isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0) {
            foreach ($this->finallyResult[$k] as $w) {
                if (!empty($w)) {
                    $newarr[$i]['w'] = $w;
                    $newarr[$i]['t'] = 20;
                    $i++;
                }
            }
        } elseif ($v['t'] != 21) {
            $newarr[$i]['w'] = $v['w'];
            $newarr[$i]['t'] = $v['t'];
            $i++;
        }
    }
    $this->finallyResult = $newarr;
}
403 |
404 |     /**
405 |      * Deep segmentation of one coarse-split fragment.
406 |      *
407 |      * Works directly on $this->simpleResult / $this->finallyResult. The method used to restore a
408 |      * snapshot of simpleResult on exit, which undid the number + unit merge performed below.
409 |      *
410 |      * @param string $str      fragment to split (by reference; may be cut down to its first two bytes)
411 |      * @param int    $ctype    1 = Chinese sentence; any other value is stored as a single token
412 |      * @param int    $spos     cursor of this fragment in the coarse result
413 |      * @param bool   $optimize forwarded to _deep_analysis_cn()
414 |      * @return void
415 |      */
416 |     private function _deep_analysis(&$str, $ctype, $spos, $optimize = true)
417 |     {
418 |         // Non-Chinese fragment: keep it as one token, lower-cased when configured.
419 |         if ($ctype != 1) {
420 |             $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
421 |             return;
422 |         }
423 |
424 |         $slen = strlen($str);
425 |         // Only fragments shorter than both notSplitLen and 5 bytes get the special handling below.
426 |         if ($slen >= $this->notSplitLen || $slen >= 5) {
427 |             $this->_deep_analysis_cn($str, $ctype, $spos, $slen, $optimize);
428 |             return;
429 |         }
430 |
431 |         $lastType = $spos > 0 ? $this->simpleResult[$spos - 1]['t'] : 0;
432 |         $isUnit = isset($this->addonDic['u'][$str]);
433 |         if ($lastType != 4 || (!$isUnit && !isset($this->addonDic['u'][substr($str, 0, 2)]))) {
434 |             $this->finallyResult[$spos][] = $str;
435 |             return;
436 |         }
437 |
438 |         // Previous token has type 4 and this one starts with an addonDic['u'] entry: merge them.
439 |         $str2 = '';
440 |         if (!$isUnit && isset($this->addonDic['s'][substr($str, 2, 2)])) {
441 |             $str2 = substr($str, 2, 2);
442 |             $str = substr($str, 0, 2);
443 |         }
444 |         $ww = $this->simpleResult[$spos - 1]['w'] . $str;
445 |         $this->simpleResult[$spos - 1]['w'] = $ww;
446 |         $this->simpleResult[$spos - 1]['t'] = 4;
447 |         if (!isset($this->newWords[$ww])) {
448 |             $this->foundWordStr .= StringTool::encoding($ww, $this->targetCharSet) . '/mu, ';
449 |             $this->setWordInfos($ww, ['c' => 1, 'm' => 'mu']);
450 |         }
451 |         // Blank this slot: its text now lives in the previous slot.
452 |         $this->simpleResult[$spos]['w'] = '';
453 |         if ($str2 != '') {
454 |             $this->finallyResult[$spos - 1][] = $ww;
455 |             $this->finallyResult[$spos - 1][] = $str2;
456 |         }
457 |     }
469 |
470 |     /**
471 |      * Dictionary-based deep segmentation of a Chinese fragment, scanned from its end backwards.
472 |      * @param $str      fragment to split; read two bytes at a time
473 |      * @param $lastec   not used inside this method
474 |      * @param $spos     index of the fragment in simpleResult / finallyResult
475 |      * @param $slen     byte length of $str
476 |      * @param $optimize when true the word list is passed through _optimize_result()
477 |      *
478 |      * @return void
479 |      */
480 |     private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = true)
481 |     {
482 |         $quote1 = chr(0x20) . chr(0x1C); // bytes 0x20 0x1C: the opening quote “ named in the note below
483 |         $tmparr = []; // words found so far, in back-to-front order
484 |         $hasw = 0; // never read in this method
485 |         // Previous token is “ and the fragment is under 11 bytes: take it as one word (the original note said "fewer than 3 characters").
486 |         if ($spos > 0 && $slen < 11 && $this->simpleResult[$spos - 1]['w'] == $quote1) {
487 |             $tmparr[] = $str;
488 |             if (!isset($this->newWords[$str])) {
489 |                 $this->foundWordStr .= StringTool::encoding($str, $this->targetCharSet) . '/nq, ';
490 |                 $this->setWordInfos($str, ['c' => 1, 'm' => 'nq']);
491 |             }
492 |             if (!$this->differMax) {
493 |                 $this->finallyResult[$spos][] = $str;
494 |                 return;
495 |             }
496 |         }
497 |         // Split: $i walks from the last byte towards the front
498 |         for ($i = $slen - 1; $i > 0; $i -= 2) {
499 |             // The two bytes ending at $i, used when no dictionary word matches
500 |             $nc = $str[$i - 1] . $str[$i];
501 |             // Only the final pair is left: store it and stop
502 |             if ($i <= 2) {
503 |                 $tmparr[] = $nc;
504 |                 $i = 0;
505 |                 break;
506 |             }
507 |             $isok = false;
508 |             $i = $i + 1;
509 |             for ($k = $this->dicWordMax; $k > 1; $k = $k - 2) {
510 |                 if ($i < $k) continue;
511 |                 $w = substr($str, $i - $k, $k);
512 |                 if (strlen($w) <= 2) {
513 |                     $i = $i - 1;
514 |                     break;
515 |                 }
516 |                 if ($this->isWord($w)) {
517 |                     $tmparr[] = $w;
518 |                     $i = $i - $k + 1;
519 |                     $isok = true;
520 |                     break;
521 |                 }
522 |             }
523 |             //echo '
';
524 |             // No dictionary word matched
525 |             if (!$isok) $tmparr[] = $nc;
526 |         }
527 |         $wcount = count($tmparr);
528 |         if ($wcount == 0) return;
529 |         $this->finallyResult[$spos] = array_reverse($tmparr);
530 |         // Post-process the result (disambiguation, new words, numerals, person names, ...)
531 |         if ($optimize) {
532 |             $this->_optimize_result($this->finallyResult[$spos], $spos);
533 |         }
534 |     }
535 |
536 |     /**
537 |      * Optimise a fragment's word list (merge with the preceding simpleResult entry, try new-word detection, numeral merging, ...)
538 |      * t = 1 Chinese/Korean/Japanese, 2 English/digits/symbols ('.', '@', '#', '+'), 3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or unsupported character
539 |      *
540 |      * @param $smarr word list of the fragment; replaced in place by the optimised list
541 |      * @param $spos index of the fragment in simpleResult / finallyResult
542 |      * @return void nothing is returned; the result is written back through $smarr
543 |      */
544 |     private function _optimize_result(&$smarr, $spos)
545 |     {
546 |         $newarr = [];
547 |         $prePos = $spos - 1;
548 |         $arlen = count($smarr);
549 |         $i = $j = 0;
550 |         // Numeral + measure word across the fragment boundary: previous coarse token followed by our first word
551 |         if ($prePos > -1 && !isset($this->finallyResult[$prePos])) {
552 |             $lastw = $this->simpleResult[$prePos]['w'];
553 |             $lastt = $this->simpleResult[$prePos]['t'];
554 |             if (($lastt == 4 || isset($this->addonDic['c'][$lastw])) && isset($this->addonDic['u'][$smarr[0]])) {
555 |                 $this->simpleResult[$prePos]['w'] = $lastw . $smarr[0];
556 |                 $this->simpleResult[$prePos]['t'] = 4;
557 |                 if (!isset($this->newWords[$this->simpleResult[$prePos]['w']])) {
558 |                     $this->foundWordStr .= StringTool::encoding($this->simpleResult[$prePos]['w'], $this->targetCharSet) . '/mu, ';
559 |                     $this->setWordInfos($this->simpleResult[$prePos]['w'], ['c' => 1, 'm' => 'mu']);
560 |                 }
561 |                 $smarr[0] = '';
562 |                 $i++;
563 |             }
564 |         }
565 |         for (; $i < $arlen; $i++) {
566 |
567 |             if (!isset($smarr[$i + 1])) {
568 |                 $newarr[$j] = $smarr[$i];
569 |                 break;
570 |             }
571 |             $cw = $smarr[$i];
572 |             $nw = $smarr[$i + 1];
573 |             $ischeck = false;
574 |             // Numeral + measure word
575 |             if (isset($this->addonDic['c'][$cw]) && isset($this->addonDic['u'][$nw])) {
576 |                 // In max-split mode also keep the words as they were before merging
577 |                 if ($this->differMax) {
578 |                     $newarr[$j] = chr(0) . chr(0x28);
579 |                     $j++;
580 |                     $newarr[$j] = $cw;
581 |                     $j++;
582 |                     $newarr[$j] = $nw;
583 |                     $j++;
584 |                     $newarr[$j] = chr(0) . chr(0x29);
585 |                     $j++;
586 |                 }
587 |                 $newarr[$j] = $cw . $nw;
588 |                 if (!isset($this->newWords[$newarr[$j]])) {
589 |                     $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/mu, ';
590 |                     $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'mu']);
591 |                 }
592 |                 $j++;
593 |                 $i++;
594 |                 $ischeck = true;
595 |             } // Leading word (usually a surname)
596 |             else if (isset($this->addonDic['n'][$smarr[$i]])) {
597 |                 $is_rs = false;
598 |                 // Adverbs, prepositions and very frequent words are not taken as part of a name
599 |                 if (strlen($nw) == 4) {
600 |                     $winfos = $this->getWordInfos($nw);
601 |                     if (isset($winfos['m']) && ($winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) {
602 |                         $is_rs = true;
603 |                     }
604 |                 }
605 |                 if (!isset($this->addonDic['s'][$nw]) && strlen($nw) < 5 && !$is_rs) {
606 |                     $newarr[$j] = $cw . $nw;
607 |                     //echo iconv(UCS2, 'utf-8', $newarr[$j])."
";
608 |                     // Try to take a third word as well
609 |                     if (strlen($nw) == 2 && isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && !isset($this->addonDic['s'][$smarr[$i + 2]])) {
610 |                         $newarr[$j] .= $smarr[$i + 2];
611 |                         $i++;
612 |                     }
613 |                     if (!isset($this->newWords[$newarr[$j]])) {
614 |                         $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'nr']);
615 |                         $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/nr, ';
616 |                     }
617 |                     // To guard against mistakes, keep the unmerged name parts too
618 |                     if (strlen($nw) == 4) {
619 |                         $j++;
620 |                         $newarr[$j] = chr(0) . chr(0x28);
621 |                         $j++;
622 |                         $newarr[$j] = $cw;
623 |                         $j++;
624 |                         $newarr[$j] = $nw;
625 |                         $j++;
626 |                         $newarr[$j] = chr(0) . chr(0x29);
627 |                     }
628 |
629 |                     $j++;
630 |                     $i++;
631 |                     $ischeck = true;
632 |                 }
633 |             } // Suffix word (place names etc.)
634 |             else if (isset($this->addonDic['a'][$nw])) {
635 |                 $is_rs = false;
636 |                 // Adverbs and prepositions are not used as a prefix
637 |                 if (strlen($cw) > 2) {
638 |                     $winfos = $this->getWordInfos($cw);
639 |                     if (isset($winfos['m']) && ($winfos['m'] == 'a' || $winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) {
640 |                         $is_rs = true;
641 |                     }
642 |                 }
643 |                 if (!isset($this->addonDic['s'][$cw]) && !$is_rs) {
644 |                     $newarr[$j] = $cw . $nw;
645 |                     if (!isset($this->newWords[$newarr[$j]])) {
646 |                         $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/na, ';
647 |                         $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'na']);
648 |                     }
649 |                     $i++;
650 |                     $j++;
651 |                     $ischeck = true;
652 |                 }
653 |             } // New-word detection (no real rules yet)
654 |             else if ($this->unitWord) {
655 |                 if (strlen($cw) == 2 && strlen($nw) == 2
656 |                     && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
657 |                     && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])
658 |                 ) {
659 |                     $newarr[$j] = $cw . $nw;
660 |                     // Try to take a third word as well
661 |                     if (isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && (isset($this->addonDic['a'][$smarr[$i + 2]]) || isset($this->addonDic['u'][$smarr[$i + 2]]))) {
662 |                         $newarr[$j] .= $smarr[$i + 2];
663 |                         $i++;
664 |                     }
665 |                     if (!isset($this->newWords[$newarr[$j]])) {
666 |                         $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/ms, ';
667 |                         $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'ms']);
668 |                     }
669 |                     $i++;
670 |                     $j++;
671 |                     $ischeck = true;
672 |                 }
673 |             }
674 |
675 |             // No rule matched: keep the current word
676 |             if (!$ischeck) {
677 |                 $newarr[$j] = $cw;
678 |                 // Bigram disambiguation, max-split mode only
679 |                 if ($this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) {
680 |                     $slen = strlen($nw);
681 |                     $hasDiff = false;
682 |                     for ($y = 2; $y <= $slen - 2; $y = $y + 2) {
683 |                         $nhead = substr($nw, $y - 2, 2);
684 |                         $nfont = $cw . substr($nw, 0, $y - 2);
685 |                         if ($this->isWord($nfont . $nhead)) {
686 |                             if (strlen($cw) > 2) $j++;
687 |                             $hasDiff = true;
688 |                             $newarr[$j] = $nfont . $nhead;
689 |                         }
690 |                     }
691 |                 }
692 |                 $j++;
693 |             }
694 |
695 |         }//end for
696 |         $smarr = $newarr;
697 |     }
698 |
699 |
700 |     /**
701 |      * Join the final tokens into one string, each token preceded by the separator.
702 |      *
703 |      * @param string $spword        separator written in front of every token (the result starts with it)
704 |      * @param bool   $word_meanings when true, the getWordProperty() text follows each token
705 |      * @return string
706 |      */
707 |     public function getFinallyResult($spword = ' ', $word_meanings = false)
708 |     {
709 |         $pieces = [];
710 |         foreach ($this->finallyResult as $item) {
711 |             // Result type 2 leaves out tokens typed 3 or 5.
712 |             if ($this->resultType == 2 && in_array($item['t'], [3, 5])) {
713 |                 continue;
714 |             }
715 |             // The property is looked up before encoding, also for tokens skipped just below.
716 |             $suffix = $word_meanings ? $this->getWordProperty($item['w']) : '';
717 |             $encoded = StringTool::encoding($item['w'], $this->targetCharSet);
718 |             if ($encoded == ' ') {
719 |                 continue;
720 |             }
721 |             $pieces[] = $spword . $encoded . $suffix;
722 |         }
723 |
724 |         return implode('', $pieces);
725 |     }
727 |
728 |     /**
729 |      * Coarse-split words only, without their type codes.
730 |      *
731 |      * @return array encoded words; empty entries and single-space words are left out
732 |      */
733 |     public function getSimpleResult()
734 |     {
735 |         $filled = array_filter($this->simpleResult, function ($entry) {
736 |             return !empty($entry['w']);
737 |         });
738 |         $encoded = array_map(function ($entry) {
739 |             return StringTool::encoding($entry['w'], $this->targetCharSet);
740 |         }, $filled);
741 |
742 |         return array_values(array_filter($encoded, function ($w) { return $w != ' '; }));
743 |     }
744 |
745 |     /**
746 |      * Coarse-split result with type codes, keyed like simpleResult; single-space words are left out.
747 |      * Types: 1 Chinese text, 2 ANSI word (incl. full-width), 3 ANSI punctuation (incl. full-width), 4 number (incl. full-width), 5 Chinese punctuation or unrecognised character
748 |      * @return array entries of the form ['w' => encoded word, 't' => type]
749 |      */
750 |     public function getSimpleResultAll()
751 |     {
752 |         $typed = [];
753 |         foreach ($this->simpleResult as $pos => $entry) {
754 |             $encoded = StringTool::encoding($entry['w'], $this->targetCharSet);
755 |             if ($encoded == ' ') {
756 |                 continue;
757 |             }
758 |             $typed[$pos] = ['w' => $encoded, 't' => $entry['t']];
759 |         }
760 |
761 |         return $typed;
762 |     }
763 |
764 |     /**
765 |      * Return the most frequent tokens of the final result as a comma-separated string.
766 |      *
767 |      * @param int $num maximum number of keywords to return
768 |      * @return string keywords joined with ","; empty when $num is not positive
769 |      */
770 |     public function getFinallyKeywords($num = 10)
771 |     {
772 |         if ($num <= 0) {
773 |             return '';
774 |         }
775 |
776 |         // Count the occurrences of every encoded token.
777 |         $counts = [];
778 |         foreach ($this->finallyResult as $v) {
779 |             if ($this->resultType == 2 && ($v['t'] == 3 || $v['t'] == 5)) {
780 |                 continue;
781 |             }
782 |             $w = StringTool::encoding($v['w'], $this->targetCharSet);
783 |             if ($w == ' ') {
784 |                 continue;
785 |             }
786 |             $counts[$w] = isset($counts[$w]) ? $counts[$w] + 1 : 1;
787 |         }
788 |         arsort($counts);
789 |
790 |         $keywords = [];
791 |         foreach (array_keys($counts) as $k) {
792 |             // Purely numeric tokens come back from the array as integer keys.
793 |             $k = (string)$k;
794 |             $len = strlen($k);
795 |             // Drop one-byte tokens, two-byte tokens holding a non-alphanumeric byte,
796 |             // and tokens under four bytes that contain no ASCII letter.
797 |             if ($len == 1
798 |                 || ($len == 2 && preg_match('/[^0-9a-zA-Z]/', $k))
799 |                 || ($len < 4 && !preg_match('/[a-zA-Z]/', $k))
800 |             ) {
801 |                 continue;
802 |             }
803 |             $keywords[] = $k;
804 |             // Stop at exactly $num entries; the former "$n > $num" test let one extra through.
805 |             if (count($keywords) >= $num) {
806 |                 break;
807 |             }
808 |         }
809 |
810 |         return implode(',', $keywords);
811 |     }
812 | }
813 |
814 | ?>
815 |
--------------------------------------------------------------------------------
/src/Analysis/ChineseAnalysisInterface.php:
--------------------------------------------------------------------------------
1 | addonDicFile;
16 | // $mainDicFile = dirname(__FILE__) . '/' . $this->mainDicFile;
17 |
18 | //常量定义
19 | $_SP_ = chr(0xFF) . chr(0xFE);
20 | $UCS2 = 'ucs-2be';
21 | $additionFile = __DIR__ . '/dict/words_addons.dic';
22 | $mainDicFile = __DIR__ . '/dict/base_dic_full.dic';
23 | $mainDicHand = null;
24 | $additionDict = [];
25 | $startTime = microtime(true);
26 | // $mainDicFile = null;
27 |
28 | //正常读取文件
29 | $dicAddon = $additionFile;
30 |
31 | if ($mainDic == '' || !file_exists($mainDic)) {
32 | $dicWords = $mainDicFile;
33 | } else {
34 | $dicWords = $mainDic;
35 | $mainDicFile = $mainDic;
36 | }
37 |
38 | // 加载主词典(只打开)
39 | $mainDicHand = fopen($dicWords, 'r');
40 |
41 | // //加载附加的 分词
42 | // if (!empty($additionDict)) {
43 | // $mainDicHand = $mainDicHand . $this->getAdditionDict();
44 | // }
45 |
46 | // 载入副词典
47 | $hw = '';
48 |
49 | $ds = file($dicAddon);
50 | foreach ($ds as $d) {
51 | $d = trim($d);
52 | if ($d == '') continue;
53 | $estr = substr($d, 1, 1);
54 | if ($estr == ':') {
55 | $hw = substr($d, 0, 1);
56 | } else {
57 | $spstr = $_SP_;
58 | $spstr = iconv($UCS2, 'utf-8', $spstr);
59 | $ws = explode(',', $d);
60 | $wall = iconv('utf-8', $UCS2, join($spstr, $ws));
61 | $ws = explode($_SP_, $wall);
62 | foreach ($ws as $estr) {
63 | $additionDict[$hw][$estr] = strlen($estr);
64 | }
65 | }
66 | }
67 |
68 | $loadTime = microtime(true) - $startTime;
69 |
70 | // $isLoadDic = true;
71 |
72 | return [$mainDicHand, $mainDic, $additionDict, $loadTime];
73 | }
74 |
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/src/Analysis/StringTool.php:
--------------------------------------------------------------------------------
1 |
13 | *
14 | * @param $source_file
15 | * @param $target_file
16 | *
17 | * @return void
18 | */
19 |     public function MakeDict($source_file, $target_file = '')
20 |     {
21 |         // Builds the binary dictionary: mask_value header entries of 8 bytes each (offset,
22 |         // byte length, word count of a bucket), followed by the serialized buckets.
23 |         $target_file = ($target_file == '' ? $this->mainDicFile : $target_file);
24 |         $source = fopen($source_file, 'r');
25 |         if ($source === false) {
26 |             // Fail before the target is opened for writing, which would truncate it.
27 |             throw new \RuntimeException("Cannot read dictionary source: {$source_file}");
28 |         }
29 |         $allk = [];
30 |         while (($line = fgets($source, 512)) !== false) {
31 |             $fields = explode(',', $line);
32 |             // Skip '@' lines and lines with fewer than three fields (a blank line used to become a bogus entry).
33 |             if ($line[0] == '@' || count($fields) < 3) continue;
34 |             list($w, $r, $a) = $fields;
35 |             $w = iconv('utf-8', UCS2, $w);
36 |             $allk[$this->_get_index($w)][$w] = [$r, trim($a)];
37 |         }
38 |         fclose($source);
39 |         $target = fopen($target_file, 'w');
40 |         if ($target === false) {
41 |             throw new \RuntimeException("Cannot write dictionary: {$target_file}");
42 |         }
43 |         $header = [];
44 |         $alldat = '';
45 |         // Bucket data starts right after the header table.
46 |         $start_pos = $this->mask_value * 8;
47 |         foreach ($allk as $k => $v) {
48 |             $dat = serialize($v);
49 |             $header[$k] = [$start_pos, strlen($dat), count($v)];
50 |             $alldat .= $dat;
51 |             $start_pos += strlen($dat);
52 |         }
53 |         unset($allk);
54 |         for ($i = 0; $i < $this->mask_value; $i++) {
55 |             list($pos, $len, $cnt) = isset($header[$i]) ? $header[$i] : [0, 0, 0];
56 |             fwrite($target, pack("Inn", $pos, $len, $cnt));
57 |         }
58 |         fwrite($target, $alldat);
59 |         fclose($target);
60 |     }
61 |
62 |     /**
63 |      * Dump the entries of the main dictionary to a text file.
64 |      *
65 |      * Opens $this->mainDicFile for reading when no handle is held yet, then writes the text
66 |      * produced by exportDictCore() to the target file.
67 |      *
68 |      * @param $targetFile path of the file to write
69 |      *
70 |      * @return bool always true
71 |      */
72 |     public function exportDict($targetFile)
73 |     {
74 |         if (!$this->mainDicHand) {
75 |             $this->mainDicHand = fopen($this->mainDicFile, 'r');
76 |         }
77 |
78 |         $out = fopen($targetFile, 'w');
79 |         $dump = $this->ExportDictCore($this->mainDicHand);
80 |         fwrite($out, $dump);
81 |         fclose($out);
82 |
83 |         return true;
84 |     }
98 |
99 |
100 |     /**
101 |      * Read every bucket of an open dictionary file and return its entries as text.
102 |      * @param resource $source_str open, seekable dictionary handle (a stream, despite the name)
103 |      * @return string one "word,value,value" line per entry, words converted to utf-8
104 |      */
105 |     public function exportDictCore($source_str)
106 |     {
107 |         $str = '';
108 |         // MakeDict() writes header slots 0 .. mask_value - 1; "<=" also decoded bucket data as a header.
109 |         for ($i = 0; $i < $this->mask_value; $i++) {
110 |             fseek($source_str, $i * 8, SEEK_SET);
111 |             $dat = fread($source_str, 8);
112 |             if ($dat === false || strlen($dat) < 8) break; // header table is truncated
113 |             $arr = unpack('I1s/n1l/n1c', $dat);
114 |             if ($arr['l'] == 0) continue;
115 |             fseek($source_str, $arr['s'], SEEK_SET);
116 |             // NOTE(review): unserialize() must only ever be fed trusted dictionary files.
117 |             $data = @unserialize(fread($source_str, $arr['l']));
118 |             if (!is_array($data)) continue;
119 |             foreach ($data as $k => $v) {
120 |                 $str .= iconv(UCS2, 'utf-8', $k) . ",{$v[0]},{$v[1]}\n";
121 |             }
122 |         }
123 |
124 |         return $str;
125 |     }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/src/Split/Config.php:
--------------------------------------------------------------------------------
1 | loadConfig();
20 |
21 | ChineseAnalysis::$loadInit = false;
22 |
23 | $this->pa = new ChineseAnalysis('utf-8', 'utf-8', false);
24 | }
25 |
26 |
27 |     /**
28 |      * Register additional words with the underlying analyzer.
29 |      *
30 |      * @param array $words words handed to ChineseAnalysis::setAttach()
31 |      * @return void
32 |      */
33 |     public function attach(array $words = [])
34 |     {
35 |         $this->pa->setAttach($words);
36 |     }
37 |
38 |     /**
39 |      * Segment a string and return the tokens together with their property suffix.
40 |      *
41 |      * @param string $word text to segment
42 |      * @return array non-empty tokens; keys are the explode() positions, which array_filter() keeps
43 |      */
44 |     public function start($word = '')
45 |     {
46 |         $separator = '-';
47 |
48 |         $this->pa->setSource($word);
49 |         $this->pa->startAnalysis(true);
50 |         $tokens = explode($separator, $this->pa->getFinallyResult($separator, true));
51 |
52 |         return array_filter($tokens, function ($token) {
53 |             return !empty($token);
54 |         });
55 |     }
59 |
60 |     /**
61 |      * Segment a string and return the bare words.
62 |      *
63 |      * Runs the same pipeline as start(), then cuts everything from the first '/' on off each token.
64 |      *
65 |      * @param string $string text to segment
66 |      * @return array words, keyed like the result of start()
67 |      */
68 |     public function simple($string = '')
69 |     {
70 |         $words = [];
71 |         foreach ($this->start($string) as $key => $token) {
72 |             $parts = explode('/', $token);
73 |             $words[$key] = $parts[0];
74 |         }
75 |
76 |         return $words;
77 |     }
85 |
86 |     /**
87 |      * Load Config.php from this directory when it exists.
88 |      *
89 |      * @return bool true when the file was found and required, false otherwise
90 |      */
91 |     public static function loadConfig()
92 |     {
93 |         $file = __DIR__ . '/Config.php';
94 |         if (!is_file($file)) {
95 |             return false;
96 |         }
97 |
98 |         require_once($file);
99 |
100 |         return true;
101 |     }
106 |
107 | ?>
108 |
--------------------------------------------------------------------------------
/src/Split/SplitInterface.php:
--------------------------------------------------------------------------------
1 |
30 |
--------------------------------------------------------------------------------
/test.php:
--------------------------------------------------------------------------------
1 | simple("您好 phpSplit"));
5 | ?>
6 |
--------------------------------------------------------------------------------
/tests/Analysis/LoaderTest.php:
--------------------------------------------------------------------------------
1 | getLoadDict();
17 |
18 | $this->assertTrue(true);
19 | // var_dump($result);
20 | }
21 | }
22 |
23 |
--------------------------------------------------------------------------------
/tests/analysisTest.php:
--------------------------------------------------------------------------------
1 | LoadDict();
24 | $pa->SetSource($str);
25 | $pa->differMax = false;
26 | $pa->unitWord = false;
27 | $pa->StartAnalysis(true);
28 | // $resultArray=$pa->GetFinallyIndex();
29 | $getInfo=true;
30 | $sign='-';
31 | $result=$pa->GetFinallyResult($sign,$getInfo);
32 | $result=explode($sign,$result);
33 | $filterResult=[];
34 |
35 | var_dump($result);
36 |
37 | foreach($result as $k=>$value){
38 | if (preg_match('/\/n/i', $value) === 1) {
39 | $arrValue=explode('/',$value);
40 | $filterResult[$arrValue[0]]=(int)preg_replace('/(n[a-z|A-Z]*)/','',$arrValue[1]);
41 | }
42 | }
43 |
44 | $this->assertTrue(count($filterResult)>0);
45 | }
46 | }
47 |
48 |
--------------------------------------------------------------------------------
/tests/initTest.php:
--------------------------------------------------------------------------------
1 | start("您好phpSplit,不管怎么说你开心就好"));
17 |
18 | $this->assertTrue(True);
19 | }
20 |
21 |
22 |     /**
23 |      * Smoke test for Split::simple(): dumps the words, asserts nothing about them.
24 |      */
25 |     public function testSimple()
26 |     {
27 |         echo "test...\n";
28 |
29 |         $split = new Split();
30 |         $words = $split->simple("您好phpSplit,不管怎么说你开心就好");
31 |         var_dump($words);
32 |
33 |         $this->assertTrue(true);
34 |     }
35 |
36 |
37 |     /**
38 |      * Smoke test for attached words: dumps the words, asserts nothing about them.
39 |      */
40 |     public function testAddonSimple()
41 |     {
42 |         echo "test attach ... \n";
43 |
44 |         $split = new Split();
45 |         $split->attach(['康师傅手机']);
46 |         $words = $split->simple("您好phpSplit,你喜欢康师傅手机么?");
47 |         var_dump($words);
48 |         $this->assertTrue(true);
49 |     }
50 | }
51 |
52 |
--------------------------------------------------------------------------------