├── README.md
└── WordAnalysis
├── Analysis.php
├── dict
├── base_dic_full.dic
├── not-build
│ └── base_dic_full.txt
├── readme.txt
└── words_addons.dic
├── dict_build.php
└── phpanalysis.class.php
/README.md:
--------------------------------------------------------------------------------
1 | # PHPAnalysis
2 | PHP中文分词,使用PHPAnalysis
3 | 示例请移步:https://feifei.blog.csdn.net/article/details/99717174
--------------------------------------------------------------------------------
/WordAnalysis/Analysis.php:
--------------------------------------------------------------------------------
1 | LoadDict ();
35 | $pa->SetSource ($content);
36 | $pa->StartAnalysis ( true );
37 |
38 | $tags = $pa->GetFinallyKeywords ($num); // 获取文章中的n个关键字
39 | return $tags;//返回关键字
40 | }
41 |
42 | }
--------------------------------------------------------------------------------
/WordAnalysis/dict/base_dic_full.dic:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feixuekeji/PHPAnalysis/eafb8c6e61f5d1a3d2d168dc3e42052ac4fa397f/WordAnalysis/dict/base_dic_full.dic
--------------------------------------------------------------------------------
/WordAnalysis/dict/readme.txt:
--------------------------------------------------------------------------------
1 | 文件说明:
2 |
3 | 1、base_dic_full.dic
4 | hash索引 -- 字典带有词频和词性标志。
5 |
6 | 2、words_addons.dic
7 | s 开头的表示停止词 u 后缀词(地名后缀、数学单位等) n 前导词(姓、汉字数词等) a 后导词(地区,部门等)
8 |
9 | 3、 not-build/base_dic_full.txt
10 | 没编译过的词典源码
11 |
12 | 4、重新编译词典的方法:
13 |
14 | MakeDict( sourcefile, 16 , 'dict/base_dic_full.dic');
22 |
23 | echo "OK";
24 |
25 | ?>
--------------------------------------------------------------------------------
/WordAnalysis/dict/words_addons.dic:
--------------------------------------------------------------------------------
1 | s:停止词
2 | 并,让,才,上,被,把,近,而,是,为,由,等,合,子,除,均,很,也,称,还,分,据,后,向,经,对,但,只,则,设,靠,至,到,将,及,与,或,来,了,从,说,就,的,和,在,方,以,已,有,都,给,要
3 |
4 | n:姓或其它专用前缀词
5 | 新,肖,胡,罗,程,施,满,石,秦,苏,范,包,袁,许,舒,薛,蒋,董,白,田,季,丁,汪,段,梁,林,杜,杨,毛,江,熊,王,潘,沈,汤,谢,谭,韩,顾,雷,陈,阎,陆,马,高,龙,龚,黎,黄,魏,钱,钟,赵,邓,赖,贾,贺,邱,邵,郭,金,郝,郑,邹,李,武,余,夏,唐,朱,何,姚,孟,孙,孔,姜,周,吴,卢,单,刘,冯,史,叶,吕,候,傅,宋,任,文,戴,徐,张,万,方,曾,曹,易,廖,彭,常,尹,乔,于,康,崔,布,钟离,令狐,公冶,公孙,闻人,鲜于,上官,仲孙,万俟,东方,闾丘,长孙,诸葛,申屠,皇甫,尉迟,濮阳,澹台,欧阳,慕容,淳于,宗政,宇文,司徒,轩辕,单于,赫连,司空,太叔,夏侯,司马,公羊,勿,成吉,埃,哈
6 |
7 | u:单位或专用后缀词
8 | u‰,℃,℉,毛,段,步,毫,池,滴,派,洲,款,次,桩,档,桌,桶,梯,楼,棵,炮,点,盏,盆,界,盒,盘,眼,画,男,环,版,片,班,瓣,生,瓶,案,格,族,方,斤,日,时,期,月,曲,斗,文,指,拳,拨,掌,排,丈,撮,本,朵,栋,柜,柄,栏,株,根,样,架,枪,条,束,村,杯,枝,枚,石,码,辈,辆,轮,连,通,里,部,遍,转,车,言,角,袋,课,起,路,趟,重,针,项,顷,顶,顿,颗,首,餐,页,集,锅,钱,钟,门,间,隅,队,行,节,筐,笔,筒,箱,篮,篓,篇,章,站,磅,碟,碗,种,科,窝,秒,簇,米,脚,股,群,船,艇,色,艘,罐,级,粒,类,组,维,缸,缕,招,支,发,双,厘,口,句,台,只,厅,卷,包,勺,匙,匹,升,区,叶,号,地,圈,圆,场,块,堆,坪,团,回,吨,名,拍,员,周,副,剑,代,付,件,伏,份,人,亩,世,下,两,个,串,伙,位,划,分,列,则,剂,刻,刀,出,倍,例,元,克,册,具,声,听,幅,帧,房,批,师,岁,尾,尺,局,层,届,手,壶,成,张,截,户,扇,年,度,座,尊,幢,室,寸,头,宗,字,孔,所,女,套,拉,家,处,折,天,把,夜,担,號,个月,公斤,公分,公克,公担,公亩,公升,公尺,像素,月份,盎司,位数,公里,年级,点钟,克拉,英亩,平方,加仑,公顷,秒钟,千克,世纪,千米,分钟,海里,英寸,英尺,英里,年代,周年,小时,阶段,平米,立方米,立方码,平方米,平方码,平方厘米,立方英寸,立方厘米,立方分米,立方公尺,立方英尺,平方公尺,平方英尺,平方英寸,平方分米,平方公里,平方英里,百位,十位,百次,千次,千名,千亩,千里,千人,千台,千位,万次,万元,万里,万位,万件,万单,万个,万台,万名,万人,亿元,亿,万,千,萬
9 |
10 | a:地名等后置词
11 | 语,署,苑,街,省,湖,乡,海,观,路,娃,山,阁,部,镇,江,河,厅,郡,厂,楼,园,区,党,井,亭,塔,县,家,市,弄,巷,寺,局,中路,村委,诺夫,斯基,维奇,村委会,机,型,率
12 |
13 | c:数量前缀词
14 | 零,一,二,三,四,五,六,七,八,九,十,百,千,万,亿,第,半,几,俩,卅,两,壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,伯,仟
15 |
16 | t:省会等专用词
17 | 京,津,沪,渝,冀,豫,云,辽,黑,湘,皖,鲁,新,苏,浙,赣,鄂,桂,甘,晋,蒙,陕,吉,闽,贵,粤,青,藏,川,宁,琼
--------------------------------------------------------------------------------
/WordAnalysis/dict_build.php:
--------------------------------------------------------------------------------
1 | 请选择要进行的操作:
";
12 | echo "1、用原始文件(dict/not-build/base_dic_full.txt)生成一个标准词典;
";
13 | echo "2、从默认词典(dict/base_dic_full.dic),反编译出原始文件。";
14 | exit();
15 | }
16 |
17 | if( $_GET['ac']=='make' )
18 | {
19 | PhpAnalysis::$loadInit = false;
20 | $pa = new PhpAnalysis('utf-8', 'utf-8', false);
21 | $pa->MakeDict( $dicAddon );
22 | echo "完成词典创建!";
23 | exit();
24 | }
25 | else
26 | {
27 | $pa = new PhpAnalysis('utf-8', 'utf-8', true);
28 | $pa->ExportDict('base_dic_source.txt');
29 | echo "完成反编译词典文件,生成的文件为:base_dic_source.txt !";
30 | exit();
31 | }
32 | ?>
--------------------------------------------------------------------------------
/WordAnalysis/phpanalysis.class.php:
--------------------------------------------------------------------------------
1 | StartAnalysis -> Get***Result
7 | * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
8 | *
9 | * Copyright IT柏拉图 QQ: 2500875 Email: 2500875#qq.com
10 | *
11 | * @version 2.0
12 | *
13 | */
14 |
15 | //常量定义
16 | define('_SP_', chr(0xFF).chr(0xFE));
17 | define('UCS2', 'ucs-2be');
18 | class PhpAnalysis
19 | {
20 |
21 | //hash算法选项
22 | public $mask_value = 0xFFFF;
23 |
24 | //输入和输出的字符编码(只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型)
25 | public $sourceCharSet = 'utf-8';
26 | public $targetCharSet = 'utf-8';
27 |
28 | //生成的分词结果数据类型 1 为全部, 2为 词典词汇及单个中日韩简繁字符及英文, 3 为词典词汇及英文
29 | public $resultType = 1;
30 |
31 | //句子长度小于这个数值时不拆分,notSplitLen = n(个汉字) * 2 + 1
32 | public $notSplitLen = 5;
33 |
34 | //把英文单词全部转小写
35 | public $toLower = false;
36 |
37 | //使用最大切分模式对二元词进行消岐
38 | public $differMax = false;
39 |
40 | //尝试合并单字
41 | public $unitWord = true;
42 |
43 | //初始化类时直接加载词典
44 | public static $loadInit = true;
45 |
46 | //使用热门词优先模式进行消岐
47 | public $differFreq = false;
48 |
49 | //被转换为unicode的源字符串
50 | private $sourceString = '';
51 |
52 | //附加词典
53 | public $addonDic = array();
54 | public $addonDicFile = 'dict/words_addons.dic';
55 |
56 | //主词典
57 | public $dicStr = '';
58 | public $mainDic = array();
59 | public $mainDicHand = false;
60 | public $mainDicInfos = array();
61 | public $mainDicFile = 'dict/base_dic_full.dic';
62 | //是否直接载入词典(选是载入速度较慢,但解析较快;选否载入较快,但解析较慢,需要时才会载入特定的词条)
63 | private $isLoadAll = false;
64 |
65 | //主词典词语最大长度 x / 2
66 | private $dicWordMax = 14;
67 | //粗分后的数组(通常是截取句子等用途)
68 | private $simpleResult = array();
69 | //最终结果(用空格分开的词汇列表)
70 | private $finallyResult = '';
71 |
72 | //是否已经载入词典
73 | public $isLoadDic = false;
74 | //系统识别或合并的新词
75 | public $newWords = array();
76 | public $foundWordStr = '';
77 | //词库载入时间
78 | public $loadTime = 0;
79 |
80 | /**
81 | * 构造函数
82 | * @param $source_charset
83 | * @param $target_charset
84 | * @param $load_alldic
85 | * @param $source
86 | *
87 | * @return void
88 | */
89 | public function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=true, $source='')
90 | {
91 | $this->addonDicFile = dirname(__FILE__).'/'.$this->addonDicFile;
92 | $this->mainDicFile = dirname(__FILE__).'/'.$this->mainDicFile;
93 | $this->SetSource( $source, $source_charset, $target_charset );
94 | $this->isLoadAll = $load_all;
95 | if(self::$loadInit) $this->LoadDict();
96 | }
97 |
98 | /**
99 | * 析构函数
100 | */
101 | function __destruct()
102 | {
103 | if( $this->mainDicHand !== false )
104 | {
105 | @fclose( $this->mainDicHand );
106 | }
107 | }
108 |
109 | /**
110 | * 根据字符串计算key索引
111 | * @param $key
112 | * @return short int
113 | */
114 | private function _get_index( $key )
115 | {
116 | $l = strlen($key);
117 | $h = 0x238f13af;
118 | while ($l--)
119 | {
120 | $h += ($h << 5);
121 | $h ^= ord($key[$l]);
122 | $h &= 0x7fffffff;
123 | }
124 | return ($h % $this->mask_value);
125 | }
126 |
127 | /**
128 | * 从文件获得词
129 | * @param $key
130 | * @param $type (类型 word 或 key_groups)
131 | * @return short int
132 | */
133 | public function GetWordInfos( $key, $type='word' )
134 | {
135 | if( !$this->mainDicHand )
136 | {
137 | $this->mainDicHand = fopen($this->mainDicFile, 'r');
138 | }
139 | $p = 0;
140 | $keynum = $this->_get_index( $key );
141 | if( isset($this->mainDicInfos[ $keynum ]) )
142 | {
143 | $data = $this->mainDicInfos[ $keynum ];
144 | }
145 | else
146 | {
147 | //rewind( $this->mainDicHand );
148 | $move_pos = $keynum * 8;
149 | fseek($this->mainDicHand, $move_pos, SEEK_SET);
150 | $dat = fread($this->mainDicHand, 8);
151 | $arr = unpack('I1s/n1l/n1c', $dat);
152 | if( $arr['l'] == 0 )
153 | {
154 | return false;
155 | }
156 | fseek($this->mainDicHand, $arr['s'], SEEK_SET);
157 | $data = @unserialize(fread($this->mainDicHand, $arr['l']));
158 | $this->mainDicInfos[ $keynum ] = $data;
159 | }
160 | if( !is_array($data) || !isset($data[$key]) )
161 | {
162 | return false;
163 | }
164 | return ($type=='word' ? $data[$key] : $data);
165 | }
166 |
167 | /**
168 | * 设置源字符串
169 | * @param $source
170 | * @param $source_charset
171 | * @param $target_charset
172 | *
173 | * @return bool
174 | */
175 | public function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
176 | {
177 | $this->sourceCharSet = strtolower($source_charset);
178 | $this->targetCharSet = strtolower($target_charset);
179 | $this->simpleResult = array();
180 | $this->finallyResult = array();
181 | $this->finallyIndex = array();
182 | if( $source != '' )
183 | {
184 | $rs = true;
185 | if( preg_match("/^utf/", $source_charset) ) {
186 | $this->sourceString = iconv('utf-8', UCS2, $source);
187 | }
188 | else if( preg_match("/^gb/", $source_charset) ) {
189 | $this->sourceString = iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source));
190 | }
191 | else if( preg_match("/^big/", $source_charset) ) {
192 | $this->sourceString = iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source));
193 | }
194 | else {
195 | $rs = false;
196 | }
197 | }
198 | else
199 | {
200 | $rs = false;
201 | }
202 | return $rs;
203 | }
204 |
205 | /**
206 | * 设置结果类型(只在获取finallyResult才有效)
207 | * @param $rstype 1 为全部, 2去除特殊符号
208 | *
209 | * @return void
210 | */
211 | public function SetResultType( $rstype )
212 | {
213 | $this->resultType = $rstype;
214 | }
215 |
216 | /**
217 | * 载入词典
218 | *
219 | * @return void
220 | */
221 | public function LoadDict( $maindic='' )
222 | {
223 | $startt = microtime(true);
224 | //正常读取文件
225 | $dicAddon = $this->addonDicFile;
226 | if($maindic=='' || !file_exists($maindic) )
227 | {
228 | $dicWords = $this->mainDicFile ;
229 | }
230 | else
231 | {
232 | $dicWords = $maindic;
233 | $this->mainDicFile = $maindic;
234 | }
235 |
236 | //加载主词典(只打开)
237 | $this->mainDicHand = fopen($dicWords, 'r');
238 |
239 | //载入副词典
240 | $hw = '';
241 | $ds = file($dicAddon);
242 | foreach($ds as $d)
243 | {
244 | $d = trim($d);
245 | if($d=='') continue;
246 | $estr = substr($d, 1, 1);
247 | if( $estr==':' ) {
248 | $hw = substr($d, 0, 1);
249 | }
250 | else
251 | {
252 | $spstr = _SP_;
253 | $spstr = iconv(UCS2, 'utf-8', $spstr);
254 | $ws = explode(',', $d);
255 | $wall = iconv('utf-8', UCS2, join($spstr, $ws));
256 | $ws = explode(_SP_, $wall);
257 | foreach($ws as $estr)
258 | {
259 | $this->addonDic[$hw][$estr] = strlen($estr);
260 | }
261 | }
262 | }
263 | $this->loadTime = microtime(true) - $startt;
264 | $this->isLoadDic = true;
265 | }
266 |
267 | /**
268 | * 检测某个词是否存在
269 | */
270 | public function IsWord( $word )
271 | {
272 | $winfos = $this->GetWordInfos( $word );
273 | return ($winfos !== false);
274 | }
275 |
276 | /**
277 | * 获得某个词的词性及词频信息
278 | * @parem $word unicode编码的词
279 | * @return void
280 | */
281 | public function GetWordProperty($word)
282 | {
283 | if( strlen($word)<4 )
284 | {
285 | return '/s';
286 | }
287 | $infos = $this->GetWordInfos($word);
288 | return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s";
289 | }
290 |
291 | /**
292 | * 指定某词的词性信息(通常是新词)
293 | * @parem $word unicode编码的词
294 | * @parem $infos array('c' => 词频, 'm' => 词性);
295 | * @return void;
296 | */
297 | public function SetWordInfos($word, $infos)
298 | {
299 | if( strlen($word)<4 )
300 | {
301 | return ;
302 | }
303 | if( isset($this->mainDicInfos[$word]) )
304 | {
305 | $this->newWords[$word]++;
306 | $this->mainDicInfos[$word]['c']++;
307 | }
308 | else
309 | {
310 | $this->newWords[$word] = 1;
311 | $this->mainDicInfos[$word] = $infos;
312 | }
313 | }
314 |
315 | /**
316 | * 开始执行分析
317 | * @parem bool optimize 是否对结果进行优化
318 | * @return bool
319 | */
320 | public function StartAnalysis($optimize=true)
321 | {
322 | if( !$this->isLoadDic )
323 | {
324 | $this->LoadDict();
325 | }
326 | $this->simpleResult = $this->finallyResult = array();
327 | $this->sourceString .= chr(0).chr(32);
328 | $slen = strlen($this->sourceString);
329 | $sbcArr = array();
330 | $j = 0;
331 | //全角与半角字符对照表
332 | for($i=0xFF00; $i < 0xFF5F; $i++)
333 | {
334 | $scb = 0x20 + $j;
335 | $j++;
336 | $sbcArr[$i] = $scb;
337 | }
338 | //对字符串进行粗分
339 | $onstr = '';
340 | $lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符
341 | $s = 0;
342 | $ansiWordMatch = "[0-9a-z@#%\+\.-]";
343 | $notNumberMatch = "[a-z@#%\+]";
344 | for($i=0; $i < $slen; $i++)
345 | {
346 | $c = $this->sourceString[$i].$this->sourceString[++$i];
347 | $cn = hexdec(bin2hex($c));
348 | $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
349 | //ANSI字符
350 | if($cn < 0x80)
351 | {
352 | if( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) )
353 | {
354 | if( $lastc != 2 && $onstr != '') {
355 | $this->simpleResult[$s]['w'] = $onstr;
356 | $this->simpleResult[$s]['t'] = $lastc;
357 | $this->_deep_analysis($onstr, $lastc, $s, $optimize);
358 | $s++;
359 | $onstr = '';
360 | }
361 | $lastc = 2;
362 | $onstr .= chr(0).chr($cn);
363 | }
364 | else
365 | {
366 | if( $onstr != '' )
367 | {
368 | $this->simpleResult[$s]['w'] = $onstr;
369 | if( $lastc==2 )
370 | {
371 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
372 | }
373 | $this->simpleResult[$s]['t'] = $lastc;
374 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
375 | $s++;
376 | }
377 | $onstr = '';
378 | $lastc = 3;
379 | if($cn < 31)
380 | {
381 | continue;
382 | }
383 | else
384 | {
385 | $this->simpleResult[$s]['w'] = chr(0).chr($cn);
386 | $this->simpleResult[$s]['t'] = 3;
387 | $s++;
388 | }
389 | }
390 | }
391 | //普通字符
392 | else
393 | {
394 | //正常文字
395 | if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
396 | || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
397 | {
398 | if( $lastc != 1 && $onstr != '')
399 | {
400 | $this->simpleResult[$s]['w'] = $onstr;
401 | if( $lastc==2 )
402 | {
403 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
404 | }
405 | $this->simpleResult[$s]['t'] = $lastc;
406 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
407 | $s++;
408 | $onstr = '';
409 | }
410 | $lastc = 1;
411 | $onstr .= $c;
412 | }
413 | //特殊符号
414 | else
415 | {
416 | if( $onstr != '' )
417 | {
418 | $this->simpleResult[$s]['w'] = $onstr;
419 | if( $lastc==2 )
420 | {
421 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
422 | }
423 | $this->simpleResult[$s]['t'] = $lastc;
424 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
425 | $s++;
426 | }
427 |
428 | //检测书名
429 | if( $cn == 0x300A )
430 | {
431 | $tmpw = '';
432 | $n = 1;
433 | $isok = false;
434 | $ew = chr(0x30).chr(0x0B);
435 | while(true)
436 | {
437 | if( !isset($this->sourceString[$i+$n+1]) ) break;
438 | $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
439 | if( $w == $ew )
440 | {
441 | $this->simpleResult[$s]['w'] = $c;
442 | $this->simpleResult[$s]['t'] = 5;
443 | $s++;
444 |
445 | $this->simpleResult[$s]['w'] = $tmpw;
446 | $this->newWords[$tmpw] = 1;
447 | if( !isset($this->newWords[$tmpw]) )
448 | {
449 | $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
450 | $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
451 | }
452 | $this->simpleResult[$s]['t'] = 13;
453 |
454 | $s++;
455 |
456 | //最大切分模式对书名继续分词
457 | if( $this->differMax )
458 | {
459 | $this->simpleResult[$s]['w'] = $tmpw;
460 | $this->simpleResult[$s]['t'] = 21;
461 | $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
462 | $s++;
463 | }
464 |
465 | $this->simpleResult[$s]['w'] = $ew;
466 | $this->simpleResult[$s]['t'] = 5;
467 | $s++;
468 |
469 | $i = $i + $n + 1;
470 | $isok = true;
471 | $onstr = '';
472 | $lastc = 5;
473 | break;
474 | }
475 | else
476 | {
477 | $n = $n+2;
478 | $tmpw .= $w;
479 | if( strlen($tmpw) > 60 )
480 | {
481 | break;
482 | }
483 | }
484 | }//while
485 | if( !$isok )
486 | {
487 | $this->simpleResult[$s]['w'] = $c;
488 | $this->simpleResult[$s]['t'] = 5;
489 | $s++;
490 | $onstr = '';
491 | $lastc = 5;
492 | }
493 | continue;
494 | }
495 |
496 | $onstr = '';
497 | $lastc = 5;
498 | if( $cn==0x3000 )
499 | {
500 | continue;
501 | }
502 | else
503 | {
504 | $this->simpleResult[$s]['w'] = $c;
505 | $this->simpleResult[$s]['t'] = 5;
506 | $s++;
507 | }
508 | }//2byte symbol
509 |
510 | }//end 2byte char
511 |
512 | }//end for
513 |
514 | //处理分词后的结果
515 | $this->_sort_finally_result();
516 | }
517 |
518 | /**
519 | * 深入分词
520 | * @parem $str
521 | * @parem $ctype (2 英文类, 3 中/韩/日文类)
522 | * @parem $spos 当前粗分结果游标
523 | * @return bool
524 | */
525 | private function _deep_analysis( &$str, $ctype, $spos, $optimize=true )
526 | {
527 |
528 | //中文句子
529 | if( $ctype==1 )
530 | {
531 | $slen = strlen($str);
532 | //小于系统配置分词要求长度的句子
533 | if( $slen < $this->notSplitLen )
534 | {
535 | $tmpstr = '';
536 | $lastType = 0;
537 | if( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t'];
538 | if($slen < 5)
539 | {
540 | //echo iconv(UCS2, 'utf-8', $str).'
';
541 | if( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
542 | {
543 | $str2 = '';
544 | if( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) )
545 | {
546 | $str2 = substr($str, 2, 2);
547 | $str = substr($str, 0, 2);
548 | }
549 | $ww = $this->simpleResult[$spos - 1]['w'].$str;
550 | $this->simpleResult[$spos - 1]['w'] = $ww;
551 | $this->simpleResult[$spos - 1]['t'] = 4;
552 | if( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) )
553 | {
554 | $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
555 | $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
556 | }
557 | $this->simpleResult[$spos]['w'] = '';
558 | if( $str2 != '' )
559 | {
560 | $this->finallyResult[$spos-1][] = $ww;
561 | $this->finallyResult[$spos-1][] = $str2;
562 | }
563 | }
564 | else {
565 | $this->finallyResult[$spos][] = $str;
566 | }
567 | }
568 | else
569 | {
570 | $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
571 | }
572 | }
573 | //正常长度的句子,循环进行分词处理
574 | else
575 | {
576 | $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
577 | }
578 | }
579 | //英文句子,转为小写
580 | else
581 | {
582 | if( $this->toLower ) {
583 | $this->finallyResult[$spos][] = strtolower($str);
584 | }
585 | else {
586 | $this->finallyResult[$spos][] = $str;
587 | }
588 | }
589 | }
590 |
591 | /**
592 | * 中文的深入分词
593 | * @parem $str
594 | * @return void
595 | */
596 | private function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=true )
597 | {
598 | $quote1 = chr(0x20).chr(0x1C);
599 | $tmparr = array();
600 | $hasw = 0;
601 | //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理。
602 | if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
603 | {
604 | $tmparr[] = $str;
605 | if( !isset($this->newWords[$str]) )
606 | {
607 | $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
608 | $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
609 | }
610 | if( !$this->differMax )
611 | {
612 | $this->finallyResult[$spos][] = $str;
613 | return ;
614 | }
615 | }
616 | //进行切分
617 | for($i=$slen-1; $i > 0; $i -= 2)
618 | {
619 | //单个词
620 | $nc = $str[$i-1].$str[$i];
621 | //是否已经到最后两个字
622 | if( $i <= 2 )
623 | {
624 | $tmparr[] = $nc;
625 | $i = 0;
626 | break;
627 | }
628 | $isok = false;
629 | $i = $i + 1;
630 | for($k=$this->dicWordMax; $k>1; $k=$k-2)
631 | {
632 | if($i < $k) continue;
633 | $w = substr($str, $i-$k, $k);
634 | if( strlen($w) <= 2 )
635 | {
636 | $i = $i - 1;
637 | break;
638 | }
639 | if( $this->IsWord( $w ) )
640 | {
641 | $tmparr[] = $w;
642 | $i = $i - $k + 1;
643 | $isok = true;
644 | break;
645 | }
646 | }
647 | //echo '