├── README.md └── WordAnalysis ├── Analysis.php ├── dict ├── base_dic_full.dic ├── not-build │ └── base_dic_full.txt ├── readme.txt └── words_addons.dic ├── dict_build.php └── phpanalysis.class.php /README.md: -------------------------------------------------------------------------------- 1 | # PHPAnalysis 2 | PHP中分分词,使用PHPAnalysis 3 | 示例请移步:https://feifei.blog.csdn.net/article/details/99717174 -------------------------------------------------------------------------------- /WordAnalysis/Analysis.php: -------------------------------------------------------------------------------- 1 | LoadDict (); 35 | $pa->SetSource ($content); 36 | $pa->StartAnalysis ( true ); 37 | 38 | $tags = $pa->GetFinallyKeywords ($num); // 获取文章中的n个关键字 39 | return $tags;//返回关键字 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /WordAnalysis/dict/base_dic_full.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feixuekeji/PHPAnalysis/eafb8c6e61f5d1a3d2d168dc3e42052ac4fa397f/WordAnalysis/dict/base_dic_full.dic -------------------------------------------------------------------------------- /WordAnalysis/dict/readme.txt: -------------------------------------------------------------------------------- 1 | 文件说明: 2 | 3 | 1、base_dic_full.dic 4 | hash索引 -- 字典带有词频和词性标志。 5 | 6 | 2、words_addons.dic 7 | s 开头的表示停止词 u 后缀词(地名后缀、数学单位等) n 前导词(姓、汉字数词等) a 后导词(地区,部门等) 8 | 9 | 3、 not-build/base_dic_full.txt 10 | 没编译过的词典源码 11 | 12 | 4、重新编译词典的方法: 13 | 14 | MakeDict( sourcefile, 16 , 'dict/base_dic_full.dic'); 22 | 23 | echo "OK"; 24 | 25 | ?> -------------------------------------------------------------------------------- /WordAnalysis/dict/words_addons.dic: -------------------------------------------------------------------------------- 1 | s:停止词 2 | 并,让,才,上,被,把,近,而,是,为,由,等,合,子,除,均,很,也,称,还,分,据,后,向,经,对,但,只,则,设,靠,至,到,将,及,与,或,来,了,从,说,就,的,和,在,方,以,已,有,都,给,要 3 | 4 | n:姓或其它专用前缀词 5 | 新,肖,胡,罗,程,施,满,石,秦,苏,范,包,袁,许,舒,薛,蒋,董,白,田,季,丁,汪,段,梁,林,杜,杨,毛,江,熊,王,潘,沈,汤,谢,谭,韩,顾,雷,陈,阎,陆,马,高,龙,龚,黎,黄,魏,钱,钟,赵,邓,赖,贾,贺,邱,邵,郭,金,郝,郑,邹,李,武,余,夏,唐,朱,何,姚,孟,孙,孔,姜,周,吴,卢,单,刘,冯,史,叶,吕,候,傅,宋,任,文,戴,徐,张,万,方,曾,曹,易,廖,彭,常,尹,乔,于,康,崔,布,钟离,令狐,公冶,公孙,闻人,鲜于,上官,仲孙,万俟,东方,闾丘,长孙,诸葛,申屠,皇甫,尉迟,濮阳,澹台,欧阳,慕容,淳于,宗政,宇文,司徒,轩辕,单于,赫连,司空,太叔,夏侯,司马,公羊,勿,成吉,埃,哈 6 | 7 | u:单位或专用后缀词 8 | u‰,℃,℉,毛,段,步,毫,池,滴,派,洲,款,次,桩,档,桌,桶,梯,楼,棵,炮,点,盏,盆,界,盒,盘,眼,画,男,环,版,片,班,瓣,生,瓶,案,格,族,方,斤,日,时,期,月,曲,斗,文,指,拳,拨,掌,排,丈,撮,本,朵,栋,柜,柄,栏,株,根,样,架,枪,条,束,村,杯,枝,枚,石,码,辈,辆,轮,连,通,里,部,遍,转,车,言,角,袋,课,起,路,趟,重,针,项,顷,顶,顿,颗,首,餐,页,集,锅,钱,钟,门,间,隅,队,行,节,筐,笔,筒,箱,篮,篓,篇,章,站,磅,碟,碗,种,科,窝,秒,簇,米,脚,股,群,船,艇,色,艘,罐,级,粒,类,组,维,缸,缕,招,支,发,双,厘,口,句,台,只,厅,卷,包,勺,匙,匹,升,区,叶,号,地,圈,圆,场,块,堆,坪,团,回,吨,名,拍,员,周,副,剑,代,付,件,伏,份,人,亩,世,下,两,个,串,伙,位,划,分,列,则,剂,刻,刀,出,倍,例,元,克,册,具,声,听,幅,帧,房,批,师,岁,尾,尺,局,层,届,手,壶,成,张,截,户,扇,年,度,座,尊,幢,室,寸,头,宗,字,孔,所,女,套,拉,家,处,折,天,把,夜,担,號,个月,公斤,公分,公克,公担,公亩,公升,公尺,像素,月份,盎司,位数,公里,年级,点钟,克拉,英亩,平方,加仑,公顷,秒钟,千克,世纪,千米,分钟,海里,英寸,英尺,英里,年代,周年,小时,阶段,平米,立方米,立方码,平方米,平方码,平方厘米,立方英寸,立方厘米,立方分米,立方公尺,立方英尺,平方公尺,平方英尺,平方英寸,平方分米,平方公里,平方英里,百位,十位,百次,千次,千名,千亩,千里,千人,千台,千位,万次,万元,万里,万位,万件,万单,万个,万台,万名,万人,亿元,亿,万,千,萬 9 | 10 | a:地名等后置词 11 | 语,署,苑,街,省,湖,乡,海,观,路,娃,山,阁,部,镇,江,河,厅,郡,厂,楼,园,区,党,井,亭,塔,县,家,市,弄,巷,寺,局,中路,村委,诺夫,斯基,维奇,村委会,机,型,率 12 | 13 | c:数量前缀词 14 | 零,一,二,三,四,五,六,七,八,九,十,百,千,万,亿,第,半,几,俩,卅,两,壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,伯,仟 15 | 16 | t:省会等专用词 17 | 京,津,沪,渝,冀,豫,云,辽,黑,湘,皖,鲁,新,苏,浙,赣,鄂,桂,甘,晋,蒙,陕,吉,闽,贵,粤,青,藏,川,宁,琼 -------------------------------------------------------------------------------- /WordAnalysis/dict_build.php: 
-------------------------------------------------------------------------------- 1 | 请选择要进行的操作:
"; 12 | echo "1、用原始文件(dict/not-build/base_dic_full.txt)生成一个标准词典;
"; 13 | echo "2、从默认词典(dict/base_dic_full.dic),反编译出原始文件。"; 14 | exit(); 15 | } 16 | 17 | if( $_GET['ac']=='make' ) 18 | { 19 | PhpAnalysis::$loadInit = false; 20 | $pa = new PhpAnalysis('utf-8', 'utf-8', false); 21 | $pa->MakeDict( $dicAddon ); 22 | echo "完成词典创建!"; 23 | exit(); 24 | } 25 | else 26 | { 27 | $pa = new PhpAnalysis('utf-8', 'utf-8', true); 28 | $pa->ExportDict('base_dic_source.txt'); 29 | echo "完成反编译词典文件,生成的文件为:base_dic_source.txt !"; 30 | exit(); 31 | } 32 | ?> -------------------------------------------------------------------------------- /WordAnalysis/phpanalysis.class.php: -------------------------------------------------------------------------------- 1 | StartAnalysis -> Get***Result 7 | * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作 8 | * 9 | * Copyright IT柏拉图 QQ: 2500875 Email: 2500875#qq.com 10 | * 11 | * @version 2.0 12 | * 13 | */ 14 | 15 | //常量定义 16 | define('_SP_', chr(0xFF).chr(0xFE)); 17 | define('UCS2', 'ucs-2be'); 18 | class PhpAnalysis 19 | { 20 | 21 | //hash算法选项 22 | public $mask_value = 0xFFFF; 23 | 24 | //输入和输出的字符编码(只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型) 25 | public $sourceCharSet = 'utf-8'; 26 | public $targetCharSet = 'utf-8'; 27 | 28 | //生成的分词结果数据类型 1 为全部, 2为 词典词汇及单个中日韩简繁字符及英文, 3 为词典词汇及英文 29 | public $resultType = 1; 30 | 31 | //句子长度小于这个数值时不拆分,notSplitLen = n(个汉字) * 2 + 1 32 | public $notSplitLen = 5; 33 | 34 | //把英文单词全部转小写 35 | public $toLower = false; 36 | 37 | //使用最大切分模式对二元词进行消岐 38 | public $differMax = false; 39 | 40 | //尝试合并单字 41 | public $unitWord = true; 42 | 43 | //初始化类时直接加载词典 44 | public static $loadInit = true; 45 | 46 | //使用热门词优先模式进行消岐 47 | public $differFreq = false; 48 | 49 | //被转换为unicode的源字符串 50 | private $sourceString = ''; 51 | 52 | //附加词典 53 | public $addonDic = array(); 54 | public $addonDicFile = 'dict/words_addons.dic'; 55 | 56 | //主词典 57 | public $dicStr = ''; 58 | public $mainDic = array(); 59 | public $mainDicHand = false; 60 | public $mainDicInfos = array(); 61 | public $mainDicFile = 'dict/base_dic_full.dic'; 62 | //是否直接载入词典(选是载入速度较慢,但解析较快;选否载入较快,但解析较慢,需要时才会载入特定的词条) 63 | private $isLoadAll = false; 64 | 65 | //主词典词语最大长度 x / 2 66 | private $dicWordMax = 14; 67 | //粗分后的数组(通常是截取句子等用途) 68 | private $simpleResult = array(); 69 | //最终结果(用空格分开的词汇列表) 70 | private $finallyResult = ''; 71 | 72 | //是否已经载入词典 73 | public $isLoadDic = false; 74 | //系统识别或合并的新词 75 | public $newWords = array(); 76 | public $foundWordStr = ''; 77 | //词库载入时间 78 | public $loadTime = 0; 79 | 80 | /** 81 | * 构造函数 82 | * @param $source_charset 83 | * @param $target_charset 84 | * @param $load_alldic 85 | * @param $source 86 | * 87 | * @return void 88 | */ 89 | public function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=true, $source='') 90 | { 91 | $this->addonDicFile = dirname(__FILE__).'/'.$this->addonDicFile; 92 | $this->mainDicFile = dirname(__FILE__).'/'.$this->mainDicFile; 93 | $this->SetSource( $source, $source_charset, $target_charset ); 94 | $this->isLoadAll = $load_all; 95 | if(self::$loadInit) $this->LoadDict(); 96 | } 97 | 98 | /** 99 | * 析构函数 100 | */ 101 | function __destruct() 102 | { 103 | if( $this->mainDicHand !== false ) 104 | { 105 | @fclose( $this->mainDicHand ); 106 | } 107 | } 108 | 109 | /** 110 | * 根据字符串计算key索引 111 | * @param $key 112 | * @return short int 113 | */ 114 | private function _get_index( $key ) 115 | { 116 | $l = strlen($key); 117 | $h = 0x238f13af; 118 | while ($l--) 119 | { 120 | $h += ($h << 5); 121 | $h ^= ord($key[$l]); 122 | $h &= 0x7fffffff; 123 | } 124 | return ($h % $this->mask_value); 125 | } 126 | 127 | 
/** 128 | * 从文件获得词 129 | * @param $key 130 | * @param $type (类型 word 或 key_groups) 131 | * @return short int 132 | */ 133 | public function GetWordInfos( $key, $type='word' ) 134 | { 135 | if( !$this->mainDicHand ) 136 | { 137 | $this->mainDicHand = fopen($this->mainDicFile, 'r'); 138 | } 139 | $p = 0; 140 | $keynum = $this->_get_index( $key ); 141 | if( isset($this->mainDicInfos[ $keynum ]) ) 142 | { 143 | $data = $this->mainDicInfos[ $keynum ]; 144 | } 145 | else 146 | { 147 | //rewind( $this->mainDicHand ); 148 | $move_pos = $keynum * 8; 149 | fseek($this->mainDicHand, $move_pos, SEEK_SET); 150 | $dat = fread($this->mainDicHand, 8); 151 | $arr = unpack('I1s/n1l/n1c', $dat); 152 | if( $arr['l'] == 0 ) 153 | { 154 | return false; 155 | } 156 | fseek($this->mainDicHand, $arr['s'], SEEK_SET); 157 | $data = @unserialize(fread($this->mainDicHand, $arr['l'])); 158 | $this->mainDicInfos[ $keynum ] = $data; 159 | } 160 | if( !is_array($data) || !isset($data[$key]) ) 161 | { 162 | return false; 163 | } 164 | return ($type=='word' ? $data[$key] : $data); 165 | } 166 | 167 | /** 168 | * 设置源字符串 169 | * @param $source 170 | * @param $source_charset 171 | * @param $target_charset 172 | * 173 | * @return bool 174 | */ 175 | public function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' ) 176 | { 177 | $this->sourceCharSet = strtolower($source_charset); 178 | $this->targetCharSet = strtolower($target_charset); 179 | $this->simpleResult = array(); 180 | $this->finallyResult = array(); 181 | $this->finallyIndex = array(); 182 | if( $source != '' ) 183 | { 184 | $rs = true; 185 | if( preg_match("/^utf/", $source_charset) ) { 186 | $this->sourceString = iconv('utf-8', UCS2, $source); 187 | } 188 | else if( preg_match("/^gb/", $source_charset) ) { 189 | $this->sourceString = iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source)); 190 | } 191 | else if( preg_match("/^big/", $source_charset) ) { 192 | $this->sourceString = iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source)); 193 | } 194 | else { 195 | $rs = false; 196 | } 197 | } 198 | else 199 | { 200 | $rs = false; 201 | } 202 | return $rs; 203 | } 204 | 205 | /** 206 | * 设置结果类型(只在获取finallyResult才有效) 207 | * @param $rstype 1 为全部, 2去除特殊符号 208 | * 209 | * @return void 210 | */ 211 | public function SetResultType( $rstype ) 212 | { 213 | $this->resultType = $rstype; 214 | } 215 | 216 | /** 217 | * 载入词典 218 | * 219 | * @return void 220 | */ 221 | public function LoadDict( $maindic='' ) 222 | { 223 | $startt = microtime(true); 224 | //正常读取文件 225 | $dicAddon = $this->addonDicFile; 226 | if($maindic=='' || !file_exists($maindic) ) 227 | { 228 | $dicWords = $this->mainDicFile ; 229 | } 230 | else 231 | { 232 | $dicWords = $maindic; 233 | $this->mainDicFile = $maindic; 234 | } 235 | 236 | //加载主词典(只打开) 237 | $this->mainDicHand = fopen($dicWords, 'r'); 238 | 239 | //载入副词典 240 | $hw = ''; 241 | $ds = file($dicAddon); 242 | foreach($ds as $d) 243 | { 244 | $d = trim($d); 245 | if($d=='') continue; 246 | $estr = substr($d, 1, 1); 247 | if( $estr==':' ) { 248 | $hw = substr($d, 0, 1); 249 | } 250 | else 251 | { 252 | $spstr = _SP_; 253 | $spstr = iconv(UCS2, 'utf-8', $spstr); 254 | $ws = explode(',', $d); 255 | $wall = iconv('utf-8', UCS2, join($spstr, $ws)); 256 | $ws = explode(_SP_, $wall); 257 | foreach($ws as $estr) 258 | { 259 | $this->addonDic[$hw][$estr] = strlen($estr); 260 | } 261 | } 262 | } 263 | $this->loadTime = microtime(true) - $startt; 264 | $this->isLoadDic = true; 265 | } 266 | 267 | /** 268 | * 检测某个词是否存在 269 | */ 270 | public 
function IsWord( $word ) 271 | { 272 | $winfos = $this->GetWordInfos( $word ); 273 | return ($winfos !== false); 274 | } 275 | 276 | /** 277 | * 获得某个词的词性及词频信息 278 | * @parem $word unicode编码的词 279 | * @return void 280 | */ 281 | public function GetWordProperty($word) 282 | { 283 | if( strlen($word)<4 ) 284 | { 285 | return '/s'; 286 | } 287 | $infos = $this->GetWordInfos($word); 288 | return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s"; 289 | } 290 | 291 | /** 292 | * 指定某词的词性信息(通常是新词) 293 | * @parem $word unicode编码的词 294 | * @parem $infos array('c' => 词频, 'm' => 词性); 295 | * @return void; 296 | */ 297 | public function SetWordInfos($word, $infos) 298 | { 299 | if( strlen($word)<4 ) 300 | { 301 | return ; 302 | } 303 | if( isset($this->mainDicInfos[$word]) ) 304 | { 305 | $this->newWords[$word]++; 306 | $this->mainDicInfos[$word]['c']++; 307 | } 308 | else 309 | { 310 | $this->newWords[$word] = 1; 311 | $this->mainDicInfos[$word] = $infos; 312 | } 313 | } 314 | 315 | /** 316 | * 开始执行分析 317 | * @parem bool optimize 是否对结果进行优化 318 | * @return bool 319 | */ 320 | public function StartAnalysis($optimize=true) 321 | { 322 | if( !$this->isLoadDic ) 323 | { 324 | $this->LoadDict(); 325 | } 326 | $this->simpleResult = $this->finallyResult = array(); 327 | $this->sourceString .= chr(0).chr(32); 328 | $slen = strlen($this->sourceString); 329 | $sbcArr = array(); 330 | $j = 0; 331 | //全角与半角字符对照表 332 | for($i=0xFF00; $i < 0xFF5F; $i++) 333 | { 334 | $scb = 0x20 + $j; 335 | $j++; 336 | $sbcArr[$i] = $scb; 337 | } 338 | //对字符串进行粗分 339 | $onstr = ''; 340 | $lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 341 | $s = 0; 342 | $ansiWordMatch = "[0-9a-z@#%\+\.-]"; 343 | $notNumberMatch = "[a-z@#%\+]"; 344 | for($i=0; $i < $slen; $i++) 345 | { 346 | $c = $this->sourceString[$i].$this->sourceString[++$i]; 347 | $cn = hexdec(bin2hex($c)); 348 | $cn = isset($sbcArr[$cn]) ? 
$sbcArr[$cn] : $cn; 349 | //ANSI字符 350 | if($cn < 0x80) 351 | { 352 | if( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) ) 353 | { 354 | if( $lastc != 2 && $onstr != '') { 355 | $this->simpleResult[$s]['w'] = $onstr; 356 | $this->simpleResult[$s]['t'] = $lastc; 357 | $this->_deep_analysis($onstr, $lastc, $s, $optimize); 358 | $s++; 359 | $onstr = ''; 360 | } 361 | $lastc = 2; 362 | $onstr .= chr(0).chr($cn); 363 | } 364 | else 365 | { 366 | if( $onstr != '' ) 367 | { 368 | $this->simpleResult[$s]['w'] = $onstr; 369 | if( $lastc==2 ) 370 | { 371 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4; 372 | } 373 | $this->simpleResult[$s]['t'] = $lastc; 374 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize); 375 | $s++; 376 | } 377 | $onstr = ''; 378 | $lastc = 3; 379 | if($cn < 31) 380 | { 381 | continue; 382 | } 383 | else 384 | { 385 | $this->simpleResult[$s]['w'] = chr(0).chr($cn); 386 | $this->simpleResult[$s]['t'] = 3; 387 | $s++; 388 | } 389 | } 390 | } 391 | //普通字符 392 | else 393 | { 394 | //正常文字 395 | if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D) 396 | || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) ) 397 | { 398 | if( $lastc != 1 && $onstr != '') 399 | { 400 | $this->simpleResult[$s]['w'] = $onstr; 401 | if( $lastc==2 ) 402 | { 403 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4; 404 | } 405 | $this->simpleResult[$s]['t'] = $lastc; 406 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize); 407 | $s++; 408 | $onstr = ''; 409 | } 410 | $lastc = 1; 411 | $onstr .= $c; 412 | } 413 | //特殊符号 414 | else 415 | { 416 | if( $onstr != '' ) 417 | { 418 | $this->simpleResult[$s]['w'] = $onstr; 419 | if( $lastc==2 ) 420 | { 421 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4; 422 | } 423 | $this->simpleResult[$s]['t'] = $lastc; 424 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize); 425 | $s++; 426 | } 427 | 428 | //检测书名 429 | if( $cn == 0x300A ) 430 | { 431 | $tmpw = ''; 432 | $n = 1; 433 | $isok = false; 434 | $ew = chr(0x30).chr(0x0B); 435 | while(true) 436 | { 437 | if( !isset($this->sourceString[$i+$n+1]) ) break; 438 | $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1]; 439 | if( $w == $ew ) 440 | { 441 | $this->simpleResult[$s]['w'] = $c; 442 | $this->simpleResult[$s]['t'] = 5; 443 | $s++; 444 | 445 | $this->simpleResult[$s]['w'] = $tmpw; 446 | $this->newWords[$tmpw] = 1; 447 | if( !isset($this->newWords[$tmpw]) ) 448 | { 449 | $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, '; 450 | $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb')); 451 | } 452 | $this->simpleResult[$s]['t'] = 13; 453 | 454 | $s++; 455 | 456 | //最大切分模式对书名继续分词 457 | if( $this->differMax ) 458 | { 459 | $this->simpleResult[$s]['w'] = $tmpw; 460 | $this->simpleResult[$s]['t'] = 21; 461 | $this->_deep_analysis($tmpw, $lastc, $s, $optimize); 462 | $s++; 463 | } 464 | 465 | $this->simpleResult[$s]['w'] = $ew; 466 | $this->simpleResult[$s]['t'] = 5; 467 | $s++; 468 | 469 | $i = $i + $n + 1; 470 | $isok = true; 471 | $onstr = ''; 472 | $lastc = 5; 473 | break; 474 | } 475 | else 476 | { 477 | $n = $n+2; 478 | $tmpw .= $w; 479 | if( strlen($tmpw) > 60 ) 480 | { 481 | break; 482 | } 483 | } 484 | }//while 485 | if( !$isok ) 486 | { 487 | $this->simpleResult[$s]['w'] = $c; 488 | $this->simpleResult[$s]['t'] = 5; 489 | $s++; 490 | $onstr = ''; 491 | $lastc = 5; 492 | } 493 | continue; 494 | } 495 | 
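// Any other wide character that reaches this point is not part of a word: reset the pending
// buffer, skip an ideographic space (U+3000), and record every remaining symbol as its own
// type-5 (unrecognised symbol) entry in the coarse result.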
496 | $onstr = ''; 497 | $lastc = 5; 498 | if( $cn==0x3000 ) 499 | { 500 | continue; 501 | } 502 | else 503 | { 504 | $this->simpleResult[$s]['w'] = $c; 505 | $this->simpleResult[$s]['t'] = 5; 506 | $s++; 507 | } 508 | }//2byte symbol 509 | 510 | }//end 2byte char 511 | 512 | }//end for 513 | 514 | //处理分词后的结果 515 | $this->_sort_finally_result(); 516 | } 517 | 518 | /** 519 | * 深入分词 520 | * @parem $str 521 | * @parem $ctype (2 英文类, 3 中/韩/日文类) 522 | * @parem $spos 当前粗分结果游标 523 | * @return bool 524 | */ 525 | private function _deep_analysis( &$str, $ctype, $spos, $optimize=true ) 526 | { 527 | 528 | //中文句子 529 | if( $ctype==1 ) 530 | { 531 | $slen = strlen($str); 532 | //小于系统配置分词要求长度的句子 533 | if( $slen < $this->notSplitLen ) 534 | { 535 | $tmpstr = ''; 536 | $lastType = 0; 537 | if( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t']; 538 | if($slen < 5) 539 | { 540 | //echo iconv(UCS2, 'utf-8', $str).'
'; 541 | if( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) ) 542 | { 543 | $str2 = ''; 544 | if( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) ) 545 | { 546 | $str2 = substr($str, 2, 2); 547 | $str = substr($str, 0, 2); 548 | } 549 | $ww = $this->simpleResult[$spos - 1]['w'].$str; 550 | $this->simpleResult[$spos - 1]['w'] = $ww; 551 | $this->simpleResult[$spos - 1]['t'] = 4; 552 | if( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) ) 553 | { 554 | $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, '; 555 | $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu')); 556 | } 557 | $this->simpleResult[$spos]['w'] = ''; 558 | if( $str2 != '' ) 559 | { 560 | $this->finallyResult[$spos-1][] = $ww; 561 | $this->finallyResult[$spos-1][] = $str2; 562 | } 563 | } 564 | else { 565 | $this->finallyResult[$spos][] = $str; 566 | } 567 | } 568 | else 569 | { 570 | $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize ); 571 | } 572 | } 573 | //正常长度的句子,循环进行分词处理 574 | else 575 | { 576 | $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize ); 577 | } 578 | } 579 | //英文句子,转为小写 580 | else 581 | { 582 | if( $this->toLower ) { 583 | $this->finallyResult[$spos][] = strtolower($str); 584 | } 585 | else { 586 | $this->finallyResult[$spos][] = $str; 587 | } 588 | } 589 | } 590 | 591 | /** 592 | * 中文的深入分词 593 | * @parem $str 594 | * @return void 595 | */ 596 | private function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=true ) 597 | { 598 | $quote1 = chr(0x20).chr(0x1C); 599 | $tmparr = array(); 600 | $hasw = 0; 601 | //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理。 602 | if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 ) 603 | { 604 | $tmparr[] = $str; 605 | if( !isset($this->newWords[$str]) ) 606 | { 607 | $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, '; 608 | $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq')); 609 | } 610 | if( !$this->differMax ) 611 | { 612 | $this->finallyResult[$spos][] = $str; 613 | return ; 614 | } 615 | } 616 | //进行切分 617 | for($i=$slen-1; $i > 0; $i -= 2) 618 | { 619 | //单个词 620 | $nc = $str[$i-1].$str[$i]; 621 | //是否已经到最后两个字 622 | if( $i <= 2 ) 623 | { 624 | $tmparr[] = $nc; 625 | $i = 0; 626 | break; 627 | } 628 | $isok = false; 629 | $i = $i + 1; 630 | for($k=$this->dicWordMax; $k>1; $k=$k-2) 631 | { 632 | if($i < $k) continue; 633 | $w = substr($str, $i-$k, $k); 634 | if( strlen($w) <= 2 ) 635 | { 636 | $i = $i - 1; 637 | break; 638 | } 639 | if( $this->IsWord( $w ) ) 640 | { 641 | $tmparr[] = $w; 642 | $i = $i - $k + 1; 643 | $isok = true; 644 | break; 645 | } 646 | } 647 | //echo '
'; 648 | //没适合词 649 | if(!$isok) $tmparr[] = $nc; 650 | } 651 | $wcount = count($tmparr); 652 | if( $wcount==0 ) return ; 653 | $this->finallyResult[$spos] = array_reverse($tmparr); 654 | //优化结果(岐义处理、新词、数词、人名识别等) 655 | if( $optimize ) 656 | { 657 | $this->_optimize_result( $this->finallyResult[$spos], $spos ); 658 | } 659 | } 660 | 661 | /** 662 | * 对最终分词结果进行优化(把simpleresult结果合并,并尝试新词识别、数词合并等) 663 | * @parem $optimize 是否优化合并的结果 664 | * @return bool 665 | */ 666 | //t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 667 | private function _optimize_result( &$smarr, $spos ) 668 | { 669 | $newarr = array(); 670 | $prePos = $spos - 1; 671 | $arlen = count($smarr); 672 | $i = $j = 0; 673 | //检测数量词 674 | if( $prePos > -1 && !isset($this->finallyResult[$prePos]) ) 675 | { 676 | $lastw = $this->simpleResult[$prePos]['w']; 677 | $lastt = $this->simpleResult[$prePos]['t']; 678 | if( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) ) 679 | { 680 | $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0]; 681 | $this->simpleResult[$prePos]['t'] = 4; 682 | if( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) ) 683 | { 684 | $this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, '; 685 | $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu')); 686 | } 687 | $smarr[0] = ''; 688 | $i++; 689 | } 690 | } 691 | for(; $i < $arlen; $i++) 692 | { 693 | 694 | if( !isset( $smarr[$i+1] ) ) 695 | { 696 | $newarr[$j] = $smarr[$i]; 697 | break; 698 | } 699 | $cw = $smarr[$i]; 700 | $nw = $smarr[$i+1]; 701 | $ischeck = false; 702 | //检测数量词 703 | if( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) ) 704 | { 705 | //最大切分时保留合并前的词 706 | if($this->differMax) 707 | { 708 | $newarr[$j] = chr(0).chr(0x28); 709 | $j++; 710 | $newarr[$j] = $cw; 711 | $j++; 712 | $newarr[$j] = $nw; 713 | $j++; 714 | $newarr[$j] = chr(0).chr(0x29); 715 | $j++; 716 | } 717 | $newarr[$j] = $cw.$nw; 718 | if( !isset($this->newWords[$newarr[$j]]) ) 719 | { 720 | $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, '; 721 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu')); 722 | } 723 | $j++; $i++; $ischeck = true; 724 | } 725 | //检测前导词(通常是姓) 726 | else if( isset( $this->addonDic['n'][ $smarr[$i] ] ) ) 727 | { 728 | $is_rs = false; 729 | //词语是副词或介词或频率很高的词不作为人名 730 | if( strlen($nw)==4 ) 731 | { 732 | $winfos = $this->GetWordInfos($nw); 733 | if(isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) ) 734 | { 735 | $is_rs = true; 736 | } 737 | } 738 | if( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs ) 739 | { 740 | $newarr[$j] = $cw.$nw; 741 | //echo iconv(UCS2, 'utf-8', $newarr[$j])."
"; 742 | //尝试检测第三个词 743 | if( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) ) 744 | { 745 | $newarr[$j] .= $smarr[$i+2]; 746 | $i++; 747 | } 748 | if( !isset($this->newWords[$newarr[$j]]) ) 749 | { 750 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr')); 751 | $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, '; 752 | } 753 | //为了防止错误,保留合并前的姓名 754 | if(strlen($nw)==4) 755 | { 756 | $j++; 757 | $newarr[$j] = chr(0).chr(0x28); 758 | $j++; 759 | $newarr[$j] = $cw; 760 | $j++; 761 | $newarr[$j] = $nw; 762 | $j++; 763 | $newarr[$j] = chr(0).chr(0x29); 764 | } 765 | 766 | $j++; $i++; $ischeck = true; 767 | } 768 | } 769 | //检测后缀词(地名等) 770 | else if( isset($this->addonDic['a'][$nw]) ) 771 | { 772 | $is_rs = false; 773 | //词语是副词或介词不作为前缀 774 | if( strlen($cw)>2 ) 775 | { 776 | $winfos = $this->GetWordInfos($cw); 777 | if(isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) ) 778 | { 779 | $is_rs = true; 780 | } 781 | } 782 | if( !isset($this->addonDic['s'][$cw]) && !$is_rs ) 783 | { 784 | $newarr[$j] = $cw.$nw; 785 | if( !isset($this->newWords[$newarr[$j]]) ) 786 | { 787 | $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, '; 788 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na')); 789 | } 790 | $i++; $j++; $ischeck = true; 791 | } 792 | } 793 | //新词识别(暂无规则) 794 | else if($this->unitWord) 795 | { 796 | if(strlen($cw)==2 && strlen($nw)==2 797 | && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw]) 798 | && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) 799 | { 800 | $newarr[$j] = $cw.$nw; 801 | //尝试检测第三个词 802 | if( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) ) 803 | { 804 | $newarr[$j] .= $smarr[$i+2]; 805 | $i++; 806 | } 807 | if( !isset($this->newWords[$newarr[$j]]) ) 808 | { 809 | $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, '; 810 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms')); 811 | } 812 | $i++; $j++; $ischeck = true; 813 | } 814 | } 815 | 816 | //不符合规则 817 | if( !$ischeck ) 818 | { 819 | $newarr[$j] = $cw; 820 | //二元消岐处理——最大切分模式 821 | if( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) 822 | { 823 | $slen = strlen($nw); 824 | $hasDiff = false; 825 | for($y=2; $y <= $slen-2; $y=$y+2) 826 | { 827 | $nhead = substr($nw, $y-2, 2); 828 | $nfont = $cw.substr($nw, 0, $y-2); 829 | if( $this->IsWord( $nfont.$nhead ) ) 830 | { 831 | if( strlen($cw) > 2 ) $j++; 832 | $hasDiff = true; 833 | $newarr[$j] = $nfont.$nhead; 834 | } 835 | } 836 | } 837 | $j++; 838 | } 839 | 840 | }//end for 841 | $smarr = $newarr; 842 | } 843 | 844 | /** 845 | * 转换最终分词结果到 finallyResult 数组 846 | * @return void 847 | */ 848 | private function _sort_finally_result() 849 | { 850 | $newarr = array(); 851 | $i = 0; 852 | foreach($this->simpleResult as $k=>$v) 853 | { 854 | if( empty($v['w']) ) continue; 855 | if( isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0 ) 856 | { 857 | foreach($this->finallyResult[$k] as $w) 858 | { 859 | if(!empty($w)) 860 | { 861 | $newarr[$i]['w'] = $w; 862 | $newarr[$i]['t'] = 20; 863 | $i++; 864 | } 865 | } 866 | } 867 | else if($v['t'] != 21) 868 | { 869 | $newarr[$i]['w'] = $v['w']; 870 | $newarr[$i]['t'] = $v['t']; 871 | $i++; 872 | } 873 | } 874 
| $this->finallyResult = $newarr; 875 | $newarr = ''; 876 | } 877 | 878 | /** 879 | * 把uncode字符串转换为输出字符串 880 | * @parem str 881 | * return string 882 | */ 883 | private function _out_string_encoding( &$str ) 884 | { 885 | $rsc = $this->_source_result_charset(); 886 | if( $rsc==1 ) { 887 | $rsstr = iconv(UCS2, 'utf-8', $str); 888 | } 889 | else if( $rsc==2 ) { 890 | $rsstr = iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str) ); 891 | } 892 | else{ 893 | $rsstr = iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str) ); 894 | } 895 | return $rsstr; 896 | } 897 | 898 | /** 899 | * 获取最终结果字符串(用空格分开后的分词结果) 900 | * @return string 901 | */ 902 | public function GetFinallyResult($spword=' ', $word_meanings=false) 903 | { 904 | $rsstr = ''; 905 | foreach($this->finallyResult as $v) 906 | { 907 | if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) ) 908 | { 909 | continue; 910 | } 911 | $m = ''; 912 | if( $word_meanings ) 913 | { 914 | $m = $this->GetWordProperty($v['w']); 915 | } 916 | $w = $this->_out_string_encoding($v['w']); 917 | if( $w != ' ' ) 918 | { 919 | if($word_meanings) { 920 | $rsstr .= $spword.$w.$m; 921 | } 922 | else { 923 | $rsstr .= $spword.$w; 924 | } 925 | } 926 | } 927 | return $rsstr; 928 | } 929 | 930 | /** 931 | * 获取粗分结果,不包含粗分属性 932 | * @return array() 933 | */ 934 | public function GetSimpleResult() 935 | { 936 | $rearr = array(); 937 | foreach($this->simpleResult as $k=>$v) 938 | { 939 | if( empty($v['w']) ) continue; 940 | $w = $this->_out_string_encoding($v['w']); 941 | if( $w != ' ' ) $rearr[] = $w; 942 | } 943 | return $rearr; 944 | } 945 | 946 | /** 947 | * 获取粗分结果,包含粗分属性(1中文词句、2 ANSI词汇(包括全角),3 ANSI标点符号(包括全角),4数字(包括全角),5 中文标点或无法识别字符) 948 | * @return array() 949 | */ 950 | public function GetSimpleResultAll() 951 | { 952 | $rearr = array(); 953 | foreach($this->simpleResult as $k=>$v) 954 | { 955 | $w = $this->_out_string_encoding($v['w']); 956 | if( $w != ' ' ) 957 | { 958 | $rearr[$k]['w'] = $w; 959 | $rearr[$k]['t'] = $v['t']; 960 | } 961 | } 962 | return $rearr; 963 | } 964 | 965 | /** 966 | * 获取索引hash数组 967 | * @return array('word'=>count,...) 968 | */ 969 | public function GetFinallyIndex() 970 | { 971 | $rearr = array(); 972 | foreach($this->finallyResult as $v) 973 | { 974 | if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) ) 975 | { 976 | continue; 977 | } 978 | $w = $this->_out_string_encoding($v['w']); 979 | if( $w == ' ' ) 980 | { 981 | continue; 982 | } 983 | if( isset($rearr[$w]) ) 984 | { 985 | $rearr[$w]++; 986 | } 987 | else 988 | { 989 | $rearr[$w] = 1; 990 | } 991 | } 992 | arsort( $rearr ); 993 | return $rearr; 994 | } 995 | 996 | /** 997 | * 获取最终关键字(返回用 "," 间隔的关键字) 998 | * @return string 999 | */ 1000 | public function GetFinallyKeywords( $num = 10 ) 1001 | { 1002 | $n = 0; 1003 | $arr = $this->GetFinallyIndex(); 1004 | $okstr = ''; 1005 | foreach( $arr as $k => $v ) 1006 | { 1007 | //排除长度为1的词 1008 | if( strlen($k)==1 ) { 1009 | continue; 1010 | } 1011 | //排除长度为2的非英文词 1012 | elseif( strlen($k)==2 && preg_match('/[^0-9a-zA-Z]/', $k) ) { 1013 | continue; 1014 | 1015 | } 1016 | //排除单个中文字 1017 | elseif( strlen($k) < 4 && !preg_match('/[a-zA-Z]/', $k)) { 1018 | continue; 1019 | } 1020 | $okstr .= ($okstr=='' ? 
$k : ','.$k); 1021 | $n++; 1022 | if( $n > $num ) break; 1023 | } 1024 | return $okstr; 1025 | } 1026 | 1027 | /** 1028 | * 获得保存目标编码 1029 | * @return int 1030 | */ 1031 | private function _source_result_charset() 1032 | { 1033 | if( preg_match("/^utf/", $this->targetCharSet) ) { 1034 | $rs = 1; 1035 | } 1036 | else if( preg_match("/^gb/", $this->targetCharSet) ) { 1037 | $rs = 2; 1038 | } 1039 | else if( preg_match("/^big/", $this->targetCharSet) ) { 1040 | $rs = 3; 1041 | } 1042 | else { 1043 | $rs = 4; 1044 | } 1045 | return $rs; 1046 | } 1047 | 1048 | /** 1049 | * 编译词典 1050 | * @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt> 1051 | * 注意, 需要PHP开放足够的内存才能完成操作 1052 | * @return void 1053 | */ 1054 | public function MakeDict( $source_file, $target_file='' ) 1055 | { 1056 | $target_file = ($target_file=='' ? $this->mainDicFile : $target_file); 1057 | $allk = array(); 1058 | $fp = fopen($source_file, 'r'); 1059 | while( $line = fgets($fp, 512) ) 1060 | { 1061 | if( $line[0]=='@' ) continue; 1062 | list($w, $r, $a) = explode(',', $line); 1063 | $a = trim( $a ); 1064 | $w = iconv('utf-8', UCS2, $w); 1065 | $k = $this->_get_index( $w ); 1066 | if( isset($allk[ $k ]) ) 1067 | $allk[ $k ][ $w ] = array($r, $a); 1068 | else 1069 | $allk[ $k ][ $w ] = array($r, $a); 1070 | } 1071 | fclose( $fp ); 1072 | $fp = fopen($target_file, 'w'); 1073 | $heade_rarr = array(); 1074 | $alldat = ''; 1075 | $start_pos = $this->mask_value * 8; 1076 | foreach( $allk as $k => $v ) 1077 | { 1078 | $dat = serialize( $v ); 1079 | $dlen = strlen($dat); 1080 | $alldat .= $dat; 1081 | 1082 | $heade_rarr[ $k ][0] = $start_pos; 1083 | $heade_rarr[ $k ][1] = $dlen; 1084 | $heade_rarr[ $k ][2] = count( $v ); 1085 | 1086 | $start_pos += $dlen; 1087 | } 1088 | unset( $allk ); 1089 | for($i=0; $i < $this->mask_value; $i++) 1090 | { 1091 | if( !isset($heade_rarr[$i]) ) 1092 | { 1093 | $heade_rarr[$i] = array(0, 0, 0); 1094 | } 1095 | fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2])); 1096 | } 1097 | fwrite( $fp, $alldat); 1098 | fclose( $fp ); 1099 | } 1100 | 1101 | /** 1102 | * 导出词典的词条 1103 | * @parem $targetfile 保存位置 1104 | * @return void 1105 | */ 1106 | public function ExportDict( $targetfile ) 1107 | { 1108 | if( !$this->mainDicHand ) 1109 | { 1110 | $this->mainDicHand = fopen($this->mainDicFile, 'r'); 1111 | } 1112 | $fp = fopen($targetfile, 'w'); 1113 | for($i=0; $i <= $this->mask_value; $i++) 1114 | { 1115 | $move_pos = $i * 8; 1116 | fseek($this->mainDicHand, $move_pos, SEEK_SET); 1117 | $dat = fread($this->mainDicHand, 8); 1118 | $arr = unpack('I1s/n1l/n1c', $dat); 1119 | if( $arr['l'] == 0 ) 1120 | { 1121 | continue; 1122 | } 1123 | fseek($this->mainDicHand, $arr['s'], SEEK_SET); 1124 | $data = @unserialize(fread($this->mainDicHand, $arr['l'])); 1125 | if( !is_array($data) ) continue; 1126 | foreach($data as $k => $v) 1127 | { 1128 | $w = iconv(UCS2, 'utf-8', $k); 1129 | fwrite($fp, "{$w},{$v[0]},{$v[1]}\n"); 1130 | } 1131 | } 1132 | fclose( $fp ); 1133 | return true; 1134 | } 1135 | } 1136 | 1137 | ?> 1138 | --------------------------------------------------------------------------------
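Usage note: the calls visible in Analysis.php and dict_build.php reduce to a short pipeline: construct PhpAnalysis, load the dictionaries, set the source text, run StartAnalysis(), then read the result. A minimal sketch of that pipeline, assuming the script sits next to the WordAnalysis directory shown in the tree above; the sample sentence is a placeholder.

<?php
require_once __DIR__ . '/WordAnalysis/phpanalysis.class.php';

// Build the object without loading the dictionary in the constructor,
// mirroring the $loadInit = false pattern used in dict_build.php.
PhpAnalysis::$loadInit = false;
$pa = new PhpAnalysis('utf-8', 'utf-8', false);   // source charset, target charset, load full dict into memory?
$pa->LoadDict();                                  // opens dict/base_dic_full.dic and parses dict/words_addons.dic

$pa->SetSource('这里是一段示例文本,用于测试PHP中文分词。');  // placeholder sample text
$pa->StartAnalysis(true);                         // true = run the result optimizer (numbers, names, new words)

echo $pa->GetFinallyResult(' ', true), "\n";      // space-separated words with /pos tags appended
echo $pa->GetFinallyKeywords(5), "\n";            // comma-separated top keywords, as used in Analysis.php

Because the class resolves $addonDicFile and $mainDicFile relative to dirname(__FILE__), the dictionaries are found no matter which working directory the script runs from.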
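The main dictionary written by MakeDict() is a fixed-size hash table: 0xFFFF slots of 8 bytes at the front of base_dic_full.dic, each packed as 'Inn' (data offset, byte length, word count), followed by the serialize()d bucket arrays keyed by the UCS-2BE form of each word. GetWordInfos() hashes a word with _get_index() (a DJB-style shift-add/XOR hash reduced modulo $mask_value), seeks to slot*8 and unserializes the bucket it points to. Below is a standalone sketch of that lookup, independent of the class; the path and word in the example call are placeholders.

<?php
// Look up one UTF-8 word in base_dic_full.dic the same way PhpAnalysis::GetWordInfos() does.
function dic_lookup($dicFile, $utf8Word)
{
    $key = iconv('utf-8', 'ucs-2be', $utf8Word);    // dictionary keys are UCS-2BE strings

    // Same hash as PhpAnalysis::_get_index(): walk the bytes from last to first.
    $h = 0x238f13af;
    $l = strlen($key);
    while ($l--) {
        $h += ($h << 5);
        $h ^= ord($key[$l]);
        $h &= 0x7fffffff;
    }
    $bucket = $h % 0xFFFF;                          // $mask_value in the class

    $fp = fopen($dicFile, 'r');
    fseek($fp, $bucket * 8, SEEK_SET);              // fixed 8-byte header slot per bucket
    $hdr = unpack('I1s/n1l/n1c', fread($fp, 8));    // s = data offset, l = byte length, c = word count
    if ($hdr['l'] == 0) {
        fclose($fp);
        return false;                               // empty bucket: word is not in the dictionary
    }

    fseek($fp, $hdr['s'], SEEK_SET);
    $data = @unserialize(fread($fp, $hdr['l']));
    fclose($fp);

    // A bucket maps ucs2_word => array(frequency, part-of-speech tag).
    return (is_array($data) && isset($data[$key])) ? $data[$key] : false;
}

var_dump(dic_lookup(__DIR__ . '/WordAnalysis/dict/base_dic_full.dic', '分词'));  // example path and word

Note that dict/readme.txt still shows an older three-argument call, MakeDict(sourcefile, 16, 'dict/base_dic_full.dic'); the class in this tree takes MakeDict($source_file, $target_file = '') and falls back to $mainDicFile when no target is given.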
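words_addons.dic (documented in dict/readme.txt) is plain text: a line whose second byte is ':' (s:, u:, n:, a:, c:, t:) opens a word class, and the comma-separated lines that follow list that class's words (stop words, unit suffixes, leading surnames, place and organisation suffixes, numeral prefixes, province abbreviations). LoadDict() stores them UCS-2-encoded in $addonDic[class][word]. The sketch below is a simplified reader that keeps the words in UTF-8 instead, useful for inspecting the file outside the class.

<?php
// Parse words_addons.dic into array(class => array(utf8_word => true)).
function load_addon_dic($file)
{
    $dic  = array();
    $type = '';
    foreach (file($file) as $line) {
        $line = trim($line);
        if ($line === '') {
            continue;
        }
        if (substr($line, 1, 1) === ':') {          // e.g. "s:停止词" opens the stop-word section
            $type = substr($line, 0, 1);
            continue;
        }
        foreach (explode(',', $line) as $word) {
            if ($word !== '') {
                $dic[$type][$word] = true;
            }
        }
    }
    return $dic;
}

$addon = load_addon_dic(__DIR__ . '/WordAnalysis/dict/words_addons.dic');  // example path
echo isset($addon['s']['的']) ? "stop word\n" : "not a stop word\n";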