├── README.md └── WordAnalysis ├── Analysis.php ├── dict ├── base_dic_full.dic ├── not-build │ └── base_dic_full.txt ├── readme.txt └── words_addons.dic ├── dict_build.php └── phpanalysis.class.php /README.md: -------------------------------------------------------------------------------- 1 | # PHPAnalysis 2 | PHP中分分词,使用PHPAnalysis 3 | 示例请移步:https://feifei.blog.csdn.net/article/details/99717174 -------------------------------------------------------------------------------- /WordAnalysis/Analysis.php: -------------------------------------------------------------------------------- 1 | LoadDict (); 35 | $pa->SetSource ($content); 36 | $pa->StartAnalysis ( true ); 37 | 38 | $tags = $pa->GetFinallyKeywords ($num); // 获取文章中的n个关键字 39 | return $tags;//返回关键字 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /WordAnalysis/dict/base_dic_full.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feixuekeji/PHPAnalysis/eafb8c6e61f5d1a3d2d168dc3e42052ac4fa397f/WordAnalysis/dict/base_dic_full.dic -------------------------------------------------------------------------------- /WordAnalysis/dict/readme.txt: -------------------------------------------------------------------------------- 1 | 文件说明: 2 | 3 | 1、base_dic_full.dic 4 | hash索引 -- 字典带有词频和词性标志。 5 | 6 | 2、words_addons.dic 7 | s 开头的表示停止词 u 后缀词(地名后缀、数学单位等) n 前导词(姓、汉字数词等) a 后导词(地区,部门等) 8 | 9 | 3、 not-build/base_dic_full.txt 10 | 没编译过的词典源码 11 | 12 | 4、重新编译词典的方法: 13 | 14 | MakeDict( sourcefile, 16 , 'dict/base_dic_full.dic'); 22 | 23 | echo "OK"; 24 | 25 | ?> -------------------------------------------------------------------------------- /WordAnalysis/dict/words_addons.dic: -------------------------------------------------------------------------------- 1 | s:停止词 2 | 并,让,才,上,被,把,近,而,是,为,由,等,合,子,除,均,很,也,称,还,分,据,后,向,经,对,但,只,则,设,靠,至,到,将,及,与,或,来,了,从,说,就,的,和,在,方,以,已,有,都,给,要 3 | 4 | n:姓或其它专用前缀词 5 | 新,肖,胡,罗,程,施,满,石,秦,苏,范,包,袁,许,舒,薛,蒋,董,白,田,季,丁,汪,段,梁,林,杜,杨,毛,江,熊,王,潘,沈,汤,谢,谭,韩,顾,雷,陈,阎,陆,马,高,龙,龚,黎,黄,魏,钱,钟,赵,邓,赖,贾,贺,邱,邵,郭,金,郝,郑,邹,李,武,余,夏,唐,朱,何,姚,孟,孙,孔,姜,周,吴,卢,单,刘,冯,史,叶,吕,候,傅,宋,任,文,戴,徐,张,万,方,曾,曹,易,廖,彭,常,尹,乔,于,康,崔,布,钟离,令狐,公冶,公孙,闻人,鲜于,上官,仲孙,万俟,东方,闾丘,长孙,诸葛,申屠,皇甫,尉迟,濮阳,澹台,欧阳,慕容,淳于,宗政,宇文,司徒,轩辕,单于,赫连,司空,太叔,夏侯,司马,公羊,勿,成吉,埃,哈 6 | 7 | u:单位或专用后缀词 8 | u‰,℃,℉,毛,段,步,毫,池,滴,派,洲,款,次,桩,档,桌,桶,梯,楼,棵,炮,点,盏,盆,界,盒,盘,眼,画,男,环,版,片,班,瓣,生,瓶,案,格,族,方,斤,日,时,期,月,曲,斗,文,指,拳,拨,掌,排,丈,撮,本,朵,栋,柜,柄,栏,株,根,样,架,枪,条,束,村,杯,枝,枚,石,码,辈,辆,轮,连,通,里,部,遍,转,车,言,角,袋,课,起,路,趟,重,针,项,顷,顶,顿,颗,首,餐,页,集,锅,钱,钟,门,间,隅,队,行,节,筐,笔,筒,箱,篮,篓,篇,章,站,磅,碟,碗,种,科,窝,秒,簇,米,脚,股,群,船,艇,色,艘,罐,级,粒,类,组,维,缸,缕,招,支,发,双,厘,口,句,台,只,厅,卷,包,勺,匙,匹,升,区,叶,号,地,圈,圆,场,块,堆,坪,团,回,吨,名,拍,员,周,副,剑,代,付,件,伏,份,人,亩,世,下,两,个,串,伙,位,划,分,列,则,剂,刻,刀,出,倍,例,元,克,册,具,声,听,幅,帧,房,批,师,岁,尾,尺,局,层,届,手,壶,成,张,截,户,扇,年,度,座,尊,幢,室,寸,头,宗,字,孔,所,女,套,拉,家,处,折,天,把,夜,担,號,个月,公斤,公分,公克,公担,公亩,公升,公尺,像素,月份,盎司,位数,公里,年级,点钟,克拉,英亩,平方,加仑,公顷,秒钟,千克,世纪,千米,分钟,海里,英寸,英尺,英里,年代,周年,小时,阶段,平米,立方米,立方码,平方米,平方码,平方厘米,立方英寸,立方厘米,立方分米,立方公尺,立方英尺,平方公尺,平方英尺,平方英寸,平方分米,平方公里,平方英里,百位,十位,百次,千次,千名,千亩,千里,千人,千台,千位,万次,万元,万里,万位,万件,万单,万个,万台,万名,万人,亿元,亿,万,千,萬 9 | 10 | a:地名等后置词 11 | 语,署,苑,街,省,湖,乡,海,观,路,娃,山,阁,部,镇,江,河,厅,郡,厂,楼,园,区,党,井,亭,塔,县,家,市,弄,巷,寺,局,中路,村委,诺夫,斯基,维奇,村委会,机,型,率 12 | 13 | c:数量前缀词 14 | 零,一,二,三,四,五,六,七,八,九,十,百,千,万,亿,第,半,几,俩,卅,两,壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,伯,仟 15 | 16 | t:省会等专用词 17 | 京,津,沪,渝,冀,豫,云,辽,黑,湘,皖,鲁,新,苏,浙,赣,鄂,桂,甘,晋,蒙,陕,吉,闽,贵,粤,青,藏,川,宁,琼 -------------------------------------------------------------------------------- /WordAnalysis/dict_build.php: 
-------------------------------------------------------------------------------- 1 | 请选择要进行的操作:
"; 12 | echo "1、用原始文件(dict/not-build/base_dic_full.txt)生成一个标准词典;
"; 13 | echo "2、从默认词典(dict/base_dic_full.dic),反编译出原始文件。"; 14 | exit(); 15 | } 16 | 17 | if( $_GET['ac']=='make' ) 18 | { 19 | PhpAnalysis::$loadInit = false; 20 | $pa = new PhpAnalysis('utf-8', 'utf-8', false); 21 | $pa->MakeDict( $dicAddon ); 22 | echo "完成词典创建!"; 23 | exit(); 24 | } 25 | else 26 | { 27 | $pa = new PhpAnalysis('utf-8', 'utf-8', true); 28 | $pa->ExportDict('base_dic_source.txt'); 29 | echo "完成反编译词典文件,生成的文件为:base_dic_source.txt !"; 30 | exit(); 31 | } 32 | ?> -------------------------------------------------------------------------------- /WordAnalysis/phpanalysis.class.php: -------------------------------------------------------------------------------- 1 | StartAnalysis -> Get***Result 7 | * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作 8 | * 9 | * Copyright IT柏拉图 QQ: 2500875 Email: 2500875#qq.com 10 | * 11 | * @version 2.0 12 | * 13 | */ 14 | 15 | //常量定义 16 | define('_SP_', chr(0xFF).chr(0xFE)); 17 | define('UCS2', 'ucs-2be'); 18 | class PhpAnalysis 19 | { 20 | 21 | //hash算法选项 22 | public $mask_value = 0xFFFF; 23 | 24 | //输入和输出的字符编码(只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型) 25 | public $sourceCharSet = 'utf-8'; 26 | public $targetCharSet = 'utf-8'; 27 | 28 | //生成的分词结果数据类型 1 为全部, 2为 词典词汇及单个中日韩简繁字符及英文, 3 为词典词汇及英文 29 | public $resultType = 1; 30 | 31 | //句子长度小于这个数值时不拆分,notSplitLen = n(个汉字) * 2 + 1 32 | public $notSplitLen = 5; 33 | 34 | //把英文单词全部转小写 35 | public $toLower = false; 36 | 37 | //使用最大切分模式对二元词进行消岐 38 | public $differMax = false; 39 | 40 | //尝试合并单字 41 | public $unitWord = true; 42 | 43 | //初始化类时直接加载词典 44 | public static $loadInit = true; 45 | 46 | //使用热门词优先模式进行消岐 47 | public $differFreq = false; 48 | 49 | //被转换为unicode的源字符串 50 | private $sourceString = ''; 51 | 52 | //附加词典 53 | public $addonDic = array(); 54 | public $addonDicFile = 'dict/words_addons.dic'; 55 | 56 | //主词典 57 | public $dicStr = ''; 58 | public $mainDic = array(); 59 | public $mainDicHand = false; 60 | public $mainDicInfos = array(); 61 | public $mainDicFile = 'dict/base_dic_full.dic'; 62 | //是否直接载入词典(选是载入速度较慢,但解析较快;选否载入较快,但解析较慢,需要时才会载入特定的词条) 63 | private $isLoadAll = false; 64 | 65 | //主词典词语最大长度 x / 2 66 | private $dicWordMax = 14; 67 | //粗分后的数组(通常是截取句子等用途) 68 | private $simpleResult = array(); 69 | //最终结果(用空格分开的词汇列表) 70 | private $finallyResult = ''; 71 | 72 | //是否已经载入词典 73 | public $isLoadDic = false; 74 | //系统识别或合并的新词 75 | public $newWords = array(); 76 | public $foundWordStr = ''; 77 | //词库载入时间 78 | public $loadTime = 0; 79 | 80 | /** 81 | * 构造函数 82 | * @param $source_charset 83 | * @param $target_charset 84 | * @param $load_alldic 85 | * @param $source 86 | * 87 | * @return void 88 | */ 89 | public function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=true, $source='') 90 | { 91 | $this->addonDicFile = dirname(__FILE__).'/'.$this->addonDicFile; 92 | $this->mainDicFile = dirname(__FILE__).'/'.$this->mainDicFile; 93 | $this->SetSource( $source, $source_charset, $target_charset ); 94 | $this->isLoadAll = $load_all; 95 | if(self::$loadInit) $this->LoadDict(); 96 | } 97 | 98 | /** 99 | * 析构函数 100 | */ 101 | function __destruct() 102 | { 103 | if( $this->mainDicHand !== false ) 104 | { 105 | @fclose( $this->mainDicHand ); 106 | } 107 | } 108 | 109 | /** 110 | * 根据字符串计算key索引 111 | * @param $key 112 | * @return short int 113 | */ 114 | private function _get_index( $key ) 115 | { 116 | $l = strlen($key); 117 | $h = 0x238f13af; 118 | while ($l--) 119 | { 120 | $h += ($h << 5); 121 | $h ^= ord($key[$l]); 122 | $h &= 0x7fffffff; 123 | } 124 | return ($h % $this->mask_value); 125 | } 126 | 127 | 
/** 128 | * 从文件获得词 129 | * @param $key 130 | * @param $type (类型 word 或 key_groups) 131 | * @return short int 132 | */ 133 | public function GetWordInfos( $key, $type='word' ) 134 | { 135 | if( !$this->mainDicHand ) 136 | { 137 | $this->mainDicHand = fopen($this->mainDicFile, 'r'); 138 | } 139 | $p = 0; 140 | $keynum = $this->_get_index( $key ); 141 | if( isset($this->mainDicInfos[ $keynum ]) ) 142 | { 143 | $data = $this->mainDicInfos[ $keynum ]; 144 | } 145 | else 146 | { 147 | //rewind( $this->mainDicHand ); 148 | $move_pos = $keynum * 8; 149 | fseek($this->mainDicHand, $move_pos, SEEK_SET); 150 | $dat = fread($this->mainDicHand, 8); 151 | $arr = unpack('I1s/n1l/n1c', $dat); 152 | if( $arr['l'] == 0 ) 153 | { 154 | return false; 155 | } 156 | fseek($this->mainDicHand, $arr['s'], SEEK_SET); 157 | $data = @unserialize(fread($this->mainDicHand, $arr['l'])); 158 | $this->mainDicInfos[ $keynum ] = $data; 159 | } 160 | if( !is_array($data) || !isset($data[$key]) ) 161 | { 162 | return false; 163 | } 164 | return ($type=='word' ? $data[$key] : $data); 165 | } 166 | 167 | /** 168 | * 设置源字符串 169 | * @param $source 170 | * @param $source_charset 171 | * @param $target_charset 172 | * 173 | * @return bool 174 | */ 175 | public function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' ) 176 | { 177 | $this->sourceCharSet = strtolower($source_charset); 178 | $this->targetCharSet = strtolower($target_charset); 179 | $this->simpleResult = array(); 180 | $this->finallyResult = array(); 181 | $this->finallyIndex = array(); 182 | if( $source != '' ) 183 | { 184 | $rs = true; 185 | if( preg_match("/^utf/", $source_charset) ) { 186 | $this->sourceString = iconv('utf-8', UCS2, $source); 187 | } 188 | else if( preg_match("/^gb/", $source_charset) ) { 189 | $this->sourceString = iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source)); 190 | } 191 | else if( preg_match("/^big/", $source_charset) ) { 192 | $this->sourceString = iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source)); 193 | } 194 | else { 195 | $rs = false; 196 | } 197 | } 198 | else 199 | { 200 | $rs = false; 201 | } 202 | return $rs; 203 | } 204 | 205 | /** 206 | * 设置结果类型(只在获取finallyResult才有效) 207 | * @param $rstype 1 为全部, 2去除特殊符号 208 | * 209 | * @return void 210 | */ 211 | public function SetResultType( $rstype ) 212 | { 213 | $this->resultType = $rstype; 214 | } 215 | 216 | /** 217 | * 载入词典 218 | * 219 | * @return void 220 | */ 221 | public function LoadDict( $maindic='' ) 222 | { 223 | $startt = microtime(true); 224 | //正常读取文件 225 | $dicAddon = $this->addonDicFile; 226 | if($maindic=='' || !file_exists($maindic) ) 227 | { 228 | $dicWords = $this->mainDicFile ; 229 | } 230 | else 231 | { 232 | $dicWords = $maindic; 233 | $this->mainDicFile = $maindic; 234 | } 235 | 236 | //加载主词典(只打开) 237 | $this->mainDicHand = fopen($dicWords, 'r'); 238 | 239 | //载入副词典 240 | $hw = ''; 241 | $ds = file($dicAddon); 242 | foreach($ds as $d) 243 | { 244 | $d = trim($d); 245 | if($d=='') continue; 246 | $estr = substr($d, 1, 1); 247 | if( $estr==':' ) { 248 | $hw = substr($d, 0, 1); 249 | } 250 | else 251 | { 252 | $spstr = _SP_; 253 | $spstr = iconv(UCS2, 'utf-8', $spstr); 254 | $ws = explode(',', $d); 255 | $wall = iconv('utf-8', UCS2, join($spstr, $ws)); 256 | $ws = explode(_SP_, $wall); 257 | foreach($ws as $estr) 258 | { 259 | $this->addonDic[$hw][$estr] = strlen($estr); 260 | } 261 | } 262 | } 263 | $this->loadTime = microtime(true) - $startt; 264 | $this->isLoadDic = true; 265 | } 266 | 267 | /** 268 | * 检测某个词是否存在 269 | */ 270 | public 
function IsWord( $word ) 271 | { 272 | $winfos = $this->GetWordInfos( $word ); 273 | return ($winfos !== false); 274 | } 275 | 276 | /** 277 | * 获得某个词的词性及词频信息 278 | * @parem $word unicode编码的词 279 | * @return void 280 | */ 281 | public function GetWordProperty($word) 282 | { 283 | if( strlen($word)<4 ) 284 | { 285 | return '/s'; 286 | } 287 | $infos = $this->GetWordInfos($word); 288 | return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s"; 289 | } 290 | 291 | /** 292 | * 指定某词的词性信息(通常是新词) 293 | * @parem $word unicode编码的词 294 | * @parem $infos array('c' => 词频, 'm' => 词性); 295 | * @return void; 296 | */ 297 | public function SetWordInfos($word, $infos) 298 | { 299 | if( strlen($word)<4 ) 300 | { 301 | return ; 302 | } 303 | if( isset($this->mainDicInfos[$word]) ) 304 | { 305 | $this->newWords[$word]++; 306 | $this->mainDicInfos[$word]['c']++; 307 | } 308 | else 309 | { 310 | $this->newWords[$word] = 1; 311 | $this->mainDicInfos[$word] = $infos; 312 | } 313 | } 314 | 315 | /** 316 | * 开始执行分析 317 | * @parem bool optimize 是否对结果进行优化 318 | * @return bool 319 | */ 320 | public function StartAnalysis($optimize=true) 321 | { 322 | if( !$this->isLoadDic ) 323 | { 324 | $this->LoadDict(); 325 | } 326 | $this->simpleResult = $this->finallyResult = array(); 327 | $this->sourceString .= chr(0).chr(32); 328 | $slen = strlen($this->sourceString); 329 | $sbcArr = array(); 330 | $j = 0; 331 | //全角与半角字符对照表 332 | for($i=0xFF00; $i < 0xFF5F; $i++) 333 | { 334 | $scb = 0x20 + $j; 335 | $j++; 336 | $sbcArr[$i] = $scb; 337 | } 338 | //对字符串进行粗分 339 | $onstr = ''; 340 | $lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 341 | $s = 0; 342 | $ansiWordMatch = "[0-9a-z@#%\+\.-]"; 343 | $notNumberMatch = "[a-z@#%\+]"; 344 | for($i=0; $i < $slen; $i++) 345 | { 346 | $c = $this->sourceString[$i].$this->sourceString[++$i]; 347 | $cn = hexdec(bin2hex($c)); 348 | $cn = isset($sbcArr[$cn]) ? 
$sbcArr[$cn] : $cn; 349 | //ANSI字符 350 | if($cn < 0x80) 351 | { 352 | if( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) ) 353 | { 354 | if( $lastc != 2 && $onstr != '') { 355 | $this->simpleResult[$s]['w'] = $onstr; 356 | $this->simpleResult[$s]['t'] = $lastc; 357 | $this->_deep_analysis($onstr, $lastc, $s, $optimize); 358 | $s++; 359 | $onstr = ''; 360 | } 361 | $lastc = 2; 362 | $onstr .= chr(0).chr($cn); 363 | } 364 | else 365 | { 366 | if( $onstr != '' ) 367 | { 368 | $this->simpleResult[$s]['w'] = $onstr; 369 | if( $lastc==2 ) 370 | { 371 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4; 372 | } 373 | $this->simpleResult[$s]['t'] = $lastc; 374 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize); 375 | $s++; 376 | } 377 | $onstr = ''; 378 | $lastc = 3; 379 | if($cn < 31) 380 | { 381 | continue; 382 | } 383 | else 384 | { 385 | $this->simpleResult[$s]['w'] = chr(0).chr($cn); 386 | $this->simpleResult[$s]['t'] = 3; 387 | $s++; 388 | } 389 | } 390 | } 391 | //普通字符 392 | else 393 | { 394 | //正常文字 395 | if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D) 396 | || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) ) 397 | { 398 | if( $lastc != 1 && $onstr != '') 399 | { 400 | $this->simpleResult[$s]['w'] = $onstr; 401 | if( $lastc==2 ) 402 | { 403 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4; 404 | } 405 | $this->simpleResult[$s]['t'] = $lastc; 406 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize); 407 | $s++; 408 | $onstr = ''; 409 | } 410 | $lastc = 1; 411 | $onstr .= $c; 412 | } 413 | //特殊符号 414 | else 415 | { 416 | if( $onstr != '' ) 417 | { 418 | $this->simpleResult[$s]['w'] = $onstr; 419 | if( $lastc==2 ) 420 | { 421 | if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4; 422 | } 423 | $this->simpleResult[$s]['t'] = $lastc; 424 | if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize); 425 | $s++; 426 | } 427 | 428 | //检测书名 429 | if( $cn == 0x300A ) 430 | { 431 | $tmpw = ''; 432 | $n = 1; 433 | $isok = false; 434 | $ew = chr(0x30).chr(0x0B); 435 | while(true) 436 | { 437 | if( !isset($this->sourceString[$i+$n+1]) ) break; 438 | $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1]; 439 | if( $w == $ew ) 440 | { 441 | $this->simpleResult[$s]['w'] = $c; 442 | $this->simpleResult[$s]['t'] = 5; 443 | $s++; 444 | 445 | $this->simpleResult[$s]['w'] = $tmpw; 446 | $this->newWords[$tmpw] = 1; 447 | if( !isset($this->newWords[$tmpw]) ) 448 | { 449 | $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, '; 450 | $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb')); 451 | } 452 | $this->simpleResult[$s]['t'] = 13; 453 | 454 | $s++; 455 | 456 | //最大切分模式对书名继续分词 457 | if( $this->differMax ) 458 | { 459 | $this->simpleResult[$s]['w'] = $tmpw; 460 | $this->simpleResult[$s]['t'] = 21; 461 | $this->_deep_analysis($tmpw, $lastc, $s, $optimize); 462 | $s++; 463 | } 464 | 465 | $this->simpleResult[$s]['w'] = $ew; 466 | $this->simpleResult[$s]['t'] = 5; 467 | $s++; 468 | 469 | $i = $i + $n + 1; 470 | $isok = true; 471 | $onstr = ''; 472 | $lastc = 5; 473 | break; 474 | } 475 | else 476 | { 477 | $n = $n+2; 478 | $tmpw .= $w; 479 | if( strlen($tmpw) > 60 ) 480 | { 481 | break; 482 | } 483 | } 484 | }//while 485 | if( !$isok ) 486 | { 487 | $this->simpleResult[$s]['w'] = $c; 488 | $this->simpleResult[$s]['t'] = 5; 489 | $s++; 490 | $onstr = ''; 491 | $lastc = 5; 492 | } 493 | continue; 494 | } 495 | 
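// Any other wide character that reaches this point is not part of a word: reset the pending
// buffer, skip an ideographic space (U+3000), and record every remaining symbol as its own
// type-5 (unrecognised symbol) entry in the coarse result.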
496 | $onstr = ''; 497 | $lastc = 5; 498 | if( $cn==0x3000 ) 499 | { 500 | continue; 501 | } 502 | else 503 | { 504 | $this->simpleResult[$s]['w'] = $c; 505 | $this->simpleResult[$s]['t'] = 5; 506 | $s++; 507 | } 508 | }//2byte symbol 509 | 510 | }//end 2byte char 511 | 512 | }//end for 513 | 514 | //处理分词后的结果 515 | $this->_sort_finally_result(); 516 | } 517 | 518 | /** 519 | * 深入分词 520 | * @parem $str 521 | * @parem $ctype (2 英文类, 3 中/韩/日文类) 522 | * @parem $spos 当前粗分结果游标 523 | * @return bool 524 | */ 525 | private function _deep_analysis( &$str, $ctype, $spos, $optimize=true ) 526 | { 527 | 528 | //中文句子 529 | if( $ctype==1 ) 530 | { 531 | $slen = strlen($str); 532 | //小于系统配置分词要求长度的句子 533 | if( $slen < $this->notSplitLen ) 534 | { 535 | $tmpstr = ''; 536 | $lastType = 0; 537 | if( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t']; 538 | if($slen < 5) 539 | { 540 | //echo iconv(UCS2, 'utf-8', $str).'
'; 541 | if( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) ) 542 | { 543 | $str2 = ''; 544 | if( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) ) 545 | { 546 | $str2 = substr($str, 2, 2); 547 | $str = substr($str, 0, 2); 548 | } 549 | $ww = $this->simpleResult[$spos - 1]['w'].$str; 550 | $this->simpleResult[$spos - 1]['w'] = $ww; 551 | $this->simpleResult[$spos - 1]['t'] = 4; 552 | if( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) ) 553 | { 554 | $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, '; 555 | $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu')); 556 | } 557 | $this->simpleResult[$spos]['w'] = ''; 558 | if( $str2 != '' ) 559 | { 560 | $this->finallyResult[$spos-1][] = $ww; 561 | $this->finallyResult[$spos-1][] = $str2; 562 | } 563 | } 564 | else { 565 | $this->finallyResult[$spos][] = $str; 566 | } 567 | } 568 | else 569 | { 570 | $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize ); 571 | } 572 | } 573 | //正常长度的句子,循环进行分词处理 574 | else 575 | { 576 | $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize ); 577 | } 578 | } 579 | //英文句子,转为小写 580 | else 581 | { 582 | if( $this->toLower ) { 583 | $this->finallyResult[$spos][] = strtolower($str); 584 | } 585 | else { 586 | $this->finallyResult[$spos][] = $str; 587 | } 588 | } 589 | } 590 | 591 | /** 592 | * 中文的深入分词 593 | * @parem $str 594 | * @return void 595 | */ 596 | private function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=true ) 597 | { 598 | $quote1 = chr(0x20).chr(0x1C); 599 | $tmparr = array(); 600 | $hasw = 0; 601 | //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理。 602 | if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 ) 603 | { 604 | $tmparr[] = $str; 605 | if( !isset($this->newWords[$str]) ) 606 | { 607 | $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, '; 608 | $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq')); 609 | } 610 | if( !$this->differMax ) 611 | { 612 | $this->finallyResult[$spos][] = $str; 613 | return ; 614 | } 615 | } 616 | //进行切分 617 | for($i=$slen-1; $i > 0; $i -= 2) 618 | { 619 | //单个词 620 | $nc = $str[$i-1].$str[$i]; 621 | //是否已经到最后两个字 622 | if( $i <= 2 ) 623 | { 624 | $tmparr[] = $nc; 625 | $i = 0; 626 | break; 627 | } 628 | $isok = false; 629 | $i = $i + 1; 630 | for($k=$this->dicWordMax; $k>1; $k=$k-2) 631 | { 632 | if($i < $k) continue; 633 | $w = substr($str, $i-$k, $k); 634 | if( strlen($w) <= 2 ) 635 | { 636 | $i = $i - 1; 637 | break; 638 | } 639 | if( $this->IsWord( $w ) ) 640 | { 641 | $tmparr[] = $w; 642 | $i = $i - $k + 1; 643 | $isok = true; 644 | break; 645 | } 646 | } 647 | //echo '
'; 648 | //没适合词 649 | if(!$isok) $tmparr[] = $nc; 650 | } 651 | $wcount = count($tmparr); 652 | if( $wcount==0 ) return ; 653 | $this->finallyResult[$spos] = array_reverse($tmparr); 654 | //优化结果(岐义处理、新词、数词、人名识别等) 655 | if( $optimize ) 656 | { 657 | $this->_optimize_result( $this->finallyResult[$spos], $spos ); 658 | } 659 | } 660 | 661 | /** 662 | * 对最终分词结果进行优化(把simpleresult结果合并,并尝试新词识别、数词合并等) 663 | * @parem $optimize 是否优化合并的结果 664 | * @return bool 665 | */ 666 | //t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 667 | private function _optimize_result( &$smarr, $spos ) 668 | { 669 | $newarr = array(); 670 | $prePos = $spos - 1; 671 | $arlen = count($smarr); 672 | $i = $j = 0; 673 | //检测数量词 674 | if( $prePos > -1 && !isset($this->finallyResult[$prePos]) ) 675 | { 676 | $lastw = $this->simpleResult[$prePos]['w']; 677 | $lastt = $this->simpleResult[$prePos]['t']; 678 | if( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) ) 679 | { 680 | $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0]; 681 | $this->simpleResult[$prePos]['t'] = 4; 682 | if( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) ) 683 | { 684 | $this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, '; 685 | $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu')); 686 | } 687 | $smarr[0] = ''; 688 | $i++; 689 | } 690 | } 691 | for(; $i < $arlen; $i++) 692 | { 693 | 694 | if( !isset( $smarr[$i+1] ) ) 695 | { 696 | $newarr[$j] = $smarr[$i]; 697 | break; 698 | } 699 | $cw = $smarr[$i]; 700 | $nw = $smarr[$i+1]; 701 | $ischeck = false; 702 | //检测数量词 703 | if( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) ) 704 | { 705 | //最大切分时保留合并前的词 706 | if($this->differMax) 707 | { 708 | $newarr[$j] = chr(0).chr(0x28); 709 | $j++; 710 | $newarr[$j] = $cw; 711 | $j++; 712 | $newarr[$j] = $nw; 713 | $j++; 714 | $newarr[$j] = chr(0).chr(0x29); 715 | $j++; 716 | } 717 | $newarr[$j] = $cw.$nw; 718 | if( !isset($this->newWords[$newarr[$j]]) ) 719 | { 720 | $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, '; 721 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu')); 722 | } 723 | $j++; $i++; $ischeck = true; 724 | } 725 | //检测前导词(通常是姓) 726 | else if( isset( $this->addonDic['n'][ $smarr[$i] ] ) ) 727 | { 728 | $is_rs = false; 729 | //词语是副词或介词或频率很高的词不作为人名 730 | if( strlen($nw)==4 ) 731 | { 732 | $winfos = $this->GetWordInfos($nw); 733 | if(isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) ) 734 | { 735 | $is_rs = true; 736 | } 737 | } 738 | if( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs ) 739 | { 740 | $newarr[$j] = $cw.$nw; 741 | //echo iconv(UCS2, 'utf-8', $newarr[$j])."
"; 742 | //尝试检测第三个词 743 | if( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) ) 744 | { 745 | $newarr[$j] .= $smarr[$i+2]; 746 | $i++; 747 | } 748 | if( !isset($this->newWords[$newarr[$j]]) ) 749 | { 750 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr')); 751 | $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, '; 752 | } 753 | //为了防止错误,保留合并前的姓名 754 | if(strlen($nw)==4) 755 | { 756 | $j++; 757 | $newarr[$j] = chr(0).chr(0x28); 758 | $j++; 759 | $newarr[$j] = $cw; 760 | $j++; 761 | $newarr[$j] = $nw; 762 | $j++; 763 | $newarr[$j] = chr(0).chr(0x29); 764 | } 765 | 766 | $j++; $i++; $ischeck = true; 767 | } 768 | } 769 | //检测后缀词(地名等) 770 | else if( isset($this->addonDic['a'][$nw]) ) 771 | { 772 | $is_rs = false; 773 | //词语是副词或介词不作为前缀 774 | if( strlen($cw)>2 ) 775 | { 776 | $winfos = $this->GetWordInfos($cw); 777 | if(isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) ) 778 | { 779 | $is_rs = true; 780 | } 781 | } 782 | if( !isset($this->addonDic['s'][$cw]) && !$is_rs ) 783 | { 784 | $newarr[$j] = $cw.$nw; 785 | if( !isset($this->newWords[$newarr[$j]]) ) 786 | { 787 | $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, '; 788 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na')); 789 | } 790 | $i++; $j++; $ischeck = true; 791 | } 792 | } 793 | //新词识别(暂无规则) 794 | else if($this->unitWord) 795 | { 796 | if(strlen($cw)==2 && strlen($nw)==2 797 | && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw]) 798 | && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) 799 | { 800 | $newarr[$j] = $cw.$nw; 801 | //尝试检测第三个词 802 | if( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) ) 803 | { 804 | $newarr[$j] .= $smarr[$i+2]; 805 | $i++; 806 | } 807 | if( !isset($this->newWords[$newarr[$j]]) ) 808 | { 809 | $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, '; 810 | $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms')); 811 | } 812 | $i++; $j++; $ischeck = true; 813 | } 814 | } 815 | 816 | //不符合规则 817 | if( !$ischeck ) 818 | { 819 | $newarr[$j] = $cw; 820 | //二元消岐处理——最大切分模式 821 | if( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) 822 | { 823 | $slen = strlen($nw); 824 | $hasDiff = false; 825 | for($y=2; $y <= $slen-2; $y=$y+2) 826 | { 827 | $nhead = substr($nw, $y-2, 2); 828 | $nfont = $cw.substr($nw, 0, $y-2); 829 | if( $this->IsWord( $nfont.$nhead ) ) 830 | { 831 | if( strlen($cw) > 2 ) $j++; 832 | $hasDiff = true; 833 | $newarr[$j] = $nfont.$nhead; 834 | } 835 | } 836 | } 837 | $j++; 838 | } 839 | 840 | }//end for 841 | $smarr = $newarr; 842 | } 843 | 844 | /** 845 | * 转换最终分词结果到 finallyResult 数组 846 | * @return void 847 | */ 848 | private function _sort_finally_result() 849 | { 850 | $newarr = array(); 851 | $i = 0; 852 | foreach($this->simpleResult as $k=>$v) 853 | { 854 | if( empty($v['w']) ) continue; 855 | if( isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0 ) 856 | { 857 | foreach($this->finallyResult[$k] as $w) 858 | { 859 | if(!empty($w)) 860 | { 861 | $newarr[$i]['w'] = $w; 862 | $newarr[$i]['t'] = 20; 863 | $i++; 864 | } 865 | } 866 | } 867 | else if($v['t'] != 21) 868 | { 869 | $newarr[$i]['w'] = $v['w']; 870 | $newarr[$i]['t'] = $v['t']; 871 | $i++; 872 | } 873 | } 874 
| $this->finallyResult = $newarr; 875 | $newarr = ''; 876 | } 877 | 878 | /** 879 | * 把uncode字符串转换为输出字符串 880 | * @parem str 881 | * return string 882 | */ 883 | private function _out_string_encoding( &$str ) 884 | { 885 | $rsc = $this->_source_result_charset(); 886 | if( $rsc==1 ) { 887 | $rsstr = iconv(UCS2, 'utf-8', $str); 888 | } 889 | else if( $rsc==2 ) { 890 | $rsstr = iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str) ); 891 | } 892 | else{ 893 | $rsstr = iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str) ); 894 | } 895 | return $rsstr; 896 | } 897 | 898 | /** 899 | * 获取最终结果字符串(用空格分开后的分词结果) 900 | * @return string 901 | */ 902 | public function GetFinallyResult($spword=' ', $word_meanings=false) 903 | { 904 | $rsstr = ''; 905 | foreach($this->finallyResult as $v) 906 | { 907 | if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) ) 908 | { 909 | continue; 910 | } 911 | $m = ''; 912 | if( $word_meanings ) 913 | { 914 | $m = $this->GetWordProperty($v['w']); 915 | } 916 | $w = $this->_out_string_encoding($v['w']); 917 | if( $w != ' ' ) 918 | { 919 | if($word_meanings) { 920 | $rsstr .= $spword.$w.$m; 921 | } 922 | else { 923 | $rsstr .= $spword.$w; 924 | } 925 | } 926 | } 927 | return $rsstr; 928 | } 929 | 930 | /** 931 | * 获取粗分结果,不包含粗分属性 932 | * @return array() 933 | */ 934 | public function GetSimpleResult() 935 | { 936 | $rearr = array(); 937 | foreach($this->simpleResult as $k=>$v) 938 | { 939 | if( empty($v['w']) ) continue; 940 | $w = $this->_out_string_encoding($v['w']); 941 | if( $w != ' ' ) $rearr[] = $w; 942 | } 943 | return $rearr; 944 | } 945 | 946 | /** 947 | * 获取粗分结果,包含粗分属性(1中文词句、2 ANSI词汇(包括全角),3 ANSI标点符号(包括全角),4数字(包括全角),5 中文标点或无法识别字符) 948 | * @return array() 949 | */ 950 | public function GetSimpleResultAll() 951 | { 952 | $rearr = array(); 953 | foreach($this->simpleResult as $k=>$v) 954 | { 955 | $w = $this->_out_string_encoding($v['w']); 956 | if( $w != ' ' ) 957 | { 958 | $rearr[$k]['w'] = $w; 959 | $rearr[$k]['t'] = $v['t']; 960 | } 961 | } 962 | return $rearr; 963 | } 964 | 965 | /** 966 | * 获取索引hash数组 967 | * @return array('word'=>count,...) 968 | */ 969 | public function GetFinallyIndex() 970 | { 971 | $rearr = array(); 972 | foreach($this->finallyResult as $v) 973 | { 974 | if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) ) 975 | { 976 | continue; 977 | } 978 | $w = $this->_out_string_encoding($v['w']); 979 | if( $w == ' ' ) 980 | { 981 | continue; 982 | } 983 | if( isset($rearr[$w]) ) 984 | { 985 | $rearr[$w]++; 986 | } 987 | else 988 | { 989 | $rearr[$w] = 1; 990 | } 991 | } 992 | arsort( $rearr ); 993 | return $rearr; 994 | } 995 | 996 | /** 997 | * 获取最终关键字(返回用 "," 间隔的关键字) 998 | * @return string 999 | */ 1000 | public function GetFinallyKeywords( $num = 10 ) 1001 | { 1002 | $n = 0; 1003 | $arr = $this->GetFinallyIndex(); 1004 | $okstr = ''; 1005 | foreach( $arr as $k => $v ) 1006 | { 1007 | //排除长度为1的词 1008 | if( strlen($k)==1 ) { 1009 | continue; 1010 | } 1011 | //排除长度为2的非英文词 1012 | elseif( strlen($k)==2 && preg_match('/[^0-9a-zA-Z]/', $k) ) { 1013 | continue; 1014 | 1015 | } 1016 | //排除单个中文字 1017 | elseif( strlen($k) < 4 && !preg_match('/[a-zA-Z]/', $k)) { 1018 | continue; 1019 | } 1020 | $okstr .= ($okstr=='' ? 
$k : ','.$k); 1021 | $n++; 1022 | if( $n > $num ) break; 1023 | } 1024 | return $okstr; 1025 | } 1026 | 1027 | /** 1028 | * 获得保存目标编码 1029 | * @return int 1030 | */ 1031 | private function _source_result_charset() 1032 | { 1033 | if( preg_match("/^utf/", $this->targetCharSet) ) { 1034 | $rs = 1; 1035 | } 1036 | else if( preg_match("/^gb/", $this->targetCharSet) ) { 1037 | $rs = 2; 1038 | } 1039 | else if( preg_match("/^big/", $this->targetCharSet) ) { 1040 | $rs = 3; 1041 | } 1042 | else { 1043 | $rs = 4; 1044 | } 1045 | return $rs; 1046 | } 1047 | 1048 | /** 1049 | * 编译词典 1050 | * @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt> 1051 | * 注意, 需要PHP开放足够的内存才能完成操作 1052 | * @return void 1053 | */ 1054 | public function MakeDict( $source_file, $target_file='' ) 1055 | { 1056 | $target_file = ($target_file=='' ? $this->mainDicFile : $target_file); 1057 | $allk = array(); 1058 | $fp = fopen($source_file, 'r'); 1059 | while( $line = fgets($fp, 512) ) 1060 | { 1061 | if( $line[0]=='@' ) continue; 1062 | list($w, $r, $a) = explode(',', $line); 1063 | $a = trim( $a ); 1064 | $w = iconv('utf-8', UCS2, $w); 1065 | $k = $this->_get_index( $w ); 1066 | if( isset($allk[ $k ]) ) 1067 | $allk[ $k ][ $w ] = array($r, $a); 1068 | else 1069 | $allk[ $k ][ $w ] = array($r, $a); 1070 | } 1071 | fclose( $fp ); 1072 | $fp = fopen($target_file, 'w'); 1073 | $heade_rarr = array(); 1074 | $alldat = ''; 1075 | $start_pos = $this->mask_value * 8; 1076 | foreach( $allk as $k => $v ) 1077 | { 1078 | $dat = serialize( $v ); 1079 | $dlen = strlen($dat); 1080 | $alldat .= $dat; 1081 | 1082 | $heade_rarr[ $k ][0] = $start_pos; 1083 | $heade_rarr[ $k ][1] = $dlen; 1084 | $heade_rarr[ $k ][2] = count( $v ); 1085 | 1086 | $start_pos += $dlen; 1087 | } 1088 | unset( $allk ); 1089 | for($i=0; $i < $this->mask_value; $i++) 1090 | { 1091 | if( !isset($heade_rarr[$i]) ) 1092 | { 1093 | $heade_rarr[$i] = array(0, 0, 0); 1094 | } 1095 | fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2])); 1096 | } 1097 | fwrite( $fp, $alldat); 1098 | fclose( $fp ); 1099 | } 1100 | 1101 | /** 1102 | * 导出词典的词条 1103 | * @parem $targetfile 保存位置 1104 | * @return void 1105 | */ 1106 | public function ExportDict( $targetfile ) 1107 | { 1108 | if( !$this->mainDicHand ) 1109 | { 1110 | $this->mainDicHand = fopen($this->mainDicFile, 'r'); 1111 | } 1112 | $fp = fopen($targetfile, 'w'); 1113 | for($i=0; $i <= $this->mask_value; $i++) 1114 | { 1115 | $move_pos = $i * 8; 1116 | fseek($this->mainDicHand, $move_pos, SEEK_SET); 1117 | $dat = fread($this->mainDicHand, 8); 1118 | $arr = unpack('I1s/n1l/n1c', $dat); 1119 | if( $arr['l'] == 0 ) 1120 | { 1121 | continue; 1122 | } 1123 | fseek($this->mainDicHand, $arr['s'], SEEK_SET); 1124 | $data = @unserialize(fread($this->mainDicHand, $arr['l'])); 1125 | if( !is_array($data) ) continue; 1126 | foreach($data as $k => $v) 1127 | { 1128 | $w = iconv(UCS2, 'utf-8', $k); 1129 | fwrite($fp, "{$w},{$v[0]},{$v[1]}\n"); 1130 | } 1131 | } 1132 | fclose( $fp ); 1133 | return true; 1134 | } 1135 | } 1136 | 1137 | ?> 1138 | --------------------------------------------------------------------------------
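Usage note: the calls visible in Analysis.php and dict_build.php reduce to a short pipeline: construct PhpAnalysis, load the dictionaries, set the source text, run StartAnalysis(), then read the result. A minimal sketch of that pipeline, assuming the script sits next to the WordAnalysis directory shown in the tree above; the sample sentence is a placeholder.

<?php
require_once __DIR__ . '/WordAnalysis/phpanalysis.class.php';

// Build the object without loading the dictionary in the constructor,
// mirroring the $loadInit = false pattern used in dict_build.php.
PhpAnalysis::$loadInit = false;
$pa = new PhpAnalysis('utf-8', 'utf-8', false);   // source charset, target charset, load full dict into memory?
$pa->LoadDict();                                  // opens dict/base_dic_full.dic and parses dict/words_addons.dic

$pa->SetSource('这里是一段示例文本,用于测试PHP中文分词。');  // placeholder sample text
$pa->StartAnalysis(true);                         // true = run the result optimizer (numbers, names, new words)

echo $pa->GetFinallyResult(' ', true), "\n";      // space-separated words with /pos tags appended
echo $pa->GetFinallyKeywords(5), "\n";            // comma-separated top keywords, as used in Analysis.php

Because the class resolves $addonDicFile and $mainDicFile relative to dirname(__FILE__), the dictionaries are found no matter which working directory the script runs from.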
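The main dictionary written by MakeDict() is a fixed-size hash table: 0xFFFF slots of 8 bytes at the front of base_dic_full.dic, each packed as 'Inn' (data offset, byte length, word count), followed by the serialize()d bucket arrays keyed by the UCS-2BE form of each word. GetWordInfos() hashes a word with _get_index() (a DJB-style shift-add/XOR hash reduced modulo $mask_value), seeks to slot*8 and unserializes the bucket it points to. Below is a standalone sketch of that lookup, independent of the class; the path and word in the example call are placeholders.

<?php
// Look up one UTF-8 word in base_dic_full.dic the same way PhpAnalysis::GetWordInfos() does.
function dic_lookup($dicFile, $utf8Word)
{
    $key = iconv('utf-8', 'ucs-2be', $utf8Word);    // dictionary keys are UCS-2BE strings

    // Same hash as PhpAnalysis::_get_index(): walk the bytes from last to first.
    $h = 0x238f13af;
    $l = strlen($key);
    while ($l--) {
        $h += ($h << 5);
        $h ^= ord($key[$l]);
        $h &= 0x7fffffff;
    }
    $bucket = $h % 0xFFFF;                          // $mask_value in the class

    $fp = fopen($dicFile, 'r');
    fseek($fp, $bucket * 8, SEEK_SET);              // fixed 8-byte header slot per bucket
    $hdr = unpack('I1s/n1l/n1c', fread($fp, 8));    // s = data offset, l = byte length, c = word count
    if ($hdr['l'] == 0) {
        fclose($fp);
        return false;                               // empty bucket: word is not in the dictionary
    }

    fseek($fp, $hdr['s'], SEEK_SET);
    $data = @unserialize(fread($fp, $hdr['l']));
    fclose($fp);

    // A bucket maps ucs2_word => array(frequency, part-of-speech tag).
    return (is_array($data) && isset($data[$key])) ? $data[$key] : false;
}

var_dump(dic_lookup(__DIR__ . '/WordAnalysis/dict/base_dic_full.dic', '分词'));  // example path and word

Note that dict/readme.txt still shows an older three-argument call, MakeDict(sourcefile, 16, 'dict/base_dic_full.dic'); the class in this tree takes MakeDict($source_file, $target_file = '') and falls back to $mainDicFile when no target is given.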
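words_addons.dic (documented in dict/readme.txt) is plain text: a line whose second byte is ':' (s:, u:, n:, a:, c:, t:) opens a word class, and the comma-separated lines that follow list that class's words (stop words, unit suffixes, leading surnames, place and organisation suffixes, numeral prefixes, province abbreviations). LoadDict() stores them UCS-2-encoded in $addonDic[class][word]. The sketch below is a simplified reader that keeps the words in UTF-8 instead, useful for inspecting the file outside the class.

<?php
// Parse words_addons.dic into array(class => array(utf8_word => true)).
function load_addon_dic($file)
{
    $dic  = array();
    $type = '';
    foreach (file($file) as $line) {
        $line = trim($line);
        if ($line === '') {
            continue;
        }
        if (substr($line, 1, 1) === ':') {          // e.g. "s:停止词" opens the stop-word section
            $type = substr($line, 0, 1);
            continue;
        }
        foreach (explode(',', $line) as $word) {
            if ($word !== '') {
                $dic[$type][$word] = true;
            }
        }
    }
    return $dic;
}

$addon = load_addon_dic(__DIR__ . '/WordAnalysis/dict/words_addons.dic');  // example path
echo isset($addon['s']['的']) ? "stop word\n" : "not a stop word\n";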