├── mmseg ├── EXPERIMENTAL ├── CREDITS ├── .svnignore ├── tests │ ├── thesaurus.txt │ ├── unigram.txt │ ├── synonyms.txt │ ├── 002.phpt │ ├── 001.phpt │ ├── 003.phpt │ ├── 005.phpt │ ├── 004.phpt │ └── 006.phpt ├── ini │ └── mmseg.ini ├── config.w32 ├── mmseg.php ├── config.m4 └── php_mmseg.h ├── dependencies └── mmseg-3.2.14 │ ├── AUTHORS │ ├── INSTALL │ ├── NEWS │ ├── README │ ├── THANKS │ ├── ChangeLog │ ├── src │ ├── win32 │ │ ├── t.txt │ │ ├── changelog.txt │ │ ├── t1.txt │ │ ├── readme.txt │ │ ├── syb.txt │ │ ├── libcss03.vcproj │ │ ├── libcss05.vcproj │ │ ├── mmseg03.vcproj │ │ ├── mmseg05.vcproj │ │ ├── t2.txt │ │ ├── css05.sln │ │ └── css03.sln │ ├── t1.txt │ ├── css │ │ ├── Segmenter.h │ │ ├── SegmentPkg.cpp │ │ ├── tolowercase.h │ │ ├── UnigramRecord.cpp │ │ ├── ThesaurusDict.h │ │ ├── UnigramRecord.h │ │ ├── ICorpusReader.h │ │ ├── UnigramCorpusReader.h │ │ ├── SegmenterManager.h │ │ ├── UnigramDict.h │ │ ├── SynonymsDict.h │ │ ├── SegmentPkg.h │ │ ├── UnigramDict.cpp │ │ ├── UnigramCorpusReader.cpp │ │ ├── ThesaurusDict.cpp │ │ ├── SegmenterManager.cpp │ │ ├── mmthunk.h │ │ └── mmthunk.cpp │ ├── mk_dist.bat │ ├── utils │ │ ├── csr_mmap.h │ │ ├── os.h │ │ ├── bsd_getopt_win.h │ │ ├── csr.h │ │ ├── csr_assert.h │ │ ├── csr_utils.h │ │ ├── assert.c │ │ ├── bsd_getopt.c │ │ ├── bsd_getopt.h │ │ ├── freelist.h │ │ ├── StringTokenizer.h │ │ ├── scoped_ptr.h │ │ ├── Singleton.h │ │ ├── csr_pool.h │ │ ├── csr_utils.c │ │ ├── StringTokenizer.cpp │ │ ├── Utf8_16.h │ │ └── csr_mmap.c │ ├── csr_typedefs.h │ ├── config.win.h │ ├── Makefile.am │ ├── iniparser │ │ └── dictionary.h │ └── mmseg_main.cpp │ ├── stamp-h.in │ ├── ruby │ ├── mmseg-i386-mswin32.def │ ├── mmseg-i386-mswin32.exp │ ├── readme.txt │ ├── extconf.win.rb │ ├── extconf.lin.rb │ ├── test.rb │ ├── Makefile │ └── rubyapi.cpp │ ├── config │ ├── .cvsignore │ ├── mkinstalldirs │ ├── sys_errlist.m4 │ ├── sys_siglist.m4 │ ├── readline.m4 │ ├── apu-hints.m4 │ ├── mdate-sh │ ├── find_apr.m4 │ ├── install-sh │ 
└── missing │ ├── data │ ├── uni.lib │ ├── mmseg.ini │ └── build_unigram.py │ ├── python │ ├── tmmseg.py │ ├── mmseg_interface.h │ ├── pymmseg.c │ ├── pymmseg.sln │ ├── mmseg_interface.cpp │ └── pymmseg.vcproj │ ├── bootstrap │ ├── script │ ├── build_thesaurus.py │ ├── char_table_build.py │ └── build_tolower_table.py │ ├── Makefile.am │ └── config-h.in ├── travis ├── build_php-mmseg.sh └── build_libmmseg.sh ├── .travis.yml ├── LICENSE ├── .gitignore └── README.md /mmseg/EXPERIMENTAL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mmseg/CREDITS: -------------------------------------------------------------------------------- 1 | mmseg 2 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/AUTHORS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/INSTALL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/NEWS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/README: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/THANKS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/ChangeLog: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/t.txt: -------------------------------------------------------------------------------- 1 | 工作狂 -------------------------------------------------------------------------------- /mmseg/.svnignore: -------------------------------------------------------------------------------- 1 | .deps 2 | *.lo 3 | *.la 4 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/t1.txt: -------------------------------------------------------------------------------- 1 | 中文分词测试 2 | 中国人上海市 -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/stamp-h.in: -------------------------------------------------------------------------------- 1 | timestamp 2 | -------------------------------------------------------------------------------- /mmseg/tests/thesaurus.txt: -------------------------------------------------------------------------------- 1 | 南京西路 2 | -南京,西路, 3 | 张三丰 4 | -太极宗师,武当祖师, 5 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/changelog.txt: -------------------------------------------------------------------------------- 1 | - append synonyms supports -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/ruby/mmseg-i386-mswin32.def: -------------------------------------------------------------------------------- 1 | EXPORTS 2 | Init_mmseg 3 | -------------------------------------------------------------------------------- /mmseg/tests/unigram.txt: -------------------------------------------------------------------------------- 1 | 河 187 2 | x:187 3 | 造假者 1 4 | x:1 5 | 台北队 1 6 | x:1 7 | 湖边 1 8 | -------------------------------------------------------------------------------- /mmseg/ini/mmseg.ini: 
-------------------------------------------------------------------------------- 1 | extension=mmseg.so 2 | mmseg.dict_dir=/opt/etc 3 | mmseg.autoreload=1 4 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/.cvsignore: -------------------------------------------------------------------------------- 1 | *.texi 2 | *.la 3 | *.lo 4 | *.o 5 | configure.scan 6 | -------------------------------------------------------------------------------- /mmseg/tests/synonyms.txt: -------------------------------------------------------------------------------- 1 | // test commit 2 | .net => dotnet 3 | c# => csharp 4 | c++ => cplusplus 5 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/data/uni.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/data/uni.lib -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/t1.txt: -------------------------------------------------------------------------------- 1 | 不行 去掉 代码 多时 工作 直接 短路 mysql 所以 下载 正在 英文 研究 登录 引起 本 2 | 人 机器 全部 安全 判断 关于 比较 汇报 唯恐 逻辑 表达式 脚本 -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/Segmenter.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/css/Segmenter.h -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/win32/readme.txt -------------------------------------------------------------------------------- 
/dependencies/mmseg-3.2.14/src/css/SegmentPkg.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/css/SegmentPkg.cpp -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/tolowercase.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/css/tolowercase.h -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/syb.txt: -------------------------------------------------------------------------------- 1 | // test commit 2 | => 3 | .net => dotnet 4 | c# => csharp 5 | c++ => cplusplus 6 | .net => dotNet -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/libcss03.vcproj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/win32/libcss03.vcproj -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/libcss05.vcproj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/win32/libcss05.vcproj -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/mmseg03.vcproj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainx/php-mmseg/HEAD/dependencies/mmseg-3.2.14/src/win32/mmseg03.vcproj -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/mmseg05.vcproj: 
#!/bin/bash
# Build the bundled libmmseg 3.2.14 and install it under /opt for the CI job.
# Expected to be started from the repository root (the path below is relative).

# Stop at the first failing step so a broken bootstrap/configure/make fails
# the job instead of silently carrying on to the next command.
set -e

# Remember where we started. A private name is used on purpose: PWD is
# maintained by the shell and rewritten by every `cd`, so storing the start
# directory in PWD would make the final `cd` a no-op.
start_dir=$(pwd)

cd dependencies/mmseg-3.2.14
./bootstrap
./configure --prefix=/opt/
make
sudo make install

# Quoted so a checkout path containing spaces still works.
cd "$start_dir"
-------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/ruby/readme.txt: -------------------------------------------------------------------------------- 1 | tested on win32 only 2 | 3 | 1 You MUST copy uni.lib from the 'src' directory before running test.rb 4 | 2 I changed the libcss complie mode from 'multi-thread' into 'multi-thread dll', this might be break sphinx's build. -------------------------------------------------------------------------------- /mmseg/tests/002.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Check for mmseg presence 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 10 | --EXPECT-- 11 | resource(4) of type (mmseg segmenter manager resource) 12 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/mk_dist.bat: -------------------------------------------------------------------------------- 1 | md include 2 | copy *.h include\ 3 | copy css\*.h include\ 4 | copy utils\*.h include\ 5 | md lib 6 | md lib\debug 7 | md lib\release 8 | copy win32\release\*.lib lib\release\ 9 | copy win32\debug\*.lib lib\debug\ 10 | copy win32\release\*.lib lib\ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.6 5 | - 7.0 6 | 7 | compiler: 8 | - gcc 9 | 10 | #Compile 11 | before_script: 12 | - ./travis/build_libmmseg.sh 13 | - ./travis/build_php-mmseg.sh 14 | - phpenv config-add mmseg/ini/mmseg.ini 15 | 16 | # Run PHPs run-tests.php 17 | script: 18 | - echo "ok" 19 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/ruby/extconf.win.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | 3 | #mmseg_config = with_config('mmseg-config', 
// $Id$
// vim:ft=javascript

// Windows build configuration for the mmseg extension.
//
// The extension links against the external libmmseg library (the Unix build
// is configured with --with-mmseg=...), so the option is declared with
// ARG_WITH. Declaring it is what defines PHP_MMSEG and PHP_MMSEG_SHARED;
// without the declaration the check below references an undefined variable.
ARG_WITH("mmseg", "for mmseg support", "no");

if (PHP_MMSEG != "no") {
	// PHP_MMSEG_SHARED replaces the unsubstituted ext_skel placeholder
	// PHP_EXTNAME_SHARED, which is never defined by the build system.
	EXTENSION("mmseg", "mmseg.c", PHP_MMSEG_SHARED, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
}
-------------------------------------------------------------------------------- 1 | --TEST-- 2 | Check for mmseg presence 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 13 | --EXPECT-- 14 | 1 15 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr_mmap.h: -------------------------------------------------------------------------------- 1 | #ifndef _CSR_MMAP_H_ 2 | #define _CSR_MMAP_H_ 3 | 4 | #include "csr_typedefs.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct _csr_mmap_t csr_mmap_t; 11 | 12 | /* mmap.c */ 13 | csr_mmap_t *csr_mmap_file(const char*,unsigned char bLoadMem); 14 | csr_mmap_t *csr_mmap_file_w(const char*); 15 | void csr_munmap_file(csr_mmap_t*); 16 | void *csr_mmap_map(csr_mmap_t*); 17 | csr_offset_t csr_mmap_size(csr_mmap_t*); 18 | 19 | #ifdef __cplusplus 20 | } 21 | #endif 22 | 23 | #endif 24 | 25 | -------------------------------------------------------------------------------- /mmseg/mmseg.php: -------------------------------------------------------------------------------- 1 | "; 3 | 4 | if(!extension_loaded('mmseg')) { 5 | dl('mmseg.' . PHP_SHLIB_SUFFIX); 6 | } 7 | $module = 'mmseg'; 8 | $functions = get_extension_funcs($module); 9 | echo "Functions available in the test extension:$br\n"; 10 | foreach($functions as $func) { 11 | echo $func."$br\n"; 12 | } 13 | echo "$br\n"; 14 | $function = 'confirm_' . $module . 
'_compiled'; 15 | if (extension_loaded($module)) { 16 | $str = $function($module); 17 | } else { 18 | $str = "Module $module is not compiled into PHP"; 19 | } 20 | echo "$str\n"; 21 | ?> 22 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/python/pymmseg.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "mmseg_interface.h" 5 | 6 | /* 7 | API 8 | - init(dict_path):raise exception 9 | - segment(string):list[] 10 | FIXME: should use multi dict. 11 | */ 12 | static struct PyMethodDef mmseg_methods[] = { 13 | {"init", init, 1}, 14 | {"segment", segment, 1}, 15 | {NULL, NULL} 16 | }; 17 | 18 | PyMODINIT_FUNC 19 | initcmmseg() { 20 | PyObject *m; 21 | //PyObject *c_api_object; 22 | 23 | m = Py_InitModule("cmmseg", mmseg_methods); 24 | if (m == NULL) 25 | return; 26 | /* 27 | c_api_object = PyCObject_FromVoidPtr((void *)PySpam_API, NULL); 28 | if (c_api_object != NULL) 29 | PyModule_AddObject(m, "_C_API", c_api_object); 30 | */ 31 | } 32 | 33 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/os.h: -------------------------------------------------------------------------------- 1 | #ifndef _CSR_INNER_COMMON_H_ 2 | #define _CSR_INNER_COMMON_H_ 3 | /* import win32's setting */ 4 | #ifdef WIN32 5 | #include "config.win.h" 6 | #else 7 | #include "config.h" 8 | #endif 9 | 10 | #ifdef HAVE_UNISTD_H 11 | #include 12 | #endif 13 | #ifdef HAVE_FCNTL_H 14 | #include 15 | #endif 16 | #ifdef HAVE_SYS_STAT_H 17 | #include 18 | #endif 19 | #ifdef HAVE_SYS_TYPES_H 20 | #include 21 | #endif 22 | #ifdef HAVE_SYS_PARAM_H 23 | #include 24 | #endif 25 | 26 | #ifdef __MINGW32__ 27 | #undef HAVE_MMAP 28 | #endif 29 | #ifdef HAVE_MMAP 30 | #include 31 | #endif 32 | 33 | #if !defined HAVE_MMAP && defined HAVE_WINDOWS_H 34 | #include 35 | #endif 36 | 37 | #endif 38 | 39 | 
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import unicodedata
import re
import codecs
import os

def main():
    """Merge a unigram frequency list with a plain word list and print it.

    Command-line arguments:
        sys.argv[1] -- UTF-8 file, one ``word<TAB>count`` entry per line.
        sys.argv[2] -- UTF-8 file, one word per line; a word that is not
                       already present from the first file gets count 1.

    For every word two UTF-8 encoded lines are written to standard output:
    ``word<TAB>count`` followed by ``x:count``.

    Blank lines in either input are ignored. A non-blank line of the first
    file without a tab-separated count still raises (IndexError/ValueError),
    as such a file is malformed.
    """
    uni_char = {}

    # First file: word<TAB>count. The first occurrence of a word wins.
    with codecs.open(sys.argv[1], "r", "UTF-8") as fh:
        for l in fh:
            l = l.strip()
            if not l:
                # A blank (often trailing) line used to crash on toks[1].
                continue
            toks = l.split('\t')
            k = toks[0]
            cnt = int(toks[1])
            if k not in uni_char:
                uni_char[k] = cnt

    # Second file: bare words, added with a default count of 1.
    with codecs.open(sys.argv[2], "r", "UTF-8") as fh:
        for l in fh:
            l = l.strip()
            if not l:
                # Do not create an entry with an empty key.
                continue
            if l not in uni_char:
                uni_char[l] = 1

    # Write bytes: sys.stdout.buffer where it exists, otherwise sys.stdout
    # itself (the Python 2 case, where stdout accepts byte strings).
    out = getattr(sys.stdout, "buffer", sys.stdout)
    for k in uni_char:
        cnt = uni_char[k]
        out.write((k + '\t' + str(cnt) + '\n').encode('UTF-8'))
        out.write(('x:' + str(cnt) + '\n').encode('UTF-8'))

if __name__ == "__main__":
    main()
/*
 * Descriptor of one long option, passed as an array to getopt_long()
 * declared below.
 * NOTE(review): the implementation (bsd_getopt.c) is not visible here; the
 * per-field notes follow the conventional getopt_long contract and should be
 * confirmed against it.
 */
struct option {
	char *name;   /* option name -- presumably without the leading dashes */
	int has_arg;  /* presumably one of no_argument / required_argument /
	                 optional_argument defined below */
	int *flag;    /* presumably: if non-NULL, *flag receives val -- TODO confirm */
	int val;      /* value associated with the option */
};
use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/UnigramRecord.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include "UnigramRecord.h" 25 | 26 | namespace css { 27 | 28 | 29 | 30 | UnigramRecord::UnigramRecord() 31 | { 32 | count = 0; 33 | } 34 | 35 | } /* End of namespace css */ -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/bootstrap: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # bootstrap -- Use this script to create generated files from the CVS dist 3 | # Copyright (C) 2000 Gary V. Vaughan 4 | # 5 | # This program is free software; you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation; either version 2, or (at your option) 8 | # any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program; if not, write to the Free Software 17 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 | 19 | ## @start 1 20 | #! 
/bin/sh 21 | 22 | set -x 23 | aclocal -I config 24 | if test x`uname` == xDarwin 25 | then 26 | glibtoolize --force --copy 27 | else 28 | libtoolize --force --copy 29 | fi 30 | autoheader 31 | automake --add-missing --copy 32 | autoconf 33 | ## @end 1 34 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/ThesaurusDict.h: -------------------------------------------------------------------------------- 1 | // follow http://blog.csdn.net/yiliumu/article/details/23002799 2 | #ifndef THESAURUS_DICT_h 3 | #define THESAURUS_DICT_h 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace css { 10 | //using namespace CRFPP; 11 | #ifdef WIN32 12 | using namespace stdext; 13 | #else 14 | using namespace __gnu_cxx; 15 | #endif 16 | 17 | #include "darts.h" 18 | #include "csr.h" 19 | #include "csr_mmap.h" 20 | 21 | class ThesaurusRecord { 22 | public: 23 | std::string key; 24 | u1 value[1024]; 25 | u2 length; 26 | }; 27 | 28 | class ThesaurusDict { 29 | 30 | public: 31 | typedef Darts::DoubleArray::result_pair_type result_pair_type; 32 | ThesaurusDict () :m_stringpool(NULL){}; 33 | virtual ~ThesaurusDict () {}; 34 | 35 | public: 36 | virtual int load(const char* filename); 37 | int import(const char* filename, const char* target_file = NULL); 38 | const char* find(const char* key,u2 key_len , int *count = NULL); //the return string buffer might contains 0, end with \0\0 39 | int isLoad() 40 | { 41 | return m_da.array() != NULL; 42 | } 43 | protected: 44 | _csr_mmap_t* m_file; 45 | u1* m_stringpool; 46 | Darts::DoubleArray m_da; 47 | }; 48 | 49 | 50 | } /* End of namespace css */ 51 | #endif 52 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/sys_errlist.m4: -------------------------------------------------------------------------------- 1 | ## sys_errlist.m4 -- determine whether the system library provides sys_errlist 2 | ## Copyright (C) 
2000 Gary V. Vaughan 3 | ## 4 | ## This program is free software; you can redistribute it and/or modify 5 | ## it under the terms of the GNU General Public License as published by 6 | ## the Free Software Foundation; either version 2, or (at your option) 7 | ## any later version. 8 | ## 9 | ## This program is distributed in the hope that it will be useful, 10 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | ## GNU General Public License for more details. 13 | ## 14 | ## You should have received a copy of the GNU General Public License 15 | ## along with this program; if not, write to the Free Software 16 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | # serial 1 SIC_VAR_SYS_ERRLIST 19 | 20 | AC_DEFUN(SIC_VAR_SYS_ERRLIST, 21 | [AC_CACHE_CHECK([for sys_errlist], 22 | sic_cv_var_sys_errlist, 23 | [AC_TRY_LINK([int *p;], [extern int sys_errlist; p = &sys_errlist;], 24 | sic_cv_var_sys_errlist=yes, sic_cv_var_sys_errlist=no)]) 25 | if test x"$sic_cv_var_sys_errlist" = xyes; then 26 | AC_DEFINE(HAVE_SYS_ERRLIST, 1, 27 | [Define if your system libraries have a sys_errlist variable.]) 28 | fi]) 29 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/sys_siglist.m4: -------------------------------------------------------------------------------- 1 | ## sys_siglist.m4 -- determine whether the system library provides sys_siglist 2 | ## Copyright (C) 2000 Gary V. Vaughan 3 | ## 4 | ## This program is free software; you can redistribute it and/or modify 5 | ## it under the terms of the GNU General Public License as published by 6 | ## the Free Software Foundation; either version 2, or (at your option) 7 | ## any later version. 
8 | ## 9 | ## This program is distributed in the hope that it will be useful, 10 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | ## GNU General Public License for more details. 13 | ## 14 | ## You should have received a copy of the GNU General Public License 15 | ## along with this program; if not, write to the Free Software 16 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | # serial 1 SIC_VAR_SYS_SIGLIST 19 | 20 | AC_DEFUN(SIC_VAR_SYS_SIGLIST, 21 | [AC_CACHE_CHECK([for sys_siglist], 22 | sic_cv_var_sys_siglist, 23 | [AC_TRY_LINK([int *p;], [extern int sys_siglist; p = &sys_siglist;], 24 | sic_cv_var_sys_siglist=yes, sic_cv_var_sys_siglist=no)]) 25 | if test x"$sic_cv_var_sys_siglist" = xyes; then 26 | AC_DEFINE(HAVE_SYS_SIGLIST, 1, 27 | [Define if your system libraries have a sys_siglist variable.]) 28 | fi]) 29 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/UnigramRecord.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_UnigramRecord_h 25 | #define css_UnigramRecord_h 26 | 27 | #include 28 | #include "csr.h" 29 | 30 | 31 | namespace css { 32 | 33 | class UnigramRecord { 34 | 35 | public: 36 | 37 | UnigramRecord(); 38 | 39 | public: 40 | std::string key; 41 | unistring wkey; 42 | int count; 43 | 44 | }; 45 | 46 | } /* End of namespace css */ 47 | #endif 48 | 49 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/readline.m4: -------------------------------------------------------------------------------- 1 | ## readline.m4 -- provide and handle --with-readline configure option 2 | ## Copyright (C) 2000 Gary V. Vaughan 3 | ## 4 | ## This program is free software; you can redistribute it and/or modify 5 | ## it under the terms of the GNU General Public License as published by 6 | ## the Free Software Foundation; either version 2, or (at your option) 7 | ## any later version. 8 | ## 9 | ## This program is distributed in the hope that it will be useful, 10 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | ## GNU General Public License for more details. 13 | ## 14 | ## You should have received a copy of the GNU General Public License 15 | ## along with this program; if not, write to the Free Software 16 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
17 | 18 | # serial 1 SIC_WITH_READLINE 19 | 20 | AC_DEFUN(SIC_WITH_READLINE, 21 | [AC_ARG_WITH(readline, 22 | [ --with-readline compile with the system readline library], 23 | [if test x"${withval-no}" != no; then 24 | sic_save_LIBS=$LIBS 25 | AC_CHECK_LIB(readline, readline) 26 | if test x"${ac_cv_lib_readline_readline}" = xno; then 27 | AC_MSG_ERROR(libreadline not found) 28 | fi 29 | LIBS=$sic_save_LIBS 30 | fi]) 31 | AM_CONDITIONAL(WITH_READLINE, test x"${with_readline-no}" != xno) 32 | ]) 33 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef _CORESEEK_RUNTIME_COMMON_H_ 25 | #define _CORESEEK_RUNTIME_COMMON_H_ 26 | 27 | /*include types*/ 28 | #include "csr_typedefs.h" 29 | 30 | 31 | #include "Utf8_16.h" 32 | #include "StringTokenizer.h" 33 | 34 | /* 35 | #ifdef WIN32 36 | typedef std::wstring unistring; 37 | #else 38 | */ 39 | typedef std::basic_string unistring; 40 | //#endif 41 | 42 | #ifdef WIN32 43 | #define snprintf _snprintf 44 | #endif 45 | 46 | #endif 47 | 48 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr_assert.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef _CSR_ASSERT_H_ 25 | #define _CSR_ASSERT_H_ 26 | /* Modified to dump core, rather than exit. 
May/85 RNS */ 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | void AssertionFailed(char *file, int line); 32 | #ifdef _ASSERT 33 | #undef _ASSERT 34 | #define _ASSERT(ex) do{if (!(ex)) AssertionFailed(__FILE__, __LINE__);}while(0); 35 | #else 36 | #define _ASSERT(ex) 37 | #endif // _DEBUG 38 | 39 | #ifdef __cplusplus 40 | }; 41 | #endif 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/css05.sln: -------------------------------------------------------------------------------- 1 | Microsoft Visual Studio Solution File, Format Version 9.00 2 | # Visual Studio 2005 3 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libcss", "libcss05.vcproj", "{40B08227-C518-4CA0-9A86-F17588554B27}" 4 | EndProject 5 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mmseg", "mmseg05.vcproj", "{2F5525DC-B04E-45E8-BFD6-37792E2F3508}" 6 | EndProject 7 | Global 8 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 9 | Debug|Win32 = Debug|Win32 10 | Release|Win32 = Release|Win32 11 | EndGlobalSection 12 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 13 | {40B08227-C518-4CA0-9A86-F17588554B27}.Debug|Win32.ActiveCfg = Debug|Win32 14 | {40B08227-C518-4CA0-9A86-F17588554B27}.Debug|Win32.Build.0 = Debug|Win32 15 | {40B08227-C518-4CA0-9A86-F17588554B27}.Release|Win32.ActiveCfg = Release|Win32 16 | {40B08227-C518-4CA0-9A86-F17588554B27}.Release|Win32.Build.0 = Release|Win32 17 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Debug|Win32.ActiveCfg = Debug|Win32 18 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Debug|Win32.Build.0 = Debug|Win32 19 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Release|Win32.ActiveCfg = Release|Win32 20 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Release|Win32.Build.0 = Release|Win32 21 | EndGlobalSection 22 | GlobalSection(SolutionProperties) = preSolution 23 | HideSolutionNode = FALSE 24 | EndGlobalSection 25 | EndGlobal 26 | 
-------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr_utils.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef _CSR_UTILES_H_ 25 | #define _CSR_UTILES_H_ 26 | #include "csr_typedefs.h" 27 | 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | //helper function 33 | //#undef atoi 34 | /* Convert a string to an int. 
*/ 35 | int 36 | csr_atoi (const char *nptr); 37 | 38 | void csr_perror(const char *s); 39 | void csr_exit_perror(const char *s); 40 | 41 | unsigned long currentTimeMillis(); 42 | u4 countBitsU4(u4 num); 43 | u2 countBitsU2(u2 num); 44 | u1 countBitsU1(u1 num); 45 | u2 u2_length(const u2* p); 46 | 47 | 48 | #ifdef __cplusplus 49 | }; 50 | #endif 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/assert.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | #include "csr_assert.h" 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif 33 | 34 | #ifdef _NT40_ENV 35 | void 36 | _NTAbort(void) 37 | { 38 | _asm int 3h; /* always trap. */ 39 | } 40 | #endif 41 | 42 | 43 | void 44 | AssertionFailed(char *file, int line) 45 | { 46 | fprintf(stderr, "Assertion failed! 
file %s, line %d.\n", file, 47 | line); 48 | fflush(stderr); 49 | #ifdef _NT40_ENV 50 | _NTAbort(); 51 | #else 52 | abort(); 53 | #endif 54 | } 55 | 56 | #ifdef __cplusplus 57 | }; 58 | #endif 59 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/ICorpusReader.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_ICorpusReader_h 25 | #define css_ICorpusReader_h 26 | 27 | 28 | namespace css { 29 | 30 | class ICorpusReader { 31 | 32 | public: 33 | 34 | 35 | /** 36 | * Load Corpus file into memory. 37 | * @param filename, the filename to be load. 
38 | * @param type must be NULL 39 | */ 40 | virtual int open(const char* filename, const char* type = NULL) = 0; 41 | 42 | virtual long count() = 0; 43 | 44 | public: 45 | // virtual destructor for interface 46 | virtual ~ICorpusReader() { } 47 | }; 48 | 49 | } /* End of namespace css */ 50 | #endif 51 | 52 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/script/build_thesaurus.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import unicodedata 5 | import re 6 | import codecs 7 | import os 8 | 9 | def enmu_all_keys(key): 10 | kl = [] 11 | if len(key)==0: 12 | return kl 13 | prefix = '' 14 | for c in key: 15 | prefix = prefix + c 16 | if len(prefix) == 1: 17 | continue 18 | kl.append(prefix) 19 | kl2 = enmu_all_keys(key[1:]) 20 | return kl + kl2 21 | 22 | def main(): 23 | fh = codecs.open(sys.argv[1],"r", "UTF-8") 24 | lines = fh.readlines() 25 | fh.close() 26 | i = 0 27 | ht = {} 28 | for l in lines: 29 | if i % 2 == 0: 30 | l = l.strip() 31 | l = l.split('\t')[0] 32 | ht[l] = 1 33 | #print l 34 | i = i + 1 35 | 36 | for k in ht: 37 | if len(k) == 1: 38 | continue 39 | subk = {} 40 | kl = enmu_all_keys(k) 41 | for sk in kl: 42 | #print sk, sk != k ,ht.has_key(sk) 43 | if sk != k and ht.has_key(sk): 44 | subk[sk] = 1 45 | ht[k] = subk 46 | 47 | for k in ht: 48 | if ht[k] != 1 and ht[k] != {}: 49 | print k.encode('UTF-8') 50 | s = '' 51 | #print k, ht[k] 52 | for sk in ht[k]: 53 | s = s + sk + ','; 54 | print ('-'+s).encode('UTF-8') 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | #print enmu_all_keys('abc') -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/win32/css03.sln: -------------------------------------------------------------------------------- 1 | Microsoft Visual Studio Solution File, Format Version 8.00 2 | 
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libcss", "libcss03.vcproj", "{40B08227-C518-4CA0-9A86-F17588554B27}" 3 | ProjectSection(ProjectDependencies) = postProject 4 | EndProjectSection 5 | EndProject 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mmseg", "mmseg03.vcproj", "{2F5525DC-B04E-45E8-BFD6-37792E2F3508}" 7 | ProjectSection(ProjectDependencies) = postProject 8 | EndProjectSection 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfiguration) = preSolution 12 | Debug = Debug 13 | Release = Release 14 | EndGlobalSection 15 | GlobalSection(ProjectDependencies) = postSolution 16 | EndGlobalSection 17 | GlobalSection(ProjectConfiguration) = postSolution 18 | {40B08227-C518-4CA0-9A86-F17588554B27}.Debug.ActiveCfg = Debug|Win32 19 | {40B08227-C518-4CA0-9A86-F17588554B27}.Debug.Build.0 = Debug|Win32 20 | {40B08227-C518-4CA0-9A86-F17588554B27}.Release.ActiveCfg = Release|Win32 21 | {40B08227-C518-4CA0-9A86-F17588554B27}.Release.Build.0 = Release|Win32 22 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Debug.ActiveCfg = Debug|Win32 23 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Debug.Build.0 = Debug|Win32 24 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Release.ActiveCfg = Release|Win32 25 | {2F5525DC-B04E-45E8-BFD6-37792E2F3508}.Release.Build.0 = Release|Win32 26 | EndGlobalSection 27 | GlobalSection(ExtensibilityGlobals) = postSolution 28 | EndGlobalSection 29 | GlobalSection(ExtensibilityAddIns) = postSolution 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/UnigramCorpusReader.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. 
You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_UnigramCorpusReader_h 25 | #define css_UnigramCorpusReader_h 26 | 27 | #include 28 | #include 29 | 30 | #include "ICorpusReader.h" 31 | #include "UnigramRecord.h" 32 | 33 | 34 | namespace css { 35 | 36 | class UnigramCorpusReader : virtual public ICorpusReader { 37 | 38 | public: 39 | 40 | virtual UnigramRecord* getAt(int idx); 41 | 42 | UnigramCorpusReader(); 43 | 44 | virtual int open(const char* filename, const char* type); 45 | 46 | virtual long count(); 47 | 48 | 49 | protected: 50 | std::string m_filename; 51 | 52 | 53 | protected: 54 | 55 | /** 56 | * all parsed unigram-record here 57 | * @element-type UnigramRecord 58 | */ 59 | std::vector< UnigramRecord > m_items; 60 | }; 61 | 62 | } /* End of namespace css */ 63 | #endif 64 | 65 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/bsd_getopt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2006 Robert Millan 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 1. Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * 2. 
Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 3. The name of the author may not be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 16 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 17 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 18 | * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 21 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 23 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 24 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | #include "bsd_getopt.h" 28 | 29 | int optreset = 0; 30 | 31 | int 32 | bsd_getopt (int argc, char **argv, char *shortopts) 33 | { 34 | if (optreset == 1) 35 | { 36 | optreset = 0; 37 | optind = 0; 38 | } 39 | 40 | return getopt (argc, argv, shortopts); 41 | } 42 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/bsd_getopt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2006 Robert Millan 3 | * Copyright © 2009 Guillem Jover 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions 7 | * are met: 8 | * 1. 
Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. The name of the author may not be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 17 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 18 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 19 | * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 22 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 24 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #ifndef LIBBSD_GETOPT_H 29 | #define LIBBSD_GETOPT_H 30 | 31 | #include 32 | #include 33 | 34 | __BEGIN_DECLS 35 | extern int optreset; 36 | 37 | int bsd_getopt (int, char **, char *); 38 | __END_DECLS 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/freelist.h: -------------------------------------------------------------------------------- 1 | // 2 | // CRF++ -- Yet Another CRF toolkit 3 | // 4 | // $Id: freelist.h 1588 2007-02-12 09:03:39Z taku $; 5 | // 6 | // Copyright(C) 2005-2007 Taku Kudo 7 | // 8 | #ifndef CRFPP_FREELIST_H__ 9 | #define CRFPP_FREELIST_H__ 10 | 11 | #include 12 | 13 | namespace CRFPP { 14 | template 15 | class Length { 16 | public: 17 | size_t operator()(const T *str) const { return 1; } 18 | }; 19 | 20 | class charLength { 21 | public: 22 | size_t operator()(const char *str) const { return strlen(str) + 1; } 23 | }; 24 | 25 | template > 26 | class FreeList { 27 | private: 28 | std::vector freeList; 29 | size_t pi; 30 | size_t li; 31 | size_t size; 32 | 33 | public: 34 | void free() { li = pi = 0; } 35 | 36 | T* alloc(size_t len = 1) { 37 | if ((pi + len) >= size) { 38 | li++; 39 | pi = 0; 40 | } 41 | if (li == freeList.size()) { 42 | freeList.push_back(new T[size]); 43 | } 44 | T* r = freeList[li] + pi; 45 | pi += len; 46 | return r; 47 | } 48 | 49 | T* dup(T *src, size_t len = 0) { 50 | if (!len) len = LengthFunc () (src); 51 | T *p = alloc(len); 52 | if (src == 0) memset (p, 0, len * sizeof (T)); 53 | else memcpy(p, src, len * sizeof(T)); 54 | return p; 55 | } 56 | 57 | void set_size(size_t n) { size = n; } 58 | 59 | explicit FreeList(size_t _size): pi(0), li(0), size(_size) {} 60 | explicit FreeList(): pi(0), li(0), size(0) {} 61 | 62 | virtual ~FreeList() { 63 | for (li = 0; li < freeList.size(); ++li) { 64 | delete [] freeList[li]; 65 | } 66 | } 67 | }; 68 | 69 | typedef FreeList StrFreeList; 70 | } 71 | #endif 72 | 
-------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/StringTokenizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | *********************************************************************** 3 | * Class: StringTokenizer * 4 | * By Arash Partow - 2000 * 5 | * URL: http://www.partow.net/programming/stringtokenizer/index.html * 6 | * * 7 | * Copyright Notice: * 8 | * Free use of this library is permitted under the guidelines and * 9 | * in accordance with the most current version of the Common Public * 10 | * License. * 11 | * http://www.opensource.org/licenses/cpl.php * 12 | * * 13 | *********************************************************************** 14 | */ 15 | 16 | 17 | 18 | #ifndef INCLUDE_STRINGTOKENIZER_H 19 | #define INCLUDE_STRINGTOKENIZER_H 20 | 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace csr { 28 | 29 | class StringTokenizer 30 | { 31 | 32 | public: 33 | 34 | StringTokenizer(const std::string& _str, const std::string& _delim); 35 | ~StringTokenizer(){}; 36 | 37 | int countTokens(); 38 | bool hasMoreTokens(); 39 | std::string nextToken(); 40 | int nextIntToken(); 41 | double nextFloatToken(); 42 | std::string nextToken(const std::string& delim); 43 | std::string remainingString(); 44 | std::string filterNextToken(const std::string& filterStr); 45 | 46 | private: 47 | 48 | std::string token_str; 49 | std::string delim; 50 | 51 | }; 52 | 53 | } //namespace csr { 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/Makefile.am: -------------------------------------------------------------------------------- 1 | ## Makefile.am -- Process this file with automake to produce Makefile.in 2 | ## Copyright (C) 2000 Gary V. 
Vaughan 3 | ## 4 | ## This program is free software; you can redistribute it and/or modify 5 | ## it under the terms of the GNU General Public License as published by 6 | ## the Free Software Foundation; either version 2, or (at your option) 7 | ## any later version. 8 | ## 9 | ## This program is distributed in the hope that it will be useful, 10 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | ## GNU General Public License for more details. 13 | ## 14 | ## You should have received a copy of the GNU General Public License 15 | ## along with this program; if not, write to the Free Software 16 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | ## @start 1 19 | AUX_DIST = $(ac_aux_dir)/config.guess \ 20 | $(ac_aux_dir)/config.sub \ 21 | $(ac_aux_dir)/install-sh \ 22 | $(ac_aux_dir)/ltconfig \ 23 | $(ac_aux_dir)/ltmain.sh \ 24 | $(ac_aux_dir)/mdate-sh \ 25 | $(ac_aux_dir)/missing \ 26 | $(ac_aux_dir)/mkinstalldirs 27 | AUX_DIST_EXTRA = $(ac_aux_dir)/readline.m4 \ 28 | $(ac_aux_dir)/sys_errlist.m4 \ 29 | $(ac_aux_dir)/sys_siglist.m4 30 | EXTRA_DIST = bootstrap 31 | 32 | dictdir = $(prefix)/etc 33 | dist_dict_SCRIPTS = data/unigram.txt data/uni.lib data/mmseg.ini 34 | 35 | ## @end 1 36 | AUTOMAKE_OPTIONS = gnits 37 | SUBDIRS = src 38 | ## @start 1 39 | MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure config-h.in \ 40 | stamp-h.in $(AUX_DIST) 41 | 42 | ## @end 1 43 | ACLOCAL = aclocal -I $(ac_aux_dir) 44 | 45 | html: 46 | @echo Making $@ in $(docdir) 47 | @cd $(docdir) && make $@ 48 | ## @end 2 49 | ## @start 1 50 | dist-hook: 51 | (cd $(distdir) && mkdir $(ac_aux_dir)) 52 | for file in $(AUX_DIST) $(AUX_DIST_EXTRA); do \ 53 | cp $$file $(distdir)/$$file; \ 54 | done 55 | ## @end 1 56 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/c,c++,cmake,autotools 3 | 4 | ### Autotools ### 5 | # http://www.gnu.org/software/automake 6 | 7 | Makefile.in 8 | /ar-lib 9 | /mdate-sh 10 | /py-compile 11 | /test-driver 12 | /ylwrap 13 | 14 | # http://www.gnu.org/software/autoconf 15 | 16 | /autom4te.cache 17 | /autoscan.log 18 | /autoscan-*.log 19 | /aclocal.m4 20 | /compile 21 | /config.guess 22 | /config.h.in 23 | /config.sub 24 | /configure 25 | /configure.scan 26 | /depcomp 27 | /install-sh 28 | /missing 29 | /stamp-h1 30 | 31 | # https://www.gnu.org/software/libtool/ 32 | 33 | /ltmain.sh 34 | 35 | # http://www.gnu.org/software/texinfo 36 | 37 | /texinfo.tex 38 | 39 | ### C ### 40 | # Prerequisites 41 | *.d 42 | 43 | # Object files 44 | *.o 45 | *.ko 46 | *.obj 47 | *.elf 48 | 49 | # Linker output 50 | *.ilk 51 | *.map 52 | *.exp 53 | 54 | # Precompiled Headers 55 | *.gch 56 | *.pch 57 | 58 | # Libraries 59 | *.lib 60 | *.a 61 | *.la 62 | *.lo 63 | 64 | # Shared objects (inc. 
Windows DLLs) 65 | *.dll 66 | *.so 67 | *.so.* 68 | *.dylib 69 | 70 | # Executables 71 | *.exe 72 | *.out 73 | *.app 74 | *.i*86 75 | *.x86_64 76 | *.hex 77 | 78 | # Debug files 79 | *.dSYM/ 80 | *.su 81 | *.idb 82 | *.pdb 83 | 84 | # Kernel Module Compile Results 85 | *.mod* 86 | *.cmd 87 | .tmp_versions/ 88 | modules.order 89 | Module.symvers 90 | Mkfile.old 91 | dkms.conf 92 | 93 | ### C++ ### 94 | # Prerequisites 95 | 96 | # Compiled Object files 97 | *.slo 98 | 99 | # Precompiled Headers 100 | 101 | # Compiled Dynamic libraries 102 | 103 | # Fortran module files 104 | *.mod 105 | *.smod 106 | 107 | # Compiled Static libraries 108 | *.lai 109 | 110 | # Executables 111 | 112 | ### CMake ### 113 | CMakeCache.txt 114 | CMakeFiles 115 | CMakeScripts 116 | Testing 117 | Makefile 118 | cmake_install.cmake 119 | install_manifest.txt 120 | compile_commands.json 121 | CTestTestfile.cmake 122 | build 123 | 124 | # End of https://www.gitignore.io/api/c,c++,cmake,autotools 125 | -------------------------------------------------------------------------------- /mmseg/config.m4: -------------------------------------------------------------------------------- 1 | dnl $Id$ 2 | dnl config.m4 for extension mmseg 3 | 4 | dnl Comments in this file start with the string 'dnl'. 5 | dnl Remove where necessary. This file will not work 6 | dnl without editing. 7 | 8 | dnl If your extension references something external, use with: 9 | 10 | PHP_ARG_WITH(mmseg, for mmseg support, 11 | dnl Make sure that the comment is aligned: 12 | [ --with-mmseg Include mmseg support]) 13 | 14 | dnl Otherwise use enable: 15 | 16 | dnl PHP_ARG_ENABLE(mmseg, whether to enable mmseg support, 17 | dnl Make sure that the comment is aligned: 18 | dnl [ --enable-mmseg Enable mmseg support]) 19 | 20 | if test "$PHP_MMSEG" != "no"; then 21 | dnl Write more examples of tests here... 
22 | 23 | dnl # --with-mmseg -> check with-path 24 | SEARCH_PATH="/usr/local /usr /opt /opt/mmseg" # you might want to change this 25 | SEARCH_FOR="/include/mmseg/SegmenterManager.h" # you most likely want to change this 26 | if test -r $PHP_MMSEG/$SEARCH_FOR; then # path given as parameter 27 | MMSEG_DIR=$PHP_MMSEG 28 | else # search default path list 29 | AC_MSG_CHECKING([for mmseg files in default path]) 30 | for i in $SEARCH_PATH ; do 31 | if test -r $i/$SEARCH_FOR; then 32 | MMSEG_DIR=$i 33 | AC_MSG_RESULT(found in $i) 34 | fi 35 | done 36 | fi 37 | 38 | if test -z "$MMSEG_DIR"; then 39 | AC_MSG_RESULT([not found]) 40 | AC_MSG_ERROR([Please reinstall the mmseg distribution]) 41 | fi 42 | 43 | dnl # --with-mmseg -> add include path 44 | PHP_ADD_INCLUDE($MMSEG_DIR/include/mmseg) 45 | 46 | 47 | dnl # --with-mmseg -> check for lib and symbol presence 48 | LIBNAME=mmseg # you may want to change this 49 | LIBSYMBOL=mmseg # you most likely want to change this 50 | 51 | PHP_ADD_LIBRARY_WITH_PATH($LIBNAME, $MMSEG_DIR/lib, MMSEG_SHARED_LIBADD) 52 | PHP_SUBST(MMSEG_SHARED_LIBADD) 53 | PHP_REQUIRE_CXX() 54 | PHP_ADD_LIBRARY(stdc++, 1, EXTRA_LDFLAGS) 55 | PHP_NEW_EXTENSION(mmseg, mmseg.cpp, $ext_shared,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1 -fpermissive -Wno-deprecated -Wno-write-strings) 56 | fi 57 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/apu-hints.m4: -------------------------------------------------------------------------------- 1 | dnl -------------------------------------------------------- -*- autoconf -*- 2 | dnl Copyright 2003-2005 The Apache Software Foundation or its licensors, as 3 | dnl applicable. 4 | dnl 5 | dnl Licensed under the Apache License, Version 2.0 (the "License"); 6 | dnl you may not use this file except in compliance with the License. 
7 | dnl You may obtain a copy of the License at 8 | dnl 9 | dnl http://www.apache.org/licenses/LICENSE-2.0 10 | dnl 11 | dnl Unless required by applicable law or agreed to in writing, software 12 | dnl distributed under the License is distributed on an "AS IS" BASIS, 13 | dnl WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | dnl See the License for the specific language governing permissions and 15 | dnl limitations under the License. 16 | 17 | dnl ----------------------------------------------------------------- 18 | dnl apu-hints.m4: apr-util's autoconf macros for platform-specific hints 19 | dnl 20 | dnl We preload various configure settings depending 21 | dnl on previously obtained platform knowledge. 22 | dnl We allow all settings to be overridden from 23 | dnl the command-line. 24 | 25 | dnl 26 | dnl APU_PRELOAD 27 | dnl 28 | dnl Preload various build parameters based on outside knowledge. 29 | dnl 30 | AC_DEFUN([APU_PRELOAD], [ 31 | if test "x$apu_preload_done" != "xyes" ; then 32 | apu_preload_done="yes" 33 | 34 | echo "Applying apr-util hints file rules for $host" 35 | 36 | case "$host" in 37 | *-dec-osf*) 38 | APR_SETIFNULL(apu_crypt_threadsafe, [1]) 39 | ;; 40 | *-hp-hpux11.*) 41 | APR_SETIFNULL(apu_crypt_threadsafe, [1]) 42 | ;; 43 | *-ibm-aix4*|*-ibm-aix5.1*) 44 | APR_SETIFNULL(apu_iconv_inbuf_const, [1]) 45 | ;; 46 | *-ibm-os390) 47 | APR_SETIFNULL(apu_crypt_threadsafe, [1]) 48 | ;; 49 | *-solaris2*) 50 | APR_SETIFNULL(apu_iconv_inbuf_const, [1]) 51 | APR_SETIFNULL(apu_crypt_threadsafe, [1]) 52 | ;; 53 | *-sco3.2v5*) 54 | APR_SETIFNULL(apu_db_xtra_libs, [-lsocket]) 55 | ;; 56 | esac 57 | 58 | fi 59 | ]) 60 | 61 | 62 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/scoped_ptr.h: -------------------------------------------------------------------------------- 1 | // 2 | // CRF++ -- Yet Another CRF toolkit 3 | // 4 | // $Id: scoped_ptr.h 1588 2007-02-12 
// 09:03:39Z taku $;
//
//  Copyright(C) 2005-2007 Taku Kudo
//
#ifndef CRFPP_SCOPED_PTR_H__
#define CRFPP_SCOPED_PTR_H__

#include <cstddef>
#include <cstring>
#include <string>

namespace CRFPP {

// Owns a single heap object created with `new` and deletes it when the
// owner goes out of scope.  Copying is disabled by declaring (and never
// defining) the copy constructor and copy assignment as private.
template <class T> class scoped_ptr {
 private:
  T * ptr_;
  scoped_ptr(scoped_ptr const &);
  scoped_ptr & operator=(scoped_ptr const &);
  typedef scoped_ptr<T> this_type;

 public:
  typedef T element_type;

  // Takes ownership of p (may be null).
  explicit scoped_ptr(T * p = 0): ptr_(p) {}
  virtual ~scoped_ptr() { delete ptr_; }

  // Destroys the currently owned object and takes ownership of p.
  // Resetting to the pointer that is already owned is a no-op; without
  // this guard the object would be deleted and ptr_ left dangling.
  void reset(T * p = 0) {
    if (p == ptr_) return;
    delete ptr_;
    ptr_ = p;
  }

  // Accessors; none of them check for a null pointer.
  T & operator*() const { return *ptr_; }
  T * operator->() const { return ptr_; }
  T * get() const { return ptr_; }
};

// Same contract as scoped_ptr, for arrays allocated with `new[]`
// (released with `delete[]`).
template <class T> class scoped_array {
 private:
  T * ptr_;
  scoped_array(scoped_array const &);
  scoped_array & operator=(scoped_array const &);
  typedef scoped_array<T> this_type;

 public:
  typedef T element_type;

  // Takes ownership of the array p (may be null).
  explicit scoped_array(T * p = 0): ptr_(p) {}
  virtual ~scoped_array() { delete [] ptr_; }

  // Releases the current array and takes ownership of p.  Self-reset is
  // a no-op for the same reason as in scoped_ptr::reset().
  void reset(T * p = 0) {
    if (p == ptr_) return;
    delete [] ptr_;
    ptr_ = p;
  }

  T & operator*() const { return *ptr_; }
  T * operator->() const { return ptr_; }
  T * get() const { return ptr_; }
  // Unchecked element access.
  T & operator[](size_t i) const { return ptr_[i]; }
};

// A scoped_array<char> that always holds a NUL-terminated private copy of
// the string it was given.
class scoped_string: public scoped_array<char> {
 public:
  // Starts out holding the empty string.
  explicit scoped_string() { reset_string(""); }
  explicit scoped_string(const std::string &str) {
    reset_string(str);
  }

  // Replaces the held buffer with a copy of str (terminator included).
  // The copy is made before the old buffer is released, so passing a
  // string that aliases the current contents is safe.
  void reset_string(const std::string &str) {
    const size_t bytes = str.size() + 1;
    char *p = new char[bytes];
    std::memcpy(p, str.c_str(), bytes);
    reset(p);
  }

  // Replaces the held buffer with a copy of the C string str.
  // A null pointer is treated as the empty string instead of crashing
  // inside strlen().
  void reset_string(const char *str) {
    if (!str) str = "";
    const size_t bytes = std::strlen(str) + 1;
    char *p = new char[bytes];
    std::memcpy(p, str, bytes);
    reset(p);
  }
};
}
#endif
-------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_SegmenterManager_h 25 | #define css_SegmenterManager_h 26 | 27 | #include 28 | #include "freelist.h" 29 | 30 | #include "UnigramDict.h" 31 | #include "SynonymsDict.h" 32 | #include "ThesaurusDict.h" 33 | #include "Segmenter.h" 34 | 35 | namespace css { 36 | 37 | //class CrfSegmenter; 38 | using namespace CRFPP; 39 | /** @author Monan Li 40 | */ 41 | class SegmenterManager { 42 | /* {TemplatePath=D:\cos\deps\Segment\doc\}*/ 43 | public: 44 | /** 45 | * Return a newly created segmenter 46 | */ 47 | Segmenter *getSegmenter( bool bFromPool = true); 48 | 49 | virtual int init(const char* path, u1 method = SEG_METHOD_NGRAM); 50 | void loadconfig(const char* confile); 51 | void clear(); 52 | 53 | SegmenterManager(); 54 | virtual ~SegmenterManager(); 55 | const char* what_(){ return m_msg; } 56 | public: 57 | const static u1 SEG_METHOD_NGRAM = 0x1; 58 | protected: 59 | CRFPP::FreeList seg_freelist_; 60 | UnigramDict m_uni; 61 | UnigramDict m_kw; 62 | UnigramDict m_weight; 63 | SynonymsDict m_sym; 64 
| ThesaurusDict m_thesaurus; 65 | Segmenter_ConfigObj m_config; 66 | u1 m_method; 67 | u1 m_inited; 68 | char m_msg[1024]; 69 | }; 70 | 71 | } /* End of namespace css */ 72 | #endif 73 | 74 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/csr_typedefs.h: -------------------------------------------------------------------------------- 1 | #ifndef _CSR_TYPEDEFS_H_ 2 | #define _CSR_TYPEDEFS_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #ifndef NULL 9 | #define NULL 0 10 | #endif 11 | 12 | typedef char i1; 13 | typedef unsigned char u1; 14 | typedef short i2; 15 | typedef unsigned short u2; 16 | typedef int i4; 17 | typedef unsigned int u4; 18 | typedef long long i8; 19 | typedef unsigned long long u8; 20 | 21 | typedef u4 csr_offset_t; 22 | /* 23 | #if U8_AVAILABLE 24 | typedef signed long int s8; 25 | typedef unsigned long int u8; 26 | #else 27 | typedef struct {u4 low, high;} u8; 28 | #define s8 u8 29 | #endif 30 | */ 31 | 32 | #define CSR_INT8_MIN -128 33 | #define CSR_INT8_MAX 127 34 | #define CSR_UINT8_MAX 255 35 | 36 | #define CSR_INT16_MIN -32768 37 | #define CSR_INT16_MAX 32767 38 | #define CSR_UINT16_MAX 65535 39 | 40 | /* 41 | * Note that "int" is 32 bits on all currently supported Unix-like operating 42 | * systems, but "long" can be either 32 bits or 64 bits, thus the 32 bit 43 | * constants are not qualified with "L". 
44 | */ 45 | #define CSR_INT32_MIN -2147483648 46 | #define CSR_INT32_MAX 2147483647 47 | #define CSR_UINT32_MAX 4294967295U 48 | 49 | #define CSR_INT64_MIN -9223372036854775808LL 50 | #define CSR_INT64_MAX 9223372036854775807LL 51 | #define CSR_UINT64_MAX 18446744073709551615ULL 52 | 53 | 54 | #ifdef WIN32 55 | #undef HIBYTE 56 | #undef LOBYTE 57 | #undef MAKEWORD 58 | #endif 59 | 60 | #ifndef WIN32 61 | typedef unsigned char BYTE; 62 | typedef unsigned short WORD; 63 | #ifndef _WINDEF_ 64 | typedef unsigned int DWORD; 65 | #endif 66 | #endif 67 | 68 | #ifndef WIN32 69 | 70 | #define HIBYTE(W) (((W) >> 8) & 0xFF) 71 | #define LOBYTE(W) ((W) & 0xFF) 72 | #define MAKEWORD(low,high) \ 73 | ((WORD)(((BYTE)(low)) | ((WORD)((BYTE)(high))) << 8)) 74 | 75 | #endif //end win32 76 | 77 | #undef HIWORD 78 | #ifndef HIWORD 79 | #define HIWORD(dw) ((dw)>>16) 80 | #endif 81 | 82 | #undef LOWORD 83 | #ifndef LOWORD 84 | #define LOWORD(dw) ((dw)&0xffff) 85 | #endif 86 | 87 | #undef MAKEDWORD 88 | 89 | #ifndef MAKEDWORD 90 | #define MAKEDWORD(hw,lw) (((hw)<<16)|(lw)) 91 | #endif 92 | 93 | #ifdef __cplusplus 94 | } 95 | #endif 96 | #endif 97 | 98 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/UnigramDict.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_UnigramDict_h 25 | #define css_UnigramDict_h 26 | 27 | #include 28 | 29 | #include "darts.h" 30 | 31 | namespace css { 32 | class UnigramCorpusReader; 33 | } /* End of namespace css */ 34 | 35 | namespace css { 36 | 37 | 38 | /** 39 | * How to find item fast is a real problem here. 40 | * @return the string(utf-8,encoded) of the id. 41 | */ 42 | class UnigramDict { 43 | 44 | public: 45 | typedef Darts::DoubleArray::result_pair_type result_pair_type; 46 | UnigramDict() {}; 47 | virtual ~UnigramDict() {}; 48 | public: 49 | 50 | virtual int load(const char* filename); 51 | virtual int isLoad(); 52 | 53 | /** 54 | * This function should be used only, in Debug mode. 55 | */ 56 | virtual std::string getString(int id); 57 | 58 | 59 | /** 60 | * Find all word item in UnigramDict, which buf as a prefix 61 | * @return total items found 62 | */ 63 | virtual int findHits(const char* buf, result_pair_type *result = NULL, size_t result_len = 0, int keylen = 0); 64 | 65 | virtual int import(UnigramCorpusReader &ur); 66 | 67 | virtual int save(const char* filename); 68 | 69 | virtual int exactMatch(const char* key, int *id = NULL); 70 | protected: 71 | Darts::DoubleArray m_da; 72 | }; 73 | 74 | } /* End of namespace css */ 75 | #endif 76 | 77 | -------------------------------------------------------------------------------- /mmseg/php_mmseg.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PHP_MMSEG_H 3 | #define PHP_MMSEG_H 4 | 5 | extern zend_module_entry mmseg_module_entry; 6 | #define phpext_mmseg_ptr &mmseg_module_entry 7 | 8 | #define PHP_MMSEG_VERSION "0.3" /* Replace with version number for your 
extension */ 9 | 10 | #ifdef PHP_WIN32 11 | # define PHP_MMSEG_API __declspec(dllexport) 12 | #elif defined(__GNUC__) && __GNUC__ >= 4 13 | # define PHP_MMSEG_API __attribute__ ((visibility("default"))) 14 | #else 15 | # define PHP_MMSEG_API 16 | #endif 17 | 18 | extern "C" { 19 | #ifdef ZTS 20 | #include "TSRM.h" 21 | #endif 22 | } 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include "SegmenterManager.h" 34 | #include "Segmenter.h" 35 | #include "csr_utils.h" 36 | 37 | using namespace css; 38 | 39 | #define PHP_MMSEG_DESCRIPTOR_RES_NAME "mmseg segmenter manager resource" 40 | 41 | PHP_MINIT_FUNCTION(mmseg); 42 | PHP_MSHUTDOWN_FUNCTION(mmseg); 43 | PHP_RINIT_FUNCTION(mmseg); 44 | PHP_RSHUTDOWN_FUNCTION(mmseg); 45 | PHP_MINFO_FUNCTION(mmseg); 46 | 47 | // 使用全局的mmseg数据进行分词 48 | PHP_FUNCTION(mmseg_segment); 49 | 50 | // 初始化mmseg配置,或者mmseg句柄 51 | PHP_FUNCTION(mmseg_open); 52 | // 关闭句柄,释放空间 53 | PHP_FUNCTION(mmseg_close); 54 | // 字典生成 55 | PHP_FUNCTION(mmseg_gendict); 56 | // 生成特殊短语字典 57 | PHP_FUNCTION(mmseg_gensynonyms); 58 | // 生成同义词词典 59 | PHP_FUNCTION(mmseg_genthesaurus); 60 | 61 | ZEND_BEGIN_MODULE_GLOBALS(mmseg) 62 | void* mgr; /* (SegmenterManager*) */ 63 | time_t dict_mtime; 64 | ZEND_END_MODULE_GLOBALS(mmseg) 65 | 66 | /* Always refer to the globals in your function as MMSEG_G(variable). 67 | You are encouraged to rename these macros something shorter, see 68 | examples in any other php module directory. 
69 | */ 70 | #if PHP_MAJOR_VERSION < 7 71 | 72 | #ifdef ZTS 73 | #define MMSEG_G(v) TSRMG(mmseg_globals_id, zend_mmseg_globals *, v) 74 | #else 75 | #define MMSEG_G(v) (mmseg_globals.v) 76 | #endif 77 | 78 | #else 79 | #define MMSEG_G(v) ZEND_MODULE_GLOBALS_ACCESSOR(mmseg, v) 80 | #endif 81 | 82 | #if defined(ZTS) && defined(COMPILE_DL_MMSEG) 83 | ZEND_TSRMLS_CACHE_EXTERN() 84 | #endif 85 | 86 | #ifdef MMSEG_DEBUG 87 | #define MMSEG_LOG(w) php_error(E_WARNING, w) 88 | #else 89 | #define MMSEG_LOG(w) ; 90 | #endif 91 | 92 | #if PHP_MAJOR_VERSION >= 7 93 | #define zend_rsrc_list_entry zend_resource 94 | #endif 95 | 96 | #endif /* PHP_MMSEG_H */ 97 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/Singleton.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
/*
 * Author:
 *   Li monan
 *
 * ***** END LICENSE BLOCK ***** */

#ifndef CSR_SINGLETON_H
#define CSR_SINGLETON_H

#ifdef HAVE_ATEXIT
#  ifdef HAVE_CSTDLIB
#include <cstdlib>
using std::atexit;
#  else
#include <stdlib.h>
#  endif
#endif

/**
 * Singleton holder: keeps one lazily created, heap-allocated T per
 * instantiation in a static pointer.
 * No locking is performed anywhere in this class.
 * FIXME: should I impl HAVE_ATEXIT mode? like bzflag?
 */
template <typename T>
class CSR_Singleton
{
    /** The single instance, or 0 while none exists. */
    static T* ms_instance;

public:
    /**
     * Returns the instance, creating it with `new T()` on first use
     * (or on first use after Free()).
     * \return pointer to the only instance
     */
    static T* Get();

    /**
     * Deletes the instance, if any, and resets the pointer to 0.
     */
    static void Free();

protected:
    /** Default constructor; protected so only derived types construct. */
    CSR_Singleton();

    /** Destructor. */
    virtual ~CSR_Singleton();

    /**
     * Tear-down hook with the signature atexit() expects; safe to call
     * when no instance exists.
     */
    static void destroy() {
        if ( ms_instance == 0 )
            return;
        delete ms_instance;
        ms_instance = 0;
    }
};

template <typename T>
T* CSR_Singleton<T>::ms_instance = 0;

template <typename T>
CSR_Singleton<T>::CSR_Singleton()
{
}

template <typename T>
CSR_Singleton<T>::~CSR_Singleton()
{
}

template <typename T>
T* CSR_Singleton<T>::Get()
{
    if ( ms_instance )
        return ms_instance;

    ms_instance = new T();
#ifdef HAVE_ATEXIT
    // destroy the singleton when the application terminates
    atexit(CSR_Singleton<T>::destroy);
#endif
    return ms_instance;
}

template <typename T>
void CSR_Singleton<T>::Free()
{
    // Same delete-and-null sequence as the atexit hook.
    destroy();
}

#endif // CSR_SINGLETON_H
Generated from configure.in by autoheader. */ 3 | 4 | /* Define to 1 if you have the header file. */ 5 | //#define HAVE_DLFCN_H 1 6 | 7 | /* Define to 1 if you have the header file. */ 8 | //#define HAVE_FCNTL_H 1 9 | 10 | /* Define to 1 if you have the `getpagesize' function. */ 11 | //#define HAVE_GETPAGESIZE 1 12 | 13 | /* Define to 1 if you have the header file. */ 14 | #define HAVE_INTTYPES_H 1 15 | 16 | /* Define to 1 if you have the header file. */ 17 | /* #undef HAVE_IO_H */ 18 | 19 | /* Define to 1 if you have the header file. */ 20 | #define HAVE_LIMITS_H 1 21 | 22 | /* Define to 1 if you have the header file. */ 23 | #define HAVE_MEMORY_H 1 24 | 25 | /* Define to 1 if you have a working `mmap' system call. */ 26 | //#define HAVE_MMAP 1 27 | 28 | /* Define to 1 if you have the header file. */ 29 | //#define HAVE_NETINET_IN_H 1 30 | 31 | /* Define to 1 if you have the header file. */ 32 | #define HAVE_STDINT_H 1 33 | 34 | /* Define to 1 if you have the header file. */ 35 | #define HAVE_STDLIB_H 1 36 | 37 | /* Define to 1 if you have the header file. */ 38 | #define HAVE_STRINGS_H 1 39 | 40 | /* Define to 1 if you have the header file. */ 41 | #define HAVE_STRING_H 1 42 | 43 | /* Define to 1 if you have the header file. */ 44 | #define HAVE_SYS_FILE_H 1 45 | 46 | /* Define to 1 if you have the header file. */ 47 | //#define HAVE_SYS_PARAM_H 1 48 | 49 | /* Define to 1 if you have the header file. */ 50 | #define HAVE_SYS_STAT_H 1 51 | 52 | /* Define to 1 if you have the header file. */ 53 | #define HAVE_SYS_TYPES_H 1 54 | 55 | /* Define to 1 if you have the header file. */ 56 | //#define HAVE_UNISTD_H 1 57 | 58 | /* Define to 1 if you have the header file. */ 59 | /* #undef HAVE_WINDOWS_H */ 60 | #define HAVE_WINDOWS_H 1 61 | 62 | /* Name of package */ 63 | #define PACKAGE "libsegment" 64 | 65 | /* Define to the address where bug reports for this package should be sent. 
*/ 66 | #define PACKAGE_BUGREPORT "" 67 | 68 | /* Define to the full name of this package. */ 69 | #define PACKAGE_NAME "" 70 | 71 | /* Define to the full name and version of this package. */ 72 | #define PACKAGE_STRING "" 73 | 74 | /* Define to the one symbol short name of this package. */ 75 | #define PACKAGE_TARNAME "" 76 | 77 | /* Define to the version of this package. */ 78 | #define PACKAGE_VERSION "" 79 | 80 | /* Define to 1 if you have the ANSI C header files. */ 81 | #define STDC_HEADERS 1 82 | 83 | /* Version number of package */ 84 | #define VERSION "0.1" 85 | 86 | /* Define to `long' if does not define. */ 87 | /* #undef off_t */ 88 | 89 | /* Define to `unsigned' if does not define. */ 90 | /* #undef size_t */ 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | php-mmseg 2 | ========= 3 | 4 | ## 简介 5 | 6 | 中文分词引擎LibMMSeg的php扩展 7 | 8 | ## 依赖 9 | 10 | *coreseek官网已经关闭了,所以已经无法从官方文档查询到软件包的细节, 如果需要查询官网信息,可以参考 archive.org 上的备份* 11 | 12 | https://web.archive.org/web/20161122124307/http://www.coreseek.cn:80/products-install/faq/ 13 | 14 | 1. php5.4 以上版本 (其他版本目前还没有测试) 15 | 2. LibMMSeg 3.2.14版本 http://www.coreseek.cn/opensource/mmseg/ 16 | ``` 17 | 关于mmseg的安装,参考 http://www.coreseek.cn/products/products-install/ 18 | ``` 19 | 20 | ## 安装 21 | 22 | > 目前在Linux和Mac下测试通过,Windows下还未编译过 23 | 24 | 1. 首先安装LibMMSeg ( http://www.coreseek.cn/uploads/csft/3.2/mmseg-3.2.14.tar.gz ) 25 | **我将LibMMSeg保存一份到我们的git仓库的dependencies 下** 26 | 27 | ``` 28 | cd dependencies/mmseg-3.2.14 29 | ./bootstrap 30 | ./configure --prefix=/opt/ 31 | make 32 | sudo make install 33 | ``` 34 | 2. 安装php-mmseg 35 | 36 | ``` 37 | cd php-mmseg/mmseg 38 | phpize 39 | ./configure --with-mmseg=/opt 40 | make 41 | sudo make install 42 | ``` 43 | 3. 
配置php-mmseg 44 | 45 | 在php.ini中增加 46 | ``` 47 | extension=mmseg.so 48 | mmseg.dict_dir=/opt/etc 49 | mmseg.autoreload=1 50 | ``` 51 | 注意, `mmseg.dict_dir` 配置的是mmseg配置文件和字典所在目录的地址, `mmseg.autoreload` 如果设置为1 则字典文件(特指uni.lib), 在更新之后,系统会自动重新载入配置文件和字典。 52 | 53 | 54 | ## 使用 55 | 56 | 1. 全局字典模式 57 | 58 | 该模式使用全局的字典文件,不需要在每次调用分词代码之前载入字典,但是要求必须在 php.ini 中通过 `mmseg.dict_dir` 配置好字典所在的目录 59 | ```php 60 | $ret = mmseg_segment("你好,世界"); 61 | var_dump($ret); 62 | ``` 63 | 2. 在程序中引入字典 64 | 65 | ```php 66 | $mmseg = mmseg_open("/opt/mmseg/etc"); 67 | $ret = mmseg_segment($mmseg, "你好,世界"); 68 | mmseg_close($mmseg); 69 | var_dump($ret); 70 | ``` 71 | 输出结果(使用mmseg默认的字典) 72 | ```php 73 | array(3) { 74 | [0]=> 75 | string(6) "你好" 76 | [1]=> 77 | string(3) "," 78 | [2]=> 79 | string(6) "世界" 80 | } 81 | ``` 82 | 3. 生成字典文件 83 | 84 | 类似命令行的如下命令, 85 | ``` 86 | mmseg -u unigram.txt 87 | ``` 88 | 89 | 我们可以使用下面的语句生成词典文件,其中第一个参数为要输入的源文本格式的词典文件, 如unigram.txt , 第二个参数为输出的供分词使用的词典文件 90 | 91 | 代码 92 | ```php 93 | $isDone = mmseg_gendict(dirname(__FILE__) . "/" . "unigram.txt", dirname(__FILE__) . "/" . "unigram.txt.uni" ); 94 | 95 | ``` 96 | 97 | 原文本格式的词典文件格式如下(参考LibMMSeg文档 : http://www.coreseek.cn/opensource/mmseg/ ) 98 | ``` 99 | 河 187 100 | x:187 101 | 造假者 1 102 | x:1 103 | 台北队 1 104 | x:1 105 | 湖边 1 106 | ``` 107 | 4. 生成特殊短语,同义词词典 108 | 109 | ```php 110 | // 生成特殊短语词典 111 | mmseg_gensynonyms(dirname(__FILE__) . "/" . "synonyms.txt", dirname(__FILE__) . "/" . "synonyms.dat" ); 112 | // 生成同义词词典 113 | mmseg_genthesaurus(dirname(__FILE__) . "/" . "thesaurus.txt", dirname(__FILE__) . "/" . "thesaurus.dat" ); 114 | 115 | ``` 116 | 117 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/Makefile.am: -------------------------------------------------------------------------------- 1 | ## Makefile.am -- Process this file with automake to produce Makefile.in 2 | ## Copyright (C) 2000 Gary V. 
Vaughan 3 | ## 4 | ## This program is free software; you can redistribute it and/or modify 5 | ## it under the terms of the GNU General Public License as published by 6 | ## the Free Software Foundation; either version 2, or (at your option) 7 | ## any later version. 8 | ## 9 | ## This program is distributed in the hope that it will be useful, 10 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | ## GNU General Public License for more details. 13 | ## 14 | ## You should have received a copy of the GNU General Public License 15 | ## along with this program; if not, write to the Free Software 16 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | MAINTAINERCLEANFILES = Makefile.in common.h stamp-common 19 | EXTRA_DIST = common-h.in 20 | 21 | AM_CFLAGS = -fPIC 22 | AM_CPPFLAGS = -fPIC -I$(top_builddir) -I$(top_srcdir) -I$(top_srcdir)/src/css -I$(top_srcdir)/src/utils -D_REENTRANT -D_LARGEFILE64_SOURCE 23 | 24 | pkginclude_HEADERS = css/ICorpusReader.h css/SegmenterManager.h css/tolowercase.h css/UnigramRecord.h \ 25 | css/mmthunk.h css/SegmentPkg.h css/UnigramCorpusReader.h \ 26 | css/Segmenter.h css/SynonymsDict.h css/UnigramDict.h \ 27 | utils/csr_mmap.h utils/darts.h utils/scoped_ptr.h utils/Utf8_16.h \ 28 | utils/csr_assert.h utils/csr_pool.h utils/freelist.h utils/Singleton.h \ 29 | utils/csr.h utils/csr_utils.h utils/os.h utils/StringTokenizer.h \ 30 | csr_typedefs.h css/ThesaurusDict.h iniparser/dictionary.h \ 31 | iniparser/iniparser.h utils/bsd_getopt.h utils/bsd_getopt_win.h 32 | 33 | 34 | #noinst_HEADERS = 35 | 36 | ## @start 1 37 | lib_LTLIBRARIES = libmmseg.la 38 | libmmseg_la_LIBADD = 39 | libmmseg_la_LDFLAGS = -static 40 | libmmseg_la_SOURCES = css/mmthunk.cpp css/SegmenterManager.cpp css/SynonymsDict.cpp \ 41 | css/UnigramDict.cpp css/segmenter.cpp css/SegmentPkg.cpp \ 42 | css/UnigramCorpusReader.cpp css/UnigramRecord.cpp \ 43 | 
utils/assert.c utils/bsd_getopt.c utils/csr_mmap.c utils/csr_utils.c utils/Utf8_16.cpp utils/StringTokenizer.cpp iniparser/dictionary.c iniparser/iniparser.c css/ThesaurusDict.cpp 44 | 45 | bin_PROGRAMS = mmseg 46 | mmseg_LDADD = $(top_builddir)/src/libmmseg.la 47 | mmseg_SOURCES = mmseg_main.cpp 48 | 49 | ## @end 1 50 | ## @start 2 51 | # Regenerate common.h with config.status whenever common-h.in changes. 52 | common.h: stamp-common 53 | @: 54 | stamp-common: $(srcdir)/common-h.in $(top_builddir)/config.status 55 | cd $(top_builddir) \ 56 | && CONFIG_FILES= CONFIG_HEADERS= CONFIG_OTHER=sic/common.h \ 57 | $(SHELL) ./config.status 58 | echo timestamp > $@ 59 | ## @end 2 60 | 61 | # Don't distribute common.h, since it is build host dependent! 62 | dist-hook: 63 | rm -f $(distdir)/common.h 64 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/mdate-sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Get modification time of a file or directory and pretty-print it. 3 | # Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. 4 | # written by Ulrich Drepper , June 1995 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2, or (at your option) 9 | # any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program; if not, write to the Free Software Foundation, 18 | # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | 20 | # Prevent date giving response in another language. 21 | LANG=C 22 | export LANG 23 | LC_ALL=C 24 | export LC_ALL 25 | LC_TIME=C 26 | export LC_TIME 27 | 28 | # Get the extended ls output of the file or directory. 29 | # On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below. 30 | if ls -L /dev/null 1>/dev/null 2>&1; then 31 | set - x`ls -L -l -d $1` 32 | else 33 | set - x`ls -l -d $1` 34 | fi 35 | # The month is at least the fourth argument 36 | # (3 shifts here, the next inside the loop). 37 | shift 38 | shift 39 | shift 40 | 41 | # Find the month. Next argument is day, followed by the year or time. 42 | month= 43 | until test $month 44 | do 45 | shift 46 | case $1 in 47 | Jan) month=January; nummonth=1;; 48 | Feb) month=February; nummonth=2;; 49 | Mar) month=March; nummonth=3;; 50 | Apr) month=April; nummonth=4;; 51 | May) month=May; nummonth=5;; 52 | Jun) month=June; nummonth=6;; 53 | Jul) month=July; nummonth=7;; 54 | Aug) month=August; nummonth=8;; 55 | Sep) month=September; nummonth=9;; 56 | Oct) month=October; nummonth=10;; 57 | Nov) month=November; nummonth=11;; 58 | Dec) month=December; nummonth=12;; 59 | esac 60 | done 61 | 62 | day=$2 63 | 64 | # Here we have to deal with the problem that the ls output gives either 65 | # the time of day or the year. 66 | case $3 in 67 | *:*) set `date`; eval year=\$$# 68 | case $2 in 69 | Jan) nummonthtod=1;; 70 | Feb) nummonthtod=2;; 71 | Mar) nummonthtod=3;; 72 | Apr) nummonthtod=4;; 73 | May) nummonthtod=5;; 74 | Jun) nummonthtod=6;; 75 | Jul) nummonthtod=7;; 76 | Aug) nummonthtod=8;; 77 | Sep) nummonthtod=9;; 78 | Oct) nummonthtod=10;; 79 | Nov) nummonthtod=11;; 80 | Dec) nummonthtod=12;; 81 | esac 82 | # For the first six month of the year the time notation can also 83 | # be used for files modified in the last year. 84 | if (expr $nummonth \> $nummonthtod) > /dev/null; 85 | then 86 | year=`expr $year - 1` 87 | fi;; 88 | *) year=$3;; 89 | esac 90 | 91 | # The result. 
92 | echo $day $month $year 93 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/SynonymsDict.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_SynonymsDict_h 25 | #define css_SynonymsDict_h 26 | 27 | #include 28 | #include 29 | #include "darts.h" 30 | #include "csr_mmap.h" 31 | 32 | namespace css { 33 | 34 | typedef struct _csr_sybarray_trie_tag{ 35 | i4 base; 36 | u4 check; 37 | //u4 flag; //used to tell how may features. only low-4bit used now. 38 | size_t offset; //the base offset. 39 | }_csr_sybarray_trie; 40 | 41 | /** 42 | * How to find item fast is a real problem here. 43 | * @return the string(utf-8,encoded) of the id. 
44 | */ 45 | class SynonymsDict { 46 | 47 | public: 48 | typedef Darts::DoubleArray::result_pair_type result_pair_type; 49 | typedef struct _tag_result_pair_type { 50 | i4 value; 51 | u1 length; 52 | i4 dict_id; 53 | }Result; 54 | 55 | public: 56 | SynonymsDict():m_file(NULL),array_(NULL){ 57 | string_pool = NULL; 58 | }; 59 | virtual ~SynonymsDict(){ 60 | if(m_file){ 61 | csr_munmap_file(m_file); 62 | } 63 | } 64 | 65 | virtual int load(const char* filename); 66 | 67 | virtual int import(const char* filename); 68 | 69 | virtual int save(const char* filename); 70 | 71 | virtual const char* exactMatch(const char* key, int len = 0); 72 | virtual const char* maxMatch(const char* key, int &len); 73 | 74 | protected: 75 | _csr_mmap_t* m_file; 76 | Darts::DoubleArray m_da; 77 | std::map rKeys; 78 | //std::set rKeys; 79 | std::map lKeys; 80 | 81 | size_t m_string_pool_size; 82 | _csr_sybarray_trie * array_; 83 | const char* string_pool; 84 | 85 | typedef i4 array_type_; 86 | typedef u4 array_u_type_; 87 | typedef u1 node_u_type_; 88 | 89 | inline void set_result(Result& x, i4 r, u1 l) { 90 | x.value = r; 91 | x.length = l; 92 | x.dict_id = 0; 93 | } 94 | inline void set_result(Result& x, i4 r, u1 l,i4 id) { 95 | x.value = r; 96 | x.length = l; 97 | x.dict_id = id; 98 | } 99 | 100 | protected: 101 | int exactMatchID(const char* key); 102 | }; 103 | 104 | } /* End of namespace css */ 105 | #endif 106 | 107 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr_pool.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Pool memory allocation 3 | * 4 | */ 5 | 6 | // not thread safe (a pool cannot be shared by threads safely) 7 | 8 | /* 9 | -- principle: 10 | 11 | - user operations on a pool of objects of type T are: 12 | - T *draw() : obtain a unused slot to store an object T 13 | - void drop(T *) : realease a slot 14 | 15 | -- implementation: 16 | 17 | - a pool for 
objects T is: 18 | 19 | * blocks[64] : an array of allocated blocks of memory: 20 | |---0--> block with capacity 64 21 | |---1--> block with capacity 64 22 | |---2--> block with capacity 128 23 | |---3--> block with capacity 128 24 | |---4--> block with capacity 256 25 | |---5--> block with capacity 256 26 | |---6--> block with capacity 512 27 | |---7--> not yet allocated 28 | : 29 | |---k--> not yet allocated (future capacity ~ 2^(6+k/2)) 30 | : 31 | '--63--> not yet allocated 32 | * cblock : the index of the next unallocated block (here 7). 33 | * next : a pointer to an unused slot inside an allocated bloc 34 | 35 | - the first bytes of an unallocated slot inside a bloc are used to store a 36 | pointer to some other unallocated slot. (this way, we keep a list of all 37 | unused slots starting at ) 38 | 39 | - insertions and deletions in this list are done at the root . 40 | if points to NULL (no slots are availlable) when a draw() 41 | operation is performed a new block is allocated, and the unused slots 42 | list is filled with the allocated slots. 43 | 44 | - memory is freed only at pool's deletion. 45 | 46 | */ 47 | 48 | #include 49 | 50 | #ifndef _CSR_MEM_POOL_H_ 51 | #define _CSR_MEM_POOL_H_ 52 | 53 | namespace csr { 54 | 55 | template 56 | class pool { 57 | 58 | public: 59 | 60 | pool() 61 | { 62 | cblock = 0; 63 | size = sizeof(T) > sizeof(void *) ? 
sizeof(T) : sizeof(void *); 64 | next = NULL; 65 | memset(block,0,sizeof(block)); 66 | } 67 | 68 | ~pool() 69 | { 70 | for (int k = 0; k < cblock; k++) 71 | free(block[k]); 72 | } 73 | 74 | void clear() { 75 | next = NULL; 76 | cblock = 0; 77 | } 78 | 79 | T *draw() 80 | { 81 | if (!next) addblock(); 82 | void *p = next; 83 | next = *(void **)p; 84 | return (T *) p; 85 | } 86 | 87 | void drop(T *p) 88 | { 89 | *(void **)p = next; 90 | next = (void *) p; 91 | } 92 | 93 | private: 94 | 95 | int size; 96 | int cblock; 97 | void *block[64]; //enough to store unlimited number of objects 98 | void *next; 99 | 100 | void addblock() 101 | { 102 | int i = cblock++; 103 | int blocksize = 1 << (6 + (i/2)); 104 | if(!block[i]) 105 | block[i] = (void *)malloc(blocksize * size); 106 | /* 107 | else{ 108 | memset(block[i],0,blocksize * size); 109 | } 110 | */ 111 | if (!block[i]) throw std::bad_alloc(); 112 | char *p = (char *)block[i]; 113 | for (int k = 0; k < blocksize - 1; k++) 114 | { 115 | *(void**)p = (void *)(p + size); 116 | p += size; 117 | } 118 | *(void **)p = next; 119 | next = block[i]; 120 | //printf("done\n");//debug 121 | } 122 | 123 | }; 124 | 125 | } //namespace csr 126 | #endif 127 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/python/mmseg_interface.cpp: -------------------------------------------------------------------------------- 1 | #include "mmseg_interface.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "SegmenterManager.h" 12 | #include "Segmenter.h" 13 | #include "csr_utils.h" 14 | 15 | using namespace std; 16 | using namespace css; 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | PyObject *init(PyObject *self, PyObject *args) { 23 | char *fromPython; 24 | PyObject *module = PyImport_ImportModule("cmmseg"); 25 | // 26 | /* 27 | PyObject *module_dict = PyModule_GetDict(module); \ 28 | PyObject 
*c_api_object = PyDict_GetItemString(module_dict, "_C_API"); \ 29 | if (PyCObject_Check(c_api_object)) { \ 30 | PyCurses_API = (void **)PyCObject_AsVoidPtr(c_api_object); \ 31 | } \ 32 | */ 33 | { 34 | PyObject *module_dict = PyModule_GetDict(module); 35 | if(module_dict) { 36 | PyObject *c_api_object = PyDict_GetItemString(module_dict, "__segmgr"); 37 | if (c_api_object && PyCObject_Check(c_api_object)) 38 | return self; 39 | } 40 | } 41 | if (!PyArg_Parse(args, "(s)", &fromPython)){ 42 | PyErr_SetString(PyExc_ValueError, "invalid dict_path"); 43 | return NULL; 44 | }else { 45 | SegmenterManager* mgr = new SegmenterManager(); 46 | int nRet = 0; 47 | if(fromPython) 48 | nRet = mgr->init(fromPython); 49 | if(nRet == 0){ 50 | //return self; 51 | }else { 52 | delete mgr; 53 | PyErr_SetString(PyExc_ValueError, "invalid dict_path"); 54 | return NULL; 55 | } 56 | //add to module obj 57 | { 58 | //bind to self 59 | PyObject *c_api_object; 60 | c_api_object = PyCObject_FromVoidPtr((void *)mgr, NULL); 61 | if (c_api_object != NULL) 62 | PyModule_AddObject(module, "__segmgr", c_api_object); 63 | } 64 | return module; 65 | } 66 | } 67 | 68 | PyObject *segment(PyObject *self, PyObject *args) { 69 | 70 | PyObject *module = PyImport_ImportModule("cmmseg"); 71 | SegmenterManager* mgr = NULL; 72 | { 73 | PyObject *module_dict = PyModule_GetDict(module); 74 | if(!module_dict) { 75 | PyErr_SetString(PyExc_ValueError, "Needs load segment dictionary library frist!"); 76 | return NULL; 77 | } 78 | PyObject *c_api_object = PyDict_GetItemString(module_dict, "__segmgr"); 79 | 80 | if (!c_api_object || !PyCObject_Check(c_api_object)) { 81 | PyErr_SetString(PyExc_ValueError, "Needs load segment dictionary library frist!"); 82 | return NULL; 83 | } 84 | mgr = (SegmenterManager*)PyCObject_AsVoidPtr(c_api_object); 85 | } 86 | 87 | Segmenter* seg = mgr->getSegmenter(); 88 | char *fromPython; 89 | 90 | if (!PyArg_Parse(args, "(s)", &fromPython)) 91 | return NULL; 92 | else { 93 | 
seg->setBuffer((u1*)fromPython, (u4)strlen(fromPython)); 94 | 95 | PyObject* seg_result = PyList_New(0); 96 | while(1) 97 | { 98 | u2 len = 0, symlen = 0; 99 | char* tok = (char*)seg->peekToken(len,symlen); 100 | if(!tok || !*tok || !len){ 101 | break; 102 | } 103 | //append new item 104 | PyList_Append(seg_result, PyString_FromStringAndSize(tok,len)); 105 | seg->popToken(len); 106 | } 107 | return seg_result; 108 | } 109 | } 110 | 111 | #ifdef __cplusplus 112 | } 113 | #endif -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr_utils.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | #include "csr_utils.h" 29 | 30 | #include 31 | //#define _CLCOMPILER_MSVC 0 32 | 33 | #if WIN32 34 | #include 35 | #include 36 | #else 37 | #include 38 | #include 39 | #include 40 | #endif 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | int 47 | csr_atoi (const char *nptr) 48 | { 49 | return (int) strtol (nptr, (char **) NULL, 10); 50 | } 51 | 52 | u2 u2_length(const u2* p){ 53 | const u2* ptr = p; 54 | while(*ptr) 55 | ptr++; 56 | return ptr-p; 57 | } 58 | 59 | int Cha_lineno, Cha_lineno_error; 60 | int Cha_errno = 0; 61 | static FILE *cha_stderr = NULL; 62 | 63 | void 64 | csr_exit(int status, char *format, ...) 65 | { 66 | va_list ap; 67 | 68 | if (Cha_errno) 69 | return; 70 | 71 | if (!cha_stderr) 72 | cha_stderr = stderr; 73 | else if (cha_stderr != stderr) 74 | fputs("500 ", cha_stderr); 75 | 76 | va_start(ap, format); 77 | vfprintf(cha_stderr, format, ap); 78 | va_end(ap); 79 | if (status >= 0) { 80 | fputc('\n', cha_stderr); 81 | if (cha_stderr == stderr) 82 | exit(status); 83 | Cha_errno = 1; 84 | } 85 | } 86 | 87 | void 88 | csr_perror(const char *s) 89 | { 90 | csr_exit(-1, ""); 91 | perror(s); 92 | } 93 | 94 | void 95 | csr_exit_perror(const char *s) 96 | { 97 | csr_perror(s); 98 | exit(1); 99 | } 100 | 101 | unsigned long currentTimeMillis() { 102 | #if WIN32 //|| defined(__MINGW32__) 103 | struct _timeb tstruct; 104 | _ftime(&tstruct); 105 | 106 | return (((unsigned long) tstruct.time) * 1000) + tstruct.millitm; 107 | #else 108 | 109 | struct timeval tstruct; 110 | if (gettimeofday(&tstruct, NULL) < 0) { 111 | fprintf(stderr,"Error in gettimeofday call."); 112 | } 113 | 114 | return (((long) tstruct.tv_sec) * 1000) + tstruct.tv_usec / 1000; 115 | #endif 116 | } 117 | 118 | u4 countBitsU4(u4 bits) 119 | { 120 | bits = bits - ((bits >> 1) & 0x55555555); 121 | bits = ((bits >> 2) & 0x33333333) 
+ (bits & 0x33333333); 122 | bits = ((bits >> 4) + bits) & 0x0F0F0F0F; 123 | return (bits * 0x01010101) >> 24; 124 | } 125 | u2 countBitsU2(u2 bits) 126 | { 127 | bits = bits - ((bits >> 1) & 0x5555); 128 | bits = ((bits >> 2) & 0x3333) + (bits & 0x3333); 129 | bits = ((bits >> 4) + bits) & 0x0F0F; 130 | return ((bits * 0x0101) >> 8)&0x0F; 131 | } 132 | u1 countBitsU1(u1 bits) 133 | { 134 | bits = bits - ((bits >> 1) & 0x55); 135 | bits = ((bits >> 2) & 0x33) + (bits & 0x33); 136 | bits = ((bits >> 4) + bits) & 0x0F; 137 | return (bits * 0x01); 138 | } 139 | 140 | 141 | 142 | #ifdef __cplusplus 143 | }; 144 | #endif 145 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/SegmentPkg.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef css_SegmentPkg_h 25 | #define css_SegmentPkg_h 26 | #include 27 | #define HAVE_ATEXIT 28 | #include "Singleton.h" 29 | #include "csr_typedefs.h" 30 | 31 | namespace css { 32 | 33 | /* 34 | - find char-class 35 | - tolower case(optional, used in search.). 36 | */ 37 | class ChineseCharTaggerImpl 38 | { 39 | public: 40 | ChineseCharTaggerImpl(){ 41 | init(); 42 | } 43 | 44 | ~ChineseCharTaggerImpl(){ 45 | for(int i=1; i<256; i++) { 46 | if(i == 0x23) 47 | continue; 48 | if(index_map[i]) 49 | delete[] index_map[i]; 50 | } 51 | }; 52 | u2 tagUnicode(u2 iCode, u1 length); 53 | protected: 54 | void init(); 55 | //We reduced the map. only number-char page is exist 56 | //char cjk_map[20736]; // 256*(9f-4e) = 21k 57 | u1* index_map[256]; 58 | u1 ansi_map[256]; 59 | //char sym_map[512]; // 0x3000 - 0x303F && 0xFF?? 60 | }; 61 | 62 | typedef CSR_Singleton ChineseCharTagger; 63 | 64 | #include "tolowercase.h" 65 | 66 | /*To lower 67 | */ 68 | class ToLowerImpl 69 | { 70 | public: 71 | ToLowerImpl(){}; 72 | inline u2 toLower(u2 k){ 73 | u1 idx = k>>8; 74 | u2 iCode = k; 75 | if(table_index[idx]) 76 | iCode = table_index[idx][k&0xFF]; 77 | if(iCode) 78 | return iCode; 79 | return k; 80 | } 81 | }; 82 | 83 | typedef CSR_Singleton ToLower; 84 | 85 | class SegmentPkg { 86 | 87 | public: 88 | 89 | SegmentPkg(); 90 | ~SegmentPkg(); 91 | void init(); 92 | public: 93 | const char* m_buf;//make the hole object less than 64k 94 | u1* m_tag; 95 | int m_length; // used length 96 | u1 m_Own; 97 | int m_size; //total length 98 | int m_used; 99 | u1 m_remains_bytes; 100 | std::vector m_wTagList; //the seps position. 101 | 102 | ChineseCharTaggerImpl* m_tagger; 103 | 104 | public: 105 | /** 106 | @return 0, appended. 107 | @return -1, too large 108 | NOTE: a newly created pkg always return 0. 
except not enough memory.(throw std::bad_alloc) 109 | */ 110 | int feedData(const char* buf,int length); 111 | int tagData(const char* buf,int length); 112 | void setSize(int length); 113 | public: 114 | /** 115 | * read UTF-8 input can tagger the char-pos in tag array. tag length must equal or larger than buf. 116 | * we assume buf is end with '\0' 117 | * and this function will changed m_wTagList as a side effect. 118 | * @return, the data remains untagged. must less than 3. 119 | */ 120 | int tagData(const char* buf, u1* tag, int length = 0, int offset = 0); 121 | 122 | protected: 123 | const static int DEFAULT_PACKAGE_LENGTH = 65400; 124 | }; 125 | 126 | } /* End of namespace css */ 127 | #endif 128 | 129 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config-h.in: -------------------------------------------------------------------------------- 1 | /* config-h.in. Generated from configure.in by autoheader. */ 2 | 3 | /* Define to 1 if you have the header file. */ 4 | #undef HAVE_ASSERT_H 5 | 6 | /* Define to 1 if you have the `basename' function. */ 7 | #undef HAVE_BASENAME 8 | 9 | /* Define to 1 if you have the `bzero' function. */ 10 | #undef HAVE_BZERO 11 | 12 | /* Define to 1 if you have the `calloc' function. */ 13 | #undef HAVE_CALLOC 14 | 15 | /* Define to 1 if you have the header file. */ 16 | #undef HAVE_DLFCN_H 17 | 18 | /* Define to 1 if you have the header file. */ 19 | #undef HAVE_ERRNO_H 20 | 21 | /* Define to 1 if you have the header file. */ 22 | #undef HAVE_INTTYPES_H 23 | 24 | /* Define to 1 if you have the `readline' library (-lreadline). */ 25 | #undef HAVE_LIBREADLINE 26 | 27 | /* Define to 1 if you have the header file. */ 28 | #undef HAVE_MEMORY_H 29 | 30 | /* Define to 1 if you have the `memset' function. */ 31 | #undef HAVE_MEMSET 32 | 33 | /* Define to 1 if you have the header file. */ 34 | #undef HAVE_STDARG_H 35 | 36 | /* Define to 1 if you have the header file. 
*/ 37 | #undef HAVE_STDINT_H 38 | 39 | /* Define to 1 if you have the header file. */ 40 | #undef HAVE_STDLIB_H 41 | 42 | /* Define to 1 if you have the `strchr' function. */ 43 | #undef HAVE_STRCHR 44 | 45 | /* Define to 1 if you have the `strcspn' function. */ 46 | #undef HAVE_STRCSPN 47 | 48 | /* Define to 1 if you have the `strerror' function. */ 49 | #undef HAVE_STRERROR 50 | 51 | /* Define to 1 if you have the header file. */ 52 | #undef HAVE_STRINGS_H 53 | 54 | /* Define to 1 if you have the header file. */ 55 | #undef HAVE_STRING_H 56 | 57 | /* Define to 1 if you have the `strrchr' function. */ 58 | #undef HAVE_STRRCHR 59 | 60 | /* Define to 1 if you have the `strsignal' function. */ 61 | #undef HAVE_STRSIGNAL 62 | 63 | /* Define to 1 if you have the `strspn' function. */ 64 | #undef HAVE_STRSPN 65 | 66 | /* Define if your system libraries have a sys_errlist variable. */ 67 | #undef HAVE_SYS_ERRLIST 68 | 69 | /* Define if your system libraries have a sys_siglist variable. */ 70 | #undef HAVE_SYS_SIGLIST 71 | 72 | /* Define to 1 if you have the header file. */ 73 | #undef HAVE_SYS_STAT_H 74 | 75 | /* Define to 1 if you have the header file. */ 76 | #undef HAVE_SYS_TYPES_H 77 | 78 | /* Define to 1 if you have that is POSIX.1 compatible. */ 79 | #undef HAVE_SYS_WAIT_H 80 | 81 | /* Define to 1 if you have the header file. */ 82 | #undef HAVE_UNISTD_H 83 | 84 | /* Define to 1 if you have the header file. */ 85 | #undef HAVE_VARARGS_H 86 | 87 | /* Define to 1 if you have the `vfprintf' function. */ 88 | #undef HAVE_VFPRINTF 89 | 90 | /* Define to 1 if you have the `waitpid' function. */ 91 | #undef HAVE_WAITPID 92 | 93 | /* Define to the sub-directory where libtool stores uninstalled libraries. */ 94 | #undef LT_OBJDIR 95 | 96 | /* Name of package */ 97 | #undef PACKAGE 98 | 99 | /* Define to the address where bug reports for this package should be sent. */ 100 | #undef PACKAGE_BUGREPORT 101 | 102 | /* Define to the full name of this package. 
*/ 103 | #undef PACKAGE_NAME 104 | 105 | /* Define to the full name and version of this package. */ 106 | #undef PACKAGE_STRING 107 | 108 | /* Define to the one symbol short name of this package. */ 109 | #undef PACKAGE_TARNAME 110 | 111 | /* Define to the home page for this package. */ 112 | #undef PACKAGE_URL 113 | 114 | /* Define to the version of this package. */ 115 | #undef PACKAGE_VERSION 116 | 117 | /* Define to 1 if you have the ANSI C header files. */ 118 | #undef STDC_HEADERS 119 | 120 | /* Version number of package */ 121 | #undef VERSION 122 | 123 | /* Define to empty if `const' does not conform to ANSI C. */ 124 | #undef const 125 | 126 | /* Define to `int' if does not define. */ 127 | #undef pid_t 128 | 129 | /* Define to `unsigned int' if does not define. */ 130 | #undef size_t 131 |
--------------------------------------------------------------------------------
/dependencies/mmseg-3.2.14/src/utils/StringTokenizer.cpp:
--------------------------------------------------------------------------------
#include "StringTokenizer.h"

namespace csr {

/*
 * All positions in this file are std::string::size_type.  The original used
 * `unsigned int`, which on LP64 platforms can never compare equal to
 * std::string::npos: "not found" was then treated as a match, leading to
 * endless loops or std::out_of_range from substr().
 */

/**
 * Prepare `_str` for tokenising on `_delim`.
 * Runs of consecutive delimiters are collapsed into one, and a leading and a
 * trailing delimiter are removed.  If either argument is empty the tokenizer
 * is left empty.
 */
StringTokenizer::StringTokenizer(const std::string& _str, const std::string& _delim)
{

    if ((_str.length() == 0) || (_delim.length() == 0)) return;

    token_str = _str;
    delim     = _delim;

    /*
      Remove sequential delimiter
    */
    std::string::size_type curr_pos = 0;

    while ((curr_pos = token_str.find(delim, curr_pos)) != std::string::npos)
    {
        curr_pos += delim.length();

        while (token_str.find(delim, curr_pos) == curr_pos)
        {
            token_str.erase(curr_pos, delim.length());
        }
    }

    /*
      Trim leading delimiter
    */
    if (token_str.find(delim, 0) == 0)
    {
        token_str.erase(0, delim.length());
    }

    /*
      Trim ending delimiter
    */
    curr_pos = token_str.rfind(delim);
    if (curr_pos != std::string::npos)
    {
        if (curr_pos != (token_str.length() - delim.length())) return;
        token_str.erase(token_str.length() - delim.length(), delim.length());
    }

}


/** Number of tokens still available: delimiters remaining + 1, or 0 if empty. */
int StringTokenizer::countTokens()
{
    if (token_str.length() == 0)
        return 0;

    int num_tokens = 0;
    std::string::size_type curr_pos = 0;
    while ((curr_pos = token_str.find(delim, curr_pos)) != std::string::npos)
    {
        num_tokens++;
        curr_pos += delim.length();
    }
    return ++num_tokens;
}


/** True while there is unconsumed input. */
bool StringTokenizer::hasMoreTokens()
{
    return (token_str.length() > 0);
}


/** Consume and return the next token, split on the constructor's delimiter. */
std::string StringTokenizer::nextToken()
{
    return nextToken(delim);
}


/** Consume the next token and convert it with atoi(). */
int StringTokenizer::nextIntToken()
{
    return atoi(nextToken().c_str());
}


/** Consume the next token and convert it with atof(). */
double StringTokenizer::nextFloatToken()
{
    return atof(nextToken().c_str());
}


/**
 * Consume and return everything up to the next occurrence of `delimiter`
 * (the delimiter itself is dropped).  When the delimiter does not occur, the
 * whole remaining string is returned and the tokenizer becomes empty.
 * Returns "" when nothing is left.
 */
std::string StringTokenizer::nextToken(const std::string& delimiter)
{
    if (token_str.length() == 0)
        return "";

    std::string tmp_str = "";
    // An empty delimiter "matches" at position 0 without consuming anything,
    // which would hand out empty tokens forever; treat it as "not found".
    const std::string::size_type pos =
        delimiter.empty() ? std::string::npos : token_str.find(delimiter, 0);

    if (pos != std::string::npos)
    {
        tmp_str   = token_str.substr(0, pos);
        token_str = token_str.substr(pos + delimiter.length());
    }
    else
    {
        tmp_str   = token_str;
        token_str = "";
    }

    return tmp_str;
}


/** The input that has not been consumed yet. */
std::string StringTokenizer::remainingString()
{
    return token_str;
}


/** Consume the next token and return it with every `filterStr` removed. */
std::string StringTokenizer::filterNextToken(const std::string& filterStr)
{
    std::string tmp_str = nextToken();

    // Nothing to strip -- and searching for "" would never terminate.
    if (filterStr.empty())
        return tmp_str;

    std::string::size_type currentPos = 0;
    while ((currentPos = tmp_str.find(filterStr, currentPos)) != std::string::npos)
    {
        tmp_str.erase(currentPos, filterStr.length());
    }

    return tmp_str;
}

} //namespace csr

--------------------------------------------------------------------------------
/dependencies/mmseg-3.2.14/src/css/UnigramDict.cpp:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved.
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include "UnigramCorpusReader.h" 25 | #include "UnigramDict.h" 26 | 27 | namespace css { 28 | 29 | 30 | 31 | int UnigramDict::load(const char* filename) 32 | { 33 | m_da.clear(); 34 | return m_da.open(filename); 35 | } 36 | 37 | /** 38 | * This function should be used only, in Debug mode. 39 | */ 40 | std::string UnigramDict::getString(int id) 41 | { 42 | return ""; 43 | } 44 | 45 | 46 | /** 47 | * Find all word item in UnigramDict, which buf as a prefix 48 | * @return total items found 49 | */ 50 | int UnigramDict::findHits(const char* buf, result_pair_type *result, size_t result_len, int keylen) 51 | { 52 | if(!m_da.array()) 53 | return 0; 54 | int num = m_da.commonPrefixSearch(buf, result, result_len, keylen); 55 | return num; 56 | } 57 | 58 | int UnigramDict::import(UnigramCorpusReader &ur) 59 | { 60 | std::vector key; 61 | std::vector value; 62 | int i = 0; 63 | UnigramRecord* rec = NULL; 64 | for(i=0;ikey[0]; 68 | key.push_back(ptr); 69 | value.push_back(rec->count); 70 | } 71 | }//end for 72 | //build da 73 | m_da.clear(); 74 | //1st 0 is the length array. 75 | //return m_da.build(key.size(), &key[0], 0, 0, &progress_bar) ; 76 | return m_da.build(key.size(), &key[0], 0, &value[0] ) ; 77 | } 78 | 79 | int UnigramDict::save(const char* filename) 80 | { 81 | m_da.save(filename); 82 | return 0; 83 | } 84 | int UnigramDict::isLoad() 85 | { 86 | return m_da.array() != NULL; 87 | } 88 | 89 | int UnigramDict::exactMatch(const char* key, int *id) 90 | { 91 | Darts::DoubleArray::result_pair_type rs; 92 | m_da.exactMatchSearch(key,rs); 93 | if(id) 94 | *id = rs.pos; 95 | if(rs.pos) 96 | return rs.value; 97 | ///FIXME: this totaly a mixture. some single char's id > 0 if it in unigram input text, while other's id < 0 if not in ungram text. 98 | ///so you can not just simply use UCS2 code as a char's id. 99 | ///FIXED in prof. version by changing unigram-dictionary format. 
100 | //check is single char. 101 | int len = strlen(key); 102 | if(len<4){ 103 | const char* tm_pCur = key; 104 | char v = key[0]; 105 | //might be single cjk char. 106 | if ( v<128 && len == 1 && id) 107 | *id = -1*(int)v; 108 | // get number of bytes 109 | int iBytes = 0, iBytesLength = 0; 110 | while ( v & 0x80 ) { iBytes++; v <<= 1; } 111 | if(iBytes == len && len != 1){ 112 | //single char 113 | tm_pCur ++; 114 | int iCode = 0; 115 | iCode = ( v>>iBytes ); 116 | iBytes--; 117 | do 118 | { 119 | if ( !(*tm_pCur) ) 120 | break; 121 | if ( ((*tm_pCur) & 0xC0)!=0x80 ) { 122 | iCode = 0; 123 | break; 124 | } 125 | iCode = ( iCode<<6 ) + ( (*tm_pCur) & 0x3F ); 126 | iBytes--; 127 | tm_pCur++; 128 | } while ( iBytes ); 129 | if(iCode && id) 130 | *id = -1*iCode; 131 | } 132 | } 133 | 134 | return rs.value; 135 | } 136 | 137 | } /* End of namespace css */ 138 | 139 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/UnigramCorpusReader.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "UnigramCorpusReader.h" 30 | #include "UnigramRecord.h" 31 | #include "csr_utils.h" 32 | #include "Utf8_16.h" 33 | 34 | namespace css { 35 | 36 | using namespace csr; 37 | 38 | UnigramRecord *UnigramCorpusReader::getAt(int idx) 39 | { 40 | if(idx >=0 &&idx 17 | #include 18 | #include "csr_typedefs.h" 19 | 20 | #ifdef _MSC_VER 21 | #pragma warning(disable: 4514) // nreferenced inline function has been removed 22 | #endif 23 | namespace csr { 24 | 25 | int csrUTF8Encode ( u1 * pBuf, int iCode ); // forward ref for GCC 26 | int csrUTF8DecodeLength ( const u1 * pBuf ); 27 | int csrUTF8Decode ( const u1 * pBuf ); 28 | int csrUTF8Decode ( const u1 * pBuf , u2& length); 29 | int csrUTF8StringLength(const u1* pBuf); 30 | 31 | class Utf8_16 { 32 | public: 33 | typedef unsigned short utf16; // 16 bits 34 | typedef unsigned char utf8; // 8 bits 35 | typedef unsigned char ubyte; 36 | enum encodingType { 37 | eUnknown, 38 | eUtf16BigEndian, 39 | eUtf16LittleEndian, // Default on Windows 40 | eUtf8, 41 | eLast 42 | }; 43 | static const utf8 k_Boms[eLast][3]; 44 | }; 45 | 46 | // Reads UTF-16 and outputs UTF-8 47 | class Utf16_Iter : public Utf8_16 { 48 | public: 49 | Utf16_Iter(); 50 | void reset(); 51 | void set(const ubyte* pBuf, size_t nLen, encodingType eEncoding); 52 | utf8 get() const { 53 | return m_nCur; 54 | } 55 | void operator++(); 56 | operator bool() { return m_pRead <= m_pEnd; } 57 | 58 | protected: 59 | void toStart(); // Put to start state, swap bytes if necessary 60 | enum eState { 61 | eStart, 62 | e2Bytes2, 63 | e3Bytes2, 64 | e3Bytes3 65 | }; 66 | protected: 67 | encodingType m_eEncoding; 68 | eState m_eState; 69 | utf8 m_nCur; 70 | utf16 m_nCur16; 71 | const ubyte* m_pBuf; 72 | const ubyte* m_pRead; 73 | const ubyte* m_pEnd; 74 | }; 75 | 76 | // Reads UTF-8 and outputs UTF-16 77 | class Utf8_Iter 
: public Utf8_16 { 78 | public: 79 | Utf8_Iter(); 80 | void reset(); 81 | void set(const ubyte* pBuf, size_t nLen, encodingType eEncoding); 82 | #ifdef _DEBUG 83 | utf16 get() const; 84 | #else 85 | utf16 get() const { return m_nCur; } 86 | #endif 87 | 88 | bool canGet() const { return m_eState == eStart; } 89 | void operator++(); 90 | operator bool() { return m_pRead <= m_pEnd; } 91 | 92 | protected: 93 | void swap(); 94 | void toStart(); // Put to start state, swap bytes if necessary 95 | enum eState { 96 | eStart, 97 | e2Bytes_Byte2, 98 | e3Bytes_Byte2, 99 | e3Bytes_Byte3 100 | }; 101 | protected: 102 | encodingType m_eEncoding; 103 | eState m_eState; 104 | utf16 m_nCur; 105 | const ubyte* m_pBuf; 106 | const ubyte* m_pRead; 107 | const ubyte* m_pEnd; 108 | }; 109 | 110 | // Reads UTF16 and outputs UTF8 111 | class Utf8_16_Read : public Utf8_16 { 112 | public: 113 | Utf8_16_Read(); 114 | ~Utf8_16_Read(); 115 | 116 | size_t convert(char* buf, size_t len); 117 | char* getNewBuf() { return reinterpret_cast(m_pNewBuf); } 118 | 119 | encodingType getEncoding() const { return m_eEncoding; } 120 | protected: 121 | int determineEncoding(); 122 | private: 123 | encodingType m_eEncoding; 124 | ubyte* m_pBuf; 125 | ubyte* m_pNewBuf; 126 | size_t m_nBufSize; 127 | bool m_bFirstRead; 128 | size_t m_nLen; 129 | Utf16_Iter m_Iter16; 130 | }; 131 | 132 | // Read in a UTF-8 buffer and write out to UTF-16 or UTF-8 133 | class Utf8_16_Write : public Utf8_16 { 134 | public: 135 | Utf8_16_Write(); 136 | ~Utf8_16_Write(); 137 | 138 | void setEncoding(encodingType eType); 139 | 140 | FILE * fopen(const char *_name, const char *_type); 141 | size_t fwrite(const void* p, size_t _size); 142 | void fclose(); 143 | protected: 144 | encodingType m_eEncoding; 145 | FILE* m_pFile; 146 | utf16* m_pBuf; 147 | size_t m_nBufSize; 148 | bool m_bFirstWrite; 149 | }; 150 | 151 | }; //end if namespace 152 | 153 | #endif 154 | 155 | 
-------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/utils/csr_mmap.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2006 coreseek.com 3 | * All rights reserved. 4 | * $Id: csr_mmap.c 5 | */ 6 | 7 | #include "os.h" 8 | 9 | #if ! defined _WIN32 && ! defined __CYGWIN__ 10 | #define O_BINARY 0 11 | #endif 12 | 13 | ///FIXME: should support share. 14 | #ifndef HAVE_MMAP 15 | #define PROT_WRITE 2 16 | #define PROT_READ 1 17 | #endif 18 | 19 | #include "csr_mmap.h" 20 | #include "csr_utils.h" 21 | #include 22 | #include 23 | 24 | #ifdef WIN32 25 | #include 26 | #else 27 | #include 28 | #include 29 | #include 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | 36 | struct _csr_mmap_t { 37 | void *map; 38 | csr_offset_t size; 39 | u4 bLoadMem; 40 | #if !defined HAVE_MMAP && defined HAVE_WINDOWS_H 41 | HANDLE hfile; 42 | HANDLE hmap; 43 | #endif 44 | }; 45 | 46 | static csr_mmap_t * 47 | mmap_file(const char *filename, int prot,unsigned char bLoadMem) 48 | { 49 | csr_mmap_t *mm; 50 | int fd; 51 | struct stat st; 52 | #if !defined HAVE_MMAP && defined HAVE_WINDOWS_H 53 | unsigned long file_mode, map_mode, view_mode; 54 | #else 55 | int flag = O_RDONLY; 56 | #endif 57 | 58 | mm = malloc(sizeof(csr_mmap_t)); 59 | memset(mm,0,sizeof(csr_mmap_t)); 60 | if(bLoadMem){ 61 | mm->bLoadMem = bLoadMem; 62 | if ((fd = open(filename, O_RDONLY)) < 0) 63 | //csr_exit_perror(filename); 64 | return NULL; 65 | if (fstat(fd, &st) < 0) 66 | //csr_exit_perror(filename); 67 | return NULL; 68 | mm->size = st.st_size; 69 | mm->map = malloc(mm->size); 70 | if (read(fd, mm->map, mm->size) < 0) 71 | //csr_exit_perror(filename); 72 | return NULL; 73 | close(fd); 74 | return mm; 75 | } 76 | #if !defined HAVE_MMAP && defined HAVE_WINDOWS_H 77 | if ((prot & PROT_WRITE) != 0) { 78 | file_mode = GENERIC_READ | GENERIC_WRITE; 79 | map_mode = PAGE_READWRITE; 80 | 
view_mode = FILE_MAP_WRITE; 81 | } else { 82 | file_mode = GENERIC_READ; 83 | map_mode = PAGE_READONLY; 84 | view_mode = FILE_MAP_READ; 85 | } 86 | 87 | mm->hfile = CreateFile(filename, file_mode, 0, NULL, 88 | OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); 89 | if (mm->hfile == INVALID_HANDLE_VALUE) 90 | //csr_exit_perror(filename); 91 | return NULL; 92 | 93 | mm->size = GetFileSize(mm->hfile, NULL); 94 | 95 | mm->hmap = CreateFileMapping(mm->hfile, NULL, map_mode, 0, 0, NULL); 96 | if (mm->hmap == NULL) { 97 | CloseHandle(mm->hfile); 98 | //csr_exit_perror(filename); 99 | return NULL; 100 | } 101 | 102 | mm->map = MapViewOfFile(mm->hmap, view_mode, 0, 0, 0); 103 | if (mm->map == NULL) { 104 | CloseHandle(mm->hfile); 105 | CloseHandle(mm->hmap); 106 | //csr_exit_perror(filename); 107 | return NULL; 108 | } 109 | 110 | #else /* !defined HAVE_MMAP && defined HAVE_WINDOWS_H */ 111 | if ((prot & PROT_WRITE) != 0) 112 | flag = O_RDWR; 113 | 114 | if ((fd = open(filename, flag)) < 0) 115 | //csr_exit_perror(filename); 116 | return NULL; 117 | if (fstat(fd, &st) < 0) 118 | //csr_exit_perror(filename); 119 | return NULL; 120 | mm->size = st.st_size; 121 | #ifdef HAVE_MMAP 122 | if ((mm->map = mmap((void *)0, mm->size, prot, MAP_SHARED, fd, 0)) == MAP_FAILED) { 123 | //csr_exit_perror(filename); 124 | return NULL; 125 | } 126 | #else /* HAVE_MMAP */ 127 | mm->map = malloc(mm->size); 128 | if (read(fd, mm->map, mm->size) < 0) 129 | //csr_exit_perror(filename); 130 | return NULL; 131 | #endif /* HAVE_MMAP */ 132 | close(fd); 133 | 134 | #endif /* HAVE_MMAP && defined HAVE_WINDOWS_H */ 135 | return mm; 136 | } 137 | 138 | csr_mmap_t * 139 | csr_mmap_file(const char *filename,unsigned char bLoadMem) 140 | { 141 | return mmap_file(filename, PROT_READ,bLoadMem); 142 | } 143 | 144 | csr_mmap_t * 145 | csr_mmap_file_w(const char *filename) 146 | { 147 | return mmap_file(filename, PROT_READ | PROT_WRITE,0); 148 | } 149 | 150 | void 151 | csr_munmap_file(csr_mmap_t *mm) 152 | { 153 | 
if(mm->bLoadMem){ 154 | free(mm->map); 155 | free(mm); 156 | return; 157 | } 158 | #if !defined HAVE_MMAP && defined HAVE_WINDOWS_H 159 | UnmapViewOfFile(mm->map); 160 | CloseHandle(mm->hmap); 161 | CloseHandle(mm->hfile); 162 | #else /* !defined HAVE_MMAP && defined HAVE_WINDOWS_H */ 163 | #ifdef HAVE_MMAP 164 | munmap(mm->map, mm->size); 165 | #else /* HAVE_MMAP */ 166 | free(mm->map); 167 | #endif /* HAVE_MMAP */ 168 | #endif /* !defined HAVE_MMAP && defined HAVE_WINDOWS_H */ 169 | free(mm); 170 | } 171 | 172 | void * 173 | csr_mmap_map(csr_mmap_t *mm) 174 | { 175 | return mm->map; 176 | } 177 | 178 | csr_offset_t 179 | csr_mmap_size(csr_mmap_t *mm) 180 | { 181 | return mm->size; 182 | } 183 | 184 | #ifdef __cplusplus 185 | } 186 | #endif 187 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/script/char_table_build.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import unicodedata 5 | import re 6 | import codecs 7 | import os 8 | 9 | # tag set 10 | #tag-set: 11 | #m: number 12 | #e: non CJK char, e.g. English pinyin 13 | #[unuse] t: time. 年号 干支等(此处识别出后,仅加入 oov ,不参与实际分词) 14 | #c: CJK char. 15 | #s: Symbol e.g. @ 16 | #w: Sentence seperator. 17 | #x: unknown char. 
# Used to generate a C-style character-class table for the single-byte page.
def ANSI_build(name):
    """Print a C ``char`` array classifying the 256 single-byte code points.

    name -- identifier to use for the generated C array.

    Every entry is one of the tag characters from the tag set described at
    the top of this script ('m' number, 'e' letter, 's' symbol, 'w' sentence
    separator) or '\\0' for code points that carry no tag.  The table is
    written to standard output; nothing is returned.
    """
    tag = {}
    # Printable ASCII defaults to "symbol"; the more specific classes below
    # overwrite that default for their own code points.
    for c in range(0x20, 0x7F):
        tag[c] = 's'
    # numbers
    for c in '0123456789':
        tag[ord(c)] = 'm'
    # letters: range() excludes its stop value, so add 1 to keep 'z' and 'Z'
    # (the previous bounds left both of them tagged as symbols).
    for c in range(ord('a'), ord('z') + 1):
        tag[c] = 'e'
    for c in range(ord('A'), ord('Z') + 1):
        tag[c] = 'e'
    # High half of the page, tagged as letters.
    # NOTE(review): 0xFF is left untagged exactly as before -- confirm whether
    # that is intentional or the same off-by-one as the ASCII ranges.
    for c in range(0xC0, 0xFF):
        tag[c] = 'e'

    # sentence separators
    for c in ('!', '"', '\'', ',', '.', ':', ';', '?'):
        tag[ord(c)] = 'w'

    # Emit the table, eight entries per row.
    code = 'char ' + name + '[]= {'
    for i in range(0, 256):
        if i % 8 == 0:
            code = code + '\n\t'
        if i in tag:
            code = code + '\'' + tag[i] + '\''
        else:
            code = code + '\'\\0\''
        # No separator after the final entry (the old test `i != 256` could
        # never be false inside this loop).
        if i != 255:
            code = code + ', '
    code = code + '\n\t};'
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(code)

#generate CJK area.
61 | def ChineseBuild(name): 62 | only = (35, 118, 129, 104, 151, 141, 84, 150, 87, 116, 89) 63 | tag = {} 64 | # number 65 | num1 = (u'零',u'〇',u'一',u'二',u'三',u'四',u'五',u'六',u'七',u'八',u'九',u'十',u'壹',u'贰',u'叁',u'肆',u'伍',u'陆',u'柒',u'捌',u'玖',u'拾',u'个',u'百',u'千',u'万',u'亿',u'兆',u'仟',u'佰') 66 | num2 = (u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9', u'0') 67 | num = num1 + num2 68 | for c in num: 69 | iCode = ord(c) 70 | if iCode/256 == 0: 71 | continue 72 | if iCode/256 in only: 73 | print iCode, 74 | print c 75 | tag[iCode] = 'm' 76 | #syb 77 | syb1 = (u'~', u'!', u'@', u'#', u'#', u'¥', u'%', u'…', u'&', u'×', u'(', u')', u'—', u'+', u'{', u'}', u'|', u':', u'“', u'”', u'《', u'》', u'?', u'·', u'·', u'-', u'=', u'【', u'】', u'\', u';', u'‘', u'’', u',', u'。', u'、', u'¨', u'〔', u'〕', u'〈', u'〉', u'「', u'」', u'『', u'』', u'.', u'〖', u'〗', u'【', u'】', u'(', u')', u'[', u']', u'{', u'}', u'。', u',', u':', u'≈', u'≡', u'≠', u'=', u'≤', u'≥', u'<', u'>', u'≮', u'≯', u'∷', u'±', u'+', u'-', u'×', u'÷', u'/', u'∫', u'∮', u'∝', u'∞', u'∧', u'∨', u'∑', u'∏', u'∪', u'∩', u'∈', u'∵', u'∴', u'⊥', u'∥', u'∠', u'⌒', u'⊙', u'≌', u'∽', u' ', u'√') 78 | for c in syb1: 79 | iCode = ord(c) 80 | if iCode/256 == 0: 81 | continue 82 | if iCode/256 in only: 83 | print c 84 | tag[iCode] = 's' 85 | #eng 86 | for c in range(ord(u'a'),ord(u'z')): 87 | tag[c] = 'e' 88 | for c in range(ord(u'A'),ord(u'Z')): 89 | tag[c] = 'e' 90 | #sep 91 | wset = (u'、', u',', u',', u'\'', u'‘', u'’', u'‘', u'’', u'!', u'!', u'?', u'?', u'。', u'。', u'?', u'?', u'.', u'“', u'”', u'“', u'”', u':', u':', u'"',u''',u'`',u'〃') 92 | 93 | for c in wset: 94 | iCode = ord(c) 95 | if iCode/256 == 0: 96 | continue 97 | if iCode/256 in only: 98 | print c 99 | tag[iCode] = 'w' 100 | #process 101 | st = {} 102 | oc = 0x30 103 | codepage = ['\\0']*256 104 | if oc == 0xFF: 105 | for c in range(0xFF01, 0xFF66): 106 | codepage[c-0xFF00] = 's' 107 | if oc == 0x30: 108 | for c in range(0x3001, 0x3040): 109 | codepage[c-0x3001] = 
's' 110 | 111 | for c in tag: 112 | iCode = (c) 113 | k = iCode/256 114 | if k == oc: 115 | print iCode%256, c 116 | codepage[iCode%256] = tag[c] 117 | 118 | code = 'char '+name+'[]= {' 119 | for i in range(0, 256): 120 | if i%8 == 0: 121 | code = code + '\n\t' 122 | if codepage[i]: 123 | code = code + '\'' + codepage[i] +'\'' 124 | if i != 256: 125 | code = code + ', ' 126 | code = code + '\n\t};' 127 | print code 128 | 129 | # output all chinese, by tag. 130 | code = '{' 131 | for c in tag: 132 | k = c/256 133 | if k == 0xFF or k == 0x30: 134 | continue 135 | if tag[c] == 'm': 136 | code = code + str(hex(c))+', ' 137 | print code 138 | # output all chinese, by tag. 139 | # NOTE 0x22xx, 0x23xx is number symbol, ignore this block. 140 | code = '{' 141 | for c in tag: 142 | k = c/256 143 | if k == 0xFF or k == 0x30: 144 | continue 145 | if tag[c] == 's': 146 | code = code + str(hex(c))+', ' 147 | print code 148 | # sep 149 | code = '{' 150 | for c in tag: 151 | k = c/256 152 | if k == 0xFF or k == 0x30: 153 | continue 154 | if tag[c] == 'w': 155 | code = code + str(hex(c))+', ' 156 | print code 157 | #eng 158 | code = '{' 159 | for c in tag: 160 | k = c/256 161 | if k == 0xFF or k == 0x30: 162 | continue 163 | if tag[c] == 'e': 164 | code = code + str(hex(c))+', ' 165 | print code 166 | pass 167 | def main(): 168 | ANSI_build("ansipage") 169 | ChineseBuild("sym1") 170 | pass 171 | 172 | if __name__ == "__main__": 173 | main() -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/ThesaurusDict.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "UnigramCorpusReader.h" 6 | #include "ThesaurusDict.h" 7 | 8 | namespace css { 9 | 10 | /* 11 | ThesaurusDict::ThesaurusDict () {}; 12 | virtual ~ThesaurusDict () {}; 13 | */ 14 | 15 | const char thdt_head_mgc[] = "THDT"; 16 | 17 | typedef struct 
_csr_thesaurusdict_fileheader_tag{ 18 | char mg[4]; 19 | short version; 20 | short reserve; 21 | int darts_size; 22 | int pool_size; 23 | }_csr_thesaurusdict_fileheader; 24 | 25 | int ThesaurusDict::load(const char* filename) 26 | { 27 | m_file = csr_mmap_file(filename,1); 28 | if(!m_file) 29 | return -1; //can not load dict. 30 | csr_offset_t tm_size = csr_mmap_size(m_file); 31 | u1* ptr = (u1*)csr_mmap_map(m_file); 32 | u1* ptr_end = ptr + tm_size; 33 | 34 | _csr_thesaurusdict_fileheader* head_ = (_csr_thesaurusdict_fileheader*)ptr; 35 | if(strncmp(head_->mg,thdt_head_mgc,4) == 0) { 36 | if(head_->version == 1) { 37 | ptr += sizeof(_csr_thesaurusdict_fileheader); 38 | if(ptr >= ptr_end) return -4; //file broken 39 | m_da.clear(); 40 | m_da.set_array(ptr,head_->darts_size); 41 | ptr += m_da.unit_size()*head_->darts_size; 42 | if(ptr >= ptr_end) return -4; //file broken. 43 | m_stringpool = ptr; 44 | ptr += head_->pool_size; 45 | if(ptr > ptr_end) return -4; //file broken. 46 | }else{ 47 | return -2; 48 | } 49 | }else 50 | return -3; //bad format 51 | 52 | return 0; 53 | } 54 | 55 | bool Cmp(const ThesaurusRecord *p1, const ThesaurusRecord *p2) 56 | { 57 | char i = 0; 58 | while(1) { 59 | unsigned char pu1 = p1->key[i]; 60 | unsigned char pu2 = p2->key[i]; 61 | if(pu1 == pu2) { 62 | if(pu1 == 0) 63 | break; 64 | i++; 65 | }else{ 66 | return pu1 < pu2; 67 | } 68 | } 69 | return true; 70 | } 71 | 72 | int ThesaurusDict::import(const char* filename, const char* target_file) 73 | { 74 | std::vector< ThesaurusRecord* > items; 75 | std::istream *is; 76 | int n = 0; 77 | int string_bufsize = 0; 78 | 79 | if (filename == "-") { 80 | is = &std::cin; 81 | } else { 82 | is = new std::ifstream(filename); 83 | } 84 | if (! 
*is) 85 | return -1; 86 | 87 | std::string line; 88 | std::string key; 89 | while (std::getline(*is, line)) { 90 | if(n%2){ 91 | n++; 92 | // 93 | //the value row 94 | ThesaurusRecord* tr = new ThesaurusRecord; //FIXME: should free, but who care 95 | tr->key = key; 96 | memset(tr->value,0,sizeof(tr->value)); 97 | memcpy(tr->value,&line.c_str()[1], line.length()-1); 98 | tr->length = (u2)line.length(); 99 | u1* ptr = tr->value; 100 | 101 | while(*ptr != '\0') { 102 | if(*ptr == ',') 103 | *ptr = '\0'; 104 | ptr++; 105 | } 106 | 107 | items.push_back(tr); 108 | string_bufsize += (int)line.length() ; //append addtional \0's space 109 | continue; 110 | } 111 | 112 | key = line; 113 | n++; 114 | } 115 | 116 | if (filename != "-") { 117 | delete is; 118 | } 119 | u1* total_buf = (u1*)malloc(string_bufsize); 120 | memset((void*)total_buf, 0, string_bufsize); 121 | u1* total_buf_ptr = total_buf; 122 | //read complete, try make dict 123 | std::sort(items.begin(), items.end(), Cmp); 124 | { 125 | std::vector key; 126 | std::vector value; 127 | 128 | size_t i = 0; 129 | for(i=0;ikey[0]; 132 | key.push_back(ptr); 133 | memcpy(total_buf_ptr, rec->value, rec->length); 134 | value.push_back((int)(total_buf_ptr - total_buf)); //value is the string_pool's offset 135 | total_buf_ptr += rec->length; 136 | //process buf 137 | } 138 | //build the dart 139 | m_da.clear(); 140 | //1st 0 is the length array. 
141 | //return m_da.build(key.size(), &key[0], 0, 0, &progress_bar) ; 142 | int nRet = m_da.build(key.size(), &key[0], 0, &value[0] ) ; 143 | //should check the nRet value 144 | //try save file 145 | std::string dest_file = "thesaurus.lib"; 146 | size_t size_ = m_da.size(); 147 | const void* iArray = m_da.array(); 148 | _csr_thesaurusdict_fileheader head; 149 | memcpy(&head,thdt_head_mgc,sizeof(thdt_head_mgc)); 150 | head.darts_size = size_; 151 | head.version = 1; 152 | head.reserve = 0; 153 | head.pool_size = string_bufsize; 154 | 155 | std::FILE *fp = NULL; 156 | if(target_file) 157 | fp = std::fopen(target_file, "wb"); 158 | else 159 | fp = std::fopen(dest_file.c_str(), "wb"); 160 | 161 | std::fwrite(&head,sizeof(_csr_thesaurusdict_fileheader),1,fp); 162 | std::fwrite(iArray, m_da.unit_size(), size_, fp); 163 | std::fwrite(total_buf, sizeof(u1), string_bufsize, fp); 164 | std::fclose(fp); 165 | } 166 | 167 | //free it 168 | free((void*)total_buf); 169 | return 0; 170 | } 171 | 172 | const char* ThesaurusDict::find(const char* key, u2 key_len ,int *count) 173 | { 174 | //the return string buffer might contains 0, end with \0\0 175 | Darts::DoubleArray::result_pair_type rs; 176 | m_da.exactMatchSearch (key,rs, key_len); 177 | if(rs.pos && rs.value >= 0) { 178 | size_t offset = rs.value; 179 | return (const char*)&m_stringpool[offset]; 180 | } 181 | return NULL; 182 | } 183 | 184 | 185 | } //end css 186 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/python/pymmseg.vcproj: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | 14 | 15 | 16 | 17 | 18 | 25 | 28 | 31 | 34 | 37 | 40 | 52 | 55 | 58 | 61 | 68 | 71 | 74 | 77 | 80 | 83 | 86 | 89 | 92 | 93 | 101 | 104 | 107 | 110 | 113 | 116 | 126 | 129 | 132 | 135 | 147 | 150 | 153 | 156 | 159 | 162 | 165 | 168 | 171 | 172 | 173 | 174 | 175 | 176 | 181 | 184 | 185 | 188 | 189 | 190 | 195 | 198 | 199 | 200 | 205 | 206 
| 207 | 208 | 209 | 210 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/ruby/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL = /bin/sh 3 | 4 | #### Start of system configuration section. #### 5 | 6 | srcdir = . 7 | topdir = c:/ruby/lib/ruby/1.8/i386-mswin32 8 | hdrdir = $(topdir) 9 | VPATH = $(srcdir);$(topdir);$(hdrdir) 10 | 11 | DESTDIR = c: 12 | prefix = $(DESTDIR)/ruby 13 | exec_prefix = $(prefix) 14 | sitedir = $(prefix)/lib/ruby/site_ruby 15 | rubylibdir = $(libdir)/ruby/$(ruby_version) 16 | archdir = $(rubylibdir)/$(arch) 17 | sbindir = $(exec_prefix)/sbin 18 | datadir = $(prefix)/share 19 | includedir = $(prefix)/include 20 | infodir = $(prefix)/info 21 | sysconfdir = $(prefix)/etc 22 | mandir = $(prefix)/man 23 | libdir = $(exec_prefix)/lib 24 | sharedstatedir = $(DESTDIR)/etc 25 | oldincludedir = $(DESTDIR)/usr/include 26 | sitearchdir = $(sitelibdir)/$(sitearch) 27 | localstatedir = $(DESTDIR)/var 28 | bindir = $(exec_prefix)/bin 29 | sitelibdir = $(sitedir)/$(ruby_version) 30 | libexecdir = $(exec_prefix)/libexec 31 | 32 | CC = cl -nologo 33 | LIBRUBY = $(RUBY_SO_NAME).lib 34 | LIBRUBY_A = $(RUBY_SO_NAME)-static.lib 35 | LIBRUBYARG_SHARED = $(LIBRUBY) 36 | LIBRUBYARG_STATIC = $(LIBRUBY_A) 37 | 38 | RUBY_EXTCONF_H = 39 | CFLAGS = -MD -Zi -O2b2xg- -G6 40 | INCFLAGS = -I. 
-I$(topdir) -I$(hdrdir) -I$(srcdir) 41 | CPPFLAGS = -I../src/include 42 | CXXFLAGS = $(CFLAGS) 43 | DLDFLAGS = -link -incremental:no -debug -opt:ref -opt:icf -dll $(LIBPATH) -def:$(DEFFILE) -implib:$(*F:.so=)-$(arch).lib -pdb:$(*F:.so=)-$(arch).pdb 44 | LDSHARED = cl -nologo -LD 45 | AR = lib -nologo 46 | EXEEXT = .exe 47 | 48 | RUBY_INSTALL_NAME = ruby 49 | RUBY_SO_NAME = msvcrt-ruby18 50 | arch = i386-mswin32 51 | sitearch = i386-msvcrt 52 | ruby_version = 1.8 53 | ruby = c:/ruby/bin/ruby 54 | RUBY = $(ruby:/=\) 55 | RM = $(RUBY) -run -e rm -- -f 56 | MAKEDIRS = @$(RUBY) -run -e mkdir -- -p 57 | INSTALL = @$(RUBY) -run -e install -- -vp 58 | INSTALL_PROG = $(INSTALL) -m 0755 59 | INSTALL_DATA = $(INSTALL) -m 0644 60 | COPY = copy > nul 61 | 62 | #### End of system configuration section. #### 63 | 64 | preload = 65 | 66 | libpath = . $(libdir) ../src/lib 67 | LIBPATH = -libpath:"." -libpath:"$(libdir)" -libpath:"../src/lib" 68 | DEFFILE = $(TARGET)-$(arch).def 69 | 70 | CLEANFILES = mkmf.log 71 | DISTCLEANFILES = vc*.pdb $(DEFFILE) 72 | 73 | extout = 74 | extout_prefix = 75 | target_prefix = 76 | LOCAL_LIBS = 77 | LIBS = $(LIBRUBYARG_SHARED) libcss.lib oldnames.lib user32.lib advapi32.lib ws2_32.lib 78 | SRCS = rubyapi.cpp 79 | OBJS = rubyapi.obj 80 | TARGET = mmseg 81 | DLLIB = $(TARGET).so 82 | EXTSTATIC = 83 | STATIC_LIB = 84 | 85 | RUBYCOMMONDIR = $(sitedir)$(target_prefix) 86 | RUBYLIBDIR = $(sitelibdir)$(target_prefix) 87 | RUBYARCHDIR = $(sitearchdir)$(target_prefix) 88 | 89 | TARGET_SO = $(DLLIB) 90 | CLEANLIBS = $(TARGET).so $(TARGET).il? 
$(TARGET).tds $(TARGET).map 91 | CLEANOBJS = *.obj *.lib *.s[ol] *.pdb *.exp *.bak 92 | 93 | all: $(DLLIB) 94 | static: $(STATIC_LIB) 95 | 96 | clean: 97 | @-$(RM) $(CLEANLIBS:/=\) $(CLEANOBJS:/=\) $(CLEANFILES:/=\) 98 | 99 | distclean: clean 100 | @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log 101 | @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES:/=\) 102 | 103 | realclean: distclean 104 | install: install-so install-rb 105 | 106 | install-so: $(RUBYARCHDIR) 107 | install-so: $(RUBYARCHDIR)/$(DLLIB) 108 | $(RUBYARCHDIR)/$(DLLIB): $(DLLIB) 109 | $(INSTALL_PROG) $(DLLIB:/=\) $(RUBYARCHDIR:/=\) 110 | install-rb: pre-install-rb install-rb-default 111 | install-rb-default: pre-install-rb-default 112 | pre-install-rb: Makefile 113 | pre-install-rb-default: Makefile 114 | $(RUBYARCHDIR): 115 | $(MAKEDIRS) $@ 116 | 117 | site-install: site-install-so site-install-rb 118 | site-install-so: install-so 119 | site-install-rb: install-rb 120 | 121 | .SUFFIXES: .c .m .cc .cxx .cpp .obj 122 | 123 | {$(srcdir)}.cc{}.obj: 124 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 125 | 126 | {$(topdir)}.cc{}.obj: 127 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 128 | 129 | {$(hdrdir)}.cc{}.obj: 130 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 131 | 132 | .cc.obj: 133 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 134 | 135 | {$(srcdir)}.cxx{}.obj: 136 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 137 | 138 | {$(topdir)}.cxx{}.obj: 139 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 140 | 141 | {$(hdrdir)}.cxx{}.obj: 142 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 143 | 144 | .cxx.obj: 145 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 146 | 147 | {$(srcdir)}.cpp{}.obj: 148 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 149 | 150 | {$(topdir)}.cpp{}.obj: 151 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 152 | 153 | {$(hdrdir)}.cpp{}.obj: 154 | 
$(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 155 | 156 | .cpp.obj: 157 | $(CXX) $(INCFLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -Tp$(<:\=/) 158 | 159 | {$(srcdir)}.c{}.obj: 160 | $(CC) $(INCFLAGS) $(CFLAGS) $(CPPFLAGS) -c -Tc$(<:\=/) 161 | 162 | {$(topdir)}.c{}.obj: 163 | $(CC) $(INCFLAGS) $(CFLAGS) $(CPPFLAGS) -c -Tc$(<:\=/) 164 | 165 | {$(hdrdir)}.c{}.obj: 166 | $(CC) $(INCFLAGS) $(CFLAGS) $(CPPFLAGS) -c -Tc$(<:\=/) 167 | 168 | .c.obj: 169 | $(CC) $(INCFLAGS) $(CFLAGS) $(CPPFLAGS) -c -Tc$(<:\=/) 170 | 171 | $(DLLIB): $(DEFFILE) $(OBJS) 172 | @-$(RM) $@ 173 | $(LDSHARED) -Fe$(@) $(OBJS) $(LIBS) $(LOCAL_LIBS) $(DLDFLAGS) 174 | 175 | 176 | 177 | $(DEFFILE): 178 | $(RUBY) -e "puts 'EXPORTS', 'Init_$(TARGET)'" > $@ 179 | 180 | $(OBJS): {.;$(srcdir);$(topdir);$(hdrdir)}ruby.h {.;$(srcdir);$(topdir);$(hdrdir)}defines.h 181 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/script/build_tolower_table.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import unicodedata 5 | import re 6 | import codecs 7 | import os 8 | 9 | def getNum(leftK): 10 | idxleftK = 0 11 | if leftK.find('U+') == -1: 12 | idxleftK = ord(leftK) 13 | else: 14 | leftK = leftK.replace('U+','0x') 15 | idxleftK = eval(leftK) 16 | return idxleftK 17 | 18 | def dump(i, table): 19 | #print table 20 | print '//'+hex(i/256-1),hex(i) 21 | #print 'const static u2 table_'+hex(i/256-1)+'[] = {' 22 | print 'const static u2 table_'+str(i/256-1)+'[] = {' 23 | line = '' 24 | for j in range(0,256): 25 | #print hex(j) +'->'+hex(table[j]), 26 | if j and j % 16 == 0: 27 | print line; 28 | line = '' 29 | line = line + hex(table[j])+',' 30 | print line[:-1] 31 | print '};' 32 | #dump convert table 33 | print '/*' 34 | for j in range(0,256): 35 | #print i-255+j, table[j] 36 | if i-256+j and table[j]: 37 | print 
(unichr(i-256+j)+'>'+unichr(table[j])).encode('UTF-8'), 38 | print '*/' 39 | pass 40 | 41 | def main(): 42 | fh = codecs.open(sys.argv[1],"r", "UTF-8") 43 | lines = fh.readlines() 44 | fh.close() 45 | trans_table = [0]*65536; 46 | for line in lines: 47 | if line[0] == '#': 48 | continue 49 | parse = line.strip().split(',') 50 | for p in parse: 51 | keyIdx = p.find('->') 52 | #print p, 53 | if keyIdx != -1 and p.find('..') == -1: 54 | #not range 55 | #print p, 56 | idxleftK = 0 57 | idxrightK = 0 58 | leftK = p[:keyIdx] 59 | if leftK.find('U+') == -1: 60 | idxleftK = ord(leftK) 61 | else: 62 | leftK = leftK.replace('U+','0x') 63 | idxleftK = eval(leftK) 64 | rightK = p[keyIdx+2:] 65 | if rightK.find('U+') == -1: 66 | idxrightK = ord(rightK) 67 | else: 68 | rightK = rightK.replace('U+','0x') 69 | idxrightK = eval(rightK) 70 | if idxleftK > 65536 or idxrightK > 65536: 71 | continue 72 | if idxleftK and idxrightK: 73 | #print leftK,rightK,'\t', 74 | #print idxleftK,idxrightK, 75 | #print (unichr(idxleftK) +'->'+ unichr(idxrightK)+'\t').encode('UTF-8') 76 | trans_table[idxleftK] = idxrightK 77 | #Russian char made things harder. 
78 | bSkipOverride = 0; 79 | if bSkipOverride and trans_table[idxleftK] and trans_table[idxleftK] != idxrightK: 80 | print leftK, rightK, "inconst conver",idxleftK,idxrightK,trans_table[idxleftK] 81 | print (unichr(idxleftK) + ',' + unichr(idxrightK) + ',' + unichr(trans_table[idxleftK])).encode('UTF-8') 82 | pass 83 | 84 | trans_table[idxleftK] = idxrightK 85 | pass 86 | if keyIdx != -1 and p.find('..') > 0: 87 | leftK = p[:keyIdx] 88 | rightK = p[keyIdx+2:] 89 | lbegin = leftK.find('..') 90 | strbegin = leftK[:lbegin].strip() 91 | strend = leftK[lbegin+2:].strip() 92 | #print getNum(strbegin),getNum(strend) 93 | from_range = range(getNum(strbegin),getNum(strend)+1) 94 | leftK = rightK 95 | lbegin = leftK.find('..') 96 | strbegin = leftK[:lbegin].strip() 97 | strend = leftK[lbegin+2:].strip() 98 | to_range = range(getNum(strbegin),getNum(strend)+1) 99 | 100 | for i in range(0,len(from_range)): 101 | if trans_table[from_range[i]] and trans_table[from_range[i]] != to_range[i]: 102 | print "inconst conver",from_range[i],to_range[i],trans_table[idxleftK] 103 | #print from_range[i],to_range[i] 104 | trans_table[from_range[i]] = to_range[i] 105 | #print getNum(strbegin),getNum(strend) 106 | #print p, 107 | pass 108 | # 人工 强制指定的符号转换 109 | trans_table[ord(u'/')] = ord('/') 110 | trans_table[ord(u'¥')] = ord('$') 111 | trans_table[ord(u'#')] = ord('#') 112 | trans_table[ord(u'%')] = ord('%') 113 | trans_table[ord(u'!')] = ord('!') 114 | trans_table[ord(u'*')] = ord('*') 115 | trans_table[ord(u'(')] = ord('(') 116 | trans_table[ord(u')')] = ord(')') 117 | trans_table[ord(u'-')] = ord('-') 118 | trans_table[ord(u'+')] = ord('+') 119 | trans_table[ord(u'=')] = ord('=') 120 | trans_table[ord(u'{')] = ord('{') 121 | trans_table[ord(u'}')] = ord('}') 122 | trans_table[ord(u'[')] = ord('[') 123 | trans_table[ord(u']')] = ord(']') 124 | trans_table[ord(u'、')] = ord(',') 125 | trans_table[ord(u'|')] = ord('|') 126 | trans_table[ord(u';')] = ord(';') 127 | trans_table[ord(u':')] = 
ord(':') 128 | trans_table[ord(u'‘')] = ord('\'') 129 | trans_table[ord(u'“')] = ord('"') 130 | trans_table[ord(u'《')] = ord('<') 131 | trans_table[ord(u'》')] = ord('>') 132 | trans_table[ord(u'〉')] = ord('<') 133 | trans_table[ord(u'〈')] = ord('>') 134 | trans_table[ord(u'?')] = ord('?') 135 | trans_table[ord(u'~')] =ord('~') 136 | trans_table[ord(u'`')] =ord('`') 137 | 138 | #dump the trans-table 139 | #page size = 256 140 | #print trans_table 141 | trans_page = [0]*256 142 | idx_page = [0]*256 143 | i = 0 144 | for i in range(0,65536): 145 | if i%256 == 0: 146 | bOutput = 0 147 | for j in range(0,256): 148 | if trans_page[j]: 149 | bOutput = 1 150 | break 151 | if bOutput: 152 | #print trans_page 153 | idx_page[i/256-1] = 1; 154 | dump(i,trans_page) 155 | trans_page = [0]*256 156 | if trans_table[i]: 157 | trans_page[i%256] = trans_table[i] 158 | 159 | bOutput = 0 160 | for j in range(0,256): 161 | if trans_page[j]: 162 | bOutput = 1 163 | break 164 | if bOutput: 165 | #print trans_page 166 | idx_page[i/256] = 1; 167 | dump(i+1,trans_page) 168 | print 'const static u2 table_index[] = {' 169 | for j in range(0,256): 170 | if idx_page[j]: 171 | print 'table_'+str(j), 172 | else: 173 | print 'NULL', 174 | if j != 255: 175 | print ',', 176 | print '};' 177 | if __name__ == "__main__": 178 | main() -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/ruby/rubyapi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | /* Ruby 1.7 defines NUM2LL(), LL2NUM() and ULL2NUM() macros */ 10 | #ifndef NUM2LL 11 | #define NUM2LL(x) NUM2LONG((x)) 12 | #endif 13 | #ifndef LL2NUM 14 | #define LL2NUM(x) INT2NUM((long) (x)) 15 | #endif 16 | #ifndef ULL2NUM 17 | #define ULL2NUM(x) UINT2NUM((unsigned long) (x)) 18 | #endif 19 | 20 | /* Ruby 1.7 doesn't (yet) define NUM2ULL() */ 21 | #ifndef NUM2ULL 22 | #ifdef 
HAVE_LONG_LONG 23 | #define NUM2ULL(x) rb_num2ull((x)) 24 | #else 25 | #define NUM2ULL(x) NUM2ULONG(x) 26 | #endif 27 | #endif 28 | 29 | /* RSTRING_LEN, etc are new in Ruby 1.9, but ->ptr and ->len no longer work */ 30 | /* Define these for older versions so we can just write code the new way */ 31 | #ifndef RSTRING_LEN 32 | # define RSTRING_LEN(x) RSTRING(x)->len 33 | #endif 34 | #ifndef RSTRING_PTR 35 | # define RSTRING_PTR(x) RSTRING(x)->ptr 36 | #endif 37 | #ifndef RARRAY_LEN 38 | # define RARRAY_LEN(x) RARRAY(x)->len 39 | #endif 40 | #ifndef RARRAY_PTR 41 | # define RARRAY_PTR(x) RARRAY(x)->ptr 42 | #endif 43 | 44 | #include 45 | #include 46 | 47 | /* calling conventions for Windows */ 48 | #ifndef SWIGSTDCALL 49 | # if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) 50 | # define SWIGSTDCALL __stdcall 51 | # else 52 | # define SWIGSTDCALL 53 | # endif 54 | #endif 55 | 56 | #include "SegmenterManager.h" 57 | #include "Segmenter.h" 58 | 59 | #ifdef __cplusplus 60 | extern "C" { 61 | #endif 62 | 63 | //fixme, unload when so unload? 64 | css::SegmenterManager g_mgr; 65 | int g_bInited = 0; 66 | 67 | static void 68 | mmseg_dfree 69 | (void *cd) 70 | { 71 | //printf("needs to clean up"); 72 | } 73 | 74 | #define MMSEG_FREE mmseg_dfree 75 | 76 | static VALUE 77 | mmseg_free 78 | (VALUE cd) 79 | { 80 | //do free here. 
81 | return Qnil; 82 | } 83 | 84 | static VALUE 85 | check_mmseg 86 | (VALUE obj) 87 | { 88 | Check_Type(obj, T_DATA); 89 | if (RDATA(obj)->dfree != MMSEG_FREE) { 90 | rb_raise(rb_eArgError, "mmseg expected (%s)", rb_class2name(CLASS_OF(obj))); 91 | } 92 | return (VALUE)DATA_PTR(obj); 93 | } 94 | static VALUE 95 | mmseg_s_allocate 96 | (VALUE klass) 97 | { 98 | return Data_Wrap_Struct(klass, 0, MMSEG_FREE, 0); 99 | } 100 | 101 | static VALUE 102 | mmseg_initialize(VALUE self){ 103 | mmseg_free(check_mmseg(self)); 104 | DATA_PTR(self) = NULL; 105 | rb_iv_set(self, "@start", INT2NUM(0)); 106 | rb_iv_set(self, "@end", INT2NUM(0)); 107 | return self; 108 | } 109 | 110 | static VALUE mmseg_next(VALUE self) 111 | { 112 | u2 tok_len = 0; 113 | int nPos = 0; 114 | css::Segmenter* seg = NULL; 115 | Data_Get_Struct(self, css::Segmenter, seg); 116 | //printf("%d",seg); //check is got it 117 | if(seg){ 118 | u2 len = 0, symlen = 0; 119 | char* tok = (char*)seg->peekToken(len,symlen); 120 | //printf("%s\t",tok); 121 | //FIXME: if ruby version do not enbale symlen, the len and symlen always the same. 
122 | if(!tok || !*tok || !len) 123 | tok_len = 0; 124 | else 125 | tok_len = len; 126 | seg->popToken(len); 127 | } 128 | //update position info 129 | VALUE vPos = rb_iv_get(self, "@end"); 130 | if(!NIL_P(vPos)){ 131 | nPos = FIX2INT(vPos); 132 | } 133 | rb_iv_set(self, "@start", INT2NUM(nPos)); 134 | rb_iv_set(self, "@end", INT2NUM(nPos+tok_len)); 135 | if(tok_len) 136 | return self; 137 | else 138 | return Qnil; 139 | } 140 | 141 | static VALUE mmseg_start(VALUE self) { 142 | return rb_iv_get(self, "@start"); 143 | } 144 | 145 | static VALUE mmseg_end(VALUE self) { 146 | return rb_iv_get(self, "@end"); 147 | } 148 | 149 | 150 | static VALUE 151 | mmseg_settext 152 | (VALUE self, VALUE str) 153 | { 154 | int len; 155 | const char* pstr; 156 | if (TYPE(str) == T_STRING) { 157 | len = RSTRING_LEN(str); 158 | pstr = STR2CSTR(str); 159 | //printf("%d:%s\n",len,pstr); 160 | }else 161 | return Qnil; 162 | 163 | css::Segmenter* seg = NULL; 164 | Data_Get_Struct(self, css::Segmenter, seg); 165 | //printf("%s",pstr); 166 | seg->setBuffer((u1*)pstr,len); 167 | rb_iv_set(self, "@start", INT2NUM(0)); 168 | rb_iv_set(self, "@end", INT2NUM(0)); 169 | return self; 170 | } 171 | 172 | 173 | static VALUE 174 | mmseg_open 175 | (VALUE self, VALUE dict_path, VALUE str) 176 | { 177 | int len; 178 | const char* pstr; 179 | if (TYPE(str) == T_STRING) { 180 | len = RSTRING_LEN(str); 181 | pstr = STR2CSTR(str); 182 | //printf("%d:%s\n",len,pstr); 183 | }else 184 | return Qnil; 185 | 186 | if (!g_bInited && TYPE(dict_path) == T_STRING) { 187 | int nRet = g_mgr.init(STR2CSTR(dict_path)); 188 | if(nRet != 0) { 189 | // should throw an exception 190 | rb_fatal("Can NOT init the segment library."); 191 | return Qnil; 192 | } 193 | g_bInited = 1; 194 | } 195 | if(g_bInited){ 196 | //do segment 197 | css::Segmenter* seg = g_mgr.getSegmenter(); 198 | //hacking 199 | long ptr = (long)seg; 200 | seg->setBuffer((u1*)pstr,len); 201 | self = Data_Wrap_Struct(self, NULL, MMSEG_FREE, (void *)seg); 202 
| }else 203 | return Qnil; 204 | 205 | return self; 206 | } 207 | 208 | VALUE cMMseg; 209 | 210 | void Init_mmseg() { 211 | cMMseg = rb_define_class("Mmseg", rb_cData); 212 | rb_define_alloc_func(cMMseg, mmseg_s_allocate); 213 | rb_define_singleton_method(cMMseg, "createSeg", RUBY_METHOD_FUNC(mmseg_open), 2); 214 | rb_define_method(cMMseg, "initialize", RUBY_METHOD_FUNC(mmseg_initialize), 0); 215 | rb_define_method(cMMseg, "setText", RUBY_METHOD_FUNC(mmseg_settext), 1); 216 | rb_define_method(cMMseg, "next", RUBY_METHOD_FUNC(mmseg_next), 0); 217 | rb_define_method(cMMseg, "start", RUBY_METHOD_FUNC(mmseg_start), 0); 218 | rb_define_method(cMMseg, "end", RUBY_METHOD_FUNC(mmseg_end), 0); 219 | } 220 | 221 | #ifdef __cplusplus 222 | } 223 | #endif -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/SegmenterManager.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include "Segmenter.h" 25 | #include "SegmenterManager.h" 26 | extern "C"{ 27 | #include "iniparser/iniparser.h" 28 | } 29 | namespace css { 30 | 31 | 32 | const char g_ngram_unigram_dict_name[] = "uni.lib"; 33 | const char g_kword_unigram_dict_name[] = "kw.lib"; 34 | const char g_wordweight_unigram_dict_name[] = "weight.lib"; 35 | const char g_synonyms_dict_name[] = "synonyms.dat"; 36 | const char g_thesaurus_dict_name[] = "thesaurus.lib"; 37 | const char g_config_name[] = "mmseg.ini"; 38 | /** 39 | * Return a newly created segmenter 40 | */ 41 | 42 | Segmenter *SegmenterManager::getSegmenter( bool bFromPool) 43 | { 44 | Segmenter* seg = NULL; 45 | if(m_method == SEG_METHOD_NGRAM){ 46 | if(bFromPool) 47 | seg = seg_freelist_.alloc(); 48 | else 49 | seg = new Segmenter(); 50 | //init seg 51 | seg->m_unidict = &m_uni; 52 | seg->m_symdict = &m_sym; 53 | if(m_kw.isLoad()) 54 | seg->m_kwdict = &m_kw; 55 | if(m_weight.isLoad()) 56 | seg->m_weightdict = &m_weight; 57 | if(m_thesaurus.isLoad()) 58 | seg->m_thesaurus = &m_thesaurus; 59 | seg->m_config = &m_config; 60 | } 61 | return seg; 62 | } 63 | 64 | void SegmenterManager::loadconfig(const char* confile) 65 | { 66 | if(confile == NULL) 67 | return; 68 | dictionary * ini; 69 | char * s; 70 | int sl = 0; 71 | //m_config 72 | ini = iniparser_load(confile); 73 | if (ini==NULL) { 74 | return; // not exist or not a valid ini file 75 | } 76 | /* 77 | u1 merge_number_and_ascii; 78 | u1 seperate_number_ascii; 79 | u1 compress_space; 80 | u1 number_and_ascii_joint[512]; 81 | */ 82 | m_config.merge_number_and_ascii = 83 | iniparser_getboolean(ini, "mmseg:merge_number_and_ascii", 0); 84 | m_config.seperate_number_ascii = 85 | iniparser_getboolean(ini, "mmseg:seperate_number_ascii", 0); 86 | m_config.compress_space = 87 | iniparser_getboolean(ini, "mmseg:compress_space", 0); 88 | s = 89 | iniparser_getstring(ini, 
"mmseg:number_and_ascii_joint", NULL); 90 | if(s){ 91 | sl = strlen(s); 92 | if(sl>511){ 93 | memcpy(m_config.number_and_ascii_joint,s,sl); 94 | m_config.number_and_ascii_joint[511] = 0; 95 | }else{ 96 | memcpy(m_config.number_and_ascii_joint,s,sl); 97 | m_config.number_and_ascii_joint[sl] = 0; 98 | } 99 | } 100 | } 101 | 102 | int SegmenterManager::init(const char* path, u1 method) 103 | { 104 | if( method != SEG_METHOD_NGRAM) 105 | return -4; //unsupport segmethod. 106 | 107 | if( m_inited ) 108 | return 0; //only can be init once. 109 | 110 | char buf[1024]; 111 | memset(buf,0,sizeof(buf)); 112 | if(!path) 113 | memcpy(buf,".",1); 114 | else 115 | memcpy(buf,path,strlen(path)); 116 | int nLen = (int)strlen(path); 117 | //check is end. 118 | #ifdef WIN32 119 | if(buf[nLen-1] != '\\'){ 120 | buf[nLen] = '\\'; 121 | nLen++; 122 | } 123 | #else 124 | if(buf[nLen-1] != '/'){ 125 | buf[nLen] = '/'; 126 | nLen++; 127 | } 128 | #endif 129 | m_method = method; 130 | int nRet = 0; 131 | 132 | if(method == SEG_METHOD_NGRAM) { 133 | seg_freelist_.set_size(64); 134 | memcpy(&buf[nLen],g_ngram_unigram_dict_name,strlen(g_ngram_unigram_dict_name)); 135 | nRet = m_uni.load(buf); 136 | 137 | if(nRet!=0){ 138 | printf("Unigram dictionary load Error\n"); 139 | return nRet; 140 | } 141 | //no needs to care kwformat 142 | memcpy(&buf[nLen],g_kword_unigram_dict_name,strlen(g_kword_unigram_dict_name)); 143 | buf[nLen+strlen(g_kword_unigram_dict_name)] = 0; 144 | nRet = m_kw.load(buf); 145 | if(nRet!=0 && nRet!=-1 ){ 146 | //m_kw not exist or format error. 147 | printf("Keyword dictionary load Error\n"); 148 | return nRet; 149 | } 150 | 151 | //try to load weight dict 152 | memcpy(&buf[nLen],g_wordweight_unigram_dict_name,strlen(g_wordweight_unigram_dict_name)); 153 | buf[nLen+strlen(g_wordweight_unigram_dict_name)] = 0; 154 | nRet = m_weight.load(buf); 155 | if(nRet!=0 && nRet!=-1 ){ 156 | //m_kw not exist or format error. 
157 | printf("Keyword dictionary load Error\n"); 158 | return nRet; 159 | } 160 | 161 | memcpy(&buf[nLen],g_synonyms_dict_name,strlen(g_synonyms_dict_name)); 162 | buf[nLen+strlen(g_synonyms_dict_name)] = 0; 163 | //load g_synonyms_dict_name, we do not care the load in right or not 164 | nRet = m_sym.load(buf); 165 | if(nRet!=0 && nRet != -1){ 166 | printf("Synonyms dictionary format Error\n"); 167 | } 168 | 169 | memcpy(&buf[nLen],g_thesaurus_dict_name,strlen(g_thesaurus_dict_name)); 170 | buf[nLen+strlen(g_thesaurus_dict_name)] = 0; 171 | //load g_synonyms_dict_name, we do not care the load in right or not 172 | nRet = m_thesaurus.load(buf); 173 | if(nRet!=0 && nRet != -1){ 174 | printf("Thesaurus dictionary format Error\n"); 175 | } 176 | 177 | //read config 178 | memcpy(&buf[nLen],g_config_name,strlen(g_config_name)); 179 | buf[nLen+strlen(g_config_name)] = 0; 180 | loadconfig(buf); 181 | 182 | nRet = 0; 183 | m_inited = 1; 184 | return nRet; 185 | } 186 | return -1; 187 | } 188 | 189 | void SegmenterManager::clear() 190 | { 191 | seg_freelist_.free(); 192 | } 193 | SegmenterManager::SegmenterManager() 194 | :m_inited(0) 195 | { 196 | m_method = SEG_METHOD_NGRAM; 197 | } 198 | SegmenterManager::~SegmenterManager() 199 | { 200 | clear(); 201 | } 202 | } /* End of namespace css */ 203 | 204 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/find_apr.m4: -------------------------------------------------------------------------------- 1 | dnl -------------------------------------------------------- -*- autoconf -*- 2 | dnl Copyright 2000-2005 The Apache Software Foundation 3 | dnl 4 | dnl Licensed under the Apache License, Version 2.0 (the "License"); 5 | dnl you may not use this file except in compliance with the License. 
6 | dnl You may obtain a copy of the License at 7 | dnl 8 | dnl http://www.apache.org/licenses/LICENSE-2.0 9 | dnl 10 | dnl Unless required by applicable law or agreed to in writing, software 11 | dnl distributed under the License is distributed on an "AS IS" BASIS, 12 | dnl WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | dnl See the License for the specific language governing permissions and 14 | dnl limitations under the License. 15 | 16 | dnl 17 | dnl find_apr.m4 : locate the APR include files and libraries 18 | dnl 19 | dnl This macro file can be used by applications to find and use the APR 20 | dnl library. It provides a standardized mechanism for using APR. It supports 21 | dnl embedding APR into the application source, or locating an installed 22 | dnl copy of APR. 23 | dnl 24 | dnl APR_FIND_APR(srcdir, builddir, implicit-install-check, acceptable-majors) 25 | dnl 26 | dnl where srcdir is the location of the bundled APR source directory, or 27 | dnl empty if source is not bundled. 28 | dnl 29 | dnl where builddir is the location where the bundled APR will be built, 30 | dnl or empty if the build will occur in the srcdir. 31 | dnl 32 | dnl where implicit-install-check set to 1 indicates if there is no 33 | dnl --with-apr option specified, we will look for installed copies. 34 | dnl 35 | dnl where acceptable-majors is a space separated list of acceptable major 36 | dnl version numbers. Often only a single major version will be acceptable. 37 | dnl If multiple versions are specified, and --with-apr=PREFIX or the 38 | dnl implicit installed search are used, then the first (leftmost) version 39 | dnl in the list that is found will be used. Currently defaults to [0 1]. 40 | dnl 41 | dnl Sets the following variables on exit: 42 | dnl 43 | dnl apr_found : "yes", "no", "reconfig" 44 | dnl 45 | dnl apr_config : If the apr-config tool exists, this refers to it.
If 46 | dnl apr_found is "reconfig", then the bundled directory 47 | dnl should be reconfigured *before* using apr_config. 48 | dnl 49 | dnl Note: this macro file assumes that apr-config has been installed; it 50 | dnl is normally considered a required part of an APR installation. 51 | dnl 52 | dnl If a bundled source directory is available and needs to be (re)configured, 53 | dnl then apr_found is set to "reconfig". The caller should reconfigure the 54 | dnl (passed-in) source directory, placing the result in the build directory, 55 | dnl as appropriate. 56 | dnl 57 | dnl If apr_found is "yes" or "reconfig", then the caller should use the 58 | dnl value of apr_config to fetch any necessary build/link information. 59 | dnl 60 | 61 | AC_DEFUN([APR_FIND_APR], [ 62 | apr_found="no" 63 | 64 | if test "$target_os" = "os2-emx"; then 65 | # Scripts don't pass test -x on OS/2 66 | TEST_X="test -f" 67 | else 68 | TEST_X="test -x" 69 | fi 70 | 71 | ifelse([$4], [], [ 72 | ifdef(AC_WARNING,AC_WARNING([$0: missing argument 4 (acceptable-majors): Defaulting to APR 0.x then APR 1.x])) 73 | acceptable_majors="0 1"], 74 | [acceptable_majors="$4"]) 75 | 76 | apr_temp_acceptable_apr_config="" 77 | for apr_temp_major in $acceptable_majors 78 | do 79 | case $apr_temp_major in 80 | 0) 81 | apr_temp_acceptable_apr_config="$apr_temp_acceptable_apr_config apr-config" 82 | ;; 83 | *) 84 | apr_temp_acceptable_apr_config="$apr_temp_acceptable_apr_config apr-$apr_temp_major-config" 85 | ;; 86 | esac 87 | done 88 | 89 | AC_MSG_CHECKING(for APR) 90 | AC_ARG_WITH(apr, 91 | [ --with-apr=PATH prefix for installed APR, path to APR build tree, 92 | or the full path to apr-config], 93 | [ 94 | if test "$withval" = "no" || test "$withval" = "yes"; then 95 | AC_MSG_ERROR([--with-apr requires a directory or file to be provided]) 96 | fi 97 | 98 | for apr_temp_apr_config_file in $apr_temp_acceptable_apr_config 99 | do 100 | for lookdir in "$withval/bin" "$withval" 101 | do 102 | if $TEST_X 
"$lookdir/$apr_temp_apr_config_file"; then 103 | apr_found="yes" 104 | apr_config="$lookdir/$apr_temp_apr_config_file" 105 | break 2 106 | fi 107 | done 108 | done 109 | 110 | if test "$apr_found" != "yes" && $TEST_X "$withval" && $withval --help > /dev/null 2>&1 ; then 111 | apr_found="yes" 112 | apr_config="$withval" 113 | fi 114 | 115 | dnl if --with-apr is used, it is a fatal error for its argument 116 | dnl to be invalid 117 | if test "$apr_found" != "yes"; then 118 | AC_MSG_ERROR([the --with-apr parameter is incorrect. It must specify an install prefix, a build directory, or an apr-config file.]) 119 | fi 120 | ],[ 121 | dnl If we allow installed copies, check those before using bundled copy. 122 | if test -n "$3" && test "$3" = "1"; then 123 | for apr_temp_apr_config_file in $apr_temp_acceptable_apr_config 124 | do 125 | if $apr_temp_apr_config_file --help > /dev/null 2>&1 ; then 126 | apr_found="yes" 127 | apr_config="$apr_temp_apr_config_file" 128 | break 129 | else 130 | dnl look in some standard places 131 | for lookdir in /usr /usr/local /usr/local/apr /opt/apr /usr/local/apache2; do 132 | if $TEST_X "$lookdir/bin/$apr_temp_apr_config_file"; then 133 | apr_found="yes" 134 | apr_config="$lookdir/bin/$apr_temp_apr_config_file" 135 | break 2 136 | fi 137 | done 138 | fi 139 | done 140 | fi 141 | dnl if we have not found anything yet and have bundled source, use that 142 | if test "$apr_found" = "no" && test -d "$1"; then 143 | apr_temp_abs_srcdir="`cd $1 && pwd`" 144 | apr_found="reconfig" 145 | apr_bundled_major="`sed -n '/#define.*APR_MAJOR_VERSION/s/^[^0-9]*\([0-9]*\).*$/\1/p' \"$1/include/apr_version.h\"`" 146 | case $apr_bundled_major in 147 | "") 148 | AC_MSG_ERROR([failed to find major version of bundled APR]) 149 | ;; 150 | 0) 151 | apr_temp_apr_config_file="apr-config" 152 | ;; 153 | *) 154 | apr_temp_apr_config_file="apr-$apr_bundled_major-config" 155 | ;; 156 | esac 157 | if test -n "$2"; then 158 | apr_config="$2/$apr_temp_apr_config_file" 159 
| else 160 | apr_config="$1/$apr_temp_apr_config_file" 161 | fi 162 | fi 163 | ]) 164 | 165 | AC_MSG_RESULT($apr_found) 166 | ]) 167 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/mmthunk.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #ifndef _MM_THUNK_H_ 25 | #define _MM_THUNK_H_ 26 | #include 27 | #include 28 | #include "UnigramDict.h" 29 | #include "freelist.h" 30 | 31 | #define CHUNK_BUFFER_SIZE 1024 32 | #define CHUNK_DEBUG 0 33 | 34 | namespace css { 35 | 36 | class Chunk{ 37 | public: 38 | Chunk():m_free_score(0.0),total_length(0){} 39 | float m_free_score; 40 | int total_length; 41 | std::vector tokens; 42 | std::vector freqs; 43 | inline void pushToken(u2 len, u2 freq) { 44 | #if CHUNK_DEBUG 45 | printf("pt:%d, %d;\t",len, freq); 46 | #endif 47 | tokens.push_back(len); 48 | total_length += len; 49 | freqs.push_back(freq); 50 | //m_free_score += log((float)freq) * 100; 51 | } 52 | inline float get_free(){ 53 | //m_free_score 54 | float score = 0.0; 55 | std::vector::iterator it; 56 | float freq = 0; 57 | for(it = freqs.begin(); it < freqs.end(); it++){ 58 | freq = ((float)*it) + 1; 59 | score+= log(freq) * 100; 60 | } 61 | return score; 62 | } 63 | inline float get_avl() { 64 | float avg = (float)1.0*total_length/tokens.size(); 65 | return avg; 66 | } 67 | inline float get_avg(){ 68 | float avg = (float)1.0*total_length/tokens.size(); 69 | std::vector::iterator it; 70 | float total = 0; 71 | for(it = tokens.begin(); it < tokens.end(); it++){ 72 | float diff = ((*it) - avg); 73 | total += diff*diff; 74 | } 75 | return (float)1.0*total/(tokens.size() -1); 76 | } 77 | inline void popup() { 78 | if(tokens.size()) { 79 | total_length -= tokens[tokens.size() - 1]; 80 | tokens.pop_back(); 81 | freqs.pop_back(); 82 | } 83 | } 84 | inline void reset() { 85 | tokens.clear(); 86 | freqs.clear(); 87 | total_length = 0; 88 | } 89 | }; 90 | 91 | class ChunkQueue 92 | { 93 | public: 94 | ChunkQueue():max_length(0) {}; 95 | public: 96 | void push(Chunk& ck) { 97 | if(ck.total_length < max_length) 98 | return; //rule:1 99 | if(ck.total_length > max_length) { 100 | max_length = ck.total_length; 101 | 
m_chunks.clear(); 102 | } 103 | m_chunks.push_back(ck); 104 | }; 105 | u2 getToken(){ 106 | size_t num_chunk = m_chunks.size(); 107 | if(!num_chunk) 108 | return 0; 109 | if(num_chunk == 1) 110 | return m_chunks[0].tokens[0]; 111 | //debug use->dump chunk 112 | #if CHUNK_DEBUG 113 | for(size_t i = 0; i avg_length){ 127 | avg_length = avl; 128 | k_ptr = remains; 129 | *k_ptr = (u4)i; 130 | k_ptr++; 131 | }else 132 | if(avl == avg_length){ 133 | *k_ptr = (u4)i; 134 | k_ptr++; 135 | } 136 | } 137 | if((k_ptr - remains) == 1) 138 | return m_chunks[remains[0]].tokens[0]; //match by rule2 139 | //apply rule 3 140 | u4 remains_r3[256]; 141 | u4* k_ptr_r3 = remains_r3; 142 | avg_length = 1024*64; //an unreachable avg 143 | for(size_t i = 0; imax_score){ 164 | max_score = score; 165 | idx = remains_r3[i]; 166 | } 167 | } 168 | return m_chunks[idx].tokens[0]; 169 | //return 0; 170 | }; 171 | inline void reset() { 172 | m_chunks.clear(); 173 | max_length = 0; 174 | }; 175 | protected: 176 | std::vector m_chunks; 177 | i4 max_length; 178 | }; 179 | 180 | class item_info 181 | { 182 | public: 183 | item_info(): 184 | //length(0), 185 | freq(0){ 186 | }; 187 | 188 | public: 189 | //u4 length; 190 | u4 freq; 191 | std::vector items; 192 | }; 193 | 194 | class MMThunk 195 | { 196 | public: 197 | MMThunk():base_offset(0), m_max_length(-1), m_length(0) 198 | { 199 | memset(m_charinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE); 200 | memset(m_kwinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE); 201 | item_list.set_size(CHUNK_BUFFER_SIZE*2); 202 | }; 203 | ~MMThunk() {}; 204 | 205 | void setItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results); 206 | void setKwItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results); 207 | void advance(u2 step) { base_offset += step; }; 208 | //peek the current token 209 | u1* peekToken(u2& length); 210 | u2 popupToken(); 211 | u1* peekKwToken(u2& pos, u2& length); 212 | u2 popupKwToken(); 213 | 214 | int Tokenize(); 215 | void 
pushToken(u2 aSize, i4 base); 216 | void reset(); 217 | u4 length() { return m_length; }; 218 | protected: 219 | u2 base_offset; 220 | CRFPP::FreeList item_list; 221 | item_info* m_charinfos[CHUNK_BUFFER_SIZE]; 222 | std::vector tokens; 223 | item_info* m_kwinfos[CHUNK_BUFFER_SIZE]; 224 | i4 m_kw_pos; 225 | i4 m_kw_ipos; 226 | i4 m_max_length; 227 | u4 m_length; 228 | ChunkQueue m_queue; 229 | protected: 230 | void pushChunk(Chunk& ck); 231 | }; 232 | 233 | } 234 | 235 | #endif 236 | 237 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/css/mmthunk.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include "csr_typedefs.h" 25 | #include "mmthunk.h" 26 | 27 | namespace css { 28 | 29 | 30 | void MMThunk::setItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results) 31 | { 32 | if(m_max_length < idx) 33 | m_max_length = idx; 34 | 35 | u4 index = (idx % CHUNK_BUFFER_SIZE ) + base_offset; 36 | item_info* item = item_list.alloc(); 37 | item->freq = 0; 38 | item->items.clear(); 39 | for(u2 i = 0; i< rs_count; i++){ 40 | item->freq += results[i].value; 41 | item->items.push_back(results[i].length); 42 | //if(i == rs_count - 1) 43 | // item->length = results[i].length; 44 | } 45 | m_charinfos[idx] = item; 46 | return; 47 | } 48 | 49 | //set the potient key words. 50 | void MMThunk::setKwItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results) 51 | { 52 | if(m_max_length < idx) 53 | m_max_length = idx; 54 | u4 index = (idx % CHUNK_BUFFER_SIZE ) + base_offset; 55 | item_info* item = item_list.alloc(); 56 | item->items.clear(); 57 | for(u2 i = 0; i< rs_count; i++){ 58 | item->freq += results[i].value; 59 | item->items.push_back(results[i].length); 60 | //if(i == rs_count - 1) 61 | // item->length = results[i].length; 62 | } 63 | m_kwinfos[idx] = item; 64 | return; 65 | } 66 | 67 | u1* MMThunk::peekToken(u2& length) 68 | { 69 | length = 0; 70 | if(tokens.size()){ 71 | length = tokens[0]; 72 | //tokens.erase(tokens.begin()); 73 | } 74 | return NULL; 75 | } 76 | 77 | u2 MMThunk::popupToken() 78 | { 79 | u2 length = 0; 80 | if(tokens.size()){ 81 | length = tokens[0]; 82 | m_length -= length; 83 | tokens.erase(tokens.begin()); 84 | } 85 | return length; 86 | } 87 | 88 | u1* MMThunk::peekKwToken(u2& pos, u2& length) 89 | { 90 | if(m_max_length < m_kw_pos) 91 | m_max_length = m_kw_pos; 92 | 93 | while(m_kw_pos <= m_max_length) { 94 | u4 index = (m_kw_pos % CHUNK_BUFFER_SIZE ) + base_offset; 95 | //clear kw_word 96 | item_info* info_kw = m_kwinfos[index]; 97 | 
if(info_kw) { 98 | //find the item 99 | size_t cnt = info_kw->items.size(); 100 | if(m_kw_ipositems[m_kw_ipos]; 102 | m_kw_ipos++; 103 | //found one 104 | pos = m_kw_pos; 105 | return NULL; 106 | } 107 | } 108 | m_kw_pos++; 109 | m_kw_ipos = 0; 110 | } 111 | 112 | length = 0; 113 | return NULL; 114 | } 115 | 116 | u2 MMThunk::popupKwToken() 117 | { 118 | /* 119 | u2 length = 0; 120 | if(kwtokens.size()){ 121 | length = kwtokens[0]; 122 | kwtokens.erase(kwtokens.begin()); 123 | } 124 | */ 125 | return 0; 126 | } 127 | 128 | //do real segment in this function, return token's count 129 | int MMThunk::Tokenize() 130 | { 131 | #if CHUNK_DEBUG 132 | for(u2 i = 0; m_charinfos[i]; i++){ 133 | std::vector::iterator it; 134 | for(it = m_charinfos[i]->items.begin(); 135 | it < m_charinfos[i]->items.end(); 136 | it++) 137 | printf("%d, ", *it); 138 | printf("\n"); 139 | } 140 | #endif 141 | // appply rules 142 | u2 base = 0; 143 | while(base<=m_max_length){ 144 | Chunk chunk; 145 | item_info* info_1st = m_charinfos[base]; 146 | for(size_t i = 0; iitems.size(); i++){ 147 | if(i == 0) 148 | chunk.pushToken(info_1st->items[i], info_1st->freq); 149 | else 150 | chunk.pushToken(info_1st->items[i],0); 151 | //Chunk L1_chunk = chunk; 152 | u2 idx_2nd = info_1st->items[i] + base; 153 | //check bound 154 | item_info* info_2nd = NULL; 155 | if(idx_2nditems.size(); j++) { 159 | if(j == 0) 160 | chunk.pushToken(info_2nd->items[j], info_2nd->freq); 161 | else 162 | chunk.pushToken(info_2nd->items[j],1); 163 | u2 idx_3rd = info_2nd->items[j] + idx_2nd; 164 | if(idx_3rditems[m_charinfos[idx_3rd]->items.size()-1]; 166 | if(m_charinfos[idx_3rd]->items.size() == 1) 167 | chunk.pushToken(idx_4th, m_charinfos[idx_3rd]->freq ); 168 | else 169 | chunk.pushToken(idx_4th, 1); 170 | //push path. 
171 | pushChunk(chunk); 172 | //pop 3part 173 | chunk.popup(); 174 | }else{ 175 | //no 3part, push path 176 | pushChunk(chunk); 177 | } 178 | //pop 2part 179 | chunk.popup(); 180 | }//end for 181 | }//end if 182 | else{ 183 | //no 2part ,push path 184 | pushChunk(chunk); 185 | } 186 | //pop 1part 187 | chunk.popup(); 188 | } 189 | //find the last pharse 190 | //reset. rebase 191 | u2 tok_len = m_queue.getToken(); 192 | if(tok_len){ 193 | pushToken(tok_len, base); //tokens.push_back(tok_len); 194 | }else 195 | break; 196 | m_queue.reset(); 197 | chunk.reset(); 198 | base += tok_len; 199 | }//end while 200 | return 0; 201 | } 202 | 203 | void MMThunk::pushChunk(Chunk& ck) 204 | { 205 | #if CHUNK_DEBUG 206 | printf("Pushing: "); 207 | for(size_t i = 0; i::iterator it = info_kw->items.begin(); 226 | for(;ititems.end();it++) { 227 | if(*it == aSize) { 228 | info_kw->items.erase(it); //find the same item. 229 | break; 230 | } 231 | } 232 | } 233 | } 234 | 235 | void MMThunk::reset() 236 | { 237 | memset(m_charinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE); 238 | memset(m_kwinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE); 239 | item_list.free(); 240 | tokens.clear(); 241 | m_queue.reset(); 242 | 243 | m_max_length = -1; 244 | m_length = 0; 245 | m_kw_pos = m_kw_ipos = 0; 246 | } 247 | 248 | } 249 | 250 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/iniparser/dictionary.h: -------------------------------------------------------------------------------- 1 | 2 | /*-------------------------------------------------------------------------*/ 3 | /** 4 | @file dictionary.h 5 | @author N. Devillard 6 | @date Sep 2007 7 | @version $Revision: 1.12 $ 8 | @brief Implements a dictionary for string variables. 9 | 10 | This module implements a simple dictionary object, i.e. a list 11 | of string/string associations. This object is useful to store e.g. 12 | informations retrieved from a configuration file (ini files). 
13 | */ 14 | /*--------------------------------------------------------------------------*/ 15 | 16 | /* 17 | $Id: dictionary.h,v 1.12 2007-11-23 21:37:00 ndevilla Exp $ 18 | $Author: ndevilla $ 19 | $Date: 2007-11-23 21:37:00 $ 20 | $Revision: 1.12 $ 21 | */ 22 | 23 | #ifndef _DICTIONARY_H_ 24 | #define _DICTIONARY_H_ 25 | 26 | /*--------------------------------------------------------------------------- 27 | Includes 28 | ---------------------------------------------------------------------------*/ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | /*--------------------------------------------------------------------------- 35 | New types 36 | ---------------------------------------------------------------------------*/ 37 | 38 | 39 | /*-------------------------------------------------------------------------*/ 40 | /** 41 | @brief Dictionary object 42 | 43 | This object contains a list of string/string associations. Each 44 | association is identified by a unique string key. Looking up values 45 | in the dictionary is speeded up by the use of a (hopefully collision-free) 46 | hash function. 47 | */ 48 | /*-------------------------------------------------------------------------*/ 49 | typedef struct _dictionary_ { 50 | int n ; /** Number of entries in dictionary */ 51 | int size ; /** Storage size */ 52 | char ** val ; /** List of string values */ 53 | char ** key ; /** List of string keys */ 54 | unsigned * hash ; /** List of hash values for keys */ 55 | } dictionary ; 56 | 57 | 58 | /*--------------------------------------------------------------------------- 59 | Function prototypes 60 | ---------------------------------------------------------------------------*/ 61 | 62 | /*-------------------------------------------------------------------------*/ 63 | /** 64 | @brief Compute the hash key for a string. 65 | @param key Character string to use for key. 66 | @return 1 unsigned int on at least 32 bits. 
67 | 68 | This hash function has been taken from an Article in Dr Dobbs Journal. 69 | This is normally a collision-free function, distributing keys evenly. 70 | The key is stored anyway in the struct so that collision can be avoided 71 | by comparing the key itself in last resort. 72 | */ 73 | /*--------------------------------------------------------------------------*/ 74 | unsigned dictionary_hash(char * key); 75 | 76 | /*-------------------------------------------------------------------------*/ 77 | /** 78 | @brief Create a new dictionary object. 79 | @param size Optional initial size of the dictionary. 80 | @return 1 newly allocated dictionary objet. 81 | 82 | This function allocates a new dictionary object of given size and returns 83 | it. If you do not know in advance (roughly) the number of entries in the 84 | dictionary, give size=0. 85 | */ 86 | /*--------------------------------------------------------------------------*/ 87 | dictionary * dictionary_new(int size); 88 | 89 | /*-------------------------------------------------------------------------*/ 90 | /** 91 | @brief Delete a dictionary object 92 | @param d dictionary object to deallocate. 93 | @return void 94 | 95 | Deallocate a dictionary object and all memory associated to it. 96 | */ 97 | /*--------------------------------------------------------------------------*/ 98 | void dictionary_del(dictionary * vd); 99 | 100 | /*-------------------------------------------------------------------------*/ 101 | /** 102 | @brief Get a value from a dictionary. 103 | @param d dictionary object to search. 104 | @param key Key to look for in the dictionary. 105 | @param def Default value to return if key not found. 106 | @return 1 pointer to internally allocated character string. 107 | 108 | This function locates a key in a dictionary and returns a pointer to its 109 | value, or the passed 'def' pointer if no such key can be found in 110 | dictionary. 
The returned character pointer points to data internal to the 111 | dictionary object, you should not try to free it or modify it. 112 | */ 113 | /*--------------------------------------------------------------------------*/ 114 | char * dictionary_get(dictionary * d, char * key, char * def); 115 | 116 | 117 | /*-------------------------------------------------------------------------*/ 118 | /** 119 | @brief Set a value in a dictionary. 120 | @param d dictionary object to modify. 121 | @param key Key to modify or add. 122 | @param val Value to add. 123 | @return int 0 if Ok, anything else otherwise 124 | 125 | If the given key is found in the dictionary, the associated value is 126 | replaced by the provided one. If the key cannot be found in the 127 | dictionary, it is added to it. 128 | 129 | It is Ok to provide a NULL value for val, but NULL values for the dictionary 130 | or the key are considered as errors: the function will return immediately 131 | in such a case. 132 | 133 | Notice that if you dictionary_set a variable to NULL, a call to 134 | dictionary_get will return a NULL value: the variable will be found, and 135 | its value (NULL) is returned. In other words, setting the variable 136 | content to NULL is equivalent to deleting the variable from the 137 | dictionary. It is not possible (in this implementation) to have a key in 138 | the dictionary without value. 139 | 140 | This function returns non-zero in case of failure. 141 | */ 142 | /*--------------------------------------------------------------------------*/ 143 | int dictionary_set(dictionary * vd, char * key, char * val); 144 | 145 | /*-------------------------------------------------------------------------*/ 146 | /** 147 | @brief Delete a key in a dictionary 148 | @param d dictionary object to modify. 149 | @param key Key to remove. 150 | @return void 151 | 152 | This function deletes a key in a dictionary. Nothing is done if the 153 | key cannot be found. 
154 | */ 155 | /*--------------------------------------------------------------------------*/ 156 | void dictionary_unset(dictionary * d, char * key); 157 | 158 | 159 | /*-------------------------------------------------------------------------*/ 160 | /** 161 | @brief Dump a dictionary to an opened file pointer. 162 | @param d Dictionary to dump 163 | @param f Opened file pointer. 164 | @return void 165 | 166 | Dumps a dictionary onto an opened file pointer. Key pairs are printed out 167 | as @c [Key]=[Value], one per line. It is Ok to provide stdout or stderr as 168 | output file pointers. 169 | */ 170 | /*--------------------------------------------------------------------------*/ 171 | void dictionary_dump(dictionary * d, FILE * out); 172 | 173 | #endif 174 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/install-sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # install - install a program, script, or datafile 4 | # 5 | # This originates from X11R5 (mit/util/scripts/install.sh), which was 6 | # later released in X11R6 (xc/config/util/install.sh) with the 7 | # following copyright and license. 8 | # 9 | # Copyright (C) 1994 X Consortium 10 | # 11 | # Permission is hereby granted, free of charge, to any person obtaining a copy 12 | # of this software and associated documentation files (the "Software"), to 13 | # deal in the Software without restriction, including without limitation the 14 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 15 | # sell copies of the Software, and to permit persons to whom the Software is 16 | # furnished to do so, subject to the following conditions: 17 | # 18 | # The above copyright notice and this permission notice shall be included in 19 | # all copies or substantial portions of the Software. 
20 | # 21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | # X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 25 | # AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- 26 | # TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | # 28 | # Except as contained in this notice, the name of the X Consortium shall not 29 | # be used in advertising or otherwise to promote the sale, use or other deal- 30 | # ings in this Software without prior written authorization from the X Consor- 31 | # tium. 32 | # 33 | # 34 | # FSF changes to this file are in the public domain. 35 | # 36 | # Calling this script install-sh is preferred over install.sh, to prevent 37 | # `make' implicit rules from creating a file called install from it 38 | # when there is no Makefile. 39 | # 40 | # This script is compatible with the BSD install script, but was written 41 | # from scratch. It can only install one file at a time, a restriction 42 | # shared with many OS's install programs. 43 | 44 | 45 | # set DOITPROG to echo to test this script 46 | 47 | # Don't use :- since 4.3BSD and earlier shells don't like it. 48 | doit="${DOITPROG-}" 49 | 50 | 51 | # put in absolute paths if you don't have them in your path; or use env. vars. 
52 | 53 | mvprog="${MVPROG-mv}" 54 | cpprog="${CPPROG-cp}" 55 | chmodprog="${CHMODPROG-chmod}" 56 | chownprog="${CHOWNPROG-chown}" 57 | chgrpprog="${CHGRPPROG-chgrp}" 58 | stripprog="${STRIPPROG-strip}" 59 | rmprog="${RMPROG-rm}" 60 | mkdirprog="${MKDIRPROG-mkdir}" 61 | 62 | transformbasename="" 63 | transform_arg="" 64 | instcmd="$mvprog" 65 | chmodcmd="$chmodprog 0755" 66 | chowncmd="" 67 | chgrpcmd="" 68 | stripcmd="" 69 | rmcmd="$rmprog -f" 70 | mvcmd="$mvprog" 71 | src="" 72 | dst="" 73 | dir_arg="" 74 | 75 | while [ x"$1" != x ]; do 76 | case $1 in 77 | -c) instcmd="$cpprog" 78 | shift 79 | continue;; 80 | 81 | -d) dir_arg=true 82 | shift 83 | continue;; 84 | 85 | -m) chmodcmd="$chmodprog $2" 86 | shift 87 | shift 88 | continue;; 89 | 90 | -o) chowncmd="$chownprog $2" 91 | shift 92 | shift 93 | continue;; 94 | 95 | -g) chgrpcmd="$chgrpprog $2" 96 | shift 97 | shift 98 | continue;; 99 | 100 | -s) stripcmd="$stripprog" 101 | shift 102 | continue;; 103 | 104 | -t=*) transformarg=`echo $1 | sed 's/-t=//'` 105 | shift 106 | continue;; 107 | 108 | -b=*) transformbasename=`echo $1 | sed 's/-b=//'` 109 | shift 110 | continue;; 111 | 112 | *) if [ x"$src" = x ] 113 | then 114 | src=$1 115 | else 116 | # this colon is to work around a 386BSD /bin/sh bug 117 | : 118 | dst=$1 119 | fi 120 | shift 121 | continue;; 122 | esac 123 | done 124 | 125 | if [ x"$src" = x ] 126 | then 127 | echo "install: no input file specified" 128 | exit 1 129 | else 130 | true 131 | fi 132 | 133 | if [ x"$dir_arg" != x ]; then 134 | dst=$src 135 | src="" 136 | 137 | if [ -d $dst ]; then 138 | instcmd=: 139 | chmodcmd="" 140 | else 141 | instcmd=mkdir 142 | fi 143 | else 144 | 145 | # Waiting for this to be detected by the "$instcmd $src $dsttmp" command 146 | # might cause directories to be created, which would be especially bad 147 | # if $src (and thus $dsttmp) contains '*'. 
148 | 149 | if [ -f $src -o -d $src ] 150 | then 151 | true 152 | else 153 | echo "install: $src does not exist" 154 | exit 1 155 | fi 156 | 157 | if [ x"$dst" = x ] 158 | then 159 | echo "install: no destination specified" 160 | exit 1 161 | else 162 | true 163 | fi 164 | 165 | # If destination is a directory, append the input filename; if your system 166 | # does not like double slashes in filenames, you may need to add some logic 167 | 168 | if [ -d $dst ] 169 | then 170 | dst="$dst"/`basename $src` 171 | else 172 | true 173 | fi 174 | fi 175 | 176 | ## this sed command emulates the dirname command 177 | dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` 178 | 179 | # Make sure that the destination directory exists. 180 | # this part is taken from Noah Friedman's mkinstalldirs script 181 | 182 | # Skip lots of stat calls in the usual case. 183 | if [ ! -d "$dstdir" ]; then 184 | defaultIFS=' 185 | ' 186 | IFS="${IFS-${defaultIFS}}" 187 | 188 | oIFS="${IFS}" 189 | # Some sh's can't handle IFS=/ for some reason. 190 | IFS='%' 191 | set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` 192 | IFS="${oIFS}" 193 | 194 | pathcomp='' 195 | 196 | while [ $# -ne 0 ] ; do 197 | pathcomp="${pathcomp}${1}" 198 | shift 199 | 200 | if [ ! -d "${pathcomp}" ] ; 201 | then 202 | $mkdirprog "${pathcomp}" 203 | else 204 | true 205 | fi 206 | 207 | pathcomp="${pathcomp}/" 208 | done 209 | fi 210 | 211 | if [ x"$dir_arg" != x ] 212 | then 213 | $doit $instcmd $dst && 214 | 215 | if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && 216 | if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && 217 | if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && 218 | if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi 219 | else 220 | 221 | # If we're going to rename the final executable, determine the name now. 
222 | 223 | if [ x"$transformarg" = x ] 224 | then 225 | dstfile=`basename $dst` 226 | else 227 | dstfile=`basename $dst $transformbasename | 228 | sed $transformarg`$transformbasename 229 | fi 230 | 231 | # don't allow the sed command to completely eliminate the filename 232 | 233 | if [ x"$dstfile" = x ] 234 | then 235 | dstfile=`basename $dst` 236 | else 237 | true 238 | fi 239 | 240 | # Make a temp file name in the proper directory. 241 | 242 | dsttmp=$dstdir/#inst.$$# 243 | 244 | # Move or copy the file name to the temp name 245 | 246 | $doit $instcmd $src $dsttmp && 247 | 248 | trap "rm -f ${dsttmp}" 0 && 249 | 250 | # and set any options; do chmod last to preserve setuid bits 251 | 252 | # If any of these fail, we abort the whole thing. If we want to 253 | # ignore errors from any of these, just make sure not to ignore 254 | # errors from the above "$doit $instcmd $src $dsttmp" command. 255 | 256 | if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && 257 | if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && 258 | if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && 259 | if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && 260 | 261 | # Now rename the file to the real destination. 262 | 263 | $doit $rmcmd -f $dstdir/$dstfile && 264 | $doit $mvcmd $dsttmp $dstdir/$dstfile 265 | 266 | fi && 267 | 268 | 269 | exit 0 270 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/config/missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common wrapper for a few potentially missing GNU programs. 3 | 4 | scriptversion=2013-10-28.13; # UTC 5 | 6 | # Copyright (C) 1996-2014 Free Software Foundation, Inc. 7 | # Originally written by Fran,cois Pinard , 1996. 
8 | 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | if test $# -eq 0; then 28 | echo 1>&2 "Try '$0 --help' for more information" 29 | exit 1 30 | fi 31 | 32 | case $1 in 33 | 34 | --is-lightweight) 35 | # Used by our autoconf macros to check whether the available missing 36 | # script is modern enough. 37 | exit 0 38 | ;; 39 | 40 | --run) 41 | # Back-compat with the calling convention used by older automake. 42 | shift 43 | ;; 44 | 45 | -h|--h|--he|--hel|--help) 46 | echo "\ 47 | $0 [OPTION]... PROGRAM [ARGUMENT]... 48 | 49 | Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due 50 | to PROGRAM being missing or too old. 51 | 52 | Options: 53 | -h, --help display this help and exit 54 | -v, --version output version information and exit 55 | 56 | Supported PROGRAM values: 57 | aclocal autoconf autoheader autom4te automake makeinfo 58 | bison yacc flex lex help2man 59 | 60 | Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and 61 | 'g' are ignored when checking the name. 62 | 63 | Send bug reports to ." 64 | exit $? 
65 | ;; 66 | 67 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 68 | echo "missing $scriptversion (GNU Automake)" 69 | exit $? 70 | ;; 71 | 72 | -*) 73 | echo 1>&2 "$0: unknown '$1' option" 74 | echo 1>&2 "Try '$0 --help' for more information" 75 | exit 1 76 | ;; 77 | 78 | esac 79 | 80 | # Run the given program, remember its exit status. 81 | "$@"; st=$? 82 | 83 | # If it succeeded, we are done. 84 | test $st -eq 0 && exit 0 85 | 86 | # Also exit now if it failed (or wasn't found), and '--version' or '--help' was 87 | # passed; such an option is passed most likely to detect whether the 88 | # program is present and works. 89 | case $2 in --version|--help) exit $st;; esac 90 | 91 | # Exit code 63 means version mismatch. This often happens when the user 92 | # tries to use an ancient version of a tool on a file that requires a 93 | # minimum version. 94 | if test $st -eq 63; then 95 | msg="probably too old" 96 | elif test $st -eq 127; then 97 | # Program was missing. 98 | msg="missing on your system" 99 | else 100 | # Program was found and executed, but failed. Give up. 
101 | exit $st 102 | fi 103 | 104 | perl_URL=http://www.perl.org/ 105 | flex_URL=http://flex.sourceforge.net/ 106 | gnu_software_URL=http://www.gnu.org/software 107 | 108 | program_details () 109 | { 110 | case $1 in 111 | aclocal|automake) 112 | echo "The '$1' program is part of the GNU Automake package:" 113 | echo "<$gnu_software_URL/automake>" 114 | echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" 115 | echo "<$gnu_software_URL/autoconf>" 116 | echo "<$gnu_software_URL/m4/>" 117 | echo "<$perl_URL>" 118 | ;; 119 | autoconf|autom4te|autoheader) 120 | echo "The '$1' program is part of the GNU Autoconf package:" 121 | echo "<$gnu_software_URL/autoconf/>" 122 | echo "It also requires GNU m4 and Perl in order to run:" 123 | echo "<$gnu_software_URL/m4/>" 124 | echo "<$perl_URL>" 125 | ;; 126 | esac 127 | } 128 | 129 | give_advice () 130 | { 131 | # Normalize program name to check for. 132 | normalized_program=`echo "$1" | sed ' 133 | s/^gnu-//; t 134 | s/^gnu//; t 135 | s/^g//; t'` 136 | 137 | printf '%s\n' "'$1' is $msg." 138 | 139 | configure_deps="'configure.ac' or m4 files included by 'configure.ac'" 140 | case $normalized_program in 141 | autoconf*) 142 | echo "You should only need it if you modified 'configure.ac'," 143 | echo "or m4 files included by it." 144 | program_details 'autoconf' 145 | ;; 146 | autoheader*) 147 | echo "You should only need it if you modified 'acconfig.h' or" 148 | echo "$configure_deps." 149 | program_details 'autoheader' 150 | ;; 151 | automake*) 152 | echo "You should only need it if you modified 'Makefile.am' or" 153 | echo "$configure_deps." 154 | program_details 'automake' 155 | ;; 156 | aclocal*) 157 | echo "You should only need it if you modified 'acinclude.m4' or" 158 | echo "$configure_deps." 159 | program_details 'aclocal' 160 | ;; 161 | autom4te*) 162 | echo "You might have modified some maintainer files that require" 163 | echo "the 'autom4te' program to be rebuilt." 
164 | program_details 'autom4te' 165 | ;; 166 | bison*|yacc*) 167 | echo "You should only need it if you modified a '.y' file." 168 | echo "You may want to install the GNU Bison package:" 169 | echo "<$gnu_software_URL/bison/>" 170 | ;; 171 | lex*|flex*) 172 | echo "You should only need it if you modified a '.l' file." 173 | echo "You may want to install the Fast Lexical Analyzer package:" 174 | echo "<$flex_URL>" 175 | ;; 176 | help2man*) 177 | echo "You should only need it if you modified a dependency" \ 178 | "of a man page." 179 | echo "You may want to install the GNU Help2man package:" 180 | echo "<$gnu_software_URL/help2man/>" 181 | ;; 182 | makeinfo*) 183 | echo "You should only need it if you modified a '.texi' file, or" 184 | echo "any other file indirectly affecting the aspect of the manual." 185 | echo "You might want to install the Texinfo package:" 186 | echo "<$gnu_software_URL/texinfo/>" 187 | echo "The spurious makeinfo call might also be the consequence of" 188 | echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" 189 | echo "want to install GNU make:" 190 | echo "<$gnu_software_URL/make/>" 191 | ;; 192 | *) 193 | echo "You might have modified some files without having the proper" 194 | echo "tools for further handling them. Check the 'README' file, it" 195 | echo "often tells you about the needed prerequisites for installing" 196 | echo "this package. You may also peek at any GNU archive site, in" 197 | echo "case some other package contains this missing '$1' program." 198 | ;; 199 | esac 200 | } 201 | 202 | give_advice "$1" | sed -e '1s/^/WARNING: /' \ 203 | -e '2,$s/^/ /' >&2 204 | 205 | # Propagate the correct exit status (expected to be 127 for a program 206 | # not found, 63 for a program that failed due to version mismatch). 
207 | exit $st 208 | 209 | # Local variables: 210 | # eval: (add-hook 'write-file-hooks 'time-stamp) 211 | # time-stamp-start: "scriptversion=" 212 | # time-stamp-format: "%:y-%02m-%02d.%02H" 213 | # time-stamp-time-zone: "UTC" 214 | # time-stamp-end: "; # UTC" 215 | # End: 216 | -------------------------------------------------------------------------------- /dependencies/mmseg-3.2.14/src/mmseg_main.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: GPL 2.0 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License. You should have 7 | * received a copy of the GPL license along with this program; if you 8 | * did not, you can find it at http://www.gnu.org/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Coreseek.com code. 16 | * 17 | * Copyright (C) 2007-2008. All Rights Reserved. 
18 | * 19 | * Author: 20 | * Li monan 21 | * 22 | * ***** END LICENSE BLOCK ***** */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #ifdef WIN32 33 | #include "bsd_getopt_win.h" 34 | #else 35 | #include "bsd_getopt.h" 36 | #endif 37 | 38 | #include "UnigramCorpusReader.h" 39 | #include "UnigramDict.h" 40 | #include "SynonymsDict.h" 41 | #include "ThesaurusDict.h" 42 | #include "SegmenterManager.h" 43 | #include "Segmenter.h" 44 | #include "csr_utils.h" 45 | 46 | using namespace std; 47 | using namespace css; 48 | 49 | #define SEGMENT_OUTPUT 1 50 | 51 | void usage(const char* argv_0) { 52 | printf("Coreseek COS(tm) MM Segment 1.0\n"); 53 | printf("Copyright By Coreseek.com All Right Reserved.\n"); 54 | printf("Usage: %s