├── tagging ├── Model.cpp ├── Perceptron.h ├── Decoder.h ├── Sample.h ├── Utils.h ├── Perceptron.cpp ├── Model.h ├── Decoder.cpp └── Sample.cpp ├── utility ├── head.h ├── POSExtract.h ├── TextClassification.h ├── SentenceSplit.h ├── utf8.h ├── StringType.h ├── StringOperation.h ├── StringSplit.h ├── POSExtract.cpp ├── Tokenize.h ├── TextClassification.cpp ├── SentenceSplit.cpp ├── StringOperation.cpp ├── utf8 │ ├── unchecked.h │ ├── core.h │ └── checked.h ├── StringSplit.cpp ├── StringType.cpp └── Tokenize.cpp ├── Makefile ├── README.md ├── Formatting.h ├── Las.h ├── Main.cpp ├── tip-las.vcxproj ├── Formatting.cpp └── Las.cpp /tagging/Model.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyc7711/tip-las/HEAD/tagging/Model.cpp -------------------------------------------------------------------------------- /utility/head.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "utility/StringType.h" 6 | #include "utility/StringOperation.h" 7 | #include "utility/StringSplit.h" 8 | 9 | #include "utility/SentenceSplit.h" 10 | 11 | #include "utility/TextClassification.h" 12 | 13 | #include "utility/Tokenize.h" 14 | 15 | #include "utility/POSExtract.h" 16 | 17 | using namespace std; 18 | 19 | using namespace utility; 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS = -O2 -std=c++11 -I./ 2 | 3 | OBJS = ./utility/SentenceSplit.o ./utility/StringOperation.o ./utility/POSExtract.o \ 4 | ./utility/StringSplit.o ./utility/StringType.o ./utility/TextClassification.o \ 5 | ./utility/Tokenize.o ./tagging/Sample.o ./tagging/Model.o ./tagging/Decoder.o \ 6 | ./tagging/Perceptron.o ./Formatting.o ./Las.o ./Main.o 7 | # tip platform 8 | tip-las: $(OBJS) 9 | g++ -g -o ./$@ $^ -lpthread -ldl 10 
| 11 | tip-las.i386: $(OBJS) 12 | g++ -D__MACHINE_TYPE_32__ -o ./tip-tws $^ -lpthread -ldl -m32 13 | 14 | $(OBJS): %.o:%.cpp 15 | g++ $(CXXFLAGS) -o $@ -c $< 16 | 17 | .PHONY: clean 18 | 19 | clean: 20 | -rm $(OBJS) ./tip-las 21 | -------------------------------------------------------------------------------- /utility/POSExtract.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-22 14 | *********************************************************************/ 15 | #ifndef POSEXTRACT_H 16 | #define POSEXTRACT_H 17 | #include 18 | #include 19 | 20 | namespace utility 21 | { 22 | class POSExtract 23 | { 24 | public: 25 | static void ParsePOS(const std::string& line, std::vector >& tokenList); 26 | static void TrimAll(std::string &str); 27 | private: 28 | }; 29 | } 30 | #endif 31 | -------------------------------------------------------------------------------- /utility/TextClassification.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any 
non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | 16 | #ifndef TEXTCLASSIFICATION_H 17 | #define TEXTCLASSIFICATION_H 18 | #include 19 | #include 20 | #include 21 | #include "StringSplit.h" 22 | #include "StringType.h" 23 | #include "StringOperation.h" 24 | 25 | namespace utility 26 | { 27 | class TextClassification 28 | { 29 | public: 30 | TextClassification(); 31 | ~TextClassification(); 32 | static bool TB_UTF8(const std::string& text,std::vector >& vec); 33 | bool CH_GBK(const std::string& text,std::vector >& vec); 34 | private: 35 | StringType * gbk_type; 36 | }; 37 | } 38 | #endif 39 | -------------------------------------------------------------------------------- /utility/SentenceSplit.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 
11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef SENTENCESPLIT_H 16 | #define SENTENCESPLIT_H 17 | 18 | #include 19 | #include 20 | 21 | 22 | namespace utility 23 | { 24 | class SentenceSplit 25 | { 26 | public: 27 | static bool Tibetan(const std::string& line,std::vector >& val_sentences); 28 | static bool ChineseUTF8(const std::string& line,std::vector >& val_sentences); 29 | static bool ChineseANSI(const std::string& line,std::vector >& val_sentences); 30 | private: 31 | inline static bool SplitByTokens(std::vector > &vecstr, const std::string &str, const std::string tokens[], const int tokensnumber); 32 | }; 33 | 34 | } 35 | #endif 36 | -------------------------------------------------------------------------------- /utility/utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "./utf8/checked.h" 32 | #include "./utf8/unchecked.h" 33 | 34 | #endif // header guard 35 | -------------------------------------------------------------------------------- /utility/StringType.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 
11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef STRINGTYPE_H 16 | #define STRINGTYPE_H 17 | #include 18 | #include 19 | #include 20 | 21 | #include "StringSplit.h" 22 | 23 | namespace utility 24 | { 25 | class StringType 26 | { 27 | public: 28 | StringType(); 29 | bool Init(); 30 | int CH_GBKCharType(const std::string& myChar); 31 | 32 | static int TB_UTF8CharType(const std::string & myChar); 33 | static int TB_UTF8StringType(const std::string& str); 34 | static int UTF16CharType(const std::string & str); 35 | static bool IsChGBKPunctuation(const std::string& str); 36 | /*是否为ASCII编码字符串*/ 37 | static bool IsASCIIString(const std::string& word); 38 | /*是否为中文数字*/ 39 | static bool IsCHGBKNumber(const std::string& str); 40 | /*是否是中文字符*/ 41 | static bool IsCHBGKChar(const std::string& str); 42 | static int CHGBKCharIndex(const std::string& chChar); 43 | private: 44 | /*为啥static类型的出错呢?*/ 45 | std::set gbktable; 46 | }; 47 | } 48 | #endif 49 | -------------------------------------------------------------------------------- /tagging/Perceptron.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 
11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | /******************************************************************** 16 | * Thanks to Wang Kun and Li Xiaoqing for the help of this project. 17 | * 18 | * 19 | ********************************************************************/ 20 | 21 | #ifndef PERCEPTRON_H 22 | #define PERCEPTRON_H 23 | 24 | #include 25 | #include 26 | #include "Model.h" 27 | #include "Sample.h" 28 | #include "Decoder.h" 29 | 30 | namespace Tagging 31 | { 32 | class Perceptron 33 | { 34 | public: 35 | Perceptron(class Sample* sample, class Model* model, bool train = false, int round = 20, int beamSize = 8, int nGram = 2); 36 | ~Perceptron(); 37 | bool Train(); 38 | bool Test(const std::string& result); 39 | bool Test(const std::vector >& tokens, std::vector& result); 40 | private: 41 | class Sample * m_sample; 42 | class Model * m_model; 43 | class Decoder * m_decoder; 44 | int m_round; 45 | private: 46 | bool Test(std::vector< std::vector >* samples, std::vector& result); 47 | }; 48 | } 49 | #endif 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TIP-LAS: An open source toolkit for Tibetan word segmentation and part-of-speech tagging 2 | =============== 3 | TIP-LAS:藏文分词词性标注系统使用文档 4 | =============== 5 | # 简介 6 | 7 | 分词、词性标注是自然语言处理任务的基础,应用非常广泛,为方便研究者进行对比研究,我们把分词、词性标注系统予以 开源,便于快速构建实验系统。 8 | TIP- LAS系统基于感知机算法构建,分为分词、词性标注两大模块,分词采用统一标记集对音节和紧缩词进行标注,词性标注融合音节特征,具有领域无关等特点。 9 | ## 动态 10 | 本分词系统在中文信息学会举办的第一届藏文分词评测中获得第一名,[参见](http://nmlr.muc.edu.cn/huiyixinxi/2017/09-12/322.html) 11 | # 编译TIP-LAS 12 | 13 |   Linux 下编译:进入主文件目录,输入”make”即可 14 | 15 |   Windows下编译:在主文件目录下,用VS 2013打开“tip-las.vcxproj”文件即可 16 | # 使用可执行程序 17 | 18 | 编译成功后,会在主目录文件夹下生成如下可执行程序: tip-las 19 | 20 | ## 训练命令:tip-las train ws/pos input model 21 | 22 | 说明: 23 | 24 | 
train表示程序执行训练命令,ws、pos分别表示选择训练分词或者是词性标注模型,input表示输入训练文件,model表示模型文件。当选择ws或pos时,后面的输入文件需要对应切分好的分词语料,或者是词性标注语料。 25 | 26 | ## 测试命令:tip-las test ws/pos/all input output 27 | 28 | 说明: 29 | 30 | test表示程序执行测试命令,ws、pos、all分别表示选择测试分词、词性标注、分词标注一体,input表示输入测试文件,output表示测试结果文件。当选择ws时,输入的是纯藏文文本,输出切分好的藏文文本;当选择pos时,输入的是切分好的藏文文本,输出标注结果文本;当选择all时,输入的是纯藏文文本,输出的是分词、标注结果。 31 | 32 | ## 网络上相关使用记录分享 33 | 34 | [TIP-LAS 藏语分词工具使用](https://blog.csdn.net/sinat_34328764/article/details/106501751?spm=1001.2014.3001.5501) 35 | 36 | # 引用 37 | 38 | 如果本系统对你的研究或工作有帮助,请致谢或者引用,格式如下: 39 | 40 | [1] 李亚超, 江静, 加羊吉,于洪志. TIP-LAS:一个开源的藏文分词词性标注系统[J]. 中文信息学报, 2015, 29(6):203-207. 41 | 42 | # 参考文献 43 | 本系统采用的模型及相关方法详见已经发表的文章,列表如下: 44 | 45 | [1] 李亚超,加羊吉,江静,何向真,于洪志. 融合无监督特征的藏文分词方法研究[J]. 中文信息学报, 2017, 31(02):71-75. 46 | 47 | [2] 李亚超, 江静, 加羊吉,于洪志. TIP-LAS:一个开源的藏文分词词性标注系统[J]. 中文信息学报, 2015, 29(6):203-207. 48 | 49 | [3] 李亚超, 加羊吉, 宗成庆,于洪志. 基于条件随机场的藏语自动分词方法研究与实现[J]. 中文信息学报, 2013, 27(4):52-58. 50 | 51 | [4] 于洪志, 李亚超, 汪昆, 冷本扎西. 融合音节特征的最大熵藏文词性标注研究[J]. 中文信息学报, 2013, 27(5):160-165. 52 | # Q&A 53 | 如需训练好的模型,及其他问题,请联系:Email: harry_lyc{at)foxmail(dot}com 54 | 55 | # 注意 56 | 本系统、模型可免费用于科研、学习。如需商业用途或者需要企业级模型文件,按照如上联系方式商谈。 57 | -------------------------------------------------------------------------------- /utility/StringOperation.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty.
11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef STRINGOPERATION_H 16 | #define STRINGOPERATION_H 17 | 18 | #include 19 | namespace utility 20 | { 21 | class StringOperation 22 | { 23 | public: 24 | static bool IsPrefix(const std::string& src, const std::string& prefix); 25 | static bool IsPostfix(const std::string& src, const std::string& postfix); 26 | static void TrimAll(std::string& str); 27 | static void TrimBlanks(std::string& str); 28 | static void TrimBlanksBegin(std::string& str); 29 | static void TrimBlanksEnd(std::string& str); 30 | /*一个字符串是否包含另外一个字符串******/ 31 | static bool Contains(const std::string src,const std::string obj); 32 | static int MatchingNumber(const std::string& src,const std::string& obj); 33 | /*把字符串按照一定的标识符切割成字符串对*/ 34 | static void String2Pair(const std::string& str,std::pair& kv,const std::string& seg="/"); 35 | static void Replace(std::string& text,const std::string& old_str,const std::string& new_str); 36 | static void Remove(std::string& text,const std::string& remove_str); 37 | static void Remove(std::string& text,const std::string remove_strs[],int remove_count); 38 | private: 39 | }; 40 | } 41 | #endif 42 | -------------------------------------------------------------------------------- /utility/StringSplit.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 
10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef STRINGSPLIT_H 16 | #define STRINGSPLIT_H 17 | 18 | #include 19 | #include 20 | 21 | namespace utility 22 | { 23 | class StringSplit 24 | { 25 | public: 26 | static bool SplitUTF8(const std::string& utf8,std::vector& vec); 27 | static bool SplitANSI(const std::string& ansi,std::vector& vec); 28 | /*切分时,切分标志可以选择与前驱结合或者是不结合,不结合时丢掉*/ 29 | static bool SplitByToken(std::vector& vecstr, const std::string& str, const std::string token,bool withtoken = false); 30 | static bool SplitByTokens(std::vector& vecstr,const std::string& str,const std::string tokens[],const int tokensnumber,bool withtoken=false); 31 | /*切分标志,作为keyvalue,的value输出*/ 32 | static bool SplitByTokens(std::vector > & vecstr,const std::string& str,const std::string tokens[],const int tokensnumber); 33 | /*切分时,切分标志单独切分出来*/ 34 | static bool SplitToken(std::vector &vecstr, const std::string &str, const std::string token); 35 | /*切分时,切分标志单独切分出来*/ 36 | static bool SplitTokens(std::vector &vecstr, const std::string &str, const std::string tokens[], const int tokensnumber); 37 | private: 38 | }; 39 | } 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /tagging/Decoder.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 
10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef DECODER_H 16 | #define DECODER_H 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "Model.h" 25 | 26 | namespace Tagging 27 | { 28 | class Decoder 29 | { 30 | public: 31 | Decoder(class Model * model, bool mode, int pl, int pr, int beamSize = 8, int nGram = 2); 32 | Decoder(); 33 | ~Decoder(); 34 | bool Decode(std::vector > * samples, std::vector& tags, std::vector* tagsGold = NULL); 35 | void GenerateFeatures(std::vector >& features, const std::vector& tagList, int pos); 36 | void SetBeamSize(int s); 37 | void SetNGram(int s); 38 | private: 39 | Model * m_model; 40 | bool m_mode; 41 | int BeamSize = 8; 42 | int NGram = 2; 43 | int paddingLeft; 44 | int paddingRight; 45 | private: 46 | 47 | void ExpandStates(const Cand & cand, std::vector& candStates); 48 | bool CheckHistory(const Cand & cand0, const Cand & cand1); 49 | void AddNewStates(const std::vector &cands); 50 | 51 | std::vector m_candsOld; 52 | std::vector m_candsNew; 53 | std::vector m_goldTags; 54 | std::vector >* m_tokensMatrix; 55 | int m_pos; 56 | 57 | }; 58 | } 59 | #endif 60 | -------------------------------------------------------------------------------- /tagging/Sample.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao, harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that 
copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef SAMPLE_H 16 | #define SAMPLE_H 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "Utils.h" 28 | 29 | using namespace std; 30 | 31 | namespace Tagging 32 | { 33 | class Sample 34 | { 35 | public: 36 | Sample(std::vector >& dict); 37 | Sample(); 38 | Sample(int left, int right); 39 | ~Sample(); 40 | bool Train(const std::string & trainFile); 41 | bool AddEvents(const std::vector >& sample); 42 | bool AddEventsOver(); 43 | bool Test(const std::string & testFile); 44 | std::vector >* GetSample(int index); 45 | void Tokens2Id(const std::vector >& tokens, std::vector< std::vector >* samples); 46 | std::string GetTag(int tagid); 47 | int GetSize(); 48 | void Shuffle(); 49 | std::string GetSampleTag(int index); 50 | std::vector > > SamplesMatrix; 51 | std::vector > Samples; 52 | std::vector > Dict; 53 | std::unordered_map > lastTagSet; 54 | std::unordered_map > tokenTagSet; 55 | std::set tagSet; 56 | std::unordered_map id2Tag; 57 | int FieldSize; 58 | int LeftPadding; 59 | int RightPadding; 60 | private: 61 | std::vector ids; 62 | std::unordered_map tokensFrequent; 63 | }; 64 | } 65 | #endif 66 | -------------------------------------------------------------------------------- /utility/POSExtract.cpp: -------------------------------------------------------------------------------- 1 | #include "POSExtract.h" 2 | 3 | namespace utility 4 | { 5 | /* 6 | ************************************************* 7 | 功能 :解析标记的文本,标记的文本没有空格作为区分标志 8 | 参数 : 9 | 返回值 : 10 | ------------------------------------------------- 11 | 备注 :标注语料抽取需要根据具体语料而定,这个是通用的抽取方法,不考虑特殊情况 12 | ------------------------------------------------- 13 | 作者 :Li Yachao 14 | 时间 :2013-3-6 15 | 
************************************************* 16 | */ 17 | void POSExtract::ParsePOS(const std::string& line, std::vector >& tokenList) 18 | { 19 | tokenList.clear(); 20 | if(line.empty()) 21 | { 22 | return ; 23 | } 24 | std::string segtag = "/"; 25 | int segpos = 0; 26 | int offset = 0; 27 | int index = 0; 28 | while(segpos= line.size()) 39 | { 40 | break; 41 | } 42 | unsigned char c = line.at(segpos + segtag.size() + index); 43 | if(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9'))) 44 | { 45 | if(((c >= 'a') && (c <= 'z'))) 46 | { 47 | if(prefix == '-') 48 | prefix = c; 49 | if((prefix == 'p')) 50 | { 51 | int sa= 0; 52 | } 53 | index++; 54 | } 55 | else if((prefix == 'p') && (c >= '0') && (c <= '9')) 56 | { 57 | index++; 58 | break; 59 | } 60 | else 61 | { 62 | break; 63 | } 64 | } 65 | else 66 | { 67 | break; 68 | } 69 | } 70 | std::pair kv; 71 | kv.first = line.substr(offset, segpos - offset); 72 | kv.second = line.substr(segpos + segtag.size() ,index); 73 | offset = segpos + segtag.size() + index; 74 | index = 0; 75 | TrimAll(kv.first); 76 | if(!kv.first.empty()) 77 | { 78 | tokenList.push_back(kv); 79 | } 80 | } 81 | } 82 | void POSExtract::TrimAll(std::string &str) 83 | { 84 | str.erase(0,str.find_first_not_of(" \t\r\n")); 85 | str.erase(str.find_last_not_of(" \t\r\n")+1); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /Formatting.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all 
copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | /******************************************************************** 16 | * Thanks to Wang Kun and Li Xiaoqing for the help of this project. 17 | * 18 | * 19 | ********************************************************************/ 20 | 21 | #ifndef FORMATTING_H 22 | #define FORMATTING_H 23 | #include 24 | #include 25 | #include "./utility/head.h" 26 | using namespace utility; 27 | 28 | namespace utility_train 29 | { 30 | class Formatting 31 | { 32 | public: 33 | Formatting(); 34 | ~Formatting(); 35 | void WsTrain(const std::string& text, std::vector >& features, const std::string& language = "TB", const std::string& encode = "UTF8"); 36 | void WsTest(const std::string& text, std::vector >& features, std::vector& seg, 37 | const std::string& language ="TB" ,const std::string& encode="UTF8"); 38 | void PosTrain(const std::string& text, std::vector >& features); 39 | void PosTest(const std::string& text, std::vector >& features, std::vector& seg); 40 | void PosTrainS(const std::string& text, std::vector >& features); 41 | void PosTestS(const std::string& text, std::vector >& features, std::vector& seg); 42 | private: 43 | Tokenize* tokenize; 44 | std::string delimiter;/**/ 45 | std::string head; 46 | std::string tail; 47 | void WSTag(const std::string& word,std::vector >& tags, const std::string& language = "TB",const std::string& code="UTF8"); 48 | }; 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /tagging/Utils.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information 
Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | #ifndef UTILS_H 16 | #define UTILS_H 17 | #include 18 | #include 19 | #include 20 | namespace Tagging 21 | { 22 | class Utils 23 | { 24 | public: 25 | static void Split(std::vector & val, const std::string &str, const std::string & seg, bool withtoken = false) 26 | { 27 | val.clear(); 28 | std::string::size_type LeftPst = 0; 29 | std::string::size_type RightPst = 0; 30 | 31 | while ((RightPst = str.find(seg.c_str(), LeftPst)) != std::string::npos && LeftPst < str.size()) 32 | { 33 | if (RightPst != 0) 34 | { 35 | std::string term; 36 | if (withtoken) 37 | { 38 | term = str.substr(LeftPst, RightPst - LeftPst + seg.length()); 39 | } 40 | else 41 | { 42 | term = str.substr(LeftPst, RightPst - LeftPst); 43 | } 44 | if (term.length() > 0) 45 | { 46 | val.push_back(term); 47 | } 48 | LeftPst = RightPst + seg.size(); 49 | } 50 | //str start with token 51 | else 52 | { 53 | LeftPst = RightPst + seg.size(); 54 | } 55 | } 56 | if (LeftPst < str.size()) 57 | { 58 | const std::string &term = str.substr(LeftPst); 59 | if (term.length() > 0) 60 | { 61 | val.push_back(term); 62 | } 63 | } 64 | } 65 | 66 | static void TrimLine(std::string &line) 67 | { 68 | line.erase(0, line.find_first_not_of(" \t\r\n")); 69 | line.erase(line.find_last_not_of(" \t\r\n") + 1); 70 | } 71 | 72 | static std::string ToString(int i) 73 | { 74 | std::stringstream ss; 75 | ss << i; 76 | return ss.str(); 77 | } 78 | private: 79 | 80 | }; 81 | 82 | } 83 | 
#endif 84 | -------------------------------------------------------------------------------- /utility/Tokenize.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | /* 16 | 对输入的文本进行序列化预处理 17 | */ 18 | #ifndef TOKENNIZE_H 19 | #define TOKENNIZE_H 20 | #include 21 | #include 22 | #include "StringSplit.h" 23 | #include "StringOperation.h" 24 | #include "TextClassification.h" 25 | 26 | namespace utility 27 | { 28 | class Tokenize 29 | { 30 | public: 31 | Tokenize(); 32 | ~Tokenize(); 33 | /*中文预处理,以UTF8编码的文本文件读取的行*/ 34 | void GBK(const std::string& line, std::vector >& val); 35 | void GBKAll(const std::string& line, std::vector >& val); 36 | static void Chinese_UTF8(const std::string& line, std::vector& vec_val); 37 | /*中文预处理,以ANSI编码的文本文件读取的行*/ 38 | static void Chinese_ANSI(const std::string& line, std::vector& vec_val); 39 | static void Chinese_ANSIALL(const std::string& line, std::vector& vec_val); 40 | static void Chinese_ANSI(const std::string& line, std::list& vec_val); 41 | /*藏文预处理,以UTF8编码的文本文件读取的行,按照音节点进行切分,格助词不予处理*/ 42 | static void Tibetan(const std::string & line,std::vector& val); 43 | static void Tibetan(const std::string & line,std::vector >& val); 44 | static void Tibetan(const std::string & line,std::list& val); 45 | 
/*藏文预处理,以UTF8编码的文本文件读取的行,按照音节点进行切分,对于格助词予以全部切分*/ 46 | static void TibetanAll(const std::string & line,std::vector& val); 47 | static void TibetanAll(const std::string & line,std::vector >& val); 48 | static void TibetanAll(const std::string & line,std::list& val); 49 | private: 50 | StringType* str_type; 51 | }; 52 | } 53 | #endif 54 | -------------------------------------------------------------------------------- /Las.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | /******************************************************************** 16 | * Thanks to Wang Kun and Li Xiaoqing for the help of this project. 
17 | * 18 | * 19 | ********************************************************************/ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "./utility/head.h" 26 | #include "./tagging/Perceptron.h" 27 | #include "Formatting.h" 28 | using namespace utility_train; 29 | using namespace utility; 30 | using namespace Tagging; 31 | 32 | namespace Tip 33 | { 34 | class Las 35 | { 36 | public: 37 | Las(); 38 | ~Las(); 39 | bool Train(const std::string& trainFile, const std::string& modelFile, const std::string& type, int iter = 30,int beamSize = 16, int nGram = 3); 40 | bool Test(const std::string& inFile, const std::string& outFile, const std::string& model_ws, const std::string& model_pos, const std::string& type); 41 | bool Development(const std::string& inFile, const std::string& model_ws = "ws.model", const std::string& model_pos = "pos.model", const std::string& type = "pos"); 42 | private: 43 | std::string flag_ws = "ws"; 44 | std::string flag_pos = "pos"; 45 | std::string flag_dev = "dev"; 46 | std::string flag_all = "all"; 47 | std::string delimiter = " "; 48 | std::string templatesSet; 49 | double correct; 50 | double total; 51 | class Formatting* format; 52 | class Perceptron* tagging_ws; 53 | class Perceptron* tagging_pos; 54 | bool WsTest(const std::string& text, std::string& val); 55 | bool WsTest(const std::string& text, std::vector& words, bool withSeg = false); 56 | bool PosTest(const std::string& text, std::string& val); 57 | bool PosDevelopment(const std::string& text, std::string& val); 58 | 59 | }; 60 | } 61 | -------------------------------------------------------------------------------- /Main.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao, harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 
6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 11 | * 12 | * Version: 1.6 13 | * Last update: 2011-11-28,2014-06-26,2015-02-03,2015-10-23 14 | *********************************************************************/ 15 | /******************************************************************** 16 | * Thanks to Wang Kun and Li Xiaoqing for the help of this project. 17 | * 18 | * 19 | ********************************************************************/ 20 | 21 | #include "Las.h" 22 | 23 | using namespace std; 24 | using namespace Tip; 25 | void Usage(); 26 | void Copyrgiht(); 27 | 28 | int main(int argc, char * argv[]) 29 | { 30 | if (argc < 5) 31 | { 32 | Usage(); 33 | return 0; 34 | } 35 | Copyrgiht(); 36 | std::string task = "ws"; 37 | std::string input = "intput.txt"; 38 | std::string output = "output.txt"; 39 | std::string model_ws = "ws.model"; 40 | std::string model_pos = "pos.model"; 41 | std::string type = "ws"; 42 | int iter = 20; 43 | int beamSize = 8; 44 | int nGram = 2; 45 | type = argv[1]; 46 | task = argv[2]; 47 | input = argv[3]; 48 | output = argv[4]; 49 | class Las* las = new Las(); 50 | if (type == "train") 51 | { 52 | if (argc >= 6) 53 | { 54 | iter = stoi(argv[5]); 55 | } 56 | if (task == "ws") 57 | { 58 | las->Train(input, output, task, iter,beamSize,nGram); 59 | } 60 | else if (task == "pos") 61 | {// for pos 60 iter,give a better performance 62 | las->Train(input, output, task, iter*3, beamSize * 2, nGram+1); 63 | } 64 | else 65 | { 66 | Usage(); 67 | } 68 | } 69 | else if (type == "test") 70 | { 71 | if (!las->Test(input, output, model_ws, model_pos, task)) 72 | { 73 | Usage(); 74 | } 75 | } 76 | else if (type == "dev") 77 | { 78 | if (!las->Development(input)) 79 | { 80 | Usage(); 81 
| } 82 | } 83 | else 84 | { 85 | Usage(); 86 | } 87 | delete las; 88 | return 0; 89 | } 90 | void Usage() 91 | { 92 | std::cout << "tip-las train ws/pos input model" << std::endl; 93 | std::cout << "tip-las test ws/pos/all input output" << std::endl; 94 | } 95 | void Copyrgiht() 96 | { 97 | std::cout << "The Inistitute of Chinese Ethnic Information Technology (www.nlit.edu.cn)" << std::endl; 98 | std::cout << "Copyright (C) 2011 - 2017." << std::endl; 99 | std::cout << "Version: 1.6, Last update: 2015-10-23." << std::endl; 100 | std::cout << "Contact: liyc7711(at)gmail(dot)com." << std::endl; 101 | } 102 | -------------------------------------------------------------------------------- /utility/TextClassification.cpp: -------------------------------------------------------------------------------- 1 | #include "TextClassification.h" 2 | 3 | namespace utility 4 | { 5 | TextClassification::TextClassification() 6 | { 7 | gbk_type = new StringType(); 8 | } 9 | TextClassification::~TextClassification() 10 | { 11 | delete gbk_type; 12 | } 13 | /* 14 | ************************************************* 15 | 功能 :对UTF8编码藏文文件进行分类,藏文,标点符号,英文等 16 | 参数 : 17 | 返回值 : 18 | ------------------------------------------------- 19 | 备注 : 20 | ------------------------------------------------- 21 | 作者 :Li Yachao 22 | 时间 :2013-2-25 23 | ************************************************* 24 | */ 25 | bool TextClassification::TB_UTF8(const std::string &text, std::vector > & vec) 26 | { 27 | vec.clear(); 28 | if(text.empty()) 29 | { 30 | return true; 31 | } 32 | //TrimAll(text); 33 | std::vectorcharList; 34 | StringSplit::SplitUTF8(text,charList); 35 | int currentType = StringType::TB_UTF8CharType(charList[0]); 36 | int priorType = currentType ; 37 | std::string buffer=""; 38 | //char bf[3]={'\0'}; 39 | for(int i=0;i kv; 52 | kv.second = bf; 53 | kv.first= buffer; 54 | vec.push_back(kv); 55 | } 56 | buffer.clear(); 57 | } 58 | buffer +=myChar; 59 | priorType = currentType; 60 | } 61 | 
StringOperation::TrimAll(buffer); 62 | if(!buffer.empty()) 63 | { 64 | std::pair kv; 65 | std::stringstream ss; 66 | ss << priorType; 67 | std::string bf = ss.str(); 68 | kv.second = bf; 69 | kv.first = buffer; 70 | vec.push_back(kv); 71 | } 72 | buffer.clear(); 73 | return true; 74 | } 75 | 76 | /* 77 | ************************************************* 78 | 功能 :对GBK编码文件进行分类,中文,标点符号,英文等 79 | 参数 : 80 | 返回值 : 81 | ------------------------------------------------- 82 | 备注 :1表示汉字;2如标点符号;3表示gbk图形符号;4表示英文等;5表示未知 83 | ------------------------------------------------- 84 | 作者 :Li Yachao 85 | 时间 :2013-2-25 86 | ************************************************* 87 | */ 88 | bool TextClassification::CH_GBK(const std::string &text, std::vector > &vec) 89 | { 90 | vec.clear(); 91 | if(text.empty()) 92 | { 93 | return true; 94 | } 95 | std::vectorcharList; 96 | StringSplit::SplitANSI(text,charList); 97 | int currentType = gbk_type->CH_GBKCharType(charList[0]); 98 | int priorType = currentType ; 99 | std::string buffer=""; 100 | for(int i=0;iCH_GBKCharType(myChar); 104 | if(currentType != priorType) 105 | { 106 | StringOperation::TrimAll(buffer); 107 | if(!buffer.empty()) 108 | { 109 | std::stringstream ss; 110 | ss << priorType; 111 | std::string bf = ss.str(); 112 | std::pair kv; 113 | kv.second = bf; 114 | kv.first= buffer; 115 | vec.push_back(kv); 116 | } 117 | buffer.clear(); 118 | } 119 | buffer +=myChar; 120 | priorType = currentType; 121 | } 122 | StringOperation::TrimAll(buffer); 123 | if(!buffer.empty()) 124 | { 125 | std::pair kv; 126 | std::stringstream ss; 127 | ss << priorType; 128 | std::string bf = ss.str(); 129 | kv.second = bf; 130 | kv.first = buffer; 131 | vec.push_back(kv); 132 | } 133 | buffer.clear(); 134 | return true; 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /utility/SentenceSplit.cpp: -------------------------------------------------------------------------------- 1 | #include 
"SentenceSplit.h" 2 | 3 | namespace utility 4 | { 5 | /* 6 | ************************************************* 7 | 功能 : 切分藏文句子 8 | 参数 : 9 | 返回值 : 10 | ------------------------------------------------- 11 | 备注 : 12 | ------------------------------------------------- 13 | 作者 :Li Yachao 14 | 时间 :2012-3-18 15 | ************************************************* 16 | */ 17 | bool SentenceSplit::Tibetan(const std::string& line, std::vector >& val_sentence) 18 | { 19 | val_sentence.clear(); 20 | if(line.empty()) 21 | { 22 | return false; 23 | } 24 | std::string seg_flags[]={" ","\xe0\xbc\x8d"}; 25 | //std::string seg_flags[]={"4","3"}; 26 | std::vector > val; 27 | SplitByTokens(val,line,seg_flags,2); 28 | val_sentence = val; 29 | val.clear(); 30 | return true; 31 | } 32 | bool SentenceSplit::ChineseUTF8(const std::string& line,std::vector >& val_sentences) 33 | { 34 | val_sentences.clear(); 35 | if(line.empty()) 36 | { 37 | return false; 38 | } 39 | /* 40 | std::string str="。,、;: 41 | ?!“”‘’ 42 | ╗╚()…— 43 | 《》〈〉•"; 44 | */ 45 | const int length = 22; 46 | std::string chCharList[length] = {"\xe3\x80\x82","\xef\xbc\x8c","\xe3\x80\x81","\xef\xbc\x9b","\xef\xbc\x9a",\ 47 | "\xef\xbc\x9f","\xef\xbc\x81","\xe2\x80\x9c","\xe2\x80\x9d","\xe2\x80\x98","\xe2\x80\x99",\ 48 | "\xe2\x95\x97","\xe2\x95\x9a","\xef\xbc\x88","\xef\xbc\x89","\xe2\x80\xa6","\xe2\x80\x94",\ 49 | "\xe3\x80\x8a","\xe3\x80\x8b","\xe3\x80\x88","\xe3\x80\x89","\xe2\x80\xa2"}; 50 | std::vector > val; 51 | SplitByTokens(val,line,chCharList,length); 52 | val_sentences = val; 53 | val.clear(); 54 | return true; 55 | } 56 | bool SentenceSplit::ChineseANSI(const std::string& line,std::vector >& val_sentences) 57 | { 58 | val_sentences.clear(); 59 | if(line.empty()) 60 | { 61 | return false; 62 | } 63 | /* 64 | std::string str="。,、;: 65 | ?!“”‘’ 66 | ╗╚()…— 67 | 《》〈〉•"; 68 | */ 69 | const int length = 22; 70 | std::string chCharList[length] = {"。",",","、",";",":",\ 71 | "?","!","“","”","‘","’",\ 72 | 
"╗","╚","(",")","…","—",\ 73 | "《","》","〈","〉","•"}; 74 | std::vector >val; 75 | SplitByTokens(val,line,chCharList,length); 76 | val_sentences = val; 77 | val.clear(); 78 | return true; 79 | } 80 | bool SentenceSplit::SplitByTokens(std::vector >& vecstr, const std::string& str, const std::string tokens[], const int tokensnumber) 81 | { 82 | vecstr.clear(); 83 | if((str.empty()) ||tokensnumber <=0 ) 84 | { 85 | return false; 86 | } 87 | std::string buffer=""; 88 | int textLength = str.length(); 89 | int start = 0; 90 | int offset = 0; 91 | while(start < textLength) 92 | { 93 | offset = textLength; 94 | int subLength =0; 95 | //std::string tmp =""; 96 | std::pair kv;kv.first = "";kv.second =""; 97 | for(int i=0;i< tokensnumber;i++) 98 | { 99 | if(tokens[i].empty()) 100 | { 101 | continue; 102 | } 103 | int curr = str.find(tokens[i],start); 104 | if((curr >= 0) &&(curr < offset)) 105 | { 106 | offset = curr; 107 | subLength = tokens[i].length(); 108 | } 109 | } 110 | if(start == offset) 111 | { 112 | kv.first = str.substr(start,0); 113 | kv.second =str.substr(offset,subLength); 114 | start = offset + subLength; 115 | } 116 | else if(start < offset) 117 | { 118 | kv.first = str.substr(start,offset - start) ; 119 | kv.second =str.substr(offset,subLength) ; 120 | start = (offset + subLength ); 121 | } 122 | /*这个影响多个空格连在一块,并且切分标志位空格的情况*/ 123 | vecstr.push_back(kv); 124 | } 125 | return true; 126 | 127 | } 128 | } 129 | 130 | -------------------------------------------------------------------------------- /tagging/Perceptron.cpp: -------------------------------------------------------------------------------- 1 | #include "Perceptron.h" 2 | namespace Tagging 3 | { 4 | Perceptron::Perceptron(class Sample* sample, class Model* model, bool train, int round, int beamSize, int nGram) 5 | { 6 | m_sample = sample; 7 | m_model = model; 8 | m_round = round; 9 | m_decoder = new Decoder(m_model, train, m_model->LeftBound(), m_model->RightBound()); 10 | 
m_decoder->SetBeamSize(beamSize); 11 | m_decoder->SetNGram(nGram); 12 | model->SetBeamSize(beamSize); 13 | model->SetNGram(nGram); 14 | } 15 | 16 | Perceptron::~Perceptron() 17 | { 18 | delete m_decoder; 19 | } 20 | 21 | bool Perceptron::Test(const std::vector >& tokens, std::vector& result) 22 | { 23 | result.clear(); 24 | std::vector > sample; 25 | std::vector tags; 26 | m_sample->Tokens2Id(tokens, &sample); 27 | Test(&sample, tags); 28 | int size = tags.size(); 29 | for (int i = m_sample->LeftPadding; i < size; i++) 30 | { 31 | result.push_back(m_sample->GetTag(tags[i])); 32 | } 33 | return true; 34 | } 35 | 36 | bool Perceptron::Test(std::vector< std::vector >* samples, std::vector& result) 37 | { 38 | result.clear(); 39 | return m_decoder->Decode(samples, result); 40 | } 41 | 42 | bool Perceptron::Test(const std::string& result) 43 | { 44 | std::ofstream fout(result); 45 | if (!fout.is_open()) 46 | { 47 | std::cout << "Write Result File " << result << " Error!!!" << std::endl;; 48 | return false; 49 | } 50 | int size = m_sample->GetSize(); 51 | std::vector > test; 52 | test.resize(size); 53 | int fieldSize = m_model->FieldSize(); 54 | bool score = (fieldSize == m_sample->FieldSize); 55 | double total = 0; 56 | double correct = 0; 57 | for (int i = 0; i < size; i++) 58 | { 59 | std::vector t; 60 | std::vector g; 61 | if (!m_decoder->Decode(m_sample->GetSample(i), t)) 62 | { 63 | // 64 | } 65 | for (int j = m_sample->LeftPadding; j < (t.size()); j++) 66 | { 67 | fout << m_sample->Samples[i][j - m_sample->LeftPadding] << "\t" << m_sample->GetTag(t[j]) << std::endl; 68 | } 69 | fout << std::endl; 70 | if (score) 71 | { 72 | g.clear(); 73 | std::vector > s = *m_sample->GetSample(i); 74 | for (int j = 0; j < (s.size() - m_sample->RightPadding); j++) 75 | { 76 | g.push_back(s[j][fieldSize - 1]); 77 | } 78 | if (t.size() == g.size()) 79 | { 80 | total += (t.size() - m_sample->LeftPadding); 81 | for (int j = m_sample->LeftPadding; j < t.size(); j++) 82 | { 83 | if 
(t[j] == g[j]) 84 | { 85 | correct++; 86 | } 87 | } 88 | } 89 | } 90 | } 91 | if (score) 92 | { 93 | std::cout << (correct / total) << std::endl; 94 | } 95 | return true; 96 | } 97 | 98 | bool Perceptron::Train() 99 | { 100 | int size = m_sample->GetSize(); 101 | int progress = size / 30; 102 | for (int iter = 0; iter < m_round; iter++) 103 | { 104 | m_sample->Shuffle(); 105 | std::vector tags_test; 106 | std::vector tags_gold; 107 | for (int i = 0; i < size; i++) 108 | { 109 | if (!m_decoder->Decode(m_sample->GetSample(i), tags_test, &tags_gold)) 110 | { 111 | for (int j = m_sample->LeftPadding; j < tags_test.size(); j++) 112 | { 113 | std::vector > featuresTest; 114 | std::vector > featuresGold; 115 | m_decoder->GenerateFeatures(featuresTest, tags_test, j); 116 | m_decoder->GenerateFeatures(featuresGold, tags_gold, j); 117 | m_model->UpdateWeights(featuresTest, featuresGold, iter, i); 118 | } 119 | } 120 | if (i == (size - 1)) 121 | { 122 | m_model->UpdateWeights(iter, i); 123 | } 124 | if (i % progress == 0) 125 | { 126 | cout << '.'; 127 | cout.flush(); 128 | } 129 | } 130 | std::cout << "\t iter" << iter + 1 << std::endl; 131 | } 132 | return true; 133 | } 134 | } -------------------------------------------------------------------------------- /tagging/Model.h: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * Copyright (C) 2011 - 2015 3 | * The Inistitute of Chinese Ethnic Information Technology ( www.nlit.edu.cn) 4 | * Contact: Li Yachao,harry_lyc@foxmail.com or liyc7711@gmail.com 5 | * 6 | * Permission to use, copy, modify, and distribute this software for 7 | * any non-commercial purpose is hereby granted without fee, provided 8 | * that the above copyright notice appear in all copies and that both 9 | * that copyright notice. 10 | * It is provided "as is" without express or implied warranty. 
11 | * 12 | * Version: 1.6 13 | * Last update: 2015-10-23 14 | *********************************************************************/ 15 | /********************************************************************* 16 | * Model Version(1.1) 17 | * 18 | *********************************************************************/ 19 | #ifndef MODEL_H 20 | #define MODEL_H 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include "Sample.h" 33 | #include "Utils.h" 34 | 35 | using namespace std; 36 | 37 | namespace Tagging 38 | { 39 | struct Cand 40 | { 41 | vector taglist; 42 | double acc_score; 43 | bool operator> (const Cand &cand_rhs) const 44 | { 45 | return (acc_score > cand_rhs.acc_score); 46 | } 47 | }; 48 | 49 | struct WeightInfo 50 | { 51 | double weight; 52 | double acc_weight; 53 | int lastline; 54 | int lastround; 55 | int freq; 56 | }; 57 | 58 | struct vechash 59 | { 60 | size_t operator()(const vector& v) const 61 | { 62 | size_t hash = 2166136261; 63 | for (auto &i : v) 64 | { 65 | hash *= 16777619; 66 | hash ^= i; 67 | } 68 | return hash; 69 | } 70 | }; 71 | 72 | class Model 73 | { 74 | public: 75 | Model(const std::string& model, int round, std::string templatesFile = "B", bool tagConstraint = false); 76 | Model(const std::string& model); 77 | Model(); 78 | std::vector< std::vector > > m_Templates; 79 | bool Trigram(); 80 | bool Bigram(); 81 | bool Mode(); 82 | int FieldSize(); 83 | int LeftBound(); 84 | int RightBound(); 85 | bool LoadSamples(class Sample * sample); 86 | double FeaturesWeight(const std::vector< std::vector > & features); 87 | bool SaveModel(class Sample * sample, bool saveBin = true); 88 | bool ReadModel(const std::string& model, class Sample* sample); 89 | bool ReadBinModel(const std::string& model, class Sample* sample); 90 | bool SaveBinModel(std::string model, class Sample * sample); 91 | bool ReadTxtModel(const std::string& model, class 
Sample* sample); 92 | bool SaveTxtModel(std::string model, class Sample * sample); 93 | void UpdateWeights(const int round, const int line); 94 | void UpdateWeights(const std::vector > & features, const std::vector > & goldFeatures, const int round, const int line); 95 | void CandidateTags(std::vector& candTags, int token, int lastTag); 96 | void SetBeamSize(int s); 97 | void SetNGram(int s); 98 | private: 99 | bool m_Trigram; 100 | bool m_Bigram; 101 | bool m_Mode; 102 | bool m_tagConstraint; 103 | int m_SampleSize; 104 | int m_Round; 105 | int m_FieldSize; 106 | int m_leftBound; 107 | int m_rightBound; 108 | int m_BeamSize; 109 | int m_NGram; 110 | std::string m_modelFile; 111 | std::unordered_map, WeightInfo, vechash> trainParas; 112 | std::unordered_map, double, vechash> testParas; 113 | std::unordered_map > tokensTagset; 114 | std::unordered_map >lastTagset; 115 | private: 116 | //bool ParseTemplates(); 117 | bool TemplatesBaseline(); 118 | bool TemplatesSyllables(); 119 | bool ParseTemplates(const std::string & templates); 120 | bool ParseTemplates(const std::string& tempStr, std::vector >& features); 121 | 122 | }; 123 | } 124 | #endif 125 | -------------------------------------------------------------------------------- /tip-las.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {9D2BFEA7-0E01-4B28-A64D-B2C21641FA07} 15 | tiplas 16 | 17 | 18 | 19 | Application 20 | true 21 | v120 22 | MultiByte 23 | 24 | 25 | Application 26 | false 27 | v120 28 | true 29 | MultiByte 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | Level3 45 | Disabled 46 | true 47 | 48 | 49 | true 50 | 51 | 52 | 53 | 54 | Level3 55 | MaxSpeed 56 | true 57 | true 58 | true 59 | 60 | 61 | true 62 | true 63 | true 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 
| 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /tagging/Decoder.cpp: -------------------------------------------------------------------------------- 1 | #include "Decoder.h" 2 | 3 | namespace Tagging 4 | { 5 | Decoder::Decoder(class Model * model, bool mode, int pl, int pr, int beamSize, int nGram) 6 | { 7 | m_model = model; 8 | m_mode = mode; 9 | paddingLeft = pl; 10 | paddingRight = pr; 11 | BeamSize = beamSize; 12 | NGram = nGram; 13 | } 14 | 15 | Decoder::~Decoder() 16 | { 17 | m_tokensMatrix = NULL; 18 | } 19 | 20 | void Decoder::SetBeamSize(int s) 21 | { 22 | BeamSize = s; 23 | } 24 | 25 | void Decoder::SetNGram(int s) 26 | { 27 | NGram = s; 28 | } 29 | 30 | bool Decoder::Decode(std::vector >* samples, std::vector& tags, std::vector* tagsGold) 31 | { 32 | /*******clear*********************/ 33 | m_candsOld.clear(); 34 | m_candsNew.clear(); 35 | m_goldTags.clear(); 36 | m_tokensMatrix = NULL; 37 | tags.clear(); 38 | if (m_mode) 39 | { 40 | tagsGold->clear(); 41 | } 42 | /*********************************/ 43 | m_tokensMatrix = samples; 44 | Cand cand; 45 | cand.acc_score = 0; 46 | cand.taglist.resize(paddingLeft, 0); 47 | if (m_mode) 48 | { 49 | m_goldTags.resize(paddingLeft, 0); 50 | } 51 | m_candsOld.push_back(cand); 52 | int size = m_tokensMatrix->size(); 53 | 54 | for (int i = paddingLeft; i < (size - paddingRight); i++) 55 | { 56 | m_pos = i; 57 | for (int j = 0; j < m_candsOld.size(); j++) 58 | { 59 | std::vector tmp; 60 | ExpandStates(m_candsOld[j], tmp); 61 | AddNewStates(tmp); 62 | } 63 | sort(m_candsNew.begin(), m_candsNew.end(), greater()); 64 | m_candsOld.swap(m_candsNew); 65 | if (m_candsOld.size() > BeamSize) 66 | { 67 | m_candsOld.resize(BeamSize); 68 | } 69 | m_candsNew.resize(0); 70 | if (m_mode) 71 | { 72 | int len = m_tokensMatrix->at(i).size(); 73 | m_goldTags.push_back(m_tokensMatrix->at(i).at(len - 1)); 74 | bool track = true; 75 | 
for (int j = 0; j < m_candsOld.size(); j++) 76 | { 77 | if (m_candsOld[j].taglist == m_goldTags) 78 | { 79 | track = false; 80 | break; 81 | } 82 | } 83 | if (track) 84 | { 85 | tags = m_candsOld[0].taglist; 86 | tagsGold->assign(m_goldTags.begin(), m_goldTags.end()); 87 | return false; 88 | } 89 | } 90 | } 91 | if (m_mode) 92 | { 93 | if (m_candsOld[0].taglist == m_goldTags) 94 | { 95 | return true; 96 | } 97 | tags = m_candsOld[0].taglist; 98 | tagsGold->assign(m_goldTags.begin(), m_goldTags.end()); 99 | return false; 100 | } 101 | else 102 | { 103 | tags = m_candsOld[0].taglist; 104 | return true; 105 | } 106 | } 107 | 108 | bool Decoder::CheckHistory(const Cand & cand0, const Cand & cand1) 109 | { 110 | for (int i = 0; i < NGram; i++) 111 | { 112 | int index = m_pos - i; 113 | if (cand0.taglist[index] != cand1.taglist[index]) 114 | { 115 | return false; 116 | } 117 | } 118 | return true; 119 | } 120 | 121 | void Decoder::ExpandStates(const Cand& cand, std::vector& candStates) 122 | { 123 | candStates.clear(); 124 | int token = m_tokensMatrix->at(m_pos).at(0); 125 | int lastTag = 0; 126 | if (cand.taglist.size() > 0) 127 | { 128 | lastTag = cand.taglist[cand.taglist.size() - 1]; 129 | } 130 | std::vector candTags; 131 | m_model->CandidateTags(candTags, token, lastTag); 132 | std::vector::iterator it = candTags.begin(); 133 | while (it != candTags.end()) 134 | { 135 | Cand cand_n; 136 | cand_n.taglist = cand.taglist; 137 | cand_n.taglist.push_back((*it)); 138 | std::vector > features; 139 | GenerateFeatures(features, cand_n.taglist, cand_n.taglist.size() - 1); 140 | double score = m_model->FeaturesWeight(features); 141 | cand_n.acc_score = cand.acc_score + score; 142 | candStates.push_back(cand_n); 143 | it++; 144 | } 145 | } 146 | 147 | void Decoder::AddNewStates(const std::vector& cands) 148 | { 149 | int size = cands.size(); 150 | for (int i = 0; i < size; i++) 151 | { 152 | bool history = false; 153 | for (int j = 0; j < m_candsNew.size(); j++) 154 | { 155 | 
history = CheckHistory(cands[i], m_candsNew[j]); 156 | if (history) 157 | { 158 | if (cands[i].acc_score > m_candsNew[j].acc_score) 159 | { 160 | m_candsNew[j].taglist = cands[i].taglist; 161 | m_candsNew[j].acc_score = cands[i].acc_score; 162 | } 163 | break; 164 | } 165 | } 166 | if (false == history) 167 | { 168 | m_candsNew.push_back(cands[i]); 169 | } 170 | } 171 | } 172 | 173 | void Decoder::GenerateFeatures(std::vector >& features, const std::vector& tagList, int pos) 174 | { 175 | features.clear(); 176 | std::vector feature; 177 | int size = m_model->m_Templates.size(); 178 | std::vector< std::vector > >::iterator it = m_model->m_Templates.begin(); 179 | int id = -1; 180 | while (it != m_model->m_Templates.end()) 181 | { 182 | id++; 183 | feature.clear(); 184 | feature.push_back(id); 185 | std::vector >::iterator it1 = (*it).begin(); 186 | while (it1 != (*it).end()) 187 | { 188 | feature.push_back(m_tokensMatrix->at((*it1).first + pos).at((*it1).second)); 189 | it1++; 190 | } 191 | feature.push_back(tagList[pos]); 192 | features.push_back(feature); 193 | it++; 194 | } 195 | if (m_model->Bigram()) 196 | { 197 | feature.clear(); 198 | feature.push_back(++id); 199 | feature.push_back(tagList[pos - 1]); 200 | feature.push_back(tagList[pos]); 201 | features.push_back(feature); 202 | } 203 | if (m_model->Trigram()) 204 | { 205 | feature.clear(); 206 | feature.push_back(++id); 207 | feature.push_back(tagList[pos - 2]); 208 | feature.push_back(tagList[pos - 1]); 209 | feature.push_back(tagList[pos]); 210 | features.push_back(feature); 211 | } 212 | } 213 | 214 | } 215 | -------------------------------------------------------------------------------- /utility/StringOperation.cpp: -------------------------------------------------------------------------------- 1 | #include "StringOperation.h" 2 | 3 | namespace utility 4 | { 5 | bool StringOperation::IsPrefix(const std::string& src, const std::string& prefix) 6 | { 7 | size_t s1=prefix.size(); 8 | size_t 
s2=src.size(); 9 | if(s1 == 0) 10 | { 11 | return true ; 12 | } 13 | if( s1>s2 ) 14 | { 15 | return false; 16 | } 17 | size_t i=0; 18 | while( is2 ) 29 | { 30 | return false; 31 | } 32 | int i=(int)postfix.size()-1; 33 | int j=(int)src.size()-1; 34 | while( i>=0 && src[j]==postfix[i]) 35 | { 36 | --i, --j; 37 | } 38 | return (i == -1); 39 | } 40 | /* 41 | ************************************************* 42 | 功能 :剔除字符串中所有的空格、制表符、换行符 43 | 参数 : 44 | 返回值 : 45 | ------------------------------------------------- 46 | 备注 :覆盖原来的字符串 47 | ------------------------------------------------- 48 | 作者 :Li Yachao 49 | 时间 :2011-11-30 50 | ************************************************* 51 | */ 52 | void StringOperation::TrimAll(std::string& str) 53 | { 54 | str.erase(0,str.find_first_not_of(" \t\r\n")); 55 | str.erase(str.find_last_not_of(" \t\r\n")+1); 56 | } 57 | /* 58 | ************************************************* 59 | 功能 :剔除字符串中首尾的空格 60 | 参数 : 61 | 返回值 : 62 | ------------------------------------------------- 63 | 备注 :覆盖原来的字符串 64 | ------------------------------------------------- 65 | 作者 :Li Yachao 66 | 时间 :2011-11-30 67 | ************************************************* 68 | */ 69 | void StringOperation::TrimBlanks(std::string& str) 70 | { 71 | if(str.size() == 0) 72 | { 73 | return ; 74 | } 75 | size_t i=0, j=0, size=0; 76 | size = str.size(); 77 | while( i0 && str[j]==' ' ) 83 | { 84 | --j; 85 | } 86 | str = str.substr(i, j-i+1); 87 | 88 | } 89 | /* 90 | ************************************************* 91 | 功能 :剔除字符串中首部的空格 92 | 参数 : 93 | 返回值 : 94 | ------------------------------------------------- 95 | 备注 :覆盖原来的字符串 96 | ------------------------------------------------- 97 | 作者 :Li Yachao 98 | 时间 :2011-11-30 99 | ************************************************* 100 | */ 101 | void StringOperation::TrimBlanksBegin(std::string& str) 102 | { 103 | if(str.size() == 0) 104 | { 105 | return ; 106 | } 107 | size_t size=0; 108 | size = str.size(); 109 | 
std::string::size_type i=0; 110 | while( i0 && str[j]==' ' ) 138 | { 139 | --j; 140 | } 141 | str = str.substr(i, j-i+1); 142 | } 143 | /* 144 | ************************************************* 145 | 功能 :一个字符串是否包含另外一个字符串 146 | 参数 : src源字符串,obj要匹配的目标出现字符串 147 | 返回值 : 148 | ------------------------------------------------- 149 | 备注 :两个字符串不能为空,不然返回false 150 | ------------------------------------------------- 151 | 作者 :Li Yachao 152 | 时间 :2011-11-30 153 | ************************************************* 154 | */ 155 | bool StringOperation::Contains(const std::string src, const std::string obj) 156 | { 157 | bool val = false; 158 | if(src.empty() || obj.empty()) 159 | { 160 | return val; 161 | } 162 | int index = src.find(obj,0); 163 | if(index >= 0) 164 | { 165 | val = true; 166 | } 167 | return val; 168 | } 169 | /* 170 | ************************************************* 171 | 功能 :一个字符串在另外一个字符串出现的次数 172 | 参数 : src源字符串,obj要匹配的目标出现字符串 173 | 返回值 :出现的次数 174 | ------------------------------------------------- 175 | 备注 :两个字符串不能为空,不然返回0 176 | ------------------------------------------------- 177 | 作者 :Li Yachao 178 | 时间 :2011-11-30 179 | ************************************************* 180 | */ 181 | int StringOperation::MatchingNumber(const std::string& src, const std::string& obj) 182 | { 183 | int val = 0; 184 | if(src.empty() || obj.empty()) 185 | { 186 | return val; 187 | } 188 | int length = src.length(); 189 | int start = 0; 190 | int offset = 0; 191 | while(start < length) 192 | { 193 | offset = src.find( obj , start); 194 | if(offset >=0) 195 | { 196 | val ++; 197 | start = offset + obj.length(); 198 | } 199 | else 200 | {/*注意,没有这个,可能出错*/ 201 | break; 202 | } 203 | } 204 | return val; 205 | } 206 | /* 207 | ************************************************* 208 | 功能 :把字符串按照一定的标识符切割成字符串对 209 | 参数 : 210 | 返回值 : 211 | ------------------------------------------------- 212 | 备注 : 213 | ------------------------------------------------- 214 | 作者 :Li Yachao 215 | 时间 
:2011-12-2 216 | ************************************************* 217 | */ 218 | void StringOperation::String2Pair(const std::string& str,std::pair& kv, const std::string& seg) 219 | { 220 | std::string::size_type pos; 221 | pos = str.find_first_of(seg, 0); 222 | kv.first = str.substr(0,pos); 223 | if (pos != std::string::npos) 224 | { 225 | kv.second = str.substr(pos + seg.size() , str.size() - pos - seg.size()); 226 | } 227 | else 228 | { 229 | kv.second = ""; 230 | } 231 | } 232 | void StringOperation::Replace(std::string& text, const std::string& old_str, const std::string& new_str) 233 | { 234 | if(old_str.empty()) 235 | { 236 | return ; 237 | } 238 | std::string::size_type pos=0; 239 | std::string::size_type srclen=old_str.size(); 240 | std::string::size_type dstlen=new_str.size(); 241 | while( (pos=text.find(old_str, pos)) != std::string::npos) 242 | { 243 | text.replace(pos, srclen, new_str); 244 | pos += dstlen; 245 | } 246 | } 247 | void StringOperation::Remove(std::string& text, const std::string& remove_str) 248 | { 249 | Replace(text,remove_str,""); 250 | } 251 | void StringOperation::Remove(std::string& text, const std::string remove_strs[], int remove_count) 252 | { 253 | for(int i=0;i >& dict) 12 | { 13 | Dict.assign(dict.begin(), dict.end()); 14 | } 15 | 16 | Sample::~Sample() 17 | { 18 | 19 | } 20 | 21 | Sample::Sample(int left, int right) 22 | { 23 | LeftPadding = left; 24 | RightPadding = right; 25 | FieldSize = 0; 26 | } 27 | 28 | int Sample::GetSize() 29 | { 30 | return SamplesMatrix.size(); 31 | } 32 | 33 | void Sample::Shuffle() 34 | { 35 | random_shuffle(SamplesMatrix.begin(), SamplesMatrix.end()); 36 | } 37 | 38 | std::string Sample::GetTag(int tagid) 39 | { 40 | return id2Tag[tagid]; 41 | } 42 | 43 | std::vector >* Sample::GetSample(int index) 44 | { 45 | if (index < 0 || index >= SamplesMatrix.size()) 46 | { 47 | return NULL; 48 | } 49 | else 50 | { 51 | return &SamplesMatrix[index]; 52 | } 53 | } 54 | 55 | void Sample::Tokens2Id(const 
std::vector >& tokens, std::vector< std::vector >* samples) 56 | { 57 | std::vector defaultEvent; 58 | defaultEvent.resize(FieldSize, 0); 59 | int size = tokens.size(); 60 | for (int i = 0; i < LeftPadding; i++) 61 | {//left padding for sample 62 | samples->push_back(defaultEvent); 63 | } 64 | for (int i = 0; i < size; i++) 65 | { 66 | std::vector s; 67 | for (int j = 0; j < tokens[i].size(); j++) 68 | { 69 | std::unordered_map::iterator it = Dict[j].find(tokens.at(i).at(j)); 70 | if (it != Dict[j].end()) 71 | { 72 | s.push_back(it->second); 73 | } 74 | else 75 | { 76 | s.push_back(-1); 77 | } 78 | } 79 | samples->push_back(s); 80 | } 81 | for (int i = 0; i < RightPadding; i++) 82 | {//right padding for sample 83 | samples->push_back(defaultEvent); 84 | } 85 | } 86 | 87 | bool Sample::Test(const std::string & testFile) 88 | { 89 | std::ifstream fin(testFile); 90 | if (!fin.is_open()) 91 | { 92 | std::cout << "Read Test File " << testFile << " Error!!!" << std::endl;; 93 | return false; 94 | } 95 | std::vector< std::vector > events; 96 | std::vector defaultEvent; 97 | std::vector fields; 98 | std::vector samples; 99 | std::string line = ""; 100 | int lineIndex = 0; 101 | std::string seg = "\t"; 102 | while (getline(fin, line)) 103 | { 104 | Utils::TrimLine(line); 105 | lineIndex++; 106 | if (line.empty()) 107 | { 108 | for (int i = 0; i < RightPadding; i++) 109 | {//right padding for samples 110 | events.push_back(defaultEvent); 111 | } 112 | SamplesMatrix.push_back(events); 113 | Samples.push_back(samples); 114 | samples.clear(); 115 | events.clear(); 116 | for (int i = 0; i < LeftPadding; i++) 117 | {//left padding for samples 118 | events.push_back(defaultEvent); 119 | } 120 | continue; 121 | } 122 | fields.clear(); 123 | Utils::Split(fields, line, seg); 124 | if (lineIndex == 1) 125 | { 126 | FieldSize = fields.size(); 127 | defaultEvent.resize(FieldSize, 0); 128 | for (int i = 0; i < LeftPadding; i++) 129 | {//left padding for samples 130 | 
events.push_back(defaultEvent); 131 | } 132 | } 133 | std::vectortokens; 134 | for (int i = 0; i < fields.size(); i++) 135 | { 136 | std::unordered_map::iterator it = Dict[i].find(fields[i]); 137 | if (it != Dict[i].end()) 138 | { 139 | tokens.push_back((*it).second); 140 | } 141 | else 142 | { 143 | tokens.push_back(-1); 144 | } 145 | } 146 | events.push_back(tokens); 147 | samples.push_back(line); 148 | } 149 | return true; 150 | } 151 | 152 | bool Sample::Train(const std::string & trainFile) 153 | { 154 | std::ifstream fin(trainFile); 155 | if (!fin.is_open()) 156 | { 157 | std::cout << "Read Train File " << trainFile << " Error!!!" << std::endl;; 158 | return false; 159 | } 160 | std::vector< std::vector > events; 161 | std::vector defaultEvent; 162 | std::vector fields; 163 | std::string line; 164 | int lineIndex = 0; 165 | std::string seg = "\t"; 166 | int lastTag = 0; 167 | while (getline(fin, line)) 168 | { 169 | lineIndex++; 170 | Utils::TrimLine(line); 171 | if (line.empty()) 172 | { 173 | for (int i = 0; i < RightPadding; i++) 174 | {//right padding for samples 175 | events.push_back(defaultEvent); 176 | } 177 | SamplesMatrix.push_back(events); 178 | events.clear(); 179 | for (int i = 0; i < LeftPadding; i++) 180 | {//left padding for samples 181 | events.push_back(defaultEvent); 182 | } 183 | lastTag = 0; 184 | continue; 185 | } 186 | fields.clear(); 187 | Utils::Split(fields, line, seg); 188 | if (lineIndex == 1) 189 | { 190 | FieldSize = fields.size(); 191 | if (FieldSize < 2) 192 | { 193 | std::cout << "File Broken At " << lineIndex << std::endl; 194 | } 195 | ids.resize(FieldSize, 1); 196 | Dict.resize(FieldSize); 197 | /*****************************/ 198 | defaultEvent.resize(FieldSize, 0); 199 | for (int i = 0; i < LeftPadding; i++) 200 | { 201 | events.push_back(defaultEvent); 202 | } 203 | /*****************************/ 204 | } 205 | if (FieldSize != fields.size()) 206 | { 207 | std::cout << "File Broken At " << lineIndex << std::endl; 208 | 
return false; 209 | } 210 | std::vector event_tmp; 211 | for (int i = 0; i < fields.size(); i++) 212 | { 213 | std::string str = fields[i]; 214 | std::unordered_map::iterator it = Dict[i].find(str); 215 | if (it != Dict[i].end()) 216 | { 217 | event_tmp.push_back((*it).second); 218 | } 219 | else 220 | { 221 | Dict[i].insert(std::make_pair(str, ids.at(i))); 222 | event_tmp.push_back(ids.at(i)); 223 | ids.at(i) += 1; 224 | } 225 | } 226 | tokensFrequent[event_tmp[0]] += 1; 227 | lastTagSet[lastTag].insert(event_tmp[FieldSize - 1]); 228 | tagSet.insert(event_tmp[FieldSize - 1]); 229 | tokenTagSet[event_tmp[0]].insert(event_tmp[FieldSize - 1]); 230 | lastTag = event_tmp[FieldSize - 1]; 231 | events.push_back(event_tmp); 232 | } 233 | fin.close(); 234 | AddEventsOver(); 235 | return true; 236 | 237 | } 238 | 239 | bool Sample::AddEvents(const std::vector >& sample) 240 | { 241 | if (sample.size() == 0) 242 | { 243 | return true; 244 | } 245 | std::vector< std::vector > events; 246 | std::vector defaultEvent; 247 | int lastTag = 0; 248 | if (FieldSize < 2) 249 | { 250 | FieldSize = sample.at(0).size(); 251 | ids.resize(FieldSize, 1); 252 | Dict.resize(FieldSize); 253 | } 254 | defaultEvent.resize(FieldSize, 0); 255 | for (int i = 0; i < LeftPadding; i++) 256 | {//left padding for samples 257 | events.push_back(defaultEvent); 258 | } 259 | int row = sample.size(); 260 | for (int i = 0; i < row; i++) 261 | { 262 | int col = sample[i].size(); 263 | std::vector event_tmp; 264 | for (int j = 0; j < col; j++) 265 | { 266 | std::string str = sample[i][j]; 267 | std::unordered_map::iterator it = Dict[j].find(str); 268 | if (it != Dict[j].end()) 269 | { 270 | event_tmp.push_back(it->second); 271 | } 272 | else 273 | { 274 | Dict[j].insert(std::make_pair(str, ids.at(j))); 275 | event_tmp.push_back(ids.at(j)); 276 | ids.at(j) += 1; 277 | } 278 | } 279 | tokensFrequent[event_tmp[0]] += 1; 280 | lastTagSet[lastTag].insert(event_tmp[FieldSize - 1]); 281 | 
tagSet.insert(event_tmp[FieldSize - 1]); 282 | tokenTagSet[event_tmp[0]].insert(event_tmp[FieldSize - 1]); 283 | lastTag = event_tmp[FieldSize - 1]; 284 | events.push_back(event_tmp); 285 | 286 | } 287 | for (int i = 0; i < RightPadding; i++) 288 | {//right padding for samples 289 | events.push_back(defaultEvent); 290 | } 291 | SamplesMatrix.push_back(events); 292 | //Samples.push_back(sample); 293 | return true; 294 | } 295 | 296 | bool Sample::AddEventsOver() 297 | { 298 | std::set::iterator it = tagSet.begin(); 299 | while (it != tagSet.end()) 300 | { 301 | tokenTagSet[-1].insert(*it);//for all tags 302 | lastTagSet[-1].insert(*it); 303 | it++; 304 | } 305 | return true; 306 | } 307 | } -------------------------------------------------------------------------------- /utility/utf8/unchecked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | 33 | namespace utf8 34 | { 35 | namespace unchecked 36 | { 37 | template 38 | octet_iterator append(uint32_t cp, octet_iterator result) 39 | { 40 | if (cp < 0x80) // one octet 41 | *(result++) = static_cast(cp); 42 | else if (cp < 0x800) { // two octets 43 | *(result++) = static_cast((cp >> 6) | 0xc0); 44 | *(result++) = static_cast((cp & 0x3f) | 0x80); 45 | } 46 | else if (cp < 0x10000) { // three octets 47 | *(result++) = static_cast((cp >> 12) | 0xe0); 48 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 49 | *(result++) = static_cast((cp & 0x3f) | 0x80); 50 | } 51 | else { // four octets 52 | *(result++) = static_cast((cp >> 18) | 0xf0); 53 | *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); 54 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 55 | *(result++) = static_cast((cp & 0x3f) | 0x80); 56 | } 57 | return result; 58 | } 59 | 60 | template 61 | uint32_t next(octet_iterator& it) 62 | { 63 | uint32_t cp = internal::mask8(*it); 64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); 65 | switch (length) { 66 | case 1: 67 | break; 68 | case 2: 69 | it++; 70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 71 | break; 72 | case 3: 73 | ++it; 74 | cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 
0xfff); 75 | ++it; 76 | cp += (*it) & 0x3f; 77 | break; 78 | case 4: 79 | ++it; 80 | cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); 81 | ++it; 82 | cp += (internal::mask8(*it) << 6) & 0xfff; 83 | ++it; 84 | cp += (*it) & 0x3f; 85 | break; 86 | } 87 | ++it; 88 | return cp; 89 | } 90 | 91 | template 92 | uint32_t peek_next(octet_iterator it) 93 | { 94 | return next(it); 95 | } 96 | 97 | template 98 | uint32_t prior(octet_iterator& it) 99 | { 100 | while (internal::is_trail(*(--it))) ; 101 | octet_iterator temp = it; 102 | return next(temp); 103 | } 104 | 105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 106 | template 107 | inline uint32_t previous(octet_iterator& it) 108 | { 109 | return prior(it); 110 | } 111 | 112 | template 113 | void advance (octet_iterator& it, distance_type n) 114 | { 115 | for (distance_type i = 0; i < n; ++i) 116 | next(it); 117 | } 118 | 119 | template 120 | typename std::iterator_traits::difference_type 121 | distance (octet_iterator first, octet_iterator last) 122 | { 123 | typename std::iterator_traits::difference_type dist; 124 | for (dist = 0; first < last; ++dist) 125 | next(first); 126 | return dist; 127 | } 128 | 129 | template 130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 131 | { 132 | while (start != end) { 133 | uint32_t cp = internal::mask16(*start++); 134 | // Take care of surrogate pairs first 135 | if (internal::is_lead_surrogate(cp)) { 136 | uint32_t trail_surrogate = internal::mask16(*start++); 137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 138 | } 139 | result = append(cp, result); 140 | } 141 | return result; 142 | } 143 | 144 | template 145 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 146 | { 147 | while (start < end) { 148 | uint32_t cp = next(start); 149 | if (cp > 0xffff) { //make a surrogate pair 150 | 
*result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 152 | } 153 | else 154 | *result++ = static_cast(cp); 155 | } 156 | return result; 157 | } 158 | 159 | template 160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 161 | { 162 | while (start != end) 163 | result = append(*(start++), result); 164 | 165 | return result; 166 | } 167 | 168 | template 169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 170 | { 171 | while (start < end) 172 | (*result++) = next(start); 173 | 174 | return result; 175 | } 176 | 177 | // The iterator class 178 | template 179 | class iterator : public std::iterator { 180 | octet_iterator it; 181 | public: 182 | iterator () {}; 183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 184 | // the default "big three" are OK 185 | octet_iterator base () const { return it; } 186 | uint32_t operator * () const 187 | { 188 | octet_iterator temp = it; 189 | return next(temp); 190 | } 191 | bool operator == (const iterator& rhs) const 192 | { 193 | return (it == rhs.it); 194 | } 195 | bool operator != (const iterator& rhs) const 196 | { 197 | return !(operator == (rhs)); 198 | } 199 | iterator& operator ++ () 200 | { 201 | std::advance(it, internal::sequence_length(it)); 202 | return *this; 203 | } 204 | iterator operator ++ (int) 205 | { 206 | iterator temp = *this; 207 | std::advance(it, internal::sequence_length(it)); 208 | return temp; 209 | } 210 | iterator& operator -- () 211 | { 212 | prior(it); 213 | return *this; 214 | } 215 | iterator operator -- (int) 216 | { 217 | iterator temp = *this; 218 | prior(it); 219 | return temp; 220 | } 221 | }; // class iterator 222 | 223 | } // namespace utf8::unchecked 224 | } // namespace utf8 225 | 226 | 227 | #endif // header guard 228 | 229 | 
-------------------------------------------------------------------------------- /Formatting.cpp: -------------------------------------------------------------------------------- 1 | #include "Formatting.h" 2 | 3 | namespace utility_train 4 | { 5 | Formatting::Formatting() 6 | { 7 | tokenize = new Tokenize(); 8 | delimiter = " "; 9 | head = "<#>"; 10 | tail = ""; 11 | } 12 | 13 | Formatting::~Formatting() 14 | { 15 | delete tokenize; 16 | } 17 | 18 | void Formatting::WsTrain(const std::string& text, std::vector >& features, const std::string& language, const std::string& encode) 19 | { 20 | features.clear(); 21 | std::string segs[] = { " ", "/" }; 22 | std::vector words; 23 | StringSplit::SplitByTokens(words, text, segs, 2, false); 24 | int size = words.size(); 25 | for (int i = 0; i < size; i++) 26 | { 27 | std::vector >tmp; 28 | WSTag(words[i], tmp, language, encode); 29 | for (int j = 0; j < tmp.size(); j++) 30 | { 31 | std::vector f; 32 | f.push_back(tmp[j].first); 33 | f.push_back(tmp[j].second); 34 | features.push_back(f); 35 | } 36 | } 37 | } 38 | /* 39 | ************************************************* 40 | 功能 : 由测试语料生成训练需要的格式 41 | 参数 : 42 | 返回值 : 43 | ------------------------------------------------- 44 | 备注 : 45 | ------------------------------------------------- 46 | 作者 :Li Yachao 47 | 时间 :2013-3-13,2015-10-24 48 | ************************************************* 49 | */ 50 | void Formatting::WsTest(const std::string& text, std::vector >& features, std::vector & seg, 51 | const std::string& language, const std::string& encode) 52 | { 53 | if (text.empty()) 54 | { 55 | return; 56 | } 57 | features.clear(); 58 | seg.clear(); 59 | std::vector< std::string> tokens; 60 | if (language == "TB") 61 | { 62 | Tokenize::TibetanAll(text, tokens); 63 | } 64 | for (int i = 0; i< tokens.size(); i++) 65 | { 66 | bool s = false; 67 | if (StringOperation::IsPostfix(tokens[i], "\xe0\xbc\x8b")) 68 | { 69 | if (tokens[i].size() > 3) 70 | { 71 | tokens[i]= 
tokens[i].substr(0, tokens[i].size() - 3); 72 | s = true; 73 | } 74 | } 75 | seg.push_back(s); 76 | std::vector f; 77 | f.push_back(tokens[i]); 78 | features.push_back(f); 79 | } 80 | } 81 | void Formatting::PosTrainS(const std::string& text, std::vector >& features) 82 | { 83 | features.clear(); 84 | std::vector > tokens; 85 | std::string tmp = text; 86 | POSExtract::ParsePOS(tmp, tokens); 87 | std::vectorsc;//current 88 | std::vectorsl;//left 89 | std::vectorsr;//right 90 | for (int i = 0; i f; 100 | f.push_back(tokens[i].first); 101 | /******syllables features********************/ 102 | Tokenize::TibetanAll(tokens[i].first, sc); 103 | if (i > 0) 104 | { 105 | Tokenize::TibetanAll(tokens[i - 1].first, sl); 106 | } 107 | if (i < (tokens.size() - 1)) 108 | { 109 | Tokenize::TibetanAll(tokens[i + 1].first, sr); 110 | } 111 | /*current syllables*/ 112 | if (sc.size() > 2) 113 | { 114 | f.push_back(sc[0]); 115 | f.push_back(sc[1]); 116 | } 117 | else if (sc.size() == 1) 118 | { 119 | f.push_back(sc[0]); 120 | f.push_back(sc[0]); 121 | } 122 | else 123 | { 124 | f.push_back(head); 125 | f.push_back(tail); 126 | } 127 | /*left syllables*/ 128 | if (sl.size() > 1) 129 | { 130 | f.push_back(sl[ sl.size() - 1]); 131 | } 132 | else 133 | { 134 | f.push_back( tail ); 135 | } 136 | /*right syllables*/ 137 | if (sr.size() > 1) 138 | { 139 | f.push_back(sr[sr.size() - 1]); 140 | } 141 | else 142 | { 143 | f.push_back(head); 144 | } 145 | /******syllables features********************/ 146 | f.push_back(tokens[i].second); // 147 | //f.push_back(tokens[i].second.substr(0, 1)); // 148 | features.push_back(f); 149 | } 150 | } 151 | 152 | void Formatting::PosTestS(const std::string& text, std::vector >& features, std::vector& seg) 153 | { 154 | features.clear(); 155 | std::string segs[] = { " ", "/" }; 156 | std::vector words; 157 | std::vectorsc; //current 158 | std::vectorsl; //left 159 | std::vectorsr ;//right 160 | StringSplit::SplitByTokens(words, text, segs, 2, false); 161 | 
for (int i = 0; i < words.size(); i++) 162 | { 163 | bool s = false; 164 | if (StringOperation::IsPostfix(words[i], "\xe0\xbc\x8b")) 165 | { 166 | if (words[i].size() > 3) 167 | { 168 | words[i] = words[i].substr(0, words[i].size() - 3); 169 | s = true; 170 | } 171 | } 172 | seg.push_back(s); 173 | std::vector f; 174 | f.push_back(words[i]); 175 | /******syllables features********************/ 176 | Tokenize::TibetanAll(words[i], sc); 177 | if (i > 0) 178 | { 179 | Tokenize::TibetanAll(words[i - 1], sl); 180 | } 181 | if (i < (words.size() - 1)) 182 | { 183 | Tokenize::TibetanAll(words[i + 1], sr); 184 | } 185 | /*current syllables*/ 186 | if (sc.size() > 2) 187 | { 188 | f.push_back(sc[0]); 189 | f.push_back(sc[1]); 190 | } 191 | else if (sc.size() == 1) 192 | { 193 | f.push_back(sc[0]); 194 | f.push_back(sc[0]); 195 | } 196 | else 197 | { 198 | f.push_back(head); 199 | f.push_back(tail); 200 | } 201 | /*left syllables*/ 202 | if (sl.size() > 1) 203 | { 204 | f.push_back(sl[sl.size() - 1]); 205 | } 206 | else 207 | { 208 | f.push_back(tail); 209 | } 210 | /*right syllables*/ 211 | if (sr.size() > 1) 212 | { 213 | f.push_back(sr[sr.size() - 1]); 214 | } 215 | else 216 | { 217 | f.push_back(head); 218 | } 219 | /******syllables features********************/ 220 | features.push_back(f); //add 221 | } 222 | } 223 | void Formatting::PosTrain(const std::string& text, std::vector >& features) 224 | { 225 | features.clear(); 226 | std::vector > tokens; 227 | std::string tmp = text; 228 | POSExtract::ParsePOS(tmp,tokens); 229 | for (int i = 0; i f; 239 | f.push_back(tokens[i].first); // 240 | //f.push_back(tokens[i].second.substr(0, 1)); 241 | features.push_back(f); 242 | } 243 | } 244 | 245 | void Formatting::PosTest(const std::string& text, std::vector >& features, std::vector& seg) 246 | { 247 | features.clear(); 248 | std::string segs[] = { " ", "/" }; 249 | std::vector words; 250 | StringSplit::SplitByTokens(words, text, segs, 2, false); 251 | for (int i = 0; i < 
words.size(); i++) 252 | { 253 | bool s = false; 254 | if (StringOperation::IsPostfix(words[i], "\xe0\xbc\x8b")) 255 | { 256 | if (words[i].size() > 3) 257 | { 258 | words[i] = words[i].substr(0, words[i].size() - 3); 259 | s = true; 260 | } 261 | } 262 | seg.push_back(s); 263 | std::vector f; 264 | f.push_back(words[i]); 265 | features.push_back(f); 266 | } 267 | } 268 | 269 | /* 270 | ************************************************* 271 | 功能 : 对字进行位置标注 272 | 参数 : 273 | 返回值 : 274 | ------------------------------------------------- 275 | 备注 : 276 | ------------------------------------------------- 277 | 作者 :Li Yachao 278 | 时间 :2013-3-13, 2015-10-26 279 | ************************************************* 280 | */ 281 | void Formatting::WSTag(const std::string &word, std::vector > &tags, const std::string &language,const std::string& code) 282 | { 283 | tags.clear(); 284 | std::vectortokens; 285 | if (language == "TB") 286 | { 287 | Tokenize::TibetanAll(word, tokens); 288 | for (int i = 0; i < tokens.size(); i++) 289 | { 290 | if (StringOperation::IsPostfix(tokens[i], "\xe0\xbc\x8b")) 291 | { 292 | if (tokens[i].size() > 3) 293 | { 294 | tokens[i] = tokens[i].substr(0, tokens[i].size() - 3); 295 | } 296 | } 297 | } 298 | } 299 | else if (language == "CHS") 300 | { 301 | if (code == "ANSI") 302 | { 303 | Tokenize::Chinese_ANSI(word, tokens); 304 | } 305 | else 306 | { 307 | Tokenize::Chinese_UTF8(word, tokens); 308 | } 309 | } 310 | 311 | else 312 | { 313 | return; 314 | } 315 | int size = tokens.size(); 316 | if (size == 1) 317 | { 318 | std::pairp; 319 | p.first = tokens[0]; 320 | p.second = "S"; 321 | tags.push_back(p); 322 | } 323 | else if (size == 2) 324 | { 325 | std::pairp; 326 | p.first = tokens[0]; 327 | p.second = "B"; 328 | tags.push_back(p); 329 | p.first = tokens[1]; 330 | p.second = "E"; 331 | tags.push_back(p); 332 | } 333 | else if (size >= 3) 334 | { 335 | for (int i = 0; ip; 338 | p.first = tokens[i]; 339 | if (i == 0) 340 | { 341 | p.second = "B"; 
342 | } 343 | else if (i == size - 1) 344 | { 345 | p.second = "E"; 346 | } 347 | else 348 | { 349 | p.second = "M"; 350 | } 351 | tags.push_back(p); 352 | } 353 | } 354 | else 355 | { 356 | return; 357 | } 358 | } 359 | 360 | } 361 | -------------------------------------------------------------------------------- /utility/StringSplit.cpp: -------------------------------------------------------------------------------- 1 | #include "StringSplit.h" 2 | 3 | namespace utility 4 | { 5 | /* 6 | ************************************************* 7 | 功能 :切分UTF8编码格式的字节流为字符串数组 8 | 参数 : 9 | 返回值 : 10 | ------------------------------------------------- 11 | 备注 : 12 | ------------------------------------------------- 13 | 作者 :Li Yachao 14 | 时间 :2011-11-30 15 | ************************************************* 16 | */ 17 | bool StringSplit::SplitUTF8(const std::string &utf8, std::vector &vec) 18 | { 19 | vec.clear(); 20 | for(int i=0;i& vec) 48 | { 49 | vec.clear(); 50 | for(int i=0;i &vecstr, const std::string &str, const std::string token,bool withtoken) 80 | { 81 | vecstr.clear(); 82 | std::string::size_type LeftPst = 0; 83 | std::string::size_type RightPst = 0; 84 | 85 | while((RightPst = str.find(token.c_str(), LeftPst)) != std::string::npos && LeftPst < str.size()) 86 | { 87 | if(RightPst != 0) 88 | { 89 | std::string term ; 90 | if(withtoken) 91 | { 92 | term = str.substr(LeftPst, RightPst-LeftPst + token.length()); 93 | } 94 | else 95 | { 96 | term = str.substr(LeftPst, RightPst-LeftPst); 97 | } 98 | if( term.length() > 0 ) 99 | { 100 | vecstr.push_back(term); 101 | } 102 | LeftPst = RightPst + token.size(); 103 | } 104 | //str以token开头 105 | else 106 | { 107 | LeftPst = RightPst + token.size(); 108 | } 109 | } 110 | if(LeftPst < str.size()) 111 | { 112 | const std::string &term = str.substr(LeftPst); 113 | if( term.length() > 0 ) 114 | { 115 | vecstr.push_back(term); 116 | } 117 | } 118 | return (!vecstr.empty()); 119 | } 120 | /* 121 | 
************************************************* 122 | 功能 :将字符串按照一定的Tokena进行切分 123 | 参数 :vecstr切分结果列表,str要切分的字符串,tokens[]切分标志,tokensnumber tokens数量,结果带不带token 124 | 返回值 : 125 | ------------------------------------------------- 126 | 备注 :withtoken为true切分结果包含token,false不包含token,默认值为false 127 | ------------------------------------------------- 128 | 作者 :Li Yachao 129 | 时间 :2011-11-28 130 | ************************************************* 131 | */ 132 | bool StringSplit::SplitByTokens(std::vector &vecstr, const std::string &str, const std::string tokens[],const int tokensnumber, bool withtoken) 133 | { 134 | vecstr.clear(); 135 | if((str.empty()) ||tokensnumber <=0 ) 136 | { 137 | return false ; 138 | } 139 | std::string buffer=""; 140 | int textLength = str.length(); 141 | int start = 0; 142 | int offset = 0; 143 | while(start < textLength) 144 | { 145 | offset = textLength; 146 | int subLength =0; 147 | std::string tmp =""; 148 | for(int i=0;i< tokensnumber;i++) 149 | { 150 | if(tokens[i].empty()) 151 | { 152 | continue; 153 | } 154 | int curr = str.find(tokens[i],start); 155 | if((curr >= 0) &&(curr < offset)) 156 | { 157 | offset = curr; 158 | subLength = tokens[i].length(); 159 | 160 | } 161 | } 162 | if(start == offset) 163 | { 164 | if(withtoken) 165 | { 166 | tmp = str.substr(start,subLength); 167 | } 168 | else 169 | { 170 | tmp = str.substr(start,0); 171 | } 172 | start = offset + subLength; 173 | } 174 | else if(start < offset) 175 | { 176 | int len = 0; 177 | if(withtoken) 178 | { 179 | len = subLength + ( offset - start); 180 | } 181 | else 182 | { 183 | len = ( offset - start); 184 | } 185 | tmp = str.substr(start,len); 186 | start = (offset + subLength ); 187 | } 188 | /*这个影响多个空格连在一块,并且切分标志位空格的情况*/ 189 | if(!tmp.empty()) 190 | { 191 | vecstr.push_back(tmp); 192 | } 193 | } 194 | return true ; 195 | } 196 | /* 197 | ************************************************* 198 | 功能 :将字符串按照一定的Token进行切分,结果返回词和词性对。 199 | 参数 
:vecstr切分结果列表,str要切分的字符串,tokens[]切分标志,tokensnumber tokens数量 200 | 返回值 : 201 | ------------------------------------------------- 202 | 备注 :withtoken为true切分结果包含token,false不包含token,默认值为false 203 | ------------------------------------------------- 204 | 作者 :Li Yachao 205 | 时间 :2011-11-30 206 | ************************************************* 207 | */ 208 | bool StringSplit::SplitByTokens(std::vector > &vecstr, const std::string &str, const std::string tokens[], const int tokensnumber) 209 | { 210 | vecstr.clear(); 211 | if((str.empty()) ||tokensnumber <=0 ) 212 | { 213 | return false ; 214 | } 215 | std::string buffer=""; 216 | int textLength = str.length(); 217 | int start = 0; 218 | int offset = 0; 219 | while(start < textLength) 220 | { 221 | offset = textLength; 222 | int subLength =0; 223 | std::string tmp =""; 224 | std::pair kv; 225 | for(int i=0;i< tokensnumber;i++) 226 | { 227 | if(tokens[i].empty()) 228 | { 229 | continue; 230 | } 231 | int curr = str.find(tokens[i],start); 232 | if((curr >= 0) &&(curr < offset)) 233 | { 234 | offset = curr; 235 | subLength = tokens[i].length(); 236 | 237 | } 238 | } 239 | if(start == offset) 240 | { 241 | tmp = str.substr(start,0); 242 | start = offset + subLength; 243 | } 244 | else if(start < offset) 245 | { 246 | int len = 0; 247 | /*if(withtoken) 248 | { 249 | len = subLength + ( offset - start); 250 | } 251 | else 252 | { 253 | len = ( offset - start); 254 | }*/ 255 | tmp = str.substr(start,len); 256 | //std::string key = str.substr(start,offset - start); 257 | //std::string value = str.substr(offset,subLength); 258 | kv.first = str.substr(start,offset - start); 259 | kv.second = str.substr(offset,subLength); 260 | //kv.key = key ; 261 | //kv.value = value ; 262 | 263 | start = (offset + subLength ); 264 | } 265 | /*这个影响多个空格连在一块,并且切分标志位空格的情况*/ 266 | if(!kv.first.empty()) 267 | { 268 | vecstr.push_back(kv); 269 | } 270 | } 271 | return true ; 272 | } 273 | /* 274 | ************************************************* 275 | 
功能 :将字符串按照一定的Token进行切分,Token单独切开 276 | 参数 :vecstr切分结果列表,str要切分的字符串,tokens 277 | 返回值 : 278 | ------------------------------------------------- 279 | 备注 :例子"ab/c","abc","/","c" 280 | ------------------------------------------------- 281 | 作者 :Li Yachao 282 | 时间 :2011-12-14 283 | ************************************************* 284 | */ 285 | bool StringSplit::SplitToken(std::vector &vecstr, const std::string &str, const std::string token) 286 | { 287 | vecstr.clear(); 288 | int start = 0; 289 | int offset = 0; 290 | std::string term =""; 291 | std::string tk =""; 292 | while(true) 293 | { 294 | offset = str.find(token,start); 295 | if((offset < 0) || (offset < start)) 296 | { 297 | if((offset < 0) && (start == 0)) 298 | { 299 | vecstr.push_back(str); 300 | } 301 | else if((offset < 0) && (start > 0)) 302 | { 303 | term = str.substr(start,str.size() - start); 304 | if(!term.empty()) 305 | { 306 | vecstr.push_back(term); 307 | } 308 | } 309 | break; 310 | } 311 | term = str.substr(start,offset - start); 312 | if(offset + token.size() <= str.size()) 313 | { 314 | tk = str.substr(offset , token.size()); 315 | } 316 | //TrimAll(term); 317 | if(!term.empty()) 318 | { 319 | vecstr.push_back(term); 320 | } 321 | if(!tk.empty()) 322 | { 323 | vecstr.push_back(tk); 324 | } 325 | start = offset + token.size() ; 326 | } 327 | return true; 328 | } 329 | 330 | bool StringSplit::SplitTokens(std::vector &vecstr, const std::string &str, const std::string tokens[], const int tokensnumber) 331 | { 332 | vecstr.clear(); 333 | if((str.empty()) ||tokensnumber <=0 ) 334 | { 335 | return false ; 336 | } 337 | std::string buffer=""; 338 | int textLength = str.length(); 339 | int start = 0; 340 | int offset = 0; 341 | while(start < textLength) 342 | { 343 | offset = textLength; 344 | int subLength =0; 345 | std::string tmp =""; 346 | std::pair kv; 347 | for(int i=0;i< tokensnumber;i++) 348 | { 349 | if(tokens[i].empty()) 350 | { 351 | continue; 352 | } 353 | int curr = 
str.find(tokens[i],start); 354 | if((curr >= 0) &&(curr < offset)) 355 | { 356 | offset = curr; 357 | subLength = tokens[i].length(); 358 | } 359 | } 360 | if(start == offset) 361 | { 362 | tmp = str.substr(start,0); 363 | start = offset + subLength; 364 | } 365 | else if(start < offset) 366 | { 367 | int len = 0; 368 | tmp = str.substr(start,len); 369 | std::string key = str.substr(start,offset - start); 370 | std::string value = str.substr(offset,subLength); 371 | kv.first = key ; 372 | kv.second = value ; 373 | start = (offset + subLength ); 374 | } 375 | /*这个影响多个空格连在一块,并且切分标志位空格的情况*/ 376 | if(!kv.first.empty()) 377 | { 378 | //vecstr.push_back(kv); 379 | vecstr.push_back(kv.first); 380 | vecstr.push_back(kv.second); 381 | } 382 | } 383 | return true ; 384 | } 385 | 386 | } 387 | -------------------------------------------------------------------------------- /utility/StringType.cpp: -------------------------------------------------------------------------------- 1 | #include "StringType.h" 2 | 3 | namespace utility 4 | { 5 | StringType::StringType() 6 | { 7 | Init(); 8 | } 9 | bool StringType::Init() 10 | { 11 | /*for(int i=0;i<21003;i++) 12 | { 13 | gbktable.insert(gbk_table[i]); 14 | }*/ 15 | return true; 16 | } 17 | /* 18 | ************************************************* 19 | 功能 :取得一个UTF字符的类型; 20 | 参数 : 21 | 返回值 : 22 | ------------------------------------------------- 23 | 备注 : 24 | ------------------------------------------------- 25 | 作者 :Li Yachao 26 | 时间 :2011-11-30 27 | ************************************************* 28 | */ 29 | int StringType::TB_UTF8CharType(const std::string &myChar) 30 | { 31 | int val = -1; 32 | if(myChar.length() == 1) 33 | { 34 | unsigned short c = myChar[0]; 35 | if(((c >= 97) && (c <= 122)) || ((c >= 65) && (c <= 90))) 36 | { 37 | val = 0;/*英文字母,26个,大小写*/ 38 | } 39 | else if( (c >= 48) && ( c <= 57 ) ) 40 | { 41 | val = 1;/*英文数字*/ 42 | } 43 | else if(c == '.') 44 | { 45 | val = 1;/*英文数字,解决小数点切分错误*/ 46 | } 47 | else if( ((c 
>= '\x1')&& (c <= '/')) ||( (c >= ':') && (c <= '@') ) || ( ( c >= '[') && (c <= '~') ) ) 48 | { 49 | val = 2;/*英文符号*/ 50 | } 51 | else 52 | { 53 | val = 2; 54 | return val; 55 | } 56 | } 57 | else if(myChar.length() == 3) 58 | { /* 59 | std::string str="。,、;: 60 | ?!“”‘’ 61 | ╗╚()…— 62 | 《》〈〉•"; 63 | */ 64 | const int length = 22; 65 | std::string chCharList[length] = {"\xe3\x80\x82","\xef\xbc\x8c","\xe3\x80\x81","\xef\xbc\x9b","\xef\xbc\x9a",\ 66 | "\xef\xbc\x9f","\xef\xbc\x81","\xe2\x80\x9c","\xe2\x80\x9d","\xe2\x80\x98","\xe2\x80\x99",\ 67 | "\xe2\x95\x97","\xe2\x95\x9a","\xef\xbc\x88","\xef\xbc\x89","\xe2\x80\xa6","\xe2\x80\x94",\ 68 | "\xe3\x80\x8a","\xe3\x80\x8b","\xe3\x80\x88","\xe3\x80\x89","\xe2\x80\xa2"}; 69 | for(int i=0;i utf8chars; 94 | std::string chCharList[length] = {"\xe3\x80\x82","\xef\xbc\x8c","\xe3\x80\x81","\xef\xbc\x9b","\xef\xbc\x9a",\ 95 | "\xef\xbc\x9f","\xef\xbc\x81","\xe2\x80\x9c","\xe2\x80\x9d","\xe2\x80\x98","\xe2\x80\x99",\ 96 | "\xe2\x95\x97","\xe2\x95\x9a","\xef\xbc\x88","\xef\xbc\x89","\xe2\x80\xa6","\xe2\x80\x94",\ 97 | "\xe3\x80\x8a","\xe3\x80\x8b","\xe3\x80\x88","\xe3\x80\x89","\xe2\x80\xa2"}; 98 | StringSplit::SplitUTF8(str,utf8chars); 99 | for(int i=0;i< (utf8chars.size());i++) 100 | { 101 | if((utf8chars[i].length() == 1)) 102 | { 103 | char c = utf8chars[i].at(0); 104 | if(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'))) 105 | { 106 | currentType = 1;/*英文字母,26个,大小写*/ 107 | } 108 | else if( (c >= '0') && ( c <= '9' ) ) 109 | { 110 | currentType = 2;/*英文数字*/ 111 | } 112 | else if((c == '.')) 113 | { 114 | if((i > 0) && (utf8chars[i-1] != ".")) 115 | { 116 | currentType = 2;/*英文数字,解决小数点切分错误*/ 117 | } 118 | else if((i +1 < utf8chars.size()) && (utf8chars[i+1] !=".")) 119 | { 120 | currentType = 2 ; 121 | } 122 | else 123 | { 124 | currentType = 3 ; 125 | } 126 | } 127 | else if( ((c >= '\x1')&& (c <= '/')) ||( (c >= ':') && (c <= '@') ) || ( ( c >= '[') && (c <= '~') ) ) 128 | { 129 | currentType = 3;/*英文符号*/ 130 | } 
131 | } 132 | else 133 | { 134 | bool t = true ; 135 | for(int j=0;j0) 151 | { 152 | if(currentType != firstType) 153 | { 154 | return 0; 155 | } 156 | } 157 | else 158 | { 159 | firstType = currentType; 160 | } 161 | } 162 | return firstType; 163 | } 164 | /* 165 | ************************************************* 166 | 功能 :判断GBK编码字符串类型 167 | 参数 : 168 | 返回值 : 169 | ------------------------------------------------- 170 | 备注 :1表示汉字;2如标点符号;3表示gbk图形符号;4表示英文等;5表示未知 171 | ------------------------------------------------- 172 | 作者 :Li Yachao 173 | 时间 :2013-2-25 174 | ************************************************* 175 | */ 176 | int StringType::CH_GBKCharType(const std::string& myChar) 177 | { 178 | int val = 1;/*汉字,不包括标点符号*/ 179 | if(myChar.size() == 1) 180 | { 181 | val = 4; 182 | /*if(IsASCIIString(myChar)) 183 | { 184 | val = 3; 185 | }*/ 186 | } 187 | else if(myChar.size() == 2) 188 | { 189 | unsigned char c0 = myChar.at(0); 190 | unsigned char c1 = myChar.at(1); 191 | unsigned int key = c0 << 8; 192 | key += c1; 193 | std::set::iterator it = gbktable.find(key); 194 | if(it != gbktable.end()) 195 | { 196 | val = 1; 197 | } 198 | else 199 | { 200 | if(IsChGBKPunctuation(myChar)) 201 | { 202 | val = 2; 203 | } 204 | else 205 | { 206 | val = 3; 207 | } 208 | } 209 | 210 | } 211 | else 212 | { 213 | val = 4; 214 | } 215 | 216 | return val; 217 | } 218 | /* 219 | ************************************************* 220 | 功能 :判断字符串是否为ASCII编码字符串 221 | 参数 : 222 | 返回值 : 223 | ------------------------------------------------- 224 | 备注 :判断输入的字符串是否全部是英文字符 225 | ------------------------------------------------- 226 | 作者 :Li Yachao 227 | 时间 :2011-11-28 228 | ************************************************* 229 | */ 230 | bool StringType::IsASCIIString(const std::string& word) 231 | { 232 | for (unsigned int i = 0; i < word.size(); i++) 233 | { 234 | if (word[i] < 0) 235 | { 236 | return false; 237 | } 238 | } 239 | return true; 240 | } 241 | /* 242 | 
************************************************* 243 | 功能 :判断字符串是否为中文数字 244 | 参数 : 245 | 返回值 : 246 | ------------------------------------------------- 247 | 备注 :一二三四五六七八九十零〇千万亿 248 | ------------------------------------------------- 249 | 作者 :Li Yachao 250 | 时间 :2013-2-25 251 | ************************************************* 252 | */ 253 | bool StringType::IsCHGBKNumber(const std::string& str) 254 | { 255 | bool val = false; 256 | if(str == "\xd2\xbb") 257 | {/*一*/ 258 | val = true; 259 | } 260 | else if(str == "\xb6\xfe") 261 | { 262 | val = true; 263 | } 264 | else if(str == "\xc8\xfd") 265 | { 266 | val = true; 267 | } 268 | else if(str == "\xcb\xc4") 269 | { 270 | val = true; 271 | } 272 | else if(str == "\xce\xe5") 273 | { 274 | val = true; 275 | } 276 | else if(str == "\xc1\xf9") 277 | { 278 | val = true; 279 | } 280 | else if(str == "\xc6\xdf") 281 | { 282 | val = true; 283 | } 284 | else if(str == "\xb0\xcb") 285 | { 286 | val = true; 287 | } 288 | else if(str == "\xbe\xc5") 289 | { 290 | val = true; 291 | } 292 | else if(str == "\xca\xae") 293 | { 294 | val = true; 295 | } 296 | else if(str == "\xc1\xe3") 297 | { 298 | val = true; 299 | } 300 | else if(str == "\xa9\x96") 301 | { 302 | val = true; 303 | } 304 | else if(str == "\xc7\xa7") 305 | { 306 | val = true; 307 | } 308 | else if(str == "\xcd\xf2") 309 | { 310 | val = true; 311 | } 312 | else if(str == "\xd2\xda") 313 | { 314 | val = true; 315 | } 316 | return val; 317 | 318 | //return true; 319 | } 320 | 321 | /* 322 | ************************************************* 323 | 功能 :判断字符串是否为ASCII编码标点符号 324 | 参数 : 325 | 返回值 : 326 | ------------------------------------------------- 327 | 备注 :。,、;:?!“”‘’╗╚()…—《》〈〉【】 328 | ------------------------------------------------- 329 | 作者 :Li Yachao 330 | 时间 :2013-2-25 331 | ************************************************* 332 | */ 333 | bool StringType::IsChGBKPunctuation(const std::string& str) 334 | { 335 | bool val = false; 336 | if(str == "\xa1\xa3") 337 | { 
338 | val = true; 339 | } 340 | else if(str == "\xa3\xac") 341 | { 342 | val = true; 343 | } 344 | else if(str == "\xa1\xa2") 345 | { 346 | val = true; 347 | } 348 | else if(str == "\xa3\xbb") 349 | { 350 | val = true; 351 | } 352 | else if(str == "\xa3\xba") 353 | { 354 | val = true; 355 | } 356 | else if(str == "\a3\xbf") 357 | { 358 | val = true; 359 | } 360 | else if(str == "\xa3\xa1") 361 | { 362 | val = true; 363 | } 364 | else if(str == "\xa1\xb0") 365 | { 366 | val = true; 367 | } 368 | else if(str == "\xa1\xb1") 369 | { 370 | val = true; 371 | } 372 | else if(str == "\xa1\xae") 373 | { 374 | val = true; 375 | } 376 | else if(str == "\xa1\xaf") 377 | { 378 | val = true; 379 | } 380 | else if(str == "\xa8\x5b") 381 | { 382 | val = true; 383 | } 384 | else if(str == "\xa8\x5e") 385 | { 386 | val = true; 387 | } 388 | else if(str == "\xa3\xa8") 389 | { 390 | val = true; 391 | } 392 | else if(str == "\xa3\xa9") 393 | { 394 | val = true; 395 | } 396 | else if(str == "\xa1\xad") 397 | { 398 | val = true; 399 | } 400 | else if(str == "\xa1\xaa") 401 | { 402 | val = true; 403 | } 404 | else if(str == "\xa1\xb6") 405 | { 406 | val = true; 407 | } 408 | else if(str == "\xa1\xb7") 409 | { 410 | val = true; 411 | } 412 | else if(str == "\xa1\xb4") 413 | { 414 | val = true; 415 | } 416 | else if(str == "\xa1\xb5") 417 | { 418 | val = true; 419 | } 420 | else if(str == "\xa1\xbe") 421 | { 422 | val = true; 423 | } 424 | else if(str == "\xa1\xbf") 425 | { 426 | val = true; 427 | } 428 | return val; 429 | } 430 | /* 431 | ************************************************* 432 | 功能 :判断字符串是否为中文字符 433 | 参数 : 434 | 返回值 : 435 | ------------------------------------------------- 436 | 备注 :不包括汉语标点符号,并且好像只是gbk,不是扩充集。 437 | ------------------------------------------------- 438 | 作者 :Li Yachao 439 | 时间 :2011-11-28 440 | ************************************************* 441 | */ 442 | bool StringType::IsCHBGKChar(const std::string& str) 443 | { 444 | if (str.size() != 2) 445 | { 446 | 
return false; 447 | } 448 | int index = ((unsigned char)str[0]-176)*94 + (unsigned char)str[1] - 161; 449 | if (index >= 0 && index < 6768) 450 | { 451 | return true; 452 | } 453 | else 454 | { 455 | return false; 456 | } 457 | } 458 | int StringType::CHGBKCharIndex(const std::string &chChar) 459 | { 460 | if((chChar.size() == 2)) 461 | { 462 | return ((unsigned char)chChar[0]-176)*94 + (unsigned char)chChar[1] - 161; 463 | } 464 | else 465 | { 466 | return 0; 467 | } 468 | 469 | } 470 | } -------------------------------------------------------------------------------- /Las.cpp: -------------------------------------------------------------------------------- 1 | #include "Las.h" 2 | 3 | namespace Tip 4 | { 5 | Las::~Las() 6 | { 7 | delete format; 8 | if (tagging_ws != NULL) 9 | { 10 | delete tagging_ws; 11 | } 12 | if (tagging_pos != NULL) 13 | { 14 | delete tagging_pos; 15 | } 16 | } 17 | 18 | Las::Las() 19 | { 20 | format = new Formatting(); 21 | tagging_ws = NULL; 22 | tagging_pos = NULL; 23 | templatesSet = "S"; 24 | } 25 | 26 | bool Las::Train(const std::string& trainFile, const std::string& modelFile, const std::string& type, int iter , int beamSize , int nGram) 27 | { 28 | std::ifstream fin; 29 | fin.open(trainFile.c_str()); 30 | if (!fin.is_open()) 31 | { 32 | std::cerr << "Open [" << trainFile << "] error!" << std::endl; 33 | return false; 34 | } 35 | std::string utf8flag = "\xef\xbb\xbf"; 36 | std::string utf16flag = "\xff\xfe"; 37 | std::string myLine = ""; 38 | int lineIndex = 0; 39 | std::vector > fs; 40 | std::cout << "Loading Trainning Samples..." 
<< std::endl; 41 | class Model * model = NULL; 42 | class Sample * s = NULL; 43 | if (type == flag_pos) 44 | { 45 | model = new Model(modelFile, iter, templatesSet); 46 | //model = new Model(modelFile, iter); 47 | } 48 | else 49 | { 50 | model = new Model(modelFile, iter); 51 | } 52 | s = new Sample(model->LeftBound(), model->RightBound()); 53 | while (getline(fin, myLine)) 54 | { 55 | lineIndex++; 56 | StringOperation::TrimAll(myLine); 57 | if (myLine.empty()) 58 | { 59 | continue; 60 | } 61 | if (lineIndex == 1) 62 | { 63 | if (StringOperation::IsPrefix(myLine, utf8flag)) 64 | { 65 | myLine = myLine.substr(3, myLine.length() - 3); 66 | } 67 | else if (StringOperation::IsPrefix(myLine, utf16flag)) 68 | { 69 | std::cout << "File type error,need UTF8 or ANSI file."<WsTrain(myLine, fs); 80 | } 81 | else if (type == flag_pos) 82 | { 83 | format->PosTrainS(myLine,fs); 84 | } 85 | else 86 | { 87 | return false; 88 | } 89 | if (!s->AddEvents(fs)) 90 | { 91 | std::cout << "Add Events Error, at Line" << lineIndex << std::endl; 92 | return false; 93 | } 94 | fs.clear(); 95 | } 96 | fin.close(); 97 | s->AddEventsOver(); 98 | model->LoadSamples(s); 99 | std::cout << "Start Trainning..." 
<< std::endl; 100 | clock_t start, end; 101 | start = clock(); 102 | Perceptron * per = new Perceptron(s, model, true, iter, beamSize, nGram); 103 | per->Train(); 104 | model->SaveModel(s); 105 | end = clock(); 106 | std::cout << "done, time cost: " << double(end - start) / CLOCKS_PER_SEC << " s" << std::endl; 107 | delete s; 108 | delete model; 109 | delete per; 110 | return true; 111 | } 112 | 113 | bool Las::Test(const std::string& inFile, const std::string& outFile, const std::string& modelws, const std::string& modelpos, const std::string& type) 114 | { 115 | class Sample * sample_ws = NULL; 116 | class Sample * sample_pos = NULL; 117 | class Model * model_ws = NULL; 118 | class Model * model_pos = NULL; 119 | clock_t start, end; 120 | start = clock(); 121 | if (type == flag_ws) 122 | { 123 | model_ws = new Model(); 124 | sample_ws = new Sample(); 125 | if (!model_ws->ReadModel(modelws, sample_ws)) 126 | { 127 | std::cout << "Read " << modelws << " Error!!!" << std::endl; 128 | return false; 129 | } 130 | tagging_ws = new Perceptron(sample_ws, model_ws); 131 | } 132 | else if (type == flag_pos) 133 | { 134 | model_pos = new Model(); 135 | sample_pos = new Sample(); 136 | if (!model_pos->ReadModel(modelpos, sample_pos)) 137 | { 138 | std::cout << "Read " << modelpos << " Error!!!" << std::endl; 139 | return false; 140 | } 141 | tagging_pos = new Perceptron(sample_pos, model_pos); 142 | } 143 | else if (type == flag_all) 144 | { 145 | model_ws = new Model(); 146 | sample_ws = new Sample(); 147 | if (!model_ws->ReadModel(modelws, sample_ws)) 148 | { 149 | std::cout << "Read " << modelws << " Error!!!" << std::endl; 150 | return false; 151 | } 152 | tagging_ws = new Perceptron(sample_ws, model_ws); 153 | sample_pos = new Sample(); 154 | model_pos = new Model(); 155 | if (!model_pos->ReadModel(modelpos, sample_pos)) 156 | { 157 | std::cout << "Read " << modelpos << " Error!!!" 
<< std::endl; 158 | return false; 159 | } 160 | tagging_pos = new Perceptron(sample_pos, model_pos); 161 | } 162 | else 163 | { 164 | return false; 165 | } 166 | std::ifstream fin; 167 | std::ofstream fout; 168 | fin.open(inFile.c_str()); 169 | if (!fin.is_open()) 170 | { 171 | std::cerr << "Open [" << inFile << "] Error!" << std::endl; 172 | return false; 173 | } 174 | fout.open(outFile.c_str()); 175 | if (!fout.is_open()) 176 | { 177 | std::cerr << "Open [" << outFile << "] Error!" << std::endl; 178 | return false; 179 | } 180 | std::string utf8flag = "\xef\xbb\xbf"; 181 | std::string utf16flag = "\xff\xfe"; 182 | std::string myLine = ""; 183 | int lineIndex = 0; 184 | std::vector > fs; 185 | std::cout << "Start Tagging..." << std::endl; 186 | while (getline(fin, myLine)) 187 | { 188 | lineIndex++; 189 | if (lineIndex == 1) 190 | { 191 | if (StringOperation::IsPrefix(myLine, utf8flag)) 192 | { 193 | myLine = myLine.substr(3, myLine.length() - 3); 194 | } 195 | else if (StringOperation::IsPrefix(myLine, utf16flag)) 196 | { 197 | std::cout << "File Type Error,need UTF8 or ANSI file.\n"; 198 | return false; 199 | } 200 | } 201 | if (type == flag_ws) 202 | { 203 | std::string val = ""; 204 | WsTest(myLine,val); 205 | fout << val << std::endl; 206 | } 207 | else if (type == flag_pos) 208 | { 209 | std::string val = ""; 210 | PosTest(myLine, val); 211 | fout << val << std::endl; 212 | } 213 | else if (type == flag_all) 214 | { 215 | std::string ws = ""; 216 | std::string pos = ""; 217 | WsTest(myLine, ws); 218 | PosTest(ws, pos); 219 | fout << pos << std::endl; 220 | } 221 | else 222 | { 223 | return false; 224 | } 225 | } 226 | fin.close(); 227 | fout.close(); 228 | if (type == flag_ws) 229 | { 230 | delete sample_ws; 231 | delete model_ws; 232 | } 233 | else if (type == flag_pos) 234 | { 235 | delete sample_pos; 236 | delete model_pos; 237 | } 238 | else if (type == flag_all) 239 | { 240 | delete sample_ws; 241 | delete sample_pos; 242 | delete model_ws; 243 | delete 
model_pos; 244 | } 245 | else 246 | { 247 | 248 | } 249 | end = clock(); 250 | std::cout << "done, time cost: " << double(end - start) / CLOCKS_PER_SEC << " s" << std::endl; 251 | return true; 252 | } 253 | 254 | bool Las::PosTest(const std::string& text, std::string& val) 255 | { 256 | val.clear(); 257 | std::vector > fs; 258 | std::vector result; 259 | std::vector segs; 260 | format->PosTestS(text,fs,segs); 261 | tagging_pos->Test(fs, result); 262 | int size = result.size(); 263 | for (int i = 0; i < size; i++) 264 | { 265 | val += fs[i][0]; 266 | if (segs[i]) 267 | { 268 | val += "\xe0\xbc\x8b"; 269 | } 270 | val += "/"; 271 | val += result[i]; 272 | val += delimiter; 273 | } 274 | return true; 275 | } 276 | 277 | bool Las::WsTest(const std::string& text, std::string& val) 278 | { 279 | val.clear(); 280 | std::vector > fs; 281 | std::vector segs; 282 | std::vector result; 283 | format->WsTest(text, fs, segs); 284 | tagging_ws->Test(fs, result); 285 | std::string buffer = ""; 286 | int size = result.size(); 287 | for (int i = 0; i < size; i++) 288 | { 289 | buffer += fs[i][0]; 290 | if (segs[i]) 291 | { 292 | buffer += "\xe0\xbc\x8b"; 293 | } 294 | if ((result[i] == "E") || (result[i] == "S")) 295 | { 296 | val += buffer; 297 | val += delimiter; 298 | buffer.clear(); 299 | } 300 | } 301 | return true; 302 | } 303 | 304 | bool Las::WsTest(const std::string& text, std::vector& words, bool withSeg) 305 | { 306 | words.clear(); 307 | std::vector > fs; 308 | std::vector segs; 309 | std::vector result; 310 | format->WsTest(text, fs, segs); 311 | tagging_ws->Test(fs, result); 312 | std::string buffer = ""; 313 | int size = result.size(); 314 | for (int i = 0; i < size; i++) 315 | { 316 | buffer += fs[i][0]; 317 | if (withSeg && segs[i]) 318 | { 319 | buffer += "\xe0\xbc\x8b"; 320 | } 321 | if ((result[i] == "E") || (result[i] == "S")) 322 | { 323 | words.push_back(buffer); 324 | buffer.clear(); 325 | } 326 | } 327 | return true; 328 | } 329 | 330 | bool 
Las::PosDevelopment(const std::string& text, std::string& val) 331 | { 332 | val.clear(); 333 | std::vector > fs; 334 | std::vector result; 335 | std::vector segs; 336 | format->PosTrainS(text, fs); 337 | tagging_pos->Test(fs, result); 338 | int size = result.size(); 339 | if (size == 0) 340 | { 341 | return true; 342 | } 343 | total += size; 344 | int index = fs.at(0).size() - 1; 345 | for (int i = 0; i < size; i++) 346 | { 347 | if (result[i] == fs[i][index]) 348 | { 349 | correct++; 350 | } 351 | val += fs[i][0]; 352 | //if (segs[i]) 353 | //{ 354 | //val += "\xe0\xbc\x8b"; 355 | //} 356 | val += "/"; 357 | val += result[i]; 358 | val += delimiter; 359 | } 360 | return true; 361 | } 362 | 363 | bool Las::Development(const std::string& inFile, const std::string& modelws, const std::string& modelpos, const std::string& type) 364 | { 365 | class Sample * sample_ws = NULL; 366 | class Sample * sample_pos = NULL; 367 | class Model * model_ws = NULL; 368 | class Model * model_pos = NULL; 369 | if (type == flag_ws) 370 | { 371 | model_ws = new Model(); 372 | sample_ws = new Sample(); 373 | if (!model_ws->ReadModel(modelws, sample_ws)) 374 | { 375 | return false; 376 | } 377 | tagging_ws = new Perceptron(sample_ws, model_ws); 378 | } 379 | else if (type == flag_pos) 380 | { 381 | model_pos = new Model(); 382 | sample_pos = new Sample(); 383 | if (!model_pos->ReadModel(modelpos, sample_pos)) 384 | { 385 | return false; 386 | } 387 | tagging_pos = new Perceptron(sample_pos, model_pos); 388 | } 389 | else 390 | { 391 | return false; 392 | } 393 | std::ifstream fin; 394 | fin.open(inFile.c_str()); 395 | if (!fin.is_open()) 396 | { 397 | std::cerr << "Open [" << inFile << "] Error!" 
<< std::endl; 398 | return false; 399 | } 400 | std::ofstream fout; 401 | fout.open("devout.txt"); 402 | std::string utf8flag = "\xef\xbb\xbf"; 403 | std::string utf16flag = "\xff\xfe"; 404 | std::string myLine = ""; 405 | int lineIndex = 0; 406 | std::vector > fs; 407 | std::cout << "Start Tagging..." << std::endl; 408 | total = 0; 409 | correct = 0; 410 | clock_t start, end; 411 | start = clock(); 412 | while (getline(fin, myLine)) 413 | { 414 | lineIndex++; 415 | if (lineIndex == 1) 416 | { 417 | if (StringOperation::IsPrefix(myLine, utf8flag)) 418 | { 419 | myLine = myLine.substr(3, myLine.length() - 3); 420 | } 421 | else if (StringOperation::IsPrefix(myLine, utf16flag)) 422 | { 423 | std::cout << "File type error,need UTF8 or ANSI file.\n"; 424 | return false; 425 | } 426 | } 427 | if (type == flag_ws) 428 | { 429 | std::string val = ""; 430 | } 431 | else if (type == flag_pos) 432 | { 433 | std::string val = ""; 434 | PosDevelopment(myLine, val); 435 | fout<< val << std::endl; 436 | } 437 | else 438 | { 439 | return false; 440 | } 441 | } 442 | fin.close(); 443 | if (type == flag_ws) 444 | { 445 | delete sample_ws; 446 | delete model_ws; 447 | } 448 | else if (type == flag_pos) 449 | { 450 | delete sample_pos; 451 | delete model_pos; 452 | } 453 | else 454 | { 455 | 456 | } 457 | end = clock(); 458 | std::cout << "Precision: " << (correct / total) << std::endl; 459 | std::cout << "done, time cost: " << double(end - start) / CLOCKS_PER_SEC << " s" << std::endl; 460 | return true; 461 | } 462 | } 463 | -------------------------------------------------------------------------------- /utility/utf8/core.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, 
distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include 32 | 33 | namespace utf8 34 | { 35 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers 36 | // You may need to change them to match your system. 37 | // These typedefs have the same names as ones from cstdint, or boost/cstdint 38 | typedef unsigned char uint8_t; 39 | typedef unsigned short uint16_t; 40 | typedef unsigned int uint32_t; 41 | 42 | // Helper code - not intended to be directly called by the library users. 
May be changed at any time 43 | namespace internal 44 | { 45 | // Unicode constants 46 | // Leading (high) surrogates: 0xd800 - 0xdbff 47 | // Trailing (low) surrogates: 0xdc00 - 0xdfff 48 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u; 49 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; 50 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; 51 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; 52 | const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); 53 | const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; 54 | 55 | // Maximum valid value for a Unicode code point 56 | const uint32_t CODE_POINT_MAX = 0x0010ffffu; 57 | 58 | template 59 | inline uint8_t mask8(octet_type oc) 60 | { 61 | return static_cast(0xff & oc); 62 | } 63 | template 64 | inline uint16_t mask16(u16_type oc) 65 | { 66 | return static_cast(0xffff & oc); 67 | } 68 | template 69 | inline bool is_trail(octet_type oc) 70 | { 71 | return ((mask8(oc) >> 6) == 0x2); 72 | } 73 | 74 | template 75 | inline bool is_lead_surrogate(u16 cp) 76 | { 77 | return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); 78 | } 79 | 80 | template 81 | inline bool is_trail_surrogate(u16 cp) 82 | { 83 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 84 | } 85 | 86 | template 87 | inline bool is_surrogate(u16 cp) 88 | { 89 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 90 | } 91 | 92 | template 93 | inline bool is_code_point_valid(u32 cp) 94 | { 95 | return (cp <= CODE_POINT_MAX && !is_surrogate(cp)); 96 | } 97 | 98 | template 99 | inline typename std::iterator_traits::difference_type 100 | sequence_length(octet_iterator lead_it) 101 | { 102 | uint8_t lead = mask8(*lead_it); 103 | if (lead < 0x80) 104 | return 1; 105 | else if ((lead >> 5) == 0x6) 106 | return 2; 107 | else if ((lead >> 4) == 0xe) 108 | return 3; 109 | else if ((lead >> 3) == 0x1e) 110 | return 4; 111 | else 112 | return 0; 113 | } 114 | 115 | template 116 | inline bool 
is_overlong_sequence(uint32_t cp, octet_difference_type length) 117 | { 118 | if (cp < 0x80) { 119 | if (length != 1) 120 | return true; 121 | } 122 | else if (cp < 0x800) { 123 | if (length != 2) 124 | return true; 125 | } 126 | else if (cp < 0x10000) { 127 | if (length != 3) 128 | return true; 129 | } 130 | 131 | return false; 132 | } 133 | 134 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; 135 | 136 | /// get_sequence_x functions decode utf-8 sequences of the length x 137 | 138 | template 139 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point) 140 | { 141 | if (it != end) { 142 | if (code_point) 143 | *code_point = mask8(*it); 144 | return UTF8_OK; 145 | } 146 | return NOT_ENOUGH_ROOM; 147 | } 148 | 149 | template 150 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point) 151 | { 152 | utf_error ret_code = NOT_ENOUGH_ROOM; 153 | 154 | if (it != end) { 155 | uint32_t cp = mask8(*it); 156 | if (++it != end) { 157 | if (is_trail(*it)) { 158 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 159 | 160 | if (code_point) 161 | *code_point = cp; 162 | ret_code = UTF8_OK; 163 | } 164 | else 165 | ret_code = INCOMPLETE_SEQUENCE; 166 | } 167 | else 168 | ret_code = NOT_ENOUGH_ROOM; 169 | } 170 | 171 | return ret_code; 172 | } 173 | 174 | template 175 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point) 176 | { 177 | utf_error ret_code = NOT_ENOUGH_ROOM; 178 | 179 | if (it != end) { 180 | uint32_t cp = mask8(*it); 181 | if (++it != end) { 182 | if (is_trail(*it)) { 183 | cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff); 184 | if (++it != end) { 185 | if (is_trail(*it)) { 186 | cp += (*it) & 0x3f; 187 | 188 | if (code_point) 189 | *code_point = cp; 190 | ret_code = UTF8_OK; 191 | } 192 | else 193 | ret_code = INCOMPLETE_SEQUENCE; 194 | } 195 | else 196 | ret_code = NOT_ENOUGH_ROOM; 197 | } 198 | 
else 199 | ret_code = INCOMPLETE_SEQUENCE; 200 | } 201 | else 202 | ret_code = NOT_ENOUGH_ROOM; 203 | } 204 | 205 | return ret_code; 206 | } 207 | 208 | template 209 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point) 210 | { 211 | utf_error ret_code = NOT_ENOUGH_ROOM; 212 | 213 | if (it != end) { 214 | uint32_t cp = mask8(*it); 215 | if (++it != end) { 216 | if (is_trail(*it)) { 217 | cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff); 218 | if (++it != end) { 219 | if (is_trail(*it)) { 220 | cp += (mask8(*it) << 6) & 0xfff; 221 | if (++it != end) { 222 | if (is_trail(*it)) { 223 | cp += (*it) & 0x3f; 224 | 225 | if (code_point) 226 | *code_point = cp; 227 | ret_code = UTF8_OK; 228 | } 229 | else 230 | ret_code = INCOMPLETE_SEQUENCE; 231 | } 232 | else 233 | ret_code = NOT_ENOUGH_ROOM; 234 | } 235 | else 236 | ret_code = INCOMPLETE_SEQUENCE; 237 | } 238 | else 239 | ret_code = NOT_ENOUGH_ROOM; 240 | } 241 | else 242 | ret_code = INCOMPLETE_SEQUENCE; 243 | } 244 | else 245 | ret_code = NOT_ENOUGH_ROOM; 246 | } 247 | 248 | return ret_code; 249 | } 250 | 251 | template 252 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point) 253 | { 254 | // Save the original value of it so we can go back in case of failure 255 | // Of course, it does not make much sense with i.e. 
stream iterators 256 | octet_iterator original_it = it; 257 | 258 | uint32_t cp = 0; 259 | // Determine the sequence length based on the lead octet 260 | typedef typename std::iterator_traits::difference_type octet_difference_type; 261 | octet_difference_type length = sequence_length(it); 262 | if (length == 0) 263 | return INVALID_LEAD; 264 | 265 | // Now that we have a valid sequence length, get trail octets and calculate the code point 266 | utf_error err = UTF8_OK; 267 | switch (length) { 268 | case 1: 269 | err = get_sequence_1(it, end, &cp); 270 | break; 271 | case 2: 272 | err = get_sequence_2(it, end, &cp); 273 | break; 274 | case 3: 275 | err = get_sequence_3(it, end, &cp); 276 | break; 277 | case 4: 278 | err = get_sequence_4(it, end, &cp); 279 | break; 280 | } 281 | 282 | if (err == UTF8_OK) { 283 | // Decoding succeeded. Now, security checks... 284 | if (is_code_point_valid(cp)) { 285 | if (!is_overlong_sequence(cp, length)){ 286 | // Passed! Return here. 287 | if (code_point) 288 | *code_point = cp; 289 | ++it; 290 | return UTF8_OK; 291 | } 292 | else 293 | err = OVERLONG_SEQUENCE; 294 | } 295 | else 296 | err = INVALID_CODE_POINT; 297 | } 298 | 299 | // Failure branch - restore the original value of the iterator 300 | it = original_it; 301 | return err; 302 | } 303 | 304 | template 305 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { 306 | return validate_next(it, end, 0); 307 | } 308 | 309 | } // namespace internal 310 | 311 | /// The library API - functions intended to be called by the users 312 | 313 | // Byte order mark 314 | const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 315 | 316 | template 317 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) 318 | { 319 | octet_iterator result = start; 320 | while (result != end) { 321 | internal::utf_error err_code = internal::validate_next(result, end); 322 | if (err_code != internal::UTF8_OK) 323 | return result; 324 | } 325 | return result; 326 | } 327 | 328 | 
template 329 | inline bool is_valid(octet_iterator start, octet_iterator end) 330 | { 331 | return (find_invalid(start, end) == end); 332 | } 333 | 334 | template 335 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) 336 | { 337 | return ( 338 | ((it != end) && (internal::mask8(*it++)) == bom[0]) && 339 | ((it != end) && (internal::mask8(*it++)) == bom[1]) && 340 | ((it != end) && (internal::mask8(*it)) == bom[2]) 341 | ); 342 | } 343 | 344 | //Deprecated in release 2.3 345 | template 346 | inline bool is_bom (octet_iterator it) 347 | { 348 | return ( 349 | (internal::mask8(*it++)) == bom[0] && 350 | (internal::mask8(*it++)) == bom[1] && 351 | (internal::mask8(*it)) == bom[2] 352 | ); 353 | } 354 | } // namespace utf8 355 | 356 | #endif // header guard 357 | 358 | 359 | -------------------------------------------------------------------------------- /utility/utf8/checked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | #include 33 | 34 | namespace utf8 35 | { 36 | // Base for the exceptions that may be thrown from the library 37 | class exception : public std::exception { 38 | }; 39 | 40 | // Exceptions that may be thrown from the library functions. 41 | class invalid_code_point : public exception { 42 | uint32_t cp; 43 | public: 44 | invalid_code_point(uint32_t cp) : cp(cp) {} 45 | virtual const char* what() const throw() { return "Invalid code point"; } 46 | uint32_t code_point() const {return cp;} 47 | }; 48 | 49 | class invalid_utf8 : public exception { 50 | uint8_t u8; 51 | public: 52 | invalid_utf8 (uint8_t u) : u8(u) {} 53 | virtual const char* what() const throw() { return "Invalid UTF-8"; } 54 | uint8_t utf8_octet() const {return u8;} 55 | }; 56 | 57 | class invalid_utf16 : public exception { 58 | uint16_t u16; 59 | public: 60 | invalid_utf16 (uint16_t u) : u16(u) {} 61 | virtual const char* what() const throw() { return "Invalid UTF-16"; } 62 | uint16_t utf16_word() const {return u16;} 63 | }; 64 | 65 | class not_enough_room : public exception { 66 | public: 67 | virtual const char* what() const throw() { return "Not enough space"; } 68 | }; 69 | 70 | /// The library API - functions intended to be called by the users 71 | 72 | template 73 | output_iterator 
replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 74 | { 75 | while (start != end) { 76 | octet_iterator sequence_start = start; 77 | internal::utf_error err_code = internal::validate_next(start, end); 78 | switch (err_code) { 79 | case internal::UTF8_OK : 80 | for (octet_iterator it = sequence_start; it != start; ++it) 81 | *out++ = *it; 82 | break; 83 | case internal::NOT_ENOUGH_ROOM: 84 | throw not_enough_room(); 85 | case internal::INVALID_LEAD: 86 | append (replacement, out); 87 | ++start; 88 | break; 89 | case internal::INCOMPLETE_SEQUENCE: 90 | case internal::OVERLONG_SEQUENCE: 91 | case internal::INVALID_CODE_POINT: 92 | append (replacement, out); 93 | ++start; 94 | // just one replacement mark for the sequence 95 | while (internal::is_trail(*start) && start != end) 96 | ++start; 97 | break; 98 | } 99 | } 100 | return out; 101 | } 102 | 103 | template 104 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 105 | { 106 | static const uint32_t replacement_marker = internal::mask16(0xfffd); 107 | return replace_invalid(start, end, out, replacement_marker); 108 | } 109 | 110 | template 111 | octet_iterator append(uint32_t cp, octet_iterator result) 112 | { 113 | if (!internal::is_code_point_valid(cp)) 114 | throw invalid_code_point(cp); 115 | 116 | if (cp < 0x80) // one octet 117 | *(result++) = static_cast(cp); 118 | else if (cp < 0x800) { // two octets 119 | *(result++) = static_cast((cp >> 6) | 0xc0); 120 | *(result++) = static_cast((cp & 0x3f) | 0x80); 121 | } 122 | else if (cp < 0x10000) { // three octets 123 | *(result++) = static_cast((cp >> 12) | 0xe0); 124 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 125 | *(result++) = static_cast((cp & 0x3f) | 0x80); 126 | } 127 | else { // four octets 128 | *(result++) = static_cast((cp >> 18) | 0xf0); 129 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 130 | *(result++) = static_cast(((cp >> 6) 
& 0x3f) | 0x80); 131 | *(result++) = static_cast((cp & 0x3f) | 0x80); 132 | } 133 | return result; 134 | } 135 | 136 | template 137 | uint32_t next(octet_iterator& it, octet_iterator end) 138 | { 139 | uint32_t cp = 0; 140 | internal::utf_error err_code = internal::validate_next(it, end, &cp); 141 | switch (err_code) { 142 | case internal::UTF8_OK : 143 | break; 144 | case internal::NOT_ENOUGH_ROOM : 145 | throw not_enough_room(); 146 | case internal::INVALID_LEAD : 147 | case internal::INCOMPLETE_SEQUENCE : 148 | case internal::OVERLONG_SEQUENCE : 149 | throw invalid_utf8(*it); 150 | case internal::INVALID_CODE_POINT : 151 | throw invalid_code_point(cp); 152 | } 153 | return cp; 154 | } 155 | 156 | template 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) 158 | { 159 | return next(it, end); 160 | } 161 | 162 | template 163 | uint32_t prior(octet_iterator& it, octet_iterator start) 164 | { 165 | // can't do much if it == start 166 | if (it == start) 167 | throw not_enough_room(); 168 | 169 | octet_iterator end = it; 170 | // Go back until we hit either a lead octet or start 171 | while (internal::is_trail(*(--it))) 172 | if (it == start) 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence 174 | return peek_next(it, end); 175 | } 176 | 177 | /// Deprecated in versions that include "prior" 178 | template 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) 180 | { 181 | octet_iterator end = it; 182 | while (internal::is_trail(*(--it))) 183 | if (it == pass_start) 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence 185 | octet_iterator temp = it; 186 | return next(temp, end); 187 | } 188 | 189 | template 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 191 | { 192 | for (distance_type i = 0; i < n; ++i) 193 | next(it, end); 194 | } 195 | 196 | template 197 | typename std::iterator_traits::difference_type 198 | distance (octet_iterator first, octet_iterator last) 199 | { 
200 | typename std::iterator_traits::difference_type dist; 201 | for (dist = 0; first < last; ++dist) 202 | next(first, last); 203 | return dist; 204 | } 205 | 206 | template 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 208 | { 209 | while (start != end) { 210 | uint32_t cp = internal::mask16(*start++); 211 | // Take care of surrogate pairs first 212 | if (internal::is_lead_surrogate(cp)) { 213 | if (start != end) { 214 | uint32_t trail_surrogate = internal::mask16(*start++); 215 | if (internal::is_trail_surrogate(trail_surrogate)) 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 217 | else 218 | throw invalid_utf16(static_cast(trail_surrogate)); 219 | } 220 | else 221 | throw invalid_utf16(static_cast(cp)); 222 | 223 | } 224 | // Lone trail surrogate 225 | else if (internal::is_trail_surrogate(cp)) 226 | throw invalid_utf16(static_cast(cp)); 227 | 228 | result = append(cp, result); 229 | } 230 | return result; 231 | } 232 | 233 | template 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 235 | { 236 | while (start != end) { 237 | uint32_t cp = next(start, end); 238 | if (cp > 0xffff) { //make a surrogate pair 239 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 240 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 241 | } 242 | else 243 | *result++ = static_cast(cp); 244 | } 245 | return result; 246 | } 247 | 248 | template 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 250 | { 251 | while (start != end) 252 | result = append(*(start++), result); 253 | 254 | return result; 255 | } 256 | 257 | template 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 259 | { 260 | while (start != end) 261 | (*result++) = next(start, end); 262 | 263 | return result; 264 | } 265 | 266 | // The iterator class 267 | template 268 | 
class iterator : public std::iterator { 269 | octet_iterator it; 270 | octet_iterator range_start; 271 | octet_iterator range_end; 272 | public: 273 | iterator () {}; 274 | explicit iterator (const octet_iterator& octet_it, 275 | const octet_iterator& range_start, 276 | const octet_iterator& range_end) : 277 | it(octet_it), range_start(range_start), range_end(range_end) 278 | { 279 | if (it < range_start || it > range_end) 280 | throw std::out_of_range("Invalid utf-8 iterator position"); 281 | } 282 | // the default "big three" are OK 283 | octet_iterator base () const { return it; } 284 | uint32_t operator * () const 285 | { 286 | octet_iterator temp = it; 287 | return next(temp, range_end); 288 | } 289 | bool operator == (const iterator& rhs) const 290 | { 291 | if (range_start != rhs.range_start || range_end != rhs.range_end) 292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 293 | return (it == rhs.it); 294 | } 295 | bool operator != (const iterator& rhs) const 296 | { 297 | return !(operator == (rhs)); 298 | } 299 | iterator& operator ++ () 300 | { 301 | next(it, range_end); 302 | return *this; 303 | } 304 | iterator operator ++ (int) 305 | { 306 | iterator temp = *this; 307 | next(it, range_end); 308 | return temp; 309 | } 310 | iterator& operator -- () 311 | { 312 | prior(it, range_start); 313 | return *this; 314 | } 315 | iterator operator -- (int) 316 | { 317 | iterator temp = *this; 318 | prior(it, range_start); 319 | return temp; 320 | } 321 | }; // class iterator 322 | 323 | } // namespace utf8 324 | 325 | #endif //header guard 326 | 327 | 328 | -------------------------------------------------------------------------------- /utility/Tokenize.cpp: -------------------------------------------------------------------------------- 1 | #include "Tokenize.h" 2 | 3 | namespace utility 4 | { 5 | Tokenize::Tokenize() 6 | { 7 | //str_type = new StringType(); 8 | } 9 | Tokenize::~Tokenize() 10 | { 11 | //delete str_type; 12 | 
} 13 | /* 14 | ************************************************* 15 | 功能 : 中文文本预处理,序列化输入 16 | 参数 : 17 | 返回值 : 18 | ------------------------------------------------- 19 | 备注 : CH表示汉字,PU表示符号,EN表示英文字符,本方法把连续的英文字符组合 20 | ------------------------------------------------- 21 | 作者 :Li Yachao 22 | 时间 :2012-3-10 23 | ************************************************* 24 | */ 25 | void Tokenize::GBK(const std::string &line, std::vector > & val) 26 | { 27 | val.clear(); 28 | std::string buf=""; 29 | for(int i=0;ip; 40 | p.first = buf; 41 | p.second = "EN"; 42 | //vec_val.push_back(buf); 43 | val.push_back(p); 44 | buf.clear(); 45 | } 46 | } 47 | else 48 | { 49 | buf.append(myChar); 50 | } 51 | } 52 | else 53 | { 54 | if(!buf.empty()) 55 | { 56 | std::pairp; 57 | p.first = buf; 58 | p.second = "EN"; 59 | //vec_val.push_back(buf); 60 | val.push_back(p); 61 | buf.clear(); 62 | } 63 | myChar+=line.at(i+1); 64 | i += 1; 65 | std::pairp; 66 | p.first = myChar; 67 | int t = str_type->CH_GBKCharType(myChar); 68 | if((t == 1)) 69 | { 70 | p.second = "CH"; 71 | } 72 | else if((t == 2) || (t == 3)) 73 | { 74 | p.second = "PU"; 75 | } 76 | else 77 | { 78 | p.second = "EN"; 79 | } 80 | val.push_back(p); 81 | } 82 | } 83 | if(!buf.empty()) 84 | { 85 | std::pairp; 86 | p.first = buf; 87 | p.second = "EN"; 88 | val.push_back(p); 89 | } 90 | } 91 | /* 92 | ************************************************* 93 | 功能 : 中文文本预处理,序列化输入 94 | 参数 : 95 | 返回值 : 96 | ------------------------------------------------- 97 | 备注 : CH表示汉字,PU表示符号,EN表示英文字符,把英文字符切分开 98 | ------------------------------------------------- 99 | 作者 :Li Yachao 100 | 时间 :2012-3-10 101 | ************************************************* 102 | */ 103 | void Tokenize::GBKAll(const std::string & line, std::vector > & val) 104 | { 105 | val.clear(); 106 | for(int i=0;ip; 111 | if (myCh1 < 128) 112 | { 113 | p.second = "EN"; 114 | } 115 | else 116 | { 117 | myChar+=line.at(i+1); 118 | i += 1; 119 | int t = str_type->CH_GBKCharType(myChar); 120 
| if((t == 1)) 121 | { 122 | p.second = "CH"; 123 | } 124 | else if((t == 2) || (t == 3)) 125 | { 126 | p.second = "PU"; 127 | } 128 | else 129 | { 130 | p.second = "EN"; 131 | } 132 | } 133 | p.first = myChar; 134 | val.push_back(p); 135 | } 136 | } 137 | /* 138 | ************************************************* 139 | 功能 : 中文文本预处理,序列化输入 140 | 参数 : 141 | 返回值 : 142 | ------------------------------------------------- 143 | 备注 : UTF8编码方式,对于中文的可以切分出中文和英文的各个字符 144 | ------------------------------------------------- 145 | 作者 :Li Yachao 146 | 时间 :2012-3-5 147 | ************************************************* 148 | */ 149 | void Tokenize::Chinese_UTF8(const std::string& line, std::vector& vec_val) 150 | { 151 | vec_val.clear(); 152 | if(line.empty()) 153 | { 154 | return ; 155 | } 156 | StringSplit::SplitUTF8(line, vec_val); 157 | } 158 | /* 159 | ************************************************* 160 | 功能 : 中文文本预处理,序列化输入 161 | 参数 : 162 | 返回值 : 163 | ------------------------------------------------- 164 | 备注 : ANSI编码方式,对连续的英文单独切分 165 | ------------------------------------------------- 166 | 作者 :Li Yachao 167 | 时间 :2012-3-5 168 | ************************************************* 169 | */ 170 | void Tokenize::Chinese_ANSI(const std::string& line, std::vector& vec_val) 171 | { 172 | vec_val.clear(); 173 | std::string buf=""; 174 | for(int i=0;i& vec_val) 200 | { 201 | vec_val.clear(); 202 | std::string buf=""; 203 | for(int i=0;i &vec_val) 229 | { 230 | vec_val.clear(); 231 | for(int i=0;i& val) 260 | { 261 | std::vectorvec_val; 262 | Tibetan(line,vec_val); 263 | val.clear(); 264 | val.assign(vec_val.begin(),vec_val.end()); 265 | } 266 | /* 267 | ************************************************* 268 | 功能 : 藏文文本预处理,序列化输入 269 | 参数 : 270 | 返回值 : 271 | ------------------------------------------------- 272 | 备注 : UTF8编码方式,对于藏文是纯藏文的文本输入,对于音节缩减的问题不予处理, 273 | ------------------------------------------------- 274 | 作者 :Li Yachao 275 | 时间 :2012-3-5,2015-11-6 276 | 
************************************************* 277 | */ 278 | void Tokenize::Tibetan(const std::string & line,std::vector& val) 279 | { 280 | val.clear(); 281 | std::string tokens[]={" ","/","\x09"}; 282 | std::string tokens_u[]={"\xe0\xbc\x8b"}; 283 | std::string tokens_u1= "\xe0\xbc\x8d"; 284 | //std::string tokens_u1= "///"; 285 | std::vector > vec_classfication; 286 | std::vector vec_units; 287 | std::vector vec_tmp; 288 | std::vector tmp; 289 | std::pair kv; 290 | kv.first = line; 291 | kv.second ="4"; 292 | TextClassification::TB_UTF8(line,vec_classfication); 293 | for(int i=0;i > & val) 324 | { 325 | val.clear(); 326 | std::string tokens[]={" ","/","\x09"}; 327 | std::string tokens_u[]={"\xe0\xbc\x8b"}; 328 | std::string tokens_u1= "\xe0\xbc\x8d"; 329 | std::vector > vec_classfication; 330 | std::vector vec_units; 331 | std::vector vec_tmp; 332 | std::vector tmp; 333 | std::pair kv; 334 | kv.first = line; 335 | kv.second ="4"; 336 | TextClassification::TB_UTF8(line,vec_classfication); 337 | for(int i=0;ip; 369 | p.first = vec_units[j]; 370 | if(vec_units[j] == tokens_u1) 371 | { 372 | p.second = "TPU"; 373 | } 374 | else 375 | { 376 | p.second = "TB"; 377 | } 378 | val.push_back(p); 379 | } 380 | vec_units.clear(); 381 | } 382 | vec_classfication.clear(); 383 | } 384 | /* 385 | ************************************************* 386 | 功能 : 藏文文本预处理,序列化输入 387 | 参数 : 388 | 返回值 : 389 | ------------------------------------------------- 390 | 备注 : UTF8编码方式,对于藏文是纯藏文的文本输入,对于音节缩减的问题予以全部切分 391 | ------------------------------------------------- 392 | 作者 :Li Yachao 393 | 时间 :2012-3-5,2015-11-6 394 | ************************************************* 395 | */ 396 | void Tokenize::TibetanAll(const std::string & line,std::list& val) 397 | { 398 | std::vectorvec_val; 399 | TibetanAll(line,vec_val); 400 | val.clear(); 401 | val.assign(vec_val.begin(),vec_val.end()); 402 | 403 | } 404 | void Tokenize::TibetanAll(const std::string & line,std::vector& val) 405 | { 406 | 
std::string syllables[6]; 407 | syllables[0] = "\xe0\xbd\xa6\xe0\xbc\x8b"; 408 | syllables[1] = "\xe0\xbd\xa2\xe0\xbc\x8b"; 409 | syllables[2] = "\xe0\xbd\xa0\xe0\xbd\xb2\xe0\xbc\x8b"; 410 | syllables[3] = "\xe0\xbd\xa0\xe0\xbd\xbc\xe0\xbc\x8b"; 411 | syllables[4] = "\xe0\xbd\xa0\xe0\xbd\x84\xe0\xbc\x8b"; 412 | syllables[5] = "\xe0\xbd\xa0\xe0\xbd\x98\xe0\xbc\x8b"; 413 | std::vector tmp; 414 | Tibetan(line,tmp); 415 | val.clear(); 416 | for(int i=0;i=0) && (index <=5)) 444 | { 445 | int length = tmp[i].size() - syllables[index].size(); 446 | std::string sub = tmp[i].substr(0,length); 447 | if(!sub.empty()) 448 | { 449 | val.push_back(sub); 450 | } 451 | val.push_back(syllables[index]); 452 | } 453 | else 454 | { 455 | val.push_back(tmp[i]); 456 | } 457 | } 458 | } 459 | } 460 | void Tokenize::TibetanAll(const std::string & line,std::vector > & val) 461 | { 462 | std::string syllables[6]; 463 | syllables[0] = "\xe0\xbd\xa6\xe0\xbc\x8b"; 464 | syllables[1] = "\xe0\xbd\xa2\xe0\xbc\x8b"; 465 | syllables[2] = "\xe0\xbd\xa0\xe0\xbd\xb2\xe0\xbc\x8b"; 466 | syllables[3] = "\xe0\xbd\xa0\xe0\xbd\xbc\xe0\xbc\x8b"; 467 | syllables[4] = "\xe0\xbd\xa0\xe0\xbd\x84\xe0\xbc\x8b"; 468 | syllables[5] = "\xe0\xbd\xa0\xe0\xbd\x98\xe0\xbc\x8b"; 469 | std::vector > tmp; 470 | Tibetan(line,tmp); 471 | val.clear(); 472 | for(int i=0;ip; 491 | p.first = tmp[i].first; 492 | p.second = "TB"; 493 | val.push_back(p); 494 | } 495 | else 496 | { 497 | int index = -1; 498 | bool sub = false; 499 | for(int k=0;k<6;k++) 500 | { 501 | if(StringOperation::IsPostfix(tmp[i].first,syllables[k])) 502 | { 503 | index = k; 504 | break; 505 | } 506 | } 507 | if((index >=0) && (index <=5)) 508 | { 509 | int length = tmp[i].first.size() - syllables[index].size(); 510 | std::string sub = tmp[i].first.substr(0,length); 511 | if(!sub.empty()) 512 | { 513 | std::pairp; 514 | p.first = sub; 515 | p.second = "TB"; 516 | val.push_back(p); 517 | } 518 | std::pairp; 519 | p.first = syllables[index]; 520 | p.second = 
"TB"; 521 | val.push_back(p); 522 | } 523 | else 524 | { 525 | std::pairp; 526 | p.first = tmp[i].first; 527 | p.second = "TB"; 528 | val.push_back(p); 529 | } 530 | } 531 | } 532 | } 533 | } 534 | --------------------------------------------------------------------------------