├── Preprocess ├── __init__.py ├── bow.py └── boc.py ├── corpus.zip ├── __init__.py ├── paper └── 限定领域口语对话系统中超出领域话语的对话行为识别.pdf ├── .idea ├── vcs.xml ├── modules.xml ├── Ch2r_ood_understanding.iml ├── misc.xml └── workspace.xml ├── .gitattributes ├── config.yaml ├── .gitignore ├── README.md ├── random_forest.py ├── two-phase.py ├── ME(TFIDF+OOV).py └── cnn.py /Preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corpus.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZixuanKe/Ch2r_ood_understanding/HEAD/corpus.zip -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | __author__ = 'jdwang' 3 | __date__ = 'create date: 2016-06-23' 4 | __email__ = '383287471@qq.com' 5 | 6 | -------------------------------------------------------------------------------- /paper/限定领域口语对话系统中超出领域话语的对话行为识别.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZixuanKe/Ch2r_ood_understanding/HEAD/paper/限定领域口语对话系统中超出领域话语的对话行为识别.pdf -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.idea/Ch2r_ood_understanding.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | main: 2 | path: main.py 3 | # 描述 4 | 5 | describe: 该包算法参考论文:基于词矢量的短文本分类;该论文使用TFIDF作为特征向量,最后使用最大熵分类器进行模型训练测试时,使用word2vec对集外词(OOV)进行替换. 6 | 7 | name: &name bow_word2vec_oov 8 | 9 | model: &model tfidf 10 | 11 | max_features: 2000 12 | 13 | max_keywords: 2000 14 | 15 | full_mode: False 16 | 17 | remove_stopword: True 18 | 19 | train_data_file_path: new_train_all.csv 20 | test_data_file_path: new_ood_labeled.csv 21 | # 结果输出到... 
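# (illustrative note, not part of the original file: the list values below are joined into a
#  single path at load time — ME(TFIDF+OOV).py calls ''.join(config['log_file_path']) — so with
#  the name/model anchors above they resolve to ./bow_word2vec_oov_tfidf.csv and .log)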
22 | result_file_path: ['./',*name,'_',*model,'.csv'] 23 | # 使用什么模型:TFIDF 或者 BOW 24 | # 日志文件路径 25 | log_file_path: ['./',*name,'_',*model,'.log'] 26 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /Preprocess/bow.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import pandas as pd 3 | import jieba 4 | 5 | 6 | ''' 7 | 简单的分词模块 8 | 9 | ''' 10 | 11 | train_data = pd.read_csv( 12 | 'v2.3_train_S_1518.csv', 13 | sep='\t', 14 | encoding='utf8', 15 | header=0 16 | ) 17 | 18 | test_data = pd.read_csv( 19 | 'v2.3_test_S_131.csv', 20 | sep='\t', 21 | encoding='utf8', 22 | header=0 23 | ) 24 | 25 | train_data['WORDS'] = [" ".join(jieba.cut(sentence)) for sentence in train_data['SENTENCE']] 26 | test_data['WORDS'] = [" ".join(jieba.cut(sentence)) + " " for sentence in test_data['SENTENCE']] 27 | 28 | train_data.to_csv( 29 | "train_seg.csv", 30 | sep='\t', 31 | encoding='utf8', 32 | 33 | ) 34 | 35 | test_data.to_csv( 36 | "test_seg.csv", 37 | sep='\t', 38 | encoding='utf8', 39 | 40 | ) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Preprocess/boc.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import pandas as pd 3 | import yaml 4 | 5 | import jieba 6 | 7 | 8 | 9 | ''' 10 | 按字切分 而不按词切分 11 | 即 BOC(Characteristic) 与 BOW(Word) 的区别 12 | 13 | ''' 14 | def singleword(train_data,test_data): 15 | result_train = [] 16 | for words in train_data['WORDS']: 17 | character_result_train = "" 18 | words = words.split() 19 | for characters in words: 20 | if len(characters) > 1: 21 | for character in characters: 22 | character_result_train += (character+ u" ") 23 | else: 24 | character_result_train += (characters + u" ") 25 | 26 | character_result_train = character_result_train[0:len(character_result_train)-1] 27 | result_train.append(character_result_train) 28 | 29 | 30 | train_data['SINGLE'] = result_train 31 | 32 | 33 | 34 | result_test = [] 35 | for words in test_data['WORDS']: 36 | character_result_test = "" 37 | words = words.split() 38 | for characters in words: 39 | if len(characters) > 1: 40 | for character in characters: 41 | character_result_test += (character+ u" 
") 42 | else: 43 | character_result_test += (characters + u" ") 44 | character_result_test = character_result_test[0:len(character_result_test)-1] 45 | result_test.append(character_result_test) 46 | 47 | 48 | test_data['SINGLE'] = result_test 49 | 50 | train_data.to_csv( 51 | "train.csv", 52 | sep = '\t', 53 | encoding = 'utf8', 54 | 55 | ) 56 | 57 | test_data.to_csv( 58 | "test.csv", 59 | sep='\t', 60 | encoding='utf8', 61 | 62 | ) 63 | return result_train,result_test 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | train_data = pd.read_csv( 69 | 'train_seg.csv', 70 | sep='\t', 71 | encoding='utf8', 72 | header=0 73 | ) 74 | 75 | test_data = pd.read_csv( 76 | 'test_seg.csv', 77 | sep='\t', 78 | encoding='utf8', 79 | header=0 80 | ) 81 | 82 | singleword(train_data,test_data) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Ch2r_ood_understanding 2 | 3 | --- 4 | 本文档为论文[限定领域口语对话系统中超出领域话语的对话行为识别](https://zixuanke.github.io/docs/%E9%99%90%E5%AE%9A%E9%A2%86%E5%9F%9F%E5%8F%A3%E8%AF%AD%E5%AF%B9%E8%AF%9D%E7%B3%BB%E7%BB%9F%E4%B8%AD%E8%B6%85%E5%87%BA%E9%A2%86%E5%9F%9F%E8%AF%9D%E8%AF%AD%E7%9A%84%E5%AF%B9%E8%AF%9D%E8%A1%8C%E4%B8%BA%E8%AF%86%E5%88%AB.pdf)的部分实验代码。代码基于Python,需要用到的外部库有: 5 | 6 | > * Keras(搭建神经网络) 7 | > * Scikit-learn(最大熵,随机森林) 8 | > * gensim(使用word2vec替换字典外的词) 9 | 10 | 实验涉及的方法主要有 11 | > * 二阶段法(two-phase) 12 | > * 最大熵法(ME(TFIDF+OOV)) 13 | > * 随机森林(RF(random_forest.py)) 14 | > * CNN(cnn.py) 15 | 16 | 语料库简介 17 | [语料库](https://github.com/ZixuanKe/Ch2r_ood_understanding/tree/master/corpus)中有两个语料库可供选择: 18 | > * AIML语料库(人造数据集) 19 | > * CCL语料库(实际测试用到的数据集) 20 | 21 | 标签格式为: 22 | 23 | > categoryA # categoryB 24 | 25 | 即 **大类维度为A,小类维度为B** 26 | 27 | 28 | 其中 **大类共4类,小类共16类** 29 | 30 | 实验方法 31 | 预处理模块 32 | [预处理](https://github.com/ZixuanKe/Ch2r_ood_understanding/blob/master/Preprocess)中有两个预处理脚本可供选择: 33 | > * BOC(Bag-of-character 即按字划分,制造“字袋”) 34 | > * BOW(Bag-of-word 即按词划分,制造“词袋”) 35 | 36 | 二阶段法 37 | 我们将分类切割成两部分,首先进行4个大类的分类,在大类的基础上,再对大类下的小类进行细分 38 | > 这样做的合理性,在部分比赛参赛选手的做法中得到证实。理由是我们认为大类分类比小类分类更加容易,在大类之内进行小类分类,可以使得小类分类时范围减少,减少小类分类的难度。然而这样也有不合理性,比如,大类分类出错,则小类分类则无机会再分对,也即误差的传递性。 39 | 40 | > 参考论文: [Splusplus: A Feature-Rich Two-stage Classifier for Sentiment Analysis of Tweets](http://www.aclweb.org/anthology/S/S15/S15-2.pdf#page=557) 41 | 42 | 在代码中,针对每个大类对应的小类,重新训练了各自的分类器: 43 | ```python 44 | resultData,resultTarget = findAllTrainning('attitude',exam_bow_fea_data) #找到其大类的所有小类 45 | gb1 = sub_classfier(resultData,resultTarget) 46 | resultData,resultTarget = findAllTrainning('shopping',exam_bow_fea_data) #找到其大类的所有小类 47 | gb2 = sub_classfier(resultData,resultTarget) 48 | resultData,resultTarget = findAllTrainning('chatting',exam_bow_fea_data) #找到其大类的所有小类 49 | gb3 = sub_classfier(resultData,resultTarget) 50 | resultData,resultTarget = findAllTrainning('trouble',exam_bow_fea_data) #找到其大类的所有小类 51 | gb4 = sub_classfier(resultData,resultTarget) 52 | ``` 53 | 最大熵法 54 | 使用最大熵模型直接分类作为对照组 55 | >* 最大熵模型在许多文本分类问题中都表现了他优越的性能,这里我们利用他作为对照组,观察后面CNN和RF的效果 56 | 57 | > 参考论文: [使用最大熵模型进行中文文本分类](http://www.cnki.net/KCMS/detail/detail.aspx?QueryID=4&CurRec=1&recid=&filename=JFYZ200501013&dbname=CJFD2005&dbcode=CJFQ&pr=&urlid=&yx=&v=MjkxMDVMRzRIdFRNcm85RVo0UjhlWDFMdXhZUzdEaDFUM3FUcldNMUZyQ1VSTHlmYitSckZ5L2hVYnpPTHl2U2Q=) 58 | 59 | >* 
53 | Maximum entropy
54 | A maximum entropy model applied directly to the fine classes is used as the control group
55 | >* Maximum entropy models have shown excellent performance on many text classification problems; we use one as the baseline against which the CNN and RF results below are compared
56 | 
57 | > Reference: [使用最大熵模型进行中文文本分类](http://www.cnki.net/KCMS/detail/detail.aspx?QueryID=4&CurRec=1&recid=&filename=JFYZ200501013&dbname=CJFD2005&dbcode=CJFQ&pr=&urlid=&yx=&v=MjkxMDVMRzRIdFRNcm85RVo0UjhlWDFMdXhZUzdEaDFUM3FUcldNMUZyQ1VSTHlmYitSckZ5L2hVYnpPTHl2U2Q=)
58 | 
59 | >* When logistic regression is applied to a multi-class problem and its loss is switched to the (multinomial) cross-entropy, it becomes a maximum entropy model: [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)
60 | 
61 | 
62 | >* To improve accuracy, words outside the vocabulary are handled by OOV (out-of-vocabulary) replacement: each such word is replaced by the closest in-vocabulary word according to word2vec trained on an external corpus (the paper uses the SMP2015 Weibo data, 10 million posts)
63 | 
64 | >Reference: [基于词矢量相似度的短文本分类](http://www.cnki.net/KCMS/detail/detail.aspx?QueryID=0&CurRec=1&recid=&filename=SDDX201412004&dbname=CJFDLAST2015&dbcode=CJFQ&pr=&urlid=&yx=&v=MDE1MzkxRnJDVVJMeWZiK1JyRnkvaFVieklOaW5QZHJHNEg5WE5yWTlGWUlSOGVYMUx1eFlTN0RoMVQzcVRyV00=)
65 | 
66 | In the code, the LogisticRegression parameters are set as
67 | ```python
68 | clf = LogisticRegression(multi_class="multinomial",solver="newton-cg")
69 | ```
70 | 
71 | Convolutional neural network
72 | > CNNs are used in NLP in many different ways; here we explore input encodings built with different window sizes, namely seq-CNN and bow-CNN
73 | 
74 | >Reference: [ (Johnson and Zhang, NAACL 2015) Effective Use of Word Order for Text Categorization with Convolutional Neural Networks](https://arxiv.org/pdf/1412.1058.pdf)
75 | 
76 | Seq-CNN
77 | The input is the concatenation of the **one-hot encodings** of consecutive words
78 | > Advantage: the order of the words is preserved
79 | > Disadvantage: the dimensionality becomes very large and easily causes the curse of dimensionality
80 | 
81 | Bow-CNN
82 | A dimensionality-reduced variant of **Seq-CNN**
83 | > For a fixed window size n, the one-hot codings within each window are summed element-wise
84 | Advantage: word order between windows is preserved and the dimensionality is reduced
85 | Disadvantage: word order within a window is lost
86 | 
87 | 
88 | Random forest
89 | A traditional **bagging ensemble**; the **number of trees** is chosen by cross-validation and the **tree depth** follows the rule of thumb:
90 | > log(M), where M is the total number of features
91 | 
92 | Evaluation metric
93 | > Accuracy: sum(test_data_label == clf.predict(test)) / (1.0 * len(test_data_label))
--------------------------------------------------------------------------------
/random_forest.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import pandas as pd
3 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
4 | import pandas as pd
5 | import yaml
6 | import pickle
7 | import numpy as np
8 | from sklearn import preprocessing
9 | from sklearn.ensemble import RandomForestClassifier
10 | 
11 | 
12 | 
13 | '''
14 | Try classifying with a random forest, for comparison against the CNN classifier
15 | 
16 | '''
17 | def load_data(file_name):
18 |     import csv
19 | 
20 |     csvfile = file(file_name, 'rb')
21 |     reader = csv.reader(csvfile)
22 | 
23 |     label = []
24 |     data = []
25 |     for line in reader:
26 |         label.append(line[0])
27 |         data.append(line[1:len(line)])
28 | 
29 |     # print label
30 |     # print data
31 |     csvfile.close()
32 |     return data,label
33 | 
34 | 
35 | 
36 | if __name__ == '__main__':
37 | 
38 |     data = pd.read_csv(
39 |         "v2.3_test_Sa_79.csv",
40 |         sep='\t',
41 |         encoding='utf8',
42 |         header=0
43 |     )
44 |     f = open("result.txt",'a')
45 |     train_data_bow_fea_bow,train_data_label_bow = load_data("v2.3_train_Sa_word_seg_i1_dev_830.csv")
46 |     test_data_bow_fea_bow,test_data_label_bow = load_data("v2.3_train_Sa_word_seg_i1_val_76.csv")
47 |     print "拼接 i1,卷积层"
48 |     with open("TrainSet_2+281_feature_d1.pickle","rb") as file:
49 |         train_data_bow_fea = pickle.load(file)
50 |         train_data_label = pickle.load(file)
51 |         test_data_bow_fea = pickle.load(file)
52 |         test_data_label = pickle.load(file)
53 |     #
54 |     # train_data_bow_fea_bow = preprocessing.minmax_scale(train_data_bow_fea_bow)
55 |     # test_data_bow_fea_bow = preprocessing.minmax_scale(test_data_bow_fea_bow)
56 | 
57 |     # used for concatenating the two feature sets
58 |     print "length1: " + str(len(train_data_bow_fea_bow[0]))
59 |     print "length2: " + str(len(train_data_bow_fea[0]))
60 |     print len(train_data_bow_fea_bow)
61 |     print len(train_data_bow_fea)
62 | 
63 |     train_length = len(train_data_bow_fea_bow[0]) + len(train_data_bow_fea[0])
64 |     test_length = len(test_data_bow_fea_bow[0]) + len(test_data_bow_fea[0])
65 | 
66 |     train_weigth = len(train_data_bow_fea_bow)
67 | 
test_weigth = len(test_data_bow_fea_bow) 68 | 69 | train_data_bow_fea = np.concatenate((train_data_bow_fea,train_data_bow_fea_bow),axis=1) 70 | test_data_bow_fea = np.concatenate((test_data_bow_fea,test_data_bow_fea_bow),axis=1) 71 | 72 | train_data_bow_fea.reshape(train_length,train_weigth) 73 | test_data_bow_fea.reshape(test_length,test_weigth) 74 | 75 | print "length合并: " + str(len(train_data_bow_fea[0])) 76 | 77 | train = train_data_bow_fea 78 | test = test_data_bow_fea 79 | 80 | index_to_label = [ 81 | u'其它#骂人', 82 | u'导购#不成交', 83 | u'导购#不理解', 84 | u'导购#开始', 85 | u'导购#成交', 86 | u'导购#更换', 87 | u'导购#结束', 88 | u'导购#详情', 89 | u'表态#不满', 90 | u'表态#否定', 91 | u'表态#满意', 92 | u'表态#犹豫', 93 | u'表态#疑问', 94 | u'表态#肯定', 95 | u'表态#附和', 96 | u'表态#随便', 97 | u'社交义务#不用谢', 98 | u'社交义务#接受道歉', 99 | u'社交义务#致谢', 100 | u'社交义务#道歉', 101 | u'社交义务#问候', 102 | u'闲聊#天气', 103 | u'闲聊#时间', 104 | u'闲聊#身份信息' 105 | ] 106 | 107 | for n in [1000]: 108 | clf = RandomForestClassifier(n_estimators=n) #随机森林 109 | # clf.fit(train_data_bow_fea,train_data['LABEL']) 110 | 111 | clf.fit(train,train_data_label) 112 | print >> f ,sum(test_data_label == clf.predict(test)) / (1.0 * len(test_data_label)) 113 | print sum(test_data_label == clf.predict(test)) / (1.0 * len(test_data_label)) 114 | 115 | 116 | #bad case 输出 117 | predict = clf.predict(test_data_bow_fea) 118 | for i in range(len(test_data_label)): 119 | if test_data_label[i] != predict[i]: 120 | print data['SENTENCE'][i] + "\t" + index_to_label[int(predict[i])] + "\t" + index_to_label[int(test_data_label[i])] -------------------------------------------------------------------------------- /two-phase.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | #coding:utf-8 4 | 5 | 6 | from dateutil.parser import parse 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | import pandas as pd 9 | import numpy as np 10 | from sklearn.externals import joblib #用于保存模型 11 | import jieba 12 | from sklearn.ensemble import GradientBoostingClassifier 13 | from sklearn.metrics import f1_score #评价标准F值 14 | from sklearn.metrics import precision_score 15 | from sklearn.metrics import recall_score 16 | from sklearn.ensemble import RandomForestClassifier 17 | 18 | ''' 19 | 基于二分类的尝试 20 | 方法1: 直接进行16个小类的分类 21 | 方法2: 先进行大类的分类,后进行小类的分类,两步走 22 | 23 | ''' 24 | 25 | 26 | 27 | def sub_classfier(exam_bow_fea_data,exam_bow_fea_target): 28 | ''' 29 | 30 | 训练子分类分类器 31 | 32 | :param exam_bow_fea_data: 数据 33 | :param exam_bow_fea_target: 标签 34 | :return: 返回 随机森林训练模型 35 | ''' 36 | 37 | 38 | rf = RandomForestClassifier(n_estimators=200) #TARGET为label2 39 | print "target:",len(exam_bow_fea_target) 40 | print "data:",len(exam_bow_fea_data) 41 | rf.fit(exam_bow_fea_data, exam_bow_fea_target) 42 | return rf 43 | 44 | def findAllTrainning(mainClass,exam_bow_fea): 45 | 46 | ''' 47 | 找出大类之下的,所有子分类 48 | :param mainClass: 大类 49 | :param exam_bow_fea: 训练数据 50 | :return: 子类训练数据 子类训练标签 51 | ''' 52 | resultData = [] 53 | for rec in range(len(exam)): 54 | if exam.iloc[rec].LABEL1 == mainClass: 55 | resultData .append( exam_bow_fea[rec] ) 56 | print len(resultData) 57 | resultTarget = exam[['LABEL2']][exam.LABEL1 == mainClass] 58 | return resultData,resultTarget 59 | 60 | 61 | 62 | #读取数据 63 | print 'Loading Data' 64 | exam = pd.read_table('train_all.csv', 65 | converters={'date': parse},encoding = 'utf-8') 66 | 67 | 68 | exam_test = pd.read_table('ch2r_test.csv', 69 | converters={'date': parse},encoding = 'utf-8') 70 | 71 | 72 | 73 | #分词 74 | exam = 
exam.drop(['SEGMENT_FULL','SEGMENT_EVERYWORD'],axis=1) 75 | exam_test = exam_test.drop(['SEGMENT_FULL','SEGMENT_EVERYWORD','SEGMENT_OOV','SEGMENT_OOV_EVERYWORD'],axis=1) 76 | exam['SENTENCE'] = [' '.join(jieba.cut(sentence)) for sentence in exam['SENTENCE']] 77 | exam_test['SENTENCE'] = [' '.join(jieba.cut(sentence)) for sentence in exam_test['SENTENCE']] 78 | print exam.head() 79 | exam['SENTENCE'] = exam['SEGMENT'].apply(lambda x:' '.join(x.split('|'))) 80 | exam_test['SENTENCE'] = exam_test['SEGMENT'].apply(lambda x:' '.join(x.split('|'))) 81 | 82 | #预处理结果文件保存 83 | exam.to_csv('Exam_Prep.csv',encoding = 'utf-8') 84 | exam_test.to_csv('Exam_Prep_Test.csv',encoding = 'utf-8') 85 | 86 | 87 | 88 | #找特征 BOW 89 | vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b") 90 | exam_bow_fea = vect.fit_transform(exam['SENTENCE']).toarray() 91 | exam_bow_fea_test = vect.transform(exam_test['SENTENCE']).toarray() 92 | 93 | 94 | 95 | exam_bow_fea_data = exam_bow_fea #归一化 96 | print len(exam_bow_fea_data) 97 | exam_bow_fea_target = exam['LABEL2'] 98 | print len(exam_bow_fea_target) 99 | 100 | exam_bow_fea_test_data = exam_bow_fea_test #归一化 101 | print len(exam_bow_fea_test_data) 102 | exam_bow_fea_test_target = exam_test['LABEL2'] 103 | print len(exam_bow_fea_test_target) 104 | 105 | 106 | 107 | #特征读取完毕 108 | 109 | 110 | 111 | # 方法1 直接对 16个小类进行分类 112 | esti = 400; dep = 7 113 | gb = RandomForestClassifier(n_estimators=200) 114 | gb.fit(exam_bow_fea_data,exam_bow_fea_target) #直接fit即可,没有明确的标记,不像分类问题 115 | # joblib.dump(gb,"gb.RandomForestClassifierModel") 116 | 117 | 118 | print sum(exam_bow_fea_test_target == gb.predict(exam_bow_fea_test_data))/1184.0 119 | print sum(exam_bow_fea_test_target == gb.predict(exam_bow_fea_test_data)) 120 | 121 | 122 | #方法2 先大类后小类 123 | 124 | exam_bow_fea_target = exam['LABEL1'] 125 | exam_bow_fea_test_target = exam_test['LABEL1'] 126 | 127 | exam_bow_fea_test_result = exam_test['LABEL2'] #终极结果 128 | 129 | esti = 400; dep = 7 130 | gb = RandomForestClassifier(n_estimators=200) 131 | gb.fit(exam_bow_fea_data,exam_bow_fea_target) #直接fit即可,没有明确的标记,不像分类问题 132 | # joblib.dump(gb,"gb.RandomForestClassifierModel") 133 | 134 | print sum(exam_bow_fea_test_target == gb.predict(exam_bow_fea_test_data))/58.0 135 | print exam_bow_fea_test_target 136 | print gb.predict(exam_bow_fea_test_data) 137 | np.savetxt('1.csv', exam_bow_fea_test_target,fmt='%s', delimiter = '/t') 138 | np.savetxt('2csv', gb.predict(exam_bow_fea_test_data),fmt='%s', delimiter = '/t') 139 | 140 | 141 | 142 | mainClass = [i for i in gb.predict(exam_bow_fea_test_data)] 143 | 144 | resultData,resultTarget = findAllTrainning('attitude',exam_bow_fea_data) #找到其大类的所有小类 145 | gb1 = sub_classfier(resultData,resultTarget) 146 | resultData,resultTarget = findAllTrainning('shopping',exam_bow_fea_data) #找到其大类的所有小类 147 | gb2 = sub_classfier(resultData,resultTarget) 148 | resultData,resultTarget = findAllTrainning('chatting',exam_bow_fea_data) #找到其大类的所有小类 149 | gb3 = sub_classfier(resultData,resultTarget) 150 | resultData,resultTarget = findAllTrainning('trouble',exam_bow_fea_data) #找到其大类的所有小类 151 | gb4 = sub_classfier(resultData,resultTarget) 152 | 153 | 154 | 155 | result = [] 156 | for i in range(len(exam_test)): 157 | print mainClass[i] 158 | if mainClass[i] == 'attitude': 159 | result.append( gb1.predict(exam_bow_fea_test_data[i])) 160 | elif mainClass[i] == 'shopping': 161 | result.append( gb2.predict(exam_bow_fea_test_data[i])) 162 | elif mainClass[i] == 'chatting': 163 | result.append( gb3.predict(exam_bow_fea_test_data[i])) 
164 | elif mainClass[i] == 'trouble': 165 | result.append( gb4.predict(exam_bow_fea_test_data[i])) 166 | 167 | 168 | 169 | #保存结果 170 | # print sum( result == exam_bow_fea_test_result ) / 58.0 171 | np.savetxt('new.csv', exam_bow_fea_test_result.as_matrix(),fmt='%s', delimiter = '/t') 172 | np.savetxt('re.csv', np.asarray(result).flatten(),fmt='%s', delimiter = '/t') 173 | 174 | -------------------------------------------------------------------------------- /ME(TFIDF+OOV).py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf-8') 5 | 6 | __author__ = 'jdwang' 7 | __date__ = 'create date: 2016-05-29' 8 | import numpy as np 9 | import pandas as pd 10 | import logging 11 | import timeit 12 | import yaml 13 | from gensim.models import Word2Vec 14 | from sklearn.linear_model import LogisticRegression 15 | from dateutil.parser import parse 16 | import pandas as pd 17 | import jieba 18 | 19 | 20 | 21 | 22 | config = yaml.load(file('./config.yaml')) #读取yaml配置文件 23 | config = config['main'] #以字典的方式读取2 24 | logging.basicConfig(filename=''.join(config['log_file_path']), filemode='w', 25 | format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 26 | start_time = timeit.default_timer() 27 | 28 | #可保存为日志文件进行管理 29 | 30 | print('=' * 30) 31 | # print config['describe'] 32 | print('=' * 30) 33 | print 'start running!' 34 | logging.debug('=' * 30) 35 | logging.debug(config['describe']) 36 | logging.debug('=' * 30) 37 | logging.debug('start running!') 38 | logging.debug('=' * 20) 39 | 40 | 41 | import jieba 42 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 43 | from sklearn.ensemble import RandomForestClassifier 44 | 45 | 46 | train_data = pd.read_csv( 47 | config['train_data_file_path'], 48 | sep='\t', 49 | encoding='utf8', 50 | header=0 51 | ) 52 | 53 | test_data = pd.read_csv( 54 | config['test_data_file_path'], 55 | sep='\t', 56 | encoding='utf8', 57 | header=0 58 | ) 59 | 60 | logging.debug('train data shape is :%s'%(str(train_data.shape))) 61 | print('train data shape is :%s'%(str(train_data.shape))) 62 | 63 | logging.debug('test data shape is :%s'%(str(test_data.shape))) 64 | print('test data shape is :%s'%(str(train_data.shape))) 65 | logging.debug('-' * 20) 66 | # 去除类别 其他#其他 67 | logging.debug('去除类别 其他#其他') 68 | train_data = train_data[train_data['LABEL']!=u'其他#其他'] 69 | test_data = test_data[test_data['LABEL']!=u'其他#其他'] 70 | logging.debug('train data shape is :%s'%(str(train_data.shape))) 71 | print('train data shape is :%s'%(str(train_data.shape))) 72 | 73 | logging.debug('test data shape is :%s'%(str(test_data.shape))) 74 | print('test data shape is :%s'%(str(train_data.shape))) 75 | logging.debug('-' * 20) 76 | 77 | train_data = train_data[['LABEL','SENTENCE']] 78 | test_data = test_data[['LABEL','SENTENCE']] 79 | 80 | index_to_label = list(train_data['LABEL'].unique()) 81 | logging.debug(u'总共类别数:%d,分别为:%s'%(len(index_to_label),','.join(index_to_label))) 82 | print('总共类别数:%d'%(len(index_to_label))) 83 | 84 | label_to_index = {label:idx for idx,label in enumerate(index_to_label)} 85 | 86 | train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index) 87 | test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index) 88 | # print train_data.head() 89 | 90 | 91 | logging.debug('=' * 20) 92 | logging.debug('对数据进行分词...') 93 | logging.debug('-' * 20) 94 | 95 | sentence_to_seg = lambda x: jieba.cut(x,cut_all=True) 96 | 97 | 
train_data['WORDS'] = [' '.join(jieba.cut(sentence,cut_all=True)) for sentence in train_data['SENTENCE']] 98 | test_data['WORDS'] = [' '.join(jieba.cut(sentence,cut_all=True)) for sentence in test_data['SENTENCE']] 99 | 100 | # train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg) 101 | # test_data['WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg) 102 | print train_data.head() 103 | 104 | logging.debug('=' * 20) 105 | logging.debug('开始生成特征向量...') 106 | 107 | vectorizer = CountVectorizer(analyzer="word", 108 | token_pattern=u'(?u)\\b\w+\\b', 109 | tokenizer=None, 110 | preprocessor=None, 111 | lowercase=False, 112 | stop_words=None, 113 | max_features=config['max_features']) 114 | 115 | print test_data.head() 116 | train_X_features = vectorizer.fit_transform(train_data['WORDS'].as_matrix()).toarray( 117 | 118 | ) 119 | 120 | 121 | vocabulary = vectorizer.get_feature_names() 122 | logging.debug(u'字典大小:%d个词,有:%s'%(len(vocabulary),','.join(vocabulary))) 123 | # print(u'字典大小:%d,有:%s'%(len(vocabulary),','.join(vocabulary))) 124 | 125 | logging.debug('train X shape is :%s'%(str(train_X_features.shape))) 126 | print('train X shape is :%s'%(str(train_X_features.shape))) 127 | 128 | logging.debug('=' * 20) 129 | logging.debug(u'计算概率') 130 | logging.debug('注意:如果一个词在一个句子中出现多次,也只算一次,即这里计算的是,这个词在多少个句子中出现的次数') 131 | 132 | row,col = train_X_features.shape 133 | # 若一个词在句子中出现多次,只算一次 134 | train_X_features = np.asarray([item>0 for item in train_X_features.flatten()],dtype=int).reshape(row,col) 135 | 136 | words_total_count = sum(train_X_features.flatten()) 137 | logging.debug('训练库中,词的总计数为:%d'%(words_total_count)) 138 | print('训练库中,词的总计数为:%d'%(words_total_count)) 139 | 140 | logging.debug('-' * 20) 141 | # 统计每个词的出现次数,如果一个词在一个句子中出现多次,也只算一次,即这里计算的是,这个词在多少个句子中出现的次数 142 | logging.debug('统计每个词的出现次数,如果一个词在一个句子中出现多次,也只算一次,即这里计算的是,这个词在多少个句子中出现的次数') 143 | get_word_count = lambda x: sum(x) 144 | word_counts = np.sum(train_X_features,axis=0) 145 | 146 | p_word = word_counts/(1.0*words_total_count) 147 | logging.debug(u'最大词频为:%f,次数为:%d,该词为:%s'%(max(p_word),max(word_counts),vocabulary[np.argmax(word_counts)])) 148 | # print(u'最大词频为:%f,次数为:%d,该词为:%s'%(max(p_word),max(word_counts),vocabulary[np.argmax(word_counts)])) 149 | 150 | logging.debug('-' * 20) 151 | logging.debug('计算词和各个类的共现次数,以及每个类的句子数...') 152 | 153 | print('计算词和各个类的共现次数...') 154 | # count(word,class) 155 | count_word_class = [] 156 | # count(class) 157 | count_class = [] 158 | for label in index_to_label: 159 | logging.debug('-' * 10) 160 | logging.debug(u'处理类别:%s'%(label)) 161 | # print(u'处理类别:%s'%(label)) 162 | # 计算相应类别的句子 163 | index = (train_data['LABEL'] == label).as_matrix() 164 | sentences = train_X_features[index] 165 | print len(sentences) 166 | logging.debug('句子数为:%d'%(len(sentences))) 167 | print('句子数为:%d'%(len(sentences))) 168 | count_class.append(len(sentences)) 169 | count_word_class.append(np.sum(sentences,axis=0)) 170 | 171 | # count(class) 172 | count_class = np.asarray(count_class) 173 | # P(class) 174 | p_class = count_class/(1.0*len(train_data)) 175 | # P(class|word) 176 | p_class_on_word = count_word_class/(word_counts*1.0) 177 | p_class_on_word = p_class_on_word.transpose() 178 | 179 | logging.debug('-' * 20) 180 | logging.debug('计算 P(class|word)/P(class)') 181 | 182 | print p_class_on_word[0] 183 | print p_class 184 | # P(class|word)/P(class) 185 | p_rate = p_class_on_word/p_class 186 | print p_rate[0] 187 | logging.debug('计算 log( P(class|word)/P(class) )') 188 | # log( P(class|word)/P(class) ) 189 | log_p_rate = 
np.log(p_rate) 190 | print log_p_rate[0] 191 | 192 | # P(class|word) * log( P(class|word)/P(class) ) 193 | p_ent = log_p_rate * p_class_on_word 194 | p_ent = np.nan_to_num(p_ent) 195 | print p_ent[0] 196 | # 期望交叉熵 197 | entroy = np.sum(p_ent,axis=1) 198 | print entroy[0] 199 | 200 | print p_word[0] 201 | # 结果 = 期望交叉熵 * P(word) 202 | 203 | # 论文直接使用词频*熵,则将会导致词频大的词权重很大, 204 | # 即:entroy = p_word * entroy 205 | # 改进:使用sigmoid函数进行平滑 206 | # 或者不使用词频,效果也更好 207 | def sigmoid(x): 208 | return 1/(1+np.exp(-x)) 209 | # entroy = sigmoid(p_word) * entroy 210 | print entroy[0] 211 | 212 | logging.debug('=' * 20) 213 | 214 | logging.debug('进行特征词选择..') 215 | logging.debug('-' * 20) 216 | sort_index = np.argsort(entroy)[-1::-1] 217 | vocabulary = np.asarray(vocabulary) 218 | # print ','.join(vocabulary[sort_index]) 219 | # print entroy[sort_index] 220 | logging.debug(u'期望交叉熵top 10:%s'%(','.join(vocabulary[sort_index[:10]]))) 221 | logging.debug('大小分别为:%s'%(entroy[sort_index[:10]])) 222 | 223 | logging.debug('-' * 20) 224 | keywords = vocabulary[sort_index[:config['max_keywords']]] 225 | 226 | logging.debug('选取%d个词作为关键词,实际为:%d个'%(config['max_keywords'],len(keywords))) 227 | # print('选取%d个词作为关键词,实际为:%d'%(config['max_keywords'],len(keywords))) 228 | logging.debug(u'关键词分别为(按权重大到小):%s'%(','.join(keywords))) 229 | # print(u'关键词分别为(按权重大到小):%s'%(','.join(keywords))) 230 | logging.debug('-' * 20) 231 | 232 | 233 | 234 | 235 | logging.debug('=' * 20) 236 | logging.debug('生成TFIDF特征向量...') 237 | # TFIDF 字典 238 | tfidf_vocabulary = {item:idx for idx,item in enumerate(keywords)} 239 | 240 | tfidf_vectorizer = TfidfVectorizer(analyzer="word", 241 | token_pattern=u'(?u)\\b\w+\\b', 242 | tokenizer=None, 243 | preprocessor=None, 244 | lowercase=False, 245 | stop_words=None, 246 | vocabulary = tfidf_vocabulary, 247 | max_features=config['max_keywords']) 248 | 249 | exam_bow_fea = tfidf_vectorizer.fit_transform(train_data['WORDS'].as_matrix()).toarray() 250 | print "test: ",len(exam_bow_fea) 251 | 252 | f = open("result.txt","w") 253 | dictionary = tfidf_vectorizer.get_feature_names() 254 | dictionary = [ (word) for word in dictionary] 255 | 256 | print "dictionary length: ",len(dictionary) 257 | print len(test_data['LABEL']) 258 | 259 | 260 | print >> f , (",".join(dictionary)) 261 | 262 | print >> f,"Loading wrod2vec file" 263 | model = Word2Vec.load('weibodata_vectorB.gem') 264 | 265 | print >> f,''.join(u'替换方法2: 直接找出词典中与之最相近的词:') #另一种替换,等待跑出的结果 266 | 267 | list = [] 268 | 269 | for sentences in test_data['WORDS']: 270 | temp = "" 271 | tempWord = "" #word不可以每次都改变 272 | sentence = sentences.split(" ") 273 | for word in sentence: #对于每一个单词 274 | if word not in dictionary: #如果word 不在词典之中 275 | #print ''.join(u'单词不在词典之中') 276 | if word == "!": #空格 则下一个 277 | # print ''.join(u'空格跳出循环') 278 | temp = temp + " " + word 279 | continue 280 | if word == "?": 281 | temp = temp + " " + word 282 | continue 283 | 284 | origin = 0 285 | count = 0 286 | for word_in_dict in dictionary : 287 | 288 | #print ''.join(u'开始计算不在tfidf字典中的单词与字典中单词的相近程度') 289 | #print "count: ",count 290 | if count > 20: 291 | break 292 | 293 | #print "尝试计算 " + word + " 与 " + word_in_dict + "的相似度" 294 | try: 295 | similar = abs(model.similarity(word, word_in_dict)) 296 | 297 | except Exception: 298 | # print word_in_dict + " 或 " + word + " 不在w2v字典,匹配下一个" 299 | count += 1 300 | continue 301 | print >> f,(word + " 与 " + word_in_dict + " 的相似度为:" + str(similar)) 302 | if similar > origin: 303 | origin = similar 304 | #print "Before: ",word 305 | #print "temp: 
",word_in_dict 306 | tempWord = word_in_dict #替换为词典中最相近的词 此时未覆盖原词语,使得最终结果相同 307 | #print word + " 被替换为: " + word_in_dict 308 | word = tempWord 309 | temp = temp + " " + word 310 | 311 | list.append(temp) 312 | 313 | print >> f,''.join("替换完成,开始计算tfidf:") 314 | 315 | test_data['WORDS'].to_csv("origin.csv") 316 | test_data['WORDS'] = list 317 | test_data['WORDS'].to_csv("final.csv") 318 | 319 | 320 | exam_bow_fea_test = tfidf_vectorizer.transform(test_data['WORDS'].as_matrix()).toarray() 321 | 322 | print len(exam_bow_fea_test) 323 | 324 | exam_bow_fea_target = train_data['LABEL'] 325 | print len(exam_bow_fea_target) 326 | 327 | exam_bow_fea_test_target = test_data['LABEL'] 328 | print len(exam_bow_fea_test_target) 329 | 330 | 331 | 332 | print '计算最大熵模型' 333 | 334 | 335 | print "Training MaxEnt" 336 | # rf = RandomForestClassifier(n_estimators=200) 337 | # clf = rf 338 | clf = LogisticRegression(multi_class="multinomial",solver="newton-cg") 339 | clf.fit(exam_bow_fea,exam_bow_fea_target) 340 | 341 | # print exam_bow_fea_test_target 342 | # print exam_bow_fea_test_data 343 | exam_bow_fea_test_target = test_data['LABEL'] 344 | print len(exam_bow_fea_test_target) 345 | print len(exam_bow_fea_test) 346 | print len(test_data) 347 | 348 | exam_bow_fea_test_target.to_csv("target_true.csv") 349 | print >> f,",".join(clf.predict(exam_bow_fea_test)) 350 | print >> f,sum(exam_bow_fea_test_target == clf.predict(exam_bow_fea_test)) 351 | print >> f,sum(exam_bow_fea_test_target == clf.predict(exam_bow_fea_test))/(len(test_data)*1.0) 352 | 353 | -------------------------------------------------------------------------------- /cnn.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import pandas as pd 3 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 4 | import pandas as pd 5 | import yaml 6 | from keras.utils import np_utils, generic_utils 7 | import pickle 8 | from keras.models import Sequential, Model 9 | from keras.layers import Embedding, Convolution2D, Input, Activation, MaxPooling2D, Reshape, Dropout, Dense, \ 10 | Flatten, Merge 11 | from keras.optimizers import SGD 12 | from keras.models import model_from_json 13 | from sklearn.preprocessing import OneHotEncoder 14 | import numpy as np; 15 | np.random.seed(1337) # for reproducibility 16 | import random 17 | 18 | # 19 | # config = yaml.load(file('config_my_cnn.yaml')) #读取yaml配置文件 20 | # config = config['OriginBow'] #以字典的方式读取2 21 | 22 | 23 | ''' 24 | cnn 结构可以随时更改 25 | 目前是 一个随机选取方案 的结果 26 | 27 | 28 | ''' 29 | 30 | 31 | 32 | nb_pool = [2,1] 33 | nb_classes = 24 34 | 35 | 36 | 37 | def onehotcoder(train_data,test_data): 38 | 39 | ''' 40 | 对应论文中的 seg编码 41 | :param train_data: 训练数据 42 | :param test_data: 测试数据 43 | :return: 44 | ''' 45 | 46 | vect = CountVectorizer() 47 | train_data_bow_fea = vect.fit_transform(train_data['WORDS']).toarray() 48 | # 规定4维输入,必须先转化[长度,1,宽度,1] 49 | test_data_bow_fea = vect.transform(test_data['WORDS']).toarray() 50 | 51 | length = len(vect.vocabulary_) 52 | values = [] 53 | for i in range(10): 54 | values.append(length) 55 | print len(values) 56 | code = OneHotEncoder(categorical_features=np.array([1,2,3,4,5,6,7,8,9,10]),n_values=values) #10个类别 每个类别有字典总数种可能 57 | train_feature = code.fit_transform(train_data_bow_fea).toarray() #编码 58 | test_feature = code.transform(test_data_bow_fea).toarray() 59 | 60 | # print "训练集:" 61 | # print "每个词的维度:",code.n_values_ 62 | 63 | train_onehot = [] 64 | # print "单词总数:",len(train_feature) 65 | # print "每行总长度", 
len(train_feature[1]) * 935 - 1 66 | 67 | for i in range(len(train_feature)): 68 | train_one_hot_col = [] 69 | t = 0 70 | while True: 71 | # print "剩下 " + str( (len( train_feature[i])) - t ) + " 维" 72 | if ((len( train_feature[i])) - t) < 0: 73 | break 74 | a = train_feature[i][t:t+935] 75 | t += 935 76 | b = train_feature[i][t:t+935] 77 | c = [a[m]+b[m] for m in range(min(len(a),len(b)))] #2区域内相加 78 | for k in c: 79 | train_one_hot_col.append(k) 80 | train_onehot.append(train_one_hot_col) 81 | 82 | print "最终维度:",len(train_onehot[0]) 83 | 84 | 85 | # print "测试集:" 86 | # print "每个词的维度:",code.n_values_ 87 | 88 | test_onehot = [] 89 | # print "单词总数:",len(test_feature) 90 | # print "每行总长度", len(test_feature[1]) * 935 - 1 91 | 92 | for i in range(len(test_feature)): 93 | test_one_hot_col = [] 94 | t = 0 95 | while True: 96 | # print "剩下 " + str( (len( test_feature[i])) - t ) + " 维" 97 | if ((len( test_feature[i])) - t) < 0: 98 | break 99 | a = test_feature[i][t:t+935] 100 | t += 935 101 | b = test_feature[i][t:t+935] 102 | c = [a[m]+b[m] for m in range(min(len(a),len(b)))] #2区域内相加 103 | for k in c: 104 | test_one_hot_col.append(k) 105 | test_onehot.append(test_one_hot_col) 106 | 107 | print "最终维度:",len(test_onehot[0]) 108 | print len(test_onehot) 109 | 110 | return train_onehot,test_onehot 111 | 112 | 113 | def build(layer1,layer2,hidden1,hidden2,length,width,lr=0.001 ,decay=1e-6,momentum=0.9): 114 | ''' 115 | 开始构建CNN网络 116 | :param layer1: 第一层网络 卷积核数量 117 | :param layer2: 第二层网络 卷积核数量 118 | :param hidden1: 第一个隐藏层网络 卷积核数量 119 | :param hidden2: 第二个隐藏层网络 卷积核数量 120 | :param length: 输入长度 121 | :param width: 输入宽度 122 | :param lr: 学习率 123 | :param decay: 学习率衰减 124 | :param momentum: 125 | :return: 搭建好的CNN模型 126 | ''' 127 | #16*5*1 128 | 129 | layer1_model1=Sequential() 130 | layer1_model1.add(Convolution2D(layer1, 2, 1, 131 | border_mode='valid', 132 | input_shape=(1, length, 1))) 133 | layer1_model1.add(Activation('tanh')) 134 | layer1_model1.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1]))) 135 | 136 | #16*10*1 137 | layer1_model2=Sequential() 138 | layer1_model2.add(Convolution2D(layer1, 4, 1, 139 | border_mode='valid', 140 | input_shape=(1, length, 1))) 141 | layer1_model2.add(Activation('tanh')) 142 | layer1_model2.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1]))) 143 | 144 | #16*20*1 145 | layer1_model3=Sequential() 146 | layer1_model3.add(Convolution2D(layer1, 6, 1, 147 | border_mode='valid', 148 | input_shape=(1, length, 1))) 149 | layer1_model3.add(Activation('tanh')) 150 | layer1_model3.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1]))) 151 | 152 | 153 | 154 | model = Sequential() 155 | 156 | model.add(Merge([layer1_model2,layer1_model1,layer1_model3], mode='concat',concat_axis=2))#merge 157 | 158 | 159 | 160 | model.add(Convolution2D(layer2,3,1))#layer2 32*5*1 161 | model.add(Activation('tanh')) 162 | model.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1]))) 163 | model.add(Dropout(0.25)) 164 | 165 | model.add(Flatten()) #平铺 166 | 167 | model.add(Dense(hidden1)) #Full connection 1: 1000 168 | model.add(Activation('tanh')) 169 | model.add(Dropout(0.5)) 170 | 171 | 172 | model.add(Dense(hidden2)) #Full connection 2: 200 173 | model.add(Activation('tanh')) 174 | model.add(Dropout(0.5)) 175 | 176 | model.add(Dense(nb_classes)) 177 | model.add(Activation('softmax')) 178 | 179 | sgd = SGD(lr=lr, decay=decay, momentum=momentum, nesterov=True) 180 | model.compile(loss='categorical_crossentropy', optimizer=sgd,metrics=["accuracy"]) 181 | 182 | #初始化应该在return 之前 183 | 184 | return model 
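
# Usage sketch (values taken from one of the configurations explored in __main__ below;
# X_train / y_train are hypothetical names for the arrays prepared there):
# the three parallel branches share the same input, so the same tensor is passed three
# times to fit/evaluate. `length` is the per-utterance feature dimension; `width` (the
# number of utterances) is accepted but not currently used inside build().
#
#   model = build(layer1=10, layer2=30, hidden1=1000, hidden2=300,
#                 length=sentence_length, width=sentence_width)
#   model.fit([X_train, X_train, X_train], y_train,
#             batch_size=32, nb_epoch=30, shuffle=True, verbose=1, validation_split=0)
#
# X_train must be reshaped to (n_samples, 1, length, 1) beforehand and y_train must be a
# 24-class one-hot matrix (np_utils.to_categorical), matching the preparation in __main__.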
185 | # 186 | 187 | def load_data(file_name): 188 | import csv 189 | 190 | csvfile = file(file_name, 'rb') 191 | reader = csv.reader(csvfile) 192 | 193 | label = [] 194 | data = [] 195 | for line in reader: 196 | label.append(line[0]) 197 | data.append(line[1:len(line)]) 198 | 199 | # print label 200 | # print data 201 | csvfile.close() 202 | return data,label 203 | 204 | 205 | if __name__ == '__main__': 206 | 207 | # 测试集""" 208 | train_data_bow_fea,train_data_label = load_data("v2.3_train_Sa_word_seg_i1_dev_830.csv") 209 | test_data_bow_fea,test_data_label = load_data("v2.3_train_Sa_word_seg_i1_val_76.csv") 210 | 211 | train_data_bow_fea_v1,train_data_label_v1 = load_data("v2.3_train_Sa_word_seg_i2_dev_555.csv") 212 | test_data_bow_fea_v1,test_data_label_v1 = load_data("v2.3_train_Sa_word_seg_i2_val_275.csv") 213 | # # 214 | 215 | 216 | train_data_bow_fea_v2,train_data_label_v2 = load_data("v2.3_train_Sa_word_seg_i3_dev_553.csv") 217 | test_data_bow_fea_v2,test_data_label_v2 = load_data("v2.3_train_Sa_word_seg_i3_val_277.csv") 218 | 219 | 220 | train_data_bow_fea_v3,train_data_label_v3 = load_data("v2.3_train_Sa_word_seg_i4_dev_552.csv") 221 | test_data_bow_fea_v3,test_data_label_v3 = load_data("v2.3_train_Sa_word_seg_i4_val_278.csv") 222 | 223 | 224 | sentence_width = len(train_data_bow_fea) 225 | sentence_length = len(train_data_bow_fea[1]) 226 | 227 | sentence_width_v1 = len(train_data_bow_fea_v1) 228 | sentence_length_v1 = len(train_data_bow_fea_v1[1]) 229 | 230 | sentence_width_v2 = len(train_data_bow_fea_v2) 231 | sentence_length_v2 = len(train_data_bow_fea_v2[1]) 232 | 233 | sentence_width_v3 = len(train_data_bow_fea_v3) 234 | sentence_length_v3 = len(train_data_bow_fea_v3[1]) 235 | 236 | print sentence_width 237 | print sentence_length 238 | 239 | train_data_bow_fea = np.array(train_data_bow_fea).reshape(len(train_data_bow_fea), 1, len(train_data_bow_fea[1]), 1) 240 | # 规定4维输入,必须先转化[长度,1,宽度,1] 241 | test_data_bow_fea = np.array(test_data_bow_fea).reshape(len(test_data_bow_fea), 1, len(test_data_bow_fea[1]), 1) 242 | 243 | 244 | train_data_bow_fea_v1 = np.array(train_data_bow_fea_v1).reshape(len(train_data_bow_fea_v1), 1, len(train_data_bow_fea_v1[1]), 1) 245 | # 规定4维输入,必须先转化[长度,1,宽度,1] 246 | test_data_bow_fea_v1 = np.array(test_data_bow_fea_v1).reshape(len(test_data_bow_fea_v1), 1, len(test_data_bow_fea_v1[1]), 1) 247 | 248 | 249 | train_data_bow_fea_v2 = np.array(train_data_bow_fea_v2).reshape(len(train_data_bow_fea_v2), 1, len(train_data_bow_fea_v2[1]), 1) 250 | # 规定4维输入,必须先转化[长度,1,宽度,1] 251 | test_data_bow_fea_v2 = np.array(test_data_bow_fea_v2).reshape(len(test_data_bow_fea_v2), 1, len(test_data_bow_fea_v2[1]), 1) 252 | 253 | train_data_bow_fea_v3 = np.array(train_data_bow_fea_v3).reshape(len(train_data_bow_fea_v3), 1, len(train_data_bow_fea_v3[1]), 1) 254 | # 规定4维输入,必须先转化[长度,1,宽度,1] 255 | test_data_bow_fea_v3 = np.array(test_data_bow_fea_v3).reshape(len(test_data_bow_fea_v3), 1, len(test_data_bow_fea_v3[1]), 1) 256 | #改造: 维度也卷积 257 | #改造: 参数改变等 258 | 259 | print '句子数:',sentence_width 260 | print '维度总数:',sentence_length 261 | 262 | label_train = train_data_label 263 | label_train = np_utils.to_categorical(label_train, 24) # 必须使用固定格式表示标签 264 | label_test = test_data_label 265 | label_test = np_utils.to_categorical(label_test, 24) # 必须使用固定格式表示标签 266 | 267 | 268 | 269 | label_train_v1 = train_data_label_v1 270 | label_train_v1 = np_utils.to_categorical(label_train_v1, 24) # 必须使用固定格式表示标签 271 | label_test_v1 = test_data_label_v1 272 | label_test_v1 = 
np_utils.to_categorical(label_test_v1, 24) # 必须使用固定格式表示标签 273 | 274 | label_train_v2 = train_data_label_v2 275 | label_train_v2 = np_utils.to_categorical(label_train_v2, 24) # 必须使用固定格式表示标签 276 | label_test_v2 = test_data_label_v2 277 | label_test_v2 = np_utils.to_categorical(label_test_v2, 24) # 必须使用固定格式表示标签 278 | 279 | label_train_v3 = train_data_label_v3 280 | label_train_v3 = np_utils.to_categorical(label_train_v3, 24) # 必须使用固定格式表示标签 281 | label_test_v3 = test_data_label_v3 282 | label_test_v3 = np_utils.to_categorical(label_test_v3, 24) # 必须使用固定格式表示标签 283 | 284 | 285 | # layer1_model1 = [10,9,11] 286 | # layer2_model = [30,31,29] 287 | # hidden1_model = [1000,980,1020] 288 | # hidden2_model = [100,80,120] 289 | # 290 | # c = 5 291 | 292 | # layer1_model1 = [5, 6, 4] 293 | # layer2_model = [30, 31, 29] 294 | # hidden1_model = [1000, 980, 1020] 295 | # hidden2_model = [450, 430, 470] 296 | # 297 | # c = 4 298 | 299 | # layer1_model1 = [10, 11, 9] 300 | # layer2_model = [30, 31, 29] 301 | # hidden1_model = [1000, 980, 1020] 302 | # hidden2_model = [450, 430, 470] 303 | # 304 | # c = 3 305 | 306 | # layer1_model1 = [10, 11, 9] 307 | # layer2_model = [15, 14, 16] 308 | # hidden1_model = [1000, 980, 1020] 309 | # hidden2_model = [300, 280, 320] 310 | # 311 | # c = 2 312 | 313 | layer1_model1 = [10, 11, 9] 314 | layer2_model = [30,31, 29] 315 | hidden1_model = [1000, 980, 1020] 316 | hidden2_model = [300, 280, 320] 317 | 318 | c = 1 319 | print c 320 | plan = [] 321 | for i in range(0, len( layer1_model1)): 322 | for j in range(0, len( layer2_model)): 323 | for k in range(0, len( layer2_model)): 324 | for m in range(0, len( layer2_model)): 325 | plan.append([layer1_model1[i],layer2_model[j],hidden1_model[k],hidden2_model[m]]) 326 | 327 | random.shuffle(plan) 328 | 329 | 330 | 331 | 332 | 333 | u = 0 334 | 335 | # for layer1 in layer1_model1: #4,6 336 | # for layer2 in layer2_model: #[6,8] 337 | # for hidden1 in hidden1_model: 338 | # for hidden2 in hidden2_model: 339 | for i in range(20): 340 | 341 | layer1 = plan[i][0] 342 | layer2 = plan[i][1] 343 | hidden1 = plan[i][2] 344 | hidden2 = plan[i][3] 345 | 346 | f = open('result.txt','a') 347 | 348 | print 'layer1: ', layer1 349 | print 'layer2: ', layer2 350 | print 'hidden1: ', hidden1 351 | print 'hidden2: ', hidden2 352 | 353 | print >> f, 'layer1: ', layer1 354 | print >> f,'layer2: ', layer2 355 | print >> f,'hidden1: ', hidden1 356 | print >> f,'hidden2: ', hidden2 357 | 358 | 359 | 360 | #不同卷积核意味着不同权值 361 | 362 | model = build( layer1,layer2,hidden1,hidden2,sentence_length,sentence_width) 363 | 364 | 365 | model.fit([train_data_bow_fea,train_data_bow_fea,train_data_bow_fea],label_train, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0) 366 | 367 | 368 | print '测试准确率:' 369 | print model.metrics_names 370 | print model.evaluate([test_data_bow_fea,test_data_bow_fea,test_data_bow_fea],label_test,show_accuracy=True) 371 | 372 | print >> f,'测试准确率:' 373 | print >> f,model.metrics_names 374 | print >> f,model.evaluate([test_data_bow_fea, test_data_bow_fea, test_data_bow_fea], label_test, show_accuracy=True) 375 | 376 | acc = model.evaluate([test_data_bow_fea,test_data_bow_fea,test_data_bow_fea],label_test,show_accuracy=True)[1] 377 | 378 | 379 | 380 | #v1 381 | model = build( layer1,layer2,hidden1,hidden2,sentence_length_v1,sentence_width_v1) 382 | model.fit([train_data_bow_fea_v1,train_data_bow_fea_v1,train_data_bow_fea_v1],label_train_v1, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0) 383 | acc_v1 = 
model.evaluate([test_data_bow_fea_v1,test_data_bow_fea_v1,test_data_bow_fea_v1],label_test_v1,show_accuracy=True)[1] 384 | #v2 385 | model = build( layer1,layer2,hidden1,hidden2,sentence_length_v2,sentence_length_v2) 386 | model.fit([train_data_bow_fea_v2,train_data_bow_fea_v2,train_data_bow_fea_v2],label_train_v2, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0) 387 | acc_v2 = model.evaluate([test_data_bow_fea_v2,test_data_bow_fea_v2,test_data_bow_fea_v2],label_test_v2,show_accuracy=True)[1] 388 | 389 | #v3 390 | model = build( layer1,layer2,hidden1,hidden2,sentence_length_v3,sentence_length_v3) 391 | model.fit([train_data_bow_fea_v3,train_data_bow_fea_v3,train_data_bow_fea_v3],label_train_v3, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0) 392 | acc_v3 = model.evaluate([test_data_bow_fea_v3,test_data_bow_fea_v3,test_data_bow_fea_v3],label_test_v3,show_accuracy=True)[1] 393 | 394 | 395 | import csv 396 | 397 | csvfile = file('result_word&charact_Best' + str(c) + '_Random.csv', 'a') 398 | writer = csv.writer(csvfile) 399 | if u == 0: 400 | writer.writerow(['layer1', 'layer2', 'hidden1','hidden2','val_acc','test_acc']) 401 | u += 1 402 | 403 | data = [ 404 | (layer1, layer2, hidden1,hidden2,acc,((acc_v1 + acc_v2 + acc_v3)/(3*1.0)) ) 405 | ] 406 | writer.writerows(data) 407 | csvfile.close() 408 | 409 | # import csv 410 | # 411 | # csvfile = file('result_word.csv', 'a') 412 | # writer = csv.writer(csvfile) 413 | # if u == 0: 414 | # writer.writerow(['layer1', 'layer2', 'hidden1','hidden2','val_acc','test_acc']) 415 | # u += 1 416 | # 417 | # data = [ 418 | # (layer1, layer2, hidden1,hidden2,"",model.evaluate([test_data_bow_fea,test_data_bow_fea,test_data_bow_fea],label_test,show_accuracy=True)[1]), 419 | # ] 420 | # writer.writerows(data) 421 | # csvfile.close() 422 | 423 | f.close() -------------------------------------------------------------------------------- /.idea/workspace.xml: --------------------------------------------------------------------------------
[IDE workspace settings; the XML content of this file is not preserved in this export]
--------------------------------------------------------------------------------