├── .gitignore ├── Count-basedClassifier ├── knnClassifier.py ├── ldaClassifier.py ├── lda_svmClassifier.py ├── level1basedClassifier.py ├── mixingClassifier.py ├── naiveBayesClassifier.py └── svmClassifier.py ├── NNClassifier ├── lstmModel.py └── train.py ├── README.md ├── Utils.py ├── bestResults ├── mix-level1-4-0.7725.xls └── mix-level2-6-0.6576.xls ├── dataCleaner.py └── getAllTags.py /.gitignore: -------------------------------------------------------------------------------- 1 | ./data 2 | ./trainedModel 3 | ./validAPI.py 4 | ./logs -------------------------------------------------------------------------------- /Count-basedClassifier/knnClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.feature_extraction.text import TfidfTransformer 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.neighbors import KNeighborsClassifier 8 | import xlwt 9 | 10 | # **************************************参数设置******************************************** # 11 | TIMES = 10 # 训练的模型个数 12 | datapath = '../data/alldata(onlyEng-fixed12).pkl' # 数据源位置 13 | TAG_LEVEL = 2 # 分类级别 14 | # ***************************************************************************************** # 15 | 16 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 17 | 18 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) 19 | 20 | rawdata = list() 21 | 22 | for i in range(1, rawDataRows): # 跳过表头 23 | r = rawData.row_values(i) 24 | rawText = r[30] 25 | text = u.replaceAllSymbols(r[30]) 26 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 27 | if u.checkOnlyContainEnglish(text): # 6152 rows, maxlen=572, minlen=2 28 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 29 | 30 | 31 | # 按比例得到训练集、测试集 32 | def divideData(rawdata, data, tag, trate): 33 | nsamples = len(data) 34 | 35 | sidx = np.random.permutation(nsamples) 36 | ntrain = int(np.round(nsamples * (1 - trate))) 37 | 38 | train_data = [data[s] for s in sidx[:ntrain]] 39 | train_tag = [tag[s] for s in sidx[:ntrain]] 40 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 41 | test_data = [data[s] for s in sidx[ntrain:]] 42 | test_tag = [tag[s] for s in sidx[ntrain:]] 43 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 44 | 45 | return train_data, train_tag, train_raw, test_data, test_tag, test_raw 46 | 47 | 48 | data = u.loadPickle(datapath) 49 | rawdialogue = list() 50 | content = list() 51 | tag = list() 52 | for i, each in enumerate(data): # 一级分类样本数5889 二级分类5887 53 | if each[TAG_LEVEL].strip() == '': 54 | continue 55 | else: 56 | rawdialogue.append(rawdata[i]) 57 | content.append(each[0]) 58 | tag.append(each[TAG_LEVEL]) 59 | 60 | print len(rawdialogue) 61 | print len(content) 62 | print len(tag) 63 | for i in range(TIMES): 64 | train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2) 65 | 66 | vectorizer = CountVectorizer(encoding='unicode', stop_words='english') 67 | tfidftransformer = TfidfTransformer() 68 | knn = KNeighborsClassifier(n_neighbors=5) 69 | 70 | text_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidftransformer), ('clf', knn)]) 71 | text_clf = text_clf.fit(train_content, train_tag) 72 | pred = text_clf.predict(test_content) 73 | acc = np.round(np.mean(pred == test_tag), 4) 74 | # p= text_clf.predict_proba(test_content) 75 | # 
predicted=np.argmax(p,axis=1) 76 | # acc = np.round(np.mean(text_clf.classes_[predicted] == test_tag), 4) 77 | print 'KNN分类器的准确率: %.4f' % acc 78 | ''' 79 | modelname = 'svm-level%d-%d-%.4f' % (TAG_LEVEL, i, acc) 80 | u.saveAsPickle(text_clf, '../trainedModel/knn/%s.pkl' % modelname) 81 | u.saveModelAcc2txt(modelname, acc, '../logs/knn-model-acc.txt') 82 | 83 | 84 | outpath = '../results/svm/%s.xls' % modelname 85 | workbook = xlwt.Workbook(encoding='utf8') 86 | worksheet = workbook.add_sheet('实验结果') 87 | 88 | for i, each in enumerate(text_clf.predict(test_content)): 89 | worksheet.write(i, 0, test_raw[i]) # 原始文本 90 | worksheet.write(i, 1, test_content[i]) # 处理过的文本 91 | worksheet.write(i, 2, test_tag[i]) # 原始标签 92 | worksheet.write(i, 3, each) # 预测标签 93 | 94 | workbook.save(outpath) 95 | ''' 96 | -------------------------------------------------------------------------------- /Count-basedClassifier/ldaClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.decomposition import LatentDirichletAllocation as LDA 6 | from collections import Counter 7 | import xlwt 8 | 9 | # **************************************参数设置******************************************** # 10 | TIMES = 1 # 训练的模型个数 11 | datapath = '../data/alldata(onlyEng-fixed12).pkl' # 数据源位置 12 | TAG_LEVEL = 1 # 分类级别(5 60) 13 | N_FEATURES = 500 14 | N_TOPICS = 5 15 | # ***************************************************************************************** # 16 | 17 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 18 | 19 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) # 9825 rows, 47 cols 20 | 21 | rawdata = list() 22 | 23 | for i in range(1, rawDataRows): # 跳过表头 24 | r = rawData.row_values(i) 25 | rawText = r[30] 26 | text = u.replaceAllSymbols(r[30]) 27 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 28 | if u.checkOnlyContainEnglish(text): # 6152 rows, maxlen=572, minlen=2 29 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 30 | 31 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 32 | outpath = '../results/bestResult-%d.xls' % TAG_LEVEL 33 | 34 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) # 9825 rows, 47 cols 35 | 36 | rawdata = list() 37 | 38 | for i in range(1, rawDataRows): # 跳过表头 39 | r = rawData.row_values(i) 40 | rawText = r[30] 41 | text = u.replaceAllSymbols(r[30]) 42 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 43 | if u.checkOnlyContainEnglish(text): # 7173 rows, maxlen=572, minlen=2 44 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 45 | 46 | 47 | # 按比例得到训练集、测试集 48 | def divideData(rawdata, data, tag, trate): 49 | nsamples = len(data) 50 | 51 | sidx = np.random.permutation(nsamples) 52 | ntrain = int(np.round(nsamples * (1 - trate))) 53 | 54 | train_data = [data[s] for s in sidx[:ntrain]] 55 | train_tag = [tag[s] for s in sidx[:ntrain]] 56 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 57 | test_data = [data[s] for s in sidx[ntrain:]] 58 | test_tag = [tag[s] for s in sidx[ntrain:]] 59 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 60 | 61 | return train_data, train_tag, train_raw, test_data, test_tag, test_raw 62 | 63 | 64 | data = u.loadPickle(datapath) 65 | 66 | rawdialogue = list() 67 | content = list() 68 | tag = list() 69 | for i, each in 
enumerate(data): # 一级分类样本数5889 二级分类5887 70 | if each[TAG_LEVEL].strip() == '': 71 | continue 72 | else: 73 | rawdialogue.append(rawdata[i]) 74 | content.append(each[0]) 75 | tag.append(each[TAG_LEVEL]) 76 | 77 | for i in range(TIMES): 78 | train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2) 79 | # 得到单词-文档共现矩阵 80 | vectorizer = CountVectorizer(encoding='unicode', stop_words='english', max_features=N_FEATURES) 81 | 82 | train_data = vectorizer.fit_transform(train_content) 83 | 84 | train_tag = np.array(train_tag) 85 | 86 | test_data = vectorizer.fit_transform(test_content) # [n_samples, n_features] 87 | 88 | model = LDA(n_topics=N_TOPICS, max_iter=5, batch_size=128) 89 | model.fit(train_data) 90 | 91 | train_data_distr = model.transform(train_data) 92 | pred_tag = train_data_distr.argmax(axis=1) 93 | 94 | # 投票 95 | id2class = dict() 96 | for idx in range(N_TOPICS): 97 | idxs = np.where(pred_tag == idx)[0] 98 | # print Counter(train_tag[idxs]) 99 | id2class[idx] = Counter(train_tag[idxs]).most_common(1)[0][0] 100 | print id2class 101 | doc_topic_distr = model.transform(test_data) # [n_samples, n_topics] 102 | class_id = doc_topic_distr.argmax(axis=1) 103 | pred = [id2class[each] for each in class_id] 104 | pred=np.array(pred) 105 | test_tag=np.array(test_tag) 106 | acc=np.mean(pred==test_tag) 107 | print 'LDA分类器的准确率: %.4f' % acc 108 | """ 109 | modelname = 'lda-level%d-%d-%.4f' % (TAG_LEVEL, i, acc) 110 | u.saveAsPickle(text_clf, '../trainedModel/lda/%s.pkl' % modelname) 111 | u.saveModelAcc2txt(modelname, acc, '../logs/lda-model-acc.txt') 112 | 113 | outpath = '../results/lda/%s.xls' % modelname 114 | workbook = xlwt.Workbook(encoding='utf8') 115 | worksheet = workbook.add_sheet('实验结果') 116 | 117 | for i, each in enumerate(text_clf.predict(test_content)): 118 | worksheet.write(i, 0, test_raw[i]) # 原始文本 119 | worksheet.write(i, 1, test_content[i]) # 处理过的文本 120 | worksheet.write(i, 2, test_tag[i]) # 原始标签 121 | worksheet.write(i, 3, each) # 预测标签 122 | 123 | workbook.save(outpath) 124 | """ 125 | -------------------------------------------------------------------------------- /Count-basedClassifier/lda_svmClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.decomposition import LatentDirichletAllocation as LDA 6 | from sklearn.svm import SVC 7 | 8 | # **************************************参数设置******************************************** # 9 | TIMES = 10 # 训练的模型个数 10 | datapath = '../data/alldata(onlyEng-fixed12).pkl' # 数据源位置 11 | TAG_LEVEL = 1 # 分类级别(5 60) 12 | N_FEATURES = 500 13 | N_TOPICS = 100 14 | # ***************************************************************************************** # 15 | 16 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 17 | 18 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) # 9825 rows, 47 cols 19 | 20 | rawdata = list() 21 | 22 | for i in range(1, rawDataRows): # 跳过表头 23 | r = rawData.row_values(i) 24 | rawText = r[30] 25 | text = u.replaceAllSymbols(r[30]) 26 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 27 | if u.checkOnlyContainEnglish(text): # 6152 rows, maxlen=572, minlen=2 28 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 29 | 30 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 31 | outpath = 
'../results/bestResult-%d.xls' % TAG_LEVEL 32 | 33 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) # 9825 rows, 47 cols 34 | 35 | rawdata = list() 36 | 37 | for i in range(1, rawDataRows): # 跳过表头 38 | r = rawData.row_values(i) 39 | rawText = r[30] 40 | text = u.replaceAllSymbols(r[30]) 41 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 42 | if u.checkOnlyContainEnglish(text): # 7173 rows, maxlen=572, minlen=2 43 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 44 | 45 | 46 | # 按比例得到训练集、测试集 47 | def divideData(rawdata, data, tag, trate): 48 | nsamples = len(data) 49 | 50 | sidx = np.random.permutation(nsamples) 51 | ntrain = int(np.round(nsamples * (1 - trate))) 52 | 53 | train_data = [data[s] for s in sidx[:ntrain]] 54 | train_tag = [tag[s] for s in sidx[:ntrain]] 55 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 56 | test_data = [data[s] for s in sidx[ntrain:]] 57 | test_tag = [tag[s] for s in sidx[ntrain:]] 58 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 59 | 60 | return train_data, train_tag, train_raw, test_data, test_tag, test_raw 61 | 62 | 63 | data = u.loadPickle(datapath) 64 | 65 | rawdialogue = list() 66 | content = list() 67 | tag = list() 68 | 69 | for i, each in enumerate(data): # 一级分类样本数5889 二级分类5887 70 | if each[TAG_LEVEL].strip() == '': 71 | continue 72 | else: 73 | rawdialogue.append(rawdata[i]) 74 | content.append(each[0]) 75 | tag.append(each[TAG_LEVEL]) 76 | total_acc = 0 77 | for i in range(TIMES): 78 | train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2) 79 | # 得到单词-文档共现矩阵 80 | vectorizer = CountVectorizer(encoding='unicode', stop_words='english', max_features=N_FEATURES) 81 | 82 | train_data = vectorizer.fit_transform(train_content) 83 | test_data = vectorizer.fit_transform(test_content) # [n_samples, n_features] 84 | 85 | model = LDA(n_topics=N_TOPICS, batch_size=64) 86 | model.fit(train_data) 87 | 88 | dt_matrix = model.transform(train_data) 89 | test_dt_matrix = model.transform(test_data) 90 | svc = SVC(C=0.99, kernel='linear') 91 | 92 | svc = svc.fit(dt_matrix, train_tag) 93 | pred = svc.predict(test_dt_matrix) 94 | acc = np.round(np.mean(pred == test_tag), 4) 95 | total_acc += acc 96 | print 'LDA分类器的准确率: %.4f' % acc 97 | print 'average accuary: ', total_acc / TIMES 98 | -------------------------------------------------------------------------------- /Count-basedClassifier/level1basedClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.feature_extraction.text import TfidfTransformer 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.svm import SVC 8 | import xlwt 9 | 10 | # **************************************参数设置******************************************** # 11 | TIMES = 10 # 训练的模型个数 12 | datapath = '../data/alldata(onlyEng-fixed12).pkl' # 数据源位置 13 | tag12path = '../data/tag12.pkl' # 一二级字典 14 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' # 原始文件 15 | TAG_LEVEL = 2 # 分类级别 16 | # ***************************************************************************************** # 17 | 18 | 19 | # 按比例得到训练集、测试集 20 | def divideData(rawdata, data, tag, tag2, trate): 21 | nsamples = len(data) 22 | 23 | sidx = np.random.permutation(nsamples) 24 | ntrain = int(np.round(nsamples * (1 - trate))) 25 | 26 | train_data = 
[data[s] for s in sidx[:ntrain]] 27 | train_tag = [tag[s] for s in sidx[:ntrain]] 28 | train_tag2 = [tag2[s] for s in sidx[:ntrain]] 29 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 30 | 31 | test_data = [data[s] for s in sidx[ntrain:]] 32 | test_tag = [tag[s] for s in sidx[ntrain:]] 33 | test_tag2 = [tag2[s] for s in sidx[ntrain:]] 34 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 35 | 36 | return train_data, train_tag, train_tag2, train_raw, test_data, test_tag, test_tag2, test_raw 37 | 38 | 39 | # 得到一级分类每个类别对应的所有小类在二级分类标签数组的索引值 40 | def getLevel2ClassIndex(c1, c2, tag12): 41 | all_tags = list() 42 | for each in c1: 43 | tags = list() 44 | for tag in tag12[each]: 45 | i = np.argwhere(c2 == tag) 46 | if len(i) != 0: # 处理字典中有的分类在实际样本没有出现的情况 47 | tags.append(i[0][0]) 48 | all_tags.append(tags) 49 | return all_tags 50 | 51 | # 根据一级分类结果调整二级分类概率分布 52 | # 调整了概率分布的指导方式p2'=p2*(1+p1) 53 | def adjustProba(p1,p2,tag22ix): 54 | for ir, prob1 in enumerate(p1): 55 | for it, each in enumerate(tag22ix): 56 | p2[ir][each] += p2[ir][each] * prob1[it] 57 | return p2 58 | 59 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) 60 | data = u.loadPickle(datapath) 61 | tag12 = u.loadPickle(tag12path) 62 | 63 | rawdata = list() 64 | 65 | for i in range(1, rawDataRows): # 跳过表头 66 | r = rawData.row_values(i) 67 | rawText = r[30] 68 | text = u.replaceAllSymbols(r[30]) 69 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 70 | if u.checkOnlyContainEnglish(text): 71 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 72 | 73 | 74 | rawdialogue = list() #原始文本 75 | content = list() #处理后文本 76 | tag = list() #一级标签 77 | tag2 = list() #二级标签 78 | 79 | for i, each in enumerate(data): # 一级分类样本数5889 二级分类5887 80 | if each[1].strip() == '' or each[2].strip() == '': 81 | continue 82 | else: 83 | rawdialogue.append(rawdata[i]) 84 | content.append(each[0]) 85 | tag.append(each[1]) 86 | tag2.append(each[2]) 87 | 88 | for i in range(TIMES): 89 | train_content, train_tag, train_tag2, train_raw, test_content, test_tag, test_tag2, test_raw = divideData( 90 | rawdialogue, content, tag, tag2, 0.2) 91 | 92 | v_1 = CountVectorizer(encoding='unicode', stop_words='english') 93 | t_1 = TfidfTransformer() 94 | svc_1 = SVC(probability=True, C=0.99, kernel='linear') 95 | 96 | v_2 = CountVectorizer(encoding='unicode', stop_words='english') 97 | t_2 = TfidfTransformer() 98 | svc_2 = SVC(probability=True, C=0.99, kernel='linear') 99 | 100 | # 一级分类器 101 | text_clf_1 = Pipeline( 102 | [('vect', v_1), ('tfidf', t_1), ('clf', svc_1)]) 103 | # 二级分类器 104 | text_clf_2 = Pipeline( 105 | [('vect2', v_2), ('tfidf2', t_2), ('clf2', svc_2)]) 106 | 107 | # 独立训练一二级分类器 108 | text_clf_1 = text_clf_1.fit(train_content, train_tag) 109 | text_clf_2 = text_clf_2.fit(train_content, train_tag2) 110 | 111 | c1 = text_clf_1.classes_ # 一级分类类别集合 112 | c2 = text_clf_2.classes_ # 二级分类类别集合 113 | 114 | p1 = text_clf_1.predict_proba(test_content) # 一级分类概率分布 115 | p2 = text_clf_2.predict_proba(test_content) # 二级分类概率分布 116 | 117 | tag22ix = getLevel2ClassIndex(c1, c2, tag12) 118 | 119 | # 根据训练好的一级分类器分类结果,将二级分类器分类结果对应大类下的小类概率进行修正 120 | # 例: 121 | # 一级分类:A,B,C,预测结果为 0.2, 0.3,0.5 122 | # 二级分类:a1,a2,b1,b2,c1,二级分类器预测结果为 0.05, 0.15, 0.1, 0.3, 0.4 123 | # 根据一级分类结果,将二级分类结果修正为 0.05*0.2, 0.15*0.2, 0.1*0.3, 0.3*0.3, 0.4*0.5 124 | # 修正的方式还需要考虑,该方式结果不好 125 | p2=adjustProba(p1,p2,tag22ix) 126 | predix2 = np.argmax(p2, axis=1) 127 | acc = np.mean(c2[predix2] == test_tag2) 128 | 129 | print 'SVM分类器的准确率: %.4f' % acc 
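    # Note on the adjustment used above: adjustProba implements the softer update
    # p2' = p2 * (1 + p1), where p1 is the probability of the parent level-1 class.
    # The stricter variant p2' = p2 * p1 (multiplying each level-2 probability directly
    # by its parent's level-1 probability, the first scheme described in the README)
    # scored noticeably lower on level-2 accuracy. Roughly, that variant would replace
    # the body of adjustProba with:
    #     for ir, prob1 in enumerate(p1):
    #         for it, ixs in enumerate(tag22ix):
    #             p2[ir][ixs] *= prob1[it]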
130 | modelname = '1based-svm-%d' % i 131 | modelname1 = 'svm1-level%d-%d-%.4f' % (TAG_LEVEL, i, acc) 132 | modelname2 = 'svm2-level%d-%d-%.4f' % (TAG_LEVEL, i, acc) 133 | u.saveAsPickle(text_clf_1, '../trainedModel/1based-svm/%s.pkl' % modelname1) 134 | u.saveAsPickle(text_clf_2, '../trainedModel/1based-svm/%s.pkl' % modelname2) 135 | 136 | u.saveModelAcc2txt(modelname, acc, '../logs/svm-model-acc.txt') 137 | 138 | outpath = '../results/svm/%s.xls' % modelname 139 | workbook = xlwt.Workbook(encoding='utf8') 140 | worksheet = workbook.add_sheet('实验结果') 141 | 142 | for i, each in enumerate(c2[predix2]): 143 | worksheet.write(i, 0, test_raw[i]) # 原始文本 144 | worksheet.write(i, 1, test_content[i]) # 处理过的文本 145 | worksheet.write(i, 2, test_tag[i]) # 原始标签 146 | worksheet.write(i, 3, each) # 预测标签 147 | 148 | workbook.save(outpath) 149 | 150 | -------------------------------------------------------------------------------- /Count-basedClassifier/mixingClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.feature_extraction.text import TfidfTransformer 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.svm import SVC 8 | from sklearn.neighbors import KNeighborsClassifier 9 | from sklearn.naive_bayes import MultinomialNB 10 | from collections import Counter 11 | import xlwt 12 | 13 | # **************************************参数设置******************************************** # 14 | TIMES = 10 15 | datapath = '../data/alldata(onlyEng-fixed12).pkl' 16 | TAG_LEVEL = 2 17 | # ***************************************************************************************** # 18 | 19 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 20 | 21 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) 22 | 23 | rawdata = list() 24 | 25 | for i in range(1, rawDataRows): 26 | r = rawData.row_values(i) 27 | rawText = r[30] 28 | text = u.replaceAllSymbols(r[30]) 29 | label1, label2, label3 = r[42], r[43], r[44] 30 | if u.checkOnlyContainEnglish(text): 31 | rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 32 | 33 | 34 | def divideData(rawdata, data, tag, trate): 35 | nsamples = len(data) 36 | 37 | sidx = np.random.permutation(nsamples) 38 | ntrain = int(np.round(nsamples * (1 - trate))) 39 | 40 | train_data = [data[s] for s in sidx[:ntrain]] 41 | train_tag = [tag[s] for s in sidx[:ntrain]] 42 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 43 | test_data = [data[s] for s in sidx[ntrain:]] 44 | test_tag = [tag[s] for s in sidx[ntrain:]] 45 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 46 | 47 | return train_data, train_tag, train_raw, test_data, test_tag, test_raw 48 | 49 | 50 | def vote(mixRes): 51 | finalRes = list() 52 | for i in range(len(mixRes[0])): 53 | res = list() 54 | for j in range(len(mixRes)): 55 | res.append(mixRes[j][i]) 56 | temp = Counter(res).most_common(1)[0] 57 | name = temp[0] 58 | num = temp[1] 59 | if num > len(mixRes) / 2: 60 | finalRes.append(name) 61 | else: 62 | finalRes.append(res[0]) 63 | finalRes = np.array(finalRes) 64 | return finalRes 65 | 66 | 67 | data = u.loadPickle(datapath) 68 | rawdialogue = list() 69 | content = list() 70 | tag = list() 71 | for i, each in enumerate(data): 72 | if each[TAG_LEVEL].strip() == '': 73 | continue 74 | else: 75 | rawdialogue.append(rawdata[i]) 76 | content.append(each[0]) 77 | 
tag.append(each[TAG_LEVEL]) 78 | print 'num of classes: %d' % len(dict(Counter(tag))) 79 | total_acc = 0 80 | 81 | for time in range(TIMES): 82 | train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2) 83 | 84 | vectorizer = CountVectorizer(encoding='unicode', stop_words='english') 85 | tfidftransformer = TfidfTransformer() 86 | 87 | svm_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidftransformer), ('clf', SVC(C=0.99, kernel='linear'))]) 88 | svm_clf = svm_clf.fit(train_content, train_tag) 89 | svm_pred = svm_clf.predict(test_content) 90 | svm_acc = np.mean(svm_pred == test_tag) 91 | 92 | knn = KNeighborsClassifier(n_neighbors=5) 93 | 94 | knn_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidftransformer), ('clf', knn)]) 95 | knn_clf = knn_clf.fit(train_content, train_tag) 96 | knn_pred = knn_clf.predict(test_content) 97 | knn_acc = np.mean(knn_pred == test_tag) 98 | 99 | bayes_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidftransformer), ('clf', MultinomialNB())]) 100 | bayes_clf = bayes_clf.fit(train_content, train_tag) 101 | bayes_pred = bayes_clf.predict(test_content) 102 | bayes_acc = np.mean(bayes_pred == test_tag) 103 | 104 | final_pred = vote([svm_pred, knn_pred, bayes_pred]) 105 | final_acc = np.mean(final_pred == test_tag) 106 | total_acc += final_acc 107 | 108 | modelname = 'mix-level%d-%d-%.4f' % (TAG_LEVEL, time, final_acc) 109 | u.saveAsPickle(svm_clf, '../trainedModel/mix/svm-%s.pkl' % modelname) 110 | u.saveAsPickle(knn_clf, '../trainedModel/mix/knn-%s.pkl' % modelname) 111 | u.saveAsPickle(bayes_clf, '../trainedModel/mix/bayes-%s.pkl' % modelname) 112 | u.saveModelAcc2txt(modelname, final_acc, '../logs/mix-model-acc.txt') 113 | 114 | outpath = '../results/mix/%s.xls' % modelname 115 | workbook = xlwt.Workbook(encoding='utf8') 116 | worksheet = workbook.add_sheet('实验结果') 117 | 118 | for i, each in enumerate(final_pred): 119 | worksheet.write(i, 0, test_raw[i]) 120 | worksheet.write(i, 1, test_content[i]) 121 | worksheet.write(i, 2, test_tag[i]) 122 | worksheet.write(i, 3, each) 123 | 124 | workbook.save(outpath) 125 | 126 | print 'average acc: %.4f' % (total_acc / TIMES) 127 | -------------------------------------------------------------------------------- /Count-basedClassifier/naiveBayesClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.feature_extraction.text import TfidfTransformer 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.naive_bayes import MultinomialNB 8 | import xlwt 9 | 10 | # **************************************参数设置********************************************* # 11 | TIMES = 10 # 训练的模型个数 12 | datapath = '../data/alldata(onlyEng-fixed12).pkl' # 数据源位置 13 | TAG_LEVEL = 1 # 分类级别 14 | # ******************************************************************************************* # 15 | 16 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 17 | 18 | 19 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) # 9825 rows, 47 cols 20 | 21 | rawdata = list() 22 | 23 | for i in range(1, rawDataRows): # 跳过表头 24 | r = rawData.row_values(i) 25 | rawText = r[30] 26 | text = u.replaceAllSymbols(r[30]) 27 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 28 | if u.checkOnlyContainEnglish(text): # 6152 rows, maxlen=572, minlen=2 29 | 
rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 30 | 31 | 32 | # 按比例得到训练集、测试集 33 | def divideData(rawdata, data, tag, trate): 34 | nsamples = len(data) 35 | 36 | sidx = np.random.permutation(nsamples) 37 | ntrain = int(np.round(nsamples * (1 - trate))) 38 | 39 | train_data = [data[s] for s in sidx[:ntrain]] 40 | train_tag = [tag[s] for s in sidx[:ntrain]] 41 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 42 | test_data = [data[s] for s in sidx[ntrain:]] 43 | test_tag = [tag[s] for s in sidx[ntrain:]] 44 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 45 | 46 | return train_data, train_tag, train_raw, test_data, test_tag, test_raw 47 | 48 | 49 | data = u.loadPickle(datapath) 50 | rawdialogue = list() 51 | content = list() 52 | tag = list() 53 | for i, each in enumerate(data): # 5889 5887 54 | if each[TAG_LEVEL].strip() == '': 55 | continue 56 | else: 57 | rawdialogue.append(rawdata[i]) 58 | content.append(each[0]) 59 | tag.append(each[TAG_LEVEL]) 60 | 61 | for i in range(TIMES): 62 | train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2) 63 | 64 | vectorizer = CountVectorizer(encoding='unicode', stop_words='english') 65 | tfidftransformer = TfidfTransformer() 66 | 67 | text_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidftransformer), ('clf', MultinomialNB())]) 68 | text_clf = text_clf.fit(train_content, train_tag) 69 | predicted = text_clf.predict(test_content) 70 | acc = np.round(np.mean(predicted == test_tag), 4) 71 | print 'Bayes分类器的准确率: %.4f' % acc 72 | modelname = 'bayes-level%d-%d-%.4f' % (TAG_LEVEL, i, acc) 73 | u.saveAsPickle(text_clf, '../trainedModel/bayes/%s.pkl' % modelname) 74 | u.saveModelAcc2txt(modelname, acc, '../logs/bayes-model-acc.txt') 75 | 76 | outpath = '../results/bayes/%s.xls' % modelname 77 | workbook = xlwt.Workbook(encoding='utf8') 78 | worksheet = workbook.add_sheet('实验结果') 79 | 80 | for i, each in enumerate(text_clf.predict(test_content)): 81 | worksheet.write(i, 0, test_raw[i]) # 原始文本 82 | worksheet.write(i, 1, test_content[i]) # 处理过的文本 83 | worksheet.write(i, 2, test_tag[i]) # 原始标签 84 | worksheet.write(i, 3, each) # 预测标签 85 | 86 | workbook.save(outpath) 87 | -------------------------------------------------------------------------------- /Count-basedClassifier/svmClassifier.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import numpy as np 3 | import Utils as u 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.feature_extraction.text import TfidfTransformer 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.svm import SVC 8 | import xlwt 9 | 10 | # **************************************参数设置******************************************** # 11 | TIMES = 10 # 训练的模型个数 12 | datapath = '../data/alldata(onlyEng-fixed12).pkl' # 数据源位置 13 | TAG_LEVEL = 2 # 分类级别 14 | # ***************************************************************************************** # 15 | 16 | dataPath = r'../data/Berlinetta MLK 9.06.xlsx' 17 | 18 | 19 | rawData, rawDataRows, rawDataCols = u.openExcel(dataPath=dataPath, index=1) # 9825 rows, 47 cols 20 | 21 | rawdata = list() 22 | 23 | for i in range(1, rawDataRows): # 跳过表头 24 | r = rawData.row_values(i) 25 | rawText = r[30] 26 | text = u.replaceAllSymbols(r[30]) 27 | label1, label2, label3 = r[42], r[43], r[44] # 获取对话内容以及对应的三级分类 28 | if u.checkOnlyContainEnglish(text): # 6152 rows, maxlen=572, minlen=2 29 | 
rawdata.append([rawText.strip(), label1.lower(), label2.lower(), label3.lower()]) 30 | 31 | 32 | # 按比例得到训练集、测试集 33 | def divideData(rawdata, data, tag, trate): 34 | nsamples = len(data) 35 | 36 | sidx = np.random.permutation(nsamples) 37 | ntrain = int(np.round(nsamples * (1 - trate))) 38 | 39 | train_data = [data[s] for s in sidx[:ntrain]] 40 | train_tag = [tag[s] for s in sidx[:ntrain]] 41 | train_raw = [rawdata[s][0] for s in sidx[:ntrain]] 42 | test_data = [data[s] for s in sidx[ntrain:]] 43 | test_tag = [tag[s] for s in sidx[ntrain:]] 44 | test_raw = [rawdata[s][0] for s in sidx[ntrain:]] 45 | 46 | return train_data, train_tag, train_raw, test_data, test_tag, test_raw 47 | 48 | 49 | data = u.loadPickle(datapath) 50 | rawdialogue = list() 51 | content = list() 52 | tag = list() 53 | for i, each in enumerate(data): # 一级分类样本数5889 二级分类5887 54 | if each[TAG_LEVEL].strip() == '': 55 | continue 56 | else: 57 | rawdialogue.append(rawdata[i]) 58 | content.append(each[0]) 59 | tag.append(each[TAG_LEVEL]) 60 | 61 | for i in range(TIMES): 62 | train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2) 63 | 64 | vectorizer = CountVectorizer(encoding='unicode', stop_words='english') 65 | tfidftransformer = TfidfTransformer() 66 | 67 | text_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidftransformer), ('clf', SVC(C=0.99, kernel='linear'))]) 68 | text_clf = text_clf.fit(train_content, train_tag) 69 | predicted = text_clf.predict(test_content) 70 | acc = np.round(np.mean(predicted == test_tag), 4) 71 | print 'SVM分类器的准确率: %.4f' % acc 72 | modelname = 'svm-level%d-%d-%.4f' % (TAG_LEVEL, i, acc) 73 | u.saveAsPickle(text_clf, '../trainedModel/svm/%s.pkl' % modelname) 74 | u.saveModelAcc2txt(modelname, acc, '../logs/svm-model-acc.txt') 75 | 76 | outpath = '../results/svm/%s.xls' % modelname 77 | workbook = xlwt.Workbook(encoding='utf8') 78 | worksheet = workbook.add_sheet('实验结果') 79 | 80 | for i, each in enumerate(text_clf.predict(test_content)): 81 | worksheet.write(i, 0, test_raw[i]) # 原始文本 82 | worksheet.write(i, 1, test_content[i]) # 处理过的文本 83 | worksheet.write(i, 2, test_tag[i]) # 原始标签 84 | worksheet.write(i, 3, each) # 预测标签 85 | 86 | workbook.save(outpath) 87 | -------------------------------------------------------------------------------- /NNClassifier/lstmModel.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import torch 3 | import torch.autograd as autograd # torch中自动计算梯度模块 4 | import torch.nn as nn # 神经网络模块 5 | import torch.nn.functional as F # 神经网络模块中的常用功能 6 | import torch.optim as optim # 模型优化器模块 7 | 8 | torch.manual_seed(1) 9 | 10 | 11 | class LSTMClassifier(nn.Module): 12 | def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size): 13 | super(LSTMClassifier, self).__init__() 14 | self.hidden_dim = hidden_dim 15 | self.embedding_dim = embedding_dim 16 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) 17 | 18 | self.lstm = nn.LSTM(embedding_dim, hidden_dim) 19 | 20 | self.hidden2tag = nn.Linear(hidden_dim, tagset_size) 21 | 22 | self.init_emb() 23 | # self.hidden = self.init_hidden() 24 | 25 | # 在0附近初始化词向量矩阵 26 | def init_emb(self): 27 | initrange = 0.5 / self.embedding_dim 28 | self.word_embeddings.weight.data.uniform_(-initrange, initrange) 29 | 30 | ''' 31 | def init_hidden(self): 32 | return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), 33 | autograd.Variable(torch.zeros(1, 1, self.hidden_dim))) 34 | ''' 35 | 36 | def 
forward(self, sentence): 37 | embeds = self.word_embeddings(sentence) 38 | # x: (time_step, batch, embedding_dim) 39 | lstm_out, hidden = self.lstm(embeds.view(len(sentence), 1, -1), None) 40 | 41 | tag_space = self.hidden2tag(lstm_out[-1, :, :]) 42 | 43 | # tag_scores = F.softmax(tag_space) 44 | return tag_space 45 | -------------------------------------------------------------------------------- /NNClassifier/train.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import torch 3 | import torch.autograd as autograd # torch中自动计算梯度模块 4 | import torch.nn as nn # 神经网络模块 5 | import torch.optim as optim # 模型优化器模块 6 | from lstmModel import LSTMClassifier 7 | import Utils as u 8 | import numpy as np 9 | 10 | torch.manual_seed(1) 11 | 12 | # ****************************参数设置********************************** # 13 | EMBEDDING_DIM = 100 # 词向量维度 14 | HIDDEN_DIM = 100 # LSTM隐藏层维度 15 | EPOCH = 50 # 训练次数 16 | EARLY_STOP = True # 是否启用early stop 17 | EARLY_STOP_THRESHOLD = 4 # early stop的阈值 18 | LEARNING_RATE = 0.001 # 学习率 19 | VALID_RATE = 0.2 # 验证集占比 20 | TEST_RATE = 0.2 # 测试集占比 21 | TRAIN_TIMES = 10 # 需要的模型总数 22 | LOG_DIR = '../logs/lstm-model-acc.txt' # 日志目录 23 | DATA_DIR = '../data/alldata(onlyEng-fixed12).pkl' # 数据目录 24 | TAG_DIR = '../data/tag12.pkl' # 分类标签来源 25 | TAG_LEVEL = 2 #分类级别 26 | # ******************************************************************** # 27 | 28 | # 按比例得到训练集、验证集、测试集 29 | def divideData(data, vrate, trate): 30 | nsamples = len(data) 31 | 32 | sidx = np.random.permutation(nsamples) 33 | 34 | nvalid = int(np.round(nsamples * vrate)) # 验证集数据量 35 | ntest = int(np.round(nsamples * trate)) # 测试集数据量 36 | ntrain = nsamples - nvalid - ntest # 训练集数据量 37 | 38 | train_data = [data[s] for s in sidx[:ntrain]] 39 | valid_data = [data[s] for s in sidx[ntrain:ntrain + nvalid]] 40 | test_data = [data[s] for s in sidx[ntrain + nvalid:]] 41 | 42 | return train_data, valid_data, test_data 43 | 44 | 45 | # 打乱数据集 46 | def shuffleData(data): 47 | nsamples = len(data) 48 | 49 | sidx = np.random.permutation(nsamples) 50 | newdata = [data[s] for s in sidx] 51 | 52 | return newdata 53 | 54 | 55 | # 将原始输入处理成torch接受的格式 56 | def preparexy(seq, word2ix, tag2ix): 57 | idxs = [word2ix[w] for w in seq[0].split()] 58 | x = idxs 59 | y = tag2ix[seq[TAG_LEVEL]] 60 | return x, y 61 | 62 | 63 | def getWord2Ix(data): 64 | word2ix = {} # 单词的索引字典 65 | for sent, tag1, _, _ in data: 66 | for word in sent.split(): 67 | if word not in word2ix: 68 | word2ix[word] = len(word2ix) 69 | 70 | #加入 #UNK# 用于标记不在词典中的词 71 | word2ix['#UNK#']=len(word2ix) 72 | 73 | return word2ix 74 | 75 | 76 | # 该函数只能抽取一级分类 77 | def getTag2Index(tags): 78 | tag2ix = {} # 类别的索引字典 79 | for key in tags: 80 | if key not in tag2ix: 81 | tag2ix[key] = len(tag2ix) 82 | return tag2ix 83 | 84 | # 获得二级分类 85 | def getTag2(tags): 86 | tag2ix={} 87 | for key in tags: 88 | for each in tags[key]: 89 | if each not in tag2ix: 90 | tag2ix[each]=len(tag2ix) 91 | return tag2ix 92 | 93 | # 计算模型准确率 94 | def evaluate(data, word2ix, tag2ix, model): 95 | count = .0 # 统计正确分类的样本数 96 | total = .0 # 统计样本总数 97 | for i in range(len(data)): 98 | if data[i][TAG_LEVEL].strip() == '': 99 | continue 100 | 101 | testx, testy = preparexy(data[i], word2ix, tag2ix) 102 | testx = autograd.Variable(torch.LongTensor(testx)) 103 | testout = model(testx) 104 | 105 | predy = torch.max(testout, 1)[1].data.numpy().squeeze() 106 | if predy == testy: 107 | count += 1.0 108 | total += 1.0 109 | return np.round(count / total, 4) 110 | 
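# Note: getWord2Ix reserves a '#UNK#' entry for out-of-vocabulary words, but preparexy
# above indexes word2ix directly, so applying a trained model to text containing words
# that were never seen when the dictionary was built would raise a KeyError. A minimal
# sketch of a safer lookup (a hypothetical helper, not used by this script) could be:
#
#   def preparexy_safe(seq, word2ix, tag2ix):
#       unk = word2ix['#UNK#']
#       x = [word2ix.get(w, unk) for w in seq[0].split()]
#       y = tag2ix[seq[TAG_LEVEL]]
#       return x, y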
111 | 
112 | # Training
113 | def train_step(data, word2ix, tag2ix, model, loss_function, optimizer, epoch):
114 |     for i in range(len(data)):
115 |         # skip samples with no label
116 |         if data[i][TAG_LEVEL].strip() == '':
117 |             continue
118 |         if i % 500 == 0:
119 |             print 'epoch %d, sample %d' % (epoch + 1, i + 1)
120 | 
121 |         # build the input indices and the label
122 |         x, y = preparexy(data[i], word2ix, tag2ix)
123 | 
124 |         x = autograd.Variable(torch.LongTensor(x))
125 |         y = autograd.Variable(torch.LongTensor([y]))
126 | 
127 |         out = model(x)  # (1 x number of classes)
128 | 
129 |         loss = loss_function(out, y)
130 | 
131 |         optimizer.zero_grad()
132 |         loss.backward()
133 |         optimizer.step()
134 | 
135 | 
136 | data = u.loadPickle(DATA_DIR)  # load the dataset
137 | tag12 = u.loadPickle(TAG_DIR)  # load the label hierarchy
138 | 
139 | word2ix = getWord2Ix(data)  # word-to-index dictionary
140 | if TAG_LEVEL == 1:
141 |     tag2ix = getTag2Index(tag12)  # level-1 label index dictionary
142 | else:
143 |     tag2ix = getTag2(tag12)  # level-2 label index dictionary
144 | 
145 | vocab_size = len(word2ix)
146 | tags_size = len(tag2ix)
147 | 
148 | for time in range(TRAIN_TIMES):
149 |     # define the model
150 |     model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, vocab_size, tags_size)
151 |     # define the loss function
152 |     loss_function = nn.CrossEntropyLoss()
153 |     # define the optimizer
154 |     optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
155 | 
156 |     training_data, valid_data, test_data = divideData(data, VALID_RATE, TEST_RATE)
157 | 
158 |     early_stop_count = 0  # consecutive epochs without validation improvement
159 |     pre_accuracy = .0  # best validation accuracy seen so far in this run
160 | 
161 |     flag = 'normal'  # whether training finished normally
162 | 
163 |     for epoch in range(EPOCH):
164 |         # shuffle the training set
165 |         tdata = shuffleData(training_data)
166 | 
167 |         # train on the training set
168 |         train_step(tdata, word2ix, tag2ix, model, loss_function, optimizer, epoch)
169 | 
170 |         # evaluate on the validation set
171 |         accuracy = evaluate(valid_data, word2ix, tag2ix, model)
172 | 
173 |         print 'epoch %d, validation accuracy: %.4f' % (epoch + 1, accuracy)
174 | 
175 |         # stop early if validation accuracy fails to improve several epochs in a row
176 |         if EARLY_STOP:
177 |             # reset the early-stop counter on improvement, otherwise increment it
178 |             if accuracy >= pre_accuracy:
179 |                 early_stop_count = 0
180 |                 pre_accuracy = accuracy
181 |             else:
182 |                 early_stop_count += 1
183 | 
184 |             if early_stop_count >= EARLY_STOP_THRESHOLD:
185 |                 print 'early stop!!!'
186 |                 flag = 'earlystopped'
187 |                 break
188 | 
189 |     # after training, evaluate on the test set
190 |     test_acc = evaluate(test_data, word2ix, tag2ix, model)
191 | 
192 |     print 'test accuracy: %.4f' % test_acc
193 |     modelname = 'lstmClassifier-level%d-%s-%d-%.4f' % (TAG_LEVEL, flag, time, test_acc)
194 |     outpath = '../trainedModel/lstm/%s.pkl' % modelname
195 |     # save the model
196 |     torch.save(model, outpath)
197 |     # log the model name and its accuracy
198 |     u.saveModelAcc2txt(modelname, test_acc, LOG_DIR)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DialogueClassifier
 2 | Classifies customer questions by analysing the dialogues between customers and customer-service agents.
 3 | A more detailed write-up is available on this CSDN blog post:
 4 | http://blog.csdn.net/sinat_31188625/article/details/78490209
 5 | 
 6 | ## Methods
 7 | 1. SVM-based classification
 8 | 2. Naive Bayes-based classification
 9 | 3. LSTM (recurrent neural network) based classification
10 | 4. LDA + SVM based classification
11 | 
12 | ## Dataset
13 | About 5,890 dialogue texts, each carrying three levels of labels; level 1 has 5 classes and level 2 has 80 classes.
14 | 
15 | The dataset splits used in the different experiments are shown below:
16 | 
17 | |Experiment|Train|Validation|Test|
18 | |:----:|:----:|:----:|:----:|
19 | |1|80%|0|20%|
20 | |2|80%|0|20%|
21 | |3|60%|20%|20%|
22 | 
23 | ## Results
24 | Separate classifiers were trained for the level-1 and the level-2 labels; the results are:
25 | 
26 | ||SVM|Bayes|LSTM|1based-svm|1based-svm(2)|lda_svm|mix|
27 | |:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
28 | |Level-1 accuracy (%)|75.97|68.33|66.46|-|-|46.08|74.87|
29 | |Level-2 accuracy (%)|65.33|37.84|49.48|60.46|66.11|-|62.40|
30 | 
31 | How 1based-svm predicts:
32 | suppose level 1 has two classes A and B and the level-1 classifier outputs the distribution P1 = 0.3, 0.7,
33 | while level 2 has four classes a1, a2, b1, b2 and the level-2 classifier outputs P2 = 0.1, 0.2, 0.3, 0.4.
34 | 1based-svm then predicts with the distribution 0.1\*0.3, 0.2\*0.3, 0.3\*0.7, 0.4\*0.7.
35 | How the level-1 result should guide the level-2 classifier still needs improvement.
36 | 
37 | How 1based-svm(2) predicts:
38 | with the same level-1 distribution P1 = 0.3, 0.7
39 | and the same level-2 distribution P2 = 0.1, 0.2, 0.3, 0.4,
40 | 1based-svm(2) predicts with the distribution 0.1\*(1+0.3), 0.2\*(1+0.3), 0.3\*(1+0.7), 0.4\*(1+0.7).
41 | Compared with the first scheme this weakens the influence of the level-1 prediction on the level-2 distribution, and it scores slightly higher than an independently trained SVM.
42 | 
43 | Hyper-parameters of lda_svm:
44 | the document-word co-occurrence matrix keeps 500 features;
45 | 
46 | 50 topics are used when reducing dimensionality with LDA.
47 | 
48 | ## Known issues
49 | 1. During preprocessing the data were only stored as \[text, level-1 label, level-2 label, level-3 label\], with no index back to the original rows, which made exporting the test-set predictions to Excel harder than necessary; the data should have been stored as \[original row id, text, level-1 label, level-2 label, level-3 label\].
50 | 2. During preprocessing, non-English rows were meant to be removed, but the text was first decoded from UTF-8 to Unicode and then checked, so some non-English rows slipped through. After .decode('utf8'), the text has to be .encode('utf8')-ed again before the check, otherwise non-English characters are not detected.
51 | 
52 | 
53 | 
54 | 
55 | Note: the level-1 and level-2 classifiers are independent of each other, i.e. level-2 classification does not use the level-1 prediction (except in the 1based variants above).
56 | 
57 | 
58 | 
59 | 
--------------------------------------------------------------------------------
/Utils.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding:utf-8 -*-
 2 | import sys
 3 | import string
 4 | import re
 5 | import xlrd
 6 | import pickle as pkl
 7 | 
 8 | reload(sys)
 9 | sys.setdefaultencoding('utf8')
10 | 
11 | specialsymbols = "[\s+\.\!\/_,$%^*(+\"\'" + string.punctuation + "]+|[+——！，。？<>《》：；、~@#¥%……&*（）]+"
12 | mathsysmbols = '\d+(\.\d+)*([×\+\-\*\/]\d+(\.\d+)*)*[0-9A-Za-z]*'
13 | 
14 | 
15 | # """!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
16 | 
17 | # Replace all Chinese and English punctuation with spaces
18 | def replaceAllSymbols(oldStr):
19 |     # strip number-like tokens first
20 |     oldStr = re.sub(mathsysmbols.decode("utf-8"), " ".decode("utf-8"), oldStr)
21 |     # then strip punctuation
22 |     return re.sub(specialsymbols.decode("utf-8"), " ".decode("utf-8"), oldStr)
23 | 
24 | 
25 | # Check whether the string contains Chinese characters
26 | def checkContainChinese(check_str):
27 |     for ch in check_str.decode('utf-8'):
28 |         if u'\u4e00' <= ch <= u'\u9fff':
29 |             return True
30 |     return False
31 | 
32 | 
33 | def checkOnlyContainEnglish(check_str):
34 |     if check_str.strip() == '':
35 |         return False
36 |     for ch in check_str.encode('utf-8'):  # encode the raw data back to UTF-8 so non-English bytes can be detected
37 |         if ch.isalpha() or ch.isdigit() or ch == ' ':
38 |             continue
39 |         else:
40 |             return False
41 |     return True
42 | 
43 | 
44 | # Open an Excel file and return the selected sheet with its dimensions
45 | def openExcel(dataPath, index):
46 |     rawFile = xlrd.open_workbook(dataPath)  # open the workbook
47 |     rawData = rawFile.sheets()[index]  # open the worksheet
48 |     rawDataRows = rawData.nrows  # 9825
49 |     rawDataCols = rawData.ncols  # 47
50 |     return rawData, rawDataRows, rawDataCols
51 | 
52 | 
53 | def saveAsPickle(data, outpath):
54 |     with open(outpath, 'wb') as f:
55 |         pkl.dump(data, f)
56 | 
57 | 
58 | def loadPickle(datapath):
59 |     with open(datapath, 'rb') as f:
60 |         return pkl.load(f)
61 | 
62 | 
63 | def saveModelAcc2txt(model, acc, outpath):
64 |     with open(outpath, 'a') as f:
65 |         f.write('%s\t%.4f\n' % (model, acc))
66 | 
--------------------------------------------------------------------------------
/bestResults/mix-level1-4-0.7725.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jx00109/DialogueClassifier/637e923a01166df8a5f85fb0da77515f64357f61/bestResults/mix-level1-4-0.7725.xls
--------------------------------------------------------------------------------
/bestResults/mix-level2-6-0.6576.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jx00109/DialogueClassifier/637e923a01166df8a5f85fb0da77515f64357f61/bestResults/mix-level2-6-0.6576.xls
--------------------------------------------------------------------------------
/dataCleaner.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding:utf8 -*-
 2 | '''
 3 | What this script does:
 4 | 1. removes non-English rows
 5 | 2. cleans punctuation and irregular tokens in the raw data
 6 | -------------------------------------------
 7 | Input:  *.xlsx
 8 | Output: *.pkl (list)
 9 | '''
10 | import Utils
11 | 
12 | dataPath = r'./data/Berlinetta MLK 9.06.xlsx'
13 | pklDataPath = './data/alldata(onlyEng).pkl'
14 | 
15 | rawData, rawDataRows, rawDataCols = Utils.openExcel(dataPath=dataPath, index=1)  # 9825 rows, 47 cols
16 | 
17 | data = list()
18 | 
19 | for i in range(1, rawDataRows):  # skip the header row
20 |     r = rawData.row_values(i)
21 |     text = Utils.replaceAllSymbols(r[30])
22 |     label1, label2, label3 = r[42], r[43], r[44]  # dialogue text and its three levels of labels
23 |     if Utils.checkOnlyContainEnglish(text):  # 7173 rows, maxlen=572, minlen=2
24 |         data.append([text.strip().lower(), label1.lower(), label2.lower(), label3.lower()])
25 | 
26 | # save as a pickle file
27 | Utils.saveAsPickle(data, pklDataPath)
28 | 
--------------------------------------------------------------------------------
/getAllTags.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding:utf8 -*-
 2 | '''
 3 | This script extracts the label hierarchy and produces two dictionaries:
 4 | the sub-classes of every level-1 class
 5 | the sub-classes of every level-2 class
 6 | --------------------------------
 7 | Input:  *.xlsx
 8 | Output: *.pkl (dict)
 9 | '''
10 | import Utils as u
11 | 
12 | dataPath = r'./data/Berlinetta MLK 9.06.xlsx'
13 | outpath12 = r'./data/tag12.pkl'
14 | outpath23 = r'./data/tag23.pkl'
15 | 
16 | tags12 = dict()  # key: level-1 class  value: its set of sub-classes
17 | tags23 = dict()  # key: level-2 class  value: its set of sub-classes
18 | 
19 | data, nrows, ncols = u.openExcel(dataPath=dataPath, index=0)
20 | 
21 | # Build a class dictionary
22 | # key: class name
23 | # value: the sub-classes belonging to that class
24 | def getClassDict(data, tags, ks, ke, vs, ve):
25 |     for i in range(ks, ke):
26 |         key = data.row_values(0)[i].strip().lower()
27 |         tags[key] = list()
28 |         for j in range(vs, ve):
29 |             label2 = str(data.row_values(j)[i]).strip().lower()
30 |             if label2 != '':
31 |                 tags[key].append(label2)
32 |     return tags
33 | 
34 | 
35 | tags12 = getClassDict(data, tags12, 0, 5, 1, nrows)
36 | tags23 = getClassDict(data, tags23, 6, ncols, 1, nrows)
37 | 
38 | u.saveAsPickle(tags12, outpath12)
39 | u.saveAsPickle(tags23, outpath23)
40 | 
--------------------------------------------------------------------------------
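
Usage sketch: the count-based scripts persist each trained sklearn Pipeline with
Utils.saveAsPickle, so a saved model can be reloaded and applied to new dialogues
roughly as follows. The model filename below is only an example -- substitute whatever
was actually written under ./trainedModel/.

    # -*- encoding:utf8 -*-
    import Utils as u

    # load a previously saved Pipeline (CountVectorizer -> TfidfTransformer -> SVC)
    clf = u.loadPickle('./trainedModel/svm/svm-level1-0-0.7597.pkl')  # hypothetical filename

    # clean a new dialogue the same way dataCleaner.py does, then predict its label
    text = u.replaceAllSymbols('My bluetooth will not pair with the car').strip().lower()
    print clf.predict([text])[0]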