├── Preprocess
│   ├── __init__.py
│   ├── bow.py
│   └── boc.py
├── corpus.zip
├── __init__.py
├── paper
│   └── 限定领域口语对话系统中超出领域话语的对话行为识别.pdf
├── .idea
│   ├── vcs.xml
│   ├── modules.xml
│   ├── Ch2r_ood_understanding.iml
│   ├── misc.xml
│   └── workspace.xml
├── .gitattributes
├── config.yaml
├── .gitignore
├── README.md
├── random_forest.py
├── two-phase.py
├── ME(TFIDF+OOV).py
└── cnn.py
/Preprocess/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/corpus.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZixuanKe/Ch2r_ood_understanding/HEAD/corpus.zip
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #encoding=utf8
2 | __author__ = 'jdwang'
3 | __date__ = 'create date: 2016-06-23'
4 | __email__ = '383287471@qq.com'
5 |
6 |
--------------------------------------------------------------------------------
/paper/限定领域口语对话系统中超出领域话语的对话行为识别.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZixuanKe/Ch2r_ood_understanding/HEAD/paper/限定领域口语对话系统中超出领域话语的对话行为识别.pdf
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.idea/Ch2r_ood_understanding.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | main:
2 | path: main.py
3 | # Description
4 |
5 | describe: The algorithm in this package follows the paper 基于词矢量的短文本分类. TFIDF features are used as the feature vector and a maximum-entropy classifier is trained; at test time, word2vec is used to replace out-of-vocabulary (OOV) words.
6 |
7 | name: &name bow_word2vec_oov
8 |
9 | model: &model tfidf
10 |
11 | max_features: 2000
12 |
13 | max_keywords: 2000
14 |
15 | full_mode: False
16 |
17 | remove_stopword: True
18 |
19 | train_data_file_path: new_train_all.csv
20 | test_data_file_path: new_ood_labeled.csv
21 | # results are written to ...
22 | result_file_path: ['./',*name,'_',*model,'.csv']
23 | # which model to use (the `model` key above): TFIDF or BOW
24 | # log file path
25 | log_file_path: ['./',*name,'_',*model,'.log']
26 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Preprocess/bow.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import pandas as pd
3 | import jieba
4 |
5 |
6 | '''
7 | Simple segmentation module: cut each SENTENCE with jieba into space-separated WORDS.
8 |
9 | '''
10 |
11 | train_data = pd.read_csv(
12 | 'v2.3_train_S_1518.csv',
13 | sep='\t',
14 | encoding='utf8',
15 | header=0
16 | )
17 |
18 | test_data = pd.read_csv(
19 | 'v2.3_test_S_131.csv',
20 | sep='\t',
21 | encoding='utf8',
22 | header=0
23 | )
24 |
25 | train_data['WORDS'] = [" ".join(jieba.cut(sentence)) for sentence in train_data['SENTENCE']]
26 | test_data['WORDS'] = [" ".join(jieba.cut(sentence)) + " " for sentence in test_data['SENTENCE']]
27 |
28 | train_data.to_csv(
29 | "train_seg.csv",
30 | sep='\t',
31 | encoding='utf8',
32 |
33 | )
34 |
35 | test_data.to_csv(
36 | "test_seg.csv",
37 | sep='\t',
38 | encoding='utf8',
39 |
40 | )
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 |
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 |
--------------------------------------------------------------------------------
/Preprocess/boc.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import pandas as pd
3 | import yaml
4 |
5 | import jieba
6 |
7 |
8 |
9 | '''
10 | Split by character instead of by word,
11 | i.e. BOC (bag of characters) as opposed to BOW (bag of words).
12 |
13 | '''
14 | def singleword(train_data,test_data):
15 | result_train = []
16 | for words in train_data['WORDS']:
17 | character_result_train = ""
18 | words = words.split()
19 | for characters in words:
20 | if len(characters) > 1:
21 | for character in characters:
22 | character_result_train += (character+ u" ")
23 | else:
24 | character_result_train += (characters + u" ")
25 |
26 | character_result_train = character_result_train[0:len(character_result_train)-1]
27 | result_train.append(character_result_train)
28 |
29 |
30 | train_data['SINGLE'] = result_train
31 |
32 |
33 |
34 | result_test = []
35 | for words in test_data['WORDS']:
36 | character_result_test = ""
37 | words = words.split()
38 | for characters in words:
39 | if len(characters) > 1:
40 | for character in characters:
41 | character_result_test += (character+ u" ")
42 | else:
43 | character_result_test += (characters + u" ")
44 | character_result_test = character_result_test[0:len(character_result_test)-1]
45 | result_test.append(character_result_test)
46 |
47 |
48 | test_data['SINGLE'] = result_test
49 |
50 | train_data.to_csv(
51 | "train.csv",
52 | sep = '\t',
53 | encoding = 'utf8',
54 |
55 | )
56 |
57 | test_data.to_csv(
58 | "test.csv",
59 | sep='\t',
60 | encoding='utf8',
61 |
62 | )
63 | return result_train,result_test
64 |
65 |
66 |
67 | if __name__ == '__main__':
68 | train_data = pd.read_csv(
69 | 'train_seg.csv',
70 | sep='\t',
71 | encoding='utf8',
72 | header=0
73 | )
74 |
75 | test_data = pd.read_csv(
76 | 'test_seg.csv',
77 | sep='\t',
78 | encoding='utf8',
79 | header=0
80 | )
81 |
82 | singleword(train_data,test_data)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Ch2r_ood_understanding
2 |
3 | ---
4 | This repository contains part of the experiment code for the paper [限定领域口语对话系统中超出领域话语的对话行为识别](https://zixuanke.github.io/docs/%E9%99%90%E5%AE%9A%E9%A2%86%E5%9F%9F%E5%8F%A3%E8%AF%AD%E5%AF%B9%E8%AF%9D%E7%B3%BB%E7%BB%9F%E4%B8%AD%E8%B6%85%E5%87%BA%E9%A2%86%E5%9F%9F%E8%AF%9D%E8%AF%AD%E7%9A%84%E5%AF%B9%E8%AF%9D%E8%A1%8C%E4%B8%BA%E8%AF%86%E5%88%AB.pdf). The code is written in Python and depends on the following external libraries:
5 |
6 | > * Keras (for building the neural networks)
7 | > * Scikit-learn (maximum entropy, random forest)
8 | > * gensim (word2vec, used to replace out-of-dictionary words)
9 |
10 | The methods covered by the experiments are:
11 | > * Two-phase method (two-phase.py)
12 | > * Maximum entropy (ME(TFIDF+OOV).py)
13 | > * Random forest (RF, random_forest.py)
14 | > * CNN (cnn.py)
15 |
16 | ### Corpus
17 | Two corpora are available under [corpus](https://github.com/ZixuanKe/Ch2r_ood_understanding/tree/master/corpus):
18 | > * AIML corpus (an artificial data set)
19 | > * CCL corpus (the data set used for the actual tests)
20 |
21 | Labels have the format:
22 |
23 | > categoryA # categoryB
24 |
25 | i.e. **the main-class dimension is A and the sub-class dimension is B**.
26 |
27 |
28 | There are **4 main classes and 16 sub-classes** in total.
29 |
30 | ### Experimental methods
31 | #### Preprocessing
32 | Two preprocessing scripts are available under [Preprocess](https://github.com/ZixuanKe/Ch2r_ood_understanding/blob/master/Preprocess); a toy illustration of the difference follows the list:
33 | > * BOC (bag of characters: split the text by character to build a "character bag")
34 | > * BOW (bag of words: split the text by word to build a "word bag")
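A minimal sketch of the two tokenizations (the example sentence is made up; Preprocess/bow.py and Preprocess/boc.py apply the same idea to the corpus CSV files):
```python
# -*- coding: utf-8 -*-
import jieba

sentence = u'我想买一台手机'                  # made-up example sentence, not from the corpus
bow_tokens = ' '.join(jieba.cut(sentence))   # BOW: space-separated jieba words
boc_tokens = ' '.join(list(sentence))        # BOC: space-separated single characters
```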
35 |
36 | #### Two-phase method
37 | Classification is split into two stages: the 4 main classes are predicted first, and the sub-classes under the predicted main class are then distinguished.
38 | > The rationale, confirmed by the practice of some competition participants, is that main-class classification is easier than sub-class classification, and restricting the sub-class decision to one main class shrinks the label space and makes it easier. The drawback is error propagation: once the main class is predicted wrongly, the sub-class can no longer be predicted correctly.
39 |
40 | > Reference paper: [Splusplus: A Feature-Rich Two-stage Classifier for Sentiment Analysis of Tweets](http://www.aclweb.org/anthology/S/S15/S15-2.pdf#page=557)
41 |
42 | In the code, a separate classifier is retrained for the sub-classes of each main class:
43 | ```python
44 | resultData,resultTarget = findAllTrainning('attitude',exam_bow_fea_data) # collect all sub-class training samples under this main class
45 | gb1 = sub_classfier(resultData,resultTarget)
46 | resultData,resultTarget = findAllTrainning('shopping',exam_bow_fea_data) # collect all sub-class training samples under this main class
47 | gb2 = sub_classfier(resultData,resultTarget)
48 | resultData,resultTarget = findAllTrainning('chatting',exam_bow_fea_data) # collect all sub-class training samples under this main class
49 | gb3 = sub_classfier(resultData,resultTarget)
50 | resultData,resultTarget = findAllTrainning('trouble',exam_bow_fea_data) # collect all sub-class training samples under this main class
51 | gb4 = sub_classfier(resultData,resultTarget)
52 | ```
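At test time the first-stage model picks the main class and the matching sub-classifier produces the final label. A minimal sketch of that routing step, assuming `gb` is the trained main-class model and `gb1`..`gb4` are the sub-classifiers from the snippet above (it mirrors the loop in two-phase.py rather than quoting it):
```python
sub_classifiers = {'attitude': gb1, 'shopping': gb2, 'chatting': gb3, 'trouble': gb4}

main_pred = gb.predict(exam_bow_fea_test_data)            # stage 1: one of the 4 main classes
final_pred = [
    sub_classifiers[m].predict(x.reshape(1, -1))[0]       # stage 2: sub-class within that main class
    for m, x in zip(main_pred, exam_bow_fea_test_data)
]
```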
53 | #### Maximum entropy
54 | A maximum-entropy model applied directly to the sub-classes is used as the control group.
55 | >* The maximum-entropy model has shown strong performance on many text-classification problems; we use it as the baseline against which the later CNN and RF results are compared.
56 |
57 | > Reference paper: [使用最大熵模型进行中文文本分类](http://www.cnki.net/KCMS/detail/detail.aspx?QueryID=4&CurRec=1&recid=&filename=JFYZ200501013&dbname=CJFD2005&dbcode=CJFQ&pr=&urlid=&yx=&v=MjkxMDVMRzRIdFRNcm85RVo0UjhlWDFMdXhZUzdEaDFUM3FUcldNMUZyQ1VSTHlmYitSckZ5L2hVYnpPTHl2U2Q=)
58 |
59 | >* When logistic regression is applied to a multi-class problem with a multinomial (cross-entropy) loss, it becomes the maximum-entropy model; see [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)
60 |
61 |
62 | >* To improve accuracy, out-of-vocabulary words are handled with word2vec trained on an external corpus (the paper uses the SMP2015 Weibo data, 10 million posts): each OOV (out-of-vocabulary) word is replaced by the word in the vocabulary that is closest to it, as sketched below the reference.
63 |
64 | > Reference paper: [基于词矢量相似度的短文本分类](http://www.cnki.net/KCMS/detail/detail.aspx?QueryID=0&CurRec=1&recid=&filename=SDDX201412004&dbname=CJFDLAST2015&dbcode=CJFQ&pr=&urlid=&yx=&v=MDE1MzkxRnJDVVJMeWZiK1JyRnkvaFVieklOaW5QZHJHNEg5WE5yWTlGWUlSOGVYMUx1eFlTN0RoMVQzcVRyV00=)
65 |
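A minimal sketch of the OOV-replacement idea, assuming `vocabulary` is the TFIDF keyword list (an illustration of the approach; ME(TFIDF+OOV).py implements it with extra bookkeeping):
```python
from gensim.models import Word2Vec

w2v = Word2Vec.load('weibodata_vectorB.gem')     # external word2vec model (path used in the repo)

def replace_oov(words, vocabulary):
    """Replace every word missing from `vocabulary` by its most word2vec-similar in-vocabulary word."""
    replaced = []
    for word in words:
        if word in vocabulary:
            replaced.append(word)
            continue
        best, best_sim = word, 0.0
        for candidate in vocabulary:
            try:
                sim = abs(w2v.similarity(word, candidate))
            except KeyError:                     # word or candidate unknown to the word2vec model
                continue
            if sim > best_sim:
                best, best_sim = candidate, sim
        replaced.append(best)
    return replaced
```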
66 | In the code, the LogisticRegression parameters are set as follows:
67 | ```python
68 | clf = LogisticRegression(multi_class="multinomial",solver="newton-cg")
69 | ```
70 |
71 | #### Convolutional neural network
72 | > CNNs are used in NLP in many different ways; here we explore input encodings with different window sizes, namely seq-CNN and bow-CNN.
73 |
74 | > Reference paper: [ (Johnson and Zhang, NAACL 2015) Effective Use of Word Order for Text Categorization with Convolutional Neural Networks](https://arxiv.org/pdf/1412.1058.pdf)
75 |
76 | **Seq-CNN**
77 | The input is the concatenation of the **one-hot encodings** of the words.
78 | > Advantage: the order of the words is preserved.
79 | > Disadvantage: the dimensionality is very large, which easily leads to the curse of dimensionality.
80 |
81 | **Bow-CNN**
82 | A lower-dimensional encoding built on top of **Seq-CNN**.
83 | > With a fixed window size n, the one-hot codes of the n words inside a window are summed element-wise.
84 | > Advantage: word order between windows is preserved and the dimensionality is reduced.
85 | > Disadvantage: word order within a window is lost.
86 |
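A toy sketch of the two encodings for one segmented sentence, assuming `vocab` maps each word to an index and `n` is the window size (an illustration of the idea rather than the exact code in cnn.py):
```python
import numpy as np

def seq_cnn_encode(tokens, vocab):
    """Concatenate one one-hot vector per token: order preserved, dimension len(tokens) * len(vocab)."""
    vec = np.zeros((len(tokens), len(vocab)))
    for i, tok in enumerate(tokens):
        vec[i, vocab[tok]] = 1.0
    return vec.ravel()

def bow_cnn_encode(tokens, vocab, n=2):
    """Sum the one-hot vectors inside each window of n tokens: lower dimension, order kept only across windows."""
    regions = []
    for start in range(0, len(tokens), n):
        region = np.zeros(len(vocab))
        for tok in tokens[start:start + n]:
            region[vocab[tok]] += 1.0
        regions.append(region)
    return np.concatenate(regions)
```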
87 |
88 | #### Random forest
89 | A traditional **bagging ensemble**. The **number of trees** is chosen by cross-validation and the **tree depth** uses the rule-of-thumb value:
90 | > log(M), where M is the total number of features
91 |
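One way those two hyper-parameters could be set with scikit-learn, assuming `X_train`/`y_train` hold the BOW features and labels (the repo's random_forest.py simply fixes n_estimators, so this is only a sketch of the described procedure):
```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

max_depth = max(1, int(np.log2(X_train.shape[1])))   # rule of thumb: log(M), with M = number of features
search = GridSearchCV(
    RandomForestClassifier(max_depth=max_depth),
    param_grid={'n_estimators': [200, 400, 1000]},   # number of trees picked by cross-validation
    cv=5,
)
search.fit(X_train, y_train)
clf = search.best_estimator_
```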
92 | #### Evaluation metric
93 | > Accuracy: sum(test_data_label == clf.predict(test)) / (1.0 * len(test_data_label))
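The same number can be obtained with scikit-learn's helper, e.g.:
```python
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_data_label, clf.predict(test))
```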
94 |
--------------------------------------------------------------------------------
/random_forest.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import pandas as pd
3 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
4 | import pandas as pd
5 | import yaml
6 | import pickle
7 | import numpy as np
8 | from sklearn import preprocessing
9 | from sklearn.ensemble import RandomForestClassifier
10 |
11 |
12 |
13 | '''
14 | 尝试使用randomforest进行分类 与 CNN分类进行比对
15 |
16 | '''
17 | def load_data(file_name):
18 | import csv
19 |
20 | csvfile = file(file_name, 'rb')
21 | reader = csv.reader(csvfile)
22 |
23 | label = []
24 | data = []
25 | for line in reader:
26 | label.append(line[0])
27 | data.append(line[1:len(line)])
28 |
29 | # print label
30 | # print data
31 | csvfile.close()
32 | return data,label
33 |
34 |
35 |
36 | if __name__ == '__main__':
37 |
38 | data = pd.read_csv(
39 | "v2.3_test_Sa_79.csv",
40 | sep='\t',
41 | encoding='utf8',
42 | header=0
43 | )
44 | f = open("result.txt",'a')
45 | train_data_bow_fea_bow,train_data_label_bow = load_data("v2.3_train_Sa_word_seg_i1_dev_830.csv")
46 | test_data_bow_fea_bow,test_data_label_bow = load_data("v2.3_train_Sa_word_seg_i1_val_76.csv")
47 | print "拼接 i1,卷积层"
48 | with open("TrainSet_2+281_feature_d1.pickle","rb") as file:
49 | train_data_bow_fea = pickle.load(file)
50 | train_data_label = pickle.load(file)
51 | test_data_bow_fea = pickle.load(file)
52 | test_data_label = pickle.load(file)
53 | #
54 | # train_data_bow_fea_bow = preprocessing.minmax_scale(train_data_bow_fea_bow)
55 | # test_data_bow_fea_bow = preprocessing.minmax_scale(test_data_bow_fea_bow)
56 |
57 | #拼接用
58 | print "length1: " + str(len(train_data_bow_fea_bow[0]))
59 | print "length2: " + str(len(train_data_bow_fea[0]))
60 | print len(train_data_bow_fea_bow)
61 | print len(train_data_bow_fea)
62 |
63 | train_length = len(train_data_bow_fea_bow[0]) + len(train_data_bow_fea[0])
64 | test_length = len(test_data_bow_fea_bow[0]) + len(test_data_bow_fea[0])
65 |
66 | train_weigth = len(train_data_bow_fea_bow)
67 | test_weigth = len(test_data_bow_fea_bow)
68 |
69 | train_data_bow_fea = np.concatenate((train_data_bow_fea,train_data_bow_fea_bow),axis=1)
70 | test_data_bow_fea = np.concatenate((test_data_bow_fea,test_data_bow_fea_bow),axis=1)
71 |
72 | train_data_bow_fea.reshape(train_length,train_weigth)
73 | test_data_bow_fea.reshape(test_length,test_weigth)
74 |
75 | print "length合并: " + str(len(train_data_bow_fea[0]))
76 |
77 | train = train_data_bow_fea
78 | test = test_data_bow_fea
79 |
80 | index_to_label = [
81 | u'其它#骂人',
82 | u'导购#不成交',
83 | u'导购#不理解',
84 | u'导购#开始',
85 | u'导购#成交',
86 | u'导购#更换',
87 | u'导购#结束',
88 | u'导购#详情',
89 | u'表态#不满',
90 | u'表态#否定',
91 | u'表态#满意',
92 | u'表态#犹豫',
93 | u'表态#疑问',
94 | u'表态#肯定',
95 | u'表态#附和',
96 | u'表态#随便',
97 | u'社交义务#不用谢',
98 | u'社交义务#接受道歉',
99 | u'社交义务#致谢',
100 | u'社交义务#道歉',
101 | u'社交义务#问候',
102 | u'闲聊#天气',
103 | u'闲聊#时间',
104 | u'闲聊#身份信息'
105 | ]
106 |
107 | for n in [1000]:
108 | clf = RandomForestClassifier(n_estimators=n) #随机森林
109 | # clf.fit(train_data_bow_fea,train_data['LABEL'])
110 |
111 | clf.fit(train,train_data_label)
112 | print >> f ,sum(test_data_label == clf.predict(test)) / (1.0 * len(test_data_label))
113 | print sum(test_data_label == clf.predict(test)) / (1.0 * len(test_data_label))
114 |
115 |
116 | #bad case 输出
117 | predict = clf.predict(test_data_bow_fea)
118 | for i in range(len(test_data_label)):
119 | if test_data_label[i] != predict[i]:
120 | print data['SENTENCE'][i] + "\t" + index_to_label[int(predict[i])] + "\t" + index_to_label[int(test_data_label[i])]
--------------------------------------------------------------------------------
/two-phase.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
4 |
5 |
6 | from dateutil.parser import parse
7 | from sklearn.feature_extraction.text import CountVectorizer
8 | import pandas as pd
9 | import numpy as np
10 | from sklearn.externals import joblib # used to save trained models
11 | import jieba
12 | from sklearn.ensemble import GradientBoostingClassifier
13 | from sklearn.metrics import f1_score # F-score metric
14 | from sklearn.metrics import precision_score
15 | from sklearn.metrics import recall_score
16 | from sklearn.ensemble import RandomForestClassifier
17 |
18 | '''
19 | A two-stage classification experiment.
20 | Method 1: classify directly into the 16 sub-classes.
21 | Method 2: classify into the main classes first, then into the sub-classes (two steps).
22 |
23 | '''
24 |
25 |
26 |
27 | def sub_classfier(exam_bow_fea_data,exam_bow_fea_target):
28 | '''
29 |
30 | Train a sub-class classifier.
31 |
32 | :param exam_bow_fea_data: training data
33 | :param exam_bow_fea_target: training labels
34 | :return: the trained random-forest model
35 | '''
36 |
37 |
38 | rf = RandomForestClassifier(n_estimators=200) # the target is LABEL2
39 | print "target:",len(exam_bow_fea_target)
40 | print "data:",len(exam_bow_fea_data)
41 | rf.fit(exam_bow_fea_data, exam_bow_fea_target)
42 | return rf
43 |
44 | def findAllTrainning(mainClass,exam_bow_fea):
45 |
46 | '''
47 | Collect all sub-class training samples under a given main class.
48 | :param mainClass: the main class
49 | :param exam_bow_fea: training features
50 | :return: sub-class training data and sub-class training labels
51 | '''
52 | resultData = []
53 | for rec in range(len(exam)):
54 | if exam.iloc[rec].LABEL1 == mainClass:
55 | resultData .append( exam_bow_fea[rec] )
56 | print len(resultData)
57 | resultTarget = exam[['LABEL2']][exam.LABEL1 == mainClass]
58 | return resultData,resultTarget
59 |
60 |
61 |
62 | # load the data
63 | print 'Loading Data'
64 | exam = pd.read_table('train_all.csv',
65 | converters={'date': parse},encoding = 'utf-8')
66 |
67 |
68 | exam_test = pd.read_table('ch2r_test.csv',
69 | converters={'date': parse},encoding = 'utf-8')
70 |
71 |
72 |
73 | # word segmentation
74 | exam = exam.drop(['SEGMENT_FULL','SEGMENT_EVERYWORD'],axis=1)
75 | exam_test = exam_test.drop(['SEGMENT_FULL','SEGMENT_EVERYWORD','SEGMENT_OOV','SEGMENT_OOV_EVERYWORD'],axis=1)
76 | exam['SENTENCE'] = [' '.join(jieba.cut(sentence)) for sentence in exam['SENTENCE']]
77 | exam_test['SENTENCE'] = [' '.join(jieba.cut(sentence)) for sentence in exam_test['SENTENCE']]
78 | print exam.head()
79 | exam['SENTENCE'] = exam['SEGMENT'].apply(lambda x:' '.join(x.split('|')))
80 | exam_test['SENTENCE'] = exam_test['SEGMENT'].apply(lambda x:' '.join(x.split('|')))
81 |
82 | # save the preprocessing results
83 | exam.to_csv('Exam_Prep.csv',encoding = 'utf-8')
84 | exam_test.to_csv('Exam_Prep_Test.csv',encoding = 'utf-8')
85 |
86 |
87 |
88 | # build BOW features
89 | vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
90 | exam_bow_fea = vect.fit_transform(exam['SENTENCE']).toarray()
91 | exam_bow_fea_test = vect.transform(exam_test['SENTENCE']).toarray()
92 |
93 |
94 |
95 | exam_bow_fea_data = exam_bow_fea # training features (no extra normalization applied)
96 | print len(exam_bow_fea_data)
97 | exam_bow_fea_target = exam['LABEL2']
98 | print len(exam_bow_fea_target)
99 |
100 | exam_bow_fea_test_data = exam_bow_fea_test # test features (no extra normalization applied)
101 | print len(exam_bow_fea_test_data)
102 | exam_bow_fea_test_target = exam_test['LABEL2']
103 | print len(exam_bow_fea_test_target)
104 |
105 |
106 |
107 | # features ready
108 |
109 |
110 |
111 | # Method 1: classify directly into the 16 sub-classes
112 | esti = 400; dep = 7
113 | gb = RandomForestClassifier(n_estimators=200)
114 | gb.fit(exam_bow_fea_data,exam_bow_fea_target) # fit a single random forest on the sub-class labels
115 | # joblib.dump(gb,"gb.RandomForestClassifierModel")
116 |
117 |
118 | print sum(exam_bow_fea_test_target == gb.predict(exam_bow_fea_test_data))/1184.0
119 | print sum(exam_bow_fea_test_target == gb.predict(exam_bow_fea_test_data))
120 |
121 |
122 | # Method 2: main classes first, then sub-classes
123 |
124 | exam_bow_fea_target = exam['LABEL1']
125 | exam_bow_fea_test_target = exam_test['LABEL1']
126 |
127 | exam_bow_fea_test_result = exam_test['LABEL2'] # final (sub-class) ground truth
128 |
129 | esti = 400; dep = 7
130 | gb = RandomForestClassifier(n_estimators=200)
131 | gb.fit(exam_bow_fea_data,exam_bow_fea_target) # fit the main-class (LABEL1) random forest
132 | # joblib.dump(gb,"gb.RandomForestClassifierModel")
133 |
134 | print sum(exam_bow_fea_test_target == gb.predict(exam_bow_fea_test_data))/58.0
135 | print exam_bow_fea_test_target
136 | print gb.predict(exam_bow_fea_test_data)
137 | np.savetxt('1.csv', exam_bow_fea_test_target,fmt='%s', delimiter = '\t')
138 | np.savetxt('2.csv', gb.predict(exam_bow_fea_test_data),fmt='%s', delimiter = '\t')
139 |
140 |
141 |
142 | mainClass = [i for i in gb.predict(exam_bow_fea_test_data)]
143 |
144 | resultData,resultTarget = findAllTrainning('attitude',exam_bow_fea_data) # collect all sub-class training samples under this main class
145 | gb1 = sub_classfier(resultData,resultTarget)
146 | resultData,resultTarget = findAllTrainning('shopping',exam_bow_fea_data) # collect all sub-class training samples under this main class
147 | gb2 = sub_classfier(resultData,resultTarget)
148 | resultData,resultTarget = findAllTrainning('chatting',exam_bow_fea_data) # collect all sub-class training samples under this main class
149 | gb3 = sub_classfier(resultData,resultTarget)
150 | resultData,resultTarget = findAllTrainning('trouble',exam_bow_fea_data) # collect all sub-class training samples under this main class
151 | gb4 = sub_classfier(resultData,resultTarget)
152 |
153 |
154 |
155 | result = []
156 | for i in range(len(exam_test)):
157 | print mainClass[i]
158 | if mainClass[i] == 'attitude':
159 | result.append( gb1.predict(exam_bow_fea_test_data[i]))
160 | elif mainClass[i] == 'shopping':
161 | result.append( gb2.predict(exam_bow_fea_test_data[i]))
162 | elif mainClass[i] == 'chatting':
163 | result.append( gb3.predict(exam_bow_fea_test_data[i]))
164 | elif mainClass[i] == 'trouble':
165 | result.append( gb4.predict(exam_bow_fea_test_data[i]))
166 |
167 |
168 |
169 | # save the results
170 | # print sum( result == exam_bow_fea_test_result ) / 58.0
171 | np.savetxt('new.csv', exam_bow_fea_test_result.as_matrix(),fmt='%s', delimiter = '\t')
172 | np.savetxt('re.csv', np.asarray(result).flatten(),fmt='%s', delimiter = '\t')
173 |
174 |
--------------------------------------------------------------------------------
/ME(TFIDF+OOV).py:
--------------------------------------------------------------------------------
1 | #encoding=utf8
2 | import sys
3 | reload(sys)
4 | sys.setdefaultencoding('utf-8')
5 |
6 | __author__ = 'jdwang'
7 | __date__ = 'create date: 2016-05-29'
8 | import numpy as np
9 | import pandas as pd
10 | import logging
11 | import timeit
12 | import yaml
13 | from gensim.models import Word2Vec
14 | from sklearn.linear_model import LogisticRegression
15 | from dateutil.parser import parse
16 | import pandas as pd
17 | import jieba
18 |
19 |
20 |
21 |
22 | config = yaml.load(file('./config.yaml')) # read the yaml configuration file
23 | config = config['main'] # access the 'main' section as a dict
24 | logging.basicConfig(filename=''.join(config['log_file_path']), filemode='w',
25 | format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
26 | start_time = timeit.default_timer()
27 |
28 | # progress can be written to a log file for bookkeeping
29 |
30 | print('=' * 30)
31 | # print config['describe']
32 | print('=' * 30)
33 | print 'start running!'
34 | logging.debug('=' * 30)
35 | logging.debug(config['describe'])
36 | logging.debug('=' * 30)
37 | logging.debug('start running!')
38 | logging.debug('=' * 20)
39 |
40 |
41 | import jieba
42 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
43 | from sklearn.ensemble import RandomForestClassifier
44 |
45 |
46 | train_data = pd.read_csv(
47 | config['train_data_file_path'],
48 | sep='\t',
49 | encoding='utf8',
50 | header=0
51 | )
52 |
53 | test_data = pd.read_csv(
54 | config['test_data_file_path'],
55 | sep='\t',
56 | encoding='utf8',
57 | header=0
58 | )
59 |
60 | logging.debug('train data shape is :%s'%(str(train_data.shape)))
61 | print('train data shape is :%s'%(str(train_data.shape)))
62 |
63 | logging.debug('test data shape is :%s'%(str(test_data.shape)))
64 | print('test data shape is :%s'%(str(test_data.shape)))
65 | logging.debug('-' * 20)
66 | # drop the category 其他#其他 (other#other)
67 | logging.debug('去除类别 其他#其他')
68 | train_data = train_data[train_data['LABEL']!=u'其他#其他']
69 | test_data = test_data[test_data['LABEL']!=u'其他#其他']
70 | logging.debug('train data shape is :%s'%(str(train_data.shape)))
71 | print('train data shape is :%s'%(str(train_data.shape)))
72 |
73 | logging.debug('test data shape is :%s'%(str(test_data.shape)))
74 | print('test data shape is :%s'%(str(test_data.shape)))
75 | logging.debug('-' * 20)
76 |
77 | train_data = train_data[['LABEL','SENTENCE']]
78 | test_data = test_data[['LABEL','SENTENCE']]
79 |
80 | index_to_label = list(train_data['LABEL'].unique())
81 | logging.debug(u'总共类别数:%d,分别为:%s'%(len(index_to_label),','.join(index_to_label)))
82 | print('总共类别数:%d'%(len(index_to_label)))
83 |
84 | label_to_index = {label:idx for idx,label in enumerate(index_to_label)}
85 |
86 | train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index)
87 | test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index)
88 | # print train_data.head()
89 |
90 |
91 | logging.debug('=' * 20)
92 | logging.debug('对数据进行分词...')
93 | logging.debug('-' * 20)
94 |
95 | sentence_to_seg = lambda x: jieba.cut(x,cut_all=True)
96 |
97 | train_data['WORDS'] = [' '.join(jieba.cut(sentence,cut_all=True)) for sentence in train_data['SENTENCE']]
98 | test_data['WORDS'] = [' '.join(jieba.cut(sentence,cut_all=True)) for sentence in test_data['SENTENCE']]
99 |
100 | # train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
101 | # test_data['WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg)
102 | print train_data.head()
103 |
104 | logging.debug('=' * 20)
105 | logging.debug('开始生成特征向量...')
106 |
107 | vectorizer = CountVectorizer(analyzer="word",
108 | token_pattern=u'(?u)\\b\w+\\b',
109 | tokenizer=None,
110 | preprocessor=None,
111 | lowercase=False,
112 | stop_words=None,
113 | max_features=config['max_features'])
114 |
115 | print test_data.head()
116 | train_X_features = vectorizer.fit_transform(train_data['WORDS'].as_matrix()).toarray(
117 |
118 | )
119 |
120 |
121 | vocabulary = vectorizer.get_feature_names()
122 | logging.debug(u'字典大小:%d个词,有:%s'%(len(vocabulary),','.join(vocabulary)))
123 | # print(u'字典大小:%d,有:%s'%(len(vocabulary),','.join(vocabulary)))
124 |
125 | logging.debug('train X shape is :%s'%(str(train_X_features.shape)))
126 | print('train X shape is :%s'%(str(train_X_features.shape)))
127 |
128 | logging.debug('=' * 20)
129 | logging.debug(u'计算概率')
130 | logging.debug('注意:如果一个词在一个句子中出现多次,也只算一次,即这里计算的是,这个词在多少个句子中出现的次数')
131 |
132 | row,col = train_X_features.shape
133 | # if a word occurs several times in one sentence, count it only once
134 | train_X_features = np.asarray([item>0 for item in train_X_features.flatten()],dtype=int).reshape(row,col)
135 |
136 | words_total_count = sum(train_X_features.flatten())
137 | logging.debug('训练库中,词的总计数为:%d'%(words_total_count))
138 | print('训练库中,词的总计数为:%d'%(words_total_count))
139 |
140 | logging.debug('-' * 20)
141 | # 统计每个词的出现次数,如果一个词在一个句子中出现多次,也只算一次,即这里计算的是,这个词在多少个句子中出现的次数
142 | logging.debug('统计每个词的出现次数,如果一个词在一个句子中出现多次,也只算一次,即这里计算的是,这个词在多少个句子中出现的次数')
143 | get_word_count = lambda x: sum(x)
144 | word_counts = np.sum(train_X_features,axis=0)
145 |
146 | p_word = word_counts/(1.0*words_total_count)
147 | logging.debug(u'最大词频为:%f,次数为:%d,该词为:%s'%(max(p_word),max(word_counts),vocabulary[np.argmax(word_counts)]))
148 | # print(u'最大词频为:%f,次数为:%d,该词为:%s'%(max(p_word),max(word_counts),vocabulary[np.argmax(word_counts)]))
149 |
150 | logging.debug('-' * 20)
151 | logging.debug('计算词和各个类的共现次数,以及每个类的句子数...')
152 |
153 | print('计算词和各个类的共现次数...')
154 | # count(word,class)
155 | count_word_class = []
156 | # count(class)
157 | count_class = []
158 | for label in index_to_label:
159 | logging.debug('-' * 10)
160 | logging.debug(u'处理类别:%s'%(label))
161 | # print(u'处理类别:%s'%(label))
162 | # select the sentences of this class
163 | index = (train_data['LABEL'] == label).as_matrix()
164 | sentences = train_X_features[index]
165 | print len(sentences)
166 | logging.debug('句子数为:%d'%(len(sentences)))
167 | print('句子数为:%d'%(len(sentences)))
168 | count_class.append(len(sentences))
169 | count_word_class.append(np.sum(sentences,axis=0))
170 |
171 | # count(class)
172 | count_class = np.asarray(count_class)
173 | # P(class)
174 | p_class = count_class/(1.0*len(train_data))
175 | # P(class|word)
176 | p_class_on_word = count_word_class/(word_counts*1.0)
177 | p_class_on_word = p_class_on_word.transpose()
178 |
179 | logging.debug('-' * 20)
180 | logging.debug('计算 P(class|word)/P(class)')
181 |
182 | print p_class_on_word[0]
183 | print p_class
184 | # P(class|word)/P(class)
185 | p_rate = p_class_on_word/p_class
186 | print p_rate[0]
187 | logging.debug('计算 log( P(class|word)/P(class) )')
188 | # log( P(class|word)/P(class) )
189 | log_p_rate = np.log(p_rate)
190 | print log_p_rate[0]
191 |
192 | # P(class|word) * log( P(class|word)/P(class) )
193 | p_ent = log_p_rate * p_class_on_word
194 | p_ent = np.nan_to_num(p_ent)
195 | print p_ent[0]
196 | # expected cross-entropy
197 | entroy = np.sum(p_ent,axis=1)
198 | print entroy[0]
199 |
200 | print p_word[0]
201 | # final score = expected cross-entropy * P(word)
202 |
203 | # Multiplying the entropy directly by the term frequency, as the paper does, gives very frequent words too much weight,
204 | # i.e.: entroy = p_word * entroy
205 | # Improvement: smooth the term frequency with a sigmoid function,
206 | # or drop the term-frequency factor entirely, which also works better.
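# In formula form, the score used here to rank candidate feature words is the expected cross-entropy
#   ECE(word) = sum over classes of  P(class|word) * log( P(class|word) / P(class) )
# optionally multiplied by P(word) (or by sigmoid(P(word))), as discussed above.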
207 | def sigmoid(x):
208 | return 1/(1+np.exp(-x))
209 | # entroy = sigmoid(p_word) * entroy
210 | print entroy[0]
211 |
212 | logging.debug('=' * 20)
213 |
214 | logging.debug('进行特征词选择..')
215 | logging.debug('-' * 20)
216 | sort_index = np.argsort(entroy)[-1::-1]
217 | vocabulary = np.asarray(vocabulary)
218 | # print ','.join(vocabulary[sort_index])
219 | # print entroy[sort_index]
220 | logging.debug(u'期望交叉熵top 10:%s'%(','.join(vocabulary[sort_index[:10]])))
221 | logging.debug('大小分别为:%s'%(entroy[sort_index[:10]]))
222 |
223 | logging.debug('-' * 20)
224 | keywords = vocabulary[sort_index[:config['max_keywords']]]
225 |
226 | logging.debug('选取%d个词作为关键词,实际为:%d个'%(config['max_keywords'],len(keywords)))
227 | # print('选取%d个词作为关键词,实际为:%d'%(config['max_keywords'],len(keywords)))
228 | logging.debug(u'关键词分别为(按权重大到小):%s'%(','.join(keywords)))
229 | # print(u'关键词分别为(按权重大到小):%s'%(','.join(keywords)))
230 | logging.debug('-' * 20)
231 |
232 |
233 |
234 |
235 | logging.debug('=' * 20)
236 | logging.debug('生成TFIDF特征向量...')
237 | # TFIDF 字典
238 | tfidf_vocabulary = {item:idx for idx,item in enumerate(keywords)}
239 |
240 | tfidf_vectorizer = TfidfVectorizer(analyzer="word",
241 | token_pattern=u'(?u)\\b\w+\\b',
242 | tokenizer=None,
243 | preprocessor=None,
244 | lowercase=False,
245 | stop_words=None,
246 | vocabulary = tfidf_vocabulary,
247 | max_features=config['max_keywords'])
248 |
249 | exam_bow_fea = tfidf_vectorizer.fit_transform(train_data['WORDS'].as_matrix()).toarray()
250 | print "test: ",len(exam_bow_fea)
251 |
252 | f = open("result.txt","w")
253 | dictionary = tfidf_vectorizer.get_feature_names()
254 | dictionary = [ (word) for word in dictionary]
255 |
256 | print "dictionary length: ",len(dictionary)
257 | print len(test_data['LABEL'])
258 |
259 |
260 | print >> f , (",".join(dictionary))
261 |
262 | print >> f,"Loading wrod2vec file"
263 | model = Word2Vec.load('weibodata_vectorB.gem')
264 |
265 | print >> f,''.join(u'替换方法2: 直接找出词典中与之最相近的词:') # alternative replacement strategy, results pending
266 |
267 | list = []
268 |
269 | for sentences in test_data['WORDS']:
270 | temp = ""
271 | tempWord = "" #word不可以每次都改变
272 | sentence = sentences.split(" ")
273 | for word in sentence: #对于每一个单词
274 | if word not in dictionary: # the word is not in the TFIDF dictionary
275 | #print ''.join(u'单词不在词典之中')
276 | if word == "!": #空格 则下一个
277 | # print ''.join(u'空格跳出循环')
278 | temp = temp + " " + word
279 | continue
280 | if word == "?":
281 | temp = temp + " " + word
282 | continue
283 |
284 | origin = 0
285 | count = 0
286 | for word_in_dict in dictionary :
287 |
288 | #print ''.join(u'开始计算不在tfidf字典中的单词与字典中单词的相近程度')
289 | #print "count: ",count
290 | if count > 20:
291 | break
292 |
293 | #print "尝试计算 " + word + " 与 " + word_in_dict + "的相似度"
294 | try:
295 | similar = abs(model.similarity(word, word_in_dict))
296 |
297 | except Exception:
298 | # print word_in_dict + " 或 " + word + " 不在w2v字典,匹配下一个"
299 | count += 1
300 | continue
301 | print >> f,(word + " 与 " + word_in_dict + " 的相似度为:" + str(similar))
302 | if similar > origin:
303 | origin = similar
304 | #print "Before: ",word
305 | #print "temp: ",word_in_dict
306 | tempWord = word_in_dict # remember the closest in-dictionary word seen so far; the original word is only overwritten after the scan
307 | #print word + " 被替换为: " + word_in_dict
308 | word = tempWord
309 | temp = temp + " " + word
310 |
311 | list.append(temp)
312 |
313 | print >> f,''.join("替换完成,开始计算tfidf:")
314 |
315 | test_data['WORDS'].to_csv("origin.csv")
316 | test_data['WORDS'] = list
317 | test_data['WORDS'].to_csv("final.csv")
318 |
319 |
320 | exam_bow_fea_test = tfidf_vectorizer.transform(test_data['WORDS'].as_matrix()).toarray()
321 |
322 | print len(exam_bow_fea_test)
323 |
324 | exam_bow_fea_target = train_data['LABEL']
325 | print len(exam_bow_fea_target)
326 |
327 | exam_bow_fea_test_target = test_data['LABEL']
328 | print len(exam_bow_fea_test_target)
329 |
330 |
331 |
332 | print '计算最大熵模型'
333 |
334 |
335 | print "Training MaxEnt"
336 | # rf = RandomForestClassifier(n_estimators=200)
337 | # clf = rf
338 | clf = LogisticRegression(multi_class="multinomial",solver="newton-cg")
339 | clf.fit(exam_bow_fea,exam_bow_fea_target)
340 |
341 | # print exam_bow_fea_test_target
342 | # print exam_bow_fea_test_data
343 | exam_bow_fea_test_target = test_data['LABEL']
344 | print len(exam_bow_fea_test_target)
345 | print len(exam_bow_fea_test)
346 | print len(test_data)
347 |
348 | exam_bow_fea_test_target.to_csv("target_true.csv")
349 | print >> f,",".join(clf.predict(exam_bow_fea_test))
350 | print >> f,sum(exam_bow_fea_test_target == clf.predict(exam_bow_fea_test))
351 | print >> f,sum(exam_bow_fea_test_target == clf.predict(exam_bow_fea_test))/(len(test_data)*1.0)
352 |
353 |
--------------------------------------------------------------------------------
/cnn.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import pandas as pd
3 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
4 | import pandas as pd
5 | import yaml
6 | from keras.utils import np_utils, generic_utils
7 | import pickle
8 | from keras.models import Sequential, Model
9 | from keras.layers import Embedding, Convolution2D, Input, Activation, MaxPooling2D, Reshape, Dropout, Dense, \
10 | Flatten, Merge
11 | from keras.optimizers import SGD
12 | from keras.models import model_from_json
13 | from sklearn.preprocessing import OneHotEncoder
14 | import numpy as np;
15 | np.random.seed(1337) # for reproducibility
16 | import random
17 |
18 | #
19 | # config = yaml.load(file('config_my_cnn.yaml')) #读取yaml配置文件
20 | # config = config['OriginBow'] #以字典的方式读取2
21 |
22 |
23 | '''
24 | The CNN structure can be changed at any time;
25 | the current settings are the result of one randomly sampled configuration.
26 |
27 |
28 | '''
29 |
30 |
31 |
32 | nb_pool = [2,1]
33 | nb_classes = 24
34 |
35 |
36 |
37 | def onehotcoder(train_data,test_data):
38 |
39 | '''
40 | One-hot encoding of the word features (the seq-style encoding in the paper).
41 | :param train_data: training data
42 | :param test_data: test data
43 | :return:
44 | '''
45 |
46 | vect = CountVectorizer()
47 | train_data_bow_fea = vect.fit_transform(train_data['WORDS']).toarray()
48 | # Keras expects 4-D input; reshape to [n_samples, 1, length, 1] first
49 | test_data_bow_fea = vect.transform(test_data['WORDS']).toarray()
50 |
51 | length = len(vect.vocabulary_)
52 | values = []
53 | for i in range(10):
54 | values.append(length)
55 | print len(values)
56 | code = OneHotEncoder(categorical_features=np.array([1,2,3,4,5,6,7,8,9,10]),n_values=values) # 10 positions, each with vocabulary-size possible values
57 | train_feature = code.fit_transform(train_data_bow_fea).toarray() # encode
58 | test_feature = code.transform(test_data_bow_fea).toarray()
59 |
60 | # print "训练集:"
61 | # print "每个词的维度:",code.n_values_
62 |
63 | train_onehot = []
64 | # print "单词总数:",len(train_feature)
65 | # print "每行总长度", len(train_feature[1]) * 935 - 1
66 |
67 | for i in range(len(train_feature)):
68 | train_one_hot_col = []
69 | t = 0
70 | while True:
71 | # print "剩下 " + str( (len( train_feature[i])) - t ) + " 维"
72 | if ((len( train_feature[i])) - t) < 0:
73 | break
74 | a = train_feature[i][t:t+935]
75 | t += 935
76 | b = train_feature[i][t:t+935]
77 | c = [a[m]+b[m] for m in range(min(len(a),len(b)))] # element-wise sum of the two regions
78 | for k in c:
79 | train_one_hot_col.append(k)
80 | train_onehot.append(train_one_hot_col)
81 |
82 | print "最终维度:",len(train_onehot[0])
83 |
84 |
85 | # print "测试集:"
86 | # print "每个词的维度:",code.n_values_
87 |
88 | test_onehot = []
89 | # print "单词总数:",len(test_feature)
90 | # print "每行总长度", len(test_feature[1]) * 935 - 1
91 |
92 | for i in range(len(test_feature)):
93 | test_one_hot_col = []
94 | t = 0
95 | while True:
96 | # print "剩下 " + str( (len( test_feature[i])) - t ) + " 维"
97 | if ((len( test_feature[i])) - t) < 0:
98 | break
99 | a = test_feature[i][t:t+935]
100 | t += 935
101 | b = test_feature[i][t:t+935]
102 | c = [a[m]+b[m] for m in range(min(len(a),len(b)))] # element-wise sum of the two regions
103 | for k in c:
104 | test_one_hot_col.append(k)
105 | test_onehot.append(test_one_hot_col)
106 |
107 | print "最终维度:",len(test_onehot[0])
108 | print len(test_onehot)
109 |
110 | return train_onehot,test_onehot
111 |
112 |
113 | def build(layer1,layer2,hidden1,hidden2,length,width,lr=0.001 ,decay=1e-6,momentum=0.9):
114 | '''
115 | Build the CNN.
116 | :param layer1: number of convolution kernels in the first layer
117 | :param layer2: number of convolution kernels in the second layer
118 | :param hidden1: size of the first fully connected hidden layer
119 | :param hidden2: size of the second fully connected hidden layer
120 | :param length: input length
121 | :param width: input width
122 | :param lr: learning rate
123 | :param decay: learning-rate decay
124 | :param momentum: momentum
125 | :return: the assembled CNN model
126 | '''
127 | #16*5*1
128 |
129 | layer1_model1=Sequential()
130 | layer1_model1.add(Convolution2D(layer1, 2, 1,
131 | border_mode='valid',
132 | input_shape=(1, length, 1)))
133 | layer1_model1.add(Activation('tanh'))
134 | layer1_model1.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1])))
135 |
136 | #16*10*1
137 | layer1_model2=Sequential()
138 | layer1_model2.add(Convolution2D(layer1, 4, 1,
139 | border_mode='valid',
140 | input_shape=(1, length, 1)))
141 | layer1_model2.add(Activation('tanh'))
142 | layer1_model2.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1])))
143 |
144 | #16*20*1
145 | layer1_model3=Sequential()
146 | layer1_model3.add(Convolution2D(layer1, 6, 1,
147 | border_mode='valid',
148 | input_shape=(1, length, 1)))
149 | layer1_model3.add(Activation('tanh'))
150 | layer1_model3.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1])))
151 |
152 |
153 |
154 | model = Sequential()
155 |
156 | model.add(Merge([layer1_model2,layer1_model1,layer1_model3], mode='concat',concat_axis=2))#merge
157 |
158 |
159 |
160 | model.add(Convolution2D(layer2,3,1))#layer2 32*5*1
161 | model.add(Activation('tanh'))
162 | model.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[1])))
163 | model.add(Dropout(0.25))
164 |
165 | model.add(Flatten()) # flatten
166 |
167 | model.add(Dense(hidden1)) #Full connection 1: 1000
168 | model.add(Activation('tanh'))
169 | model.add(Dropout(0.5))
170 |
171 |
172 | model.add(Dense(hidden2)) #Full connection 2: 200
173 | model.add(Activation('tanh'))
174 | model.add(Dropout(0.5))
175 |
176 | model.add(Dense(nb_classes))
177 | model.add(Activation('softmax'))
178 |
179 | sgd = SGD(lr=lr, decay=decay, momentum=momentum, nesterov=True)
180 | model.compile(loss='categorical_crossentropy', optimizer=sgd,metrics=["accuracy"])
181 |
182 | # initialisation should happen before the return
183 |
184 | return model
185 | #
186 |
187 | def load_data(file_name):
188 | import csv
189 |
190 | csvfile = file(file_name, 'rb')
191 | reader = csv.reader(csvfile)
192 |
193 | label = []
194 | data = []
195 | for line in reader:
196 | label.append(line[0])
197 | data.append(line[1:len(line)])
198 |
199 | # print label
200 | # print data
201 | csvfile.close()
202 | return data,label
203 |
204 |
205 | if __name__ == '__main__':
206 |
207 | # data sets
208 | train_data_bow_fea,train_data_label = load_data("v2.3_train_Sa_word_seg_i1_dev_830.csv")
209 | test_data_bow_fea,test_data_label = load_data("v2.3_train_Sa_word_seg_i1_val_76.csv")
210 |
211 | train_data_bow_fea_v1,train_data_label_v1 = load_data("v2.3_train_Sa_word_seg_i2_dev_555.csv")
212 | test_data_bow_fea_v1,test_data_label_v1 = load_data("v2.3_train_Sa_word_seg_i2_val_275.csv")
213 | # #
214 |
215 |
216 | train_data_bow_fea_v2,train_data_label_v2 = load_data("v2.3_train_Sa_word_seg_i3_dev_553.csv")
217 | test_data_bow_fea_v2,test_data_label_v2 = load_data("v2.3_train_Sa_word_seg_i3_val_277.csv")
218 |
219 |
220 | train_data_bow_fea_v3,train_data_label_v3 = load_data("v2.3_train_Sa_word_seg_i4_dev_552.csv")
221 | test_data_bow_fea_v3,test_data_label_v3 = load_data("v2.3_train_Sa_word_seg_i4_val_278.csv")
222 |
223 |
224 | sentence_width = len(train_data_bow_fea)
225 | sentence_length = len(train_data_bow_fea[1])
226 |
227 | sentence_width_v1 = len(train_data_bow_fea_v1)
228 | sentence_length_v1 = len(train_data_bow_fea_v1[1])
229 |
230 | sentence_width_v2 = len(train_data_bow_fea_v2)
231 | sentence_length_v2 = len(train_data_bow_fea_v2[1])
232 |
233 | sentence_width_v3 = len(train_data_bow_fea_v3)
234 | sentence_length_v3 = len(train_data_bow_fea_v3[1])
235 |
236 | print sentence_width
237 | print sentence_length
238 |
239 | train_data_bow_fea = np.array(train_data_bow_fea).reshape(len(train_data_bow_fea), 1, len(train_data_bow_fea[1]), 1)
240 | # Keras expects 4-D input; reshape to [n_samples, 1, length, 1] first
241 | test_data_bow_fea = np.array(test_data_bow_fea).reshape(len(test_data_bow_fea), 1, len(test_data_bow_fea[1]), 1)
242 |
243 |
244 | train_data_bow_fea_v1 = np.array(train_data_bow_fea_v1).reshape(len(train_data_bow_fea_v1), 1, len(train_data_bow_fea_v1[1]), 1)
245 | # Keras expects 4-D input; reshape to [n_samples, 1, length, 1] first
246 | test_data_bow_fea_v1 = np.array(test_data_bow_fea_v1).reshape(len(test_data_bow_fea_v1), 1, len(test_data_bow_fea_v1[1]), 1)
247 |
248 |
249 | train_data_bow_fea_v2 = np.array(train_data_bow_fea_v2).reshape(len(train_data_bow_fea_v2), 1, len(train_data_bow_fea_v2[1]), 1)
250 | # Keras expects 4-D input; reshape to [n_samples, 1, length, 1] first
251 | test_data_bow_fea_v2 = np.array(test_data_bow_fea_v2).reshape(len(test_data_bow_fea_v2), 1, len(test_data_bow_fea_v2[1]), 1)
252 |
253 | train_data_bow_fea_v3 = np.array(train_data_bow_fea_v3).reshape(len(train_data_bow_fea_v3), 1, len(train_data_bow_fea_v3[1]), 1)
254 | # Keras expects 4-D input; reshape to [n_samples, 1, length, 1] first
255 | test_data_bow_fea_v3 = np.array(test_data_bow_fea_v3).reshape(len(test_data_bow_fea_v3), 1, len(test_data_bow_fea_v3[1]), 1)
256 | # possible extension: also convolve over the feature dimension
257 | # possible extension: vary other parameters
258 |
259 | print '句子数:',sentence_width
260 | print '维度总数:',sentence_length
261 |
262 | label_train = train_data_label
263 | label_train = np_utils.to_categorical(label_train, 24) # 必须使用固定格式表示标签
264 | label_test = test_data_label
265 | label_test = np_utils.to_categorical(label_test, 24) # 必须使用固定格式表示标签
266 |
267 |
268 |
269 | label_train_v1 = train_data_label_v1
270 | label_train_v1 = np_utils.to_categorical(label_train_v1, 24) # 必须使用固定格式表示标签
271 | label_test_v1 = test_data_label_v1
272 | label_test_v1 = np_utils.to_categorical(label_test_v1, 24) # 必须使用固定格式表示标签
273 |
274 | label_train_v2 = train_data_label_v2
275 | label_train_v2 = np_utils.to_categorical(label_train_v2, 24) # 必须使用固定格式表示标签
276 | label_test_v2 = test_data_label_v2
277 | label_test_v2 = np_utils.to_categorical(label_test_v2, 24) # 必须使用固定格式表示标签
278 |
279 | label_train_v3 = train_data_label_v3
280 | label_train_v3 = np_utils.to_categorical(label_train_v3, 24) # 必须使用固定格式表示标签
281 | label_test_v3 = test_data_label_v3
282 | label_test_v3 = np_utils.to_categorical(label_test_v3, 24) # 必须使用固定格式表示标签
283 |
284 |
285 | # layer1_model1 = [10,9,11]
286 | # layer2_model = [30,31,29]
287 | # hidden1_model = [1000,980,1020]
288 | # hidden2_model = [100,80,120]
289 | #
290 | # c = 5
291 |
292 | # layer1_model1 = [5, 6, 4]
293 | # layer2_model = [30, 31, 29]
294 | # hidden1_model = [1000, 980, 1020]
295 | # hidden2_model = [450, 430, 470]
296 | #
297 | # c = 4
298 |
299 | # layer1_model1 = [10, 11, 9]
300 | # layer2_model = [30, 31, 29]
301 | # hidden1_model = [1000, 980, 1020]
302 | # hidden2_model = [450, 430, 470]
303 | #
304 | # c = 3
305 |
306 | # layer1_model1 = [10, 11, 9]
307 | # layer2_model = [15, 14, 16]
308 | # hidden1_model = [1000, 980, 1020]
309 | # hidden2_model = [300, 280, 320]
310 | #
311 | # c = 2
312 |
313 | layer1_model1 = [10, 11, 9]
314 | layer2_model = [30,31, 29]
315 | hidden1_model = [1000, 980, 1020]
316 | hidden2_model = [300, 280, 320]
317 |
318 | c = 1
319 | print c
320 | plan = []
321 | for i in range(0, len( layer1_model1)):
322 | for j in range(0, len( layer2_model)):
323 | for k in range(0, len( layer2_model)):
324 | for m in range(0, len( layer2_model)):
325 | plan.append([layer1_model1[i],layer2_model[j],hidden1_model[k],hidden2_model[m]])
326 |
327 | random.shuffle(plan)
328 |
329 |
330 |
331 |
332 |
333 | u = 0
334 |
335 | # for layer1 in layer1_model1: #4,6
336 | # for layer2 in layer2_model: #[6,8]
337 | # for hidden1 in hidden1_model:
338 | # for hidden2 in hidden2_model:
339 | for i in range(20):
340 |
341 | layer1 = plan[i][0]
342 | layer2 = plan[i][1]
343 | hidden1 = plan[i][2]
344 | hidden2 = plan[i][3]
345 |
346 | f = open('result.txt','a')
347 |
348 | print 'layer1: ', layer1
349 | print 'layer2: ', layer2
350 | print 'hidden1: ', hidden1
351 | print 'hidden2: ', hidden2
352 |
353 | print >> f, 'layer1: ', layer1
354 | print >> f,'layer2: ', layer2
355 | print >> f,'hidden1: ', hidden1
356 | print >> f,'hidden2: ', hidden2
357 |
358 |
359 |
360 | # different convolution kernels mean different weights
361 |
362 | model = build( layer1,layer2,hidden1,hidden2,sentence_length,sentence_width)
363 |
364 |
365 | model.fit([train_data_bow_fea,train_data_bow_fea,train_data_bow_fea],label_train, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0)
366 |
367 |
368 | print '测试准确率:'
369 | print model.metrics_names
370 | print model.evaluate([test_data_bow_fea,test_data_bow_fea,test_data_bow_fea],label_test,show_accuracy=True)
371 |
372 | print >> f,'测试准确率:'
373 | print >> f,model.metrics_names
374 | print >> f,model.evaluate([test_data_bow_fea, test_data_bow_fea, test_data_bow_fea], label_test, show_accuracy=True)
375 |
376 | acc = model.evaluate([test_data_bow_fea,test_data_bow_fea,test_data_bow_fea],label_test,show_accuracy=True)[1]
377 |
378 |
379 |
380 | #v1
381 | model = build( layer1,layer2,hidden1,hidden2,sentence_length_v1,sentence_width_v1)
382 | model.fit([train_data_bow_fea_v1,train_data_bow_fea_v1,train_data_bow_fea_v1],label_train_v1, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0)
383 | acc_v1 = model.evaluate([test_data_bow_fea_v1,test_data_bow_fea_v1,test_data_bow_fea_v1],label_test_v1,show_accuracy=True)[1]
384 | #v2
385 | model = build( layer1,layer2,hidden1,hidden2,sentence_length_v2,sentence_width_v2)
386 | model.fit([train_data_bow_fea_v2,train_data_bow_fea_v2,train_data_bow_fea_v2],label_train_v2, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0)
387 | acc_v2 = model.evaluate([test_data_bow_fea_v2,test_data_bow_fea_v2,test_data_bow_fea_v2],label_test_v2,show_accuracy=True)[1]
388 |
389 | #v3
390 | model = build( layer1,layer2,hidden1,hidden2,sentence_length_v3,sentence_width_v3)
391 | model.fit([train_data_bow_fea_v3,train_data_bow_fea_v3,train_data_bow_fea_v3],label_train_v3, batch_size=32, nb_epoch=30,shuffle=True,verbose=1,validation_split=0)
392 | acc_v3 = model.evaluate([test_data_bow_fea_v3,test_data_bow_fea_v3,test_data_bow_fea_v3],label_test_v3,show_accuracy=True)[1]
393 |
394 |
395 | import csv
396 |
397 | csvfile = file('result_word&charact_Best' + str(c) + '_Random.csv', 'a')
398 | writer = csv.writer(csvfile)
399 | if u == 0:
400 | writer.writerow(['layer1', 'layer2', 'hidden1','hidden2','val_acc','test_acc'])
401 | u += 1
402 |
403 | data = [
404 | (layer1, layer2, hidden1,hidden2,acc,((acc_v1 + acc_v2 + acc_v3)/(3*1.0)) )
405 | ]
406 | writer.writerows(data)
407 | csvfile.close()
408 |
409 | # import csv
410 | #
411 | # csvfile = file('result_word.csv', 'a')
412 | # writer = csv.writer(csvfile)
413 | # if u == 0:
414 | # writer.writerow(['layer1', 'layer2', 'hidden1','hidden2','val_acc','test_acc'])
415 | # u += 1
416 | #
417 | # data = [
418 | # (layer1, layer2, hidden1,hidden2,"",model.evaluate([test_data_bow_fea,test_data_bow_fea,test_data_bow_fea],label_test,show_accuracy=True)[1]),
419 | # ]
420 | # writer.writerows(data)
421 | # csvfile.close()
422 |
423 | f.close()
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------