├── README.md
├── lgb_dc.py
├── mlp.py
└── ridge.py

/README.md:
--------------------------------------------------------------------------------
# public_praise_prediction_yunyi

2018 云移杯 scenic-spot review score prediction. Ranked 7/1186 as team "云南我来了".

Competition page: http://www.datafountain.cn/?u=7609847&&#/competitions/283/intro

Given a user's review text, predict the corresponding score in the range 0~5; this is essentially a sentiment-prediction problem.

### Overview

Three models (lightgbm, ridge and mlp) each predict the review score, each reaching roughly 0.54 on its own, and their outputs are blended with a weighted average. The aim is to make every model as strong, and as different from the others, as possible.

- Data preprocessing

  Chinese word segmentation comes first. We tried SnowNLP and jieba; the segmenters perform about the same, so the three models deliberately use different ones to increase diversity. For embeddings we compared word2vec (doc2vec) against TF-IDF, and the best choice differs a lot between models: the mlp uses TF-IDF while lightgbm uses word2vec. Our guess is that word2vec features are themselves produced by a neural network, so stacking another neural network (the mlp) on top brings little extra gain. word2vec carries richer semantics, but on this kind of sentiment task it is not clearly superior. We also found that removing stop words sometimes hurts: a token such as "!" actually expresses strong emotion and turns out to be a strong feature. (A minimal segmentation + TF-IDF sketch follows this list.)

- Model

  The embeddings above are fed into each model. Treating the task as 5-class classification worked poorly; predicting a real value in the range 0~5 works better. One small trick: any prediction above 4.7 is rounded up to 5, which gives a small improvement. Modeling is an iterative process of trial and tuning, including GridSearch for hyper-parameters. The mlp is a shallow fully connected network (192-64-64-1); sentiment prediction does not need an especially deep or complex network. (A clipping-and-blending sketch also follows this list.)
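A minimal sketch of the segmentation + TF-IDF step described above, assuming only jieba, pandas and scikit-learn; the example reviews are made up, and the parameters simply mirror the style used in mlp.py / lgb_dc.py:

```python
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Two made-up reviews standing in for the 'Discuss' column of the competition CSVs.
df = pd.DataFrame({'Discuss': ['风景很美,值得一去!', '排队太久,体验一般。']})

# Segment with jieba and re-join with spaces; no explicit stop-word removal
# (the README notes that dropping stop words can hurt).
df['text'] = df['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))

# Unigram + bigram TF-IDF, the same token_pattern / ngram_range style as the repo's models.
vec = TfidfVectorizer(token_pattern=r'\w+', ngram_range=(1, 2), max_features=300000)
X = vec.fit_transform(df['text'])
print(X.shape)  # sparse (n_reviews, n_features) matrix fed to the downstream model
```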
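And a hedged sketch of the final step, clipping predictions above 4.7 to 5 and blending the three models with a weighted average; the per-model predictions and the weights below are placeholders for illustration, not the values actually used:

```python
import numpy as np

# Hypothetical test-set predictions from the three models (placeholders).
pred_lgb = np.array([4.80, 3.20])
pred_mlp = np.array([4.60, 3.40])
pred_ridge = np.array([4.90, 3.10])

# Assumed blending weights, for illustration only; the real ones were tuned on validation scores.
w_lgb, w_mlp, w_ridge = 0.4, 0.3, 0.3
blend = w_lgb * pred_lgb + w_mlp * pred_mlp + w_ridge * pred_ridge

# The ">4.7 -> 5" trick described above (see also mlp.py and ridge.py).
blend = np.where(blend > 4.7, 5.0, blend)
print(blend)
```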
### File structure

- lgb_dc.py: lightgbm score-prediction model
- mlp.py: mlp score-prediction model
- ridge.py: ridge (linear model) score-prediction model

### Contact

[cnmengnan@gmail.com](mailto:cnmengnan@gmail.com)

blog: [WinterColor blog](http://www.cnblogs.com/mengnan/)
--------------------------------------------------------------------------------
/lgb_dc.py:
--------------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import jieba
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.model_selection import KFold
from contextlib import contextmanager
from functools import partial
from operator import itemgetter
from multiprocessing.pool import ThreadPool
import time
from typing import List, Dict
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from six.moves import cPickle as pickle
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec

resource_path='resource/lgb/'
data_path='data/'

@contextmanager
def timer(name):
    start=time.perf_counter()  # time.clock() was deprecated and removed in Python 3.8
    yield
    print(f'[{name}] done in {time.perf_counter() - start:.0f} s')

def get_dataset_x(df:pd.DataFrame)->pd.DataFrame:
    df['text']=df['Discuss'].fillna(' ')
    df['len_text']=df['Discuss'].apply(lambda x:len(x))
    return df[['text','len_text']]

def get_d2v_x(train_docs:list,train_tags:list,test_docs:list,vec_size=200)->tuple:
    # Build TaggedDocument objects (jieba tokens + score tags) for doc2vec training.
    trainX=[]
    for i,text in enumerate(train_docs):
        word_list=' '.join(jieba.cut(text))
        document=TaggedDocument(word_list.split(' '),train_tags[i])
        trainX.append(document)

    if os.path.exists(resource_path+'d2v.model'):
        model_dv=Doc2Vec.load(resource_path+'d2v.model')
    else:
        # train doc2vec ("size" is the gensim 3.x parameter name; gensim 4 renamed it to "vector_size")
        with timer('train doc2vec'):
            model_dv=Doc2Vec(trainX,min_count=1, window=3, size=vec_size, sample=1e-3, negative=5, workers=4)
            model_dv.train(trainX, total_examples=model_dv.corpus_count, epochs=70)
            model_dv.save(resource_path+'d2v.model')

    # Infer a vector for every train / test document.
    train_vec=[]
    for x in train_docs:
        word_list=' '.join(jieba.cut(x))
        train_vec.append(model_dv.infer_vector(word_list.split(' ')))

    test_vec=[]
    for x in test_docs:
        word_list=' '.join(jieba.cut(x))
        test_vec.append(model_dv.infer_vector(word_list.split(' ')))

    return train_vec,test_vec

def on_field(f:str,*vec)->Pipeline:
    return make_pipeline(FunctionTransformer(itemgetter(f),validate=False),*vec)

def to_records(df:pd.DataFrame)->List[Dict]:
    return df.to_dict(orient='records')

def model_lgb(trainX,testX,trainY)->np.ndarray:
    # Shift labels from 1..5 to 0..4 for lightgbm's multiclass objective.
    trainY=[x-1 for x in trainY]

    params = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class':5,
        'is_training_metric':True,
        'early_stopping':10,
        'sub_feature': 0.7,
        'num_leaves': 60,
        'min_data': 100,
        'min_hessian': 1,
        'verbose': -1,
    }

    trainX,validX,trainY,validY=train_test_split(trainX,trainY,test_size=0.2,random_state=1)

    lgb_train=lgb.Dataset(data=trainX,label=trainY)
    lgb_valid=lgb.Dataset(data=validX,label=validY,reference=lgb_train)

    gbm=lgb.train(params=params,train_set=lgb_train,num_boost_round=2300,valid_sets=lgb_valid)
    gbm.save_model(resource_path+'lgb.model')

    pred=gbm.predict(testX,num_iteration=gbm.best_iteration)

    # temporary: cache the raw class probabilities
    with open(resource_path+'result.pik','wb') as f:
        pickle.dump(pred,f)

    # Convert class probabilities back to a 1..5 score.
    pred=[np.argmax(x)+1 for x in pred]

    return pred

def model_svm(trainX,testX,trainY):
    clf=SVC()
    clf.fit(trainX,trainY)
    result=clf.predict(testX)
    return result

def store_result(pred_ls):
    test=pd.read_csv(data_path+'test.csv')
    test['pred']=pred_ls
    test[['Id','pred']].to_csv(resource_path+'result.csv',index=None,header=None)

def main1():
    vectorizer=make_union(
        on_field('text',Tfidf(max_features=300000,token_pattern=r'\w+',ngram_range=(1,2))),
        on_field(['len_text'],FunctionTransformer(to_records,validate=False),DictVectorizer())
    )

    with timer('process train'):
        if os.path.exists(resource_path+'dataset.pik'):
            with open(resource_path+'dataset.pik','rb') as f:
                trainX,testX,trainY=pickle.load(f)
        else:
            train=pd.read_csv(data_path+'train.csv')
            train['Discuss']=train['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

            test=pd.read_csv(data_path+'test.csv')
            test['Discuss']=test['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

            train=train[train['Score']>0].reset_index(drop=True)
            trainY=train['Score'].values
            trainX=vectorizer.fit_transform(get_dataset_x(train)).astype(np.float32)
            # transform (not fit_transform) so train and test share the same feature space
            testX=vectorizer.transform(get_dataset_x(test)).astype(np.float32)

            sk=SelectKBest(chi2,k=100000)
            trainX=sk.fit_transform(trainX,trainY)
            testX=sk.transform(testX)

            with open(resource_path+'dataset.pik','wb') as f:
                pickle.dump((trainX,testX,trainY),f)

        print(f'trainX: {trainX.shape} of {trainX.dtype} with {type(trainX)}')
        print(f'testX: {testX.shape} of {testX.dtype} with {type(testX)}')
    # pred=model_lgb(trainX,testX,trainY)
    pred=model_svm(trainX,testX,trainY)
    store_result(pred)

def main():
    with timer('process train'):
        train=pd.read_csv(data_path+'train.csv')
        test=pd.read_csv(data_path+'test.csv')

        # Wrap each score in a list so it can be passed as TaggedDocument tags.
        score_list=train['Score'].values.tolist()
        score_list=[[x] for x in score_list]
        train_vec,test_vec=get_d2v_x(train['Discuss'].values.tolist(),score_list,test['Discuss'].values.tolist())
        trainX=pd.DataFrame(train_vec)
        trainX['len_text']=train['Discuss'].apply(lambda x:len(x))
        trainY=train['Score'].values

        testX=pd.DataFrame(test_vec)
        testX['len_text']=test['Discuss'].apply(lambda x:len(x))  # was train['Discuss'], a copy-paste bug

        print(f'trainX: {trainX.shape} with {type(trainX)}')
        print(f'testX: {testX.shape} with {type(testX)}')
    # pred=model_lgb(trainX,testX,trainY)
    pred=model_svm(trainX,testX,trainY)
    store_result(pred)


if __name__ == '__main__':
    # Scratch experiments, kept commented out for reference:
    # sentence=[b'waste of time.', b'a shit movie.', b'a nb movie.', b'I love this movie!', b'shit.', b'worth my money.', b'sb movie.', b'worth it!']
    # sentence=['他','你我','这']
    # print(get_cv_one_vec(sentence).toarray())

    # sentence=['它会检查你已经拥有的库文件是否有更新的版本。','这个问题真是郁闷了我一天,网上也是各种找解决方案']
    # tf=TfidfVectorizer(token_pattern=r'\w+',ngram_range=(1,2),binary=True)
    # # print(tf.fit_transform(sentence).toarray())
    # arr=tf.fit_transform(sentence)
    # x=[x.astype(np.bool).astype(np.float32) for x in arr]
    # print(x[1].toarray())
    #
    # with open(resource_path + 'result.pik', 'rb') as f:
    #     pred=pickle.load(f)
    # pred=[np.argmax(x)+1 for x in pred]
    # print(len(pred))
    # store_result(pred)

    # train_vec,test_vec=get_d2v_x(['根据官方api探索性的做了些尝试。','infer新文档向量'],[[0],[1]],['后期会继续改进'])
    # print(train_vec)
    # print(type(train_vec))
    # print(test_vec)

    # main1()
    main()

--------------------------------------------------------------------------------
/mlp.py:
--------------------------------------------------------------------------------
import os
os.environ['OMP_NUM_THREADS'] = '4'
from contextlib import contextmanager
from functools import partial
from operator import itemgetter
from multiprocessing.pool import ThreadPool
import time
from typing import List, Dict

import tensorflow as tf
import keras as ks
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import math
import jieba
from snownlp import SnowNLP
from snownlp import sentiment
import multiprocessing as mp
from six.moves import cPickle as pickle

data_path='data/'
resource_path='resource/'

@contextmanager  # context manager that prints how long the wrapped block took
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    # Takes a DataFrame with 'Discuss' and 'len_discuss' columns and returns the model input columns.
    df['text']=df['Discuss'].fillna('')
    return df[['text','len_discuss']]

def on_field(f: str, *vec) -> Pipeline:  # *vec accepts a variable number of transformer steps
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

def to_records(df: pd.DataFrame) -> List[Dict]:
    # Converts {'a':[1,2,3],'b':[4,5,6]} into [{'a':1,'b':4},{'a':2,'b':5},{'a':3,'b':6}]
    return df.to_dict(orient='records')

def get_nouns(sentences:list)->set:
    noun_list=[]
    for s in sentences:
        # print('sentence',s)
        st=SnowNLP(s)
        tags=list(st.tags)
        # print('tags',tags)
        for index,word in enumerate(st.words):
            if tags[index][1]=='n':
                noun_list.append(word)
                # print('word',word)
    return set(noun_list)

def get_keywords(sentences:list)->set:
    keywords_list=[]
    for s in sentences:
        st=SnowNLP(s)
        keywords_list.extend(st.keywords(limit=3))
    return set(keywords_list)

# def get_noun(sentence):
#     s=SnowNLP(sentence)
#     noun_word_list=[]
#     for index,word in enumerate(s.words):
#         if s.tags[index]=='n':
#             noun_word_list.append(word)
#     return noun_word_list

def filter_noun(nouns:list):
    # Keep only nouns whose sentiment score is strongly positive or strongly negative.
    nouns_filtered=[]
    for x in nouns:
        s=SnowNLP(x)
        if s.sentiments>=0.7 or s.sentiments<=0.3:  # was "s.sentences<=0.3", a typo
            nouns_filtered.append(x)
    return nouns_filtered

def seg_sentence(sentence)->list:
    s=SnowNLP(sentence)
    return s.words

def fit_predict(xs, y_train) -> np.ndarray:
    # xs is one [X_train, X_test] pair; pool.map calls this once per feature representation.
    X_train, X_test = xs
    print("X_train:",X_train.shape)
    print("X_test:",X_test.shape)
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, use_per_session_threads=1, inter_op_parallelism_threads=1)
    with tf.Session(graph=tf.Graph(), config=config) as sess, timer('fit_predict'):
        ks.backend.set_session(sess)
        # Shallow fully connected net: 192-64-64-1, trained on sparse inputs.
        model_in = ks.Input(shape=(X_train.shape[1],), dtype='float32', sparse=True)
        out = ks.layers.Dense(192, activation='relu')(model_in)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(1)(out)
        model = ks.Model(model_in, out)
        model.compile(loss='mean_squared_error', optimizer=ks.optimizers.Adam(lr=3e-3))
        for i in range(3):
            with timer(f'epoch {i + 1}'):
                # Double the batch size each epoch: 2048, 4096, 8192.
                model.fit(x=X_train, y=y_train, batch_size=2**(11 + i), epochs=1, verbose=0)
        return model.predict(X_test)[:, 0]

def main(evaluation=True):
    # make_union combines the individual feature blocks.
    vectorizer = make_union(
        # on_field('name', Tfidf(max_features=1000, token_pattern='\w+')),
        # TF-IDF over the segmented text: keep the 300000 most frequent tokens,
        # \w+ matches word characters, and use unigrams + bigrams.
        on_field('text', Tfidf(max_features=300000, token_pattern=r'\w+', ngram_range=(1, 2))),
        # Review length as a dict-vectorized numeric feature.
        on_field(['len_discuss'],FunctionTransformer(to_records,validate=False),DictVectorizer()),
        # on_field(['shipping', 'item_condition_id'],
        #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=1)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv(data_path+'train_split.csv')
        train['len_discuss']=train['Discuss'].apply(lambda x:len(x))
        train['Discuss']=train['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

        test=pd.read_csv(data_path+"dev_split.csv")
        test['len_discuss']=test['Discuss'].apply(lambda x:len(x))
        test['Discuss']=test['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))
        y_true=None
        if evaluation:
            y_true=test['Score'].values

        ##################### noun features (disabled)
        # print('load noun set...')
        #
        # if os.path.exists(resource_path+'noun_set.pik'):
        #     with open(resource_path+'noun_set.pik','rb') as f:
        #         noun_set=pickle.load(f)
        #     # noun_set=filter_noun(noun_set)
        # else:
        #     noun_set=get_nouns(train['Discuss'].values)
        #     with open(resource_path+'noun_set.pik','wb') as f:
        #         pickle.dump(noun_set,f)
        #     # noun_set=filter_noun(noun_set)
        #
        # print(f'noun size:{len(noun_set)}')
        #######################

        ###################### keyword features
        print('load keyword set...')

        if os.path.exists(resource_path+'keyword_set.pik'):
            with open(resource_path+'keyword_set.pik','rb') as f:
                keyword_set=pickle.load(f)
        else:
            keyword_set=get_keywords(train['Discuss'].values)
            with open(resource_path+'keyword_set.pik','wb') as f:
                pickle.dump(keyword_set,f)

        print(f'keyword size:{len(keyword_set)}')
        ######################

        train = train[train['Score'] > 0].reset_index(drop=True)  # keep only rows with a valid score (> 0)
        # cv = KFold(n_splits=10, shuffle=True, random_state=42)
        # train_ids, valid_ids = next(cv.split(train))
        # valid=train.iloc[valid_ids]
        # train=train.iloc[train_ids]
        y_train_start=train['Score'].values
        y_train=y_scaler.fit_transform(train['Score'].values.reshape(-1,1))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        X_test=vectorizer.transform(preprocess(test)).astype(np.float32)

        # y_test=valid['Score']

        sk=SelectKBest(chi2,k=100000)
        X_train=sk.fit_transform(X_train,y_train_start)
        X_test=sk.transform(X_test)

        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        print(f'X_test: {X_test.shape} of {X_test.dtype}')
    # del train
    # with timer('process valid'):
    #     X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=6) as pool:
        # Xb is binary word presence, X keeps the TF-IDF weights.
        Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_test]]

        ############################### noun features (disabled)
        # vec=CountVectorizer(binary=True,tokenizer=seg_sentence)
        # vec.fit(noun_set)
        # Xn_train,Xn_valid=[vec.transform(x) for x in [train['Discuss'].values,test['Discuss'].values]]
        ##################################

        ############################# keyword features
        if os.path.exists(resource_path+'keyword_train.pik'):
            # was 'resource_train.pik', which did not match the existence check above
            with open(resource_path+'keyword_train.pik','rb') as f:
                Xk_train,Xk_valid=pickle.load(f)
        else:
            vec=CountVectorizer(binary=True,tokenizer=seg_sentence)
            vec.fit(keyword_set)
            Xk_train,Xk_valid=[vec.transform(x) for x in [train['Discuss'].values,test['Discuss'].values]]
            with open(resource_path+'keyword_train.pik','wb') as f:
                pickle.dump([Xk_train,Xk_valid],f)
        #############################

        ############# Option 1 (disabled): concatenate the keyword block into each input
        # Xb_a_train=np.concatenate([Xb_train,Xk_train],axis=1)
        # Xb_a_valid=np.concatenate([Xb_valid,Xk_valid],axis=1)
        # X_a_train=np.concatenate([X_train,Xk_train],axis=1)
        # X_a_test=np.concatenate([X_test,Xk_valid],axis=1)

        ############# Option 2: one network per representation.
        # Each entry of xs is fed to fit_predict separately; *2 duplicates the list so every
        # representation is fitted twice and the predictions are averaged.
        xs = [[Xb_train, Xb_valid], [X_train, X_test],[Xk_train,Xk_valid]]*2

        print(len(xs),len(xs[0]))
        # print(len(xs[1]))
        xx=pool.map(partial(fit_predict, y_train=y_train), xs)
        print(len(xx))
        y_pred = np.mean(xx,axis=0)
        y_pred=y_scaler.inverse_transform(y_pred)
        # print(y_pred)

    # Clip: any prediction above 4.7 becomes 5.
    pre=[]
    for i in y_pred:
        if i>4.7:
            pre.append(5)
        else:
            pre.append(i)

    if evaluation and y_true is not None:
        print('the score is :',evaluate(y_true,pre))

    result=pd.DataFrame({'ID':test.Id,'Discuss':test.Discuss,'Score':pre})
    result.to_csv('MLP_simple_jieba_stopword_chibest.csv',header=None,index=None)

def evaluate(y_true,y_prediction):
    # Competition metric: 1 / (1 + RMSE).
    return 1.0/(1.0+np.sqrt((mean_squared_error(y_pred=y_prediction,y_true=y_true))))

if __name__ == '__main__':
    main(evaluation=True)

--------------------------------------------------------------------------------
/ridge.py:
--------------------------------------------------------------------------------
import re
import pandas as pd
import matplotlib.pyplot as plt
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import scipy
from sklearn.model_selection import KFold
# from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix, hstack
import numpy as np

def get_data():
    train = pd.read_csv('data/train_hebing.csv')
    # train2 = pd.read_csv('D:/yunyibei/result/3train0302.csv')
    # train.drop_duplicates(subset='Discuss', keep='first',inplace=True)
    test = pd.read_csv('data/predict_second.csv')
    # data1= pd.concat([train, train2])
    data = pd.concat([train, test])
    print('train %s test %s'%(train.shape,test.shape))
    print('train columns',train.columns)
    return data,train.shape[0],train['Score'],test['Id']


def split_discuss(data):
    # split_discuss scores higher than clean_str: it keeps letters/digits and does no stop-word removal.
    data['length'] = data['Discuss'].apply(lambda x:len(x))
    data['Discuss'] = data['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))
    return data

# preprocessing
def pre_process():
    data,nrw_train,y,test_id = get_data()
    # y = np.where(y>=4,5,y)
    # y = np.where(y<4,1,y)
    data = split_discuss(data)
    # data['Discuss'] = data['Discuss'].map(lambda x : clean_str(x))
    # data['length'] = data['Discuss'].apply(lambda x:len(x))
    # data=clean_str(data)
    # cv = CountVectorizer(ngram_range=(1,2))
    # discuss = cv.fit_transform(data['Discuss'])
    # Character 1-6 gram TF-IDF plus word 1-2 gram hashing features, stacked side by side.
    tf = TfidfVectorizer(ngram_range=(1,6),analyzer='char')
    ha = HashingVectorizer(ngram_range=(1,2),lowercase=False)
    discuss_ha = ha.fit_transform(data['Discuss'])

    discuss_tf = tf.fit_transform(data['Discuss'])
    print("bb")
    data = hstack((discuss_tf,discuss_ha)).tocsr()
    # length = csr_matrix(pd.get_dummies(data['length'],sparse=True).values)
    # data = hstack((discuss,discuss_tf)).tocsr()
    # print(data.shape)
    return data[:nrw_train],data[nrw_train:],y,test_id

def xx_mse_s(y_true,y_pre):
    # Competition metric, 1 / (1 + RMSE), computed on integer-truncated predictions.
    y_pre = pd.DataFrame({'res':list(y_pre)})
    y_pre['res'] = y_pre['res'].astype(int)
    return 1 / ( 1 + mean_squared_error(y_true,y_pre['res'].values)**0.5)


X,test,y,test_id = pre_process()
print('aa')
print(y.shape)
print(X.shape)
print(test.shape)
print('.....')
kf = KFold(n_splits=3,shuffle=True,random_state=42)
cv_pred = []
kf = kf.split(X)
xx_mse = []
model_1 = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, normalize=False, tol=0.01)

for i ,(train_fold,test_fold) in enumerate(kf):
    X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold]
    model_1.fit(X_train, label_train)

    val_ = model_1.predict(X=X_validate)
    print(xx_mse_s(label_validate, val_))

    cv_pred.append(model_1.predict(test))
    xx_mse.append(xx_mse_s(label_validate, val_))

print('xx_result',np.mean(xx_mse))

# Average the test predictions of the three fold models.
s = 0
for i in cv_pred:
    s = s + i
s = s/3

res = pd.DataFrame()
# Clip: predictions above 4.7 become 5 (same trick as in mlp.py).
s = np.where(s>4.7,5,s)
# s = np.where((s>1.9) & (s<2.5),2,s)
# s = np.where(s>3.5 and s<=4.8,4,s)
res['Id'] = list(test_id)
res['pre'] = list(s)

res.to_csv('result/result.csv',index=False,header=False)


# 0.581334990703 0.528227538116 0.48691
--------------------------------------------------------------------------------