├── README.md
├── lgb_dc.py
├── mlp.py
└── ridge.py

/README.md:
--------------------------------------------------------------------------------
# public_praise_prediction_yunyi

2018 云移杯 scenic-spot review score prediction. Ranked 7/1186 as team "云南我来了".

Competition page: http://www.datafountain.cn/?u=7609847&&#/competitions/283/intro

Given a user's review text, predict the corresponding score in the range 0~5; this is essentially a sentiment-prediction problem.

### Overview

Three models (lightgbm, ridge and mlp) each predict the review score, each reaching roughly 0.54 on its own, and their outputs are blended with a weighted average. The aim is to make every model as strong, and as different from the others, as possible.

- Data preprocessing

  Chinese word segmentation comes first. We tried SnowNLP and jieba; the segmenters perform about the same, so the three models deliberately use different ones to increase diversity. For embeddings we compared word2vec (doc2vec) against TF-IDF, and the best choice differs a lot between models: the mlp uses TF-IDF while lightgbm uses word2vec. Our guess is that word2vec features are themselves produced by a neural network, so stacking another neural network (the mlp) on top brings little extra gain. word2vec carries richer semantics, but on this kind of sentiment task it is not clearly superior. We also found that removing stop words sometimes hurts: a token such as "!" actually expresses strong emotion and turns out to be a strong feature. (A minimal segmentation + TF-IDF sketch follows this list.)

- Model

  The embeddings above are fed into each model. Treating the task as 5-class classification worked poorly; predicting a real value in the range 0~5 works better. One small trick: any prediction above 4.7 is rounded up to 5, which gives a small improvement. Modeling is an iterative process of trial and tuning, including GridSearch for hyper-parameters. The mlp is a shallow fully connected network (192-64-64-1); sentiment prediction does not need an especially deep or complex network. (A clipping-and-blending sketch also follows this list.)
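A minimal sketch of the segmentation + TF-IDF step described above, assuming only jieba, pandas and scikit-learn; the example reviews are made up, and the parameters simply mirror the style used in mlp.py / lgb_dc.py:

```python
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Two made-up reviews standing in for the 'Discuss' column of the competition CSVs.
df = pd.DataFrame({'Discuss': ['风景很美,值得一去!', '排队太久,体验一般。']})

# Segment with jieba and re-join with spaces; no explicit stop-word removal
# (the README notes that dropping stop words can hurt).
df['text'] = df['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))

# Unigram + bigram TF-IDF, the same token_pattern / ngram_range style as the repo's models.
vec = TfidfVectorizer(token_pattern=r'\w+', ngram_range=(1, 2), max_features=300000)
X = vec.fit_transform(df['text'])
print(X.shape)  # sparse (n_reviews, n_features) matrix fed to the downstream model
```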
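And a hedged sketch of the final step, clipping predictions above 4.7 to 5 and blending the three models with a weighted average; the per-model predictions and the weights below are placeholders for illustration, not the values actually used:

```python
import numpy as np

# Hypothetical test-set predictions from the three models (placeholders).
pred_lgb = np.array([4.80, 3.20])
pred_mlp = np.array([4.60, 3.40])
pred_ridge = np.array([4.90, 3.10])

# Assumed blending weights, for illustration only; the real ones were tuned on validation scores.
w_lgb, w_mlp, w_ridge = 0.4, 0.3, 0.3
blend = w_lgb * pred_lgb + w_mlp * pred_mlp + w_ridge * pred_ridge

# The ">4.7 -> 5" trick described above (see also mlp.py and ridge.py).
blend = np.where(blend > 4.7, 5.0, blend)
print(blend)
```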
### File structure

- lgb_dc.py: lightgbm score-prediction model
- mlp.py: mlp score-prediction model
- ridge.py: ridge (linear model) score-prediction model

### Contact

[cnmengnan@gmail.com](mailto:cnmengnan@gmail.com)

blog: [WinterColor blog](http://www.cnblogs.com/mengnan/)
--------------------------------------------------------------------------------
/lgb_dc.py:
--------------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import jieba
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.model_selection import KFold
from contextlib import contextmanager
from functools import partial
from operator import itemgetter
from multiprocessing.pool import ThreadPool
import time
from typing import List, Dict
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from six.moves import cPickle as pickle
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec

resource_path='resource/lgb/'
data_path='data/'

@contextmanager
def timer(name):
    start=time.perf_counter()  # time.clock() was deprecated and removed in Python 3.8
    yield
    print(f'[{name}] done in {time.perf_counter() - start:.0f} s')

def get_dataset_x(df:pd.DataFrame)->pd.DataFrame:
    df['text']=df['Discuss'].fillna(' ')
    df['len_text']=df['Discuss'].apply(lambda x:len(x))
    return df[['text','len_text']]

def get_d2v_x(train_docs:list,train_tags:list,test_docs:list,vec_size=200)->tuple:
    # Build TaggedDocument objects (jieba tokens + score tags) for doc2vec training.
    trainX=[]
    for i,text in enumerate(train_docs):
        word_list=' '.join(jieba.cut(text))
        document=TaggedDocument(word_list.split(' '),train_tags[i])
        trainX.append(document)

    if os.path.exists(resource_path+'d2v.model'):
        model_dv=Doc2Vec.load(resource_path+'d2v.model')
    else:
        # train doc2vec ("size" is the gensim 3.x parameter name; gensim 4 renamed it to "vector_size")
        with timer('train doc2vec'):
            model_dv=Doc2Vec(trainX,min_count=1, window=3, size=vec_size, sample=1e-3, negative=5, workers=4)
            model_dv.train(trainX, total_examples=model_dv.corpus_count, epochs=70)
            model_dv.save(resource_path+'d2v.model')

    # Infer a vector for every train / test document.
    train_vec=[]
    for x in train_docs:
        word_list=' '.join(jieba.cut(x))
        train_vec.append(model_dv.infer_vector(word_list.split(' ')))

    test_vec=[]
    for x in test_docs:
        word_list=' '.join(jieba.cut(x))
        test_vec.append(model_dv.infer_vector(word_list.split(' ')))

    return train_vec,test_vec

def on_field(f:str,*vec)->Pipeline:
    return make_pipeline(FunctionTransformer(itemgetter(f),validate=False),*vec)

def to_records(df:pd.DataFrame)->List[Dict]:
    return df.to_dict(orient='records')

def model_lgb(trainX,testX,trainY)->np.ndarray:
    # Shift labels from 1..5 to 0..4 for lightgbm's multiclass objective.
    trainY=[x-1 for x in trainY]

    params = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class':5,
        'is_training_metric':True,
        'early_stopping':10,
        'sub_feature': 0.7,
        'num_leaves': 60,
        'min_data': 100,
        'min_hessian': 1,
        'verbose': -1,
    }

    trainX,validX,trainY,validY=train_test_split(trainX,trainY,test_size=0.2,random_state=1)

    lgb_train=lgb.Dataset(data=trainX,label=trainY)
    lgb_valid=lgb.Dataset(data=validX,label=validY,reference=lgb_train)

    gbm=lgb.train(params=params,train_set=lgb_train,num_boost_round=2300,valid_sets=lgb_valid)
    gbm.save_model(resource_path+'lgb.model')

    pred=gbm.predict(testX,num_iteration=gbm.best_iteration)

    # temporary: cache the raw class probabilities
    with open(resource_path+'result.pik','wb') as f:
        pickle.dump(pred,f)

    # Convert class probabilities back to a 1..5 score.
    pred=[np.argmax(x)+1 for x in pred]

    return pred

def model_svm(trainX,testX,trainY):
    clf=SVC()
    clf.fit(trainX,trainY)
    result=clf.predict(testX)
    return result

def store_result(pred_ls):
    test=pd.read_csv(data_path+'test.csv')
    test['pred']=pred_ls
    test[['Id','pred']].to_csv(resource_path+'result.csv',index=None,header=None)

def main1():
    vectorizer=make_union(
        on_field('text',Tfidf(max_features=300000,token_pattern=r'\w+',ngram_range=(1,2))),
        on_field(['len_text'],FunctionTransformer(to_records,validate=False),DictVectorizer())
    )

    with timer('process train'):
        if os.path.exists(resource_path+'dataset.pik'):
            with open(resource_path+'dataset.pik','rb') as f:
                trainX,testX,trainY=pickle.load(f)
        else:
            train=pd.read_csv(data_path+'train.csv')
            train['Discuss']=train['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

            test=pd.read_csv(data_path+'test.csv')
            test['Discuss']=test['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

            train=train[train['Score']>0].reset_index(drop=True)
            trainY=train['Score'].values
            trainX=vectorizer.fit_transform(get_dataset_x(train)).astype(np.float32)
            # transform (not fit_transform) so train and test share the same feature space
            testX=vectorizer.transform(get_dataset_x(test)).astype(np.float32)

            sk=SelectKBest(chi2,k=100000)
            trainX=sk.fit_transform(trainX,trainY)
            testX=sk.transform(testX)

            with open(resource_path+'dataset.pik','wb') as f:
                pickle.dump((trainX,testX,trainY),f)

        print(f'trainX: {trainX.shape} of {trainX.dtype} with {type(trainX)}')
        print(f'testX: {testX.shape} of {testX.dtype} with {type(testX)}')
    # pred=model_lgb(trainX,testX,trainY)
    pred=model_svm(trainX,testX,trainY)
    store_result(pred)

def main():
    with timer('process train'):
        train=pd.read_csv(data_path+'train.csv')
        test=pd.read_csv(data_path+'test.csv')

        # Wrap each score in a list so it can be passed as TaggedDocument tags.
        score_list=train['Score'].values.tolist()
        score_list=[[x] for x in score_list]
        train_vec,test_vec=get_d2v_x(train['Discuss'].values.tolist(),score_list,test['Discuss'].values.tolist())
        trainX=pd.DataFrame(train_vec)
        trainX['len_text']=train['Discuss'].apply(lambda x:len(x))
        trainY=train['Score'].values

        testX=pd.DataFrame(test_vec)
        testX['len_text']=test['Discuss'].apply(lambda x:len(x))  # was train['Discuss'], a copy-paste bug

        print(f'trainX: {trainX.shape} with {type(trainX)}')
        print(f'testX: {testX.shape} with {type(testX)}')
    # pred=model_lgb(trainX,testX,trainY)
    pred=model_svm(trainX,testX,trainY)
    store_result(pred)


if __name__ == '__main__':
    # Scratch experiments, kept commented out for reference:
    # sentence=[b'waste of time.', b'a shit movie.', b'a nb movie.', b'I love this movie!', b'shit.', b'worth my money.', b'sb movie.', b'worth it!']
    # sentence=['他','你我','这']
    # print(get_cv_one_vec(sentence).toarray())

    # sentence=['它会检查你已经拥有的库文件是否有更新的版本。','这个问题真是郁闷了我一天,网上也是各种找解决方案']
    # tf=TfidfVectorizer(token_pattern=r'\w+',ngram_range=(1,2),binary=True)
    # # print(tf.fit_transform(sentence).toarray())
    # arr=tf.fit_transform(sentence)
    # x=[x.astype(np.bool).astype(np.float32) for x in arr]
    # print(x[1].toarray())
    #
    # with open(resource_path + 'result.pik', 'rb') as f:
    #     pred=pickle.load(f)
    # pred=[np.argmax(x)+1 for x in pred]
    # print(len(pred))
    # store_result(pred)

    # train_vec,test_vec=get_d2v_x(['根据官方api探索性的做了些尝试。','infer新文档向量'],[[0],[1]],['后期会继续改进'])
    # print(train_vec)
    # print(type(train_vec))
    # print(test_vec)

    # main1()
    main()

--------------------------------------------------------------------------------
/mlp.py:
--------------------------------------------------------------------------------
import os
os.environ['OMP_NUM_THREADS'] = '4'
from contextlib import contextmanager
from functools import partial
from operator import itemgetter
from multiprocessing.pool import ThreadPool
import time
from typing import List, Dict

import tensorflow as tf
import keras as ks
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import math
import jieba
from snownlp import SnowNLP
from snownlp import sentiment
import multiprocessing as mp
from six.moves import cPickle as pickle

data_path='data/'
resource_path='resource/'

@contextmanager  # context manager that prints how long the wrapped block took
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    # Takes a DataFrame with 'Discuss' and 'len_discuss' columns and returns the model input columns.
    df['text']=df['Discuss'].fillna('')
    return df[['text','len_discuss']]

def on_field(f: str, *vec) -> Pipeline:  # *vec accepts a variable number of transformer steps
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

def to_records(df: pd.DataFrame) -> List[Dict]:
    # Converts {'a':[1,2,3],'b':[4,5,6]} into [{'a':1,'b':4},{'a':2,'b':5},{'a':3,'b':6}]
    return df.to_dict(orient='records')

def get_nouns(sentences:list)->set:
    noun_list=[]
    for s in sentences:
        # print('sentence',s)
        st=SnowNLP(s)
        tags=list(st.tags)
        # print('tags',tags)
        for index,word in enumerate(st.words):
            if tags[index][1]=='n':
                noun_list.append(word)
                # print('word',word)
    return set(noun_list)

def get_keywords(sentences:list)->set:
    keywords_list=[]
    for s in sentences:
        st=SnowNLP(s)
        keywords_list.extend(st.keywords(limit=3))
    return set(keywords_list)

# def get_noun(sentence):
#     s=SnowNLP(sentence)
#     noun_word_list=[]
#     for index,word in enumerate(s.words):
#         if s.tags[index]=='n':
#             noun_word_list.append(word)
#     return noun_word_list

def filter_noun(nouns:list):
    # Keep only nouns whose sentiment score is strongly positive or strongly negative.
    nouns_filtered=[]
    for x in nouns:
        s=SnowNLP(x)
        if s.sentiments>=0.7 or s.sentiments<=0.3:  # was "s.sentences<=0.3", a typo
            nouns_filtered.append(x)
    return nouns_filtered

def seg_sentence(sentence)->list:
    s=SnowNLP(sentence)
    return s.words

def fit_predict(xs, y_train) -> np.ndarray:
    # xs is one [X_train, X_test] pair; pool.map calls this once per feature representation.
    X_train, X_test = xs
    print("X_train:",X_train.shape)
    print("X_test:",X_test.shape)
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, use_per_session_threads=1, inter_op_parallelism_threads=1)
    with tf.Session(graph=tf.Graph(), config=config) as sess, timer('fit_predict'):
        ks.backend.set_session(sess)
        # Shallow fully connected net: 192-64-64-1, trained on sparse inputs.
        model_in = ks.Input(shape=(X_train.shape[1],), dtype='float32', sparse=True)
        out = ks.layers.Dense(192, activation='relu')(model_in)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(1)(out)
        model = ks.Model(model_in, out)
        model.compile(loss='mean_squared_error', optimizer=ks.optimizers.Adam(lr=3e-3))
        for i in range(3):
            with timer(f'epoch {i + 1}'):
                # Double the batch size each epoch: 2048, 4096, 8192.
                model.fit(x=X_train, y=y_train, batch_size=2**(11 + i), epochs=1, verbose=0)
        return model.predict(X_test)[:, 0]

def main(evaluation=True):
    # make_union combines the individual feature blocks.
    vectorizer = make_union(
        # on_field('name', Tfidf(max_features=1000, token_pattern='\w+')),
        # TF-IDF over the segmented text: keep the 300000 most frequent tokens,
        # \w+ matches word characters, and use unigrams + bigrams.
        on_field('text', Tfidf(max_features=300000, token_pattern=r'\w+', ngram_range=(1, 2))),
        # Review length as a dict-vectorized numeric feature.
        on_field(['len_discuss'],FunctionTransformer(to_records,validate=False),DictVectorizer()),
        # on_field(['shipping', 'item_condition_id'],
        #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=1)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv(data_path+'train_split.csv')
        train['len_discuss']=train['Discuss'].apply(lambda x:len(x))
        train['Discuss']=train['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

        test=pd.read_csv(data_path+"dev_split.csv")
        test['len_discuss']=test['Discuss'].apply(lambda x:len(x))
        test['Discuss']=test['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))
        y_true=None
        if evaluation:
            y_true=test['Score'].values

        ##################### noun features (disabled)
        # print('load noun set...')
        #
        # if os.path.exists(resource_path+'noun_set.pik'):
        #     with open(resource_path+'noun_set.pik','rb') as f:
        #         noun_set=pickle.load(f)
        #     # noun_set=filter_noun(noun_set)
        # else:
        #     noun_set=get_nouns(train['Discuss'].values)
        #     with open(resource_path+'noun_set.pik','wb') as f:
        #         pickle.dump(noun_set,f)
        #     # noun_set=filter_noun(noun_set)
        #
        # print(f'noun size:{len(noun_set)}')
        #######################

        ###################### keyword features
        print('load keyword set...')

        if os.path.exists(resource_path+'keyword_set.pik'):
            with open(resource_path+'keyword_set.pik','rb') as f:
                keyword_set=pickle.load(f)
        else:
            keyword_set=get_keywords(train['Discuss'].values)
            with open(resource_path+'keyword_set.pik','wb') as f:
                pickle.dump(keyword_set,f)

        print(f'keyword size:{len(keyword_set)}')
        ######################

        train = train[train['Score'] > 0].reset_index(drop=True)  # keep only rows with a valid score (> 0)
        # cv = KFold(n_splits=10, shuffle=True, random_state=42)
        # train_ids, valid_ids = next(cv.split(train))
        # valid=train.iloc[valid_ids]
        # train=train.iloc[train_ids]
        y_train_start=train['Score'].values
        y_train=y_scaler.fit_transform(train['Score'].values.reshape(-1,1))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        X_test=vectorizer.transform(preprocess(test)).astype(np.float32)

        # y_test=valid['Score']

        sk=SelectKBest(chi2,k=100000)
        X_train=sk.fit_transform(X_train,y_train_start)
        X_test=sk.transform(X_test)

        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        print(f'X_test: {X_test.shape} of {X_test.dtype}')
    # del train
    # with timer('process valid'):
    #     X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=6) as pool:
        # Xb is binary word presence, X keeps the TF-IDF weights.
        Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_test]]

        ############################### noun features (disabled)
        # vec=CountVectorizer(binary=True,tokenizer=seg_sentence)
        # vec.fit(noun_set)
        # Xn_train,Xn_valid=[vec.transform(x) for x in [train['Discuss'].values,test['Discuss'].values]]
        ##################################

        ############################# keyword features
        if os.path.exists(resource_path+'keyword_train.pik'):
            # was 'resource_train.pik', which did not match the existence check above
            with open(resource_path+'keyword_train.pik','rb') as f:
                Xk_train,Xk_valid=pickle.load(f)
        else:
            vec=CountVectorizer(binary=True,tokenizer=seg_sentence)
            vec.fit(keyword_set)
            Xk_train,Xk_valid=[vec.transform(x) for x in [train['Discuss'].values,test['Discuss'].values]]
            with open(resource_path+'keyword_train.pik','wb') as f:
                pickle.dump([Xk_train,Xk_valid],f)
        #############################

        ############# Option 1 (disabled): concatenate the keyword block into each input
        # Xb_a_train=np.concatenate([Xb_train,Xk_train],axis=1)
        # Xb_a_valid=np.concatenate([Xb_valid,Xk_valid],axis=1)
        # X_a_train=np.concatenate([X_train,Xk_train],axis=1)
        # X_a_test=np.concatenate([X_test,Xk_valid],axis=1)

        ############# Option 2: one network per representation.
        # Each entry of xs is fed to fit_predict separately; *2 duplicates the list so every
        # representation is fitted twice and the predictions are averaged.
        xs = [[Xb_train, Xb_valid], [X_train, X_test],[Xk_train,Xk_valid]]*2

        print(len(xs),len(xs[0]))
        # print(len(xs[1]))
        xx=pool.map(partial(fit_predict, y_train=y_train), xs)
        print(len(xx))
        y_pred = np.mean(xx,axis=0)
        y_pred=y_scaler.inverse_transform(y_pred)
        # print(y_pred)

    # Clip: any prediction above 4.7 becomes 5.
    pre=[]
    for i in y_pred:
        if i>4.7:
            pre.append(5)
        else:
            pre.append(i)

    if evaluation and y_true is not None:
        print('the score is :',evaluate(y_true,pre))

    result=pd.DataFrame({'ID':test.Id,'Discuss':test.Discuss,'Score':pre})
    result.to_csv('MLP_simple_jieba_stopword_chibest.csv',header=None,index=None)

def evaluate(y_true,y_prediction):
    # Competition metric: 1 / (1 + RMSE).
    return 1.0/(1.0+np.sqrt((mean_squared_error(y_pred=y_prediction,y_true=y_true))))

if __name__ == '__main__':
    main(evaluation=True)

--------------------------------------------------------------------------------
/ridge.py:
--------------------------------------------------------------------------------
import re
import pandas as pd
import matplotlib.pyplot as plt
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import scipy
from sklearn.model_selection import KFold
# from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix, hstack
import numpy as np

def get_data():
    train = pd.read_csv('data/train_hebing.csv')
    # train2 = pd.read_csv('D:/yunyibei/result/3train0302.csv')
    # train.drop_duplicates(subset='Discuss', keep='first',inplace=True)
    test = pd.read_csv('data/predict_second.csv')
    # data1= pd.concat([train, train2])
    data = pd.concat([train, test])
    print('train %s test %s'%(train.shape,test.shape))
    print('train columns',train.columns)
    return data,train.shape[0],train['Score'],test['Id']


def split_discuss(data):
    # split_discuss scores higher than clean_str: it keeps letters/digits and does no stop-word removal.
    data['length'] = data['Discuss'].apply(lambda x:len(x))
    data['Discuss'] = data['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))
    return data

# preprocessing
def pre_process():
    data,nrw_train,y,test_id = get_data()
    # y = np.where(y>=4,5,y)
    # y = np.where(y<4,1,y)
    data = split_discuss(data)
    # data['Discuss'] = data['Discuss'].map(lambda x : clean_str(x))
    # data['length'] = data['Discuss'].apply(lambda x:len(x))
    # data=clean_str(data)
    # cv = CountVectorizer(ngram_range=(1,2))
    # discuss = cv.fit_transform(data['Discuss'])
    # Character 1-6 gram TF-IDF plus word 1-2 gram hashing features, stacked side by side.
    tf = TfidfVectorizer(ngram_range=(1,6),analyzer='char')
    ha = HashingVectorizer(ngram_range=(1,2),lowercase=False)
    discuss_ha = ha.fit_transform(data['Discuss'])

    discuss_tf = tf.fit_transform(data['Discuss'])
    print("bb")
    data = hstack((discuss_tf,discuss_ha)).tocsr()
    # length = csr_matrix(pd.get_dummies(data['length'],sparse=True).values)
    # data = hstack((discuss,discuss_tf)).tocsr()
    # print(data.shape)
    return data[:nrw_train],data[nrw_train:],y,test_id

def xx_mse_s(y_true,y_pre):
    # Competition metric, 1 / (1 + RMSE), computed on integer-truncated predictions.
    y_pre = pd.DataFrame({'res':list(y_pre)})
    y_pre['res'] = y_pre['res'].astype(int)
    return 1 / ( 1 + mean_squared_error(y_true,y_pre['res'].values)**0.5)


X,test,y,test_id = pre_process()
print('aa')
print(y.shape)
print(X.shape)
print(test.shape)
print('.....')
kf = KFold(n_splits=3,shuffle=True,random_state=42)
cv_pred = []
kf = kf.split(X)
xx_mse = []
model_1 = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, normalize=False, tol=0.01)

for i ,(train_fold,test_fold) in enumerate(kf):
    X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold]
    model_1.fit(X_train, label_train)

    val_ = model_1.predict(X=X_validate)
    print(xx_mse_s(label_validate, val_))

    cv_pred.append(model_1.predict(test))
    xx_mse.append(xx_mse_s(label_validate, val_))

print('xx_result',np.mean(xx_mse))

# Average the test predictions of the three fold models.
s = 0
for i in cv_pred:
    s = s + i
s = s/3

res = pd.DataFrame()
# Clip: predictions above 4.7 become 5 (same trick as in mlp.py).
s = np.where(s>4.7,5,s)
# s = np.where((s>1.9) & (s<2.5),2,s)
# s = np.where(s>3.5 and s<=4.8,4,s)
res['Id'] = list(test_id)
res['pre'] = list(s)

res.to_csv('result/result.csv',index=False,header=False)


# 0.581334990703 0.528227538116 0.48691
--------------------------------------------------------------------------------