├── README.md
└── 新浪微博互动预测.py

/README.md:
--------------------------------------------------------------------------------
# Tianchi project: Sina Weibo interaction prediction

This project is quite interesting: time series plus text mining. Both were new to me, and I learned a lot from them.
A clearly written article on working with time series: https://www.jianshu.com/p/e81ab6846214
Time span of the project: the training set covers February-July 2015, and the prediction set is August 2015.
I therefore split the data into rolling windows: train on Feb-Apr, predict May; train on Mar-May, predict Jun; train on Apr-Jun, predict Jul; and finally train on Feb-Jul to predict Aug.

Feature engineering:

User features:
- maximum comment, like, and forward counts
- total comment, like, and forward counts
- mean comment, like, and forward counts
- total number of posts per user

Time features:
- day of the week
- whether the post falls on a weekend
- posting period, in 5 buckets: early morning, morning, afternoon, evening, night

Post features:
Segment the text with jieba and extract keywords. The top 40 (with weights) over the first 10,000 posts:
[('...', 0.621514918030454), ('http', 0.2620447278302564), ('cn', 0.22392989946109595), ('红包', 0.0683616840195235), ('分享', 0.04735239014792837), ('打车', 0.0330619091787988), ('微博', 0.028256930809000994), ('2015', 0.02537066170997315), ('抽到', 0.022718078481150834), ('代金券', 0.01745600303503346), ('##', 0.01608703474574434), ('微盘', 0.015603918212659863), ('10', 0.015585687400090639), ('RAIN', 0.014911976008327894), ('抢红包', 0.01339633254518637), ('一个', 0.012809292333074929), ('JIHOON', 0.012663785349222944), ('文章', 0.012582612157540411), ('今天', 0.011443909235602808), ('视频', 0.01130905553212397), ('支付宝', 0.011130217143888016), ('现金', 0.01106126112415989), ('技术', 0.01083031209003015), ('羊年', 0.010433413756469983), ('我刚', 0.01030538114140404), ('互联网', 0.009558458441420841), ('自己', 0.009212253174121643), ('学习', 0.009186835304770708), ('一起', 0.009058810297090275), ('发红包', 0.009025999075583241), ('数据', 0.00897946475658576), ('....', 0.008800510431144331), ('客户端', 0.008775568523716314), ('doge', 0.008384516435246549), ('rain', 0.008372915009066131), ('手机', 0.008159279852325165), ('20', 0.008088680067645933), ('发出', 0.008050112517136382), ('钱包', 0.008047451882325371), ('刚刚', 0.00798569583889755)]
From these, I flag posts that contain any of the keywords: http, 红包, 分享, 打车, cn, 微博, ##, @, 【, 代金券, 2015.

The model is xgboost, which has consistently worked well for me.

The dataset is large (1.4M+ rows) and my machine could not keep up; keyword extraction over the full post corpus in particular ran for hours without finishing, so I left it at a small sample. My code may also simply be inefficient; something to optimize when I get the chance.
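For reference, a minimal sketch of how a weighted keyword list like the one above can be produced with jieba's TF-IDF extractor (the file name and the 10,000-post slice mirror the script below; the exact call used originally is not preserved, so treat this as illustrative):

```python
import jieba.analyse
import pandas as pd

# Load the raw tab-separated training file (no header row).
cols = ['uid', 'mid', 'time', 'forward_count',
        'comment_count', 'like_count', 'content']
df = pd.read_table('weibo_train_data.txt', sep='\t', names=cols)

# Join the first 10,000 posts and extract the 40 highest-weighted
# keywords by TF-IDF, as in the list above.
text = ' '.join(df['content'].head(10000).astype(str))
print(jieba.analyse.extract_tags(text, topK=40, withWeight=True))
```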
--------------------------------------------------------------------------------
/新浪微博互动预测.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# pd.read_table already returns a DataFrame; no extra wrapping needed.
data_train = pd.read_table(r'C:\Users\朝花夕拾\Desktop\机器学习\tianchi\新浪微博互动预测\weibo_train_data.txt', sep='\t')
data_test = pd.read_table(r'C:\Users\朝花夕拾\Desktop\机器学习\tianchi\新浪微博互动预测\weibo_predict_data.txt', sep='\t')
data_train.columns = ['uid', 'mid', 'time', 'forward_count', 'comment_count', 'like_count', 'content']
data_train['type'] = 'train'
data_test.columns = ['uid', 'mid', 'time', 'content']
data_test['type'] = 'test'
'''print('forwards:', data_train['forward_count'].describe().astype(int))
print('comments:', data_train['comment_count'].describe().astype(int))
print('likes:', data_train['like_count'].describe().astype(int))'''
# print(data_train.info())
# print(data_test.info())

# ignore_index avoids duplicate row labels after stacking train and test,
# which would break the .loc[index, 'content'] lookups below.
data_all = pd.concat([data_train, data_test], axis=0, sort=False, ignore_index=True)
data_all['time'] = pd.to_datetime(data_all['time'])
data_all['month'] = data_all.time.dt.month
data_all['hour'] = data_all.time.dt.hour
data_all['weekday'] = data_all.time.dt.weekday
'''data_all['hour'].value_counts().sort_index().plot(kind='bar')
plt.show()'''
# temp = data_all.loc[data_all['month'].isin([2, 3, 4, 5]), 'content']

# Bucket the posting hour into the README's five periods; only the 0-7
# boundary is certain, the other cut points are assumed.
def hour_cut(x):
    if 0 <= x <= 7:       # early morning
        return 0
    elif 7 < x <= 11:     # morning
        return 1
    elif 11 < x <= 17:    # afternoon
        return 2
    elif 17 < x <= 19:    # evening
        return 3
    else:                 # night
        return 4

data_all['hour_cut'] = data_all['hour'].map(hour_cut)
data_all['is_weekend'] = (data_all['weekday'] >= 5).astype(int)

# Per-user aggregates, computed on the training rows only (the test rows
# carry no interaction counts).
uid_group = data_all[data_all['type'] == 'train'].groupby('uid')
uid_and_commentCount = uid_group['comment_count'].sum()
uid_and_commentMean = uid_group['comment_count'].mean()
uid_and_commentMax = uid_group['comment_count'].max()
uid_and_likeCount = uid_group['like_count'].sum()
uid_and_likeMean = uid_group['like_count'].mean()
uid_and_likeMax = uid_group['like_count'].max()
uid_and_forwardCount = uid_group['forward_count'].sum()
uid_and_forwardMean = uid_group['forward_count'].mean()
uid_and_forwardMax = uid_group['forward_count'].max()
uid_and_contentCount = uid_group['content'].count()

# Merge the per-user aggregates back onto every row.
data_all['uid_and_commentCount'] = data_all['uid'].map(uid_and_commentCount).fillna(0)
data_all['uid_and_commentMean'] = data_all['uid'].map(uid_and_commentMean).fillna(0)
data_all['uid_and_commentMax'] = data_all['uid'].map(uid_and_commentMax).fillna(0)
data_all['uid_and_likeCount'] = data_all['uid'].map(uid_and_likeCount).fillna(0)
data_all['uid_and_likeMean'] = data_all['uid'].map(uid_and_likeMean).fillna(0)
data_all['uid_and_likeMax'] = data_all['uid'].map(uid_and_likeMax).fillna(0)
data_all['uid_and_forwardCount'] = data_all['uid'].map(uid_and_forwardCount).fillna(0)
data_all['uid_and_forwardMean'] = data_all['uid'].map(uid_and_forwardMean).fillna(0)
data_all['uid_and_forwardMax'] = data_all['uid'].map(uid_and_forwardMax).fillna(0)
data_all['uid_and_contentCount'] = data_all['uid'].map(uid_and_contentCount).fillna(0)

# Keyword-presence flags (see README). Only the first 100 posts are
# processed here; the full 1.4M-row corpus was too slow.
keyword_flags = {'http': 'http', '红包': 'hongbao', '分享': 'fengxiang',
                 '打车': 'dache', 'cn': 'cn', '微博': 'weibo', '##': 'topic',
                 '@': 'ai', '[': 'zhuangfa', '代金券': 'daijinjuan',
                 '2015': 'nianfen'}
for col in keyword_flags.values():
    data_all[col] = 0
for index in data_all.loc[0:100, 'content'].index:
    for word in jieba.cut(str(data_all.loc[index, 'content'])):
        if word in keyword_flags:
            data_all.loc[index, keyword_flags[word]] = 1

data_all = data_all.drop(['uid', 'mid', 'time', 'content', 'type'], axis=1)

# First rolling window: train on Feb-Apr, validate on May (see README).
targets = ['forward_count', 'comment_count', 'like_count']
train1 = data_all[data_all['month'].isin([2, 3, 4])]
test1 = data_all[data_all['month'] == 5]
'''train2 = data_all[data_all['month'].isin([3, 4, 5])]
test2 = data_all[data_all['month'] == 6]
train3 = data_all[data_all['month'].isin([4, 5, 6])]
test3 = data_all[data_all['month'] == 7]'''

y_train = train1[targets]
X_train = train1.drop(targets, axis=1)
y_test = test1[targets]
X_test = test1.drop(targets, axis=1)

# One regressor per interaction type; MSE is the metric (accuracy only
# applies to classification, not to these continuous predictions).
for t in targets:
    model_xgb = xgb.XGBRegressor(max_depth=4, colsample_bytree=0.1,
                                 learning_rate=0.1, n_estimators=32,
                                 min_child_weight=2)
    model_xgb.fit(X_train, y_train[t])
    xgb_pred = model_xgb.predict(X_test)
    print(t, 'mse:', mean_squared_error(y_test[t], xgb_pred))
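# Sketch of the last step from the README: train on Feb-Jul 2015 and
# predict Aug. The competition's submission format is not reproduced here,
# so this only collects the three predicted columns; names are illustrative.
final_train = data_all[data_all['month'].isin([2, 3, 4, 5, 6, 7])]
final_test = data_all[data_all['month'] == 8]
X_full = final_train.drop(targets, axis=1)
X_aug = final_test.drop(targets, axis=1)
pred_aug = pd.DataFrame(index=final_test.index)
for t in targets:
    m = xgb.XGBRegressor(max_depth=4, colsample_bytree=0.1,
                         learning_rate=0.1, n_estimators=32,
                         min_child_weight=2)
    m.fit(X_full, final_train[t])
    pred_aug[t] = m.predict(X_aug)
print(pred_aug.head())
--------------------------------------------------------------------------------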