├── plot
│   ├── figure_1.png
│   ├── figure_11.png
│   ├── figure_12.png
│   ├── figure_13.png
│   ├── figure_14.png
│   ├── figure_15.png
│   ├── figure_16.png
│   ├── figure_17.png
│   ├── figure_18.png
│   ├── figure_2.png
│   ├── figure_3.png
│   ├── figure_4.png
│   ├── figure_5.png
│   ├── figure_6.png
│   ├── figure_7.png
│   ├── figure_8.png
│   ├── figure_9.png
│   └── plot.py
├── README.md
└── trick
    ├── runTime.py
    ├── evaluation.py
    ├── genUidStat.py
    ├── predict_with_fixed_value.py
    └── predict_by_search.py
--------------------------------------------------------------------------------
/plot/figure_*.png:
--------------------------------------------------------------------------------
(binary PNG plots, figures 1-9 and 11-18; each image is available at
https://raw.githubusercontent.com/wepe/weibo/HEAD/plot/figure_<N>.png)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

- long-tailed distributions for all users

![](https://github.com/wepe/weibo/blob/master/plot/figure_1.png)

- for a single user

![](https://github.com/wepe/weibo/blob/master/plot/figure_13.png)

![](https://github.com/wepe/weibo/blob/master/plot/figure_7.png)

![](https://github.com/wepe/weibo/blob/master/plot/figure_3.png)
--------------------------------------------------------------------------------
/trick/runTime.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"

"""
Decorator for measuring a function's run time.
"""

def runTime(func):
    def wrapper(*args, **kwargs):
        import time
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print "{0} run time: {1:.2f}s".format(func.__name__, t2 - t1)
        return result
    return wrapper
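
# Minimal usage sketch (illustrative only, not part of the original module):
# decorating a toy function and calling it prints its elapsed time.
if __name__ == "__main__":
    @runTime
    def _demo():
        import time
        time.sleep(0.1)

    _demo()  # prints something like "_demo run time: 0.10s"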
--------------------------------------------------------------------------------
/trick/evaluation.py:
--------------------------------------------------------------------------------
__author__ = 'Desmond & wepon'
__date__ = "2015/08/14"

"""
Evaluation function according to the official rule:
http://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.Grh4pl&raceId=5
"""

def _deviation(predict, real, kind):
    t = 5.0 if kind == 'f' else 3.0
    return abs(predict - real) / (real + t)


def _precision_i(fp, fr, cp, cr, lp, lr):
    return 1 - 0.5 * _deviation(fp, fr, 'f') - 0.25 * _deviation(cp, cr, 'c') - 0.25 * _deviation(lp, lr, 'l')


def _sgn(x):
    return 1 if x > 0 else 0


def _count_i(fr, cr, lr):
    x = fr + cr + lr
    return 101 if x > 100 else (x + 1)


def precision(real_and_predict):
    numerator, denominator = 0.0, 0.0
    for fr, cr, lr, fp, cp, lp in real_and_predict:
        numerator += _count_i(fr, cr, lr) * _sgn(_precision_i(fp, fr, cp, cr, lp, lr) - 0.8)
        denominator += _count_i(fr, cr, lr)
    return numerator / denominator
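
# Worked example (illustrative only, the numbers are made up): each row is
# (forward_real, comment_real, like_real, forward_pred, comment_pred, like_pred).
# The first post is predicted exactly (precision_i = 1 > 0.8, so it counts in the
# numerator); the second is far off (precision_i < 0.8, so it does not).
if __name__ == "__main__":
    demo = [(1, 0, 0, 1, 0, 0),
            (10, 5, 3, 0, 0, 0)]
    print "demo precision: {0:.4f}".format(precision(demo))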
--------------------------------------------------------------------------------
/trick/genUidStat.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"


import pandas as pd

def loadData():
    traindata = pd.read_csv("weibo_train_data.txt", header=None, sep='\t')
    traindata.columns = ["uid", "mid", "date", "forward", "comment", "like", "content"]

    testdata = pd.read_csv("weibo_predict_data.txt", header=None, sep='\t')
    testdata.columns = ["uid", "mid", "date", "content"]

    return traindata, testdata

# for every uid, generate statistics (min/max/median/mean) of forward, comment, like
def genUidStat():
    traindata, _ = loadData()
    train_stat = traindata[['uid', 'forward', 'comment', 'like']].groupby('uid').agg(['min', 'max', 'median', 'mean'])
    train_stat.columns = ['forward_min', 'forward_max', 'forward_median', 'forward_mean',
                          'comment_min', 'comment_max', 'comment_median', 'comment_mean',
                          'like_min', 'like_max', 'like_median', 'like_mean']
    train_stat = train_stat.apply(pd.Series.round)
    # store into a dictionary for constant-time lookup by uid
    stat_dic = {}
    for uid, stats in train_stat.iterrows():
        stat_dic[uid] = stats  # type(stats): pd.Series
    return stat_dic
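
# Tiny self-contained illustration (made-up data, not the real competition file)
# of the groupby/agg pattern used above: per-uid min/max/median/mean of counts.
if __name__ == "__main__":
    toy = pd.DataFrame({'uid':     ['u1', 'u1', 'u2'],
                        'forward': [0, 4, 1],
                        'comment': [0, 2, 0],
                        'like':    [1, 3, 0]})
    print toy.groupby('uid').agg(['min', 'max', 'median', 'mean'])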
--------------------------------------------------------------------------------
/plot/plot.py:
--------------------------------------------------------------------------------
#coding=utf-8

import pandas as pd
import matplotlib.pylab as plt
import numpy as np

df = pd.read_csv('weibo_train_data.txt', header=None, sep='\t')
df.columns = ['uid', 'mid', 'time', 'f', 'c', 'l', 'content']


# overall forward distribution
f, c, l = df.f, df.c, df.l

x_f, y_f = [], []
d = {}
for i in f:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_f.append(key)
    y_f.append(value)

# overall comment distribution
x_c, y_c = [], []
d = {}
for i in c:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_c.append(key)
    y_c.append(value)


# overall like distribution
x_l, y_l = [], []
d = {}
for i in l:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_l.append(key)
    y_l.append(value)

plt.axis([0, 1000, 0, 1000])

plt.plot(x_f, y_f, 'k')
#plt.plot(x_c, y_c, 'r')
#plt.plot(x_l, y_l, 'b')

plt.xticks(np.linspace(0, 1000, 11, endpoint=True))
plt.yticks(np.linspace(0, 1000, 11, endpoint=True))
plt.xlabel('forward')
plt.ylabel('count')

plt.show()


# plot the forward distribution of a single uid
uid = '0199d79415106bcb23aa22fdfeb595b4'
uid_f = df[df.uid == uid]['f']
x_f, y_f = [], []
d = {}
for i in uid_f:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_f.append(key)
    y_f.append(value)

plt.bar(x_f, y_f, color='b')
plt.title('uid({}) forward'.format(uid))
plt.show()
--------------------------------------------------------------------------------
/trick/predict_with_fixed_value.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"

"""
About 80% of the training records are 0 0 0 (forward_count, comment_count, like_count).
Inspired by this, we try some fixed values for all uids and calculate their score
on the training data:

    predict   score(%)
    0 0 0     34.10%
    1 0 0     29.23%
    0 1 0     35.01%
    0 0 1     32.20%
    1 1 0     29.30%
    1 0 1     29.39%
    0 1 1     33.45%
    1 1 1     13.46%
    2 0 0      7.04%
    0 2 0     29.93%
    0 0 2     28.22%
    0 1 2     12.85%
    ....

Another sensible solution is to predict each uid with its own statistics
(e.g. mean, median); their scores on the training data:

    mean     38.01%
    min      34.17%
    max       8.08%
    median   40.94%   **best**
"""

import pandas as pd
from genUidStat import loadData, genUidStat
from evaluation import precision
from runTime import runTime


@runTime
def predict_with_fixed_value(forward, comment, like, submission=True):
    # type check
    if not (isinstance(forward, int) and isinstance(comment, int) and isinstance(like, int)):
        raise TypeError("forward, comment, like should be of type 'int'")

    traindata, testdata = loadData()

    # score on the training set
    train_real_pred = traindata[['forward', 'comment', 'like']].copy()
    train_real_pred['fp'], train_real_pred['cp'], train_real_pred['lp'] = forward, comment, like
    print "Score on the training set: {0:.2f}%".format(precision(train_real_pred.values) * 100)

    # predict on the test data with the fixed value, generate submission file
    if submission:
        test_pred = testdata[['uid', 'mid']].copy()
        test_pred['fp'], test_pred['cp'], test_pred['lp'] = forward, comment, like

        result = []
        filename = "weibo_predict_{}_{}_{}.txt".format(forward, comment, like)
        for _, row in test_pred.iterrows():
            result.append("{0}\t{1}\t{2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        with open(filename, 'w') as f:
            f.writelines(result)
        print 'generated submission file "{}"'.format(filename)


@runTime
def predict_with_stat(stat="median", submission=True):
    """
    stat:
        string, one of 'min', 'max', 'mean', 'median'
    """
    stat_dic = genUidStat()
    traindata, testdata = loadData()

    # get the chosen statistic for each uid
    forward, comment, like = [], [], []
    for uid in traindata['uid']:
        if uid in stat_dic:
            forward.append(int(stat_dic[uid]["forward_" + stat]))
            comment.append(int(stat_dic[uid]["comment_" + stat]))
            like.append(int(stat_dic[uid]["like_" + stat]))
        else:
            forward.append(0)
            comment.append(0)
            like.append(0)
    # score on the training set
    train_real_pred = traindata[['forward', 'comment', 'like']].copy()
    train_real_pred['fp'], train_real_pred['cp'], train_real_pred['lp'] = forward, comment, like
    print "Score on the training set: {0:.2f}%".format(precision(train_real_pred.values) * 100)

    # predict on the test data with each uid's statistic, generate submission file
    if submission:
        test_pred = testdata[['uid', 'mid']].copy()
        forward, comment, like = [], [], []
        for uid in testdata['uid']:
            if uid in stat_dic:
                forward.append(int(stat_dic[uid]["forward_" + stat]))
                comment.append(int(stat_dic[uid]["comment_" + stat]))
                like.append(int(stat_dic[uid]["like_" + stat]))
            else:
                forward.append(0)
                comment.append(0)
                like.append(0)

        test_pred['fp'], test_pred['cp'], test_pred['lp'] = forward, comment, like

        result = []
        filename = "weibo_predict_{}.txt".format(stat)
        for _, row in test_pred.iterrows():
            result.append("{0}\t{1}\t{2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        with open(filename, 'w') as f:
            f.writelines(result)
        print 'generated submission file "{}"'.format(filename)
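
# Optional helper (a sketch added for illustration, not part of the original
# script): score several fixed-value candidates from the table above in one run.
def sweep_fixed_values(candidates):
    for fc, cc, lc in candidates:
        predict_with_fixed_value(fc, cc, lc, submission=False)
# example: sweep_fixed_values([(0, 0, 0), (0, 1, 0), (0, 1, 1)])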


if __name__ == "__main__":
    #predict_with_fixed_value(0, 1, 1, submission=False)
    predict_with_stat(stat="median", submission=True)
--------------------------------------------------------------------------------
/trick/predict_by_search.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"

"""
Predicting with each uid's (forward_median, comment_median, like_median) scores 40.94%
on the training set. That is good, and we can go further based on it.

For each uid, we first get its (f_min, f_median, f_max), (c_min, c_median, c_max),
(l_min, l_median, l_max), and then:
    1. fix c_median and l_median, and search a forward value between f_min and f_max
       that gives a higher score than (f_median, c_median, l_median).
       If several values tie for the highest score, we choose the one nearest to f_median.
       If no value beats (f_median, c_median, l_median), we keep forward = f_median.
    2. search a comment value in the same way.
    3. search a like value in the same way.
"""

import cPickle
import pandas as pd
import numpy as np
from genUidStat import loadData, genUidStat
from evaluation import precision
from runTime import runTime
from multiprocessing import Pool

def score(uid_data, pred):
    """
    uid_data:
        pd.DataFrame
    pred:
        list, [fp, cp, lp]
    """
    uid_real_pred = uid_data[['forward', 'comment', 'like']].copy()
    uid_real_pred['fp'] = pred[0]
    uid_real_pred['cp'] = pred[1]
    uid_real_pred['lp'] = pred[2]
    return precision(uid_real_pred.values)


# search and return the best target value for a uid
def search(uid_data, target, args):
    """
    target:
        'forward', 'comment' or 'like'

    args:
        (f_min, f_median, f_max, c_min, c_median, c_max, l_min, l_median, l_max)
    """
    args = list(args)
    target_index = ['forward', 'comment', 'like'].index(target)
    target_min, target_median, target_max = args[3*target_index:3*target_index+3]
    del args[3*target_index:3*target_index+3]
    pred = (args[1], args[4])   # medians of the two fixed dimensions

    best_num = [target_median]
    best_pred = list(pred)
    best_pred.insert(target_index, target_median)
    best_score = score(uid_data, best_pred)
    for num in range(target_min, target_max + 1):
        this_pred = list(pred)
        this_pred.insert(target_index, num)
        this_score = score(uid_data, this_pred)
        if this_score >= best_score:
            if this_score > best_score:
                best_num = [num]
                best_score = this_score
            else:
                best_num.append(num)

    return best_num[np.array([abs(i - target_median) for i in best_num]).argmin()]
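
# Toy illustration (made-up numbers, not part of the original pipeline) of how
# search() is called for a single uid: a tiny fake history plus its
# (min, median, max) statistics for each of forward/comment/like.
def _demo_search():
    toy = pd.DataFrame({'forward': [0, 0, 2, 4],
                        'comment': [0, 0, 0, 2],
                        'like':    [0, 1, 1, 3]})
    toy_args = (0, 1, 4,   # forward: min, median, max
                0, 0, 2,   # comment: min, median, max
                0, 1, 3)   # like:    min, median, max
    print "best forward value:", search(toy, 'forward', toy_args)
# _demo_search()  # uncomment to run the toy example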

# search the best target values for all uids; return a dictionary {uid: [f, c, l]}
def search_all_uid():
    """
    Single-process reference version (kept here for readability):

        traindata, testdata = loadData()
        stat_dic = genUidStat()

        # for each uid, search its best fp, cp, lp
        uid_best_pred = {}
        for uid in stat_dic:
            print "search uid: {}".format(uid)
            uid_data = traindata[traindata.uid == uid]
            args = stat_dic[uid][['forward_min', 'forward_median', 'forward_max',
                                  'comment_min', 'comment_median', 'comment_max',
                                  'like_min', 'like_median', 'like_max']]
            args = tuple([int(i) for i in args])
            fp = search(uid_data, 'forward', args)
            cp = search(uid_data, 'comment', args)
            lp = search(uid_data, 'like', args)
            uid_best_pred[uid] = [fp, cp, lp]
    """
    # multiprocessing version for getting uid_best_pred
    traindata, testdata = loadData()
    stat_dic = genUidStat()
    uid_best_pred = {}
    pool = Pool()
    uids, f, c, l = [], [], [], []
    for uid in stat_dic:
        print "search uid: {}".format(uid)
        uid_data = traindata[traindata.uid == uid]
        arguments = stat_dic[uid][['forward_min', 'forward_median', 'forward_max',
                                   'comment_min', 'comment_median', 'comment_max',
                                   'like_min', 'like_median', 'like_max']]
        arguments = tuple([int(i) for i in arguments])
        f.append(pool.apply_async(search, args=(uid_data, 'forward', arguments)))
        c.append(pool.apply_async(search, args=(uid_data, 'comment', arguments)))
        l.append(pool.apply_async(search, args=(uid_data, 'like', arguments)))
        uids.append(uid)
    pool.close()
    pool.join()
    f = [i.get() for i in f]
    c = [i.get() for i in c]
    l = [i.get() for i in l]

    for i in range(len(uids)):
        uid_best_pred[uids[i]] = [f[i], c[i], l[i]]

    try:
        cPickle.dump(uid_best_pred, open('uid_best_pred.pkl', 'w'))
    except Exception:
        pass

    return uid_best_pred


@runTime
def predict_by_search(submission=True):
    traindata, testdata = loadData()
    uid_best_pred = search_all_uid()
    print "search done, now predict on traindata and testdata..."

    # predict traindata with each uid's best fp, cp, lp
    forward, comment, like = [], [], []
    for uid in traindata['uid']:
        if uid in uid_best_pred:
            forward.append(int(uid_best_pred[uid][0]))
            comment.append(int(uid_best_pred[uid][1]))
            like.append(int(uid_best_pred[uid][2]))
        else:
            forward.append(0)
            comment.append(0)
            like.append(0)

    # score on the traindata
    train_real_pred = traindata[['forward', 'comment', 'like']].copy()
    train_real_pred['fp'], train_real_pred['cp'], train_real_pred['lp'] = forward, comment, like
    print "Score on the training set: {0:.2f}%".format(precision(train_real_pred.values) * 100)


    if submission:
        test_pred = testdata[['uid', 'mid']].copy()
        forward, comment, like = [], [], []
        for uid in testdata['uid']:
            if uid in uid_best_pred:
                forward.append(int(uid_best_pred[uid][0]))
                comment.append(int(uid_best_pred[uid][1]))
                like.append(int(uid_best_pred[uid][2]))
            else:
                forward.append(0)
                comment.append(0)
                like.append(0)
        test_pred['fp'], test_pred['cp'], test_pred['lp'] = forward, comment, like

        # generate submission file
        result = []
        filename = "weibo_predict_search.txt"
        for _, row in test_pred.iterrows():
            result.append("{0}\t{1}\t{2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        with open(filename, 'w') as f:
            f.writelines(result)
        print 'generated submission file "{}"'.format(filename)

if __name__ == "__main__":
    predict_by_search()

--------------------------------------------------------------------------------