├── plot
│   ├── figure_1.png
│   ├── figure_11.png
│   ├── figure_12.png
│   ├── figure_13.png
│   ├── figure_14.png
│   ├── figure_15.png
│   ├── figure_16.png
│   ├── figure_17.png
│   ├── figure_18.png
│   ├── figure_2.png
│   ├── figure_3.png
│   ├── figure_4.png
│   ├── figure_5.png
│   ├── figure_6.png
│   ├── figure_7.png
│   ├── figure_8.png
│   ├── figure_9.png
│   └── plot.py
├── README.md
└── trick
    ├── runTime.py
    ├── evaluation.py
    ├── genUidStat.py
    ├── predict_with_fixed_value.py
    └── predict_by_search.py
--------------------------------------------------------------------------------
/plot/figure_*.png:
--------------------------------------------------------------------------------
(binary PNG plots, figures 1-9 and 11-18; each image is available at
https://raw.githubusercontent.com/wepe/weibo/HEAD/plot/figure_<N>.png)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

- long-tailed distributions for all users

![](https://github.com/wepe/weibo/blob/master/plot/figure_1.png)

- for a single user

![](https://github.com/wepe/weibo/blob/master/plot/figure_13.png)

![](https://github.com/wepe/weibo/blob/master/plot/figure_7.png)

![](https://github.com/wepe/weibo/blob/master/plot/figure_3.png)
--------------------------------------------------------------------------------
/trick/runTime.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"

"""
Decorator for measuring a function's run time.
"""

def runTime(func):
    def wrapper(*args, **kwargs):
        import time
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print "{0} run time: {1:.2f}s".format(func.__name__, t2 - t1)
        return result
    return wrapper
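
# Minimal usage sketch (illustrative only, not part of the original module):
# decorating a toy function and calling it prints its elapsed time.
if __name__ == "__main__":
    @runTime
    def _demo():
        import time
        time.sleep(0.1)

    _demo()  # prints something like "_demo run time: 0.10s"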
--------------------------------------------------------------------------------
/trick/evaluation.py:
--------------------------------------------------------------------------------
__author__ = 'Desmond & wepon'
__date__ = "2015/08/14"

"""
Evaluation function according to the official rule:
http://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.Grh4pl&raceId=5
"""

def _deviation(predict, real, kind):
    t = 5.0 if kind == 'f' else 3.0
    return abs(predict - real) / (real + t)


def _precision_i(fp, fr, cp, cr, lp, lr):
    return 1 - 0.5 * _deviation(fp, fr, 'f') - 0.25 * _deviation(cp, cr, 'c') - 0.25 * _deviation(lp, lr, 'l')


def _sgn(x):
    return 1 if x > 0 else 0


def _count_i(fr, cr, lr):
    x = fr + cr + lr
    return 101 if x > 100 else (x + 1)


def precision(real_and_predict):
    numerator, denominator = 0.0, 0.0
    for fr, cr, lr, fp, cp, lp in real_and_predict:
        numerator += _count_i(fr, cr, lr) * _sgn(_precision_i(fp, fr, cp, cr, lp, lr) - 0.8)
        denominator += _count_i(fr, cr, lr)
    return numerator / denominator
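
# Worked example (illustrative only, the numbers are made up): each row is
# (forward_real, comment_real, like_real, forward_pred, comment_pred, like_pred).
# The first post is predicted exactly (precision_i = 1 > 0.8, so it counts in the
# numerator); the second is far off (precision_i < 0.8, so it does not).
if __name__ == "__main__":
    demo = [(1, 0, 0, 1, 0, 0),
            (10, 5, 3, 0, 0, 0)]
    print "demo precision: {0:.4f}".format(precision(demo))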
--------------------------------------------------------------------------------
/trick/genUidStat.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"


import pandas as pd

def loadData():
    traindata = pd.read_csv("weibo_train_data.txt", header=None, sep='\t')
    traindata.columns = ["uid", "mid", "date", "forward", "comment", "like", "content"]

    testdata = pd.read_csv("weibo_predict_data.txt", header=None, sep='\t')
    testdata.columns = ["uid", "mid", "date", "content"]

    return traindata, testdata

# for every uid, generate statistics (min/max/median/mean) of forward, comment, like
def genUidStat():
    traindata, _ = loadData()
    train_stat = traindata[['uid', 'forward', 'comment', 'like']].groupby('uid').agg(['min', 'max', 'median', 'mean'])
    train_stat.columns = ['forward_min', 'forward_max', 'forward_median', 'forward_mean',
                          'comment_min', 'comment_max', 'comment_median', 'comment_mean',
                          'like_min', 'like_max', 'like_median', 'like_mean']
    train_stat = train_stat.apply(pd.Series.round)
    # store into a dictionary for constant-time lookup by uid
    stat_dic = {}
    for uid, stats in train_stat.iterrows():
        stat_dic[uid] = stats  # type(stats): pd.Series
    return stat_dic
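
# Tiny self-contained illustration (made-up data, not the real competition file)
# of the groupby/agg pattern used above: per-uid min/max/median/mean of counts.
if __name__ == "__main__":
    toy = pd.DataFrame({'uid':     ['u1', 'u1', 'u2'],
                        'forward': [0, 4, 1],
                        'comment': [0, 2, 0],
                        'like':    [1, 3, 0]})
    print toy.groupby('uid').agg(['min', 'max', 'median', 'mean'])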
--------------------------------------------------------------------------------
/plot/plot.py:
--------------------------------------------------------------------------------
#coding=utf-8

import pandas as pd
import matplotlib.pylab as plt
import numpy as np

df = pd.read_csv('weibo_train_data.txt', header=None, sep='\t')
df.columns = ['uid', 'mid', 'time', 'f', 'c', 'l', 'content']


# overall forward distribution
f, c, l = df.f, df.c, df.l

x_f, y_f = [], []
d = {}
for i in f:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_f.append(key)
    y_f.append(value)

# overall comment distribution
x_c, y_c = [], []
d = {}
for i in c:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_c.append(key)
    y_c.append(value)


# overall like distribution
x_l, y_l = [], []
d = {}
for i in l:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_l.append(key)
    y_l.append(value)

plt.axis([0, 1000, 0, 1000])

plt.plot(x_f, y_f, 'k')
#plt.plot(x_c, y_c, 'r')
#plt.plot(x_l, y_l, 'b')

plt.xticks(np.linspace(0, 1000, 11, endpoint=True))
plt.yticks(np.linspace(0, 1000, 11, endpoint=True))
plt.xlabel('forward')
plt.ylabel('count')

plt.show()


# plot the forward distribution of a single uid
uid = '0199d79415106bcb23aa22fdfeb595b4'
uid_f = df[df.uid == uid]['f']
x_f, y_f = [], []
d = {}
for i in uid_f:
    if i in d:
        d[i] += 1
    else:
        d[i] = 1

for key, value in d.items():
    x_f.append(key)
    y_f.append(value)

plt.bar(x_f, y_f, color='b')
plt.title('uid({}) forward'.format(uid))
plt.show()
--------------------------------------------------------------------------------
/trick/predict_with_fixed_value.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"

"""
About 80% of the training records are 0 0 0 (forward_count, comment_count, like_count).
Inspired by this, we try some fixed values for all uids and calculate their score
on the training data:

    predict   score(%)
    0 0 0     34.10%
    1 0 0     29.23%
    0 1 0     35.01%
    0 0 1     32.20%
    1 1 0     29.30%
    1 0 1     29.39%
    0 1 1     33.45%
    1 1 1     13.46%
    2 0 0      7.04%
    0 2 0     29.93%
    0 0 2     28.22%
    0 1 2     12.85%
    ....

Another sensible solution is to predict each uid with its own statistics
(e.g. mean, median); their scores on the training data:

    mean     38.01%
    min      34.17%
    max       8.08%
    median   40.94%   **best**
"""

import pandas as pd
from genUidStat import loadData, genUidStat
from evaluation import precision
from runTime import runTime


@runTime
def predict_with_fixed_value(forward, comment, like, submission=True):
    # type check
    if not (isinstance(forward, int) and isinstance(comment, int) and isinstance(like, int)):
        raise TypeError("forward, comment, like should be of type 'int'")

    traindata, testdata = loadData()

    # score on the training set
    train_real_pred = traindata[['forward', 'comment', 'like']].copy()
    train_real_pred['fp'], train_real_pred['cp'], train_real_pred['lp'] = forward, comment, like
    print "Score on the training set: {0:.2f}%".format(precision(train_real_pred.values) * 100)

    # predict on the test data with the fixed value, generate submission file
    if submission:
        test_pred = testdata[['uid', 'mid']].copy()
        test_pred['fp'], test_pred['cp'], test_pred['lp'] = forward, comment, like

        result = []
        filename = "weibo_predict_{}_{}_{}.txt".format(forward, comment, like)
        for _, row in test_pred.iterrows():
            result.append("{0}\t{1}\t{2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        with open(filename, 'w') as f:
            f.writelines(result)
        print 'generated submission file "{}"'.format(filename)


@runTime
def predict_with_stat(stat="median", submission=True):
    """
    stat:
        string, one of 'min', 'max', 'mean', 'median'
    """
    stat_dic = genUidStat()
    traindata, testdata = loadData()

    # get the chosen statistic for each uid
    forward, comment, like = [], [], []
    for uid in traindata['uid']:
        if uid in stat_dic:
            forward.append(int(stat_dic[uid]["forward_" + stat]))
            comment.append(int(stat_dic[uid]["comment_" + stat]))
            like.append(int(stat_dic[uid]["like_" + stat]))
        else:
            forward.append(0)
            comment.append(0)
            like.append(0)
    # score on the training set
    train_real_pred = traindata[['forward', 'comment', 'like']].copy()
    train_real_pred['fp'], train_real_pred['cp'], train_real_pred['lp'] = forward, comment, like
    print "Score on the training set: {0:.2f}%".format(precision(train_real_pred.values) * 100)

    # predict on the test data with each uid's statistic, generate submission file
    if submission:
        test_pred = testdata[['uid', 'mid']].copy()
        forward, comment, like = [], [], []
        for uid in testdata['uid']:
            if uid in stat_dic:
                forward.append(int(stat_dic[uid]["forward_" + stat]))
                comment.append(int(stat_dic[uid]["comment_" + stat]))
                like.append(int(stat_dic[uid]["like_" + stat]))
            else:
                forward.append(0)
                comment.append(0)
                like.append(0)

        test_pred['fp'], test_pred['cp'], test_pred['lp'] = forward, comment, like

        result = []
        filename = "weibo_predict_{}.txt".format(stat)
        for _, row in test_pred.iterrows():
            result.append("{0}\t{1}\t{2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        with open(filename, 'w') as f:
            f.writelines(result)
        print 'generated submission file "{}"'.format(filename)
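
# Optional helper (a sketch added for illustration, not part of the original
# script): score several fixed-value candidates from the table above in one run.
def sweep_fixed_values(candidates):
    for fc, cc, lc in candidates:
        predict_with_fixed_value(fc, cc, lc, submission=False)
# example: sweep_fixed_values([(0, 0, 0), (0, 1, 0), (0, 1, 1)])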


if __name__ == "__main__":
    #predict_with_fixed_value(0, 1, 1, submission=False)
    predict_with_stat(stat="median", submission=True)
--------------------------------------------------------------------------------
/trick/predict_by_search.py:
--------------------------------------------------------------------------------
__author__ = "wepon, http://2hwp.com"
__date__ = "2015/08/14"

"""
Predicting with each uid's (forward_median, comment_median, like_median) scores 40.94%
on the training set. That is good, and we can go further based on it.

For each uid, we first get its (f_min, f_median, f_max), (c_min, c_median, c_max),
(l_min, l_median, l_max), and then:
    1. fix c_median and l_median, and search a forward value between f_min and f_max
       that gives a higher score than (f_median, c_median, l_median).
       If several values tie for the highest score, we choose the one nearest to f_median.
       If no value beats (f_median, c_median, l_median), we keep forward = f_median.
    2. search a comment value in the same way.
    3. search a like value in the same way.
"""

import cPickle
import pandas as pd
import numpy as np
from genUidStat import loadData, genUidStat
from evaluation import precision
from runTime import runTime
from multiprocessing import Pool

def score(uid_data, pred):
    """
    uid_data:
        pd.DataFrame
    pred:
        list, [fp, cp, lp]
    """
    uid_real_pred = uid_data[['forward', 'comment', 'like']].copy()
    uid_real_pred['fp'] = pred[0]
    uid_real_pred['cp'] = pred[1]
    uid_real_pred['lp'] = pred[2]
    return precision(uid_real_pred.values)


# search and return the best target value for a uid
def search(uid_data, target, args):
    """
    target:
        'forward', 'comment' or 'like'

    args:
        (f_min, f_median, f_max, c_min, c_median, c_max, l_min, l_median, l_max)
    """
    args = list(args)
    target_index = ['forward', 'comment', 'like'].index(target)
    target_min, target_median, target_max = args[3*target_index:3*target_index+3]
    del args[3*target_index:3*target_index+3]
    pred = (args[1], args[4])   # medians of the two fixed dimensions

    best_num = [target_median]
    best_pred = list(pred)
    best_pred.insert(target_index, target_median)
    best_score = score(uid_data, best_pred)
    for num in range(target_min, target_max + 1):
        this_pred = list(pred)
        this_pred.insert(target_index, num)
        this_score = score(uid_data, this_pred)
        if this_score >= best_score:
            if this_score > best_score:
                best_num = [num]
                best_score = this_score
            else:
                best_num.append(num)

    return best_num[np.array([abs(i - target_median) for i in best_num]).argmin()]
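
# Toy illustration (made-up numbers, not part of the original pipeline) of how
# search() is called for a single uid: a tiny fake history plus its
# (min, median, max) statistics for each of forward/comment/like.
def _demo_search():
    toy = pd.DataFrame({'forward': [0, 0, 2, 4],
                        'comment': [0, 0, 0, 2],
                        'like':    [0, 1, 1, 3]})
    toy_args = (0, 1, 4,   # forward: min, median, max
                0, 0, 2,   # comment: min, median, max
                0, 1, 3)   # like:    min, median, max
    print "best forward value:", search(toy, 'forward', toy_args)
# _demo_search()  # uncomment to run the toy example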

# search the best target values for all uids; return a dictionary {uid: [f, c, l]}
def search_all_uid():
    """
    Single-process reference version (kept here for readability):

        traindata, testdata = loadData()
        stat_dic = genUidStat()

        # for each uid, search its best fp, cp, lp
        uid_best_pred = {}
        for uid in stat_dic:
            print "search uid: {}".format(uid)
            uid_data = traindata[traindata.uid == uid]
            args = stat_dic[uid][['forward_min', 'forward_median', 'forward_max',
                                  'comment_min', 'comment_median', 'comment_max',
                                  'like_min', 'like_median', 'like_max']]
            args = tuple([int(i) for i in args])
            fp = search(uid_data, 'forward', args)
            cp = search(uid_data, 'comment', args)
            lp = search(uid_data, 'like', args)
            uid_best_pred[uid] = [fp, cp, lp]
    """
    # multiprocessing version for getting uid_best_pred
    traindata, testdata = loadData()
    stat_dic = genUidStat()
    uid_best_pred = {}
    pool = Pool()
    uids, f, c, l = [], [], [], []
    for uid in stat_dic:
        print "search uid: {}".format(uid)
        uid_data = traindata[traindata.uid == uid]
        arguments = stat_dic[uid][['forward_min', 'forward_median', 'forward_max',
                                   'comment_min', 'comment_median', 'comment_max',
                                   'like_min', 'like_median', 'like_max']]
        arguments = tuple([int(i) for i in arguments])
        f.append(pool.apply_async(search, args=(uid_data, 'forward', arguments)))
        c.append(pool.apply_async(search, args=(uid_data, 'comment', arguments)))
        l.append(pool.apply_async(search, args=(uid_data, 'like', arguments)))
        uids.append(uid)
    pool.close()
    pool.join()
    f = [i.get() for i in f]
    c = [i.get() for i in c]
    l = [i.get() for i in l]

    for i in range(len(uids)):
        uid_best_pred[uids[i]] = [f[i], c[i], l[i]]

    try:
        cPickle.dump(uid_best_pred, open('uid_best_pred.pkl', 'w'))
    except Exception:
        pass

    return uid_best_pred


@runTime
def predict_by_search(submission=True):
    traindata, testdata = loadData()
    uid_best_pred = search_all_uid()
    print "search done, now predict on traindata and testdata..."

    # predict traindata with each uid's best fp, cp, lp
    forward, comment, like = [], [], []
    for uid in traindata['uid']:
        if uid in uid_best_pred:
            forward.append(int(uid_best_pred[uid][0]))
            comment.append(int(uid_best_pred[uid][1]))
            like.append(int(uid_best_pred[uid][2]))
        else:
            forward.append(0)
            comment.append(0)
            like.append(0)

    # score on the traindata
    train_real_pred = traindata[['forward', 'comment', 'like']].copy()
    train_real_pred['fp'], train_real_pred['cp'], train_real_pred['lp'] = forward, comment, like
    print "Score on the training set: {0:.2f}%".format(precision(train_real_pred.values) * 100)


    if submission:
        test_pred = testdata[['uid', 'mid']].copy()
        forward, comment, like = [], [], []
        for uid in testdata['uid']:
            if uid in uid_best_pred:
                forward.append(int(uid_best_pred[uid][0]))
                comment.append(int(uid_best_pred[uid][1]))
                like.append(int(uid_best_pred[uid][2]))
            else:
                forward.append(0)
                comment.append(0)
                like.append(0)
        test_pred['fp'], test_pred['cp'], test_pred['lp'] = forward, comment, like

        # generate submission file
        result = []
        filename = "weibo_predict_search.txt"
        for _, row in test_pred.iterrows():
            result.append("{0}\t{1}\t{2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        with open(filename, 'w') as f:
            f.writelines(result)
        print 'generated submission file "{}"'.format(filename)

if __name__ == "__main__":
    predict_by_search()

--------------------------------------------------------------------------------