├── .gitignore
├── DeepLearn.py
├── README.md
├── average_precision.py
├── lfm.py
└── test.csv


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/DeepLearn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import numpy as np
  4 | np.random.seed(2016)
  5 | 
  6 | import os
  7 | import glob
  8 | import math
  9 | import pickle
 10 | import datetime
 11 | 
 12 | from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge
 13 | from keras.models import Model
 14 | 
 15 | def load_train():
 16 |     X_train_uid=[]
 17 |     X_train_iid=[]
 18 |     Y_train_score=[]
 19 | 
 20 |     path = os.path.join('./data',  'train.csv')
 21 |     print('Read train data',path)
 22 | 
 23 |     f = open(path, 'r')
 24 |     line = f.readline()
 25 |     while (1):
 26 |         line = f.readline()
 27 |         if line == '':
 28 |             break
 29 |         arr = line.strip().split(',')
 30 |         X_train_uid.append(int(arr[0]))
 31 |         X_train_iid.append(int(arr[1]))
 32 |         Y_train_score.append(int(arr[2]))
 33 |     f.close()
 34 |     return X_train_uid,X_train_iid,Y_train_score
 35 | 
 36 | def load_test():
 37 |     X_test_uid=[]
 38 |     X_test_iid=[]
 39 | 
 40 |     path = os.path.join('./data',  'test.csv')
 41 |     print('Read test data',path)
 42 | 
 43 |     f = open(path, 'r')
 44 |     line = f.readline()
 45 |     while (1):
 46 |         line = f.readline()
 47 |         if line == '':
 48 |             break
 49 |         arr = line.strip().split(',')
 50 |         X_test_uid.append(int(arr[0]))
 51 |         X_test_iid.append(int(arr[1]))
 52 |     f.close()
 53 |     return X_test_uid,X_test_iid
 54 | 
 55 | 
 56 | X_train_uid,X_train_iid,Y_train_score = load_train()
 57 | #print len(X_train_uid),X_train_uid[33177260],max(X_train_uid)
 58 | #print len(X_train_iid),X_train_iid[33177260],max(X_train_iid)
 59 | #print len(Y_train_score),Y_train_score[33177260]
 60 | print "load train data OK."
 61 | 
 62 | X_test_uid,X_test_iid = load_test()
 63 | #print len(X_test_uid),X_test_uid[100],max(X_test_uid)
 64 | #print len(X_test_iid),X_test_iid[100],max(X_test_iid)
 65 | print "load test data OK."
 66 | 
 67 | # normalize train date
 68 | X_train_uid=np.array(X_train_uid)
 69 | X_train_uid=X_train_uid.reshape(X_train_uid.shape[0],1)
 70 | 
 71 | X_train_iid=np.array(X_train_iid)
 72 | X_train_iid=X_train_iid.reshape(X_train_iid.shape[0],1)
 73 | 
 74 | Y_train_score = np.array(Y_train_score).astype('float32')
 75 | Y_train_score = (Y_train_score - 1)/ 4
 76 | 
 77 | # normalize test date
 78 | X_test_uid=np.array(X_test_uid)
 79 | X_test_uid=X_test_uid.reshape(X_test_uid.shape[0],1)
 80 | 
 81 | X_test_iid=np.array(X_test_iid)
 82 | X_test_iid=X_test_iid.reshape(X_test_iid.shape[0],1)
 83 | 
 84 | # define model
 85 | input_1=Input(shape=(1,), dtype='int32')
 86 | input_2=Input(shape=(1,), dtype='int32')
 87 | x1=Embedding(output_dim=128, input_dim=223970, input_length=1)(input_1)
 88 | x2=Embedding(output_dim=128, input_dim=14726, input_length=1)(input_2)
 89 | x1=Flatten()(x1)
 90 | x2=Flatten()(x2)
 91 | x = merge([x1, x2], mode='concat')
 92 | x = Dropout(0.2)(x)
 93 | x = Dense(512, activation='relu')(x)
 94 | x = Dropout(0.2)(x)
 95 | x = Dense(64, activation='relu')(x)
 96 | x = Dropout(0.2)(x)
 97 | out = Dense(1, activation='sigmoid')(x)
 98 | model = Model(input=[input_1, input_2], output=out)
 99 | model.compile(optimizer='rmsprop',
100 |               loss='mean_squared_error',
101 |               metrics=[])
102 | # train model
103 | model.fit([X_train_uid, X_train_iid], Y_train_score,
104 |           nb_epoch=10, batch_size=1024*6)
105 | 
# Predict the rescaled score of every test pair.
Y_test_score = model.predict([X_test_uid, X_test_iid],batch_size=2048)
# Undo the (score - 1) / 4 scaling that was applied to the training targets.
Y_test_score = Y_test_score * 4 + 1

# Write a "score" header followed by one prediction per line, formatted with
# four decimals.  The context manager closes the file even if a write fails.
with open("out.csv","w") as f:
    f.write("score\n")
    for score in Y_test_score[:, 0]:
        f.write("{:1.4f}\n".format(score))
116 | 
117 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DC-猜你喜欢比赛
 2 | 基于用户行为的推荐算法大赛---第四名（临兵斗列）
 3 | 
 4 | 比赛平台：DataCastle
 5 | 
 6 | 软件运行环境：Python 2.x
 7 | 
 8 | 使用说明：文件夹包括数据以及代码，其中数据包括train和test两个数据集，其中test是用于提交到DC平台的测试集。代码包括三种方法测试的结果，最终第四名的成果采用的是深度学习。train数据集太大，请自行去官网下载【address：http://www.pkbigdata.com/】
 9 | 
代码详细说明：average_precision.py按原说明是利用用户的平均分进行的预测；但仓库中该文件的实际内容是 AP@k / MAP@k（平均准确率）评价指标的实现（apk 与 mapk 两个函数）；
11 | lfm.py采用的是LFM算法测试的结果；
12 | DeepLearn.py采用的是深度学习方法，最终结果达到7.83398（10分为满分），取得第四名。
13 | 


--------------------------------------------------------------------------------
/average_precision.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def apk(actual, predicted, k=10):
 4 |     """
 5 |     Computes the average precision at k.
 6 | 
 7 |     This function computes the average prescision at k between two lists of
 8 |     items.
 9 | 
10 |     Parameters
11 |     ----------
12 |     actual : list
13 |              A list of elements that are to be predicted (order doesn't matter)
14 |     predicted : list
15 |                 A list of predicted elements (order does matter)
16 |     k : int, optional
17 |         The maximum number of predicted elements
18 | 
19 |     Returns
20 |     -------
21 |     score : double
22 |             The average precision at k over the input lists
23 | 
24 |     """
25 |     if len(predicted)>k:
26 |         predicted = predicted[:k]
27 | 
28 |     score = 0.0
29 |     num_hits = 0.0
30 | 
31 |     for i,p in enumerate(predicted):
32 |         if p in actual and p not in predicted[:i]:
33 |             num_hits += 1.0
34 |             score += num_hits / (i+1.0)
35 | 
36 |     if not actual:
37 |         return 1.0
38 | 
39 |     return score / min(len(actual), k)
40 | 
def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k (MAP@k).

    `actual` and `predicted` are walked in parallel; the average precision
    at k of every pair is computed with `apk` and the results are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter inside the inner lists)
    predicted : list
                A list of lists of predicted elements
                (order matters inside the inner lists)
    k : int, optional
        The maximum number of predicted elements evaluated per list

    Returns
    -------
    score : double
            The mean of the per-pair average precisions at k

    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)
66 | 


--------------------------------------------------------------------------------
/lfm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import pandas as pd
 4 | import math  
 5 | from numba import jit
 6 | from datetime import datetime 
 7 | import mpmath as mp
 8 | import sys
 9 | import codecs
10 |   
# Load the rating data; the columns 'uid', 'iid' and 'score' are read below.
train = pd.read_csv('../data/train_set.csv')
test = pd.read_csv('../data/test.csv')

# Number of training rows, used as the bound of the per-sample loop.
train_len = train.shape[0]

# Largest id + 1: the row counts of the user and item factor matrices, so
# that an id can be used directly as a row index.
u_max = 1 + train['uid'].max()
i_max = 1 + train['iid'].max()

# Mean training score per user and per item (Series indexed by the id).
u_ms = train['score'].groupby(train['uid']).mean()
i_ms = train['score'].groupby(train['iid']).mean()
18 | 
def learning_lfm(n,alpha,lambd):
    """Train the latent-factor model with per-sample gradient steps.

    The score predicted for a (user, item) pair is
    ``u_ms[uid] + i_ms[iid] + sum(p[uid] * q[iid])``, i.e. the user's mean
    score plus the item's mean score plus the product of the two factor
    rows.  For every training row the residual ``eui = rui - pui`` is used
    to update both rows; afterwards the user factors are clamped to
    [-5, 5] and the item factors to [0, 1].

    Training resumes from ``./p-pian.npy`` / ``./q-pian.npy`` when the
    former exists; otherwise the matrices are initialised randomly.

    Parameters
    ----------
    n : int
        Number of passes (epochs) over the training data.
    alpha : float
        Initial learning rate; multiplied by 0.9 after every epoch.
    lambd : float
        Regularisation weight applied to the factor being updated.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The user factor matrix ``p`` and the item factor matrix ``q``.

    Side effects
    ------------
    Saves ``p-pian.npy`` and ``q-pian.npy`` after every epoch, and writes
    the current test predictions through ``list_result`` (file name
    ``lfm-pian.csv``) once before training and once after every epoch.
    """
    n_factors = 5  # width of a freshly initialised factor row

    if os.path.exists('./p-pian.npy'):
        sys.stdout.write('loading... \n')
        p = np.load('./p-pian.npy')
        q = np.load('./q-pian.npy')
    else:
        p = np.random.rand(u_max,n_factors)*3
        q = np.random.rand(i_max,n_factors)
    list_result(predict(p ,q), 'lfm-pian.csv')

    # Fetch the underlying array once instead of once per training sample.
    rows = train.values
    for step in range(n):
        # Sum of the signed residuals; positive and negative errors cancel,
        # so this is a bias indicator rather than a loss value.
        error = 0.0
        for i in range(train_len):
            x = rows[i]
            # Cast so the ids stay valid array indices even if the frame's
            # array has a floating-point dtype.
            uid = int(x[0])
            iid = int(x[1])
            rui = x[2]
            # Row views: the in-place updates below write through to p and q.
            p_u = p[uid]
            q_i = q[iid]
            pui = (u_ms[uid]+i_ms[iid])+sum(p_u*q_i)
            eui = rui-pui
            error += eui
            # Same arithmetic as a per-factor loop, applied to whole rows:
            # the item update deliberately sees the already updated (not
            # yet clamped) user factors, then both rows are clamped.
            p_u += alpha*(q_i*eui-lambd*p_u)
            q_i += alpha*(p_u*eui-lambd*q_i)
            np.clip(p_u, -5, 5, out=p_u)
            np.clip(q_i, 0, 1, out=q_i)
        # Checkpoint so that a later run can resume from this epoch.
        np.save('p-pian.npy', p)
        np.save('q-pian.npy', q)
        sys.stdout.write("Train Setp: %d/%d, LR: %f, TE: %f \n" % (step, n , alpha, error))
        list_result(predict(p ,q), 'lfm-pian.csv')
        alpha = alpha * 0.9
    return p,q
55 | 
def predict(p, q):
    """Predict a score for every row of the module-level ``test`` frame.

    The raw prediction is
    ``u_ms[uid] + i_ms[iid] + sum(p[uid] * q[iid])``, where uid and iid are
    taken from the first and second column of ``test``.  A raw prediction
    of 5.0 or more is replaced by the item's mean training score.

    Parameters
    ----------
    p : numpy.ndarray
        User factor matrix, indexed by user id.
    q : numpy.ndarray
        Item factor matrix, indexed by item id.

    Returns
    -------
    list
        One predicted score per test row, in row order.

    Side effects
    ------------
    Writes a line to stdout with the number of predictions that were
    replaced by the item mean and the total number of test rows.
    """
    n = len(test)
    s = []
    # Fetch the array and the item column once instead of once per row.
    rows = test.values
    items = test['iid']

    count = 0
    for i in range(n):
        x = rows[i]
        # Cast so the ids stay valid array indices even if the frame's
        # array has a floating-point dtype.
        uid = int(x[0])
        iid = int(x[1])
        tmp = (u_ms[uid]+i_ms[iid])+sum(p[uid]*q[iid])
        if tmp >= 5.0:
            # Out-of-range prediction: fall back to the item's mean score.
            count += 1
            s.append(i_ms[items[i]])
        else:
            s.append(tmp)
    sys.stdout.write("Test Step Error : %d / %d \n\n" % (count, n))
    return s
75 |     
def list_result(data, filename):
    """Write scores to ``../result/<filename>`` as a one-column CSV.

    The file is (over)written in UTF-8 with a ``score`` header line,
    followed by ``str(value)`` of every element of `data` on its own line.

    Parameters
    ----------
    data : iterable
        The scores to write, in output order.
    filename : str
        File name inside the ``../result/`` directory.

    Raises
    ------
    IOError
        If the file cannot be opened (e.g. ``../result/`` does not exist).
    """
    # The context manager closes the file even if a write fails part-way.
    with codecs.open('../result/' + filename, 'w', 'utf-8') as result_file:
        result_file.write('score\n')
        for score in data:
            result_file.write(str(score) + '\n')
82 |        
# Train for 50 epochs (initial learning rate 0.09, regularisation 0.001) and
# print how long the whole run took.
start = datetime.now()
p, q = learning_lfm(n=50, alpha=0.09, lambd=0.001)
stop = datetime.now()
elapsed = stop - start
print(elapsed)
87 | 


--------------------------------------------------------------------------------