├── .gitignore ├── DeepLearn.py ├── README.md ├── average_precision.py ├── lfm.py └── test.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /DeepLearn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | np.random.seed(2016) 5 | 6 | import os 7 | import glob 8 | import math 9 | import pickle 10 | import datetime 11 | 12 | from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge 13 | from keras.models import Model 14 | 15 | def load_train(): 16 | X_train_uid=[] 17 | X_train_iid=[] 18 | Y_train_score=[] 19 | 20 | path = os.path.join('./data', 'train.csv') 21 | print('Read train data',path) 22 | 23 | f = open(path, 'r') 24 | line = f.readline() 25 | while (1): 26 | line = f.readline() 27 | if line == '': 28 | break 29 | arr = line.strip().split(',') 30 | X_train_uid.append(int(arr[0])) 31 | X_train_iid.append(int(arr[1])) 32 | Y_train_score.append(int(arr[2])) 33 | f.close() 34 | return X_train_uid,X_train_iid,Y_train_score 35 | 36 | def load_test(): 37 | X_test_uid=[] 38 | X_test_iid=[] 39 | 40 | path = os.path.join('./data', 'test.csv') 41 | print('Read test data',path) 42 | 43 | f = open(path, 'r') 44 | line = f.readline() 45 | while (1): 46 | line = f.readline() 47 | if line == '': 48 | break 49 | arr = line.strip().split(',') 50 | X_test_uid.append(int(arr[0])) 51 | X_test_iid.append(int(arr[1])) 52 | f.close() 53 | return X_test_uid,X_test_iid 54 | 55 | 56 | X_train_uid,X_train_iid,Y_train_score = load_train() 57 | #print len(X_train_uid),X_train_uid[33177260],max(X_train_uid) 58 | #print len(X_train_iid),X_train_iid[33177260],max(X_train_iid) 59 | #print len(Y_train_score),Y_train_score[33177260] 60 | print "load train data OK." 61 | 62 | X_test_uid,X_test_iid = load_test() 63 | #print len(X_test_uid),X_test_uid[100],max(X_test_uid) 64 | #print len(X_test_iid),X_test_iid[100],max(X_test_iid) 65 | print "load test data OK." 66 | 67 | # normalize train date 68 | X_train_uid=np.array(X_train_uid) 69 | X_train_uid=X_train_uid.reshape(X_train_uid.shape[0],1) 70 | 71 | X_train_iid=np.array(X_train_iid) 72 | X_train_iid=X_train_iid.reshape(X_train_iid.shape[0],1) 73 | 74 | Y_train_score = np.array(Y_train_score).astype('float32') 75 | Y_train_score = (Y_train_score - 1)/ 4 76 | 77 | # normalize test date 78 | X_test_uid=np.array(X_test_uid) 79 | X_test_uid=X_test_uid.reshape(X_test_uid.shape[0],1) 80 | 81 | X_test_iid=np.array(X_test_iid) 82 | X_test_iid=X_test_iid.reshape(X_test_iid.shape[0],1) 83 | 84 | # define model 85 | input_1=Input(shape=(1,), dtype='int32') 86 | input_2=Input(shape=(1,), dtype='int32') 87 | x1=Embedding(output_dim=128, input_dim=223970, input_length=1)(input_1) 88 | x2=Embedding(output_dim=128, input_dim=14726, input_length=1)(input_2) 89 | x1=Flatten()(x1) 90 | x2=Flatten()(x2) 91 | x = merge([x1, x2], mode='concat') 92 | x = Dropout(0.2)(x) 93 | x = Dense(512, activation='relu')(x) 94 | x = Dropout(0.2)(x) 95 | x = Dense(64, activation='relu')(x) 96 | x = Dropout(0.2)(x) 97 | out = Dense(1, activation='sigmoid')(x) 98 | model = Model(input=[input_1, input_2], output=out) 99 | model.compile(optimizer='rmsprop', 100 | loss='mean_squared_error', 101 | metrics=[]) 102 | # train model 103 | model.fit([X_train_uid, X_train_iid], Y_train_score, 104 | nb_epoch=10, batch_size=1024*6) 105 | 106 | # predict 107 | Y_test_score = model.predict([X_test_uid, X_test_iid],batch_size=2048) 108 | Y_test_score = Y_test_score * 4 + 1 109 | 110 | f=open("out.csv","w") 111 | f.write("score\n") 112 | for i in range(Y_test_score.shape[0]): 113 | f.write("{:1.4f}".format(Y_test_score[i,0])) 114 | f.write("\n") 115 | f.close() 116 | 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DC-猜你喜欢比赛 2 | 基于用户行为的推荐算法大赛---第四名(临兵斗列) 3 | 4 | 比赛平台:DataCastle 5 | 6 | 软件运行环境:Python 2.x 7 | 8 | 使用说明:文件夹包括数据以及代码,其中数据包括train和test两个数据集,其中test是用于提交到DC平台的测试集。代码包括三种方法测试的结果,最终第四名的成果采用的是深度学习。train数据集太大,请自行去官网下载【address:http://www.pkbigdata.com/】 9 | 10 | 代码详细说明:average_precision.py是利用用户的平均分进行的预测; 11 | lfm.py采用的是LFM算法测试的结果; 12 | DeepLearn.py采用的是深度学习方法,最终结果达到7.83398(10分为满分),取得第四名。 13 | -------------------------------------------------------------------------------- /average_precision.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def apk(actual, predicted, k=10): 4 | """ 5 | Computes the average precision at k. 6 | 7 | This function computes the average prescision at k between two lists of 8 | items. 9 | 10 | Parameters 11 | ---------- 12 | actual : list 13 | A list of elements that are to be predicted (order doesn't matter) 14 | predicted : list 15 | A list of predicted elements (order does matter) 16 | k : int, optional 17 | The maximum number of predicted elements 18 | 19 | Returns 20 | ------- 21 | score : double 22 | The average precision at k over the input lists 23 | 24 | """ 25 | if len(predicted)>k: 26 | predicted = predicted[:k] 27 | 28 | score = 0.0 29 | num_hits = 0.0 30 | 31 | for i,p in enumerate(predicted): 32 | if p in actual and p not in predicted[:i]: 33 | num_hits += 1.0 34 | score += num_hits / (i+1.0) 35 | 36 | if not actual: 37 | return 1.0 38 | 39 | return score / min(len(actual), k) 40 | 41 | def mapk(actual, predicted, k=10): 42 | """ 43 | Computes the mean average precision at k. 44 | 45 | This function computes the mean average prescision at k between two lists 46 | of lists of items. 47 | 48 | Parameters 49 | ---------- 50 | actual : list 51 | A list of lists of elements that are to be predicted 52 | (order doesn't matter in the lists) 53 | predicted : list 54 | A list of lists of predicted elements 55 | (order matters in the lists) 56 | k : int, optional 57 | The maximum number of predicted elements 58 | 59 | Returns 60 | ------- 61 | score : double 62 | The mean average precision at k over the input lists 63 | 64 | """ 65 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 66 | -------------------------------------------------------------------------------- /lfm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import math 5 | from numba import jit 6 | from datetime import datetime 7 | import mpmath as mp 8 | import sys 9 | import codecs 10 | 11 | train = pd.read_csv('../data/train_set.csv') 12 | test = pd.read_csv('../data/test.csv') 13 | u_max = train['uid'].max()+1 14 | i_max = train['iid'].max()+1 15 | u_ms = train.groupby('uid')['score'].mean() 16 | i_ms = train.groupby('iid')['score'].mean() 17 | train_len = len(train) 18 | 19 | def learning_lfm(n,alpha,lambd): 20 | if os.path.exists('./p-pian.npy'): 21 | sys.stdout.write('loading... \n') 22 | p = np.load('./p-pian.npy') 23 | q = np.load('./q-pian.npy') 24 | else: 25 | p = np.random.rand(u_max,5)*3 26 | q = np.random.rand(i_max,5) 27 | list_result(predict(p ,q), 'lfm-pian.csv') 28 | for step in range(n): 29 | error = 0.0 30 | for i in range(train_len): 31 | x = train.values[i] 32 | uid = x[0] 33 | iid = x[1] 34 | rui = x[2] 35 | pui = (u_ms[uid]+i_ms[iid])+sum(p[uid]*q[iid]) 36 | eui = rui-pui 37 | error += eui 38 | for k in range(5): 39 | p[uid][k] += alpha*(q[iid][k]*eui-lambd*p[uid][k]) 40 | q[iid][k] += alpha*(p[uid][k]*eui-lambd*q[iid][k]) 41 | if p[uid][k]> 5: 42 | p[uid][k] = 5 43 | if p[uid][k]<-5: 44 | p[uid][k] = -5 45 | if q[iid][k]< 0: 46 | q[iid][k]=0 47 | if q[iid][k]>1: 48 | q[iid][k]=1 49 | np.save('p-pian.npy', p) 50 | np.save('q-pian.npy', q) 51 | sys.stdout.write("Train Setp: %d/%d, LR: %f, TE: %f \n" % (step, n , alpha, error)) 52 | list_result(predict(p ,q), 'lfm-pian.csv') 53 | alpha = alpha * 0.9 54 | return p,q 55 | 56 | def predict(p, q): 57 | n = len(test) 58 | s = [] 59 | users = test['uid'] 60 | items = test['iid'] 61 | 62 | count = 0 63 | for i in range(n): 64 | x = test.values[i] 65 | uid = x[0] 66 | iid = x[1] 67 | tmp = (u_ms[uid]+i_ms[iid])+sum(p[uid]*q[iid]) 68 | if tmp >= 5.0: 69 | count += 1 70 | s.append(i_ms[test['iid'][i]]) 71 | else: 72 | s.append(tmp) 73 | sys.stdout.write("Test Step Error : %d / %d \n\n" % (count, n)) 74 | return s 75 | 76 | def list_result(data, filename): 77 | result_file = codecs.open('../result/' + filename, 'w', 'utf-8') 78 | result_file.write('score\n') 79 | for socre in data: 80 | result_file.write(str(socre) + '\n') 81 | result_file.close() 82 | 83 | start = datetime.now() 84 | p,q = learning_lfm(50,0.09,0.001) 85 | stop = datetime.now() 86 | print(stop-start) 87 | --------------------------------------------------------------------------------