├── .gitignore ├── CDL.ipynb ├── CDL_tf.ipynb ├── LICENSE ├── README.md └── data_preprocess.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /CDL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CDL" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### import module" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "C:\\Users\\k12s35h813g\\AppData\\Local\\Continuum\\anaconda3\\envs\\tf_gpu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 27 | " from ._conv import register_converters as _register_converters\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import numpy as np\n", 33 | "import pickle\n", 34 | "import tensorflow as tf\n", 35 | "#init random seed\n", 36 | "np.random.seed(5)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 1. 
data preprocess" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "#### build item information matrix of citeulike-a by bag of word" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "#find vocabulary_size = 8000\n", 60 | "with open(r\"ctrsr_datasets/citeulike-a/vocabulary.dat\") as vocabulary_file:\n", 61 | " vocabulary_size = len(vocabulary_file.readlines())\n", 62 | " \n", 63 | "#find item_size = 16980\n", 64 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 65 | " item_size = len(item_info_file.readlines())\n", 66 | "\n", 67 | "#initialize item_infomation_matrix (16980 , 8000)\n", 68 | "item_infomation_matrix = np.zeros((item_size , vocabulary_size))\n", 69 | "\n", 70 | "#build item_infomation_matrix\n", 71 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 72 | " sentences = item_info_file.readlines()\n", 73 | " \n", 74 | " for index,sentence in enumerate(sentences):\n", 75 | " words = sentence.strip().split(\" \")[1:]\n", 76 | " for word in words:\n", 77 | " vocabulary_index , number = word.split(\":\")\n", 78 | " item_infomation_matrix[index][int(vocabulary_index)] =number\n", 79 | " " 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "#### build rating matrix citeulike-a" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#find user_size = 5551\n", 96 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 97 | " user_size = len(rating_file.readlines())\n", 98 | "\n", 99 | "#initialize rating_matrix (5551 , 16980)\n", 100 | "import numpy as np\n", 101 | "rating_matrix = np.zeros((user_size , item_size))\n", 102 | "\n", 103 | "#build rating_matrix\n", 104 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 105 | " lines = rating_file.readlines()\n", 106 | " for index,line in enumerate(lines):\n", 107 | " items = line.strip().split(\" \")\n", 108 | " for item in items: \n", 109 | " rating_matrix[index][int(item)] = 1" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### save matrix by pickle" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "with open(r'item_infomation_matrix.pickle', 'wb') as handle:\n", 126 | " pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", 127 | "with open(r'rating_matrix.pickle', 'wb') as handle:\n", 128 | " pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### load matrix from pickle " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "with open(r'item_infomation_matrix.pickle', 'rb') as handle:\n", 145 | " item_infomation_matrix = pickle.load(handle) \n", 146 | " \n", 147 | "with open(r'rating_matrix.pickle', 'rb') as handle2:\n", 148 | " rating_matrix = pickle.load(handle2)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## 2. 
build model" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### matrix factorization model" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "class MF():\n", 172 | " def __init__(self , rating_matrix ):\n", 173 | " #### 參數設定\n", 174 | " self.num_u = rating_matrix.shape[0] #5551\n", 175 | " self.num_v = rating_matrix.shape[1] #16980\n", 176 | " self.u_lambda = 100\n", 177 | " self.v_lambda = 0.1\n", 178 | " self.k = 50 #latent維度\n", 179 | " self.a = 1\n", 180 | " self.b =0.01\n", 181 | " self.R = np.mat(rating_matrix)\n", 182 | " self.C = np.mat(np.ones(self.R.shape)) * self.b\n", 183 | " self.C[np.where(self.R>0)] = self.a\n", 184 | " self.I_U = np.mat(np.eye(self.k) * self.u_lambda)\n", 185 | " self.I_V = np.mat(np.eye(self.k) * self.v_lambda)\n", 186 | " self.U = np.mat(np.random.normal(0 , 1/self.u_lambda , size=(self.k,self.num_u)))\n", 187 | " self.V = np.mat(np.random.normal(0 , 1/self.v_lambda , size=(self.k,self.num_v)))\n", 188 | " \n", 189 | "\n", 190 | " def test(self):\n", 191 | " print( ((U_cut*self.R[np.ravel(np.where(self.R[:,j]>0)[1]),j] + self.v_lambda * self.V_sdae[j])).shape)\n", 192 | " def ALS(self , V_sdae):\n", 193 | " self.V_sdae = np.mat(V_sdae)\n", 194 | " \n", 195 | " V_sq = self.V * self.V.T * self.b\n", 196 | " for i in range(self.num_u):\n", 197 | " idx_a = np.ravel(np.where(self.R[i,:]>0)[1])\n", 198 | " V_cut = self.V[:,idx_a]\n", 199 | " self.U[:,i] = np.linalg.pinv( V_sq+ V_cut * V_cut.T * (self.a-self.b) + self.I_U )*(V_cut*self.R[i,idx_a].T) #V_sq+V_cut*V_cut.T*a_m_b = VCV^T\n", 200 | " \n", 201 | " U_sq = self.U * self.U.T * self.b\n", 202 | " for j in range(self.num_v):\n", 203 | " idx_a = np.ravel(np.where(self.R[:,j]>0)[1])\n", 204 | " U_cut = self.U[:,idx_a]\n", 205 | " self.V[:,j] = np.linalg.pinv(U_sq+U_cut*U_cut.T*(self.a-self.b)+self.I_V)* (U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))\n", 206 | " \n", 207 | " return self.U ,self.V" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "#### masking noise " 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 7, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def mask(corruption_level ,size):\n", 224 | " mask = np.random.binomial(1, 1 - corruption_level, [size[0],size[1]])\n", 225 | " return mask\n", 226 | "\n", 227 | "def add_noise(x , corruption_level ):\n", 228 | " x = x * mask(corruption_level , x.shape)\n", 229 | " return x" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 8, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "class CDL():\n", 239 | " def __init__(self , rating_matrix , item_infomation_matrix):\n", 240 | " # model參數設定\n", 241 | " self.n_input = 8000\n", 242 | " self.n_hidden1 = 200\n", 243 | " self.n_hidden2 = 50\n", 244 | " self.k = 50\n", 245 | " \n", 246 | " self.lambda_w = 1\n", 247 | " self.lambda_n = 1\n", 248 | " self.lambda_u = 1\n", 249 | " self.lambda_v = 1\n", 250 | " \n", 251 | " self.drop_ratio = 0.1\n", 252 | " self.learning_rate = 0.001\n", 253 | " self.epochs = 10\n", 254 | " self.batch_size = 32\n", 255 | " \n", 256 | " self.num_u = rating_matrix.shape[0]\n", 257 | " self.num_v = rating_matrix.shape[1]\n", 258 | " \n", 259 | " self.Weights = {\n", 260 | " 'w1' : tf.Variable(tf.random_normal( [self.n_input , self.n_hidden1] , mean=0.0, 
stddev=1 / self.lambda_w )),\n", 261 | " 'w2' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),\n", 262 | " 'w3' : tf.Variable(tf.random_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),\n", 263 | " 'w4' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_input] , mean=0.0, stddev=1 / self.lambda_w )) \n", 264 | " }\n", 265 | " self.Biases = {\n", 266 | " 'b1' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),\n", 267 | " 'b2' : tf.Variable(tf.random_normal( [self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),\n", 268 | " 'b3' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),\n", 269 | " 'b4' : tf.Variable(tf.random_normal( [self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))\n", 270 | " }\n", 271 | " \n", 272 | " self.item_infomation_matrix = item_infomation_matrix\n", 273 | " \n", 274 | " self.build_model()\n", 275 | " def encoder(self , x , drop_ratio):\n", 276 | " w1 = self.Weights['w1']\n", 277 | " b1 = self.Biases['b1']\n", 278 | " L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )\n", 279 | " L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )\n", 280 | " \n", 281 | " w2 = self.Weights['w2']\n", 282 | " b2 = self.Biases['b2']\n", 283 | " L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )\n", 284 | " L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)\n", 285 | " \n", 286 | " return L2\n", 287 | " \n", 288 | " def decoder(self , x , drop_ratio):\n", 289 | " w3 = self.Weights['w3']\n", 290 | " b3 = self.Biases['b3']\n", 291 | " L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)\n", 292 | " L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)\n", 293 | "\n", 294 | " w4 = self.Weights['w4']\n", 295 | " b4 = self.Biases['b4']\n", 296 | " L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)\n", 297 | " L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)\n", 298 | "\n", 299 | " return L4\n", 300 | " \n", 301 | " def build_model(self):\n", 302 | " self.model_X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 303 | " self.model_X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 304 | " self.model_V = tf.placeholder(tf.float32 , shape=(None , self.k))\n", 305 | " self.model_drop_ratio = tf.placeholder(tf.float32)\n", 306 | " \n", 307 | " self.V_sdae = self.encoder( self.model_X_0 , self.model_drop_ratio )\n", 308 | " self.y_pred = self.decoder( self.V_sdae , self.model_drop_ratio )\n", 309 | " \n", 310 | " self.Regularization = tf.reduce_sum([tf.nn.l2_loss(w)+tf.nn.l2_loss(b) for w,b in zip(self.Weights.values() , self.Biases.values())])\n", 311 | " loss_r =1/2 * self.lambda_w * self.Regularization\n", 312 | " loss_a =1/2 * self.lambda_n * tf.reduce_sum(tf.pow( self.model_X_c - self.y_pred , 2 ))\n", 313 | " loss_v =1/2 * self.lambda_v * tf.reduce_sum(tf.pow( self.model_V - self.V_sdae , 2 ))\n", 314 | " self.Loss = loss_r + loss_a + loss_v\n", 315 | " \n", 316 | " self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.Loss)\n", 317 | " def training(self , rating_matrix):\n", 318 | " #np.random.shuffle(self.item_infomation_matrix) #random index of train data\n", 319 | " \n", 320 | " self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)\n", 321 | " \n", 322 | " sess = tf.Session()\n", 323 | " sess.run(tf.global_variables_initializer())\n", 324 | " \n", 325 | " mf = MF( rating_matrix )\n", 326 | " \n", 327 | " for epoch in range(self.epochs):\n", 328 | " print(\"%d / 
%d\"%(epoch+1 , self.epochs))\n", 329 | " \n", 330 | " V_sdae = sess.run(self.V_sdae , feed_dict={self.model_X_0 : self.item_infomation_matrix_noise , self.model_drop_ratio : 0.1})\n", 331 | " \n", 332 | " U , V = mf.ALS(V_sdae)\n", 333 | " V = np.resize(V,(16980 , 50))\n", 334 | " for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):\n", 335 | " X_train_batch = self.item_infomation_matrix_noise[i:i+self.batch_size]\n", 336 | " y_train_batch = self.item_infomation_matrix[i:i+self.batch_size]\n", 337 | " V_batch = V[i:i+self.batch_size]\n", 338 | " _ , my_loss = sess.run([self.optimizer, self.Loss] , feed_dict={self.model_X_0 :X_train_batch , self.model_X_c : y_train_batch , self.model_V:V_batch, self.model_drop_ratio : 0.1})\n", 339 | " print(my_loss)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 9, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "1 / 10\n", 352 | "387677.62\n", 353 | "2 / 10\n", 354 | "175559.14\n", 355 | "3 / 10\n", 356 | "76667.734\n", 357 | "4 / 10\n", 358 | "33305.188\n", 359 | "5 / 10\n", 360 | "14436.599\n", 361 | "6 / 10\n", 362 | "6843.848\n", 363 | "7 / 10\n", 364 | "3749.5586\n", 365 | "8 / 10\n", 366 | "2751.414\n", 367 | "9 / 10\n", 368 | "2292.7659\n", 369 | "10 / 10\n", 370 | "2268.0378\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "cdl = CDL(rating_matrix , item_infomation_matrix)\n", 376 | "cdl.build_model()\n", 377 | "cdl.training(rating_matrix)" 378 | ] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.6.5" 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 2 402 | } 403 | -------------------------------------------------------------------------------- /CDL_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CDL by tensorflow" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### import module" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "C:\\Users\\k12s35h813g\\AppData\\Local\\Continuum\\anaconda3\\envs\\tf_gpu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 27 | " from ._conv import register_converters as _register_converters\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import numpy as np\n", 33 | "import pickle\n", 34 | "import tensorflow as tf\n", 35 | "import time\n", 36 | "#init random seed\n", 37 | "np.random.seed(5)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## 1. 
data preprocess" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "#### build item information matrix of citeulike-a by bag of word" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#find vocabulary_size = 8000\n", 61 | "with open(r\"ctrsr_datasets/citeulike-a/vocabulary.dat\") as vocabulary_file:\n", 62 | " vocabulary_size = len(vocabulary_file.readlines())\n", 63 | " \n", 64 | "#find item_size = 16980\n", 65 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 66 | " item_size = len(item_info_file.readlines())\n", 67 | "\n", 68 | "#initialize item_infomation_matrix (16980 , 8000)\n", 69 | "item_infomation_matrix = np.zeros((item_size , vocabulary_size))\n", 70 | "\n", 71 | "#build item_infomation_matrix\n", 72 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 73 | " sentences = item_info_file.readlines()\n", 74 | " \n", 75 | " for index,sentence in enumerate(sentences):\n", 76 | " words = sentence.strip().split(\" \")[1:]\n", 77 | " for word in words:\n", 78 | " vocabulary_index , number = word.split(\":\")\n", 79 | " item_infomation_matrix[index][int(vocabulary_index)] =number" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "#### build rating matrix citeulike-a" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#find user_size = 5551\n", 96 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 97 | " user_size = len(rating_file.readlines())\n", 98 | "\n", 99 | "#initialize rating_matrix (5551 , 16980)\n", 100 | "import numpy as np\n", 101 | "rating_matrix = np.zeros((user_size , item_size))\n", 102 | "\n", 103 | "#build rating_matrix\n", 104 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 105 | " lines = rating_file.readlines()\n", 106 | " for index,line in enumerate(lines):\n", 107 | " items = line.strip().split(\" \")\n", 108 | " for item in items: \n", 109 | " rating_matrix[index][int(item)] = 1" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### save matrix by pickle" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "with open(r'item_infomation_matrix.pickle', 'wb') as handle:\n", 126 | " pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", 127 | "with open(r'rating_matrix.pickle', 'wb') as handle:\n", 128 | " pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### load matrix from pickle " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "with open(r'item_infomation_matrix.pickle', 'rb') as handle:\n", 145 | " item_infomation_matrix = pickle.load(handle) \n", 146 | " \n", 147 | "with open(r'rating_matrix.pickle', 'rb') as handle2:\n", 148 | " rating_matrix = pickle.load(handle2)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## 2. 
build model" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### masking noise " 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "#apply SDAE : we hope to reconstruct item information by masking nosie\n", 172 | "def mask(corruption_level ,size):\n", 173 | " mask = np.random.binomial(1, 1 - corruption_level, [size[0],size[1]])\n", 174 | " return mask\n", 175 | "\n", 176 | "def add_noise(x , corruption_level ):\n", 177 | " x = x * mask(corruption_level , x.shape)\n", 178 | " return x" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "class CDL():\n", 188 | " def __init__(self , rating_matrix , item_infomation_matrix):\n", 189 | " \n", 190 | " # model參數設定\n", 191 | " self.n_input = item_infomation_matrix.shape[1]\n", 192 | " self.n_hidden1 = 200\n", 193 | " self.n_hidden2 = 50\n", 194 | " self.k = 50\n", 195 | " \n", 196 | " self.lambda_w = 0.1\n", 197 | " self.lambda_n = 10\n", 198 | " self.lambda_u = 1\n", 199 | " self.lambda_v = 10\n", 200 | " \n", 201 | " self.drop_ratio = 0.1\n", 202 | " self.learning_rate = 0.01\n", 203 | " self.epochs = 200\n", 204 | " self.batch_size = 256\n", 205 | " \n", 206 | " self.a = 1\n", 207 | " self.b =0.01\n", 208 | " self.P = 1\n", 209 | " \n", 210 | " self.num_u = rating_matrix.shape[0]\n", 211 | " self.num_v = rating_matrix.shape[1]\n", 212 | " \n", 213 | " self.Weights = {\n", 214 | " 'w1' : tf.Variable(tf.truncated_normal( [self.n_input , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),\n", 215 | " 'w2' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),\n", 216 | " 'w3' : tf.Variable(tf.truncated_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),\n", 217 | " 'w4' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_input] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))) \n", 218 | " }\n", 219 | " self.Biases = {\n", 220 | " 'b1' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),\n", 221 | " 'b2' : tf.Variable( tf.zeros(shape=self.n_hidden2) ),\n", 222 | " 'b3' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),\n", 223 | " 'b4' : tf.Variable( tf.zeros(shape=self.n_input) ),\n", 224 | " }\n", 225 | " \n", 226 | " self.item_infomation_matrix = item_infomation_matrix\n", 227 | " \n", 228 | " self.rating_matrix = rating_matrix\n", 229 | " \n", 230 | " for i in range(self.num_u):\n", 231 | " x = np.random.choice(np.where(self.rating_matrix[i,:]>0)[0] , self.P)\n", 232 | " self.rating_matrix[i,:].fill(0)\n", 233 | " self.rating_matrix[i,x] = 1\n", 234 | " \n", 235 | " self.confidence = np.mat(np.ones(self.rating_matrix.shape)) * self.b\n", 236 | " self.confidence[np.where(self.rating_matrix>0)] = self.a\n", 237 | " \n", 238 | " def encoder(self , x , drop_ratio):\n", 239 | " w1 = self.Weights['w1']\n", 240 | " b1 = self.Biases['b1']\n", 241 | " L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )\n", 242 | " L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )\n", 243 | " \n", 244 | " w2 = self.Weights['w2']\n", 245 | " b2 = self.Biases['b2']\n", 246 | " L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )\n", 247 | " L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)\n", 248 | " \n", 249 | " return L2\n", 250 | " \n", 251 | " def decoder(self , x , drop_ratio):\n", 252 | " w3 = 
self.Weights['w3']\n", 253 | " b3 = self.Biases['b3']\n", 254 | " L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)\n", 255 | " L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)\n", 256 | "\n", 257 | " w4 = self.Weights['w4']\n", 258 | " b4 = self.Biases['b4']\n", 259 | " L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)\n", 260 | " L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)\n", 261 | "\n", 262 | " return L4\n", 263 | " \n", 264 | "# def only_MF(self):\n", 265 | "# self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 266 | "# self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 267 | "# self.drop_ratio = tf.placeholder(tf.float32)\n", 268 | "# self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )\n", 269 | " \n", 270 | "# batch_size = tf.cast(tf.shape(self.R)[1], tf.int32)\n", 271 | " \n", 272 | " \n", 273 | "# self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) ) \n", 274 | "# self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )\n", 275 | " \n", 276 | "# batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])\n", 277 | " \n", 278 | "# loss_1 = self.lambda_u * tf.nn.l2_loss( self.U ) \n", 279 | "# loss_2 = tf.reduce_sum(tf.multiply(self.C ,\n", 280 | "# tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True))) \n", 281 | "# )\n", 282 | " \n", 283 | "# self.loss = loss_1 + loss_2 \n", 284 | "# self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)\n", 285 | " \n", 286 | " def build_model(self):\n", 287 | " \n", 288 | " self.X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 289 | " self.X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 290 | " self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 291 | " self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 292 | " self.drop_ratio = tf.placeholder(tf.float32)\n", 293 | " self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )\n", 294 | " #SDAE item factor\n", 295 | " V_sdae = self.encoder( self.X_0 , self.drop_ratio )\n", 296 | " \n", 297 | " #SDAE output \n", 298 | " sdae_output = self.decoder( V_sdae , self.drop_ratio )\n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " batch_size = tf.cast(tf.shape(self.X_0)[0], tf.int32)\n", 304 | " \n", 305 | " \n", 306 | " self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) ) \n", 307 | " self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )\n", 308 | " \n", 309 | " batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])\n", 310 | " \n", 311 | " loss_1 = self.lambda_u * tf.nn.l2_loss( self.U ) \n", 312 | " loss_2 = self.lambda_w * 1/2 * tf.reduce_sum([tf.nn.l2_loss(w)+tf.nn.l2_loss(b) for w,b in zip(self.Weights.values() , self.Biases.values())])\n", 313 | " loss_3 = self.lambda_v * tf.nn.l2_loss(batch_V - V_sdae)\n", 314 | " loss_4 = self.lambda_n * tf.nn.l2_loss(sdae_output - self.X_c)\n", 315 | " \n", 316 | " loss_5 = tf.reduce_sum(tf.multiply(self.C ,\n", 317 | " tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True))) \n", 318 | " )\n", 319 | " \n", 320 | " self.loss = loss_1 + loss_2 + loss_3 + loss_4 + loss_5\n", 321 | " self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)\n", 322 | " def train_model(self):\n", 323 | " self.sess = tf.Session()\n", 324 | " self.sess.run(tf.global_variables_initializer())\n", 325 | " \n", 326 | " 
start_time = time.time()\n", 327 | " \n", 328 | " random_idx = np.random.permutation(self.num_v)\n", 329 | " \n", 330 | " self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)\n", 331 | " \n", 332 | " for epoch in range(self.epochs):\n", 333 | " batch_cost = 0\n", 334 | " for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):\n", 335 | " \n", 336 | " batch_idx = random_idx[i:i+self.batch_size]\n", 337 | " _ , loss = self.sess.run([self.optimizer, self.loss] , \n", 338 | " feed_dict={self.X_0 : self.item_infomation_matrix_noise[batch_idx,:] , \n", 339 | " self.X_c : self.item_infomation_matrix[batch_idx,:] , \n", 340 | " self.R : self.rating_matrix[: , batch_idx], \n", 341 | " self.C : self.confidence[: , batch_idx], \n", 342 | " self.drop_ratio : 0.1 ,\n", 343 | " self.model_batch_data_idx : batch_idx })\n", 344 | " batch_cost = batch_cost + loss\n", 345 | "\n", 346 | " print (\"Training //\", \"Epoch %d //\" % (epoch+1), \" Total cost = {:.2f}\".format(batch_cost), \"Elapsed time : %d sec\" % (time.time() - start_time))\n", 347 | " \n", 348 | " return self.sess.run((tf.matmul(self.U, self.V, transpose_b=True)))" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "#### train model" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 8, 361 | "metadata": { 362 | "scrolled": true 363 | }, 364 | "outputs": [ 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "Training // Epoch 1 // Total cost = 699676798.00 Elapsed time : 13 sec\n", 370 | "Training // Epoch 2 // Total cost = 509001361.50 Elapsed time : 23 sec\n", 371 | "Training // Epoch 3 // Total cost = 426090375.00 Elapsed time : 32 sec\n", 372 | "Training // Epoch 4 // Total cost = 402229449.00 Elapsed time : 40 sec\n", 373 | "Training // Epoch 5 // Total cost = 385809588.50 Elapsed time : 50 sec\n", 374 | "Training // Epoch 6 // Total cost = 370855234.50 Elapsed time : 59 sec\n", 375 | "Training // Epoch 7 // Total cost = 356779552.50 Elapsed time : 68 sec\n", 376 | "Training // Epoch 8 // Total cost = 343552683.50 Elapsed time : 77 sec\n", 377 | "Training // Epoch 9 // Total cost = 330934537.50 Elapsed time : 85 sec\n", 378 | "Training // Epoch 10 // Total cost = 318868322.50 Elapsed time : 94 sec\n", 379 | "Training // Epoch 11 // Total cost = 307315671.75 Elapsed time : 102 sec\n", 380 | "Training // Epoch 12 // Total cost = 296283195.75 Elapsed time : 112 sec\n", 381 | "Training // Epoch 13 // Total cost = 285720805.75 Elapsed time : 121 sec\n", 382 | "Training // Epoch 14 // Total cost = 275675537.25 Elapsed time : 130 sec\n", 383 | "Training // Epoch 15 // Total cost = 265893138.00 Elapsed time : 139 sec\n", 384 | "Training // Epoch 16 // Total cost = 256381356.50 Elapsed time : 147 sec\n", 385 | "Training // Epoch 17 // Total cost = 247409903.50 Elapsed time : 157 sec\n", 386 | "Training // Epoch 18 // Total cost = 238836460.75 Elapsed time : 165 sec\n", 387 | "Training // Epoch 19 // Total cost = 230363394.25 Elapsed time : 174 sec\n", 388 | "Training // Epoch 20 // Total cost = 222478204.00 Elapsed time : 183 sec\n", 389 | "Training // Epoch 21 // Total cost = 214707066.00 Elapsed time : 192 sec\n", 390 | "Training // Epoch 22 // Total cost = 207398870.00 Elapsed time : 201 sec\n", 391 | "Training // Epoch 23 // Total cost = 200335661.00 Elapsed time : 210 sec\n", 392 | "Training // Epoch 24 // Total cost = 193502709.25 Elapsed time : 219 sec\n", 393 | "Training // Epoch 25 
// Total cost = 187017720.00 Elapsed time : 227 sec\n", 394 | "Training // Epoch 26 // Total cost = 180656876.50 Elapsed time : 236 sec\n", 395 | "Training // Epoch 27 // Total cost = 174520094.75 Elapsed time : 244 sec\n", 396 | "Training // Epoch 28 // Total cost = 168618046.00 Elapsed time : 253 sec\n", 397 | "Training // Epoch 29 // Total cost = 162554788.25 Elapsed time : 262 sec\n", 398 | "Training // Epoch 30 // Total cost = 157096778.00 Elapsed time : 271 sec\n", 399 | "Training // Epoch 31 // Total cost = 151624672.62 Elapsed time : 280 sec\n", 400 | "Training // Epoch 32 // Total cost = 146411205.25 Elapsed time : 288 sec\n", 401 | "Training // Epoch 33 // Total cost = 141438258.50 Elapsed time : 297 sec\n", 402 | "Training // Epoch 34 // Total cost = 136530659.00 Elapsed time : 305 sec\n", 403 | "Training // Epoch 35 // Total cost = 131753547.00 Elapsed time : 314 sec\n", 404 | "Training // Epoch 36 // Total cost = 126863433.50 Elapsed time : 323 sec\n", 405 | "Training // Epoch 37 // Total cost = 122656202.88 Elapsed time : 332 sec\n", 406 | "Training // Epoch 38 // Total cost = 118599578.00 Elapsed time : 340 sec\n", 407 | "Training // Epoch 39 // Total cost = 114611759.88 Elapsed time : 348 sec\n", 408 | "Training // Epoch 40 // Total cost = 110903898.50 Elapsed time : 357 sec\n", 409 | "Training // Epoch 41 // Total cost = 107518569.25 Elapsed time : 365 sec\n", 410 | "Training // Epoch 42 // Total cost = 104367943.38 Elapsed time : 373 sec\n", 411 | "Training // Epoch 43 // Total cost = 101399079.12 Elapsed time : 382 sec\n", 412 | "Training // Epoch 44 // Total cost = 98611119.12 Elapsed time : 391 sec\n", 413 | "Training // Epoch 45 // Total cost = 95935625.75 Elapsed time : 400 sec\n", 414 | "Training // Epoch 46 // Total cost = 93366417.50 Elapsed time : 409 sec\n", 415 | "Training // Epoch 47 // Total cost = 90901260.00 Elapsed time : 417 sec\n", 416 | "Training // Epoch 48 // Total cost = 88532784.75 Elapsed time : 426 sec\n", 417 | "Training // Epoch 49 // Total cost = 86253099.31 Elapsed time : 435 sec\n", 418 | "Training // Epoch 50 // Total cost = 84065897.81 Elapsed time : 444 sec\n", 419 | "Training // Epoch 51 // Total cost = 81960649.31 Elapsed time : 452 sec\n", 420 | "Training // Epoch 52 // Total cost = 79932892.19 Elapsed time : 461 sec\n", 421 | "Training // Epoch 53 // Total cost = 77983110.50 Elapsed time : 470 sec\n", 422 | "Training // Epoch 54 // Total cost = 76107443.06 Elapsed time : 478 sec\n", 423 | "Training // Epoch 55 // Total cost = 74296636.31 Elapsed time : 486 sec\n", 424 | "Training // Epoch 56 // Total cost = 72557214.06 Elapsed time : 495 sec\n", 425 | "Training // Epoch 57 // Total cost = 70873424.69 Elapsed time : 503 sec\n", 426 | "Training // Epoch 58 // Total cost = 69249254.19 Elapsed time : 512 sec\n", 427 | "Training // Epoch 59 // Total cost = 67679117.94 Elapsed time : 521 sec\n", 428 | "Training // Epoch 60 // Total cost = 66165503.62 Elapsed time : 530 sec\n", 429 | "Training // Epoch 61 // Total cost = 64698699.00 Elapsed time : 539 sec\n", 430 | "Training // Epoch 62 // Total cost = 63282957.81 Elapsed time : 547 sec\n", 431 | "Training // Epoch 63 // Total cost = 61906538.56 Elapsed time : 556 sec\n", 432 | "Training // Epoch 64 // Total cost = 60575016.38 Elapsed time : 565 sec\n", 433 | "Training // Epoch 65 // Total cost = 59282706.25 Elapsed time : 574 sec\n", 434 | "Training // Epoch 66 // Total cost = 58029875.38 Elapsed time : 583 sec\n", 435 | "Training // Epoch 67 // Total cost = 56815220.81 Elapsed time : 592 
sec\n", 436 | "Training // Epoch 68 // Total cost = 55630129.94 Elapsed time : 601 sec\n", 437 | "Training // Epoch 69 // Total cost = 54482065.69 Elapsed time : 609 sec\n", 438 | "Training // Epoch 70 // Total cost = 53366296.62 Elapsed time : 618 sec\n", 439 | "Training // Epoch 71 // Total cost = 52280660.06 Elapsed time : 627 sec\n", 440 | "Training // Epoch 72 // Total cost = 51223257.59 Elapsed time : 635 sec\n", 441 | "Training // Epoch 73 // Total cost = 50193597.88 Elapsed time : 644 sec\n", 442 | "Training // Epoch 74 // Total cost = 49194452.50 Elapsed time : 652 sec\n", 443 | "Training // Epoch 75 // Total cost = 48220310.16 Elapsed time : 661 sec\n", 444 | "Training // Epoch 76 // Total cost = 47270594.84 Elapsed time : 669 sec\n", 445 | "Training // Epoch 77 // Total cost = 46346913.22 Elapsed time : 678 sec\n", 446 | "Training // Epoch 78 // Total cost = 45445881.34 Elapsed time : 687 sec\n", 447 | "Training // Epoch 79 // Total cost = 44570288.53 Elapsed time : 696 sec\n", 448 | "Training // Epoch 80 // Total cost = 43717168.12 Elapsed time : 704 sec\n", 449 | "Training // Epoch 81 // Total cost = 42887369.91 Elapsed time : 713 sec\n", 450 | "Training // Epoch 82 // Total cost = 42079221.00 Elapsed time : 721 sec\n", 451 | "Training // Epoch 83 // Total cost = 41288647.12 Elapsed time : 730 sec\n", 452 | "Training // Epoch 84 // Total cost = 40523911.88 Elapsed time : 739 sec\n", 453 | "Training // Epoch 85 // Total cost = 39775534.69 Elapsed time : 747 sec\n", 454 | "Training // Epoch 86 // Total cost = 39051408.12 Elapsed time : 756 sec\n", 455 | "Training // Epoch 87 // Total cost = 38350553.22 Elapsed time : 764 sec\n", 456 | "Training // Epoch 88 // Total cost = 37661407.00 Elapsed time : 775 sec\n", 457 | "Training // Epoch 89 // Total cost = 36996213.59 Elapsed time : 783 sec\n", 458 | "Training // Epoch 90 // Total cost = 36349649.47 Elapsed time : 791 sec\n", 459 | "Training // Epoch 91 // Total cost = 35726645.00 Elapsed time : 800 sec\n", 460 | "Training // Epoch 92 // Total cost = 35117365.41 Elapsed time : 811 sec\n", 461 | "Training // Epoch 93 // Total cost = 34530359.38 Elapsed time : 822 sec\n", 462 | "Training // Epoch 94 // Total cost = 33960657.64 Elapsed time : 832 sec\n", 463 | "Training // Epoch 95 // Total cost = 33409376.77 Elapsed time : 841 sec\n", 464 | "Training // Epoch 96 // Total cost = 32873193.94 Elapsed time : 850 sec\n", 465 | "Training // Epoch 97 // Total cost = 32355168.34 Elapsed time : 858 sec\n", 466 | "Training // Epoch 98 // Total cost = 31859824.70 Elapsed time : 867 sec\n", 467 | "Training // Epoch 99 // Total cost = 31375273.75 Elapsed time : 876 sec\n", 468 | "Training // Epoch 100 // Total cost = 30909934.75 Elapsed time : 885 sec\n", 469 | "Training // Epoch 101 // Total cost = 30461370.30 Elapsed time : 893 sec\n", 470 | "Training // Epoch 102 // Total cost = 30028708.03 Elapsed time : 902 sec\n", 471 | "Training // Epoch 103 // Total cost = 29610802.08 Elapsed time : 911 sec\n", 472 | "Training // Epoch 104 // Total cost = 29209048.72 Elapsed time : 920 sec\n", 473 | "Training // Epoch 105 // Total cost = 28818532.34 Elapsed time : 928 sec\n", 474 | "Training // Epoch 106 // Total cost = 28447847.41 Elapsed time : 937 sec\n", 475 | "Training // Epoch 107 // Total cost = 28091455.08 Elapsed time : 946 sec\n", 476 | "Training // Epoch 108 // Total cost = 27748029.56 Elapsed time : 955 sec\n", 477 | "Training // Epoch 109 // Total cost = 27415978.22 Elapsed time : 963 sec\n", 478 | "Training // Epoch 110 // Total cost = 
27100371.06 Elapsed time : 972 sec\n", 479 | "Training // Epoch 111 // Total cost = 26795215.59 Elapsed time : 981 sec\n", 480 | "Training // Epoch 112 // Total cost = 26506069.69 Elapsed time : 990 sec\n" 481 | ] 482 | }, 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "Training // Epoch 113 // Total cost = 26229822.77 Elapsed time : 999 sec\n", 488 | "Training // Epoch 114 // Total cost = 25965280.67 Elapsed time : 1008 sec\n", 489 | "Training // Epoch 115 // Total cost = 25713353.86 Elapsed time : 1017 sec\n", 490 | "Training // Epoch 116 // Total cost = 25468158.66 Elapsed time : 1025 sec\n", 491 | "Training // Epoch 117 // Total cost = 25241022.56 Elapsed time : 1033 sec\n", 492 | "Training // Epoch 118 // Total cost = 25019519.48 Elapsed time : 1042 sec\n", 493 | "Training // Epoch 119 // Total cost = 24811663.63 Elapsed time : 1050 sec\n", 494 | "Training // Epoch 120 // Total cost = 24609974.30 Elapsed time : 1058 sec\n", 495 | "Training // Epoch 121 // Total cost = 24420480.23 Elapsed time : 1066 sec\n", 496 | "Training // Epoch 122 // Total cost = 24235782.49 Elapsed time : 1074 sec\n", 497 | "Training // Epoch 123 // Total cost = 24066398.74 Elapsed time : 1082 sec\n", 498 | "Training // Epoch 124 // Total cost = 23899705.29 Elapsed time : 1091 sec\n", 499 | "Training // Epoch 125 // Total cost = 23745025.42 Elapsed time : 1099 sec\n", 500 | "Training // Epoch 126 // Total cost = 23600273.58 Elapsed time : 1107 sec\n", 501 | "Training // Epoch 127 // Total cost = 23457817.05 Elapsed time : 1115 sec\n", 502 | "Training // Epoch 128 // Total cost = 23326691.69 Elapsed time : 1123 sec\n", 503 | "Training // Epoch 129 // Total cost = 23200199.76 Elapsed time : 1132 sec\n", 504 | "Training // Epoch 130 // Total cost = 23082377.88 Elapsed time : 1140 sec\n", 505 | "Training // Epoch 131 // Total cost = 22968153.27 Elapsed time : 1148 sec\n", 506 | "Training // Epoch 132 // Total cost = 22860444.31 Elapsed time : 1156 sec\n", 507 | "Training // Epoch 133 // Total cost = 22760058.06 Elapsed time : 1164 sec\n", 508 | "Training // Epoch 134 // Total cost = 22666147.52 Elapsed time : 1172 sec\n", 509 | "Training // Epoch 135 // Total cost = 22571107.13 Elapsed time : 1180 sec\n", 510 | "Training // Epoch 136 // Total cost = 22487584.22 Elapsed time : 1188 sec\n", 511 | "Training // Epoch 137 // Total cost = 22405833.73 Elapsed time : 1196 sec\n", 512 | "Training // Epoch 138 // Total cost = 22330392.38 Elapsed time : 1205 sec\n", 513 | "Training // Epoch 139 // Total cost = 22255380.34 Elapsed time : 1213 sec\n", 514 | "Training // Epoch 140 // Total cost = 22186002.29 Elapsed time : 1221 sec\n", 515 | "Training // Epoch 141 // Total cost = 22117630.56 Elapsed time : 1229 sec\n", 516 | "Training // Epoch 142 // Total cost = 22055342.34 Elapsed time : 1237 sec\n", 517 | "Training // Epoch 143 // Total cost = 21993179.08 Elapsed time : 1245 sec\n", 518 | "Training // Epoch 144 // Total cost = 21932841.30 Elapsed time : 1253 sec\n", 519 | "Training // Epoch 145 // Total cost = 21884063.32 Elapsed time : 1262 sec\n", 520 | "Training // Epoch 146 // Total cost = 21835091.59 Elapsed time : 1270 sec\n", 521 | "Training // Epoch 147 // Total cost = 21789127.23 Elapsed time : 1278 sec\n", 522 | "Training // Epoch 148 // Total cost = 21742637.62 Elapsed time : 1286 sec\n", 523 | "Training // Epoch 149 // Total cost = 21705565.35 Elapsed time : 1294 sec\n", 524 | "Training // Epoch 150 // Total cost = 21668287.75 Elapsed time : 1303 sec\n", 525 | "Training // 
Epoch 151 // Total cost = 21630913.35 Elapsed time : 1312 sec\n", 526 | "Training // Epoch 152 // Total cost = 21599348.62 Elapsed time : 1320 sec\n", 527 | "Training // Epoch 153 // Total cost = 21571637.19 Elapsed time : 1328 sec\n", 528 | "Training // Epoch 154 // Total cost = 21543501.29 Elapsed time : 1336 sec\n", 529 | "Training // Epoch 155 // Total cost = 21513418.12 Elapsed time : 1345 sec\n", 530 | "Training // Epoch 156 // Total cost = 21485080.34 Elapsed time : 1353 sec\n", 531 | "Training // Epoch 157 // Total cost = 21438892.87 Elapsed time : 1363 sec\n", 532 | "Training // Epoch 158 // Total cost = 21377036.28 Elapsed time : 1372 sec\n", 533 | "Training // Epoch 159 // Total cost = 21229306.30 Elapsed time : 1381 sec\n", 534 | "Training // Epoch 160 // Total cost = 21071121.86 Elapsed time : 1389 sec\n", 535 | "Training // Epoch 161 // Total cost = 20925566.02 Elapsed time : 1398 sec\n", 536 | "Training // Epoch 162 // Total cost = 20836658.01 Elapsed time : 1406 sec\n", 537 | "Training // Epoch 163 // Total cost = 20774302.80 Elapsed time : 1415 sec\n", 538 | "Training // Epoch 164 // Total cost = 20716914.44 Elapsed time : 1424 sec\n", 539 | "Training // Epoch 165 // Total cost = 20707060.63 Elapsed time : 1432 sec\n", 540 | "Training // Epoch 166 // Total cost = 20661099.41 Elapsed time : 1441 sec\n", 541 | "Training // Epoch 167 // Total cost = 20625338.57 Elapsed time : 1451 sec\n", 542 | "Training // Epoch 168 // Total cost = 20589319.52 Elapsed time : 1460 sec\n", 543 | "Training // Epoch 169 // Total cost = 20572881.80 Elapsed time : 1469 sec\n", 544 | "Training // Epoch 170 // Total cost = 20567003.31 Elapsed time : 1477 sec\n", 545 | "Training // Epoch 171 // Total cost = 20557453.87 Elapsed time : 1485 sec\n", 546 | "Training // Epoch 172 // Total cost = 20514101.95 Elapsed time : 1494 sec\n", 547 | "Training // Epoch 173 // Total cost = 20469838.30 Elapsed time : 1502 sec\n", 548 | "Training // Epoch 174 // Total cost = 20426545.98 Elapsed time : 1511 sec\n", 549 | "Training // Epoch 175 // Total cost = 20403331.02 Elapsed time : 1519 sec\n", 550 | "Training // Epoch 176 // Total cost = 20382709.72 Elapsed time : 1527 sec\n", 551 | "Training // Epoch 177 // Total cost = 20339543.34 Elapsed time : 1535 sec\n", 552 | "Training // Epoch 178 // Total cost = 20291659.41 Elapsed time : 1543 sec\n", 553 | "Training // Epoch 179 // Total cost = 20293770.04 Elapsed time : 1551 sec\n", 554 | "Training // Epoch 180 // Total cost = 20236462.23 Elapsed time : 1560 sec\n", 555 | "Training // Epoch 181 // Total cost = 20202695.11 Elapsed time : 1568 sec\n", 556 | "Training // Epoch 182 // Total cost = 20165418.95 Elapsed time : 1576 sec\n", 557 | "Training // Epoch 183 // Total cost = 20137667.19 Elapsed time : 1584 sec\n", 558 | "Training // Epoch 184 // Total cost = 20112504.45 Elapsed time : 1592 sec\n", 559 | "Training // Epoch 185 // Total cost = 20064062.89 Elapsed time : 1600 sec\n", 560 | "Training // Epoch 186 // Total cost = 20035768.56 Elapsed time : 1609 sec\n", 561 | "Training // Epoch 187 // Total cost = 20022080.76 Elapsed time : 1617 sec\n", 562 | "Training // Epoch 188 // Total cost = 19988423.27 Elapsed time : 1625 sec\n", 563 | "Training // Epoch 189 // Total cost = 19961282.66 Elapsed time : 1633 sec\n", 564 | "Training // Epoch 190 // Total cost = 19926572.26 Elapsed time : 1641 sec\n", 565 | "Training // Epoch 191 // Total cost = 19882042.70 Elapsed time : 1649 sec\n", 566 | "Training // Epoch 192 // Total cost = 19857866.74 Elapsed time : 1658 sec\n", 567 
| "Training // Epoch 193 // Total cost = 19820119.57 Elapsed time : 1666 sec\n", 568 | "Training // Epoch 194 // Total cost = 19791277.95 Elapsed time : 1674 sec\n", 569 | "Training // Epoch 195 // Total cost = 19769504.04 Elapsed time : 1682 sec\n", 570 | "Training // Epoch 196 // Total cost = 19738930.83 Elapsed time : 1690 sec\n", 571 | "Training // Epoch 197 // Total cost = 19727672.70 Elapsed time : 1698 sec\n", 572 | "Training // Epoch 198 // Total cost = 19704796.80 Elapsed time : 1706 sec\n", 573 | "Training // Epoch 199 // Total cost = 19676641.04 Elapsed time : 1715 sec\n", 574 | "Training // Epoch 200 // Total cost = 19652868.22 Elapsed time : 1723 sec\n" 575 | ] 576 | } 577 | ], 578 | "source": [ 579 | "R_train = rating_matrix.copy()\n", 580 | "cdl = CDL(R_train , item_infomation_matrix)\n", 581 | "cdl.build_model()\n", 582 | "R = cdl.train_model()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "#### evaluation" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 9, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "all_cnt = 0\n", 599 | "for i in range(rating_matrix.shape[0]):\n", 600 | " l_score = np.ravel(R[i,:]).tolist()\n", 601 | " pl = sorted(enumerate(l_score),key=lambda d:d[1],reverse=True)\n", 602 | " l_rec = [i[0] for i in pl][:300]\n", 603 | " s_rec = set(l_rec)\n", 604 | " s_true = set(np.ravel(np.where(rating_matrix[i,:]>0)))\n", 605 | " cnt_hit = len(s_rec.intersection(s_true))\n", 606 | " all_cnt = all_cnt + cnt_hit/len(s_true)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 10, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "name": "stdout", 616 | "output_type": "stream", 617 | "text": [ 618 | "accuracy : 0.081\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "#accuracy 0.085不能算太低 因為他是所有item(16980)去排序\n", 624 | "print(\"accuracy : %.3f\"%(all_cnt/rating_matrix.shape[0]))" 625 | ] 626 | } 627 | ], 628 | "metadata": { 629 | "kernelspec": { 630 | "display_name": "Python 3", 631 | "language": "python", 632 | "name": "python3" 633 | }, 634 | "language_info": { 635 | "codemirror_mode": { 636 | "name": "ipython", 637 | "version": 3 638 | }, 639 | "file_extension": ".py", 640 | "mimetype": "text/x-python", 641 | "name": "python", 642 | "nbconvert_exporter": "python", 643 | "pygments_lexer": "ipython3", 644 | "version": "3.6.5" 645 | } 646 | }, 647 | "nbformat": 4, 648 | "nbformat_minor": 2 649 | } 650 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 old cat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 old cat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # implement-Collaborative-Deep-Learning-for-Recommender-Systems 2 | 3 | A Python implementation of the paper "Collaborative Deep Learning for Recommender Systems". 4 | 5 | Collaborative Deep Learning (CDL) (Wang, H., Wang, N., & Yeung, D. Y. (2015, August). Collaborative deep learning for recommender systems. In Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (pp. 1235-1244). ACM.) 6 | 7 | ----------------------------------------------------------------------------- 8 | 9 | ## Introduction 10 | This paper combines collaborative filtering with a stacked denoising autoencoder: the autoencoder learns item latent vectors from bag-of-words content, and matrix factorization couples them with user feedback. The original authors implemented it in Python and C++ and update the parameters with an ALS algorithm. 11 | I implement the paper in TensorFlow and try two methods to update the parameters: (1) ALS and (2) gradient descent. 12 | 13 | You can download the [slide](https://drive.google.com/file/d/1EtnYFQyRSd6A24NIniJtE_U5bm6f4-lZ/view?usp=sharing) for more detailed information. 14 | 15 | ## Dataset 16 | The dataset is from CiteULike. You can download it from the original author's website [here](http://www.wanghao.in/publication.html). 17 | 18 | ## Usage 19 | 20 | CDL_tf.ipynb - train CDL by gradient descent (a condensed usage sketch follows below) 21 | 
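For orientation, here is a condensed end-to-end sketch of the gradient-descent variant. It is a sketch only: it assumes the `CDL` class from CDL_tf.ipynb is already defined, and that the two pickles written by data_preprocess.ipynb sit in the working directory (note that data_preprocess.ipynb as written saves them under `dataset\citeulike-a\`, so move or re-point them first).

```python
import pickle

# matrices produced by data_preprocess.ipynb
with open("rating_matrix.pickle", "rb") as f:
    rating_matrix = pickle.load(f)            # (5551 users, 16980 items), binary implicit feedback
with open("item_infomation_matrix.pickle", "rb") as f:
    item_infomation_matrix = pickle.load(f)   # (16980 items, 8000 words), bag-of-words counts

cdl = CDL(rating_matrix.copy(), item_infomation_matrix)   # CDL class from CDL_tf.ipynb
cdl.build_model()
R_hat = cdl.train_model()    # returns the dense score matrix U V^T

# The ALS variant (CDL.ipynb) follows the same setup but calls
# cdl.training(rating_matrix) instead of train_model().
```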
22 | CDL.ipynb - train CDL by ALS 23 | -------------------------------------------------------------------------------- /data_preprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# data preprocess" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## build item information matrix of citeulike-a by bag of word" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 7, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "#find vocabulary_size = 8000\n", 24 | "with open(r\"ctrsr_datasets\\citeulike-a\\vocabulary.dat\") as vocabulary_file:\n", 25 | " vocabulary_size = len(vocabulary_file.readlines())\n", 26 | " \n", 27 | "#find item_size = 16980\n", 28 | "with open(r\"ctrsr_datasets\\citeulike-a\\mult.dat\") as item_info_file:\n", 29 | " item_size = len(item_info_file.readlines())\n", 30 | "\n", 31 | "#initialize item_infomation_matrix (16980 , 8000)\n", 32 | "import numpy as np\n", 33 | "item_infomation_matrix = np.zeros((item_size , vocabulary_size))\n", 34 | "\n", 35 | "#build item_infomation_matrix\n", 36 | "with open(r\"ctrsr_datasets\\citeulike-a\\mult.dat\") as item_info_file:\n", 37 | " sentences = item_info_file.readlines()\n", 38 | " \n", 39 | " for index,sentence in enumerate(sentences):\n", 40 | " words = sentence.strip().split(\" \")[1:]\n", 41 | " for word in words:\n", 42 | " vocabulary_index , number = word.split(\":\")\n", 43 | " item_infomation_matrix[index][int(vocabulary_index)] =number\n", 44 | " " 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## build rating matrix citeulike-a" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 13, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#find user_size = 5551\n", 61 | "with open(r\"ctrsr_datasets\\citeulike-a\\users.dat\") as rating_file:\n", 62 | " user_size = len(rating_file.readlines())\n", 63 | "\n", 64 | "#initialize rating_matrix (5551 , 16980)\n", 65 | "import numpy as np\n", 66 | "rating_matrix = np.zeros((user_size , item_size))\n", 67 | "\n", 68 | "#build rating_matrix\n", 69 | "with open(r\"ctrsr_datasets\\citeulike-a\\users.dat\") as rating_file:\n", 70 | " lines = rating_file.readlines()\n", 71 | " for index,line in enumerate(lines):\n", 72 | " items = line.strip().split(\" \")\n", 73 | " for item in items: \n", 74 | " rating_matrix[index][int(item)] = 1" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## save matrix by pickle" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 14, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import pickle" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 15, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "with open(r'dataset\\citeulike-a\\item_infomation_matrix.pickle', 'wb') as handle:\n", 104 | " pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", 105 | "with open(r'dataset\\citeulike-a\\rating_matrix.pickle', 'wb') as handle:\n", 106 | " pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 
| "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.6.5" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | --------------------------------------------------------------------------------