├── .gitignore ├── CDL.ipynb ├── CDL_tf.ipynb ├── LICENSE ├── README.md └── data_preprocess.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /CDL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CDL" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### import module" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "C:\\Users\\k12s35h813g\\AppData\\Local\\Continuum\\anaconda3\\envs\\tf_gpu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 27 | " from ._conv import register_converters as _register_converters\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import numpy as np\n", 33 | "import pickle\n", 34 | "import tensorflow as tf\n", 35 | "#init random seed\n", 36 | "np.random.seed(5)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 1. 
data preprocess" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "#### build item information matrix of citeulike-a by bag of word" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "#find vocabulary_size = 8000\n", 60 | "with open(r\"ctrsr_datasets/citeulike-a/vocabulary.dat\") as vocabulary_file:\n", 61 | " vocabulary_size = len(vocabulary_file.readlines())\n", 62 | " \n", 63 | "#find item_size = 16980\n", 64 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 65 | " item_size = len(item_info_file.readlines())\n", 66 | "\n", 67 | "#initialize item_infomation_matrix (16980 , 8000)\n", 68 | "item_infomation_matrix = np.zeros((item_size , vocabulary_size))\n", 69 | "\n", 70 | "#build item_infomation_matrix\n", 71 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 72 | " sentences = item_info_file.readlines()\n", 73 | " \n", 74 | " for index,sentence in enumerate(sentences):\n", 75 | " words = sentence.strip().split(\" \")[1:]\n", 76 | " for word in words:\n", 77 | " vocabulary_index , number = word.split(\":\")\n", 78 | " item_infomation_matrix[index][int(vocabulary_index)] =number\n", 79 | " " 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "#### build rating matrix citeulike-a" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#find user_size = 5551\n", 96 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 97 | " user_size = len(rating_file.readlines())\n", 98 | "\n", 99 | "#initialize rating_matrix (5551 , 16980)\n", 100 | "import numpy as np\n", 101 | "rating_matrix = np.zeros((user_size , item_size))\n", 102 | "\n", 103 | "#build rating_matrix\n", 104 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 105 | " lines = rating_file.readlines()\n", 106 | " for index,line in enumerate(lines):\n", 107 | " items = line.strip().split(\" \")\n", 108 | " for item in items: \n", 109 | " rating_matrix[index][int(item)] = 1" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### save matrix by pickle" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "with open(r'item_infomation_matrix.pickle', 'wb') as handle:\n", 126 | " pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", 127 | "with open(r'rating_matrix.pickle', 'wb') as handle:\n", 128 | " pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### load matrix from pickle " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "with open(r'item_infomation_matrix.pickle', 'rb') as handle:\n", 145 | " item_infomation_matrix = pickle.load(handle) \n", 146 | " \n", 147 | "with open(r'rating_matrix.pickle', 'rb') as handle2:\n", 148 | " rating_matrix = pickle.load(handle2)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## 2. 
build model" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### matrix factorization model" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "class MF():\n", 172 | " def __init__(self , rating_matrix ):\n", 173 | " #### 參數設定\n", 174 | " self.num_u = rating_matrix.shape[0] #5551\n", 175 | " self.num_v = rating_matrix.shape[1] #16980\n", 176 | " self.u_lambda = 100\n", 177 | " self.v_lambda = 0.1\n", 178 | " self.k = 50 #latent維度\n", 179 | " self.a = 1\n", 180 | " self.b =0.01\n", 181 | " self.R = np.mat(rating_matrix)\n", 182 | " self.C = np.mat(np.ones(self.R.shape)) * self.b\n", 183 | " self.C[np.where(self.R>0)] = self.a\n", 184 | " self.I_U = np.mat(np.eye(self.k) * self.u_lambda)\n", 185 | " self.I_V = np.mat(np.eye(self.k) * self.v_lambda)\n", 186 | " self.U = np.mat(np.random.normal(0 , 1/self.u_lambda , size=(self.k,self.num_u)))\n", 187 | " self.V = np.mat(np.random.normal(0 , 1/self.v_lambda , size=(self.k,self.num_v)))\n", 188 | " \n", 189 | "\n", 190 | " def test(self):\n", 191 | " print( ((U_cut*self.R[np.ravel(np.where(self.R[:,j]>0)[1]),j] + self.v_lambda * self.V_sdae[j])).shape)\n", 192 | " def ALS(self , V_sdae):\n", 193 | " self.V_sdae = np.mat(V_sdae)\n", 194 | " \n", 195 | " V_sq = self.V * self.V.T * self.b\n", 196 | " for i in range(self.num_u):\n", 197 | " idx_a = np.ravel(np.where(self.R[i,:]>0)[1])\n", 198 | " V_cut = self.V[:,idx_a]\n", 199 | " self.U[:,i] = np.linalg.pinv( V_sq+ V_cut * V_cut.T * (self.a-self.b) + self.I_U )*(V_cut*self.R[i,idx_a].T) #V_sq+V_cut*V_cut.T*a_m_b = VCV^T\n", 200 | " \n", 201 | " U_sq = self.U * self.U.T * self.b\n", 202 | " for j in range(self.num_v):\n", 203 | " idx_a = np.ravel(np.where(self.R[:,j]>0)[1])\n", 204 | " U_cut = self.U[:,idx_a]\n", 205 | " self.V[:,j] = np.linalg.pinv(U_sq+U_cut*U_cut.T*(self.a-self.b)+self.I_V)* (U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))\n", 206 | " \n", 207 | " return self.U ,self.V" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "#### masking noise " 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 7, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def mask(corruption_level ,size):\n", 224 | " mask = np.random.binomial(1, 1 - corruption_level, [size[0],size[1]])\n", 225 | " return mask\n", 226 | "\n", 227 | "def add_noise(x , corruption_level ):\n", 228 | " x = x * mask(corruption_level , x.shape)\n", 229 | " return x" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 8, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "class CDL():\n", 239 | " def __init__(self , rating_matrix , item_infomation_matrix):\n", 240 | " # model參數設定\n", 241 | " self.n_input = 8000\n", 242 | " self.n_hidden1 = 200\n", 243 | " self.n_hidden2 = 50\n", 244 | " self.k = 50\n", 245 | " \n", 246 | " self.lambda_w = 1\n", 247 | " self.lambda_n = 1\n", 248 | " self.lambda_u = 1\n", 249 | " self.lambda_v = 1\n", 250 | " \n", 251 | " self.drop_ratio = 0.1\n", 252 | " self.learning_rate = 0.001\n", 253 | " self.epochs = 10\n", 254 | " self.batch_size = 32\n", 255 | " \n", 256 | " self.num_u = rating_matrix.shape[0]\n", 257 | " self.num_v = rating_matrix.shape[1]\n", 258 | " \n", 259 | " self.Weights = {\n", 260 | " 'w1' : tf.Variable(tf.random_normal( [self.n_input , self.n_hidden1] , mean=0.0, 
stddev=1 / self.lambda_w )),\n", 261 | " 'w2' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),\n", 262 | " 'w3' : tf.Variable(tf.random_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),\n", 263 | " 'w4' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_input] , mean=0.0, stddev=1 / self.lambda_w )) \n", 264 | " }\n", 265 | " self.Biases = {\n", 266 | " 'b1' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),\n", 267 | " 'b2' : tf.Variable(tf.random_normal( [self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),\n", 268 | " 'b3' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),\n", 269 | " 'b4' : tf.Variable(tf.random_normal( [self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))\n", 270 | " }\n", 271 | " \n", 272 | " self.item_infomation_matrix = item_infomation_matrix\n", 273 | " \n", 274 | " self.build_model()\n", 275 | " def encoder(self , x , drop_ratio):\n", 276 | " w1 = self.Weights['w1']\n", 277 | " b1 = self.Biases['b1']\n", 278 | " L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )\n", 279 | " L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )\n", 280 | " \n", 281 | " w2 = self.Weights['w2']\n", 282 | " b2 = self.Biases['b2']\n", 283 | " L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )\n", 284 | " L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)\n", 285 | " \n", 286 | " return L2\n", 287 | " \n", 288 | " def decoder(self , x , drop_ratio):\n", 289 | " w3 = self.Weights['w3']\n", 290 | " b3 = self.Biases['b3']\n", 291 | " L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)\n", 292 | " L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)\n", 293 | "\n", 294 | " w4 = self.Weights['w4']\n", 295 | " b4 = self.Biases['b4']\n", 296 | " L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)\n", 297 | " L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)\n", 298 | "\n", 299 | " return L4\n", 300 | " \n", 301 | " def build_model(self):\n", 302 | " self.model_X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 303 | " self.model_X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 304 | " self.model_V = tf.placeholder(tf.float32 , shape=(None , self.k))\n", 305 | " self.model_drop_ratio = tf.placeholder(tf.float32)\n", 306 | " \n", 307 | " self.V_sdae = self.encoder( self.model_X_0 , self.model_drop_ratio )\n", 308 | " self.y_pred = self.decoder( self.V_sdae , self.model_drop_ratio )\n", 309 | " \n", 310 | " self.Regularization = tf.reduce_sum([tf.nn.l2_loss(w)+tf.nn.l2_loss(b) for w,b in zip(self.Weights.values() , self.Biases.values())])\n", 311 | " loss_r =1/2 * self.lambda_w * self.Regularization\n", 312 | " loss_a =1/2 * self.lambda_n * tf.reduce_sum(tf.pow( self.model_X_c - self.y_pred , 2 ))\n", 313 | " loss_v =1/2 * self.lambda_v * tf.reduce_sum(tf.pow( self.model_V - self.V_sdae , 2 ))\n", 314 | " self.Loss = loss_r + loss_a + loss_v\n", 315 | " \n", 316 | " self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.Loss)\n", 317 | " def training(self , rating_matrix):\n", 318 | " #np.random.shuffle(self.item_infomation_matrix) #random index of train data\n", 319 | " \n", 320 | " self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)\n", 321 | " \n", 322 | " sess = tf.Session()\n", 323 | " sess.run(tf.global_variables_initializer())\n", 324 | " \n", 325 | " mf = MF( rating_matrix )\n", 326 | " \n", 327 | " for epoch in range(self.epochs):\n", 328 | " print(\"%d / 
%d\"%(epoch+1 , self.epochs))\n", 329 | " \n", 330 | " V_sdae = sess.run(self.V_sdae , feed_dict={self.model_X_0 : self.item_infomation_matrix_noise , self.model_drop_ratio : 0.1})\n", 331 | " \n", 332 | " U , V = mf.ALS(V_sdae)\n", 333 | " V = np.resize(V,(16980 , 50))\n", 334 | " for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):\n", 335 | " X_train_batch = self.item_infomation_matrix_noise[i:i+self.batch_size]\n", 336 | " y_train_batch = self.item_infomation_matrix[i:i+self.batch_size]\n", 337 | " V_batch = V[i:i+self.batch_size]\n", 338 | " _ , my_loss = sess.run([self.optimizer, self.Loss] , feed_dict={self.model_X_0 :X_train_batch , self.model_X_c : y_train_batch , self.model_V:V_batch, self.model_drop_ratio : 0.1})\n", 339 | " print(my_loss)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 9, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "1 / 10\n", 352 | "387677.62\n", 353 | "2 / 10\n", 354 | "175559.14\n", 355 | "3 / 10\n", 356 | "76667.734\n", 357 | "4 / 10\n", 358 | "33305.188\n", 359 | "5 / 10\n", 360 | "14436.599\n", 361 | "6 / 10\n", 362 | "6843.848\n", 363 | "7 / 10\n", 364 | "3749.5586\n", 365 | "8 / 10\n", 366 | "2751.414\n", 367 | "9 / 10\n", 368 | "2292.7659\n", 369 | "10 / 10\n", 370 | "2268.0378\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "cdl = CDL(rating_matrix , item_infomation_matrix)\n", 376 | "cdl.build_model()\n", 377 | "cdl.training(rating_matrix)" 378 | ] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.6.5" 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 2 402 | } 403 | -------------------------------------------------------------------------------- /CDL_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CDL by tensorflow" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### import module" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "C:\\Users\\k12s35h813g\\AppData\\Local\\Continuum\\anaconda3\\envs\\tf_gpu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 27 | " from ._conv import register_converters as _register_converters\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import numpy as np\n", 33 | "import pickle\n", 34 | "import tensorflow as tf\n", 35 | "import time\n", 36 | "#init random seed\n", 37 | "np.random.seed(5)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## 1. 
data preprocess" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "#### build item information matrix of citeulike-a by bag of word" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#find vocabulary_size = 8000\n", 61 | "with open(r\"ctrsr_datasets/citeulike-a/vocabulary.dat\") as vocabulary_file:\n", 62 | " vocabulary_size = len(vocabulary_file.readlines())\n", 63 | " \n", 64 | "#find item_size = 16980\n", 65 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 66 | " item_size = len(item_info_file.readlines())\n", 67 | "\n", 68 | "#initialize item_infomation_matrix (16980 , 8000)\n", 69 | "item_infomation_matrix = np.zeros((item_size , vocabulary_size))\n", 70 | "\n", 71 | "#build item_infomation_matrix\n", 72 | "with open(r\"ctrsr_datasets/citeulike-a/mult.dat\") as item_info_file:\n", 73 | " sentences = item_info_file.readlines()\n", 74 | " \n", 75 | " for index,sentence in enumerate(sentences):\n", 76 | " words = sentence.strip().split(\" \")[1:]\n", 77 | " for word in words:\n", 78 | " vocabulary_index , number = word.split(\":\")\n", 79 | " item_infomation_matrix[index][int(vocabulary_index)] =number" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "#### build rating matrix citeulike-a" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#find user_size = 5551\n", 96 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 97 | " user_size = len(rating_file.readlines())\n", 98 | "\n", 99 | "#initialize rating_matrix (5551 , 16980)\n", 100 | "import numpy as np\n", 101 | "rating_matrix = np.zeros((user_size , item_size))\n", 102 | "\n", 103 | "#build rating_matrix\n", 104 | "with open(r\"ctrsr_datasets/citeulike-a/users.dat\") as rating_file:\n", 105 | " lines = rating_file.readlines()\n", 106 | " for index,line in enumerate(lines):\n", 107 | " items = line.strip().split(\" \")\n", 108 | " for item in items: \n", 109 | " rating_matrix[index][int(item)] = 1" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### save matrix by pickle" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "with open(r'item_infomation_matrix.pickle', 'wb') as handle:\n", 126 | " pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", 127 | "with open(r'rating_matrix.pickle', 'wb') as handle:\n", 128 | " pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### load matrix from pickle " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "with open(r'item_infomation_matrix.pickle', 'rb') as handle:\n", 145 | " item_infomation_matrix = pickle.load(handle) \n", 146 | " \n", 147 | "with open(r'rating_matrix.pickle', 'rb') as handle2:\n", 148 | " rating_matrix = pickle.load(handle2)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## 2. 
build model" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### masking noise " 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "#apply SDAE : we hope to reconstruct item information by masking nosie\n", 172 | "def mask(corruption_level ,size):\n", 173 | " mask = np.random.binomial(1, 1 - corruption_level, [size[0],size[1]])\n", 174 | " return mask\n", 175 | "\n", 176 | "def add_noise(x , corruption_level ):\n", 177 | " x = x * mask(corruption_level , x.shape)\n", 178 | " return x" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "class CDL():\n", 188 | " def __init__(self , rating_matrix , item_infomation_matrix):\n", 189 | " \n", 190 | " # model參數設定\n", 191 | " self.n_input = item_infomation_matrix.shape[1]\n", 192 | " self.n_hidden1 = 200\n", 193 | " self.n_hidden2 = 50\n", 194 | " self.k = 50\n", 195 | " \n", 196 | " self.lambda_w = 0.1\n", 197 | " self.lambda_n = 10\n", 198 | " self.lambda_u = 1\n", 199 | " self.lambda_v = 10\n", 200 | " \n", 201 | " self.drop_ratio = 0.1\n", 202 | " self.learning_rate = 0.01\n", 203 | " self.epochs = 200\n", 204 | " self.batch_size = 256\n", 205 | " \n", 206 | " self.a = 1\n", 207 | " self.b =0.01\n", 208 | " self.P = 1\n", 209 | " \n", 210 | " self.num_u = rating_matrix.shape[0]\n", 211 | " self.num_v = rating_matrix.shape[1]\n", 212 | " \n", 213 | " self.Weights = {\n", 214 | " 'w1' : tf.Variable(tf.truncated_normal( [self.n_input , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),\n", 215 | " 'w2' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),\n", 216 | " 'w3' : tf.Variable(tf.truncated_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),\n", 217 | " 'w4' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_input] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))) \n", 218 | " }\n", 219 | " self.Biases = {\n", 220 | " 'b1' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),\n", 221 | " 'b2' : tf.Variable( tf.zeros(shape=self.n_hidden2) ),\n", 222 | " 'b3' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),\n", 223 | " 'b4' : tf.Variable( tf.zeros(shape=self.n_input) ),\n", 224 | " }\n", 225 | " \n", 226 | " self.item_infomation_matrix = item_infomation_matrix\n", 227 | " \n", 228 | " self.rating_matrix = rating_matrix\n", 229 | " \n", 230 | " for i in range(self.num_u):\n", 231 | " x = np.random.choice(np.where(self.rating_matrix[i,:]>0)[0] , self.P)\n", 232 | " self.rating_matrix[i,:].fill(0)\n", 233 | " self.rating_matrix[i,x] = 1\n", 234 | " \n", 235 | " self.confidence = np.mat(np.ones(self.rating_matrix.shape)) * self.b\n", 236 | " self.confidence[np.where(self.rating_matrix>0)] = self.a\n", 237 | " \n", 238 | " def encoder(self , x , drop_ratio):\n", 239 | " w1 = self.Weights['w1']\n", 240 | " b1 = self.Biases['b1']\n", 241 | " L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )\n", 242 | " L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )\n", 243 | " \n", 244 | " w2 = self.Weights['w2']\n", 245 | " b2 = self.Biases['b2']\n", 246 | " L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )\n", 247 | " L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)\n", 248 | " \n", 249 | " return L2\n", 250 | " \n", 251 | " def decoder(self , x , drop_ratio):\n", 252 | " w3 = 
self.Weights['w3']\n", 253 | " b3 = self.Biases['b3']\n", 254 | " L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)\n", 255 | " L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)\n", 256 | "\n", 257 | " w4 = self.Weights['w4']\n", 258 | " b4 = self.Biases['b4']\n", 259 | " L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)\n", 260 | " L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)\n", 261 | "\n", 262 | " return L4\n", 263 | " \n", 264 | "# def only_MF(self):\n", 265 | "# self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 266 | "# self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 267 | "# self.drop_ratio = tf.placeholder(tf.float32)\n", 268 | "# self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )\n", 269 | " \n", 270 | "# batch_size = tf.cast(tf.shape(self.R)[1], tf.int32)\n", 271 | " \n", 272 | " \n", 273 | "# self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) ) \n", 274 | "# self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )\n", 275 | " \n", 276 | "# batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])\n", 277 | " \n", 278 | "# loss_1 = self.lambda_u * tf.nn.l2_loss( self.U ) \n", 279 | "# loss_2 = tf.reduce_sum(tf.multiply(self.C ,\n", 280 | "# tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True))) \n", 281 | "# )\n", 282 | " \n", 283 | "# self.loss = loss_1 + loss_2 \n", 284 | "# self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)\n", 285 | " \n", 286 | " def build_model(self):\n", 287 | " \n", 288 | " self.X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 289 | " self.X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))\n", 290 | " self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 291 | " self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )\n", 292 | " self.drop_ratio = tf.placeholder(tf.float32)\n", 293 | " self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )\n", 294 | " #SDAE item factor\n", 295 | " V_sdae = self.encoder( self.X_0 , self.drop_ratio )\n", 296 | " \n", 297 | " #SDAE output \n", 298 | " sdae_output = self.decoder( V_sdae , self.drop_ratio )\n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " batch_size = tf.cast(tf.shape(self.X_0)[0], tf.int32)\n", 304 | " \n", 305 | " \n", 306 | " self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) ) \n", 307 | " self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )\n", 308 | " \n", 309 | " batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])\n", 310 | " \n", 311 | " loss_1 = self.lambda_u * tf.nn.l2_loss( self.U ) \n", 312 | " loss_2 = self.lambda_w * 1/2 * tf.reduce_sum([tf.nn.l2_loss(w)+tf.nn.l2_loss(b) for w,b in zip(self.Weights.values() , self.Biases.values())])\n", 313 | " loss_3 = self.lambda_v * tf.nn.l2_loss(batch_V - V_sdae)\n", 314 | " loss_4 = self.lambda_n * tf.nn.l2_loss(sdae_output - self.X_c)\n", 315 | " \n", 316 | " loss_5 = tf.reduce_sum(tf.multiply(self.C ,\n", 317 | " tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True))) \n", 318 | " )\n", 319 | " \n", 320 | " self.loss = loss_1 + loss_2 + loss_3 + loss_4 + loss_5\n", 321 | " self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)\n", 322 | " def train_model(self):\n", 323 | " self.sess = tf.Session()\n", 324 | " self.sess.run(tf.global_variables_initializer())\n", 325 | " \n", 326 | " 
start_time = time.time()\n", 327 | " \n", 328 | " random_idx = np.random.permutation(self.num_v)\n", 329 | " \n", 330 | " self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)\n", 331 | " \n", 332 | " for epoch in range(self.epochs):\n", 333 | " batch_cost = 0\n", 334 | " for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):\n", 335 | " \n", 336 | " batch_idx = random_idx[i:i+self.batch_size]\n", 337 | " _ , loss = self.sess.run([self.optimizer, self.loss] , \n", 338 | " feed_dict={self.X_0 : self.item_infomation_matrix_noise[batch_idx,:] , \n", 339 | " self.X_c : self.item_infomation_matrix[batch_idx,:] , \n", 340 | " self.R : self.rating_matrix[: , batch_idx], \n", 341 | " self.C : self.confidence[: , batch_idx], \n", 342 | " self.drop_ratio : 0.1 ,\n", 343 | " self.model_batch_data_idx : batch_idx })\n", 344 | " batch_cost = batch_cost + loss\n", 345 | "\n", 346 | " print (\"Training //\", \"Epoch %d //\" % (epoch+1), \" Total cost = {:.2f}\".format(batch_cost), \"Elapsed time : %d sec\" % (time.time() - start_time))\n", 347 | " \n", 348 | " return self.sess.run((tf.matmul(self.U, self.V, transpose_b=True)))" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "#### train model" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 8, 361 | "metadata": { 362 | "scrolled": true 363 | }, 364 | "outputs": [ 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "Training // Epoch 1 // Total cost = 699676798.00 Elapsed time : 13 sec\n", 370 | "Training // Epoch 2 // Total cost = 509001361.50 Elapsed time : 23 sec\n", 371 | "Training // Epoch 3 // Total cost = 426090375.00 Elapsed time : 32 sec\n", 372 | "Training // Epoch 4 // Total cost = 402229449.00 Elapsed time : 40 sec\n", 373 | "Training // Epoch 5 // Total cost = 385809588.50 Elapsed time : 50 sec\n", 374 | "Training // Epoch 6 // Total cost = 370855234.50 Elapsed time : 59 sec\n", 375 | "Training // Epoch 7 // Total cost = 356779552.50 Elapsed time : 68 sec\n", 376 | "Training // Epoch 8 // Total cost = 343552683.50 Elapsed time : 77 sec\n", 377 | "Training // Epoch 9 // Total cost = 330934537.50 Elapsed time : 85 sec\n", 378 | "Training // Epoch 10 // Total cost = 318868322.50 Elapsed time : 94 sec\n", 379 | "Training // Epoch 11 // Total cost = 307315671.75 Elapsed time : 102 sec\n", 380 | "Training // Epoch 12 // Total cost = 296283195.75 Elapsed time : 112 sec\n", 381 | "Training // Epoch 13 // Total cost = 285720805.75 Elapsed time : 121 sec\n", 382 | "Training // Epoch 14 // Total cost = 275675537.25 Elapsed time : 130 sec\n", 383 | "Training // Epoch 15 // Total cost = 265893138.00 Elapsed time : 139 sec\n", 384 | "Training // Epoch 16 // Total cost = 256381356.50 Elapsed time : 147 sec\n", 385 | "Training // Epoch 17 // Total cost = 247409903.50 Elapsed time : 157 sec\n", 386 | "Training // Epoch 18 // Total cost = 238836460.75 Elapsed time : 165 sec\n", 387 | "Training // Epoch 19 // Total cost = 230363394.25 Elapsed time : 174 sec\n", 388 | "Training // Epoch 20 // Total cost = 222478204.00 Elapsed time : 183 sec\n", 389 | "Training // Epoch 21 // Total cost = 214707066.00 Elapsed time : 192 sec\n", 390 | "Training // Epoch 22 // Total cost = 207398870.00 Elapsed time : 201 sec\n", 391 | "Training // Epoch 23 // Total cost = 200335661.00 Elapsed time : 210 sec\n", 392 | "Training // Epoch 24 // Total cost = 193502709.25 Elapsed time : 219 sec\n", 393 | "Training // Epoch 25 
// Total cost = 187017720.00 Elapsed time : 227 sec\n", 394 | "Training // Epoch 26 // Total cost = 180656876.50 Elapsed time : 236 sec\n", 395 | "Training // Epoch 27 // Total cost = 174520094.75 Elapsed time : 244 sec\n", 396 | "Training // Epoch 28 // Total cost = 168618046.00 Elapsed time : 253 sec\n", 397 | "Training // Epoch 29 // Total cost = 162554788.25 Elapsed time : 262 sec\n", 398 | "Training // Epoch 30 // Total cost = 157096778.00 Elapsed time : 271 sec\n", 399 | "Training // Epoch 31 // Total cost = 151624672.62 Elapsed time : 280 sec\n", 400 | "Training // Epoch 32 // Total cost = 146411205.25 Elapsed time : 288 sec\n", 401 | "Training // Epoch 33 // Total cost = 141438258.50 Elapsed time : 297 sec\n", 402 | "Training // Epoch 34 // Total cost = 136530659.00 Elapsed time : 305 sec\n", 403 | "Training // Epoch 35 // Total cost = 131753547.00 Elapsed time : 314 sec\n", 404 | "Training // Epoch 36 // Total cost = 126863433.50 Elapsed time : 323 sec\n", 405 | "Training // Epoch 37 // Total cost = 122656202.88 Elapsed time : 332 sec\n", 406 | "Training // Epoch 38 // Total cost = 118599578.00 Elapsed time : 340 sec\n", 407 | "Training // Epoch 39 // Total cost = 114611759.88 Elapsed time : 348 sec\n", 408 | "Training // Epoch 40 // Total cost = 110903898.50 Elapsed time : 357 sec\n", 409 | "Training // Epoch 41 // Total cost = 107518569.25 Elapsed time : 365 sec\n", 410 | "Training // Epoch 42 // Total cost = 104367943.38 Elapsed time : 373 sec\n", 411 | "Training // Epoch 43 // Total cost = 101399079.12 Elapsed time : 382 sec\n", 412 | "Training // Epoch 44 // Total cost = 98611119.12 Elapsed time : 391 sec\n", 413 | "Training // Epoch 45 // Total cost = 95935625.75 Elapsed time : 400 sec\n", 414 | "Training // Epoch 46 // Total cost = 93366417.50 Elapsed time : 409 sec\n", 415 | "Training // Epoch 47 // Total cost = 90901260.00 Elapsed time : 417 sec\n", 416 | "Training // Epoch 48 // Total cost = 88532784.75 Elapsed time : 426 sec\n", 417 | "Training // Epoch 49 // Total cost = 86253099.31 Elapsed time : 435 sec\n", 418 | "Training // Epoch 50 // Total cost = 84065897.81 Elapsed time : 444 sec\n", 419 | "Training // Epoch 51 // Total cost = 81960649.31 Elapsed time : 452 sec\n", 420 | "Training // Epoch 52 // Total cost = 79932892.19 Elapsed time : 461 sec\n", 421 | "Training // Epoch 53 // Total cost = 77983110.50 Elapsed time : 470 sec\n", 422 | "Training // Epoch 54 // Total cost = 76107443.06 Elapsed time : 478 sec\n", 423 | "Training // Epoch 55 // Total cost = 74296636.31 Elapsed time : 486 sec\n", 424 | "Training // Epoch 56 // Total cost = 72557214.06 Elapsed time : 495 sec\n", 425 | "Training // Epoch 57 // Total cost = 70873424.69 Elapsed time : 503 sec\n", 426 | "Training // Epoch 58 // Total cost = 69249254.19 Elapsed time : 512 sec\n", 427 | "Training // Epoch 59 // Total cost = 67679117.94 Elapsed time : 521 sec\n", 428 | "Training // Epoch 60 // Total cost = 66165503.62 Elapsed time : 530 sec\n", 429 | "Training // Epoch 61 // Total cost = 64698699.00 Elapsed time : 539 sec\n", 430 | "Training // Epoch 62 // Total cost = 63282957.81 Elapsed time : 547 sec\n", 431 | "Training // Epoch 63 // Total cost = 61906538.56 Elapsed time : 556 sec\n", 432 | "Training // Epoch 64 // Total cost = 60575016.38 Elapsed time : 565 sec\n", 433 | "Training // Epoch 65 // Total cost = 59282706.25 Elapsed time : 574 sec\n", 434 | "Training // Epoch 66 // Total cost = 58029875.38 Elapsed time : 583 sec\n", 435 | "Training // Epoch 67 // Total cost = 56815220.81 Elapsed time : 592 
sec\n", 436 | "Training // Epoch 68 // Total cost = 55630129.94 Elapsed time : 601 sec\n", 437 | "Training // Epoch 69 // Total cost = 54482065.69 Elapsed time : 609 sec\n", 438 | "Training // Epoch 70 // Total cost = 53366296.62 Elapsed time : 618 sec\n", 439 | "Training // Epoch 71 // Total cost = 52280660.06 Elapsed time : 627 sec\n", 440 | "Training // Epoch 72 // Total cost = 51223257.59 Elapsed time : 635 sec\n", 441 | "Training // Epoch 73 // Total cost = 50193597.88 Elapsed time : 644 sec\n", 442 | "Training // Epoch 74 // Total cost = 49194452.50 Elapsed time : 652 sec\n", 443 | "Training // Epoch 75 // Total cost = 48220310.16 Elapsed time : 661 sec\n", 444 | "Training // Epoch 76 // Total cost = 47270594.84 Elapsed time : 669 sec\n", 445 | "Training // Epoch 77 // Total cost = 46346913.22 Elapsed time : 678 sec\n", 446 | "Training // Epoch 78 // Total cost = 45445881.34 Elapsed time : 687 sec\n", 447 | "Training // Epoch 79 // Total cost = 44570288.53 Elapsed time : 696 sec\n", 448 | "Training // Epoch 80 // Total cost = 43717168.12 Elapsed time : 704 sec\n", 449 | "Training // Epoch 81 // Total cost = 42887369.91 Elapsed time : 713 sec\n", 450 | "Training // Epoch 82 // Total cost = 42079221.00 Elapsed time : 721 sec\n", 451 | "Training // Epoch 83 // Total cost = 41288647.12 Elapsed time : 730 sec\n", 452 | "Training // Epoch 84 // Total cost = 40523911.88 Elapsed time : 739 sec\n", 453 | "Training // Epoch 85 // Total cost = 39775534.69 Elapsed time : 747 sec\n", 454 | "Training // Epoch 86 // Total cost = 39051408.12 Elapsed time : 756 sec\n", 455 | "Training // Epoch 87 // Total cost = 38350553.22 Elapsed time : 764 sec\n", 456 | "Training // Epoch 88 // Total cost = 37661407.00 Elapsed time : 775 sec\n", 457 | "Training // Epoch 89 // Total cost = 36996213.59 Elapsed time : 783 sec\n", 458 | "Training // Epoch 90 // Total cost = 36349649.47 Elapsed time : 791 sec\n", 459 | "Training // Epoch 91 // Total cost = 35726645.00 Elapsed time : 800 sec\n", 460 | "Training // Epoch 92 // Total cost = 35117365.41 Elapsed time : 811 sec\n", 461 | "Training // Epoch 93 // Total cost = 34530359.38 Elapsed time : 822 sec\n", 462 | "Training // Epoch 94 // Total cost = 33960657.64 Elapsed time : 832 sec\n", 463 | "Training // Epoch 95 // Total cost = 33409376.77 Elapsed time : 841 sec\n", 464 | "Training // Epoch 96 // Total cost = 32873193.94 Elapsed time : 850 sec\n", 465 | "Training // Epoch 97 // Total cost = 32355168.34 Elapsed time : 858 sec\n", 466 | "Training // Epoch 98 // Total cost = 31859824.70 Elapsed time : 867 sec\n", 467 | "Training // Epoch 99 // Total cost = 31375273.75 Elapsed time : 876 sec\n", 468 | "Training // Epoch 100 // Total cost = 30909934.75 Elapsed time : 885 sec\n", 469 | "Training // Epoch 101 // Total cost = 30461370.30 Elapsed time : 893 sec\n", 470 | "Training // Epoch 102 // Total cost = 30028708.03 Elapsed time : 902 sec\n", 471 | "Training // Epoch 103 // Total cost = 29610802.08 Elapsed time : 911 sec\n", 472 | "Training // Epoch 104 // Total cost = 29209048.72 Elapsed time : 920 sec\n", 473 | "Training // Epoch 105 // Total cost = 28818532.34 Elapsed time : 928 sec\n", 474 | "Training // Epoch 106 // Total cost = 28447847.41 Elapsed time : 937 sec\n", 475 | "Training // Epoch 107 // Total cost = 28091455.08 Elapsed time : 946 sec\n", 476 | "Training // Epoch 108 // Total cost = 27748029.56 Elapsed time : 955 sec\n", 477 | "Training // Epoch 109 // Total cost = 27415978.22 Elapsed time : 963 sec\n", 478 | "Training // Epoch 110 // Total cost = 
27100371.06 Elapsed time : 972 sec\n", 479 | "Training // Epoch 111 // Total cost = 26795215.59 Elapsed time : 981 sec\n", 480 | "Training // Epoch 112 // Total cost = 26506069.69 Elapsed time : 990 sec\n" 481 | ] 482 | }, 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "Training // Epoch 113 // Total cost = 26229822.77 Elapsed time : 999 sec\n", 488 | "Training // Epoch 114 // Total cost = 25965280.67 Elapsed time : 1008 sec\n", 489 | "Training // Epoch 115 // Total cost = 25713353.86 Elapsed time : 1017 sec\n", 490 | "Training // Epoch 116 // Total cost = 25468158.66 Elapsed time : 1025 sec\n", 491 | "Training // Epoch 117 // Total cost = 25241022.56 Elapsed time : 1033 sec\n", 492 | "Training // Epoch 118 // Total cost = 25019519.48 Elapsed time : 1042 sec\n", 493 | "Training // Epoch 119 // Total cost = 24811663.63 Elapsed time : 1050 sec\n", 494 | "Training // Epoch 120 // Total cost = 24609974.30 Elapsed time : 1058 sec\n", 495 | "Training // Epoch 121 // Total cost = 24420480.23 Elapsed time : 1066 sec\n", 496 | "Training // Epoch 122 // Total cost = 24235782.49 Elapsed time : 1074 sec\n", 497 | "Training // Epoch 123 // Total cost = 24066398.74 Elapsed time : 1082 sec\n", 498 | "Training // Epoch 124 // Total cost = 23899705.29 Elapsed time : 1091 sec\n", 499 | "Training // Epoch 125 // Total cost = 23745025.42 Elapsed time : 1099 sec\n", 500 | "Training // Epoch 126 // Total cost = 23600273.58 Elapsed time : 1107 sec\n", 501 | "Training // Epoch 127 // Total cost = 23457817.05 Elapsed time : 1115 sec\n", 502 | "Training // Epoch 128 // Total cost = 23326691.69 Elapsed time : 1123 sec\n", 503 | "Training // Epoch 129 // Total cost = 23200199.76 Elapsed time : 1132 sec\n", 504 | "Training // Epoch 130 // Total cost = 23082377.88 Elapsed time : 1140 sec\n", 505 | "Training // Epoch 131 // Total cost = 22968153.27 Elapsed time : 1148 sec\n", 506 | "Training // Epoch 132 // Total cost = 22860444.31 Elapsed time : 1156 sec\n", 507 | "Training // Epoch 133 // Total cost = 22760058.06 Elapsed time : 1164 sec\n", 508 | "Training // Epoch 134 // Total cost = 22666147.52 Elapsed time : 1172 sec\n", 509 | "Training // Epoch 135 // Total cost = 22571107.13 Elapsed time : 1180 sec\n", 510 | "Training // Epoch 136 // Total cost = 22487584.22 Elapsed time : 1188 sec\n", 511 | "Training // Epoch 137 // Total cost = 22405833.73 Elapsed time : 1196 sec\n", 512 | "Training // Epoch 138 // Total cost = 22330392.38 Elapsed time : 1205 sec\n", 513 | "Training // Epoch 139 // Total cost = 22255380.34 Elapsed time : 1213 sec\n", 514 | "Training // Epoch 140 // Total cost = 22186002.29 Elapsed time : 1221 sec\n", 515 | "Training // Epoch 141 // Total cost = 22117630.56 Elapsed time : 1229 sec\n", 516 | "Training // Epoch 142 // Total cost = 22055342.34 Elapsed time : 1237 sec\n", 517 | "Training // Epoch 143 // Total cost = 21993179.08 Elapsed time : 1245 sec\n", 518 | "Training // Epoch 144 // Total cost = 21932841.30 Elapsed time : 1253 sec\n", 519 | "Training // Epoch 145 // Total cost = 21884063.32 Elapsed time : 1262 sec\n", 520 | "Training // Epoch 146 // Total cost = 21835091.59 Elapsed time : 1270 sec\n", 521 | "Training // Epoch 147 // Total cost = 21789127.23 Elapsed time : 1278 sec\n", 522 | "Training // Epoch 148 // Total cost = 21742637.62 Elapsed time : 1286 sec\n", 523 | "Training // Epoch 149 // Total cost = 21705565.35 Elapsed time : 1294 sec\n", 524 | "Training // Epoch 150 // Total cost = 21668287.75 Elapsed time : 1303 sec\n", 525 | "Training // 
Epoch 151 // Total cost = 21630913.35 Elapsed time : 1312 sec\n", 526 | "Training // Epoch 152 // Total cost = 21599348.62 Elapsed time : 1320 sec\n", 527 | "Training // Epoch 153 // Total cost = 21571637.19 Elapsed time : 1328 sec\n", 528 | "Training // Epoch 154 // Total cost = 21543501.29 Elapsed time : 1336 sec\n", 529 | "Training // Epoch 155 // Total cost = 21513418.12 Elapsed time : 1345 sec\n", 530 | "Training // Epoch 156 // Total cost = 21485080.34 Elapsed time : 1353 sec\n", 531 | "Training // Epoch 157 // Total cost = 21438892.87 Elapsed time : 1363 sec\n", 532 | "Training // Epoch 158 // Total cost = 21377036.28 Elapsed time : 1372 sec\n", 533 | "Training // Epoch 159 // Total cost = 21229306.30 Elapsed time : 1381 sec\n", 534 | "Training // Epoch 160 // Total cost = 21071121.86 Elapsed time : 1389 sec\n", 535 | "Training // Epoch 161 // Total cost = 20925566.02 Elapsed time : 1398 sec\n", 536 | "Training // Epoch 162 // Total cost = 20836658.01 Elapsed time : 1406 sec\n", 537 | "Training // Epoch 163 // Total cost = 20774302.80 Elapsed time : 1415 sec\n", 538 | "Training // Epoch 164 // Total cost = 20716914.44 Elapsed time : 1424 sec\n", 539 | "Training // Epoch 165 // Total cost = 20707060.63 Elapsed time : 1432 sec\n", 540 | "Training // Epoch 166 // Total cost = 20661099.41 Elapsed time : 1441 sec\n", 541 | "Training // Epoch 167 // Total cost = 20625338.57 Elapsed time : 1451 sec\n", 542 | "Training // Epoch 168 // Total cost = 20589319.52 Elapsed time : 1460 sec\n", 543 | "Training // Epoch 169 // Total cost = 20572881.80 Elapsed time : 1469 sec\n", 544 | "Training // Epoch 170 // Total cost = 20567003.31 Elapsed time : 1477 sec\n", 545 | "Training // Epoch 171 // Total cost = 20557453.87 Elapsed time : 1485 sec\n", 546 | "Training // Epoch 172 // Total cost = 20514101.95 Elapsed time : 1494 sec\n", 547 | "Training // Epoch 173 // Total cost = 20469838.30 Elapsed time : 1502 sec\n", 548 | "Training // Epoch 174 // Total cost = 20426545.98 Elapsed time : 1511 sec\n", 549 | "Training // Epoch 175 // Total cost = 20403331.02 Elapsed time : 1519 sec\n", 550 | "Training // Epoch 176 // Total cost = 20382709.72 Elapsed time : 1527 sec\n", 551 | "Training // Epoch 177 // Total cost = 20339543.34 Elapsed time : 1535 sec\n", 552 | "Training // Epoch 178 // Total cost = 20291659.41 Elapsed time : 1543 sec\n", 553 | "Training // Epoch 179 // Total cost = 20293770.04 Elapsed time : 1551 sec\n", 554 | "Training // Epoch 180 // Total cost = 20236462.23 Elapsed time : 1560 sec\n", 555 | "Training // Epoch 181 // Total cost = 20202695.11 Elapsed time : 1568 sec\n", 556 | "Training // Epoch 182 // Total cost = 20165418.95 Elapsed time : 1576 sec\n", 557 | "Training // Epoch 183 // Total cost = 20137667.19 Elapsed time : 1584 sec\n", 558 | "Training // Epoch 184 // Total cost = 20112504.45 Elapsed time : 1592 sec\n", 559 | "Training // Epoch 185 // Total cost = 20064062.89 Elapsed time : 1600 sec\n", 560 | "Training // Epoch 186 // Total cost = 20035768.56 Elapsed time : 1609 sec\n", 561 | "Training // Epoch 187 // Total cost = 20022080.76 Elapsed time : 1617 sec\n", 562 | "Training // Epoch 188 // Total cost = 19988423.27 Elapsed time : 1625 sec\n", 563 | "Training // Epoch 189 // Total cost = 19961282.66 Elapsed time : 1633 sec\n", 564 | "Training // Epoch 190 // Total cost = 19926572.26 Elapsed time : 1641 sec\n", 565 | "Training // Epoch 191 // Total cost = 19882042.70 Elapsed time : 1649 sec\n", 566 | "Training // Epoch 192 // Total cost = 19857866.74 Elapsed time : 1658 sec\n", 567 
| "Training // Epoch 193 // Total cost = 19820119.57 Elapsed time : 1666 sec\n", 568 | "Training // Epoch 194 // Total cost = 19791277.95 Elapsed time : 1674 sec\n", 569 | "Training // Epoch 195 // Total cost = 19769504.04 Elapsed time : 1682 sec\n", 570 | "Training // Epoch 196 // Total cost = 19738930.83 Elapsed time : 1690 sec\n", 571 | "Training // Epoch 197 // Total cost = 19727672.70 Elapsed time : 1698 sec\n", 572 | "Training // Epoch 198 // Total cost = 19704796.80 Elapsed time : 1706 sec\n", 573 | "Training // Epoch 199 // Total cost = 19676641.04 Elapsed time : 1715 sec\n", 574 | "Training // Epoch 200 // Total cost = 19652868.22 Elapsed time : 1723 sec\n" 575 | ] 576 | } 577 | ], 578 | "source": [ 579 | "R_train = rating_matrix.copy()\n", 580 | "cdl = CDL(R_train , item_infomation_matrix)\n", 581 | "cdl.build_model()\n", 582 | "R = cdl.train_model()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "#### evaluation" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 9, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "all_cnt = 0\n", 599 | "for i in range(rating_matrix.shape[0]):\n", 600 | " l_score = np.ravel(R[i,:]).tolist()\n", 601 | " pl = sorted(enumerate(l_score),key=lambda d:d[1],reverse=True)\n", 602 | " l_rec = [i[0] for i in pl][:300]\n", 603 | " s_rec = set(l_rec)\n", 604 | " s_true = set(np.ravel(np.where(rating_matrix[i,:]>0)))\n", 605 | " cnt_hit = len(s_rec.intersection(s_true))\n", 606 | " all_cnt = all_cnt + cnt_hit/len(s_true)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 10, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "name": "stdout", 616 | "output_type": "stream", 617 | "text": [ 618 | "accuracy : 0.081\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "#accuracy 0.085不能算太低 因為他是所有item(16980)去排序\n", 624 | "print(\"accuracy : %.3f\"%(all_cnt/rating_matrix.shape[0]))" 625 | ] 626 | } 627 | ], 628 | "metadata": { 629 | "kernelspec": { 630 | "display_name": "Python 3", 631 | "language": "python", 632 | "name": "python3" 633 | }, 634 | "language_info": { 635 | "codemirror_mode": { 636 | "name": "ipython", 637 | "version": 3 638 | }, 639 | "file_extension": ".py", 640 | "mimetype": "text/x-python", 641 | "name": "python", 642 | "nbconvert_exporter": "python", 643 | "pygments_lexer": "ipython3", 644 | "version": "3.6.5" 645 | } 646 | }, 647 | "nbformat": 4, 648 | "nbformat_minor": 2 649 | } 650 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 old cat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 old cat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # implement-Collaborative-Deep-Learning-for-Recommender-Systems 2 | 3 | A Python implementation of the paper "Collaborative Deep Learning for Recommender Systems". 4 | 5 | Collaborative Deep Learning (CDL) (Wang, H., Wang, N., & Yeung, D. Y. (2015, August). Collaborative deep learning for recommender systems. In Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (pp. 1235-1244). ACM.) 6 | 7 | ----------------------------------------------------------------------------- 8 | 9 | ## Introduction 10 | This paper combines collaborative filtering with a stacked denoising autoencoder: the autoencoder learns item latent vectors from bag-of-words content, and matrix factorization couples them with user feedback. The original authors implemented it in Python and C++ and update the parameters with an ALS algorithm. 11 | I implement the paper in TensorFlow and try two methods to update the parameters: (1) ALS and (2) gradient descent. 12 | 13 | You can download the [slide](https://drive.google.com/file/d/1EtnYFQyRSd6A24NIniJtE_U5bm6f4-lZ/view?usp=sharing) for more detailed information. 14 | 15 | ## Dataset 16 | The dataset is from CiteULike. You can download it from the original author's website [here](http://www.wanghao.in/publication.html). 17 | 18 | ## Usage 19 | 20 | CDL_tf.ipynb - train CDL by gradient descent (a condensed usage sketch follows below) 21 | 
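For orientation, here is a condensed end-to-end sketch of the gradient-descent variant. It is a sketch only: it assumes the `CDL` class from CDL_tf.ipynb is already defined, and that the two pickles written by data_preprocess.ipynb sit in the working directory (note that data_preprocess.ipynb as written saves them under `dataset\citeulike-a\`, so move or re-point them first).

```python
import pickle

# matrices produced by data_preprocess.ipynb
with open("rating_matrix.pickle", "rb") as f:
    rating_matrix = pickle.load(f)            # (5551 users, 16980 items), binary implicit feedback
with open("item_infomation_matrix.pickle", "rb") as f:
    item_infomation_matrix = pickle.load(f)   # (16980 items, 8000 words), bag-of-words counts

cdl = CDL(rating_matrix.copy(), item_infomation_matrix)   # CDL class from CDL_tf.ipynb
cdl.build_model()
R_hat = cdl.train_model()    # returns the dense score matrix U V^T

# The ALS variant (CDL.ipynb) follows the same setup but calls
# cdl.training(rating_matrix) instead of train_model().
```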
22 | CDL.ipynb - train CDL by ALS 23 | -------------------------------------------------------------------------------- /data_preprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# data preprocess" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## build item information matrix of citeulike-a by bag of word" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 7, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "#find vocabulary_size = 8000\n", 24 | "with open(r\"ctrsr_datasets\\citeulike-a\\vocabulary.dat\") as vocabulary_file:\n", 25 | " vocabulary_size = len(vocabulary_file.readlines())\n", 26 | " \n", 27 | "#find item_size = 16980\n", 28 | "with open(r\"ctrsr_datasets\\citeulike-a\\mult.dat\") as item_info_file:\n", 29 | " item_size = len(item_info_file.readlines())\n", 30 | "\n", 31 | "#initialize item_infomation_matrix (16980 , 8000)\n", 32 | "import numpy as np\n", 33 | "item_infomation_matrix = np.zeros((item_size , vocabulary_size))\n", 34 | "\n", 35 | "#build item_infomation_matrix\n", 36 | "with open(r\"ctrsr_datasets\\citeulike-a\\mult.dat\") as item_info_file:\n", 37 | " sentences = item_info_file.readlines()\n", 38 | " \n", 39 | " for index,sentence in enumerate(sentences):\n", 40 | " words = sentence.strip().split(\" \")[1:]\n", 41 | " for word in words:\n", 42 | " vocabulary_index , number = word.split(\":\")\n", 43 | " item_infomation_matrix[index][int(vocabulary_index)] =number\n", 44 | " " 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## build rating matrix citeulike-a" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 13, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#find user_size = 5551\n", 61 | "with open(r\"ctrsr_datasets\\citeulike-a\\users.dat\") as rating_file:\n", 62 | " user_size = len(rating_file.readlines())\n", 63 | "\n", 64 | "#initialize rating_matrix (5551 , 16980)\n", 65 | "import numpy as np\n", 66 | "rating_matrix = np.zeros((user_size , item_size))\n", 67 | "\n", 68 | "#build rating_matrix\n", 69 | "with open(r\"ctrsr_datasets\\citeulike-a\\users.dat\") as rating_file:\n", 70 | " lines = rating_file.readlines()\n", 71 | " for index,line in enumerate(lines):\n", 72 | " items = line.strip().split(\" \")\n", 73 | " for item in items: \n", 74 | " rating_matrix[index][int(item)] = 1" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## save matrix by pickle" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 14, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import pickle" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 15, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "with open(r'dataset\\citeulike-a\\item_infomation_matrix.pickle', 'wb') as handle:\n", 104 | " pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", 105 | "with open(r'dataset\\citeulike-a\\rating_matrix.pickle', 'wb') as handle:\n", 106 | " pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 
| "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.6.5" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | --------------------------------------------------------------------------------