$\n",
23 | "\n",
24 | "### 我们需要最小化的loss计算如下(添加正则化项)\n",
25 | "$\\sum_{u, i} |y_{pred[u, i]} - y_{true[u, i]}|^2 + \\lambda(|embedding_{user[u]}|^2 + |embedding_{item[i]}|^2)$"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": []
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### 数据处理"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 1,
47 | "metadata": {
48 | "collapsed": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "import numpy as np\n",
53 | "import pandas as pd\n",
54 | "\n",
55 | "\n",
56 | "def read_data_and_process(filname, sep=\"\\t\"):\n",
57 | " col_names = [\"user\", \"item\", \"rate\", \"st\"]\n",
58 | " df = pd.read_csv(filname, sep=sep, header=None, names=col_names, engine='python')\n",
59 | " df[\"user\"] -= 1\n",
60 | " df[\"item\"] -= 1\n",
61 | " for col in (\"user\", \"item\"):\n",
62 | " df[col] = df[col].astype(np.int32)\n",
63 | " df[\"rate\"] = df[\"rate\"].astype(np.float32)\n",
64 | " return df\n",
65 | "\n",
66 | "\n",
67 | "class ShuffleDataIterator(object):\n",
68 | " \"\"\"\n",
69 | " 随机生成一个batch一个batch数据\n",
70 | " \"\"\"\n",
71 | " #初始化\n",
72 | " def __init__(self, inputs, batch_size=10):\n",
73 | " self.inputs = inputs\n",
74 | " self.batch_size = batch_size\n",
75 | " self.num_cols = len(self.inputs)\n",
76 | " self.len = len(self.inputs[0])\n",
77 | " self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))\n",
78 | "\n",
79 | " #总样本量\n",
80 | " def __len__(self):\n",
81 | " return self.len\n",
82 | "\n",
83 | " def __iter__(self):\n",
84 | " return self\n",
85 | "\n",
86 | " #取出下一个batch\n",
87 | " def __next__(self):\n",
88 | " return self.next()\n",
89 | " \n",
90 | " #随机生成batch_size个下标,取出对应的样本\n",
91 | " def next(self):\n",
92 | " ids = np.random.randint(0, self.len, (self.batch_size,))\n",
93 | " out = self.inputs[ids, :]\n",
94 | " return [out[:, i] for i in range(self.num_cols)]\n",
95 | "\n",
96 | "\n",
97 | "class OneEpochDataIterator(ShuffleDataIterator):\n",
98 | " \"\"\"\n",
99 | " 顺序产出一个epoch的数据,在测试中可能会用到\n",
100 | " \"\"\"\n",
101 | " def __init__(self, inputs, batch_size=10):\n",
102 | " super(OneEpochDataIterator, self).__init__(inputs, batch_size=batch_size)\n",
103 | " if batch_size > 0:\n",
104 | " self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))\n",
105 | " else:\n",
106 | " self.idx_group = [np.arange(self.len)]\n",
107 | " self.group_id = 0\n",
108 | "\n",
109 | " def next(self):\n",
110 | " if self.group_id >= len(self.idx_group):\n",
111 | " self.group_id = 0\n",
112 | " raise StopIteration\n",
113 | " out = self.inputs[self.idx_group[self.group_id], :]\n",
114 | " self.group_id += 1\n",
115 | " return [out[:, i] for i in range(self.num_cols)]"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "### 模型搭建\n",
123 | "用tensorflow去搭建一个可增量训练的矩阵分解模型,完成基于矩阵分解的推荐系统"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 2,
129 | "metadata": {
130 | "collapsed": true
131 | },
132 | "outputs": [],
133 | "source": [
134 | "\n",
135 | "import tensorflow as tf\n",
136 | "\n",
137 | "# 使用矩阵分解搭建的网络结构\n",
138 | "def inference_svd(user_batch, item_batch, user_num, item_num, dim=5, device=\"/cpu:0\"):\n",
139 | " #使用CPU\n",
140 | " with tf.device(\"/cpu:0\"):\n",
141 | " # 初始化几个bias项\n",
142 | " global_bias = tf.get_variable(\"global_bias\", shape=[])\n",
143 | " w_bias_user = tf.get_variable(\"embd_bias_user\", shape=[user_num])\n",
144 | " w_bias_item = tf.get_variable(\"embd_bias_item\", shape=[item_num])\n",
145 | " # bias向量\n",
146 | " bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name=\"bias_user\")\n",
147 | " bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name=\"bias_item\")\n",
148 | " w_user = tf.get_variable(\"embd_user\", shape=[user_num, dim],\n",
149 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
150 | " w_item = tf.get_variable(\"embd_item\", shape=[item_num, dim],\n",
151 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
152 | " # user向量与item向量\n",
153 | " embd_user = tf.nn.embedding_lookup(w_user, user_batch, name=\"embedding_user\")\n",
154 | " embd_item = tf.nn.embedding_lookup(w_item, item_batch, name=\"embedding_item\")\n",
155 | " with tf.device(device):\n",
156 | " # 按照实际公式进行计算\n",
157 | " # 先对user向量和item向量求内积\n",
158 | " infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)\n",
159 | " # 加上几个偏置项\n",
160 | " infer = tf.add(infer, global_bias)\n",
161 | " infer = tf.add(infer, bias_user)\n",
162 | " infer = tf.add(infer, bias_item, name=\"svd_inference\")\n",
163 | " # 加上正则化项\n",
164 | " regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name=\"svd_regularizer\")\n",
165 | " return infer, regularizer\n",
166 | "\n",
167 | "# 迭代优化部分\n",
168 | "def optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device=\"/cpu:0\"):\n",
169 | " global_step = tf.train.get_global_step()\n",
170 | " assert global_step is not None\n",
171 | " # 选择合适的optimizer做优化\n",
172 | " with tf.device(device):\n",
173 | " cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))\n",
174 | " penalty = tf.constant(reg, dtype=tf.float32, shape=[], name=\"l2\")\n",
175 | " cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))\n",
176 | " train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step=global_step)\n",
177 | " return cost, train_op"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "### 模型训练"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 3,
190 | "metadata": {
191 | "collapsed": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "import time\n",
196 | "from collections import deque\n",
197 | "\n",
198 | "import numpy as np\n",
199 | "import tensorflow as tf\n",
200 | "from six import next\n",
201 | "from tensorflow.core.framework import summary_pb2\n",
202 | "\n",
203 | "np.random.seed(13575)\n",
204 | "\n",
205 | "# 一批数据的大小\n",
206 | "BATCH_SIZE = 2000\n",
207 | "# 用户数\n",
208 | "USER_NUM = 6040\n",
209 | "# 电影数\n",
210 | "ITEM_NUM = 3952\n",
211 | "# factor维度\n",
212 | "DIM = 15\n",
213 | "# 最大迭代轮数\n",
214 | "EPOCH_MAX = 200\n",
215 | "# 使用cpu做训练\n",
216 | "DEVICE = \"/cpu:0\"\n",
217 | "\n",
218 | "# 截断\n",
219 | "def clip(x):\n",
220 | " return np.clip(x, 1.0, 5.0)\n",
221 | "\n",
222 | "# 这个是方便Tensorboard可视化做的summary\n",
223 | "def make_scalar_summary(name, val):\n",
224 | " return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])\n",
225 | "\n",
226 | "# 调用上面的函数获取数据\n",
227 | "def get_data():\n",
228 | " df = read_data_and_process(\"./movielens/ml-1m/ratings.dat\", sep=\"::\")\n",
229 | " rows = len(df)\n",
230 | " df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)\n",
231 | " split_index = int(rows * 0.9)\n",
232 | " df_train = df[0:split_index]\n",
233 | " df_test = df[split_index:].reset_index(drop=True)\n",
234 | " print(df_train.shape, df_test.shape)\n",
235 | " return df_train, df_test\n",
236 | "\n",
237 | "# 实际训练过程\n",
238 | "def svd(train, test):\n",
239 | " samples_per_batch = len(train) // BATCH_SIZE\n",
240 | "\n",
241 | " # 一批一批数据用于训练\n",
242 | " iter_train = ShuffleDataIterator([train[\"user\"],\n",
243 | " train[\"item\"],\n",
244 | " train[\"rate\"]],\n",
245 | " batch_size=BATCH_SIZE)\n",
246 | " # 测试数据\n",
247 | " iter_test = OneEpochDataIterator([test[\"user\"],\n",
248 | " test[\"item\"],\n",
249 | " test[\"rate\"]],\n",
250 | " batch_size=-1)\n",
251 | " # user和item batch\n",
252 | " user_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_user\")\n",
253 | " item_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_item\")\n",
254 | " rate_batch = tf.placeholder(tf.float32, shape=[None])\n",
255 | "\n",
256 | " # 构建graph和训练\n",
257 | " infer, regularizer = inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,\n",
258 | " device=DEVICE)\n",
259 | " global_step = tf.contrib.framework.get_or_create_global_step()\n",
260 | " _, train_op = optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)\n",
261 | "\n",
262 | " # 初始化所有变量\n",
263 | " init_op = tf.global_variables_initializer()\n",
264 | " # 开始迭代\n",
265 | " with tf.Session() as sess:\n",
266 | " sess.run(init_op)\n",
267 | " summary_writer = tf.summary.FileWriter(logdir=\"/tmp/svd/log\", graph=sess.graph)\n",
268 | " print(\"{} {} {} {}\".format(\"epoch\", \"train_error\", \"val_error\", \"elapsed_time\"))\n",
269 | " errors = deque(maxlen=samples_per_batch)\n",
270 | " start = time.time()\n",
271 | " for i in range(EPOCH_MAX * samples_per_batch):\n",
272 | " users, items, rates = next(iter_train)\n",
273 | " _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,\n",
274 | " item_batch: items,\n",
275 | " rate_batch: rates})\n",
276 | " pred_batch = clip(pred_batch)\n",
277 | " errors.append(np.power(pred_batch - rates, 2))\n",
278 | " if i % samples_per_batch == 0:\n",
279 | " train_err = np.sqrt(np.mean(errors))\n",
280 | " test_err2 = np.array([])\n",
281 | " for users, items, rates in iter_test:\n",
282 | " pred_batch = sess.run(infer, feed_dict={user_batch: users,\n",
283 | " item_batch: items})\n",
284 | " pred_batch = clip(pred_batch)\n",
285 | " test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n",
286 | " end = time.time()\n",
287 | " test_err = np.sqrt(np.mean(test_err2))\n",
288 | " print(\"{:3d} {:f} {:f} {:f}(s)\".format(i // samples_per_batch, train_err, test_err,\n",
289 | " end - start))\n",
290 | " train_err_summary = make_scalar_summary(\"training_error\", train_err)\n",
291 | " test_err_summary = make_scalar_summary(\"test_error\", test_err)\n",
292 | " summary_writer.add_summary(train_err_summary, i)\n",
293 | " summary_writer.add_summary(test_err_summary, i)\n",
294 | " start = end"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 4,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "(900188, 4) (100021, 4)\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "# 获取数据\n",
312 | "df_train, df_test = get_data()"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 5,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "name": "stderr",
322 | "output_type": "stream",
323 | "text": [
324 | "D:\\Anaconda\\install\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
325 | " from ._conv import register_converters as _register_converters\n"
326 | ]
327 | },
328 | {
329 | "name": "stdout",
330 | "output_type": "stream",
331 | "text": [
332 | "epoch train_error val_error elapsed_time\n",
333 | " 0 2.576278 2.577729 0.477677(s)\n",
334 | " 1 1.978902 1.152332 1.450331(s)\n",
335 | " 2 1.002632 0.949393 1.423475(s)\n",
336 | " 3 0.927719 0.926508 1.450366(s)\n",
337 | " 4 0.914275 0.919153 1.534211(s)\n",
338 | " 5 0.910865 0.915688 1.624905(s)\n",
339 | " 6 0.906089 0.913335 1.514213(s)\n",
340 | " 7 0.904977 0.911318 1.424107(s)\n",
341 | " 8 0.901721 0.908855 1.496397(s)\n",
342 | " 9 0.896913 0.906264 1.611612(s)\n",
343 | " 10 0.894468 0.903484 1.795376(s)\n",
344 | " 11 0.891712 0.899968 1.503599(s)\n",
345 | " 12 0.887555 0.895848 1.421528(s)\n",
346 | " 13 0.882009 0.891982 1.420877(s)\n",
347 | " 14 0.876975 0.888060 1.567020(s)\n",
348 | " 15 0.872943 0.884968 1.627547(s)\n",
349 | " 16 0.867226 0.881633 1.486202(s)\n",
350 | " 17 0.864066 0.878666 1.431954(s)\n",
351 | " 18 0.859931 0.875910 1.438102(s)\n",
352 | " 19 0.856037 0.873030 1.433395(s)\n",
353 | " 20 0.849924 0.870667 1.421893(s)\n",
354 | " 21 0.846303 0.868094 1.397581(s)\n",
355 | " 22 0.842261 0.865835 1.398386(s)\n",
356 | " 23 0.836717 0.863661 1.395191(s)\n",
357 | " 24 0.833121 0.861465 1.390639(s)\n",
358 | " 25 0.829651 0.859585 1.461942(s)\n",
359 | " 26 0.824811 0.857843 1.403177(s)\n",
360 | " 27 0.820917 0.856483 1.398302(s)\n",
361 | " 28 0.816505 0.854711 1.390662(s)\n",
362 | " 29 0.813360 0.853433 1.402261(s)\n",
363 | " 30 0.808135 0.852419 1.468770(s)\n",
364 | " 31 0.805145 0.851025 1.394093(s)\n",
365 | " 32 0.799418 0.849873 1.406268(s)\n",
366 | " 33 0.797527 0.849210 1.390641(s)\n",
367 | " 34 0.794350 0.848693 1.399524(s)\n",
368 | " 35 0.792427 0.848298 1.447494(s)\n",
369 | " 36 0.789376 0.847890 1.403100(s)\n",
370 | " 37 0.786277 0.847480 1.406268(s)\n",
371 | " 38 0.783722 0.847279 1.392901(s)\n",
372 | " 39 0.781859 0.846988 1.433542(s)\n",
373 | " 40 0.779194 0.846766 1.460803(s)\n",
374 | " 41 0.776687 0.846418 1.502838(s)\n",
375 | " 42 0.774345 0.846484 1.477898(s)\n",
376 | " 43 0.773097 0.846666 1.470419(s)\n",
377 | " 44 0.772025 0.846828 1.406287(s)\n",
378 | " 45 0.769199 0.846732 1.445719(s)\n",
379 | " 46 0.768910 0.846695 1.390641(s)\n",
380 | " 47 0.766496 0.846699 1.395308(s)\n",
381 | " 48 0.765846 0.846611 1.407348(s)\n",
382 | " 49 0.764256 0.846703 1.406266(s)\n",
383 | " 50 0.762772 0.846718 1.446595(s)\n",
384 | " 51 0.761644 0.847029 1.390661(s)\n",
385 | " 52 0.760738 0.847263 1.413254(s)\n",
386 | " 53 0.759950 0.847614 1.419673(s)\n",
387 | " 54 0.759713 0.847827 1.400326(s)\n",
388 | " 55 0.757802 0.847982 1.421893(s)\n",
389 | " 56 0.757559 0.848026 1.437567(s)\n",
390 | " 57 0.757013 0.848383 1.414295(s)\n",
391 | " 58 0.756566 0.848557 1.390639(s)\n",
392 | " 59 0.756866 0.848483 1.413066(s)\n",
393 | " 60 0.753830 0.848556 1.406267(s)\n",
394 | " 61 0.754405 0.848785 1.447202(s)\n",
395 | " 62 0.754867 0.848690 1.390621(s)\n",
396 | " 63 0.753079 0.848909 1.416439(s)\n",
397 | " 64 0.753559 0.848946 1.383843(s)\n",
398 | " 65 0.753118 0.849353 1.399104(s)\n",
399 | " 66 0.751364 0.849349 1.481309(s)\n",
400 | " 67 0.752177 0.849697 1.449126(s)\n",
401 | " 68 0.751095 0.849683 1.468768(s)\n",
402 | " 69 0.751063 0.849502 1.383999(s)\n",
403 | " 70 0.750350 0.849622 1.406266(s)\n",
404 | " 71 0.751395 0.849533 1.446464(s)\n",
405 | " 72 0.750082 0.849392 1.400997(s)\n",
406 | " 73 0.750379 0.849434 1.388548(s)\n",
407 | " 74 0.749501 0.849552 1.407498(s)\n",
408 | " 75 0.750194 0.849896 1.461215(s)\n",
409 | " 76 0.750201 0.849961 1.446918(s)\n",
410 | " 77 0.749083 0.850167 1.404643(s)\n",
411 | " 78 0.750445 0.850135 1.404541(s)\n",
412 | " 79 0.749501 0.849938 1.453143(s)\n",
413 | " 80 0.747849 0.850081 1.394538(s)\n",
414 | " 81 0.747658 0.850377 1.500019(s)\n",
415 | " 82 0.747445 0.850573 1.417488(s)\n",
416 | " 83 0.748725 0.850522 1.484394(s)\n",
417 | " 84 0.748016 0.850637 1.407718(s)\n",
418 | " 85 0.746435 0.850938 1.380105(s)\n",
419 | " 86 0.747316 0.850969 1.448309(s)\n",
420 | " 87 0.746777 0.850801 1.406286(s)\n",
421 | " 88 0.746731 0.850807 1.400060(s)\n",
422 | " 89 0.747924 0.850830 1.385019(s)\n",
423 | " 90 0.746106 0.850674 1.400585(s)\n",
424 | " 91 0.746864 0.850689 1.419417(s)\n",
425 | " 92 0.746962 0.850772 1.461914(s)\n",
426 | " 93 0.746395 0.850632 1.400366(s)\n",
427 | " 94 0.746491 0.850653 1.390425(s)\n",
428 | " 95 0.746701 0.850703 1.469709(s)\n",
429 | " 96 0.745090 0.850457 1.413338(s)\n",
430 | " 97 0.745649 0.850722 1.436355(s)\n",
431 | " 98 0.745338 0.850862 1.437517(s)\n",
432 | " 99 0.745499 0.850813 1.481134(s)\n",
433 | "100 0.745503 0.850798 1.374998(s)\n",
434 | "101 0.745268 0.850891 1.413138(s)\n",
435 | "102 0.745327 0.850786 1.476407(s)\n",
436 | "103 0.746660 0.850860 1.468770(s)\n",
437 | "104 0.745549 0.851016 1.390663(s)\n",
438 | "105 0.744760 0.850981 1.399773(s)\n",
439 | "106 0.745388 0.850703 1.395961(s)\n",
440 | "107 0.745142 0.850666 1.462982(s)\n",
441 | "108 0.746368 0.850706 1.390665(s)\n",
442 | "109 0.744704 0.850997 1.406251(s)\n",
443 | "110 0.745588 0.850987 1.399718(s)\n",
444 | "111 0.743731 0.851158 1.598807(s)\n",
445 | "112 0.744651 0.851077 1.611395(s)\n",
446 | "113 0.744472 0.850991 1.437518(s)\n",
447 | "114 0.744883 0.851003 1.408719(s)\n",
448 | "115 0.744321 0.850906 1.415562(s)\n",
449 | "116 0.744179 0.851158 1.406284(s)\n",
450 | "117 0.744853 0.851024 1.486450(s)\n",
451 | "118 0.743401 0.850973 1.420012(s)\n",
452 | "119 0.744809 0.851009 1.399587(s)\n",
453 | "120 0.744726 0.851097 1.390638(s)\n",
454 | "121 0.743952 0.850803 1.446391(s)\n",
455 | "122 0.744973 0.850798 1.439853(s)\n",
456 | "123 0.744382 0.850887 1.431332(s)\n",
457 | "124 0.744419 0.850841 1.460428(s)\n",
458 | "125 0.743825 0.851252 1.468669(s)\n",
459 | "126 0.744768 0.850956 1.406268(s)\n",
460 | "127 0.743264 0.850907 1.462467(s)\n",
461 | "128 0.743480 0.850931 1.390640(s)\n",
462 | "129 0.743621 0.851018 1.391578(s)\n",
463 | "130 0.744046 0.850966 1.407784(s)\n",
464 | "131 0.743349 0.850969 1.390645(s)\n",
465 | "132 0.743841 0.850785 1.462579(s)\n",
466 | "133 0.743074 0.850948 1.458358(s)\n",
467 | "134 0.744358 0.850806 1.407487(s)\n",
468 | "135 0.743972 0.851127 1.390640(s)\n",
469 | "136 0.743227 0.851148 1.406265(s)\n",
470 | "137 0.742984 0.851232 1.447071(s)\n",
471 | "138 0.744403 0.851532 1.411106(s)\n",
472 | "139 0.743451 0.851401 1.469621(s)\n",
473 | "140 0.743391 0.851384 1.390645(s)\n",
474 | "141 0.744516 0.851492 1.406285(s)\n",
475 | "142 0.743470 0.851447 1.410015(s)\n",
476 | "143 0.743198 0.851322 1.420781(s)\n",
477 | "144 0.744412 0.851270 1.403787(s)\n",
478 | "145 0.742384 0.851284 1.390643(s)\n",
479 | "146 0.743339 0.851364 1.399891(s)\n",
480 | "147 0.742802 0.851247 1.395700(s)\n",
481 | "148 0.742878 0.851421 1.469909(s)\n",
482 | "149 0.743484 0.851321 1.399727(s)\n",
483 | "150 0.743502 0.851572 1.399970(s)\n",
484 | "151 0.743406 0.851571 1.399521(s)\n",
485 | "152 0.742925 0.851396 1.395897(s)\n",
486 | "153 0.742553 0.851295 1.451918(s)\n",
487 | "154 0.743613 0.851278 1.406268(s)\n",
488 | "155 0.741762 0.851363 1.525129(s)\n",
489 | "156 0.743210 0.851457 1.406286(s)\n",
490 | "157 0.743032 0.851381 1.395900(s)\n",
491 | "158 0.741658 0.851501 1.455009(s)\n",
492 | "159 0.743116 0.851250 1.406258(s)\n",
493 | "160 0.743059 0.851320 1.399649(s)\n",
494 | "161 0.743155 0.851130 1.406287(s)\n",
495 | "162 0.741716 0.851186 1.395491(s)\n",
496 | "163 0.742589 0.851172 1.439167(s)\n",
497 | "164 0.742117 0.850974 1.468770(s)\n",
498 | "165 0.742390 0.851116 1.409128(s)\n",
499 | "166 0.744096 0.851254 1.435538(s)\n",
500 | "167 0.742634 0.851376 1.388916(s)\n",
501 | "168 0.741646 0.851275 1.457746(s)\n",
502 | "169 0.742897 0.851177 1.366868(s)\n",
503 | "170 0.743052 0.851266 1.426625(s)\n",
504 | "171 0.742376 0.851271 1.403697(s)\n",
505 | "172 0.742742 0.851338 1.390625(s)\n",
506 | "173 0.742256 0.851197 1.453141(s)\n",
507 | "174 0.742268 0.851046 1.399479(s)\n",
508 | "175 0.742002 0.850905 1.383932(s)\n",
509 | "176 0.741890 0.851078 1.421894(s)\n",
510 | "177 0.743085 0.851033 1.406270(s)\n",
511 | "178 0.741955 0.850917 1.416128(s)\n",
512 | "179 0.742171 0.851150 1.442452(s)\n",
513 | "180 0.742805 0.851376 1.423072(s)\n",
514 | "181 0.741585 0.851463 1.437518(s)\n",
515 | "182 0.742271 0.851443 1.437522(s)\n",
516 | "183 0.743029 0.851565 1.461741(s)\n",
517 | "184 0.742284 0.851456 1.504725(s)\n",
518 | "185 0.741653 0.851479 1.454393(s)\n",
519 | "186 0.743889 0.851576 1.564915(s)\n",
520 | "187 0.742872 0.851446 1.650403(s)\n",
521 | "188 0.741979 0.851394 1.431463(s)\n",
522 | "189 0.742107 0.851101 1.421539(s)\n",
523 | "190 0.742485 0.851297 1.406282(s)\n",
524 | "191 0.740788 0.851228 1.415401(s)\n",
525 | "192 0.742113 0.851329 1.414911(s)\n",
526 | "193 0.741579 0.851133 1.462146(s)\n",
527 | "194 0.742999 0.851144 1.455878(s)\n",
528 | "195 0.742513 0.851250 1.415932(s)\n",
529 | "196 0.743028 0.851395 1.390641(s)\n",
530 | "197 0.742302 0.851131 1.579862(s)\n",
531 | "198 0.741136 0.851173 1.607915(s)\n",
532 | "199 0.741375 0.851128 1.723560(s)\n"
533 | ]
534 | }
535 | ],
536 | "source": [
537 | "# 完成实际的训练\n",
538 | "svd(df_train, df_test)"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": null,
544 | "metadata": {
545 | "collapsed": true
546 | },
547 | "outputs": [],
548 | "source": []
549 | }
550 | ],
551 | "metadata": {
552 | "kernelspec": {
553 | "display_name": "Python 3",
554 | "language": "python",
555 | "name": "python3"
556 | },
557 | "language_info": {
558 | "codemirror_mode": {
559 | "name": "ipython",
560 | "version": 3
561 | },
562 | "file_extension": ".py",
563 | "mimetype": "text/x-python",
564 | "name": "python",
565 | "nbconvert_exporter": "python",
566 | "pygments_lexer": "ipython3",
567 | "version": "3.5.2"
568 | }
569 | },
570 | "nbformat": 4,
571 | "nbformat_minor": 2
572 | }
573 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Music Recommendation System
2 | #### Language: Python 3.5
3 | #### Library: Surprise
4 | #### Platform: Jupyter Notebook
5 | #### Description: This recommender, similar to NetEase Cloud Music, recommends playlists as well as similar songs.
6 | ### 1. Data acquisition
7 | Playlists covering 800k songs and 4M+ subscriptions were crawled from NetEase Cloud Music and stored as JSON, 3.59 GB in total; the format is described below:
8 |
9 | 1) Format of each playlist
10 | {
11 | "result": {
12 | "id": 111450065,
13 | "status": 0,
14 | "commentThreadId": "A_PL_0_111450065",
15 | "trackCount": 120,
16 | "updateTime": 1460164523907,
17 | "commentCount": 227,
18 | "ordered": true,
19 | "anonimous": false,
20 | "highQuality": false,
21 | "subscribers": [],
22 | "playCount": 687070,
23 | "trackNumberUpdateTime": 1460164523907,
24 | "createTime": 1443528317662,
25 | "name": "带本书去旅行吧,人生最美好的时光在路上。",
26 | "cloudTrackCount": 0,
27 | "shareCount": 149,
28 | "adType": 0,
29 | "trackUpdateTime": 1494134249465,
30 | "userId": 39256799,
31 | "coverImgId": 3359008023885470,
32 | "coverImgUrl": "http://p1.music.126.net/2ZFcuSJ6STR8WgzkIi2U-Q==/3359008023885470.jpg",
33 | "artists": null,
34 | "newImported": false,
35 | "subscribed": false,
36 | "privacy": 0,
37 | "specialType": 0,
38 | "description": "现在是一年中最美好的时节,世界上很多地方都不冷不热,有湛蓝的天空和清冽的空气,正是出游的好时光。长假将至,你是不是已经收拾行装准备出发了?行前焦虑症中把衣服、洗漱用品、充电器之类东西忙忙碌碌地丢进箱子,打进背包的时候,我打赌你肯定会留个位置给一位好朋友:书。不是吗?不管是打发时间,小读怡情,还是为了做好攻略备不时之需,亦或是为了小小地装上一把,你都得有一本书傍身呀。读大仲马,我是复仇的伯爵;读柯南道尔,我穿梭在雾都的暗夜;读村上春树,我是寻羊的冒险者;读马尔克斯,目睹百年家族兴衰;读三毛,让灵魂在撒哈拉流浪;读老舍,嗅着老北京的气息;读海茵莱茵,于科幻狂流遨游;读卡夫卡,在城堡中审判……读书的孩子不会孤单,读书的孩子永远幸福。",
39 | "subscribedCount": 10882,
40 | "totalDuration": 0,
41 | "tags": [
42 | "旅行",
43 | "钢琴",
44 | "安静"]
45 | "creator": {
46 | "followed": false,
47 | "remarkName": null,
48 | "expertTags": [
49 | "古典",
50 | "民谣",
51 | "华语"
52 | ],
53 | "userId": 39256799,
54 | "authority": 0,
55 | "userType": 0,
56 | "gender": 1,
57 | "backgroundImgId": 3427177752524551,
58 | "city": 360600,
59 | "mutual": false,
60 | "avatarUrl": "http://p1.music.126.net/TLRTrJpOM5lr68qJv1IyGQ==/1400777825738419.jpg",
61 | "avatarImgIdStr": "1400777825738419",
62 | "detailDescription": "",
63 | "province": 360000,
64 | "description": "",
65 | "birthday": 637516800000,
66 | "nickname": "有梦人生不觉寒",
67 | "vipType": 0,
68 | "avatarImgId": 1400777825738419,
69 | "defaultAvatar": false,
70 | "djStatus": 0,
71 | "accountStatus": 0,
72 | "backgroundImgIdStr": "3427177752524551",
73 | "backgroundUrl": "http://p1.music.126.net/LS96S_6VP9Hm7-T447-X0g==/3427177752524551.jpg",
74 | "signature": "漫无目的的乱听,听着,听着,竟然灵魂出窍了。更多精品音乐美图分享请加我微信hu272367751。微信是我的精神家园,有我最真诚的分享。",
75 | "authStatus": 0}
76 | "tracks": [{歌曲1},{歌曲2}, ...]
77 | }
78 | }
79 | 2) Format of each song:
80 | {
81 | "id": 29738501,
82 | "name": "跟着你到天边 钢琴版",
83 | "duration": 174001,
84 | "hearTime": 0,
85 | "commentThreadId": "R_SO_4_29738501",
86 | "score": 40,
87 | "mvid": 0,
88 | "hMusic": null,
89 | "disc": "",
90 | "fee": 0,
91 | "no": 1,
92 | "rtUrl": null,
93 | "ringtone": null,
94 | "rtUrls": [],
95 | "rurl": null,
96 | "status": 0,
97 | "ftype": 0,
98 | "mp3Url": "http://m2.music.126.net/vrVa20wHs8iIe0G8Oe7I9Q==/3222668581877701.mp3",
99 | "audition": null,
100 | "playedNum": 0,
101 | "copyrightId": 0,
102 | "rtype": 0,
103 | "crbt": null,
104 | "popularity": 40,
105 | "dayPlays": 0,
106 | "alias": [],
107 | "copyFrom": "",
108 | "position": 1,
109 | "starred": false,,
110 | "starredNum": 0
111 | "bMusic": {
112 | "name": "跟着你到天边 钢琴版",
113 | "extension": "mp3",
114 | "volumeDelta": 0.0553125,
115 | "sr": 44100,
116 | "dfsId": 3222668581877701,
117 | "playTime": 174001,
118 | "bitrate": 96000,
119 | "id": 52423394,
120 | "size": 2089713
121 | },
122 | "lMusic": {
123 | "name": "跟着你到天边 钢琴版",
124 | "extension": "mp3",
125 | "volumeDelta": 0.0553125,
126 | "sr": 44100,
127 | "dfsId": 3222668581877701,
128 | "playTime": 174001,
129 | "bitrate": 96000,
130 | "id": 52423394,
131 | "size": 2089713
132 | },
133 | "mMusic": {
134 | "name": "跟着你到天边 钢琴版",
135 | "extension": "mp3",
136 | "volumeDelta": -0.000265076,
137 | "sr": 44100,
138 | "dfsId": 3222668581877702,
139 | "playTime": 174001,
140 | "bitrate": 128000,
141 | "id": 52423395,
142 | "size": 2785510
143 | },
144 | "artists": [
145 | {
146 | "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
147 | "name": "群星",
148 | "briefDesc": "",
149 | "albumSize": 0,
150 | "img1v1Id": 0,
151 | "musicSize": 0,
152 | "alias": [],
153 | "picId": 0,
154 | "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
155 | "trans": "",
156 | "id": 122455
157 | }
158 | ],
159 | "album": {
160 | "id": 3054006,
161 | "status": 2,
162 | "type": null,
163 | "tags": "",
164 | "size": 69,
165 | "blurPicUrl": "http://p1.music.126.net/2XLMVZhzVZCOunaRCOQ7Bg==/3274345629219531.jpg",
166 | "copyrightId": 0,
167 | "name": "热门华语248",
168 | "companyId": 0,
169 | "songs": [],
170 | "description": "",
171 | "pic": 3274345629219531,
172 | "commentThreadId": "R_AL_3_3054006",
173 | "publishTime": 1388505600004,
174 | "briefDesc": "",
175 | "company": "",
176 | "picId": 3274345629219531,
177 | "alias": [],
178 | "picUrl": "http://p1.music.126.net/2XLMVZhzVZCOunaRCOQ7Bg==/3274345629219531.jpg",
179 | "artists": [
180 | {
181 | "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
182 | "name": "群星",
183 | "briefDesc": "",
184 | "albumSize": 0,
185 | "img1v1Id": 0,
186 | "musicSize": 0,
187 | "alias": [],
188 | "picId": 0,
189 | "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
190 | "trans": "",
191 | "id": 122455
192 | }
193 | ],
194 | "artist": {
195 | "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
196 | "name": "",
197 | "briefDesc": "",
198 | "albumSize": 0,
199 | "img1v1Id": 0,
200 | "musicSize": 0,
201 | "alias": [],
202 | "picId": 0,
203 | "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
204 | "trans": "",
205 | "id": 0
206 | }
207 | }
208 | }
209 |
210 |
211 | ### 2. Data parsing
212 | #### 2.1 Raw data => playlist data
213 | Extract 4 playlist-level fields: playlist name, playlist id, subscription count, and category
214 | Extract 4 song-level fields: song id, song name, artist, and song popularity
215 | 
216 | Organized into the following format:
217 |
218 | 漫步西欧小镇上##小语种,旅行##69413685##474 18682332::Wäg vo dir::Joy Amelie::70.0 4335372::Only When I Sleep::The Corrs::60.0 2925502::Si Seulement::Lynnsha::100.0 21014930::Tu N'As Pas Cherché...::La Grande Sophie::100.0 20932638::Du behöver aldrig mer vara rädd::Lasse Lindh::25.0 17100518::Silent Machine::Cat Power::60.0 3308096::Kor pai kon diew : ชอไปคนเดียว::Palmy::5.0 1648250::les choristes::Petits Chanteurs De Saint Marc::100.0 4376212::Paddy's Green Shamrock Shore::The High Kings::25.0 2925400::A Todo Color::Las Escarlatinas::95.0 19711402::Comme Toi::Vox Angeli::75.0 3977526::Stay::Blue Cafe::100.0 2538518::Shake::Elize::85.0 2866799::Mon Ange::Jena Lee::85.0 5191949::Je M'appelle Helene::Hélène Rolles::85.0 20036323::Ich Lieb' Dich Immer Noch So Sehr::Kate & Ben::100.0
219 |
220 | #### 2.2 Playlist data => recommender-system format
221 | Mainstream Python recommender frameworks take the MovieLens dataset as their most basic input; its rating format is user item rating timestamp, so we convert the playlist data into that format, as sketched below.
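  | 
  | A minimal sketch of this conversion, assuming the playlist text format shown in 2.1 (`##`-separated playlist fields, tab-separated songs, `::`-separated song fields); the output path, the constant rating of 1.0, and the dummy timestamp are illustrative choices:
  | 
  | 	def playlist_to_movielens_format(in_file, out_file):
  | 	    with open(in_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
  | 	        for line in fin:
  | 	            contents = line.strip().split("\t")
  | 	            try:
  | 	                # playlist fields: name##tags##playlist_id##subscribed_count
  | 	                name, tags, playlist_id, subscribed_count = contents[0].split("##")
  | 	            except ValueError:
  | 	                continue
  | 	            for song in contents[1:]:
  | 	                try:
  | 	                    song_id, song_name, artist, popularity = song.split("::")
  | 	                except ValueError:
  | 	                    continue
  | 	                # user item rating timestamp, with the playlist as "user" and the song as "item"
  | 	                fout.write(",".join([playlist_id, song_id, "1.0", "1300000"]) + "\n")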
222 | #### 2.3 Save playlist and song metadata for later use
223 | Save the playlist id => playlist name and song id => song name mappings, as in the pickle sketch below.
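  | 
  | A sketch of saving both mappings with pickle (file paths are illustrative):
  | 
  | 	import pickle
  | 
  | 	def save_name_mappings(in_file, out_playlist, out_song):
  | 	    playlist_dic, song_dic = {}, {}
  | 	    for line in open(in_file, encoding="utf-8"):
  | 	        contents = line.strip().split("\t")
  | 	        name, tags, playlist_id, subscribed_count = contents[0].split("##")
  | 	        playlist_dic[playlist_id] = name
  | 	        for song in contents[1:]:
  | 	            try:
  | 	                song_id, song_name, artist, popularity = song.split("::")
  | 	                song_dic[song_id] = song_name + "\t" + artist
  | 	            except ValueError:
  | 	                continue
  | 	    pickle.dump(playlist_dic, open(out_playlist, "wb"))
  | 	    pickle.dump(song_dic, open(out_song, "wb"))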
224 |
225 | ### 3. Completing the project with the Python recommendation library Surprise
226 | #### 3.1 Build a collaborative filtering model and make predictions
227 | ##### 3.1.1 Recommending playlists
228 | 
229 | ##### 3.1.2 Recommending songs
230 | 
231 | Of course, other algorithms can be used as well (see the sketch after this list), e.g.:
232 |
233 | Baseline algorithms
234 | Neighborhood methods (collaborative filtering)
235 | Matrix-factorization-based methods (SVD, PMF, SVD++, NMF)
236 |
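  | A minimal Surprise sketch of the collaborative-filtering step (the data path is illustrative; the file is the one produced in 2.2):
  | 
  | 	from surprise import KNNBaseline, Reader, Dataset
  | 
  | 	# load the user item rating timestamp data from step 2.2
  | 	reader = Reader(line_format="user item rating timestamp", sep=",")
  | 	music_data = Dataset.load_from_file("./pro_data/playlist_song_rating.txt", reader=reader)
  | 	trainset = music_data.build_full_trainset()
  | 
  | 	# item-based CF; get_neighbors then returns the most similar songs
  | 	algo = KNNBaseline(sim_options={"user_based": False})
  | 	algo.fit(trainset)
  | 	inner_id = trainset.to_inner_iid("18682332")  # a raw song id from the example above
  | 	print([trainset.to_raw_iid(i) for i in algo.get_neighbors(inner_id, k=10)])
  | 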
237 |
238 | ### 4. Evaluating different recommendation algorithms
239 | Different evaluation criteria can be used (see the cross-validation sketch below), e.g.:
240 | 
241 | rmse Compute RMSE (Root Mean Squared Error).
242 | mae Compute MAE (Mean Absolute Error).
243 | fcp Compute FCP (Fraction of Concordant Pairs).
244 |
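  | A cross-validation sketch with Surprise (requires surprise >= 1.0.5 for model_selection; music_data is the dataset loaded in section 3):
  | 
  | 	from surprise import SVD
  | 	from surprise.model_selection import cross_validate
  | 
  | 	algo = SVD()
  | 	cross_validate(algo, music_data, measures=["RMSE", "MAE", "FCP"], cv=3, verbose=True)
  | 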
245 |
--------------------------------------------------------------------------------
/Sequence Modelling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 歌曲序列建模\n",
8 | "### 从word2vec到song2vec\n",
9 | "把歌曲的id序列取出来,类比于分完词后的句子,送到word2vec中去学习"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 8,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "#coding: utf-8\n",
19 | "import multiprocessing\n",
20 | "import gensim\n",
21 | "import sys\n",
22 | "from random import shuffle\n",
23 | "\n",
24 | "def parse_playlist_get_sequence(in_line, playlist_sequence):\n",
25 | " song_sequence = []\n",
26 | " contents = in_line.strip().split(\"\\t\")\n",
27 | " # 解析歌单序列\n",
28 | " for song in contents[1:]:\n",
29 | " try:\n",
30 | " song_id, song_name, artist, popularity = song.split(\"::\")\n",
31 | " song_sequence.append(song_id)\n",
32 | " except:\n",
33 | " print (\"song format error\")\n",
34 | " print (song+\"\\n\")\n",
35 | " for i in range(len(song_sequence)):\n",
36 | " shuffle(song_sequence)\n",
37 | " playlist_sequence.append(song_sequence)\n",
38 | "\n",
39 | "\n",
40 | "def train_song2vec(in_file, out_file):\n",
41 | " #所有歌单序列\n",
42 | " playlist_sequence = []\n",
43 | " #遍历所有歌单\n",
44 | " for line in open(in_file, encoding='utf-8'):\n",
45 | " parse_playlist_get_sequence(line, playlist_sequence)\n",
46 | " #使用word2vec训练\n",
47 | " cores = multiprocessing.cpu_count()\n",
48 | " print (\"using all \"+str(cores)+\" cores\")\n",
49 | " print (\"Training word2vec model...\")\n",
50 | " model = gensim.models.Word2Vec(sentences=playlist_sequence, size=150, min_count=3, window=7, workers=cores)\n",
51 | " print (\"Saving model...\")\n",
52 | " model.save(out_file)"
53 | ]
54 | },
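  | {
  | "cell_type": "markdown",
  | "metadata": {},
  | "source": [
  | "Note: the `size` / `window` / `min_count` arguments above follow the gensim 3.x API; gensim >= 4.0 renames `size` to `vector_size` and `iter` to `epochs`."
  | ]
  | },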
55 | {
56 | "cell_type": "code",
57 | "execution_count": 9,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "song format error\n",
65 | "1870957::彩云国物语 セカンドシリーズ::君を想う::梁邦彦::80.0\n",
66 | "\n",
67 | "song format error\n",
68 | "4965888::桃华月惮::龙皇-リュウオウ-::多田彰文::25.0\n",
69 | "\n",
70 | "song format error\n",
71 | "456177::true tears::一阵の风::菊地創::95.0\n",
72 | "\n",
73 | "song format error\n",
74 | "22642373::\n",
75 | "\n",
76 | "song format error\n",
77 | " FAIRY TAIL メインテーマ -Slow ver.-::高梨康治::95.0\n",
78 | "\n",
79 | "song format error\n",
80 | "31563610::\n",
81 | "\n",
82 | "song format error\n",
83 | "苍之礼赞::花之祭P::60.0\n",
84 | "\n",
85 | "song format error\n",
86 | "4954593::リズム天国全曲集::恋の実験室::V.A.::55.0\n",
87 | "\n",
88 | "song format error\n",
89 | "4954596::リズム天国全曲集::シンクロ::V.A.::60.0\n",
90 | "\n",
91 | "song format error\n",
92 | "31654811::\n",
93 | "\n",
94 | "song format error\n",
95 | "American Cowboys::Tim Wynn::65.0\n",
96 | "\n",
97 | "song format error\n",
98 | "19169096::\n",
99 | "\n",
100 | "song format error\n",
101 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n",
102 | "\n",
103 | "song format error\n",
104 | "31563610::\n",
105 | "\n",
106 | "song format error\n",
107 | "苍之礼赞::花之祭P::60.0\n",
108 | "\n",
109 | "song format error\n",
110 | "31563610::\n",
111 | "\n",
112 | "song format error\n",
113 | "苍之礼赞::花之祭P::60.0\n",
114 | "\n",
115 | "song format error\n",
116 | "31563610::\n",
117 | "\n",
118 | "song format error\n",
119 | "苍之礼赞::花之祭P::60.0\n",
120 | "\n",
121 | "song format error\n",
122 | "19169096::\n",
123 | "\n",
124 | "song format error\n",
125 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n",
126 | "\n",
127 | "song format error\n",
128 | "376653::野弧禅狂叱(宿香之战)\n",
129 | "\n",
130 | "song format error\n",
131 | "::霹雳英雄::5.0\n",
132 | "\n",
133 | "song format error\n",
134 | "374524::赎?罪\n",
135 | "\n",
136 | "song format error\n",
137 | "赎罪岩::霹雳英雄::15.0\n",
138 | "\n",
139 | "song format error\n",
140 | "31563610::\n",
141 | "\n",
142 | "song format error\n",
143 | "苍之礼赞::花之祭P::65.0\n",
144 | "\n",
145 | "song format error\n",
146 | "37610597::ダウンタウン熱血物語::公園/河原にて~ひとときのやすらぎ~::V.A.::75.0\n",
147 | "\n",
148 | "song format error\n",
149 | "37610748::くにおくんの熱血サッカーリーグ::ねっけつ たいふーん♪::V.A.::80.0\n",
150 | "\n",
151 | "song format error\n",
152 | "37610755::くにおくんの熱血サッカーリーグ::てくのす じゃぱん かっぷの てーま♪::V.A.::75.0\n",
153 | "\n",
154 | "song format error\n",
155 | "37610745::くにおくんの熱血サッカーリーグ::ゲームモード選択::V.A.::75.0\n",
156 | "\n",
157 | "song format error\n",
158 | "37610643::ダウンタウン熱血行進曲 それゆけ大運動会::オープニングファンファーレ::V.A.::70.0\n",
159 | "\n",
160 | "song format error\n",
161 | "33054290::\n",
162 | "\n",
163 | "song format error\n",
164 | "Heartbeats::Dabin::90.0\n",
165 | "\n",
166 | "song format error\n",
167 | "405599088::Make Them Wheels Roll\n",
168 | "\n",
169 | "song format error\n",
170 | "::SAFIA::100.0\n",
171 | "\n",
172 | "song format error\n",
173 | "424496188::大王叫我来巡山 - (原唱:\n",
174 | "\n",
175 | "song format error\n",
176 | " 贾乃亮/贾云馨)::流浪的蛙蛙::65.0\n",
177 | "\n",
178 | "song format error\n",
179 | "19169096::\n",
180 | "\n",
181 | "song format error\n",
182 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n",
183 | "\n",
184 | "song format error\n",
185 | "26902203::What’s your name? (collaboration with 壇蜜)\n",
186 | "\n",
187 | "song format error\n",
188 | "::SoulJa::100.0\n",
189 | "\n",
190 | "song format error\n",
191 | "33054290::\n",
192 | "\n",
193 | "song format error\n",
194 | "Heartbeats::Dabin::95.0\n",
195 | "\n",
196 | "song format error\n",
197 | "4954596::リズム天国全曲集::シンクロ::V.A.::60.0\n",
198 | "\n",
199 | "song format error\n",
200 | "32272105::\n",
201 | "\n",
202 | "song format error\n",
203 | "Wonderful Love (DJ Raf Remix)::Money Penny::95.0\n",
204 | "\n",
205 | "song format error\n",
206 | "33054290::\n",
207 | "\n",
208 | "song format error\n",
209 | "Heartbeats::Dabin::95.0\n",
210 | "\n",
211 | "song format error\n",
212 | "427373827::Champions (From \"Hands of Stone\") \n",
213 | "\n",
214 | "song format error\n",
215 | "::Usher::30.0\n",
216 | "\n",
217 | "song format error\n",
218 | "29242687::「コード・エテスウェイ (Class::ETHES_WEI=>extends.COMMUNI_SAT/.)」::霜月はるか::70.0\n",
219 | "\n",
220 | "using all 4 cores\n",
221 | "Training word2vec model...\n",
222 | "Saving model...\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "song_sequence_file = \"./ori_data/popular.playlist\"\n",
228 | "model_file = \"./model/song2vec.model\"\n",
229 | "train_song2vec(song_sequence_file, model_file)"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "### 预测的过程,实际上就是对某首歌曲,查找“最近”的歌曲(向量距离最近的歌曲)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 10,
242 | "metadata": {
243 | "collapsed": true
244 | },
245 | "outputs": [],
246 | "source": [
247 | "import pickle\n",
248 | "song_dic = pickle.load(open(\"./pro_data/popular_song.pkl\",\"rb\"))\n",
249 | "model_str = \"./model/song2vec.model\"\n",
250 | "model = gensim.models.Word2Vec.load(model_str)"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 12,
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "name": "stdout",
260 | "output_type": "stream",
261 | "text": [
262 | "315958 那件疯狂的小事叫爱情\t袁泉\n",
263 | "28138980 为你我受冷风吹\t孙露\n",
264 | "247526 彗星的眼泪\t金莎\n",
265 | "5280395 慨古吟(琴歌)\t张铜霞\n",
266 | "31140395 一首简单的歌\t本兮\n",
267 | "27532150 Smoke Fly ft. JBo Escobar & Khaki\tAl Rocco\n",
268 | "440767373 メドゥーサ(美杜莎)\t月蝕原创音乐\n",
269 | "16323636 The Prayer\tAndrea Bocelli\n",
270 | "281436 夜曲\t彭芳\n",
271 | "5270404 渴望(二胡)\t群星\n"
272 | ]
273 | }
274 | ],
275 | "source": [
276 | "for song in list(song_dic.keys())[:10]:\n",
277 | " print (song, song_dic[song])"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 14,
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "name": "stderr",
287 | "output_type": "stream",
288 | "text": [
289 | "D:\\Anaconda\\install\\lib\\site-packages\\ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
290 | " This is separate from the ipykernel package so we can avoid doing imports until\n"
291 | ]
292 | },
293 | {
294 | "name": "stdout",
295 | "output_type": "stream",
296 | "text": [
297 | "368971 Ambulance of love\t脑浊\n",
298 | "\n",
299 | "相似歌曲 和 相似度 分别为:\n",
300 | "\t 新世界\t呼吸 0.8102102279663086\n",
301 | "\t 上苍保佑吃完了饭的人民\t张楚 0.8082322478294373\n",
302 | "\t 呀呀\t图腾 0.7943791151046753\n",
303 | "\t 昨日我从清晨开始等待\t钟立风 0.774426281452179\n",
304 | "\t 生命(Live) - live\t声音玩具 0.7557182312011719\n",
305 | "\t 两天\t许巍 0.7442638874053955\n",
306 | "\t 永远在一起\t飘乐队 0.7283462285995483\n",
307 | "\t 今夜\t许巍 0.7184534072875977\n",
308 | "\t 祖先的阴影\t超载 0.7170863747596741\n",
309 | "\t 我们走过的路\t天空 0.7092562913894653\n",
310 | "\n",
311 | "\n",
312 | "33599059 八秒之语\t洛天依\n",
313 | "\n",
314 | "相似歌曲 和 相似度 分别为:\n",
315 | "\t 乡村DISCO\tVOCALOID 0.6794760227203369\n",
316 | "\t 春雨\t乐正绫 0.649375319480896\n",
317 | "\t 远恋\t阿良良木健 0.6485384702682495\n",
318 | "\t 食之歌 VOCALOID Ver.\t泛音堂 0.6450830698013306\n",
319 | "\t 小幸运(Cover:田馥甄)\t星魂梦 0.6226800680160522\n",
320 | "\t 出格\t阿妍 0.6186865568161011\n",
321 | "\t 双向监禁\t洛天依 0.6172651052474976\n",
322 | "\t 山海默示录(洛天依版)\t小旭PRO 0.6170870661735535\n",
323 | "\t 甄姬\tVOCALOID 0.613639771938324\n",
324 | "\t 全世界都死了\t海鲜面 0.6097506284713745\n",
325 | "\n",
326 | "\n",
327 | "408332846 知足\t苏运莹\n",
328 | "\n",
329 | "相似歌曲 和 相似度 分别为:\n",
330 | "\t 垃圾车(cover 五月天)\t李昂星 0.714142918586731\n",
331 | "\t Happy Birth Day\t香蕉 0.7014893293380737\n",
332 | "\t 拥抱(Cover 五月天)\t橙大蕾蕾 0.6977276802062988\n",
333 | "\t 爱情的模样\t小平 0.6919869780540466\n",
334 | "\t 听不到(Live)\t梁静茹 0.6528257727622986\n",
335 | "\t 我就是这样的\t黄贯中 0.6462737917900085\n",
336 | "\t 拥抱(Cover 五月天)\t燕子姐姐弹吉他 0.6430615186691284\n",
337 | "\t 穿越时空遇见你\t萧亚轩 0.6319111585617065\n",
338 | "\t 神奇\t孙燕姿 0.6182767152786255\n",
339 | "\t 一个人的圣诞节\t张赫宣 0.6101440191268921\n",
340 | "\n",
341 | "\n",
342 | "34072696 酒馆小调\t洛天依\n",
343 | "\n",
344 | "相似歌曲 和 相似度 分别为:\n",
345 | "\t 菌裂\t言和 0.7842902541160583\n",
346 | "\t 妄想不到的恋曲\t烂兔子 0.748408317565918\n",
347 | "\t 女王\t洛天依 0.7209190726280212\n",
348 | "\t 清醒的梦 \tVilokun feat.言和 0.7056742906570435\n",
349 | "\t Mr 坷垃\t言和 0.7054739594459534\n",
350 | "\t 【心华】乌龟家的茶社【ick】\t缺钙体质ick 0.7002834677696228\n",
351 | "\t 偶像进行时\t言和 0.698686420917511\n",
352 | "\t 乐正绫 - 拉斯维加斯\t慕晓社 0.6933234333992004\n",
353 | "\t 流星之愿\t洛天依&言和 0.6909075975418091\n",
354 | "\t 病态的我\t洛天依 0.6879562139511108\n",
355 | "\n",
356 | "\n",
357 | "279713 十六夜的樱丘\t梦璟SAYA\n",
358 | "\n",
359 | "相似歌曲 和 相似度 分别为:\n",
360 | "\t 合金三国-沛县\t灰原穷 0.7301620841026306\n",
361 | "\t 岁月友情演唱会Live\t聂予词 0.7063505053520203\n",
362 | "\t 克罗地亚狂想曲(中文填词版)\t少年霜 0.6886762976646423\n",
363 | "\t 言君安\t倾夜 0.6869547963142395\n",
364 | "\t 凤鸣曲\t音频怪物 0.6675935387611389\n",
365 | "\t 相忘江湖\t玄觞 0.6578972935676575\n",
366 | "\t 伊人\t魏晨 0.6283127665519714\n",
367 | "\t 失落的遗迹:Lost Ruins ~ adventurers' tale~\tkaede 0.6206890344619751\n",
368 | "\t 【Moonlight组合】甜蜜具现式\tKBShinya 0.6184070110321045\n",
369 | "\t 老房子的故事【老歌搬家】\tWinky诗 0.6105965971946716\n",
370 | "\n",
371 | "\n",
372 | "33004911 听妈妈讲那过去的事情\t群星\n",
373 | "\n",
374 | "相似歌曲 和 相似度 分别为:\n",
375 | "\t 国旗多美丽\t群星 0.9948954582214355\n",
376 | "\t 劳动最光荣\t杨烁 0.9924089312553406\n",
377 | "\t 数青蛙\t群星 0.9401867985725403\n",
378 | "\t 儿童歌曲大联唱B\t群星 0.9401167631149292\n",
379 | "\t 母鸭带小鸭\t杨烁 0.937639594078064\n",
380 | "\t 嘀哩,嘀哩\t中央人民广播电台少年儿童合唱团 0.931469738483429\n",
381 | "\t 小小少年\t韩征 0.9063182473182678\n",
382 | "\t 世上只有妈妈好\t杨烁 0.893858790397644\n",
383 | "\t 真善美的小世界\t小蓓蕾组合 0.8826833963394165\n",
384 | "\t 我家住在北京城\t苑菁 0.8813650012016296\n",
385 | "\n",
386 | "\n",
387 | "26389372 水色\tUA\n",
388 | "\n",
389 | "相似歌曲 和 相似度 分别为:\n",
390 | "\t 最后の言い訳\t徳永英明 0.7948691248893738\n",
391 | "\t テルーの呗\t手嶌葵 0.7602044343948364\n",
392 | "\t もう君以外爱せない\tKinKi Kids 0.7559913992881775\n",
393 | "\t 氷点\t玉置浩二 0.7401448488235474\n",
394 | "\t 时の过ぎゆくままに\t沢田研二 0.7194046378135681\n",
395 | "\t 时代\t中島みゆき 0.7185744643211365\n",
396 | "\t 手紙 ~拝啓 十五の君へ~\tアンジェラ・アキ 0.6948644518852234\n",
397 | "\t あした\t中島みゆき 0.685759425163269\n",
398 | "\t MR.LONELY\t玉置浩二 0.6696567535400391\n",
399 | "\t Carcrashes [Album Version]\tStandfast 0.6691217422485352\n",
400 | "\n",
401 | "\n",
402 | "166471 别等离开才说爱我\t王志\n",
403 | "\n",
404 | "相似歌曲 和 相似度 分别为:\n",
405 | "\t 分爱 粤语版\t易欣 0.8755930066108704\n",
406 | "\t 反叛(Illegal Mix) - remix\t陈慧娴 0.873873233795166\n",
407 | "\t 背叛\t芭比 0.8577862977981567\n",
408 | "\t 罗盘上的指针\t群星 0.8541814088821411\n",
409 | "\t 最美丽的花\t王绎龙 0.8295896649360657\n",
410 | "\t 各种小曲各种嗨\t珊爷 0.8236122727394104\n",
411 | "\t 不要推我\t群星 0.7918290495872498\n",
412 | "\t 狂舞大麻\t群星 0.7873603105545044\n",
413 | "\t 真的不容易 (DJ阿圣 Remix)\t庄心妍 0.7646005749702454\n",
414 | "\t 看我72变\tM3 0.7498428225517273\n",
415 | "\n",
416 | "\n",
417 | "29414454 心中喜欢就说爱\t好妹妹乐队\n",
418 | "\n",
419 | "相似歌曲 和 相似度 分别为:\n",
420 | "\t 请你给我多一点点的温柔\t秦昊 0.8446059823036194\n",
421 | "\t 熟悉的拥抱 (Demo)\t好妹妹乐队 0.8162363767623901\n",
422 | "\t 秋诗篇篇 \t秦昊 0.8092836737632751\n",
423 | "\t 风从海面吹过来\t好妹妹乐队 0.792778491973877\n",
424 | "\t 军港之夜 \t秦昊 0.78443443775177\n",
425 | "\t 熟悉的拥抱\t好妹妹乐队 0.7571429014205933\n",
426 | "\t 心曲\t好妹妹乐队 0.7473146319389343\n",
427 | "\t 风又吹走了\t好妹妹乐队 0.7431649565696716\n",
428 | "\t 愿在秋天死去 (Demo)\t好妹妹乐队 0.7383122444152832\n",
429 | "\t 四季歌\t秦昊 0.7374083399772644\n",
430 | "\n",
431 | "\n",
432 | "347983 传奇 Legend\t春秋\n",
433 | "\n",
434 | "相似歌曲 和 相似度 分别为:\n",
435 | "\t 开始\t核聚变-G 0.8861181735992432\n",
436 | "\t 抢回一切\t岩浆乐队 0.85029536485672\n",
437 | "\t No One Can Change My Mind\t利事乐队 0.8492539525032043\n",
438 | "\t 武器\t浊乐队 0.809968888759613\n",
439 | "\t 【填翻】城池\t妖痴 0.786383867263794\n",
440 | "\t 杀手\t战斧 0.7508671879768372\n",
441 | "\t 梦魔\t大红袍 0.7423715591430664\n",
442 | "\t 梦已成\"血\"\t液氧罐头 0.7245020866394043\n",
443 | "\t 大鱼\t三火SAMA 0.7139255404472351\n",
444 | "\t 是什么,让我们留在这里?\t夜叉 0.7099397778511047\n",
445 | "\n",
446 | "\n"
447 | ]
448 | }
449 | ],
450 | "source": [
451 | "song_id_list = list(song_dic.keys())[1000:1500:50]\n",
452 | "for song_id in song_id_list:\n",
453 | " result_song_list = model.most_similar(song_id)\n",
454 | "\n",
455 | " print (song_id, song_dic[song_id])\n",
456 | " print (\"\\n相似歌曲 和 相似度 分别为:\")\n",
457 | " for song in result_song_list:\n",
458 | " print (\"\\t\", song_dic[song[0]], song[1])\n",
459 | " print (\"\\n\")"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {
466 | "collapsed": true
467 | },
468 | "outputs": [],
469 | "source": []
470 | }
471 | ],
472 | "metadata": {
473 | "kernelspec": {
474 | "display_name": "Python 3",
475 | "language": "python",
476 | "name": "python3"
477 | },
478 | "language_info": {
479 | "codemirror_mode": {
480 | "name": "ipython",
481 | "version": 3
482 | },
483 | "file_extension": ".py",
484 | "mimetype": "text/x-python",
485 | "name": "python",
486 | "nbconvert_exporter": "python",
487 | "pygments_lexer": "ipython3",
488 | "version": "3.5.2"
489 | }
490 | },
491 | "nbformat": 4,
492 | "nbformat_minor": 2
493 | }
494 |
--------------------------------------------------------------------------------
/Spark Recommendation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## pyspark协同过滤"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### user-based协同过滤"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "#-*- coding:utf8 -*-\n",
26 | "# pySpark实现的基于用户的协同过滤\n",
27 | "# 使用的余弦相似度\n",
28 | "\n",
29 | "import sys\n",
30 | "from collections import defaultdict\n",
31 | "from itertools import combinations\n",
32 | "import random\n",
33 | "import numpy as np\n",
34 | "import pdb\n",
35 | "\n",
36 | "from pyspark import SparkContext\n",
37 | "\n",
38 | "# user item rating timestamp\n",
39 | "def parseVectorOnUser(line):\n",
40 | " '''\n",
41 | " 解析数据,key是user,后面是item和打分\n",
42 | " '''\n",
43 | " line = line.split(\"|\")\n",
44 | " return line[0],(line[1],float(line[2]))\n",
45 | "\n",
46 | "def parseVectorOnItem(line):\n",
47 | " '''\n",
48 | " 解析数据,key是item,后面是user和打分\n",
49 | " '''\n",
50 | " line = line.split(\"|\")\n",
51 | " return line[1],(line[0],float(line[2]))\n",
52 | "\n",
53 | "def sampleInteractions(item_id,users_with_rating,n):\n",
54 | " '''\n",
55 | " 如果某个商品上用户行为特别多,可以选择适当做点下采样\n",
56 | " '''\n",
57 | " if len(users_with_rating) > n:\n",
58 | " return item_id, random.sample(users_with_rating,n)\n",
59 | " else:\n",
60 | " return item_id, users_with_rating\n",
61 | "\n",
62 | "def findUserPairs(item_id,users_with_rating):\n",
63 | " '''\n",
64 | " 对每个item,找到共同打分的user对\n",
65 | " '''\n",
66 | " for user1,user2 in combinations(users_with_rating,2):\n",
67 | " return (user1[0],user2[0]),(user1[1],user2[1])\n",
68 | "\n",
69 | "def calcSim(user_pair,rating_pairs):\n",
70 | " ''' \n",
71 | " 对每个user对,根据打分计算余弦距离,并返回共同打分的item个数\n",
72 | " '''\n",
73 | " sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)\n",
74 | " \n",
75 | " for rating_pair in rating_pairs:\n",
76 | " sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])\n",
77 | " sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])\n",
78 | " sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])\n",
79 | " # sum_y += rt[1]\n",
80 | " # sum_x += rt[0]\n",
81 | " n += 1\n",
82 | "\n",
83 | " cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))\n",
84 | " return user_pair, (cos_sim,n)\n",
85 | "\n",
86 | "def cosine(dot_product,rating_norm_squared,rating2_norm_squared):\n",
87 | " '''\n",
88 | " 2个向量A和B的余弦相似度\n",
89 | " dotProduct(A, B) / (norm(A) * norm(B))\n",
90 | " '''\n",
91 | " numerator = dot_product\n",
92 | " denominator = rating_norm_squared * rating2_norm_squared\n",
93 | "\n",
94 | " return (numerator / (float(denominator))) if denominator else 0.0\n",
95 | "\n",
96 | "def keyOnFirstUser(user_pair,item_sim_data):\n",
97 | " '''\n",
98 | " 对于每个user-user对,用第一个user做key(好像有点粗暴...)\n",
99 | " '''\n",
100 | " (user1_id,user2_id) = user_pair\n",
101 | " return user1_id,(user2_id,item_sim_data)\n",
102 | "\n",
103 | "def nearestNeighbors(user,users_and_sims,n):\n",
104 | " '''\n",
105 | " 选出相似度最高的N个邻居\n",
106 | " '''\n",
107 | " users_and_sims.sort(key=lambda x: x[1][0],reverse=True)\n",
108 | " return user, users_and_sims[:n]\n",
109 | "\n",
110 | "def topNRecommendations(user_id,user_sims,users_with_rating,n):\n",
111 | " '''\n",
112 | " 根据最近的N个邻居进行推荐\n",
113 | " '''\n",
114 | "\n",
115 | " totals = defaultdict(int)\n",
116 | " sim_sums = defaultdict(int)\n",
117 | "\n",
118 | " for (neighbor,(sim,count)) in user_sims:\n",
119 | "\n",
120 | " # 遍历邻居的打分\n",
121 | " unscored_items = users_with_rating.get(neighbor,None)\n",
122 | "\n",
123 | " if unscored_items:\n",
124 | " for (item,rating) in unscored_items:\n",
125 | " if neighbor != item:\n",
126 | "\n",
127 | " # 更新推荐度和相近度\n",
128 | " totals[neighbor] += sim * rating\n",
129 | " sim_sums[neighbor] += sim\n",
130 | "\n",
131 | " # 归一化\n",
132 | " scored_items = [(total/sim_sums[item],item) for item,total in totals.items()]\n",
133 | "\n",
134 | " # 按照推荐度降序排列\n",
135 | " scored_items.sort(reverse=True)\n",
136 | "\n",
137 | " # 推荐度的item\n",
138 | " ranked_items = [x[1] for x in scored_items]\n",
139 | "\n",
140 | " return user_id,ranked_items[:n]\n",
141 | "\n",
142 | "if __name__ == \"__main__\":\n",
143 | " if len(sys.argv) < 3:\n",
144 | " print >> sys.stderr, \\\n",
145 | " \"Usage: PythonUserCF \"\n",
146 | " exit(-1)\n",
147 | "\n",
148 | " sc = SparkContext(sys.argv[1],\"PythonUserCF\")\n",
149 | " lines = sc.textFile(sys.argv[2])\n",
150 | "\n",
151 | " '''\n",
152 | " 处理数据,获得稀疏item-user矩阵:\n",
153 | " item_id -> ((user_1,rating),(user2,rating))\n",
154 | " '''\n",
155 | " item_user_pairs = lines.map(parseVectorOnItem).groupByKey().map(\n",
156 | " lambda p: sampleInteractions(p[0],p[1],500)).cache()\n",
157 | "\n",
158 | " '''\n",
159 | " 获得2个用户所有的item-item对得分组合:\n",
160 | " (user1_id,user2_id) -> [(rating1,rating2),\n",
161 | " (rating1,rating2),\n",
162 | " (rating1,rating2),\n",
163 | " ...]\n",
164 | " '''\n",
165 | " pairwise_users = item_user_pairs.filter(\n",
166 | " lambda p: len(p[1]) > 1).map(\n",
167 | " lambda p: findUserPairs(p[0],p[1])).groupByKey()\n",
168 | "\n",
169 | " '''\n",
170 | " 计算余弦相似度,找到最近的N个邻居:\n",
171 | " (user1,user2) -> (similarity,co_raters_count)\n",
172 | " '''\n",
173 | " user_sims = pairwise_users.map(\n",
174 | " lambda p: calcSim(p[0],p[1])).map(\n",
175 | " lambda p: keyOnFirstUser(p[0],p[1])).groupByKey().map(\n",
176 | " lambda p: nearestNeighbors(p[0],p[1],50))\n",
177 | "\n",
178 | " ''' \n",
179 | " 对每个用户的打分记录整理成如下形式\n",
180 | " user_id -> [(item_id_1, rating_1),\n",
181 | " [(item_id_2, rating_2),\n",
182 | " ...]\n",
183 | " '''\n",
184 | "\n",
185 | " user_item_hist = lines.map(parseVectorOnUser).groupByKey().collect()\n",
186 | "\n",
187 | " ui_dict = {}\n",
188 | " for (user,items) in user_item_hist: \n",
189 | " ui_dict[user] = items\n",
190 | "\n",
191 | " uib = sc.broadcast(ui_dict)\n",
192 | "\n",
193 | " '''\n",
194 | " 为每个用户计算Top N的推荐\n",
195 | " user_id -> [item1,item2,item3,...]\n",
196 | " '''\n",
197 | " user_item_recs = user_sims.map(lambda p: topNRecommendations(p[0],p[1],uib.value,100)).collect()"
198 | ]
199 | },
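  | {
  | "cell_type": "markdown",
  | "metadata": {},
  | "source": [
  | "The cell above is written as a standalone job. A hypothetical invocation, assuming it is saved as `user_cf.py` and the input file contains `user|item|rating` lines (matching the `|`-separated parsing above):\n",
  | "\n",
  | "`spark-submit user_cf.py local[4] ratings.dat`"
  | ]
  | },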
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### item-based协同过滤"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": [
215 | "#-*- coding:utf8 -*-\n",
216 | "# pySpark实现的基于物品的协同过滤\n",
217 | "\n",
218 | "import sys\n",
219 | "from collections import defaultdict\n",
220 | "from itertools import combinations\n",
221 | "import numpy as np\n",
222 | "import random\n",
223 | "import csv\n",
224 | "import pdb\n",
225 | "\n",
226 | "from pyspark import SparkContext\n",
227 | "\n",
228 | "def parseVector(line):\n",
229 | " '''\n",
230 | " 解析数据,key是item,后面是user和打分\n",
231 | " '''\n",
232 | " line = line.split(\"|\")\n",
233 | " return line[0],(line[1],float(line[2]))\n",
234 | "\n",
235 | "def sampleInteractions(user_id,items_with_rating,n):\n",
236 | " '''\n",
237 | " 如果某个用户打分行为特别多,可以选择适当做点下采样\n",
238 | " '''\n",
239 | " if len(items_with_rating) > n:\n",
240 | " return user_id, random.sample(items_with_rating,n)\n",
241 | " else:\n",
242 | " return user_id, items_with_rating\n",
243 | "\n",
244 | "def findItemPairs(user_id,items_with_rating):\n",
245 | " '''\n",
246 | " 对每个用户的打分item,组对\n",
247 | " '''\n",
248 | " for item1,item2 in combinations(items_with_rating,2):\n",
249 | " return (item1[0],item2[0]),(item1[1],item2[1])\n",
250 | "\n",
251 | "def calcSim(item_pair,rating_pairs):\n",
252 | " ''' \n",
253 | " 对每个item对,根据打分计算余弦距离,并返回共同打分的user个数\n",
254 | " '''\n",
255 | " sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)\n",
256 | " \n",
257 | " for rating_pair in rating_pairs:\n",
258 | " sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])\n",
259 | " sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])\n",
260 | " sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])\n",
261 | " # sum_y += rt[1]\n",
262 | " # sum_x += rt[0]\n",
263 | " n += 1\n",
264 | "\n",
265 | " cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))\n",
266 | " return item_pair, (cos_sim,n)\n",
267 | "\n",
268 | "def cosine(dot_product,rating_norm_squared,rating2_norm_squared):\n",
269 | " '''\n",
270 | " The cosine between two vectors A, B\n",
271 | " dotProduct(A, B) / (norm(A) * norm(B))\n",
272 | " '''\n",
273 | " numerator = dot_product\n",
274 | " denominator = rating_norm_squared * rating2_norm_squared\n",
275 | " return (numerator / (float(denominator))) if denominator else 0.0\n",
276 | "\n",
277 | "def correlation(size, dot_product, rating_sum, \\\n",
278 | " rating2sum, rating_norm_squared, rating2_norm_squared):\n",
279 | " '''\n",
280 | " 2个向量A和B的相似度\n",
281 | " [n * dotProduct(A, B) - sum(A) * sum(B)] /\n",
282 | " sqrt{ [n * norm(A)^2 - sum(A)^2] [n * norm(B)^2 - sum(B)^2] }\n",
283 | "\n",
284 | " '''\n",
285 | " numerator = size * dot_product - rating_sum * rating2sum\n",
286 | " denominator = sqrt(size * rating_norm_squared - rating_sum * rating_sum) * \\\n",
287 | " sqrt(size * rating2_norm_squared - rating2sum * rating2sum)\n",
288 | "\n",
289 | " return (numerator / (float(denominator))) if denominator else 0.0\n",
290 | "\n",
291 | "def keyOnFirstItem(item_pair,item_sim_data):\n",
292 | " '''\n",
293 | " 对于每个item-item对,用第一个item做key(好像有点粗暴...)\n",
294 | " '''\n",
295 | " (item1_id,item2_id) = item_pair\n",
296 | " return item1_id,(item2_id,item_sim_data)\n",
297 | "\n",
298 | "def nearestNeighbors(item_id,items_and_sims,n):\n",
299 | " '''\n",
300 | " 排序选出相似度最高的N个邻居\n",
301 | " '''\n",
302 | " items_and_sims.sort(key=lambda x: x[1][0],reverse=True)\n",
303 | " return item_id, items_and_sims[:n]\n",
304 | "\n",
305 | "def topNRecommendations(user_id,items_with_rating,item_sims,n):\n",
306 | " '''\n",
307 | " 根据最近的N个邻居进行推荐\n",
308 | " '''\n",
309 | " \n",
310 | " totals = defaultdict(int)\n",
311 | " sim_sums = defaultdict(int)\n",
312 | "\n",
313 | " for (item,rating) in items_with_rating:\n",
314 | "\n",
315 | " # 遍历item的邻居\n",
316 | " nearest_neighbors = item_sims.get(item,None)\n",
317 | "\n",
318 | " if nearest_neighbors:\n",
319 | " for (neighbor,(sim,count)) in nearest_neighbors:\n",
320 | " if neighbor != item:\n",
321 | "\n",
322 | " # 更新推荐度和相近度\n",
323 | " totals[neighbor] += sim * rating\n",
324 | " sim_sums[neighbor] += sim\n",
325 | "\n",
326 | " # 归一化\n",
327 | " scored_items = [(total/sim_sums[item],item) for item,total in totals.items()]\n",
328 | "\n",
329 | " # 按照推荐度降序排列\n",
330 | " scored_items.sort(reverse=True)\n",
331 | "\n",
332 | " ranked_items = [x[1] for x in scored_items]\n",
333 | "\n",
334 | " return user_id,ranked_items[:n]\n",
335 | "\n",
336 | "if __name__ == \"__main__\":\n",
337 | " if len(sys.argv) < 3:\n",
338 | " print >> sys.stderr, \\\n",
339 | " \"Usage: PythonItemCF \"\n",
340 | " exit(-1)\n",
341 | "\n",
342 | " sc = SparkContext(sys.argv[1], \"PythonItemCF\")\n",
343 | " lines = sc.textFile(sys.argv[2])\n",
344 | "\n",
345 | " ''' \n",
346 | " 处理数据,获得稀疏user-item矩阵:\n",
347 | " user_id -> [(item_id_1, rating_1),\n",
348 | " [(item_id_2, rating_2),\n",
349 | " ...]\n",
350 | " '''\n",
351 | " user_item_pairs = lines.map(parseVector).groupByKey().map(\n",
352 | " lambda p: sampleInteractions(p[0],p[1],500)).cache()\n",
353 | "\n",
354 | " '''\n",
355 | " 获取所有item-item组合对\n",
356 | " (item1,item2) -> [(item1_rating,item2_rating),\n",
357 | " (item1_rating,item2_rating),\n",
358 | " ...]\n",
359 | " '''\n",
360 | "\n",
361 | " pairwise_items = user_item_pairs.filter(\n",
362 | " lambda p: len(p[1]) > 1).map(\n",
363 | " lambda p: findItemPairs(p[0],p[1])).groupByKey()\n",
364 | "\n",
365 | " '''\n",
366 | " 计算余弦相似度,找到最近的N个邻居:\n",
367 | " (item1,item2) -> (similarity,co_raters_count)\n",
368 | " '''\n",
369 | "\n",
370 | " item_sims = pairwise_items.map(\n",
371 | " lambda p: calcSim(p[0],p[1])).map(\n",
372 | " lambda p: keyOnFirstItem(p[0],p[1])).groupByKey().map(\n",
373 | " lambda p: nearestNeighbors(p[0],p[1],50)).collect()\n",
374 | "\n",
375 | "\n",
376 | " item_sim_dict = {}\n",
377 | " for (item,data) in item_sims: \n",
378 | " item_sim_dict[item] = data\n",
379 | "\n",
380 | " isb = sc.broadcast(item_sim_dict)\n",
381 | "\n",
382 | " '''\n",
383 | " 计算最佳的N个推荐结果\n",
384 | " user_id -> [item1,item2,item3,...]\n",
385 | " '''\n",
386 | " user_item_recs = user_item_pairs.map(lambda p: topNRecommendations(p[0],p[1],isb.value,500)).collect()"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {
393 | "collapsed": true
394 | },
395 | "outputs": [],
396 | "source": []
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "## Spark推荐系统"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "### spark自带了用于推荐的算法"
410 | ]
411 | },
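{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Before the full script, a minimal sketch of the MLlib ALS API it relies on. This assumes a running `SparkContext` named `sc`, and the tiny ratings RDD is fabricated just to show the call signatures (`ALS.train` and `model.predict`)."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "collapsed": true
 },
 "outputs": [],
 "source": [
  "from pyspark.mllib.recommendation import ALS, Rating\n",
  "\n",
  "# three fabricated (user, product, rating) triples\n",
  "tiny = sc.parallelize([Rating(0, 1, 4.0), Rating(0, 2, 1.0), Rating(1, 1, 5.0)])\n",
  "\n",
  "# factorize into rank-8 latent vectors: 10 ALS sweeps, regularization 0.1\n",
  "model = ALS.train(tiny, rank=8, iterations=10, lambda_=0.1)\n",
  "\n",
  "# predicted rating of user 1 for product 2\n",
  "print(model.predict(1, 2))"
 ]
},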
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {
416 | "collapsed": true
417 | },
418 | "outputs": [],
419 | "source": [
420 | "#!/usr/bin/env python\n",
421 | "# 基于spark中ALS的推荐系统,针对movielens中电影打分数据做推荐\n",
422 | "# Edit:寒小阳(hanxiaoyang.ml@gmail.com)\n",
423 | "\n",
424 | "import sys\n",
425 | "import itertools\n",
426 | "from math import sqrt\n",
427 | "from operator import add\n",
428 | "from os.path import join, isfile, dirname\n",
429 | "\n",
430 | "from pyspark import SparkConf, SparkContext\n",
431 | "from pyspark.mllib.recommendation import ALS\n",
432 | "\n",
433 | "def parseRating(line):\n",
434 | " \"\"\"\n",
435 | " MovieLens的打分格式是userId::movieId::rating::timestamp\n",
436 | " 我们对格式做一个解析\n",
437 | " \"\"\"\n",
438 | " fields = line.strip().split(\"::\")\n",
439 | " return long(fields[3]) % 10, (int(fields[0]), int(fields[1]), float(fields[2]))\n",
440 | "\n",
441 | "def parseMovie(line):\n",
442 | " \"\"\"\n",
443 | " 对应的电影文件的格式为movieId::movieTitle\n",
444 | " 解析成int id, 文本\n",
445 | " \"\"\"\n",
446 | " fields = line.strip().split(\"::\")\n",
447 | " return int(fields[0]), fields[1]\n",
448 | "\n",
449 | "def loadRatings(ratingsFile):\n",
450 | " \"\"\"\n",
451 | " 载入得分\n",
452 | " \"\"\"\n",
453 | " if not isfile(ratingsFile):\n",
454 | " print \"File %s does not exist.\" % ratingsFile\n",
455 | " sys.exit(1)\n",
456 | " f = open(ratingsFile, 'r')\n",
457 | " ratings = filter(lambda r: r[2] > 0, [parseRating(line)[1] for line in f])\n",
458 | " f.close()\n",
459 | " if not ratings:\n",
460 | " print \"No ratings provided.\"\n",
461 | " sys.exit(1)\n",
462 | " else:\n",
463 | " return ratings\n",
464 | "\n",
465 | "def computeRmse(model, data, n):\n",
466 | " \"\"\"\n",
467 | " 评估的时候要用的,计算均方根误差\n",
468 | " \"\"\"\n",
469 | " predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))\n",
470 | " predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \\\n",
471 | " .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \\\n",
472 | " .values()\n",
473 | " return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))\n",
474 | "\n",
475 | "if __name__ == \"__main__\":\n",
476 | " if (len(sys.argv) != 3):\n",
477 | " print \"Usage: /path/to/spark/bin/spark-submit --driver-memory 2g \" + \\\n",
478 | " \"MovieLensALS.py movieLensDataDir personalRatingsFile\"\n",
479 | " sys.exit(1)\n",
480 | "\n",
481 | " # 设定环境\n",
482 | " conf = SparkConf() \\\n",
483 | " .setAppName(\"MovieLensALS\") \\\n",
484 | " .set(\"spark.executor.memory\", \"2g\")\n",
485 | " sc = SparkContext(conf=conf)\n",
486 | "\n",
487 | " # 载入打分数据\n",
488 | " myRatings = loadRatings(sys.argv[2])\n",
489 | " myRatingsRDD = sc.parallelize(myRatings, 1)\n",
490 | "\n",
491 | " movieLensHomeDir = sys.argv[1]\n",
492 | "\n",
493 | " # 得到的ratings为(时间戳最后一位整数, (userId, movieId, rating))格式的RDD\n",
494 | " ratings = sc.textFile(join(movieLensHomeDir, \"ratings.dat\")).map(parseRating)\n",
495 | "\n",
496 | " # 得到的movies为(movieId, movieTitle)格式的RDD\n",
497 | " movies = dict(sc.textFile(join(movieLensHomeDir, \"movies.dat\")).map(parseMovie).collect())\n",
498 | "\n",
499 | " numRatings = ratings.count()\n",
500 | " numUsers = ratings.values().map(lambda r: r[0]).distinct().count()\n",
501 | " numMovies = ratings.values().map(lambda r: r[1]).distinct().count()\n",
502 | "\n",
503 | " print \"Got %d ratings from %d users on %d movies.\" % (numRatings, numUsers, numMovies)\n",
504 | "\n",
505 | " # 根据时间戳最后一位把整个数据集分成训练集(60%), 交叉验证集(20%), 和评估集(20%)\n",
506 | "\n",
507 | " # 训练, 交叉验证, 测试 集都是(userId, movieId, rating)格式的RDD\n",
508 | "\n",
509 | " numPartitions = 4\n",
510 | " training = ratings.filter(lambda x: x[0] < 6) \\\n",
511 | " .values() \\\n",
512 | " .union(myRatingsRDD) \\\n",
513 | " .repartition(numPartitions) \\\n",
514 | " .cache()\n",
515 | "\n",
516 | " validation = ratings.filter(lambda x: x[0] >= 6 and x[0] < 8) \\\n",
517 | " .values() \\\n",
518 | " .repartition(numPartitions) \\\n",
519 | " .cache()\n",
520 | "\n",
521 | " test = ratings.filter(lambda x: x[0] >= 8).values().cache()\n",
522 | "\n",
523 | " numTraining = training.count()\n",
524 | " numValidation = validation.count()\n",
525 | " numTest = test.count()\n",
526 | "\n",
527 | " print \"Training: %d, validation: %d, test: %d\" % (numTraining, numValidation, numTest)\n",
528 | "\n",
529 | " # 训练模型,在交叉验证集上看效果\n",
530 | "\n",
531 | " ranks = [8, 12]\n",
532 | " lambdas = [0.1, 10.0]\n",
533 | " numIters = [10, 20]\n",
534 | " bestModel = None\n",
535 | " bestValidationRmse = float(\"inf\")\n",
536 | " bestRank = 0\n",
537 | " bestLambda = -1.0\n",
538 | " bestNumIter = -1\n",
539 | "\n",
540 | " for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):\n",
541 | " model = ALS.train(training, rank, numIter, lmbda)\n",
542 | " validationRmse = computeRmse(model, validation, numValidation)\n",
543 | " print \"RMSE (validation) = %f for the model trained with \" % validationRmse + \\\n",
544 | " \"rank = %d, lambda = %.1f, and numIter = %d.\" % (rank, lmbda, numIter)\n",
545 | " if (validationRmse < bestValidationRmse):\n",
546 | " bestModel = model\n",
547 | " bestValidationRmse = validationRmse\n",
548 | " bestRank = rank\n",
549 | " bestLambda = lmbda\n",
550 | " bestNumIter = numIter\n",
551 | "\n",
552 | " testRmse = computeRmse(bestModel, test, numTest)\n",
553 | "\n",
554 | " # 在测试集上评估 交叉验证集上最好的模型\n",
555 | " print \"The best model was trained with rank = %d and lambda = %.1f, \" % (bestRank, bestLambda) \\\n",
556 | " + \"and numIter = %d, and its RMSE on the test set is %f.\" % (bestNumIter, testRmse)\n",
557 | "\n",
558 | " # 我们把基线模型设定为每次都返回平均得分的模型\n",
559 | " meanRating = training.union(validation).map(lambda x: x[2]).mean()\n",
560 | " baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)\n",
561 | " improvement = (baselineRmse - testRmse) / baselineRmse * 100\n",
562 | " print \"The best model improves the baseline by %.2f\" % (improvement) + \"%.\"\n",
563 | "\n",
564 | " # 个性化的推荐(针对某个用户)\n",
565 | "\n",
566 | " myRatedMovieIds = set([x[1] for x in myRatings])\n",
567 | " candidates = sc.parallelize([m for m in movies if m not in myRatedMovieIds])\n",
568 | " predictions = bestModel.predictAll(candidates.map(lambda x: (0, x))).collect()\n",
569 | " recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:50]\n",
570 | "\n",
571 | " print \"Movies recommended for you:\"\n",
572 | " for i in xrange(len(recommendations)):\n",
573 | " print (\"%2d: %s\" % (i + 1, movies[recommendations[i][1]])).encode('ascii', 'ignore')\n",
574 | "\n",
575 | " # clean up\n",
576 | " sc.stop()"
577 | ]
578 | }
579 | ],
580 | "metadata": {
581 | "kernelspec": {
582 | "display_name": "Python 3",
583 | "language": "python",
584 | "name": "python3"
585 | },
586 | "language_info": {
587 | "codemirror_mode": {
588 | "name": "ipython",
589 | "version": 3
590 | },
591 | "file_extension": ".py",
592 | "mimetype": "text/x-python",
593 | "name": "python",
594 | "nbconvert_exporter": "python",
595 | "pygments_lexer": "ipython3",
596 | "version": "3.5.2"
597 | }
598 | },
599 | "nbformat": 4,
600 | "nbformat_minor": 2
601 | }
602 |
--------------------------------------------------------------------------------