├── Music Recommend System.ipynb ├── Music Recommend use Tensorflow .ipynb ├── README.md ├── Sequence Modelling.ipynb ├── Spark Recommendation.ipynb └── images ├── 1.jpg └── 2.jpg /Music Recommend use Tensorflow .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 目的:用tensorflow来完成一个在批量数据上更新,并且可以增量迭代优化的矩阵分解推荐系统" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 矩阵分解\n", 15 | "![](svd_recommendation.png)\n", 16 | "LFM:把用户在item上打分的行为,看作是有内部依据的,认为和k个factor有关系
\n", 17 | "每一个user i会有一个用户的向量(k维),每一个item会有一个item的向量(k维)\n", 18 | "\n", 19 | "SVD是矩阵分解的一种方式\n", 20 | "\n", 21 | "### 预测公式如下\n", 22 | "$y_{pred[u, i]} = bias_{global} + bias_{user[u]} + bias_{item_[i]} + $\n", 23 | "\n", 24 | "### 我们需要最小化的loss计算如下(添加正则化项)\n", 25 | "$\\sum_{u, i} |y_{pred[u, i]} - y_{true[u, i]}|^2 + \\lambda(|embedding_{user[u]}|^2 + |embedding_{item[i]}|^2)$" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### 数据处理" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "import numpy as np\n", 53 | "import pandas as pd\n", 54 | "\n", 55 | "\n", 56 | "def read_data_and_process(filname, sep=\"\\t\"):\n", 57 | " col_names = [\"user\", \"item\", \"rate\", \"st\"]\n", 58 | " df = pd.read_csv(filname, sep=sep, header=None, names=col_names, engine='python')\n", 59 | " df[\"user\"] -= 1\n", 60 | " df[\"item\"] -= 1\n", 61 | " for col in (\"user\", \"item\"):\n", 62 | " df[col] = df[col].astype(np.int32)\n", 63 | " df[\"rate\"] = df[\"rate\"].astype(np.float32)\n", 64 | " return df\n", 65 | "\n", 66 | "\n", 67 | "class ShuffleDataIterator(object):\n", 68 | " \"\"\"\n", 69 | " 随机生成一个batch一个batch数据\n", 70 | " \"\"\"\n", 71 | " #初始化\n", 72 | " def __init__(self, inputs, batch_size=10):\n", 73 | " self.inputs = inputs\n", 74 | " self.batch_size = batch_size\n", 75 | " self.num_cols = len(self.inputs)\n", 76 | " self.len = len(self.inputs[0])\n", 77 | " self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))\n", 78 | "\n", 79 | " #总样本量\n", 80 | " def __len__(self):\n", 81 | " return self.len\n", 82 | "\n", 83 | " def __iter__(self):\n", 84 | " return self\n", 85 | "\n", 86 | " #取出下一个batch\n", 87 | " def __next__(self):\n", 88 | " return self.next()\n", 89 | " \n", 90 | " #随机生成batch_size个下标,取出对应的样本\n", 91 | " def next(self):\n", 92 | " ids = np.random.randint(0, self.len, (self.batch_size,))\n", 93 | " out = self.inputs[ids, :]\n", 94 | " return [out[:, i] for i in range(self.num_cols)]\n", 95 | "\n", 96 | "\n", 97 | "class OneEpochDataIterator(ShuffleDataIterator):\n", 98 | " \"\"\"\n", 99 | " 顺序产出一个epoch的数据,在测试中可能会用到\n", 100 | " \"\"\"\n", 101 | " def __init__(self, inputs, batch_size=10):\n", 102 | " super(OneEpochDataIterator, self).__init__(inputs, batch_size=batch_size)\n", 103 | " if batch_size > 0:\n", 104 | " self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))\n", 105 | " else:\n", 106 | " self.idx_group = [np.arange(self.len)]\n", 107 | " self.group_id = 0\n", 108 | "\n", 109 | " def next(self):\n", 110 | " if self.group_id >= len(self.idx_group):\n", 111 | " self.group_id = 0\n", 112 | " raise StopIteration\n", 113 | " out = self.inputs[self.idx_group[self.group_id], :]\n", 114 | " self.group_id += 1\n", 115 | " return [out[:, i] for i in range(self.num_cols)]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### 模型搭建\n", 123 | "用tensorflow去搭建一个可增量训练的矩阵分解模型,完成基于矩阵分解的推荐系统" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 2, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "\n", 135 | "import tensorflow as tf\n", 136 | "\n", 137 | "# 使用矩阵分解搭建的网络结构\n", 138 | "def 
inference_svd(user_batch, item_batch, user_num, item_num, dim=5, device=\"/cpu:0\"):\n", 139 | " #使用CPU\n", 140 | " with tf.device(\"/cpu:0\"):\n", 141 | " # 初始化几个bias项\n", 142 | " global_bias = tf.get_variable(\"global_bias\", shape=[])\n", 143 | " w_bias_user = tf.get_variable(\"embd_bias_user\", shape=[user_num])\n", 144 | " w_bias_item = tf.get_variable(\"embd_bias_item\", shape=[item_num])\n", 145 | " # bias向量\n", 146 | " bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name=\"bias_user\")\n", 147 | " bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name=\"bias_item\")\n", 148 | " w_user = tf.get_variable(\"embd_user\", shape=[user_num, dim],\n", 149 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", 150 | " w_item = tf.get_variable(\"embd_item\", shape=[item_num, dim],\n", 151 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", 152 | " # user向量与item向量\n", 153 | " embd_user = tf.nn.embedding_lookup(w_user, user_batch, name=\"embedding_user\")\n", 154 | " embd_item = tf.nn.embedding_lookup(w_item, item_batch, name=\"embedding_item\")\n", 155 | " with tf.device(device):\n", 156 | " # 按照实际公式进行计算\n", 157 | " # 先对user向量和item向量求内积\n", 158 | " infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)\n", 159 | " # 加上几个偏置项\n", 160 | " infer = tf.add(infer, global_bias)\n", 161 | " infer = tf.add(infer, bias_user)\n", 162 | " infer = tf.add(infer, bias_item, name=\"svd_inference\")\n", 163 | " # 加上正则化项\n", 164 | " regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name=\"svd_regularizer\")\n", 165 | " return infer, regularizer\n", 166 | "\n", 167 | "# 迭代优化部分\n", 168 | "def optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device=\"/cpu:0\"):\n", 169 | " global_step = tf.train.get_global_step()\n", 170 | " assert global_step is not None\n", 171 | " # 选择合适的optimizer做优化\n", 172 | " with tf.device(device):\n", 173 | " cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))\n", 174 | " penalty = tf.constant(reg, dtype=tf.float32, shape=[], name=\"l2\")\n", 175 | " cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))\n", 176 | " train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step=global_step)\n", 177 | " return cost, train_op" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### 模型训练" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 3, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "import time\n", 196 | "from collections import deque\n", 197 | "\n", 198 | "import numpy as np\n", 199 | "import tensorflow as tf\n", 200 | "from six import next\n", 201 | "from tensorflow.core.framework import summary_pb2\n", 202 | "\n", 203 | "np.random.seed(13575)\n", 204 | "\n", 205 | "# 一批数据的大小\n", 206 | "BATCH_SIZE = 2000\n", 207 | "# 用户数\n", 208 | "USER_NUM = 6040\n", 209 | "# 电影数\n", 210 | "ITEM_NUM = 3952\n", 211 | "# factor维度\n", 212 | "DIM = 15\n", 213 | "# 最大迭代轮数\n", 214 | "EPOCH_MAX = 200\n", 215 | "# 使用cpu做训练\n", 216 | "DEVICE = \"/cpu:0\"\n", 217 | "\n", 218 | "# 截断\n", 219 | "def clip(x):\n", 220 | " return np.clip(x, 1.0, 5.0)\n", 221 | "\n", 222 | "# 这个是方便Tensorboard可视化做的summary\n", 223 | "def make_scalar_summary(name, val):\n", 224 | " return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])\n", 225 | "\n", 226 | "# 调用上面的函数获取数据\n", 227 | "def get_data():\n", 228 | " df = 
read_data_and_process(\"./movielens/ml-1m/ratings.dat\", sep=\"::\")\n", 229 | " rows = len(df)\n", 230 | " df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)\n", 231 | " split_index = int(rows * 0.9)\n", 232 | " df_train = df[0:split_index]\n", 233 | " df_test = df[split_index:].reset_index(drop=True)\n", 234 | " print(df_train.shape, df_test.shape)\n", 235 | " return df_train, df_test\n", 236 | "\n", 237 | "# 实际训练过程\n", 238 | "def svd(train, test):\n", 239 | " samples_per_batch = len(train) // BATCH_SIZE\n", 240 | "\n", 241 | " # 一批一批数据用于训练\n", 242 | " iter_train = ShuffleDataIterator([train[\"user\"],\n", 243 | " train[\"item\"],\n", 244 | " train[\"rate\"]],\n", 245 | " batch_size=BATCH_SIZE)\n", 246 | " # 测试数据\n", 247 | " iter_test = OneEpochDataIterator([test[\"user\"],\n", 248 | " test[\"item\"],\n", 249 | " test[\"rate\"]],\n", 250 | " batch_size=-1)\n", 251 | " # user和item batch\n", 252 | " user_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_user\")\n", 253 | " item_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_item\")\n", 254 | " rate_batch = tf.placeholder(tf.float32, shape=[None])\n", 255 | "\n", 256 | " # 构建graph和训练\n", 257 | " infer, regularizer = inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,\n", 258 | " device=DEVICE)\n", 259 | " global_step = tf.contrib.framework.get_or_create_global_step()\n", 260 | " _, train_op = optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)\n", 261 | "\n", 262 | " # 初始化所有变量\n", 263 | " init_op = tf.global_variables_initializer()\n", 264 | " # 开始迭代\n", 265 | " with tf.Session() as sess:\n", 266 | " sess.run(init_op)\n", 267 | " summary_writer = tf.summary.FileWriter(logdir=\"/tmp/svd/log\", graph=sess.graph)\n", 268 | " print(\"{} {} {} {}\".format(\"epoch\", \"train_error\", \"val_error\", \"elapsed_time\"))\n", 269 | " errors = deque(maxlen=samples_per_batch)\n", 270 | " start = time.time()\n", 271 | " for i in range(EPOCH_MAX * samples_per_batch):\n", 272 | " users, items, rates = next(iter_train)\n", 273 | " _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,\n", 274 | " item_batch: items,\n", 275 | " rate_batch: rates})\n", 276 | " pred_batch = clip(pred_batch)\n", 277 | " errors.append(np.power(pred_batch - rates, 2))\n", 278 | " if i % samples_per_batch == 0:\n", 279 | " train_err = np.sqrt(np.mean(errors))\n", 280 | " test_err2 = np.array([])\n", 281 | " for users, items, rates in iter_test:\n", 282 | " pred_batch = sess.run(infer, feed_dict={user_batch: users,\n", 283 | " item_batch: items})\n", 284 | " pred_batch = clip(pred_batch)\n", 285 | " test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n", 286 | " end = time.time()\n", 287 | " test_err = np.sqrt(np.mean(test_err2))\n", 288 | " print(\"{:3d} {:f} {:f} {:f}(s)\".format(i // samples_per_batch, train_err, test_err,\n", 289 | " end - start))\n", 290 | " train_err_summary = make_scalar_summary(\"training_error\", train_err)\n", 291 | " test_err_summary = make_scalar_summary(\"test_error\", test_err)\n", 292 | " summary_writer.add_summary(train_err_summary, i)\n", 293 | " summary_writer.add_summary(test_err_summary, i)\n", 294 | " start = end" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 4, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "(900188, 4) (100021, 4)\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "# 获取数据\n", 312 | 
"df_train, df_test = get_data()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 5, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stderr", 322 | "output_type": "stream", 323 | "text": [ 324 | "D:\\Anaconda\\install\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 325 | " from ._conv import register_converters as _register_converters\n" 326 | ] 327 | }, 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "epoch train_error val_error elapsed_time\n", 333 | " 0 2.576278 2.577729 0.477677(s)\n", 334 | " 1 1.978902 1.152332 1.450331(s)\n", 335 | " 2 1.002632 0.949393 1.423475(s)\n", 336 | " 3 0.927719 0.926508 1.450366(s)\n", 337 | " 4 0.914275 0.919153 1.534211(s)\n", 338 | " 5 0.910865 0.915688 1.624905(s)\n", 339 | " 6 0.906089 0.913335 1.514213(s)\n", 340 | " 7 0.904977 0.911318 1.424107(s)\n", 341 | " 8 0.901721 0.908855 1.496397(s)\n", 342 | " 9 0.896913 0.906264 1.611612(s)\n", 343 | " 10 0.894468 0.903484 1.795376(s)\n", 344 | " 11 0.891712 0.899968 1.503599(s)\n", 345 | " 12 0.887555 0.895848 1.421528(s)\n", 346 | " 13 0.882009 0.891982 1.420877(s)\n", 347 | " 14 0.876975 0.888060 1.567020(s)\n", 348 | " 15 0.872943 0.884968 1.627547(s)\n", 349 | " 16 0.867226 0.881633 1.486202(s)\n", 350 | " 17 0.864066 0.878666 1.431954(s)\n", 351 | " 18 0.859931 0.875910 1.438102(s)\n", 352 | " 19 0.856037 0.873030 1.433395(s)\n", 353 | " 20 0.849924 0.870667 1.421893(s)\n", 354 | " 21 0.846303 0.868094 1.397581(s)\n", 355 | " 22 0.842261 0.865835 1.398386(s)\n", 356 | " 23 0.836717 0.863661 1.395191(s)\n", 357 | " 24 0.833121 0.861465 1.390639(s)\n", 358 | " 25 0.829651 0.859585 1.461942(s)\n", 359 | " 26 0.824811 0.857843 1.403177(s)\n", 360 | " 27 0.820917 0.856483 1.398302(s)\n", 361 | " 28 0.816505 0.854711 1.390662(s)\n", 362 | " 29 0.813360 0.853433 1.402261(s)\n", 363 | " 30 0.808135 0.852419 1.468770(s)\n", 364 | " 31 0.805145 0.851025 1.394093(s)\n", 365 | " 32 0.799418 0.849873 1.406268(s)\n", 366 | " 33 0.797527 0.849210 1.390641(s)\n", 367 | " 34 0.794350 0.848693 1.399524(s)\n", 368 | " 35 0.792427 0.848298 1.447494(s)\n", 369 | " 36 0.789376 0.847890 1.403100(s)\n", 370 | " 37 0.786277 0.847480 1.406268(s)\n", 371 | " 38 0.783722 0.847279 1.392901(s)\n", 372 | " 39 0.781859 0.846988 1.433542(s)\n", 373 | " 40 0.779194 0.846766 1.460803(s)\n", 374 | " 41 0.776687 0.846418 1.502838(s)\n", 375 | " 42 0.774345 0.846484 1.477898(s)\n", 376 | " 43 0.773097 0.846666 1.470419(s)\n", 377 | " 44 0.772025 0.846828 1.406287(s)\n", 378 | " 45 0.769199 0.846732 1.445719(s)\n", 379 | " 46 0.768910 0.846695 1.390641(s)\n", 380 | " 47 0.766496 0.846699 1.395308(s)\n", 381 | " 48 0.765846 0.846611 1.407348(s)\n", 382 | " 49 0.764256 0.846703 1.406266(s)\n", 383 | " 50 0.762772 0.846718 1.446595(s)\n", 384 | " 51 0.761644 0.847029 1.390661(s)\n", 385 | " 52 0.760738 0.847263 1.413254(s)\n", 386 | " 53 0.759950 0.847614 1.419673(s)\n", 387 | " 54 0.759713 0.847827 1.400326(s)\n", 388 | " 55 0.757802 0.847982 1.421893(s)\n", 389 | " 56 0.757559 0.848026 1.437567(s)\n", 390 | " 57 0.757013 0.848383 1.414295(s)\n", 391 | " 58 0.756566 0.848557 1.390639(s)\n", 392 | " 59 0.756866 0.848483 1.413066(s)\n", 393 | " 60 0.753830 0.848556 1.406267(s)\n", 394 | " 61 0.754405 0.848785 1.447202(s)\n", 395 | " 62 0.754867 0.848690 1.390621(s)\n", 396 | " 63 0.753079 
0.848909 1.416439(s)\n", 397 | " 64 0.753559 0.848946 1.383843(s)\n", 398 | " 65 0.753118 0.849353 1.399104(s)\n", 399 | " 66 0.751364 0.849349 1.481309(s)\n", 400 | " 67 0.752177 0.849697 1.449126(s)\n", 401 | " 68 0.751095 0.849683 1.468768(s)\n", 402 | " 69 0.751063 0.849502 1.383999(s)\n", 403 | " 70 0.750350 0.849622 1.406266(s)\n", 404 | " 71 0.751395 0.849533 1.446464(s)\n", 405 | " 72 0.750082 0.849392 1.400997(s)\n", 406 | " 73 0.750379 0.849434 1.388548(s)\n", 407 | " 74 0.749501 0.849552 1.407498(s)\n", 408 | " 75 0.750194 0.849896 1.461215(s)\n", 409 | " 76 0.750201 0.849961 1.446918(s)\n", 410 | " 77 0.749083 0.850167 1.404643(s)\n", 411 | " 78 0.750445 0.850135 1.404541(s)\n", 412 | " 79 0.749501 0.849938 1.453143(s)\n", 413 | " 80 0.747849 0.850081 1.394538(s)\n", 414 | " 81 0.747658 0.850377 1.500019(s)\n", 415 | " 82 0.747445 0.850573 1.417488(s)\n", 416 | " 83 0.748725 0.850522 1.484394(s)\n", 417 | " 84 0.748016 0.850637 1.407718(s)\n", 418 | " 85 0.746435 0.850938 1.380105(s)\n", 419 | " 86 0.747316 0.850969 1.448309(s)\n", 420 | " 87 0.746777 0.850801 1.406286(s)\n", 421 | " 88 0.746731 0.850807 1.400060(s)\n", 422 | " 89 0.747924 0.850830 1.385019(s)\n", 423 | " 90 0.746106 0.850674 1.400585(s)\n", 424 | " 91 0.746864 0.850689 1.419417(s)\n", 425 | " 92 0.746962 0.850772 1.461914(s)\n", 426 | " 93 0.746395 0.850632 1.400366(s)\n", 427 | " 94 0.746491 0.850653 1.390425(s)\n", 428 | " 95 0.746701 0.850703 1.469709(s)\n", 429 | " 96 0.745090 0.850457 1.413338(s)\n", 430 | " 97 0.745649 0.850722 1.436355(s)\n", 431 | " 98 0.745338 0.850862 1.437517(s)\n", 432 | " 99 0.745499 0.850813 1.481134(s)\n", 433 | "100 0.745503 0.850798 1.374998(s)\n", 434 | "101 0.745268 0.850891 1.413138(s)\n", 435 | "102 0.745327 0.850786 1.476407(s)\n", 436 | "103 0.746660 0.850860 1.468770(s)\n", 437 | "104 0.745549 0.851016 1.390663(s)\n", 438 | "105 0.744760 0.850981 1.399773(s)\n", 439 | "106 0.745388 0.850703 1.395961(s)\n", 440 | "107 0.745142 0.850666 1.462982(s)\n", 441 | "108 0.746368 0.850706 1.390665(s)\n", 442 | "109 0.744704 0.850997 1.406251(s)\n", 443 | "110 0.745588 0.850987 1.399718(s)\n", 444 | "111 0.743731 0.851158 1.598807(s)\n", 445 | "112 0.744651 0.851077 1.611395(s)\n", 446 | "113 0.744472 0.850991 1.437518(s)\n", 447 | "114 0.744883 0.851003 1.408719(s)\n", 448 | "115 0.744321 0.850906 1.415562(s)\n", 449 | "116 0.744179 0.851158 1.406284(s)\n", 450 | "117 0.744853 0.851024 1.486450(s)\n", 451 | "118 0.743401 0.850973 1.420012(s)\n", 452 | "119 0.744809 0.851009 1.399587(s)\n", 453 | "120 0.744726 0.851097 1.390638(s)\n", 454 | "121 0.743952 0.850803 1.446391(s)\n", 455 | "122 0.744973 0.850798 1.439853(s)\n", 456 | "123 0.744382 0.850887 1.431332(s)\n", 457 | "124 0.744419 0.850841 1.460428(s)\n", 458 | "125 0.743825 0.851252 1.468669(s)\n", 459 | "126 0.744768 0.850956 1.406268(s)\n", 460 | "127 0.743264 0.850907 1.462467(s)\n", 461 | "128 0.743480 0.850931 1.390640(s)\n", 462 | "129 0.743621 0.851018 1.391578(s)\n", 463 | "130 0.744046 0.850966 1.407784(s)\n", 464 | "131 0.743349 0.850969 1.390645(s)\n", 465 | "132 0.743841 0.850785 1.462579(s)\n", 466 | "133 0.743074 0.850948 1.458358(s)\n", 467 | "134 0.744358 0.850806 1.407487(s)\n", 468 | "135 0.743972 0.851127 1.390640(s)\n", 469 | "136 0.743227 0.851148 1.406265(s)\n", 470 | "137 0.742984 0.851232 1.447071(s)\n", 471 | "138 0.744403 0.851532 1.411106(s)\n", 472 | "139 0.743451 0.851401 1.469621(s)\n", 473 | "140 0.743391 0.851384 1.390645(s)\n", 474 | "141 0.744516 0.851492 1.406285(s)\n", 475 | "142 0.743470 
0.851447 1.410015(s)\n", 476 | "143 0.743198 0.851322 1.420781(s)\n", 477 | "144 0.744412 0.851270 1.403787(s)\n", 478 | "145 0.742384 0.851284 1.390643(s)\n", 479 | "146 0.743339 0.851364 1.399891(s)\n", 480 | "147 0.742802 0.851247 1.395700(s)\n", 481 | "148 0.742878 0.851421 1.469909(s)\n", 482 | "149 0.743484 0.851321 1.399727(s)\n", 483 | "150 0.743502 0.851572 1.399970(s)\n", 484 | "151 0.743406 0.851571 1.399521(s)\n", 485 | "152 0.742925 0.851396 1.395897(s)\n", 486 | "153 0.742553 0.851295 1.451918(s)\n", 487 | "154 0.743613 0.851278 1.406268(s)\n", 488 | "155 0.741762 0.851363 1.525129(s)\n", 489 | "156 0.743210 0.851457 1.406286(s)\n", 490 | "157 0.743032 0.851381 1.395900(s)\n", 491 | "158 0.741658 0.851501 1.455009(s)\n", 492 | "159 0.743116 0.851250 1.406258(s)\n", 493 | "160 0.743059 0.851320 1.399649(s)\n", 494 | "161 0.743155 0.851130 1.406287(s)\n", 495 | "162 0.741716 0.851186 1.395491(s)\n", 496 | "163 0.742589 0.851172 1.439167(s)\n", 497 | "164 0.742117 0.850974 1.468770(s)\n", 498 | "165 0.742390 0.851116 1.409128(s)\n", 499 | "166 0.744096 0.851254 1.435538(s)\n", 500 | "167 0.742634 0.851376 1.388916(s)\n", 501 | "168 0.741646 0.851275 1.457746(s)\n", 502 | "169 0.742897 0.851177 1.366868(s)\n", 503 | "170 0.743052 0.851266 1.426625(s)\n", 504 | "171 0.742376 0.851271 1.403697(s)\n", 505 | "172 0.742742 0.851338 1.390625(s)\n", 506 | "173 0.742256 0.851197 1.453141(s)\n", 507 | "174 0.742268 0.851046 1.399479(s)\n", 508 | "175 0.742002 0.850905 1.383932(s)\n", 509 | "176 0.741890 0.851078 1.421894(s)\n", 510 | "177 0.743085 0.851033 1.406270(s)\n", 511 | "178 0.741955 0.850917 1.416128(s)\n", 512 | "179 0.742171 0.851150 1.442452(s)\n", 513 | "180 0.742805 0.851376 1.423072(s)\n", 514 | "181 0.741585 0.851463 1.437518(s)\n", 515 | "182 0.742271 0.851443 1.437522(s)\n", 516 | "183 0.743029 0.851565 1.461741(s)\n", 517 | "184 0.742284 0.851456 1.504725(s)\n", 518 | "185 0.741653 0.851479 1.454393(s)\n", 519 | "186 0.743889 0.851576 1.564915(s)\n", 520 | "187 0.742872 0.851446 1.650403(s)\n", 521 | "188 0.741979 0.851394 1.431463(s)\n", 522 | "189 0.742107 0.851101 1.421539(s)\n", 523 | "190 0.742485 0.851297 1.406282(s)\n", 524 | "191 0.740788 0.851228 1.415401(s)\n", 525 | "192 0.742113 0.851329 1.414911(s)\n", 526 | "193 0.741579 0.851133 1.462146(s)\n", 527 | "194 0.742999 0.851144 1.455878(s)\n", 528 | "195 0.742513 0.851250 1.415932(s)\n", 529 | "196 0.743028 0.851395 1.390641(s)\n", 530 | "197 0.742302 0.851131 1.579862(s)\n", 531 | "198 0.741136 0.851173 1.607915(s)\n", 532 | "199 0.741375 0.851128 1.723560(s)\n" 533 | ] 534 | } 535 | ], 536 | "source": [ 537 | "# 完成实际的训练\n", 538 | "svd(df_train, df_test)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": true 546 | }, 547 | "outputs": [], 548 | "source": [] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.5.2" 568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 2 572 | } 573 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Music Recommendation System 2 | #### Language: Python 3.5 3 | #### Library: Surprise 4 | #### Platform: Jupyter Notebook 5 | #### Description: A recommender similar to NetEase Cloud Music's playlist recommendation and similar-song recommendation. 6 | ### 1. Data Acquisition 7 | A crawler collected playlists from NetEase Cloud Music covering 800k songs and 4M+ favorites; the data is stored as JSON, 3.59 GB in total. The format is as follows (a loading sketch follows the samples): 8 |
  9 | 1) Format of each playlist
 10 | {
 11 |     "result": {
 12 |         "id": 111450065,
 13 |         "status": 0,
 14 |         "commentThreadId": "A_PL_0_111450065",
 15 |         "trackCount": 120,
 16 |         "updateTime": 1460164523907,
 17 |         "commentCount": 227,
 18 |         "ordered": true,
 19 |         "anonimous": false,
 20 |         "highQuality": false,
 21 |         "subscribers": [],
 22 |         "playCount": 687070,
 23 |         "trackNumberUpdateTime": 1460164523907,
 24 |         "createTime": 1443528317662,
 25 |         "name": "带本书去旅行吧,人生最美好的时光在路上。",
 26 |         "cloudTrackCount": 0,
 27 |         "shareCount": 149,
 28 |         "adType": 0,
 29 |         "trackUpdateTime": 1494134249465,
 30 |         "userId": 39256799,
 31 |         "coverImgId": 3359008023885470,
 32 |         "coverImgUrl": "http://p1.music.126.net/2ZFcuSJ6STR8WgzkIi2U-Q==/3359008023885470.jpg",
 33 |         "artists": null,
 34 |         "newImported": false,
 35 |         "subscribed": false,
 36 |         "privacy": 0,
 37 |         "specialType": 0,
 38 |         "description": "现在是一年中最美好的时节,世界上很多地方都不冷不热,有湛蓝的天空和清冽的空气,正是出游的好时光。长假将至,你是不是已经收拾行装准备出发了?行前焦虑症中把衣服、洗漱用品、充电器之类东西忙忙碌碌地丢进箱子,打进背包的时候,我打赌你肯定会留个位置给一位好朋友:书。不是吗?不管是打发时间,小读怡情,还是为了做好攻略备不时之需,亦或是为了小小地装上一把,你都得有一本书傍身呀。读大仲马,我是复仇的伯爵;读柯南道尔,我穿梭在雾都的暗夜;读村上春树,我是寻羊的冒险者;读马尔克斯,目睹百年家族兴衰;读三毛,让灵魂在撒哈拉流浪;读老舍,嗅着老北京的气息;读海茵莱茵,于科幻狂流遨游;读卡夫卡,在城堡中审判……读书的孩子不会孤单,读书的孩子永远幸福。",
 39 |         "subscribedCount": 10882,
 40 |         "totalDuration": 0,
 41 |         "tags": [
 42 |             "旅行",
 43 |             "钢琴",
 44 |             "安静"],
 45 |         "creator": {
 46 |             "followed": false,
 47 |             "remarkName": null,
 48 |             "expertTags": [
 49 |                 "古典",
 50 |                 "民谣",
 51 |                 "华语"
 52 |             ],
 53 |             "userId": 39256799,
 54 |             "authority": 0,
 55 |             "userType": 0,
 56 |             "gender": 1,
 57 |             "backgroundImgId": 3427177752524551,
 58 |             "city": 360600,
 59 |             "mutual": false,
 60 |             "avatarUrl": "http://p1.music.126.net/TLRTrJpOM5lr68qJv1IyGQ==/1400777825738419.jpg",
 61 |             "avatarImgIdStr": "1400777825738419",
 62 |             "detailDescription": "",
 63 |             "province": 360000,
 64 |             "description": "",
 65 |             "birthday": 637516800000,
 66 |             "nickname": "有梦人生不觉寒",
 67 |             "vipType": 0,
 68 |             "avatarImgId": 1400777825738419,
 69 |             "defaultAvatar": false,
 70 |             "djStatus": 0,
 71 |             "accountStatus": 0,
 72 |             "backgroundImgIdStr": "3427177752524551",
 73 |             "backgroundUrl": "http://p1.music.126.net/LS96S_6VP9Hm7-T447-X0g==/3427177752524551.jpg",
 74 |             "signature": "漫无目的的乱听,听着,听着,竟然灵魂出窍了。更多精品音乐美图分享请加我微信hu272367751。微信是我的精神家园,有我最真诚的分享。",
 75 |             "authStatus": 0},
 76 |         "tracks": [{歌曲1},{歌曲2}, ...]
 77 |      }
 78 | }
 79 | 2) Format of each song:
 80 | {
 81 |     "id": 29738501,
 82 |     "name": "跟着你到天边 钢琴版",
 83 |     "duration": 174001,
 84 |     "hearTime": 0,
 85 |     "commentThreadId": "R_SO_4_29738501",
 86 |     "score": 40,
 87 |     "mvid": 0,
 88 |     "hMusic": null,
 89 |     "disc": "",
 90 |     "fee": 0,
 91 |     "no": 1,
 92 |     "rtUrl": null,
 93 |     "ringtone": null,
 94 |     "rtUrls": [],
 95 |     "rurl": null,
 96 |     "status": 0,
 97 |     "ftype": 0,
 98 |     "mp3Url": "http://m2.music.126.net/vrVa20wHs8iIe0G8Oe7I9Q==/3222668581877701.mp3",
 99 |     "audition": null,
100 |     "playedNum": 0,
101 |     "copyrightId": 0,
102 |     "rtype": 0,
103 |     "crbt": null,
104 |     "popularity": 40,
105 |     "dayPlays": 0,
106 |     "alias": [],
107 |     "copyFrom": "",
108 |     "position": 1,
109 |     "starred": false,
110 |     "starredNum": 0,
111 |     "bMusic": {
112 |         "name": "跟着你到天边 钢琴版",
113 |         "extension": "mp3",
114 |         "volumeDelta": 0.0553125,
115 |         "sr": 44100,
116 |         "dfsId": 3222668581877701,
117 |         "playTime": 174001,
118 |         "bitrate": 96000,
119 |         "id": 52423394,
120 |         "size": 2089713
121 |     },
122 |     "lMusic": {
123 |         "name": "跟着你到天边 钢琴版",
124 |         "extension": "mp3",
125 |         "volumeDelta": 0.0553125,
126 |         "sr": 44100,
127 |         "dfsId": 3222668581877701,
128 |         "playTime": 174001,
129 |         "bitrate": 96000,
130 |         "id": 52423394,
131 |         "size": 2089713
132 |     },
133 |     "mMusic": {
134 |         "name": "跟着你到天边 钢琴版",
135 |         "extension": "mp3",
136 |         "volumeDelta": -0.000265076,
137 |         "sr": 44100,
138 |         "dfsId": 3222668581877702,
139 |         "playTime": 174001,
140 |         "bitrate": 128000,
141 |         "id": 52423395,
142 |         "size": 2785510
143 |     },
144 |     "artists": [
145 |         {
146 |         "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
147 |         "name": "群星",
148 |         "briefDesc": "",
149 |         "albumSize": 0,
150 |         "img1v1Id": 0,
151 |         "musicSize": 0,
152 |         "alias": [],
153 |         "picId": 0,
154 |         "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
155 |         "trans": "",
156 |         "id": 122455
157 |         }
158 |     ],
159 |     "album": {
160 |         "id": 3054006,
161 |         "status": 2,
162 |         "type": null,
163 |         "tags": "",
164 |         "size": 69,
165 |         "blurPicUrl": "http://p1.music.126.net/2XLMVZhzVZCOunaRCOQ7Bg==/3274345629219531.jpg",
166 |         "copyrightId": 0,
167 |         "name": "热门华语248",
168 |         "companyId": 0,
169 |         "songs": [],
170 |         "description": "",
171 |         "pic": 3274345629219531,
172 |         "commentThreadId": "R_AL_3_3054006",
173 |         "publishTime": 1388505600004,
174 |         "briefDesc": "",
175 |         "company": "",
176 |         "picId": 3274345629219531,
177 |         "alias": [],
178 |         "picUrl": "http://p1.music.126.net/2XLMVZhzVZCOunaRCOQ7Bg==/3274345629219531.jpg",
179 |         "artists": [
180 |         {
181 |             "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
182 |             "name": "群星",
183 |             "briefDesc": "",
184 |             "albumSize": 0,
185 |             "img1v1Id": 0,
186 |             "musicSize": 0,
187 |             "alias": [],
188 |             "picId": 0,
189 |             "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
190 |             "trans": "",
191 |             "id": 122455
192 |         }
193 |         ],
194 |         "artist": {
195 |         "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
196 |         "name": "",
197 |         "briefDesc": "",
198 |         "albumSize": 0,
199 |         "img1v1Id": 0,
200 |         "musicSize": 0,
201 |         "alias": [],
202 |         "picId": 0,
203 |         "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
204 |         "trans": "",
205 |         "id": 0
206 |         }
207 |     }
208 | }
209 | 
210 | 
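For reference, the crawl output stores one such JSON object per line; a minimal loading sketch (the file name is a placeholder, the field names are taken from the samples above):

```python
import json

# one playlist JSON object per line (file name is a placeholder)
with open("playlist_detail.json", encoding="utf-8") as f:
    for line in f:
        playlist = json.loads(line)["result"]
        print(playlist["id"], playlist["name"], playlist["subscribedCount"])
        for track in playlist["tracks"]:
            print(track["id"], track["name"], track["artists"][0]["name"], track["popularity"])
        break  # inspect only the first record
```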
211 | ### 2. Data Parsing 212 | #### 2.1 Raw data => playlist data 213 | Extract 4 playlist-level fields: playlist name, playlist id, favorite count, category 214 | Extract 4 song-level fields: song id, song name, artist, song popularity 215 | 216 | Organize them into the following format (a parsing sketch follows the sample): 217 |
218 | 漫步西欧小镇上##小语种,旅行##69413685##474    18682332::Wäg vo dir::Joy Amelie::70.0    4335372::Only When I Sleep::The Corrs::60.0    2925502::Si Seulement::Lynnsha::100.0    21014930::Tu N'As Pas Cherché...::La Grande Sophie::100.0    20932638::Du behöver aldrig mer vara rädd::Lasse Lindh::25.0    17100518::Silent Machine::Cat Power::60.0    3308096::Kor pai kon diew : ชอไปคนเดียว::Palmy::5.0    1648250::les choristes::Petits Chanteurs De Saint Marc::100.0    4376212::Paddy's Green Shamrock Shore::The High Kings::25.0    2925400::A Todo Color::Las Escarlatinas::95.0    19711402::Comme Toi::Vox Angeli::75.0    3977526::Stay::Blue Cafe::100.0    2538518::Shake::Elize::85.0    2866799::Mon Ange::Jena Lee::85.0    5191949::Je M'appelle Helene::Hélène Rolles::85.0    20036323::Ich Lieb' Dich Immer Noch So Sehr::Kate & Ben::100.0
219 | 
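A minimal sketch of this parsing step (field names are taken from the JSON format in section 1; real data needs extra error handling for missing fields):

```python
import json

def parse_playlist_line(in_line):
    # name##tags##id##subscribed_count \t song_id::song_name::artist::popularity \t ...
    data = json.loads(in_line)["result"]
    head = "##".join([data["name"], ",".join(data["tags"]),
                      str(data["id"]), str(data["subscribedCount"])])
    songs = ["::".join([str(t["id"]), t["name"],
                        t["artists"][0]["name"], str(float(t["popularity"]))])
             for t in data["tracks"]]
    return head + "\t" + "\t".join(songs)
```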
220 | #### 2.2 Playlist data => recommender-format data 221 | The most basic data format supported by mainstream Python recommender frameworks is the MovieLens dataset, whose rating format is user item rating timestamp; the data is converted into this format. 222 | #### 2.3 Save playlist and song info for later use 223 | Save the playlist id => playlist name and song id => song name mappings 224 | 225 | ### 3. Completing the project with the Python recommendation library Surprise 226 | #### 3.1 Build models with collaborative filtering and make predictions 227 | ##### 3.1.1 Recommending playlists 228 | ![Recommended playlists](./images/1.jpg) 229 | ##### 3.1.2 Recommending songs 230 | ![Recommended songs](./images/2.jpg) 231 | Other algorithms could be used as well, e.g. (a minimal Surprise sketch follows the list): 232 |
233 | baseline algorithms
234 | neighborhood methods (collaborative filtering)
235 | matrix factorization-based (SVD, PMF, SVD++, NMF)
236 | 
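A minimal Surprise sketch for this step (the file name is a placeholder; the input is the user item rating timestamp data from section 2.2; in older Surprise versions `fit` was called `train`):

```python
from surprise import Dataset, Reader, KNNBaseline

# user item rating timestamp, as produced in section 2.2 (file name is a placeholder)
reader = Reader(line_format="user item rating timestamp", sep=",")
data = Dataset.load_from_file("./pro_data/music_ratings.txt", reader=reader)
trainset = data.build_full_trainset()

# item-based collaborative filtering with cosine similarity
algo = KNNBaseline(sim_options={"name": "cosine", "user_based": False})
algo.fit(trainset)

# "songs similar to this song" = nearest neighbours in the similarity matrix
inner_id = trainset.to_inner_iid("287035")   # a raw song id (hypothetical)
neighbors = algo.get_neighbors(inner_id, k=10)
print([trainset.to_raw_iid(i) for i in neighbors])
```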
237 | 238 | ### 4. Evaluating different recommendation algorithms 239 | Different evaluation metrics can be used, e.g. (a cross-validation sketch follows the list): 240 |
241 | rmse Compute RMSE (Root Mean Squared Error).
242 | mae Compute MAE (Mean Absolute Error).
243 | fcp Compute FCP (Fraction of Concordant Pairs).
244 | 
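A sketch of comparing algorithms on these metrics, assuming the `data` object from the sketch above (`cross_validate` is the API in recent Surprise versions; older ones used `evaluate`):

```python
from surprise import SVD, NMF, KNNBasic
from surprise.model_selection import cross_validate

# 3-fold cross-validation on RMSE / MAE / FCP for several algorithms
for algo in (SVD(), NMF(), KNNBasic()):
    results = cross_validate(algo, data, measures=["RMSE", "MAE", "FCP"], cv=3, verbose=True)
    print(type(algo).__name__, results["test_rmse"].mean())
```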
245 | -------------------------------------------------------------------------------- /Sequence Modelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 歌曲序列建模\n", 8 | "### 从word2vec到song2vec\n", 9 | "把歌曲的id序列取出来,类比于分完词后的句子,送到word2vec中去学习" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 8, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#coding: utf-8\n", 19 | "import multiprocessing\n", 20 | "import gensim\n", 21 | "import sys\n", 22 | "from random import shuffle\n", 23 | "\n", 24 | "def parse_playlist_get_sequence(in_line, playlist_sequence):\n", 25 | " song_sequence = []\n", 26 | " contents = in_line.strip().split(\"\\t\")\n", 27 | " # 解析歌单序列\n", 28 | " for song in contents[1:]:\n", 29 | " try:\n", 30 | " song_id, song_name, artist, popularity = song.split(\"::\")\n", 31 | " song_sequence.append(song_id)\n", 32 | " except:\n", 33 | " print (\"song format error\")\n", 34 | " print (song+\"\\n\")\n", 35 | " for i in range(len(song_sequence)):\n", 36 | " shuffle(song_sequence)\n", 37 | " playlist_sequence.append(song_sequence)\n", 38 | "\n", 39 | "\n", 40 | "def train_song2vec(in_file, out_file):\n", 41 | " #所有歌单序列\n", 42 | " playlist_sequence = []\n", 43 | " #遍历所有歌单\n", 44 | " for line in open(in_file, encoding='utf-8'):\n", 45 | " parse_playlist_get_sequence(line, playlist_sequence)\n", 46 | " #使用word2vec训练\n", 47 | " cores = multiprocessing.cpu_count()\n", 48 | " print (\"using all \"+str(cores)+\" cores\")\n", 49 | " print (\"Training word2vec model...\")\n", 50 | " model = gensim.models.Word2Vec(sentences=playlist_sequence, size=150, min_count=3, window=7, workers=cores)\n", 51 | " print (\"Saving model...\")\n", 52 | " model.save(out_file)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 9, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "song format error\n", 65 | "1870957::彩云国物语 セカンドシリーズ::君を想う::梁邦彦::80.0\n", 66 | "\n", 67 | "song format error\n", 68 | "4965888::桃华月惮::龙皇-リュウオウ-::多田彰文::25.0\n", 69 | "\n", 70 | "song format error\n", 71 | "456177::true tears::一阵の风::菊地創::95.0\n", 72 | "\n", 73 | "song format error\n", 74 | "22642373::\n", 75 | "\n", 76 | "song format error\n", 77 | " FAIRY TAIL メインテーマ -Slow ver.-::高梨康治::95.0\n", 78 | "\n", 79 | "song format error\n", 80 | "31563610::\n", 81 | "\n", 82 | "song format error\n", 83 | "苍之礼赞::花之祭P::60.0\n", 84 | "\n", 85 | "song format error\n", 86 | "4954593::リズム天国全曲集::恋の実験室::V.A.::55.0\n", 87 | "\n", 88 | "song format error\n", 89 | "4954596::リズム天国全曲集::シンクロ::V.A.::60.0\n", 90 | "\n", 91 | "song format error\n", 92 | "31654811::\n", 93 | "\n", 94 | "song format error\n", 95 | "American Cowboys::Tim Wynn::65.0\n", 96 | "\n", 97 | "song format error\n", 98 | "19169096::\n", 99 | "\n", 100 | "song format error\n", 101 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n", 102 | "\n", 103 | "song format error\n", 104 | "31563610::\n", 105 | "\n", 106 | "song format error\n", 107 | "苍之礼赞::花之祭P::60.0\n", 108 | "\n", 109 | "song format error\n", 110 | "31563610::\n", 111 | "\n", 112 | "song format error\n", 113 | "苍之礼赞::花之祭P::60.0\n", 114 | "\n", 115 | "song format error\n", 116 | "31563610::\n", 117 | "\n", 118 | "song format error\n", 119 | "苍之礼赞::花之祭P::60.0\n", 120 | "\n", 121 | "song format error\n", 122 | "19169096::\n", 123 | "\n", 124 | "song format 
error\n", 125 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n", 126 | "\n", 127 | "song format error\n", 128 | "376653::野弧禅狂叱(宿香之战)\n", 129 | "\n", 130 | "song format error\n", 131 | "::霹雳英雄::5.0\n", 132 | "\n", 133 | "song format error\n", 134 | "374524::赎?罪\n", 135 | "\n", 136 | "song format error\n", 137 | "赎罪岩::霹雳英雄::15.0\n", 138 | "\n", 139 | "song format error\n", 140 | "31563610::\n", 141 | "\n", 142 | "song format error\n", 143 | "苍之礼赞::花之祭P::65.0\n", 144 | "\n", 145 | "song format error\n", 146 | "37610597::ダウンタウン熱血物語::公園/河原にて~ひとときのやすらぎ~::V.A.::75.0\n", 147 | "\n", 148 | "song format error\n", 149 | "37610748::くにおくんの熱血サッカーリーグ::ねっけつ たいふーん♪::V.A.::80.0\n", 150 | "\n", 151 | "song format error\n", 152 | "37610755::くにおくんの熱血サッカーリーグ::てくのす じゃぱん かっぷの てーま♪::V.A.::75.0\n", 153 | "\n", 154 | "song format error\n", 155 | "37610745::くにおくんの熱血サッカーリーグ::ゲームモード選択::V.A.::75.0\n", 156 | "\n", 157 | "song format error\n", 158 | "37610643::ダウンタウン熱血行進曲 それゆけ大運動会::オープニングファンファーレ::V.A.::70.0\n", 159 | "\n", 160 | "song format error\n", 161 | "33054290::\n", 162 | "\n", 163 | "song format error\n", 164 | "Heartbeats::Dabin::90.0\n", 165 | "\n", 166 | "song format error\n", 167 | "405599088::Make Them Wheels Roll\n", 168 | "\n", 169 | "song format error\n", 170 | "::SAFIA::100.0\n", 171 | "\n", 172 | "song format error\n", 173 | "424496188::大王叫我来巡山 - (原唱:\n", 174 | "\n", 175 | "song format error\n", 176 | " 贾乃亮/贾云馨)::流浪的蛙蛙::65.0\n", 177 | "\n", 178 | "song format error\n", 179 | "19169096::\n", 180 | "\n", 181 | "song format error\n", 182 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n", 183 | "\n", 184 | "song format error\n", 185 | "26902203::What’s your name? (collaboration with 壇蜜)\n", 186 | "\n", 187 | "song format error\n", 188 | "::SoulJa::100.0\n", 189 | "\n", 190 | "song format error\n", 191 | "33054290::\n", 192 | "\n", 193 | "song format error\n", 194 | "Heartbeats::Dabin::95.0\n", 195 | "\n", 196 | "song format error\n", 197 | "4954596::リズム天国全曲集::シンクロ::V.A.::60.0\n", 198 | "\n", 199 | "song format error\n", 200 | "32272105::\n", 201 | "\n", 202 | "song format error\n", 203 | "Wonderful Love (DJ Raf Remix)::Money Penny::95.0\n", 204 | "\n", 205 | "song format error\n", 206 | "33054290::\n", 207 | "\n", 208 | "song format error\n", 209 | "Heartbeats::Dabin::95.0\n", 210 | "\n", 211 | "song format error\n", 212 | "427373827::Champions (From \"Hands of Stone\") \n", 213 | "\n", 214 | "song format error\n", 215 | "::Usher::30.0\n", 216 | "\n", 217 | "song format error\n", 218 | "29242687::「コード・エテスウェイ (Class::ETHES_WEI=>extends.COMMUNI_SAT/.)」::霜月はるか::70.0\n", 219 | "\n", 220 | "using all 4 cores\n", 221 | "Training word2vec model...\n", 222 | "Saving model...\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "song_sequence_file = \"./ori_data/popular.playlist\"\n", 228 | "model_file = \"./model/song2vec.model\"\n", 229 | "train_song2vec(song_sequence_file, model_file)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "### 预测的过程,实际上就是对某首歌曲,查找“最近”的歌曲(向量距离最近的歌曲)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "import pickle\n", 248 | "song_dic = pickle.load(open(\"./pro_data/popular_song.pkl\",\"rb\"))\n", 249 | "model_str = \"./model/song2vec.model\"\n", 250 | "model = gensim.models.Word2Vec.load(model_str)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 
12, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "315958 那件疯狂的小事叫爱情\t袁泉\n", 263 | "28138980 为你我受冷风吹\t孙露\n", 264 | "247526 彗星的眼泪\t金莎\n", 265 | "5280395 慨古吟(琴歌)\t张铜霞\n", 266 | "31140395 一首简单的歌\t本兮\n", 267 | "27532150 Smoke Fly ft. JBo Escobar & Khaki\tAl Rocco\n", 268 | "440767373 メドゥーサ(美杜莎)\t月蝕原创音乐\n", 269 | "16323636 The Prayer\tAndrea Bocelli\n", 270 | "281436 夜曲\t彭芳\n", 271 | "5270404 渴望(二胡)\t群星\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "for song in list(song_dic.keys())[:10]:\n", 277 | " print (song, song_dic[song])" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stderr", 287 | "output_type": "stream", 288 | "text": [ 289 | "D:\\Anaconda\\install\\lib\\site-packages\\ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n", 290 | " This is separate from the ipykernel package so we can avoid doing imports until\n" 291 | ] 292 | }, 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "368971 Ambulance of love\t脑浊\n", 298 | "\n", 299 | "相似歌曲 和 相似度 分别为:\n", 300 | "\t 新世界\t呼吸 0.8102102279663086\n", 301 | "\t 上苍保佑吃完了饭的人民\t张楚 0.8082322478294373\n", 302 | "\t 呀呀\t图腾 0.7943791151046753\n", 303 | "\t 昨日我从清晨开始等待\t钟立风 0.774426281452179\n", 304 | "\t 生命(Live) - live\t声音玩具 0.7557182312011719\n", 305 | "\t 两天\t许巍 0.7442638874053955\n", 306 | "\t 永远在一起\t飘乐队 0.7283462285995483\n", 307 | "\t 今夜\t许巍 0.7184534072875977\n", 308 | "\t 祖先的阴影\t超载 0.7170863747596741\n", 309 | "\t 我们走过的路\t天空 0.7092562913894653\n", 310 | "\n", 311 | "\n", 312 | "33599059 八秒之语\t洛天依\n", 313 | "\n", 314 | "相似歌曲 和 相似度 分别为:\n", 315 | "\t 乡村DISCO\tVOCALOID 0.6794760227203369\n", 316 | "\t 春雨\t乐正绫 0.649375319480896\n", 317 | "\t 远恋\t阿良良木健 0.6485384702682495\n", 318 | "\t 食之歌 VOCALOID Ver.\t泛音堂 0.6450830698013306\n", 319 | "\t 小幸运(Cover:田馥甄)\t星魂梦 0.6226800680160522\n", 320 | "\t 出格\t阿妍 0.6186865568161011\n", 321 | "\t 双向监禁\t洛天依 0.6172651052474976\n", 322 | "\t 山海默示录(洛天依版)\t小旭PRO 0.6170870661735535\n", 323 | "\t 甄姬\tVOCALOID 0.613639771938324\n", 324 | "\t 全世界都死了\t海鲜面 0.6097506284713745\n", 325 | "\n", 326 | "\n", 327 | "408332846 知足\t苏运莹\n", 328 | "\n", 329 | "相似歌曲 和 相似度 分别为:\n", 330 | "\t 垃圾车(cover 五月天)\t李昂星 0.714142918586731\n", 331 | "\t Happy Birth Day\t香蕉 0.7014893293380737\n", 332 | "\t 拥抱(Cover 五月天)\t橙大蕾蕾 0.6977276802062988\n", 333 | "\t 爱情的模样\t小平 0.6919869780540466\n", 334 | "\t 听不到(Live)\t梁静茹 0.6528257727622986\n", 335 | "\t 我就是这样的\t黄贯中 0.6462737917900085\n", 336 | "\t 拥抱(Cover 五月天)\t燕子姐姐弹吉他 0.6430615186691284\n", 337 | "\t 穿越时空遇见你\t萧亚轩 0.6319111585617065\n", 338 | "\t 神奇\t孙燕姿 0.6182767152786255\n", 339 | "\t 一个人的圣诞节\t张赫宣 0.6101440191268921\n", 340 | "\n", 341 | "\n", 342 | "34072696 酒馆小调\t洛天依\n", 343 | "\n", 344 | "相似歌曲 和 相似度 分别为:\n", 345 | "\t 菌裂\t言和 0.7842902541160583\n", 346 | "\t 妄想不到的恋曲\t烂兔子 0.748408317565918\n", 347 | "\t 女王\t洛天依 0.7209190726280212\n", 348 | "\t 清醒的梦 \tVilokun feat.言和 0.7056742906570435\n", 349 | "\t Mr 坷垃\t言和 0.7054739594459534\n", 350 | "\t 【心华】乌龟家的茶社【ick】\t缺钙体质ick 0.7002834677696228\n", 351 | "\t 偶像进行时\t言和 0.698686420917511\n", 352 | "\t 乐正绫 - 拉斯维加斯\t慕晓社 0.6933234333992004\n", 353 | "\t 流星之愿\t洛天依&言和 0.6909075975418091\n", 354 | "\t 病态的我\t洛天依 0.6879562139511108\n", 355 | "\n", 356 | "\n", 357 | "279713 十六夜的樱丘\t梦璟SAYA\n", 358 | "\n", 359 | "相似歌曲 和 相似度 分别为:\n", 360 | "\t 合金三国-沛县\t灰原穷 
0.7301620841026306\n", 361 | "\t 岁月友情演唱会Live\t聂予词 0.7063505053520203\n", 362 | "\t 克罗地亚狂想曲(中文填词版)\t少年霜 0.6886762976646423\n", 363 | "\t 言君安\t倾夜 0.6869547963142395\n", 364 | "\t 凤鸣曲\t音频怪物 0.6675935387611389\n", 365 | "\t 相忘江湖\t玄觞 0.6578972935676575\n", 366 | "\t 伊人\t魏晨 0.6283127665519714\n", 367 | "\t 失落的遗迹:Lost Ruins ~ adventurers' tale~\tkaede 0.6206890344619751\n", 368 | "\t 【Moonlight组合】甜蜜具现式\tKBShinya 0.6184070110321045\n", 369 | "\t 老房子的故事【老歌搬家】\tWinky诗 0.6105965971946716\n", 370 | "\n", 371 | "\n", 372 | "33004911 听妈妈讲那过去的事情\t群星\n", 373 | "\n", 374 | "相似歌曲 和 相似度 分别为:\n", 375 | "\t 国旗多美丽\t群星 0.9948954582214355\n", 376 | "\t 劳动最光荣\t杨烁 0.9924089312553406\n", 377 | "\t 数青蛙\t群星 0.9401867985725403\n", 378 | "\t 儿童歌曲大联唱B\t群星 0.9401167631149292\n", 379 | "\t 母鸭带小鸭\t杨烁 0.937639594078064\n", 380 | "\t 嘀哩,嘀哩\t中央人民广播电台少年儿童合唱团 0.931469738483429\n", 381 | "\t 小小少年\t韩征 0.9063182473182678\n", 382 | "\t 世上只有妈妈好\t杨烁 0.893858790397644\n", 383 | "\t 真善美的小世界\t小蓓蕾组合 0.8826833963394165\n", 384 | "\t 我家住在北京城\t苑菁 0.8813650012016296\n", 385 | "\n", 386 | "\n", 387 | "26389372 水色\tUA\n", 388 | "\n", 389 | "相似歌曲 和 相似度 分别为:\n", 390 | "\t 最后の言い訳\t徳永英明 0.7948691248893738\n", 391 | "\t テルーの呗\t手嶌葵 0.7602044343948364\n", 392 | "\t もう君以外爱せない\tKinKi Kids 0.7559913992881775\n", 393 | "\t 氷点\t玉置浩二 0.7401448488235474\n", 394 | "\t 时の过ぎゆくままに\t沢田研二 0.7194046378135681\n", 395 | "\t 时代\t中島みゆき 0.7185744643211365\n", 396 | "\t 手紙 ~拝啓 十五の君へ~\tアンジェラ・アキ 0.6948644518852234\n", 397 | "\t あした\t中島みゆき 0.685759425163269\n", 398 | "\t MR.LONELY\t玉置浩二 0.6696567535400391\n", 399 | "\t Carcrashes [Album Version]\tStandfast 0.6691217422485352\n", 400 | "\n", 401 | "\n", 402 | "166471 别等离开才说爱我\t王志\n", 403 | "\n", 404 | "相似歌曲 和 相似度 分别为:\n", 405 | "\t 分爱 粤语版\t易欣 0.8755930066108704\n", 406 | "\t 反叛(Illegal Mix) - remix\t陈慧娴 0.873873233795166\n", 407 | "\t 背叛\t芭比 0.8577862977981567\n", 408 | "\t 罗盘上的指针\t群星 0.8541814088821411\n", 409 | "\t 最美丽的花\t王绎龙 0.8295896649360657\n", 410 | "\t 各种小曲各种嗨\t珊爷 0.8236122727394104\n", 411 | "\t 不要推我\t群星 0.7918290495872498\n", 412 | "\t 狂舞大麻\t群星 0.7873603105545044\n", 413 | "\t 真的不容易 (DJ阿圣 Remix)\t庄心妍 0.7646005749702454\n", 414 | "\t 看我72变\tM3 0.7498428225517273\n", 415 | "\n", 416 | "\n", 417 | "29414454 心中喜欢就说爱\t好妹妹乐队\n", 418 | "\n", 419 | "相似歌曲 和 相似度 分别为:\n", 420 | "\t 请你给我多一点点的温柔\t秦昊 0.8446059823036194\n", 421 | "\t 熟悉的拥抱 (Demo)\t好妹妹乐队 0.8162363767623901\n", 422 | "\t 秋诗篇篇 \t秦昊 0.8092836737632751\n", 423 | "\t 风从海面吹过来\t好妹妹乐队 0.792778491973877\n", 424 | "\t 军港之夜 \t秦昊 0.78443443775177\n", 425 | "\t 熟悉的拥抱\t好妹妹乐队 0.7571429014205933\n", 426 | "\t 心曲\t好妹妹乐队 0.7473146319389343\n", 427 | "\t 风又吹走了\t好妹妹乐队 0.7431649565696716\n", 428 | "\t 愿在秋天死去 (Demo)\t好妹妹乐队 0.7383122444152832\n", 429 | "\t 四季歌\t秦昊 0.7374083399772644\n", 430 | "\n", 431 | "\n", 432 | "347983 传奇 Legend\t春秋\n", 433 | "\n", 434 | "相似歌曲 和 相似度 分别为:\n", 435 | "\t 开始\t核聚变-G 0.8861181735992432\n", 436 | "\t 抢回一切\t岩浆乐队 0.85029536485672\n", 437 | "\t No One Can Change My Mind\t利事乐队 0.8492539525032043\n", 438 | "\t 武器\t浊乐队 0.809968888759613\n", 439 | "\t 【填翻】城池\t妖痴 0.786383867263794\n", 440 | "\t 杀手\t战斧 0.7508671879768372\n", 441 | "\t 梦魔\t大红袍 0.7423715591430664\n", 442 | "\t 梦已成\"血\"\t液氧罐头 0.7245020866394043\n", 443 | "\t 大鱼\t三火SAMA 0.7139255404472351\n", 444 | "\t 是什么,让我们留在这里?\t夜叉 0.7099397778511047\n", 445 | "\n", 446 | "\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "song_id_list = list(song_dic.keys())[1000:1500:50]\n", 452 | "for song_id in song_id_list:\n", 453 | " result_song_list = model.most_similar(song_id)\n", 454 | "\n", 455 | " print (song_id, 
song_dic[song_id])\n", 456 | " print (\"\\n相似歌曲 和 相似度 分别为:\")\n", 457 | " for song in result_song_list:\n", 458 | " print (\"\\t\", song_dic[song[0]], song[1])\n", 459 | " print (\"\\n\")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [] 470 | } 471 | ], 472 | "metadata": { 473 | "kernelspec": { 474 | "display_name": "Python 3", 475 | "language": "python", 476 | "name": "python3" 477 | }, 478 | "language_info": { 479 | "codemirror_mode": { 480 | "name": "ipython", 481 | "version": 3 482 | }, 483 | "file_extension": ".py", 484 | "mimetype": "text/x-python", 485 | "name": "python", 486 | "nbconvert_exporter": "python", 487 | "pygments_lexer": "ipython3", 488 | "version": "3.5.2" 489 | } 490 | }, 491 | "nbformat": 4, 492 | "nbformat_minor": 2 493 | } 494 | -------------------------------------------------------------------------------- /Spark Recommendation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## pyspark协同过滤" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### user-based协同过滤" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "#-*- coding:utf8 -*-\n", 26 | "# pySpark实现的基于用户的协同过滤\n", 27 | "# 使用的余弦相似度\n", 28 | "\n", 29 | "import sys\n", 30 | "from collections import defaultdict\n", 31 | "from itertools import combinations\n", 32 | "import random\n", 33 | "import numpy as np\n", 34 | "import pdb\n", 35 | "\n", 36 | "from pyspark import SparkContext\n", 37 | "\n", 38 | "# user item rating timestamp\n", 39 | "def parseVectorOnUser(line):\n", 40 | " '''\n", 41 | " 解析数据,key是user,后面是item和打分\n", 42 | " '''\n", 43 | " line = line.split(\"|\")\n", 44 | " return line[0],(line[1],float(line[2]))\n", 45 | "\n", 46 | "def parseVectorOnItem(line):\n", 47 | " '''\n", 48 | " 解析数据,key是item,后面是user和打分\n", 49 | " '''\n", 50 | " line = line.split(\"|\")\n", 51 | " return line[1],(line[0],float(line[2]))\n", 52 | "\n", 53 | "def sampleInteractions(item_id,users_with_rating,n):\n", 54 | " '''\n", 55 | " 如果某个商品上用户行为特别多,可以选择适当做点下采样\n", 56 | " '''\n", 57 | " if len(users_with_rating) > n:\n", 58 | " return item_id, random.sample(users_with_rating,n)\n", 59 | " else:\n", 60 | " return item_id, users_with_rating\n", 61 | "\n", 62 | "def findUserPairs(item_id,users_with_rating):\n", 63 | " '''\n", 64 | " 对每个item,找到共同打分的user对\n", 65 | " '''\n", 66 | " for user1,user2 in combinations(users_with_rating,2):\n", 67 | " return (user1[0],user2[0]),(user1[1],user2[1])\n", 68 | "\n", 69 | "def calcSim(user_pair,rating_pairs):\n", 70 | " ''' \n", 71 | " 对每个user对,根据打分计算余弦距离,并返回共同打分的item个数\n", 72 | " '''\n", 73 | " sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)\n", 74 | " \n", 75 | " for rating_pair in rating_pairs:\n", 76 | " sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])\n", 77 | " sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])\n", 78 | " sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])\n", 79 | " # sum_y += rt[1]\n", 80 | " # sum_x += rt[0]\n", 81 | " n += 1\n", 82 | "\n", 83 | " cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))\n", 84 | " return user_pair, (cos_sim,n)\n", 85 | "\n", 86 | "def 
cosine(dot_product,rating_norm_squared,rating2_norm_squared):\n", 87 | " '''\n", 88 | " 2个向量A和B的余弦相似度\n", 89 | " dotProduct(A, B) / (norm(A) * norm(B))\n", 90 | " '''\n", 91 | " numerator = dot_product\n", 92 | " denominator = rating_norm_squared * rating2_norm_squared\n", 93 | "\n", 94 | " return (numerator / (float(denominator))) if denominator else 0.0\n", 95 | "\n", 96 | "def keyOnFirstUser(user_pair,item_sim_data):\n", 97 | " '''\n", 98 | " 对于每个user-user对,用第一个user做key(好像有点粗暴...)\n", 99 | " '''\n", 100 | " (user1_id,user2_id) = user_pair\n", 101 | " return user1_id,(user2_id,item_sim_data)\n", 102 | "\n", 103 | "def nearestNeighbors(user,users_and_sims,n):\n", 104 | " '''\n", 105 | " 选出相似度最高的N个邻居\n", 106 | " '''\n", 107 | " users_and_sims.sort(key=lambda x: x[1][0],reverse=True)\n", 108 | " return user, users_and_sims[:n]\n", 109 | "\n", 110 | "def topNRecommendations(user_id,user_sims,users_with_rating,n):\n", 111 | " '''\n", 112 | " 根据最近的N个邻居进行推荐\n", 113 | " '''\n", 114 | "\n", 115 | " totals = defaultdict(int)\n", 116 | " sim_sums = defaultdict(int)\n", 117 | "\n", 118 | " for (neighbor,(sim,count)) in user_sims:\n", 119 | "\n", 120 | " # 遍历邻居的打分\n", 121 | " unscored_items = users_with_rating.get(neighbor,None)\n", 122 | "\n", 123 | " if unscored_items:\n", 124 | " for (item,rating) in unscored_items:\n", 125 | " if neighbor != item:\n", 126 | "\n", 127 | " # 更新推荐度和相近度\n", 128 | " totals[neighbor] += sim * rating\n", 129 | " sim_sums[neighbor] += sim\n", 130 | "\n", 131 | " # 归一化\n", 132 | " scored_items = [(total/sim_sums[item],item) for item,total in totals.items()]\n", 133 | "\n", 134 | " # 按照推荐度降序排列\n", 135 | " scored_items.sort(reverse=True)\n", 136 | "\n", 137 | " # 推荐度的item\n", 138 | " ranked_items = [x[1] for x in scored_items]\n", 139 | "\n", 140 | " return user_id,ranked_items[:n]\n", 141 | "\n", 142 | "if __name__ == \"__main__\":\n", 143 | " if len(sys.argv) < 3:\n", 144 | " print >> sys.stderr, \\\n", 145 | " \"Usage: PythonUserCF \"\n", 146 | " exit(-1)\n", 147 | "\n", 148 | " sc = SparkContext(sys.argv[1],\"PythonUserCF\")\n", 149 | " lines = sc.textFile(sys.argv[2])\n", 150 | "\n", 151 | " '''\n", 152 | " 处理数据,获得稀疏item-user矩阵:\n", 153 | " item_id -> ((user_1,rating),(user2,rating))\n", 154 | " '''\n", 155 | " item_user_pairs = lines.map(parseVectorOnItem).groupByKey().map(\n", 156 | " lambda p: sampleInteractions(p[0],p[1],500)).cache()\n", 157 | "\n", 158 | " '''\n", 159 | " 获得2个用户所有的item-item对得分组合:\n", 160 | " (user1_id,user2_id) -> [(rating1,rating2),\n", 161 | " (rating1,rating2),\n", 162 | " (rating1,rating2),\n", 163 | " ...]\n", 164 | " '''\n", 165 | " pairwise_users = item_user_pairs.filter(\n", 166 | " lambda p: len(p[1]) > 1).map(\n", 167 | " lambda p: findUserPairs(p[0],p[1])).groupByKey()\n", 168 | "\n", 169 | " '''\n", 170 | " 计算余弦相似度,找到最近的N个邻居:\n", 171 | " (user1,user2) -> (similarity,co_raters_count)\n", 172 | " '''\n", 173 | " user_sims = pairwise_users.map(\n", 174 | " lambda p: calcSim(p[0],p[1])).map(\n", 175 | " lambda p: keyOnFirstUser(p[0],p[1])).groupByKey().map(\n", 176 | " lambda p: nearestNeighbors(p[0],p[1],50))\n", 177 | "\n", 178 | " ''' \n", 179 | " 对每个用户的打分记录整理成如下形式\n", 180 | " user_id -> [(item_id_1, rating_1),\n", 181 | " [(item_id_2, rating_2),\n", 182 | " ...]\n", 183 | " '''\n", 184 | "\n", 185 | " user_item_hist = lines.map(parseVectorOnUser).groupByKey().collect()\n", 186 | "\n", 187 | " ui_dict = {}\n", 188 | " for (user,items) in user_item_hist: \n", 189 | " ui_dict[user] = items\n", 190 | "\n", 191 | " uib = 
sc.broadcast(ui_dict)\n", 192 | "\n", 193 | " '''\n", 194 | " 为每个用户计算Top N的推荐\n", 195 | " user_id -> [item1,item2,item3,...]\n", 196 | " '''\n", 197 | " user_item_recs = user_sims.map(lambda p: topNRecommendations(p[0],p[1],uib.value,100)).collect()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### item-based协同过滤" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "#-*- coding:utf8 -*-\n", 216 | "# pySpark实现的基于物品的协同过滤\n", 217 | "\n", 218 | "import sys\n", 219 | "from collections import defaultdict\n", 220 | "from itertools import combinations\n", 221 | "import numpy as np\n", 222 | "import random\n", 223 | "import csv\n", 224 | "import pdb\n", 225 | "\n", 226 | "from pyspark import SparkContext\n", 227 | "\n", 228 | "def parseVector(line):\n", 229 | " '''\n", 230 | " 解析数据,key是item,后面是user和打分\n", 231 | " '''\n", 232 | " line = line.split(\"|\")\n", 233 | " return line[0],(line[1],float(line[2]))\n", 234 | "\n", 235 | "def sampleInteractions(user_id,items_with_rating,n):\n", 236 | " '''\n", 237 | " 如果某个用户打分行为特别多,可以选择适当做点下采样\n", 238 | " '''\n", 239 | " if len(items_with_rating) > n:\n", 240 | " return user_id, random.sample(items_with_rating,n)\n", 241 | " else:\n", 242 | " return user_id, items_with_rating\n", 243 | "\n", 244 | "def findItemPairs(user_id,items_with_rating):\n", 245 | " '''\n", 246 | " 对每个用户的打分item,组对\n", 247 | " '''\n", 248 | " for item1,item2 in combinations(items_with_rating,2):\n", 249 | " return (item1[0],item2[0]),(item1[1],item2[1])\n", 250 | "\n", 251 | "def calcSim(item_pair,rating_pairs):\n", 252 | " ''' \n", 253 | " 对每个item对,根据打分计算余弦距离,并返回共同打分的user个数\n", 254 | " '''\n", 255 | " sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)\n", 256 | " \n", 257 | " for rating_pair in rating_pairs:\n", 258 | " sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])\n", 259 | " sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])\n", 260 | " sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])\n", 261 | " # sum_y += rt[1]\n", 262 | " # sum_x += rt[0]\n", 263 | " n += 1\n", 264 | "\n", 265 | " cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))\n", 266 | " return item_pair, (cos_sim,n)\n", 267 | "\n", 268 | "def cosine(dot_product,rating_norm_squared,rating2_norm_squared):\n", 269 | " '''\n", 270 | " The cosine between two vectors A, B\n", 271 | " dotProduct(A, B) / (norm(A) * norm(B))\n", 272 | " '''\n", 273 | " numerator = dot_product\n", 274 | " denominator = rating_norm_squared * rating2_norm_squared\n", 275 | " return (numerator / (float(denominator))) if denominator else 0.0\n", 276 | "\n", 277 | "def correlation(size, dot_product, rating_sum, \\\n", 278 | " rating2sum, rating_norm_squared, rating2_norm_squared):\n", 279 | " '''\n", 280 | " 2个向量A和B的相似度\n", 281 | " [n * dotProduct(A, B) - sum(A) * sum(B)] /\n", 282 | " sqrt{ [n * norm(A)^2 - sum(A)^2] [n * norm(B)^2 - sum(B)^2] }\n", 283 | "\n", 284 | " '''\n", 285 | " numerator = size * dot_product - rating_sum * rating2sum\n", 286 | " denominator = sqrt(size * rating_norm_squared - rating_sum * rating_sum) * \\\n", 287 | " sqrt(size * rating2_norm_squared - rating2sum * rating2sum)\n", 288 | "\n", 289 | " return (numerator / (float(denominator))) if denominator else 0.0\n", 290 | "\n", 291 | "def keyOnFirstItem(item_pair,item_sim_data):\n", 292 | " '''\n", 293 | " 
对于每个item-item对,用第一个item做key(好像有点粗暴...)\n", 294 | " '''\n", 295 | " (item1_id,item2_id) = item_pair\n", 296 | " return item1_id,(item2_id,item_sim_data)\n", 297 | "\n", 298 | "def nearestNeighbors(item_id,items_and_sims,n):\n", 299 | " '''\n", 300 | " 排序选出相似度最高的N个邻居\n", 301 | " '''\n", 302 | " items_and_sims.sort(key=lambda x: x[1][0],reverse=True)\n", 303 | " return item_id, items_and_sims[:n]\n", 304 | "\n", 305 | "def topNRecommendations(user_id,items_with_rating,item_sims,n):\n", 306 | " '''\n", 307 | " 根据最近的N个邻居进行推荐\n", 308 | " '''\n", 309 | " \n", 310 | " totals = defaultdict(int)\n", 311 | " sim_sums = defaultdict(int)\n", 312 | "\n", 313 | " for (item,rating) in items_with_rating:\n", 314 | "\n", 315 | " # 遍历item的邻居\n", 316 | " nearest_neighbors = item_sims.get(item,None)\n", 317 | "\n", 318 | " if nearest_neighbors:\n", 319 | " for (neighbor,(sim,count)) in nearest_neighbors:\n", 320 | " if neighbor != item:\n", 321 | "\n", 322 | " # 更新推荐度和相近度\n", 323 | " totals[neighbor] += sim * rating\n", 324 | " sim_sums[neighbor] += sim\n", 325 | "\n", 326 | " # 归一化\n", 327 | " scored_items = [(total/sim_sums[item],item) for item,total in totals.items()]\n", 328 | "\n", 329 | " # 按照推荐度降序排列\n", 330 | " scored_items.sort(reverse=True)\n", 331 | "\n", 332 | " ranked_items = [x[1] for x in scored_items]\n", 333 | "\n", 334 | " return user_id,ranked_items[:n]\n", 335 | "\n", 336 | "if __name__ == \"__main__\":\n", 337 | " if len(sys.argv) < 3:\n", 338 | " print >> sys.stderr, \\\n", 339 | " \"Usage: PythonItemCF \"\n", 340 | " exit(-1)\n", 341 | "\n", 342 | " sc = SparkContext(sys.argv[1], \"PythonItemCF\")\n", 343 | " lines = sc.textFile(sys.argv[2])\n", 344 | "\n", 345 | " ''' \n", 346 | " 处理数据,获得稀疏user-item矩阵:\n", 347 | " user_id -> [(item_id_1, rating_1),\n", 348 | " [(item_id_2, rating_2),\n", 349 | " ...]\n", 350 | " '''\n", 351 | " user_item_pairs = lines.map(parseVector).groupByKey().map(\n", 352 | " lambda p: sampleInteractions(p[0],p[1],500)).cache()\n", 353 | "\n", 354 | " '''\n", 355 | " 获取所有item-item组合对\n", 356 | " (item1,item2) -> [(item1_rating,item2_rating),\n", 357 | " (item1_rating,item2_rating),\n", 358 | " ...]\n", 359 | " '''\n", 360 | "\n", 361 | " pairwise_items = user_item_pairs.filter(\n", 362 | " lambda p: len(p[1]) > 1).map(\n", 363 | " lambda p: findItemPairs(p[0],p[1])).groupByKey()\n", 364 | "\n", 365 | " '''\n", 366 | " 计算余弦相似度,找到最近的N个邻居:\n", 367 | " (item1,item2) -> (similarity,co_raters_count)\n", 368 | " '''\n", 369 | "\n", 370 | " item_sims = pairwise_items.map(\n", 371 | " lambda p: calcSim(p[0],p[1])).map(\n", 372 | " lambda p: keyOnFirstItem(p[0],p[1])).groupByKey().map(\n", 373 | " lambda p: nearestNeighbors(p[0],p[1],50)).collect()\n", 374 | "\n", 375 | "\n", 376 | " item_sim_dict = {}\n", 377 | " for (item,data) in item_sims: \n", 378 | " item_sim_dict[item] = data\n", 379 | "\n", 380 | " isb = sc.broadcast(item_sim_dict)\n", 381 | "\n", 382 | " '''\n", 383 | " 计算最佳的N个推荐结果\n", 384 | " user_id -> [item1,item2,item3,...]\n", 385 | " '''\n", 386 | " user_item_recs = user_item_pairs.map(lambda p: topNRecommendations(p[0],p[1],isb.value,500)).collect()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Spark推荐系统" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### spark自带了用于推荐的算法" 410 | ] 
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Spark recommendation system"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark ships with built-in recommendation algorithms"
   ]
  },
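  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before the full script, a minimal sketch of the `pyspark.mllib.recommendation.ALS` calls it relies on. The in-memory ratings are made up, the `rank`/`iterations`/`lambda_` values are illustrative only, and `recommendProducts` requires Spark >= 1.4:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.mllib.recommendation import ALS, Rating\n",
    "\n",
    "sc = SparkContext('local', 'ALSMinimalSketch')\n",
    "\n",
    "# four made-up (user, item, rating) triples\n",
    "ratings = sc.parallelize([Rating(0, 1, 4.0), Rating(0, 2, 1.0),\n",
    "                          Rating(1, 1, 5.0), Rating(1, 3, 3.0)])\n",
    "\n",
    "# factorise the rating matrix into rank-2 user and item factors\n",
    "model = ALS.train(ratings, rank=2, iterations=5, lambda_=0.1)\n",
    "\n",
    "print(model.predict(0, 3))            # predicted rating of item 3 by user 0\n",
    "print(model.recommendProducts(0, 2))  # top-2 items for user 0\n",
    "sc.stop()"
   ]
  },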
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#!/usr/bin/env python\n",
    "# An ALS-based recommender on Spark, applied to the MovieLens movie-rating data\n",
    "# Edit: 寒小阳 (hanxiaoyang.ml@gmail.com)\n",
    "\n",
    "import sys\n",
    "import itertools\n",
    "from math import sqrt\n",
    "from operator import add\n",
    "from os.path import join, isfile\n",
    "\n",
    "from pyspark import SparkConf, SparkContext\n",
    "from pyspark.mllib.recommendation import ALS\n",
    "\n",
    "def parseRating(line):\n",
    "    \"\"\"\n",
    "    MovieLens ratings come as userId::movieId::rating::timestamp;\n",
    "    parse into (last digit of timestamp, (userId, movieId, rating))\n",
    "    \"\"\"\n",
    "    fields = line.strip().split(\"::\")\n",
    "    return int(fields[3]) % 10, (int(fields[0]), int(fields[1]), float(fields[2]))\n",
    "\n",
    "def parseMovie(line):\n",
    "    \"\"\"\n",
    "    The movie file is formatted as movieId::movieTitle;\n",
    "    parse into (int id, title)\n",
    "    \"\"\"\n",
    "    fields = line.strip().split(\"::\")\n",
    "    return int(fields[0]), fields[1]\n",
    "\n",
    "def loadRatings(ratingsFile):\n",
    "    \"\"\"\n",
    "    Load the personal ratings file\n",
    "    \"\"\"\n",
    "    if not isfile(ratingsFile):\n",
    "        print(\"File %s does not exist.\" % ratingsFile)\n",
    "        sys.exit(1)\n",
    "    # in Python 3 a filter object is always truthy, so build a list instead\n",
    "    with open(ratingsFile, 'r') as f:\n",
    "        ratings = [r for r in (parseRating(line)[1] for line in f) if r[2] > 0]\n",
    "    if not ratings:\n",
    "        print(\"No ratings provided.\")\n",
    "        sys.exit(1)\n",
    "    return ratings\n",
    "\n",
    "def computeRmse(model, data, n):\n",
    "    \"\"\"\n",
    "    Compute the root-mean-square error, used for evaluation\n",
    "    \"\"\"\n",
    "    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))\n",
    "    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \\\n",
    "      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \\\n",
    "      .values()\n",
    "    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    if len(sys.argv) != 3:\n",
    "        print(\"Usage: /path/to/spark/bin/spark-submit --driver-memory 2g \" +\n",
    "              \"MovieLensALS.py movieLensDataDir personalRatingsFile\")\n",
    "        sys.exit(1)\n",
    "\n",
    "    # set up the environment\n",
    "    conf = SparkConf() \\\n",
    "      .setAppName(\"MovieLensALS\") \\\n",
    "      .set(\"spark.executor.memory\", \"2g\")\n",
    "    sc = SparkContext(conf=conf)\n",
    "\n",
    "    # load the personal ratings (assumed to carry user id 0)\n",
    "    myRatings = loadRatings(sys.argv[2])\n",
    "    myRatingsRDD = sc.parallelize(myRatings, 1)\n",
    "\n",
    "    movieLensHomeDir = sys.argv[1]\n",
    "\n",
    "    # ratings is an RDD of (last digit of timestamp, (userId, movieId, rating))\n",
    "    ratings = sc.textFile(join(movieLensHomeDir, \"ratings.dat\")).map(parseRating)\n",
    "\n",
    "    # movies is a dict of movieId -> movieTitle\n",
    "    movies = dict(sc.textFile(join(movieLensHomeDir, \"movies.dat\")).map(parseMovie).collect())\n",
    "\n",
    "    numRatings = ratings.count()\n",
    "    numUsers = ratings.values().map(lambda r: r[0]).distinct().count()\n",
    "    numMovies = ratings.values().map(lambda r: r[1]).distinct().count()\n",
    "\n",
    "    print(\"Got %d ratings from %d users on %d movies.\" % (numRatings, numUsers, numMovies))\n",
    "\n",
    "    # split the data by the last digit of the timestamp into training (60%),\n",
    "    # validation (20%) and test (20%) sets;\n",
    "    # each is an RDD of (userId, movieId, rating)\n",
    "\n",
    "    numPartitions = 4\n",
    "    training = ratings.filter(lambda x: x[0] < 6) \\\n",
    "      .values() \\\n",
    "      .union(myRatingsRDD) \\\n",
    "      .repartition(numPartitions) \\\n",
    "      .cache()\n",
    "\n",
    "    validation = ratings.filter(lambda x: x[0] >= 6 and x[0] < 8) \\\n",
    "      .values() \\\n",
    "      .repartition(numPartitions) \\\n",
    "      .cache()\n",
    "\n",
    "    test = ratings.filter(lambda x: x[0] >= 8).values().cache()\n",
    "\n",
    "    numTraining = training.count()\n",
    "    numValidation = validation.count()\n",
    "    numTest = test.count()\n",
    "\n",
    "    print(\"Training: %d, validation: %d, test: %d\" % (numTraining, numValidation, numTest))\n",
    "\n",
    "    # train models over a small grid and compare them on the validation set\n",
    "\n",
    "    ranks = [8, 12]\n",
    "    lambdas = [0.1, 10.0]\n",
    "    numIters = [10, 20]\n",
    "    bestModel = None\n",
    "    bestValidationRmse = float(\"inf\")\n",
    "    bestRank = 0\n",
    "    bestLambda = -1.0\n",
    "    bestNumIter = -1\n",
    "\n",
    "    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):\n",
    "        model = ALS.train(training, rank, numIter, lmbda)\n",
    "        validationRmse = computeRmse(model, validation, numValidation)\n",
    "        print(\"RMSE (validation) = %f for the model trained with \" % validationRmse +\n",
    "              \"rank = %d, lambda = %.1f, and numIter = %d.\" % (rank, lmbda, numIter))\n",
    "        if validationRmse < bestValidationRmse:\n",
    "            bestModel = model\n",
    "            bestValidationRmse = validationRmse\n",
    "            bestRank = rank\n",
    "            bestLambda = lmbda\n",
    "            bestNumIter = numIter\n",
    "\n",
    "    testRmse = computeRmse(bestModel, test, numTest)\n",
    "\n",
    "    # evaluate the model that won on the validation set against the test set\n",
    "    print(\"The best model was trained with rank = %d and lambda = %.1f, \" % (bestRank, bestLambda) +\n",
    "          \"and numIter = %d, and its RMSE on the test set is %f.\" % (bestNumIter, testRmse))\n",
    "\n",
    "    # the baseline model always predicts the mean rating\n",
    "    meanRating = training.union(validation).map(lambda x: x[2]).mean()\n",
    "    baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)\n",
    "    improvement = (baselineRmse - testRmse) / baselineRmse * 100\n",
    "    print(\"The best model improves the baseline by %.2f%%.\" % improvement)\n",
    "\n",
    "    # personalised recommendations for the personal-ratings user (id 0)\n",
    "\n",
    "    myRatedMovieIds = set([x[1] for x in myRatings])\n",
    "    candidates = sc.parallelize([m for m in movies if m not in myRatedMovieIds])\n",
    "    predictions = bestModel.predictAll(candidates.map(lambda x: (0, x))).collect()\n",
    "    recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:50]\n",
    "\n",
    "    print(\"Movies recommended for you:\")\n",
    "    for i in range(len(recommendations)):\n",
    "        print(\"%2d: %s\" % (i + 1, movies[recommendations[i][1]]))\n",
    "\n",
    "    # clean up\n",
    "    sc.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 596 | "version": "3.5.2" 597 | } 598 | }, 599 | "nbformat": 4, 600 | "nbformat_minor": 2 601 | } 602 | -------------------------------------------------------------------------------- /images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microw/Music_recommendation/348852716f9aa619e24890953853dbac8f141a5c/images/1.jpg -------------------------------------------------------------------------------- /images/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microw/Music_recommendation/348852716f9aa619e24890953853dbac8f141a5c/images/2.jpg --------------------------------------------------------------------------------