├── Music Recommend System.ipynb ├── Music Recommend use Tensorflow .ipynb ├── README.md ├── Sequence Modelling.ipynb ├── Spark Recommendation.ipynb └── images ├── 1.jpg └── 2.jpg /Music Recommend use Tensorflow .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 目的:用tensorflow来完成一个在批量数据上更新,并且可以增量迭代优化的矩阵分解推荐系统" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 矩阵分解\n", 15 | "![](svd_recommendation.png)\n", 16 | "LFM:把用户在item上打分的行为,看作是有内部依据的,认为和k个factor有关系
\n", 17 | "每一个user i会有一个用户的向量(k维),每一个item会有一个item的向量(k维)\n", 18 | "\n", 19 | "SVD是矩阵分解的一种方式\n", 20 | "\n", 21 | "### 预测公式如下\n", 22 | "$y_{pred[u, i]} = bias_{global} + bias_{user[u]} + bias_{item_[i]} + $\n", 23 | "\n", 24 | "### 我们需要最小化的loss计算如下(添加正则化项)\n", 25 | "$\\sum_{u, i} |y_{pred[u, i]} - y_{true[u, i]}|^2 + \\lambda(|embedding_{user[u]}|^2 + |embedding_{item[i]}|^2)$" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### 数据处理" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "import numpy as np\n", 53 | "import pandas as pd\n", 54 | "\n", 55 | "\n", 56 | "def read_data_and_process(filname, sep=\"\\t\"):\n", 57 | " col_names = [\"user\", \"item\", \"rate\", \"st\"]\n", 58 | " df = pd.read_csv(filname, sep=sep, header=None, names=col_names, engine='python')\n", 59 | " df[\"user\"] -= 1\n", 60 | " df[\"item\"] -= 1\n", 61 | " for col in (\"user\", \"item\"):\n", 62 | " df[col] = df[col].astype(np.int32)\n", 63 | " df[\"rate\"] = df[\"rate\"].astype(np.float32)\n", 64 | " return df\n", 65 | "\n", 66 | "\n", 67 | "class ShuffleDataIterator(object):\n", 68 | " \"\"\"\n", 69 | " 随机生成一个batch一个batch数据\n", 70 | " \"\"\"\n", 71 | " #初始化\n", 72 | " def __init__(self, inputs, batch_size=10):\n", 73 | " self.inputs = inputs\n", 74 | " self.batch_size = batch_size\n", 75 | " self.num_cols = len(self.inputs)\n", 76 | " self.len = len(self.inputs[0])\n", 77 | " self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))\n", 78 | "\n", 79 | " #总样本量\n", 80 | " def __len__(self):\n", 81 | " return self.len\n", 82 | "\n", 83 | " def __iter__(self):\n", 84 | " return self\n", 85 | "\n", 86 | " #取出下一个batch\n", 87 | " def __next__(self):\n", 88 | " return self.next()\n", 89 | " \n", 90 | " #随机生成batch_size个下标,取出对应的样本\n", 91 | " def next(self):\n", 92 | " ids = np.random.randint(0, self.len, (self.batch_size,))\n", 93 | " out = self.inputs[ids, :]\n", 94 | " return [out[:, i] for i in range(self.num_cols)]\n", 95 | "\n", 96 | "\n", 97 | "class OneEpochDataIterator(ShuffleDataIterator):\n", 98 | " \"\"\"\n", 99 | " 顺序产出一个epoch的数据,在测试中可能会用到\n", 100 | " \"\"\"\n", 101 | " def __init__(self, inputs, batch_size=10):\n", 102 | " super(OneEpochDataIterator, self).__init__(inputs, batch_size=batch_size)\n", 103 | " if batch_size > 0:\n", 104 | " self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))\n", 105 | " else:\n", 106 | " self.idx_group = [np.arange(self.len)]\n", 107 | " self.group_id = 0\n", 108 | "\n", 109 | " def next(self):\n", 110 | " if self.group_id >= len(self.idx_group):\n", 111 | " self.group_id = 0\n", 112 | " raise StopIteration\n", 113 | " out = self.inputs[self.idx_group[self.group_id], :]\n", 114 | " self.group_id += 1\n", 115 | " return [out[:, i] for i in range(self.num_cols)]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### 模型搭建\n", 123 | "用tensorflow去搭建一个可增量训练的矩阵分解模型,完成基于矩阵分解的推荐系统" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 2, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "\n", 135 | "import tensorflow as tf\n", 136 | "\n", 137 | "# 使用矩阵分解搭建的网络结构\n", 138 | "def 
inference_svd(user_batch, item_batch, user_num, item_num, dim=5, device=\"/cpu:0\"):\n", 139 | " #使用CPU\n", 140 | " with tf.device(\"/cpu:0\"):\n", 141 | " # 初始化几个bias项\n", 142 | " global_bias = tf.get_variable(\"global_bias\", shape=[])\n", 143 | " w_bias_user = tf.get_variable(\"embd_bias_user\", shape=[user_num])\n", 144 | " w_bias_item = tf.get_variable(\"embd_bias_item\", shape=[item_num])\n", 145 | " # bias向量\n", 146 | " bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name=\"bias_user\")\n", 147 | " bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name=\"bias_item\")\n", 148 | " w_user = tf.get_variable(\"embd_user\", shape=[user_num, dim],\n", 149 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", 150 | " w_item = tf.get_variable(\"embd_item\", shape=[item_num, dim],\n", 151 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", 152 | " # user向量与item向量\n", 153 | " embd_user = tf.nn.embedding_lookup(w_user, user_batch, name=\"embedding_user\")\n", 154 | " embd_item = tf.nn.embedding_lookup(w_item, item_batch, name=\"embedding_item\")\n", 155 | " with tf.device(device):\n", 156 | " # 按照实际公式进行计算\n", 157 | " # 先对user向量和item向量求内积\n", 158 | " infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)\n", 159 | " # 加上几个偏置项\n", 160 | " infer = tf.add(infer, global_bias)\n", 161 | " infer = tf.add(infer, bias_user)\n", 162 | " infer = tf.add(infer, bias_item, name=\"svd_inference\")\n", 163 | " # 加上正则化项\n", 164 | " regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name=\"svd_regularizer\")\n", 165 | " return infer, regularizer\n", 166 | "\n", 167 | "# 迭代优化部分\n", 168 | "def optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device=\"/cpu:0\"):\n", 169 | " global_step = tf.train.get_global_step()\n", 170 | " assert global_step is not None\n", 171 | " # 选择合适的optimizer做优化\n", 172 | " with tf.device(device):\n", 173 | " cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))\n", 174 | " penalty = tf.constant(reg, dtype=tf.float32, shape=[], name=\"l2\")\n", 175 | " cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))\n", 176 | " train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step=global_step)\n", 177 | " return cost, train_op" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### 模型训练" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 3, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "import time\n", 196 | "from collections import deque\n", 197 | "\n", 198 | "import numpy as np\n", 199 | "import tensorflow as tf\n", 200 | "from six import next\n", 201 | "from tensorflow.core.framework import summary_pb2\n", 202 | "\n", 203 | "np.random.seed(13575)\n", 204 | "\n", 205 | "# 一批数据的大小\n", 206 | "BATCH_SIZE = 2000\n", 207 | "# 用户数\n", 208 | "USER_NUM = 6040\n", 209 | "# 电影数\n", 210 | "ITEM_NUM = 3952\n", 211 | "# factor维度\n", 212 | "DIM = 15\n", 213 | "# 最大迭代轮数\n", 214 | "EPOCH_MAX = 200\n", 215 | "# 使用cpu做训练\n", 216 | "DEVICE = \"/cpu:0\"\n", 217 | "\n", 218 | "# 截断\n", 219 | "def clip(x):\n", 220 | " return np.clip(x, 1.0, 5.0)\n", 221 | "\n", 222 | "# 这个是方便Tensorboard可视化做的summary\n", 223 | "def make_scalar_summary(name, val):\n", 224 | " return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])\n", 225 | "\n", 226 | "# 调用上面的函数获取数据\n", 227 | "def get_data():\n", 228 | " df = 
read_data_and_process(\"./movielens/ml-1m/ratings.dat\", sep=\"::\")\n", 229 | " rows = len(df)\n", 230 | " df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)\n", 231 | " split_index = int(rows * 0.9)\n", 232 | " df_train = df[0:split_index]\n", 233 | " df_test = df[split_index:].reset_index(drop=True)\n", 234 | " print(df_train.shape, df_test.shape)\n", 235 | " return df_train, df_test\n", 236 | "\n", 237 | "# 实际训练过程\n", 238 | "def svd(train, test):\n", 239 | " samples_per_batch = len(train) // BATCH_SIZE\n", 240 | "\n", 241 | " # 一批一批数据用于训练\n", 242 | " iter_train = ShuffleDataIterator([train[\"user\"],\n", 243 | " train[\"item\"],\n", 244 | " train[\"rate\"]],\n", 245 | " batch_size=BATCH_SIZE)\n", 246 | " # 测试数据\n", 247 | " iter_test = OneEpochDataIterator([test[\"user\"],\n", 248 | " test[\"item\"],\n", 249 | " test[\"rate\"]],\n", 250 | " batch_size=-1)\n", 251 | " # user和item batch\n", 252 | " user_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_user\")\n", 253 | " item_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_item\")\n", 254 | " rate_batch = tf.placeholder(tf.float32, shape=[None])\n", 255 | "\n", 256 | " # 构建graph和训练\n", 257 | " infer, regularizer = inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,\n", 258 | " device=DEVICE)\n", 259 | " global_step = tf.contrib.framework.get_or_create_global_step()\n", 260 | " _, train_op = optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)\n", 261 | "\n", 262 | " # 初始化所有变量\n", 263 | " init_op = tf.global_variables_initializer()\n", 264 | " # 开始迭代\n", 265 | " with tf.Session() as sess:\n", 266 | " sess.run(init_op)\n", 267 | " summary_writer = tf.summary.FileWriter(logdir=\"/tmp/svd/log\", graph=sess.graph)\n", 268 | " print(\"{} {} {} {}\".format(\"epoch\", \"train_error\", \"val_error\", \"elapsed_time\"))\n", 269 | " errors = deque(maxlen=samples_per_batch)\n", 270 | " start = time.time()\n", 271 | " for i in range(EPOCH_MAX * samples_per_batch):\n", 272 | " users, items, rates = next(iter_train)\n", 273 | " _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,\n", 274 | " item_batch: items,\n", 275 | " rate_batch: rates})\n", 276 | " pred_batch = clip(pred_batch)\n", 277 | " errors.append(np.power(pred_batch - rates, 2))\n", 278 | " if i % samples_per_batch == 0:\n", 279 | " train_err = np.sqrt(np.mean(errors))\n", 280 | " test_err2 = np.array([])\n", 281 | " for users, items, rates in iter_test:\n", 282 | " pred_batch = sess.run(infer, feed_dict={user_batch: users,\n", 283 | " item_batch: items})\n", 284 | " pred_batch = clip(pred_batch)\n", 285 | " test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n", 286 | " end = time.time()\n", 287 | " test_err = np.sqrt(np.mean(test_err2))\n", 288 | " print(\"{:3d} {:f} {:f} {:f}(s)\".format(i // samples_per_batch, train_err, test_err,\n", 289 | " end - start))\n", 290 | " train_err_summary = make_scalar_summary(\"training_error\", train_err)\n", 291 | " test_err_summary = make_scalar_summary(\"test_error\", test_err)\n", 292 | " summary_writer.add_summary(train_err_summary, i)\n", 293 | " summary_writer.add_summary(test_err_summary, i)\n", 294 | " start = end" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 4, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "(900188, 4) (100021, 4)\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "# 获取数据\n", 312 | 
"df_train, df_test = get_data()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 5, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stderr", 322 | "output_type": "stream", 323 | "text": [ 324 | "D:\\Anaconda\\install\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 325 | " from ._conv import register_converters as _register_converters\n" 326 | ] 327 | }, 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "epoch train_error val_error elapsed_time\n", 333 | " 0 2.576278 2.577729 0.477677(s)\n", 334 | " 1 1.978902 1.152332 1.450331(s)\n", 335 | " 2 1.002632 0.949393 1.423475(s)\n", 336 | " 3 0.927719 0.926508 1.450366(s)\n", 337 | " 4 0.914275 0.919153 1.534211(s)\n", 338 | " 5 0.910865 0.915688 1.624905(s)\n", 339 | " 6 0.906089 0.913335 1.514213(s)\n", 340 | " 7 0.904977 0.911318 1.424107(s)\n", 341 | " 8 0.901721 0.908855 1.496397(s)\n", 342 | " 9 0.896913 0.906264 1.611612(s)\n", 343 | " 10 0.894468 0.903484 1.795376(s)\n", 344 | " 11 0.891712 0.899968 1.503599(s)\n", 345 | " 12 0.887555 0.895848 1.421528(s)\n", 346 | " 13 0.882009 0.891982 1.420877(s)\n", 347 | " 14 0.876975 0.888060 1.567020(s)\n", 348 | " 15 0.872943 0.884968 1.627547(s)\n", 349 | " 16 0.867226 0.881633 1.486202(s)\n", 350 | " 17 0.864066 0.878666 1.431954(s)\n", 351 | " 18 0.859931 0.875910 1.438102(s)\n", 352 | " 19 0.856037 0.873030 1.433395(s)\n", 353 | " 20 0.849924 0.870667 1.421893(s)\n", 354 | " 21 0.846303 0.868094 1.397581(s)\n", 355 | " 22 0.842261 0.865835 1.398386(s)\n", 356 | " 23 0.836717 0.863661 1.395191(s)\n", 357 | " 24 0.833121 0.861465 1.390639(s)\n", 358 | " 25 0.829651 0.859585 1.461942(s)\n", 359 | " 26 0.824811 0.857843 1.403177(s)\n", 360 | " 27 0.820917 0.856483 1.398302(s)\n", 361 | " 28 0.816505 0.854711 1.390662(s)\n", 362 | " 29 0.813360 0.853433 1.402261(s)\n", 363 | " 30 0.808135 0.852419 1.468770(s)\n", 364 | " 31 0.805145 0.851025 1.394093(s)\n", 365 | " 32 0.799418 0.849873 1.406268(s)\n", 366 | " 33 0.797527 0.849210 1.390641(s)\n", 367 | " 34 0.794350 0.848693 1.399524(s)\n", 368 | " 35 0.792427 0.848298 1.447494(s)\n", 369 | " 36 0.789376 0.847890 1.403100(s)\n", 370 | " 37 0.786277 0.847480 1.406268(s)\n", 371 | " 38 0.783722 0.847279 1.392901(s)\n", 372 | " 39 0.781859 0.846988 1.433542(s)\n", 373 | " 40 0.779194 0.846766 1.460803(s)\n", 374 | " 41 0.776687 0.846418 1.502838(s)\n", 375 | " 42 0.774345 0.846484 1.477898(s)\n", 376 | " 43 0.773097 0.846666 1.470419(s)\n", 377 | " 44 0.772025 0.846828 1.406287(s)\n", 378 | " 45 0.769199 0.846732 1.445719(s)\n", 379 | " 46 0.768910 0.846695 1.390641(s)\n", 380 | " 47 0.766496 0.846699 1.395308(s)\n", 381 | " 48 0.765846 0.846611 1.407348(s)\n", 382 | " 49 0.764256 0.846703 1.406266(s)\n", 383 | " 50 0.762772 0.846718 1.446595(s)\n", 384 | " 51 0.761644 0.847029 1.390661(s)\n", 385 | " 52 0.760738 0.847263 1.413254(s)\n", 386 | " 53 0.759950 0.847614 1.419673(s)\n", 387 | " 54 0.759713 0.847827 1.400326(s)\n", 388 | " 55 0.757802 0.847982 1.421893(s)\n", 389 | " 56 0.757559 0.848026 1.437567(s)\n", 390 | " 57 0.757013 0.848383 1.414295(s)\n", 391 | " 58 0.756566 0.848557 1.390639(s)\n", 392 | " 59 0.756866 0.848483 1.413066(s)\n", 393 | " 60 0.753830 0.848556 1.406267(s)\n", 394 | " 61 0.754405 0.848785 1.447202(s)\n", 395 | " 62 0.754867 0.848690 1.390621(s)\n", 396 | " 63 0.753079 
0.848909 1.416439(s)\n", 397 | " 64 0.753559 0.848946 1.383843(s)\n", 398 | " 65 0.753118 0.849353 1.399104(s)\n", 399 | " 66 0.751364 0.849349 1.481309(s)\n", 400 | " 67 0.752177 0.849697 1.449126(s)\n", 401 | " 68 0.751095 0.849683 1.468768(s)\n", 402 | " 69 0.751063 0.849502 1.383999(s)\n", 403 | " 70 0.750350 0.849622 1.406266(s)\n", 404 | " 71 0.751395 0.849533 1.446464(s)\n", 405 | " 72 0.750082 0.849392 1.400997(s)\n", 406 | " 73 0.750379 0.849434 1.388548(s)\n", 407 | " 74 0.749501 0.849552 1.407498(s)\n", 408 | " 75 0.750194 0.849896 1.461215(s)\n", 409 | " 76 0.750201 0.849961 1.446918(s)\n", 410 | " 77 0.749083 0.850167 1.404643(s)\n", 411 | " 78 0.750445 0.850135 1.404541(s)\n", 412 | " 79 0.749501 0.849938 1.453143(s)\n", 413 | " 80 0.747849 0.850081 1.394538(s)\n", 414 | " 81 0.747658 0.850377 1.500019(s)\n", 415 | " 82 0.747445 0.850573 1.417488(s)\n", 416 | " 83 0.748725 0.850522 1.484394(s)\n", 417 | " 84 0.748016 0.850637 1.407718(s)\n", 418 | " 85 0.746435 0.850938 1.380105(s)\n", 419 | " 86 0.747316 0.850969 1.448309(s)\n", 420 | " 87 0.746777 0.850801 1.406286(s)\n", 421 | " 88 0.746731 0.850807 1.400060(s)\n", 422 | " 89 0.747924 0.850830 1.385019(s)\n", 423 | " 90 0.746106 0.850674 1.400585(s)\n", 424 | " 91 0.746864 0.850689 1.419417(s)\n", 425 | " 92 0.746962 0.850772 1.461914(s)\n", 426 | " 93 0.746395 0.850632 1.400366(s)\n", 427 | " 94 0.746491 0.850653 1.390425(s)\n", 428 | " 95 0.746701 0.850703 1.469709(s)\n", 429 | " 96 0.745090 0.850457 1.413338(s)\n", 430 | " 97 0.745649 0.850722 1.436355(s)\n", 431 | " 98 0.745338 0.850862 1.437517(s)\n", 432 | " 99 0.745499 0.850813 1.481134(s)\n", 433 | "100 0.745503 0.850798 1.374998(s)\n", 434 | "101 0.745268 0.850891 1.413138(s)\n", 435 | "102 0.745327 0.850786 1.476407(s)\n", 436 | "103 0.746660 0.850860 1.468770(s)\n", 437 | "104 0.745549 0.851016 1.390663(s)\n", 438 | "105 0.744760 0.850981 1.399773(s)\n", 439 | "106 0.745388 0.850703 1.395961(s)\n", 440 | "107 0.745142 0.850666 1.462982(s)\n", 441 | "108 0.746368 0.850706 1.390665(s)\n", 442 | "109 0.744704 0.850997 1.406251(s)\n", 443 | "110 0.745588 0.850987 1.399718(s)\n", 444 | "111 0.743731 0.851158 1.598807(s)\n", 445 | "112 0.744651 0.851077 1.611395(s)\n", 446 | "113 0.744472 0.850991 1.437518(s)\n", 447 | "114 0.744883 0.851003 1.408719(s)\n", 448 | "115 0.744321 0.850906 1.415562(s)\n", 449 | "116 0.744179 0.851158 1.406284(s)\n", 450 | "117 0.744853 0.851024 1.486450(s)\n", 451 | "118 0.743401 0.850973 1.420012(s)\n", 452 | "119 0.744809 0.851009 1.399587(s)\n", 453 | "120 0.744726 0.851097 1.390638(s)\n", 454 | "121 0.743952 0.850803 1.446391(s)\n", 455 | "122 0.744973 0.850798 1.439853(s)\n", 456 | "123 0.744382 0.850887 1.431332(s)\n", 457 | "124 0.744419 0.850841 1.460428(s)\n", 458 | "125 0.743825 0.851252 1.468669(s)\n", 459 | "126 0.744768 0.850956 1.406268(s)\n", 460 | "127 0.743264 0.850907 1.462467(s)\n", 461 | "128 0.743480 0.850931 1.390640(s)\n", 462 | "129 0.743621 0.851018 1.391578(s)\n", 463 | "130 0.744046 0.850966 1.407784(s)\n", 464 | "131 0.743349 0.850969 1.390645(s)\n", 465 | "132 0.743841 0.850785 1.462579(s)\n", 466 | "133 0.743074 0.850948 1.458358(s)\n", 467 | "134 0.744358 0.850806 1.407487(s)\n", 468 | "135 0.743972 0.851127 1.390640(s)\n", 469 | "136 0.743227 0.851148 1.406265(s)\n", 470 | "137 0.742984 0.851232 1.447071(s)\n", 471 | "138 0.744403 0.851532 1.411106(s)\n", 472 | "139 0.743451 0.851401 1.469621(s)\n", 473 | "140 0.743391 0.851384 1.390645(s)\n", 474 | "141 0.744516 0.851492 1.406285(s)\n", 475 | "142 0.743470 
0.851447 1.410015(s)\n", 476 | "143 0.743198 0.851322 1.420781(s)\n", 477 | "144 0.744412 0.851270 1.403787(s)\n", 478 | "145 0.742384 0.851284 1.390643(s)\n", 479 | "146 0.743339 0.851364 1.399891(s)\n", 480 | "147 0.742802 0.851247 1.395700(s)\n", 481 | "148 0.742878 0.851421 1.469909(s)\n", 482 | "149 0.743484 0.851321 1.399727(s)\n", 483 | "150 0.743502 0.851572 1.399970(s)\n", 484 | "151 0.743406 0.851571 1.399521(s)\n", 485 | "152 0.742925 0.851396 1.395897(s)\n", 486 | "153 0.742553 0.851295 1.451918(s)\n", 487 | "154 0.743613 0.851278 1.406268(s)\n", 488 | "155 0.741762 0.851363 1.525129(s)\n", 489 | "156 0.743210 0.851457 1.406286(s)\n", 490 | "157 0.743032 0.851381 1.395900(s)\n", 491 | "158 0.741658 0.851501 1.455009(s)\n", 492 | "159 0.743116 0.851250 1.406258(s)\n", 493 | "160 0.743059 0.851320 1.399649(s)\n", 494 | "161 0.743155 0.851130 1.406287(s)\n", 495 | "162 0.741716 0.851186 1.395491(s)\n", 496 | "163 0.742589 0.851172 1.439167(s)\n", 497 | "164 0.742117 0.850974 1.468770(s)\n", 498 | "165 0.742390 0.851116 1.409128(s)\n", 499 | "166 0.744096 0.851254 1.435538(s)\n", 500 | "167 0.742634 0.851376 1.388916(s)\n", 501 | "168 0.741646 0.851275 1.457746(s)\n", 502 | "169 0.742897 0.851177 1.366868(s)\n", 503 | "170 0.743052 0.851266 1.426625(s)\n", 504 | "171 0.742376 0.851271 1.403697(s)\n", 505 | "172 0.742742 0.851338 1.390625(s)\n", 506 | "173 0.742256 0.851197 1.453141(s)\n", 507 | "174 0.742268 0.851046 1.399479(s)\n", 508 | "175 0.742002 0.850905 1.383932(s)\n", 509 | "176 0.741890 0.851078 1.421894(s)\n", 510 | "177 0.743085 0.851033 1.406270(s)\n", 511 | "178 0.741955 0.850917 1.416128(s)\n", 512 | "179 0.742171 0.851150 1.442452(s)\n", 513 | "180 0.742805 0.851376 1.423072(s)\n", 514 | "181 0.741585 0.851463 1.437518(s)\n", 515 | "182 0.742271 0.851443 1.437522(s)\n", 516 | "183 0.743029 0.851565 1.461741(s)\n", 517 | "184 0.742284 0.851456 1.504725(s)\n", 518 | "185 0.741653 0.851479 1.454393(s)\n", 519 | "186 0.743889 0.851576 1.564915(s)\n", 520 | "187 0.742872 0.851446 1.650403(s)\n", 521 | "188 0.741979 0.851394 1.431463(s)\n", 522 | "189 0.742107 0.851101 1.421539(s)\n", 523 | "190 0.742485 0.851297 1.406282(s)\n", 524 | "191 0.740788 0.851228 1.415401(s)\n", 525 | "192 0.742113 0.851329 1.414911(s)\n", 526 | "193 0.741579 0.851133 1.462146(s)\n", 527 | "194 0.742999 0.851144 1.455878(s)\n", 528 | "195 0.742513 0.851250 1.415932(s)\n", 529 | "196 0.743028 0.851395 1.390641(s)\n", 530 | "197 0.742302 0.851131 1.579862(s)\n", 531 | "198 0.741136 0.851173 1.607915(s)\n", 532 | "199 0.741375 0.851128 1.723560(s)\n" 533 | ] 534 | } 535 | ], 536 | "source": [ 537 | "# 完成实际的训练\n", 538 | "svd(df_train, df_test)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": true 546 | }, 547 | "outputs": [], 548 | "source": [] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.5.2" 568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 2 572 | } 573 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Music Recommendation System 2 | #### Language: Python 3.5 3 | #### Library: Surprise 4 | #### Platform: Jupyter Notebook 5 | #### Description: A recommender similar to NetEase Cloud Music's playlist recommendation and similar-song recommendation. 6 | ### 1. Data Acquisition 7 | A crawler collected playlists from NetEase Cloud Music covering 800k songs and 4M+ favorites; the data is stored as JSON, 3.59 GB in total. The format is as follows (a loading sketch follows the samples): 8 |
  9 | 1) Format of each playlist
 10 | {
 11 |     "result": {
 12 |         "id": 111450065,
 13 |         "status": 0,
 14 |         "commentThreadId": "A_PL_0_111450065",
 15 |         "trackCount": 120,
 16 |         "updateTime": 1460164523907,
 17 |         "commentCount": 227,
 18 |         "ordered": true,
 19 |         "anonimous": false,
 20 |         "highQuality": false,
 21 |         "subscribers": [],
 22 |         "playCount": 687070,
 23 |         "trackNumberUpdateTime": 1460164523907,
 24 |         "createTime": 1443528317662,
 25 |         "name": "带本书去旅行吧,人生最美好的时光在路上。",
 26 |         "cloudTrackCount": 0,
 27 |         "shareCount": 149,
 28 |         "adType": 0,
 29 |         "trackUpdateTime": 1494134249465,
 30 |         "userId": 39256799,
 31 |         "coverImgId": 3359008023885470,
 32 |         "coverImgUrl": "http://p1.music.126.net/2ZFcuSJ6STR8WgzkIi2U-Q==/3359008023885470.jpg",
 33 |         "artists": null,
 34 |         "newImported": false,
 35 |         "subscribed": false,
 36 |         "privacy": 0,
 37 |         "specialType": 0,
 38 |         "description": "现在是一年中最美好的时节,世界上很多地方都不冷不热,有湛蓝的天空和清冽的空气,正是出游的好时光。长假将至,你是不是已经收拾行装准备出发了?行前焦虑症中把衣服、洗漱用品、充电器之类东西忙忙碌碌地丢进箱子,打进背包的时候,我打赌你肯定会留个位置给一位好朋友:书。不是吗?不管是打发时间,小读怡情,还是为了做好攻略备不时之需,亦或是为了小小地装上一把,你都得有一本书傍身呀。读大仲马,我是复仇的伯爵;读柯南道尔,我穿梭在雾都的暗夜;读村上春树,我是寻羊的冒险者;读马尔克斯,目睹百年家族兴衰;读三毛,让灵魂在撒哈拉流浪;读老舍,嗅着老北京的气息;读海茵莱茵,于科幻狂流遨游;读卡夫卡,在城堡中审判……读书的孩子不会孤单,读书的孩子永远幸福。",
 39 |         "subscribedCount": 10882,
 40 |         "totalDuration": 0,
 41 |         "tags": [
 42 |             "旅行",
 43 |             "钢琴",
 44 |             "安静"],
 45 |         "creator": {
 46 |             "followed": false,
 47 |             "remarkName": null,
 48 |             "expertTags": [
 49 |                 "古典",
 50 |                 "民谣",
 51 |                 "华语"
 52 |             ],
 53 |             "userId": 39256799,
 54 |             "authority": 0,
 55 |             "userType": 0,
 56 |             "gender": 1,
 57 |             "backgroundImgId": 3427177752524551,
 58 |             "city": 360600,
 59 |             "mutual": false,
 60 |             "avatarUrl": "http://p1.music.126.net/TLRTrJpOM5lr68qJv1IyGQ==/1400777825738419.jpg",
 61 |             "avatarImgIdStr": "1400777825738419",
 62 |             "detailDescription": "",
 63 |             "province": 360000,
 64 |             "description": "",
 65 |             "birthday": 637516800000,
 66 |             "nickname": "有梦人生不觉寒",
 67 |             "vipType": 0,
 68 |             "avatarImgId": 1400777825738419,
 69 |             "defaultAvatar": false,
 70 |             "djStatus": 0,
 71 |             "accountStatus": 0,
 72 |             "backgroundImgIdStr": "3427177752524551",
 73 |             "backgroundUrl": "http://p1.music.126.net/LS96S_6VP9Hm7-T447-X0g==/3427177752524551.jpg",
 74 |             "signature": "漫无目的的乱听,听着,听着,竟然灵魂出窍了。更多精品音乐美图分享请加我微信hu272367751。微信是我的精神家园,有我最真诚的分享。",
 75 |             "authStatus": 0},
 76 |         "tracks": [{歌曲1},{歌曲2}, ...]
 77 |      }
 78 | }
 79 | 2) Format of each song:
 80 | {
 81 |     "id": 29738501,
 82 |     "name": "跟着你到天边 钢琴版",
 83 |     "duration": 174001,
 84 |     "hearTime": 0,
 85 |     "commentThreadId": "R_SO_4_29738501",
 86 |     "score": 40,
 87 |     "mvid": 0,
 88 |     "hMusic": null,
 89 |     "disc": "",
 90 |     "fee": 0,
 91 |     "no": 1,
 92 |     "rtUrl": null,
 93 |     "ringtone": null,
 94 |     "rtUrls": [],
 95 |     "rurl": null,
 96 |     "status": 0,
 97 |     "ftype": 0,
 98 |     "mp3Url": "http://m2.music.126.net/vrVa20wHs8iIe0G8Oe7I9Q==/3222668581877701.mp3",
 99 |     "audition": null,
100 |     "playedNum": 0,
101 |     "copyrightId": 0,
102 |     "rtype": 0,
103 |     "crbt": null,
104 |     "popularity": 40,
105 |     "dayPlays": 0,
106 |     "alias": [],
107 |     "copyFrom": "",
108 |     "position": 1,
109 |     "starred": false,
110 |     "starredNum": 0,
111 |     "bMusic": {
112 |         "name": "跟着你到天边 钢琴版",
113 |         "extension": "mp3",
114 |         "volumeDelta": 0.0553125,
115 |         "sr": 44100,
116 |         "dfsId": 3222668581877701,
117 |         "playTime": 174001,
118 |         "bitrate": 96000,
119 |         "id": 52423394,
120 |         "size": 2089713
121 |     },
122 |     "lMusic": {
123 |         "name": "跟着你到天边 钢琴版",
124 |         "extension": "mp3",
125 |         "volumeDelta": 0.0553125,
126 |         "sr": 44100,
127 |         "dfsId": 3222668581877701,
128 |         "playTime": 174001,
129 |         "bitrate": 96000,
130 |         "id": 52423394,
131 |         "size": 2089713
132 |     },
133 |     "mMusic": {
134 |         "name": "跟着你到天边 钢琴版",
135 |         "extension": "mp3",
136 |         "volumeDelta": -0.000265076,
137 |         "sr": 44100,
138 |         "dfsId": 3222668581877702,
139 |         "playTime": 174001,
140 |         "bitrate": 128000,
141 |         "id": 52423395,
142 |         "size": 2785510
143 |     },
144 |     "artists": [
145 |         {
146 |         "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
147 |         "name": "群星",
148 |         "briefDesc": "",
149 |         "albumSize": 0,
150 |         "img1v1Id": 0,
151 |         "musicSize": 0,
152 |         "alias": [],
153 |         "picId": 0,
154 |         "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
155 |         "trans": "",
156 |         "id": 122455
157 |         }
158 |     ],
159 |     "album": {
160 |         "id": 3054006,
161 |         "status": 2,
162 |         "type": null,
163 |         "tags": "",
164 |         "size": 69,
165 |         "blurPicUrl": "http://p1.music.126.net/2XLMVZhzVZCOunaRCOQ7Bg==/3274345629219531.jpg",
166 |         "copyrightId": 0,
167 |         "name": "热门华语248",
168 |         "companyId": 0,
169 |         "songs": [],
170 |         "description": "",
171 |         "pic": 3274345629219531,
172 |         "commentThreadId": "R_AL_3_3054006",
173 |         "publishTime": 1388505600004,
174 |         "briefDesc": "",
175 |         "company": "",
176 |         "picId": 3274345629219531,
177 |         "alias": [],
178 |         "picUrl": "http://p1.music.126.net/2XLMVZhzVZCOunaRCOQ7Bg==/3274345629219531.jpg",
179 |         "artists": [
180 |         {
181 |             "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
182 |             "name": "群星",
183 |             "briefDesc": "",
184 |             "albumSize": 0,
185 |             "img1v1Id": 0,
186 |             "musicSize": 0,
187 |             "alias": [],
188 |             "picId": 0,
189 |             "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
190 |             "trans": "",
191 |             "id": 122455
192 |         }
193 |         ],
194 |         "artist": {
195 |         "img1v1Url": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
196 |         "name": "",
197 |         "briefDesc": "",
198 |         "albumSize": 0,
199 |         "img1v1Id": 0,
200 |         "musicSize": 0,
201 |         "alias": [],
202 |         "picId": 0,
203 |         "picUrl": "http://p1.music.126.net/6y-UleORITEDbvrOLV0Q8A==/5639395138885805.jpg",
204 |         "trans": "",
205 |         "id": 0
206 |         }
207 |     }
208 | }
209 | 
210 | 
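For reference, the crawl output stores one such JSON object per line; a minimal loading sketch (the file name is a placeholder, the field names are taken from the samples above):

```python
import json

# one playlist JSON object per line (file name is a placeholder)
with open("playlist_detail.json", encoding="utf-8") as f:
    for line in f:
        playlist = json.loads(line)["result"]
        print(playlist["id"], playlist["name"], playlist["subscribedCount"])
        for track in playlist["tracks"]:
            print(track["id"], track["name"], track["artists"][0]["name"], track["popularity"])
        break  # inspect only the first record
```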
211 | ### 2. Data Parsing 212 | #### 2.1 Raw data => playlist data 213 | Extract 4 playlist-level fields: playlist name, playlist id, favorite count, category 214 | Extract 4 song-level fields: song id, song name, artist, song popularity 215 | 216 | Organize them into the following format (a parsing sketch follows the sample): 217 |
218 | 漫步西欧小镇上##小语种,旅行##69413685##474    18682332::Wäg vo dir::Joy Amelie::70.0    4335372::Only When I Sleep::The Corrs::60.0    2925502::Si Seulement::Lynnsha::100.0    21014930::Tu N'As Pas Cherché...::La Grande Sophie::100.0    20932638::Du behöver aldrig mer vara rädd::Lasse Lindh::25.0    17100518::Silent Machine::Cat Power::60.0    3308096::Kor pai kon diew : ชอไปคนเดียว::Palmy::5.0    1648250::les choristes::Petits Chanteurs De Saint Marc::100.0    4376212::Paddy's Green Shamrock Shore::The High Kings::25.0    2925400::A Todo Color::Las Escarlatinas::95.0    19711402::Comme Toi::Vox Angeli::75.0    3977526::Stay::Blue Cafe::100.0    2538518::Shake::Elize::85.0    2866799::Mon Ange::Jena Lee::85.0    5191949::Je M'appelle Helene::Hélène Rolles::85.0    20036323::Ich Lieb' Dich Immer Noch So Sehr::Kate & Ben::100.0
219 | 
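A minimal sketch of this parsing step (field names are taken from the JSON format in section 1; real data needs extra error handling for missing fields):

```python
import json

def parse_playlist_line(in_line):
    # name##tags##id##subscribed_count \t song_id::song_name::artist::popularity \t ...
    data = json.loads(in_line)["result"]
    head = "##".join([data["name"], ",".join(data["tags"]),
                      str(data["id"]), str(data["subscribedCount"])])
    songs = ["::".join([str(t["id"]), t["name"],
                        t["artists"][0]["name"], str(float(t["popularity"]))])
             for t in data["tracks"]]
    return head + "\t" + "\t".join(songs)
```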
220 | #### 2.2 Playlist data => recommender-format data 221 | The most basic data format supported by mainstream Python recommender frameworks is the MovieLens dataset, whose rating format is user item rating timestamp; the data is converted into this format. 222 | #### 2.3 Save playlist and song info for later use 223 | Save the playlist id => playlist name and song id => song name mappings 224 | 225 | ### 3. Completing the project with the Python recommendation library Surprise 226 | #### 3.1 Build models with collaborative filtering and make predictions 227 | ##### 3.1.1 Recommending playlists 228 | ![Recommended playlists](./images/1.jpg) 229 | ##### 3.1.2 Recommending songs 230 | ![Recommended songs](./images/2.jpg) 231 | Other algorithms could be used as well, e.g. (a minimal Surprise sketch follows the list): 232 |
233 | baseline algorithms
234 | neighborhood methods (collaborative filtering)
235 | matrix factorization-based (SVD, PMF, SVD++, NMF)
236 | 
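A minimal Surprise sketch for this step (the file name is a placeholder; the input is the user item rating timestamp data from section 2.2; in older Surprise versions `fit` was called `train`):

```python
from surprise import Dataset, Reader, KNNBaseline

# user item rating timestamp, as produced in section 2.2 (file name is a placeholder)
reader = Reader(line_format="user item rating timestamp", sep=",")
data = Dataset.load_from_file("./pro_data/music_ratings.txt", reader=reader)
trainset = data.build_full_trainset()

# item-based collaborative filtering with cosine similarity
algo = KNNBaseline(sim_options={"name": "cosine", "user_based": False})
algo.fit(trainset)

# "songs similar to this song" = nearest neighbours in the similarity matrix
inner_id = trainset.to_inner_iid("287035")   # a raw song id (hypothetical)
neighbors = algo.get_neighbors(inner_id, k=10)
print([trainset.to_raw_iid(i) for i in neighbors])
```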
237 | 238 | ### 4. Evaluating different recommendation algorithms 239 | Different evaluation metrics can be used, e.g. (a cross-validation sketch follows the list): 240 |
241 | rmse Compute RMSE (Root Mean Squared Error).
242 | mae Compute MAE (Mean Absolute Error).
243 | fcp Compute FCP (Fraction of Concordant Pairs).
244 | 
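A sketch of comparing algorithms on these metrics, assuming the `data` object from the sketch above (`cross_validate` is the API in recent Surprise versions; older ones used `evaluate`):

```python
from surprise import SVD, NMF, KNNBasic
from surprise.model_selection import cross_validate

# 3-fold cross-validation on RMSE / MAE / FCP for several algorithms
for algo in (SVD(), NMF(), KNNBasic()):
    results = cross_validate(algo, data, measures=["RMSE", "MAE", "FCP"], cv=3, verbose=True)
    print(type(algo).__name__, results["test_rmse"].mean())
```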
245 | -------------------------------------------------------------------------------- /Sequence Modelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 歌曲序列建模\n", 8 | "### 从word2vec到song2vec\n", 9 | "把歌曲的id序列取出来,类比于分完词后的句子,送到word2vec中去学习" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 8, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#coding: utf-8\n", 19 | "import multiprocessing\n", 20 | "import gensim\n", 21 | "import sys\n", 22 | "from random import shuffle\n", 23 | "\n", 24 | "def parse_playlist_get_sequence(in_line, playlist_sequence):\n", 25 | " song_sequence = []\n", 26 | " contents = in_line.strip().split(\"\\t\")\n", 27 | " # 解析歌单序列\n", 28 | " for song in contents[1:]:\n", 29 | " try:\n", 30 | " song_id, song_name, artist, popularity = song.split(\"::\")\n", 31 | " song_sequence.append(song_id)\n", 32 | " except:\n", 33 | " print (\"song format error\")\n", 34 | " print (song+\"\\n\")\n", 35 | " for i in range(len(song_sequence)):\n", 36 | " shuffle(song_sequence)\n", 37 | " playlist_sequence.append(song_sequence)\n", 38 | "\n", 39 | "\n", 40 | "def train_song2vec(in_file, out_file):\n", 41 | " #所有歌单序列\n", 42 | " playlist_sequence = []\n", 43 | " #遍历所有歌单\n", 44 | " for line in open(in_file, encoding='utf-8'):\n", 45 | " parse_playlist_get_sequence(line, playlist_sequence)\n", 46 | " #使用word2vec训练\n", 47 | " cores = multiprocessing.cpu_count()\n", 48 | " print (\"using all \"+str(cores)+\" cores\")\n", 49 | " print (\"Training word2vec model...\")\n", 50 | " model = gensim.models.Word2Vec(sentences=playlist_sequence, size=150, min_count=3, window=7, workers=cores)\n", 51 | " print (\"Saving model...\")\n", 52 | " model.save(out_file)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 9, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "song format error\n", 65 | "1870957::彩云国物语 セカンドシリーズ::君を想う::梁邦彦::80.0\n", 66 | "\n", 67 | "song format error\n", 68 | "4965888::桃华月惮::龙皇-リュウオウ-::多田彰文::25.0\n", 69 | "\n", 70 | "song format error\n", 71 | "456177::true tears::一阵の风::菊地創::95.0\n", 72 | "\n", 73 | "song format error\n", 74 | "22642373::\n", 75 | "\n", 76 | "song format error\n", 77 | " FAIRY TAIL メインテーマ -Slow ver.-::高梨康治::95.0\n", 78 | "\n", 79 | "song format error\n", 80 | "31563610::\n", 81 | "\n", 82 | "song format error\n", 83 | "苍之礼赞::花之祭P::60.0\n", 84 | "\n", 85 | "song format error\n", 86 | "4954593::リズム天国全曲集::恋の実験室::V.A.::55.0\n", 87 | "\n", 88 | "song format error\n", 89 | "4954596::リズム天国全曲集::シンクロ::V.A.::60.0\n", 90 | "\n", 91 | "song format error\n", 92 | "31654811::\n", 93 | "\n", 94 | "song format error\n", 95 | "American Cowboys::Tim Wynn::65.0\n", 96 | "\n", 97 | "song format error\n", 98 | "19169096::\n", 99 | "\n", 100 | "song format error\n", 101 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n", 102 | "\n", 103 | "song format error\n", 104 | "31563610::\n", 105 | "\n", 106 | "song format error\n", 107 | "苍之礼赞::花之祭P::60.0\n", 108 | "\n", 109 | "song format error\n", 110 | "31563610::\n", 111 | "\n", 112 | "song format error\n", 113 | "苍之礼赞::花之祭P::60.0\n", 114 | "\n", 115 | "song format error\n", 116 | "31563610::\n", 117 | "\n", 118 | "song format error\n", 119 | "苍之礼赞::花之祭P::60.0\n", 120 | "\n", 121 | "song format error\n", 122 | "19169096::\n", 123 | "\n", 124 | "song format 
error\n", 125 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n", 126 | "\n", 127 | "song format error\n", 128 | "376653::野弧禅狂叱(宿香之战)\n", 129 | "\n", 130 | "song format error\n", 131 | "::霹雳英雄::5.0\n", 132 | "\n", 133 | "song format error\n", 134 | "374524::赎?罪\n", 135 | "\n", 136 | "song format error\n", 137 | "赎罪岩::霹雳英雄::15.0\n", 138 | "\n", 139 | "song format error\n", 140 | "31563610::\n", 141 | "\n", 142 | "song format error\n", 143 | "苍之礼赞::花之祭P::65.0\n", 144 | "\n", 145 | "song format error\n", 146 | "37610597::ダウンタウン熱血物語::公園/河原にて~ひとときのやすらぎ~::V.A.::75.0\n", 147 | "\n", 148 | "song format error\n", 149 | "37610748::くにおくんの熱血サッカーリーグ::ねっけつ たいふーん♪::V.A.::80.0\n", 150 | "\n", 151 | "song format error\n", 152 | "37610755::くにおくんの熱血サッカーリーグ::てくのす じゃぱん かっぷの てーま♪::V.A.::75.0\n", 153 | "\n", 154 | "song format error\n", 155 | "37610745::くにおくんの熱血サッカーリーグ::ゲームモード選択::V.A.::75.0\n", 156 | "\n", 157 | "song format error\n", 158 | "37610643::ダウンタウン熱血行進曲 それゆけ大運動会::オープニングファンファーレ::V.A.::70.0\n", 159 | "\n", 160 | "song format error\n", 161 | "33054290::\n", 162 | "\n", 163 | "song format error\n", 164 | "Heartbeats::Dabin::90.0\n", 165 | "\n", 166 | "song format error\n", 167 | "405599088::Make Them Wheels Roll\n", 168 | "\n", 169 | "song format error\n", 170 | "::SAFIA::100.0\n", 171 | "\n", 172 | "song format error\n", 173 | "424496188::大王叫我来巡山 - (原唱:\n", 174 | "\n", 175 | "song format error\n", 176 | " 贾乃亮/贾云馨)::流浪的蛙蛙::65.0\n", 177 | "\n", 178 | "song format error\n", 179 | "19169096::\n", 180 | "\n", 181 | "song format error\n", 182 | " Time to Say Goodbye (Con te partirò)::Sarah Brightman::100.0\n", 183 | "\n", 184 | "song format error\n", 185 | "26902203::What’s your name? (collaboration with 壇蜜)\n", 186 | "\n", 187 | "song format error\n", 188 | "::SoulJa::100.0\n", 189 | "\n", 190 | "song format error\n", 191 | "33054290::\n", 192 | "\n", 193 | "song format error\n", 194 | "Heartbeats::Dabin::95.0\n", 195 | "\n", 196 | "song format error\n", 197 | "4954596::リズム天国全曲集::シンクロ::V.A.::60.0\n", 198 | "\n", 199 | "song format error\n", 200 | "32272105::\n", 201 | "\n", 202 | "song format error\n", 203 | "Wonderful Love (DJ Raf Remix)::Money Penny::95.0\n", 204 | "\n", 205 | "song format error\n", 206 | "33054290::\n", 207 | "\n", 208 | "song format error\n", 209 | "Heartbeats::Dabin::95.0\n", 210 | "\n", 211 | "song format error\n", 212 | "427373827::Champions (From \"Hands of Stone\") \n", 213 | "\n", 214 | "song format error\n", 215 | "::Usher::30.0\n", 216 | "\n", 217 | "song format error\n", 218 | "29242687::「コード・エテスウェイ (Class::ETHES_WEI=>extends.COMMUNI_SAT/.)」::霜月はるか::70.0\n", 219 | "\n", 220 | "using all 4 cores\n", 221 | "Training word2vec model...\n", 222 | "Saving model...\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "song_sequence_file = \"./ori_data/popular.playlist\"\n", 228 | "model_file = \"./model/song2vec.model\"\n", 229 | "train_song2vec(song_sequence_file, model_file)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "### 预测的过程,实际上就是对某首歌曲,查找“最近”的歌曲(向量距离最近的歌曲)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "import pickle\n", 248 | "song_dic = pickle.load(open(\"./pro_data/popular_song.pkl\",\"rb\"))\n", 249 | "model_str = \"./model/song2vec.model\"\n", 250 | "model = gensim.models.Word2Vec.load(model_str)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 
12, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "315958 那件疯狂的小事叫爱情\t袁泉\n", 263 | "28138980 为你我受冷风吹\t孙露\n", 264 | "247526 彗星的眼泪\t金莎\n", 265 | "5280395 慨古吟(琴歌)\t张铜霞\n", 266 | "31140395 一首简单的歌\t本兮\n", 267 | "27532150 Smoke Fly ft. JBo Escobar & Khaki\tAl Rocco\n", 268 | "440767373 メドゥーサ(美杜莎)\t月蝕原创音乐\n", 269 | "16323636 The Prayer\tAndrea Bocelli\n", 270 | "281436 夜曲\t彭芳\n", 271 | "5270404 渴望(二胡)\t群星\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "for song in list(song_dic.keys())[:10]:\n", 277 | " print (song, song_dic[song])" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stderr", 287 | "output_type": "stream", 288 | "text": [ 289 | "D:\\Anaconda\\install\\lib\\site-packages\\ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n", 290 | " This is separate from the ipykernel package so we can avoid doing imports until\n" 291 | ] 292 | }, 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "368971 Ambulance of love\t脑浊\n", 298 | "\n", 299 | "相似歌曲 和 相似度 分别为:\n", 300 | "\t 新世界\t呼吸 0.8102102279663086\n", 301 | "\t 上苍保佑吃完了饭的人民\t张楚 0.8082322478294373\n", 302 | "\t 呀呀\t图腾 0.7943791151046753\n", 303 | "\t 昨日我从清晨开始等待\t钟立风 0.774426281452179\n", 304 | "\t 生命(Live) - live\t声音玩具 0.7557182312011719\n", 305 | "\t 两天\t许巍 0.7442638874053955\n", 306 | "\t 永远在一起\t飘乐队 0.7283462285995483\n", 307 | "\t 今夜\t许巍 0.7184534072875977\n", 308 | "\t 祖先的阴影\t超载 0.7170863747596741\n", 309 | "\t 我们走过的路\t天空 0.7092562913894653\n", 310 | "\n", 311 | "\n", 312 | "33599059 八秒之语\t洛天依\n", 313 | "\n", 314 | "相似歌曲 和 相似度 分别为:\n", 315 | "\t 乡村DISCO\tVOCALOID 0.6794760227203369\n", 316 | "\t 春雨\t乐正绫 0.649375319480896\n", 317 | "\t 远恋\t阿良良木健 0.6485384702682495\n", 318 | "\t 食之歌 VOCALOID Ver.\t泛音堂 0.6450830698013306\n", 319 | "\t 小幸运(Cover:田馥甄)\t星魂梦 0.6226800680160522\n", 320 | "\t 出格\t阿妍 0.6186865568161011\n", 321 | "\t 双向监禁\t洛天依 0.6172651052474976\n", 322 | "\t 山海默示录(洛天依版)\t小旭PRO 0.6170870661735535\n", 323 | "\t 甄姬\tVOCALOID 0.613639771938324\n", 324 | "\t 全世界都死了\t海鲜面 0.6097506284713745\n", 325 | "\n", 326 | "\n", 327 | "408332846 知足\t苏运莹\n", 328 | "\n", 329 | "相似歌曲 和 相似度 分别为:\n", 330 | "\t 垃圾车(cover 五月天)\t李昂星 0.714142918586731\n", 331 | "\t Happy Birth Day\t香蕉 0.7014893293380737\n", 332 | "\t 拥抱(Cover 五月天)\t橙大蕾蕾 0.6977276802062988\n", 333 | "\t 爱情的模样\t小平 0.6919869780540466\n", 334 | "\t 听不到(Live)\t梁静茹 0.6528257727622986\n", 335 | "\t 我就是这样的\t黄贯中 0.6462737917900085\n", 336 | "\t 拥抱(Cover 五月天)\t燕子姐姐弹吉他 0.6430615186691284\n", 337 | "\t 穿越时空遇见你\t萧亚轩 0.6319111585617065\n", 338 | "\t 神奇\t孙燕姿 0.6182767152786255\n", 339 | "\t 一个人的圣诞节\t张赫宣 0.6101440191268921\n", 340 | "\n", 341 | "\n", 342 | "34072696 酒馆小调\t洛天依\n", 343 | "\n", 344 | "相似歌曲 和 相似度 分别为:\n", 345 | "\t 菌裂\t言和 0.7842902541160583\n", 346 | "\t 妄想不到的恋曲\t烂兔子 0.748408317565918\n", 347 | "\t 女王\t洛天依 0.7209190726280212\n", 348 | "\t 清醒的梦 \tVilokun feat.言和 0.7056742906570435\n", 349 | "\t Mr 坷垃\t言和 0.7054739594459534\n", 350 | "\t 【心华】乌龟家的茶社【ick】\t缺钙体质ick 0.7002834677696228\n", 351 | "\t 偶像进行时\t言和 0.698686420917511\n", 352 | "\t 乐正绫 - 拉斯维加斯\t慕晓社 0.6933234333992004\n", 353 | "\t 流星之愿\t洛天依&言和 0.6909075975418091\n", 354 | "\t 病态的我\t洛天依 0.6879562139511108\n", 355 | "\n", 356 | "\n", 357 | "279713 十六夜的樱丘\t梦璟SAYA\n", 358 | "\n", 359 | "相似歌曲 和 相似度 分别为:\n", 360 | "\t 合金三国-沛县\t灰原穷 
0.7301620841026306\n", 361 | "\t 岁月友情演唱会Live\t聂予词 0.7063505053520203\n", 362 | "\t 克罗地亚狂想曲(中文填词版)\t少年霜 0.6886762976646423\n", 363 | "\t 言君安\t倾夜 0.6869547963142395\n", 364 | "\t 凤鸣曲\t音频怪物 0.6675935387611389\n", 365 | "\t 相忘江湖\t玄觞 0.6578972935676575\n", 366 | "\t 伊人\t魏晨 0.6283127665519714\n", 367 | "\t 失落的遗迹:Lost Ruins ~ adventurers' tale~\tkaede 0.6206890344619751\n", 368 | "\t 【Moonlight组合】甜蜜具现式\tKBShinya 0.6184070110321045\n", 369 | "\t 老房子的故事【老歌搬家】\tWinky诗 0.6105965971946716\n", 370 | "\n", 371 | "\n", 372 | "33004911 听妈妈讲那过去的事情\t群星\n", 373 | "\n", 374 | "相似歌曲 和 相似度 分别为:\n", 375 | "\t 国旗多美丽\t群星 0.9948954582214355\n", 376 | "\t 劳动最光荣\t杨烁 0.9924089312553406\n", 377 | "\t 数青蛙\t群星 0.9401867985725403\n", 378 | "\t 儿童歌曲大联唱B\t群星 0.9401167631149292\n", 379 | "\t 母鸭带小鸭\t杨烁 0.937639594078064\n", 380 | "\t 嘀哩,嘀哩\t中央人民广播电台少年儿童合唱团 0.931469738483429\n", 381 | "\t 小小少年\t韩征 0.9063182473182678\n", 382 | "\t 世上只有妈妈好\t杨烁 0.893858790397644\n", 383 | "\t 真善美的小世界\t小蓓蕾组合 0.8826833963394165\n", 384 | "\t 我家住在北京城\t苑菁 0.8813650012016296\n", 385 | "\n", 386 | "\n", 387 | "26389372 水色\tUA\n", 388 | "\n", 389 | "相似歌曲 和 相似度 分别为:\n", 390 | "\t 最后の言い訳\t徳永英明 0.7948691248893738\n", 391 | "\t テルーの呗\t手嶌葵 0.7602044343948364\n", 392 | "\t もう君以外爱せない\tKinKi Kids 0.7559913992881775\n", 393 | "\t 氷点\t玉置浩二 0.7401448488235474\n", 394 | "\t 时の过ぎゆくままに\t沢田研二 0.7194046378135681\n", 395 | "\t 时代\t中島みゆき 0.7185744643211365\n", 396 | "\t 手紙 ~拝啓 十五の君へ~\tアンジェラ・アキ 0.6948644518852234\n", 397 | "\t あした\t中島みゆき 0.685759425163269\n", 398 | "\t MR.LONELY\t玉置浩二 0.6696567535400391\n", 399 | "\t Carcrashes [Album Version]\tStandfast 0.6691217422485352\n", 400 | "\n", 401 | "\n", 402 | "166471 别等离开才说爱我\t王志\n", 403 | "\n", 404 | "相似歌曲 和 相似度 分别为:\n", 405 | "\t 分爱 粤语版\t易欣 0.8755930066108704\n", 406 | "\t 反叛(Illegal Mix) - remix\t陈慧娴 0.873873233795166\n", 407 | "\t 背叛\t芭比 0.8577862977981567\n", 408 | "\t 罗盘上的指针\t群星 0.8541814088821411\n", 409 | "\t 最美丽的花\t王绎龙 0.8295896649360657\n", 410 | "\t 各种小曲各种嗨\t珊爷 0.8236122727394104\n", 411 | "\t 不要推我\t群星 0.7918290495872498\n", 412 | "\t 狂舞大麻\t群星 0.7873603105545044\n", 413 | "\t 真的不容易 (DJ阿圣 Remix)\t庄心妍 0.7646005749702454\n", 414 | "\t 看我72变\tM3 0.7498428225517273\n", 415 | "\n", 416 | "\n", 417 | "29414454 心中喜欢就说爱\t好妹妹乐队\n", 418 | "\n", 419 | "相似歌曲 和 相似度 分别为:\n", 420 | "\t 请你给我多一点点的温柔\t秦昊 0.8446059823036194\n", 421 | "\t 熟悉的拥抱 (Demo)\t好妹妹乐队 0.8162363767623901\n", 422 | "\t 秋诗篇篇 \t秦昊 0.8092836737632751\n", 423 | "\t 风从海面吹过来\t好妹妹乐队 0.792778491973877\n", 424 | "\t 军港之夜 \t秦昊 0.78443443775177\n", 425 | "\t 熟悉的拥抱\t好妹妹乐队 0.7571429014205933\n", 426 | "\t 心曲\t好妹妹乐队 0.7473146319389343\n", 427 | "\t 风又吹走了\t好妹妹乐队 0.7431649565696716\n", 428 | "\t 愿在秋天死去 (Demo)\t好妹妹乐队 0.7383122444152832\n", 429 | "\t 四季歌\t秦昊 0.7374083399772644\n", 430 | "\n", 431 | "\n", 432 | "347983 传奇 Legend\t春秋\n", 433 | "\n", 434 | "相似歌曲 和 相似度 分别为:\n", 435 | "\t 开始\t核聚变-G 0.8861181735992432\n", 436 | "\t 抢回一切\t岩浆乐队 0.85029536485672\n", 437 | "\t No One Can Change My Mind\t利事乐队 0.8492539525032043\n", 438 | "\t 武器\t浊乐队 0.809968888759613\n", 439 | "\t 【填翻】城池\t妖痴 0.786383867263794\n", 440 | "\t 杀手\t战斧 0.7508671879768372\n", 441 | "\t 梦魔\t大红袍 0.7423715591430664\n", 442 | "\t 梦已成\"血\"\t液氧罐头 0.7245020866394043\n", 443 | "\t 大鱼\t三火SAMA 0.7139255404472351\n", 444 | "\t 是什么,让我们留在这里?\t夜叉 0.7099397778511047\n", 445 | "\n", 446 | "\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "song_id_list = list(song_dic.keys())[1000:1500:50]\n", 452 | "for song_id in song_id_list:\n", 453 | " result_song_list = model.most_similar(song_id)\n", 454 | "\n", 455 | " print (song_id, 
song_dic[song_id])\n", 456 | " print (\"\\n相似歌曲 和 相似度 分别为:\")\n", 457 | " for song in result_song_list:\n", 458 | " print (\"\\t\", song_dic[song[0]], song[1])\n", 459 | " print (\"\\n\")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [] 470 | } 471 | ], 472 | "metadata": { 473 | "kernelspec": { 474 | "display_name": "Python 3", 475 | "language": "python", 476 | "name": "python3" 477 | }, 478 | "language_info": { 479 | "codemirror_mode": { 480 | "name": "ipython", 481 | "version": 3 482 | }, 483 | "file_extension": ".py", 484 | "mimetype": "text/x-python", 485 | "name": "python", 486 | "nbconvert_exporter": "python", 487 | "pygments_lexer": "ipython3", 488 | "version": "3.5.2" 489 | } 490 | }, 491 | "nbformat": 4, 492 | "nbformat_minor": 2 493 | } 494 | -------------------------------------------------------------------------------- /Spark Recommendation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## pyspark协同过滤" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### user-based协同过滤" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "#-*- coding:utf8 -*-\n", 26 | "# pySpark实现的基于用户的协同过滤\n", 27 | "# 使用的余弦相似度\n", 28 | "\n", 29 | "import sys\n", 30 | "from collections import defaultdict\n", 31 | "from itertools import combinations\n", 32 | "import random\n", 33 | "import numpy as np\n", 34 | "import pdb\n", 35 | "\n", 36 | "from pyspark import SparkContext\n", 37 | "\n", 38 | "# user item rating timestamp\n", 39 | "def parseVectorOnUser(line):\n", 40 | " '''\n", 41 | " 解析数据,key是user,后面是item和打分\n", 42 | " '''\n", 43 | " line = line.split(\"|\")\n", 44 | " return line[0],(line[1],float(line[2]))\n", 45 | "\n", 46 | "def parseVectorOnItem(line):\n", 47 | " '''\n", 48 | " 解析数据,key是item,后面是user和打分\n", 49 | " '''\n", 50 | " line = line.split(\"|\")\n", 51 | " return line[1],(line[0],float(line[2]))\n", 52 | "\n", 53 | "def sampleInteractions(item_id,users_with_rating,n):\n", 54 | " '''\n", 55 | " 如果某个商品上用户行为特别多,可以选择适当做点下采样\n", 56 | " '''\n", 57 | " if len(users_with_rating) > n:\n", 58 | " return item_id, random.sample(users_with_rating,n)\n", 59 | " else:\n", 60 | " return item_id, users_with_rating\n", 61 | "\n", 62 | "def findUserPairs(item_id,users_with_rating):\n", 63 | " '''\n", 64 | " 对每个item,找到共同打分的user对\n", 65 | " '''\n", 66 | " for user1,user2 in combinations(users_with_rating,2):\n", 67 | " return (user1[0],user2[0]),(user1[1],user2[1])\n", 68 | "\n", 69 | "def calcSim(user_pair,rating_pairs):\n", 70 | " ''' \n", 71 | " 对每个user对,根据打分计算余弦距离,并返回共同打分的item个数\n", 72 | " '''\n", 73 | " sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)\n", 74 | " \n", 75 | " for rating_pair in rating_pairs:\n", 76 | " sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])\n", 77 | " sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])\n", 78 | " sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])\n", 79 | " # sum_y += rt[1]\n", 80 | " # sum_x += rt[0]\n", 81 | " n += 1\n", 82 | "\n", 83 | " cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))\n", 84 | " return user_pair, (cos_sim,n)\n", 85 | "\n", 86 | "def 
cosine(dot_product,rating_norm_squared,rating2_norm_squared):\n", 87 | " '''\n", 88 | " 2个向量A和B的余弦相似度\n", 89 | " dotProduct(A, B) / (norm(A) * norm(B))\n", 90 | " '''\n", 91 | " numerator = dot_product\n", 92 | " denominator = rating_norm_squared * rating2_norm_squared\n", 93 | "\n", 94 | " return (numerator / (float(denominator))) if denominator else 0.0\n", 95 | "\n", 96 | "def keyOnFirstUser(user_pair,item_sim_data):\n", 97 | " '''\n", 98 | " 对于每个user-user对,用第一个user做key(好像有点粗暴...)\n", 99 | " '''\n", 100 | " (user1_id,user2_id) = user_pair\n", 101 | " return user1_id,(user2_id,item_sim_data)\n", 102 | "\n", 103 | "def nearestNeighbors(user,users_and_sims,n):\n", 104 | " '''\n", 105 | " 选出相似度最高的N个邻居\n", 106 | " '''\n", 107 | " users_and_sims.sort(key=lambda x: x[1][0],reverse=True)\n", 108 | " return user, users_and_sims[:n]\n", 109 | "\n", 110 | "def topNRecommendations(user_id,user_sims,users_with_rating,n):\n", 111 | " '''\n", 112 | " 根据最近的N个邻居进行推荐\n", 113 | " '''\n", 114 | "\n", 115 | " totals = defaultdict(int)\n", 116 | " sim_sums = defaultdict(int)\n", 117 | "\n", 118 | " for (neighbor,(sim,count)) in user_sims:\n", 119 | "\n", 120 | " # 遍历邻居的打分\n", 121 | " unscored_items = users_with_rating.get(neighbor,None)\n", 122 | "\n", 123 | " if unscored_items:\n", 124 | " for (item,rating) in unscored_items:\n", 125 | " if neighbor != item:\n", 126 | "\n", 127 | " # 更新推荐度和相近度\n", 128 | " totals[neighbor] += sim * rating\n", 129 | " sim_sums[neighbor] += sim\n", 130 | "\n", 131 | " # 归一化\n", 132 | " scored_items = [(total/sim_sums[item],item) for item,total in totals.items()]\n", 133 | "\n", 134 | " # 按照推荐度降序排列\n", 135 | " scored_items.sort(reverse=True)\n", 136 | "\n", 137 | " # 推荐度的item\n", 138 | " ranked_items = [x[1] for x in scored_items]\n", 139 | "\n", 140 | " return user_id,ranked_items[:n]\n", 141 | "\n", 142 | "if __name__ == \"__main__\":\n", 143 | " if len(sys.argv) < 3:\n", 144 | " print >> sys.stderr, \\\n", 145 | " \"Usage: PythonUserCF \"\n", 146 | " exit(-1)\n", 147 | "\n", 148 | " sc = SparkContext(sys.argv[1],\"PythonUserCF\")\n", 149 | " lines = sc.textFile(sys.argv[2])\n", 150 | "\n", 151 | " '''\n", 152 | " 处理数据,获得稀疏item-user矩阵:\n", 153 | " item_id -> ((user_1,rating),(user2,rating))\n", 154 | " '''\n", 155 | " item_user_pairs = lines.map(parseVectorOnItem).groupByKey().map(\n", 156 | " lambda p: sampleInteractions(p[0],p[1],500)).cache()\n", 157 | "\n", 158 | " '''\n", 159 | " 获得2个用户所有的item-item对得分组合:\n", 160 | " (user1_id,user2_id) -> [(rating1,rating2),\n", 161 | " (rating1,rating2),\n", 162 | " (rating1,rating2),\n", 163 | " ...]\n", 164 | " '''\n", 165 | " pairwise_users = item_user_pairs.filter(\n", 166 | " lambda p: len(p[1]) > 1).map(\n", 167 | " lambda p: findUserPairs(p[0],p[1])).groupByKey()\n", 168 | "\n", 169 | " '''\n", 170 | " 计算余弦相似度,找到最近的N个邻居:\n", 171 | " (user1,user2) -> (similarity,co_raters_count)\n", 172 | " '''\n", 173 | " user_sims = pairwise_users.map(\n", 174 | " lambda p: calcSim(p[0],p[1])).map(\n", 175 | " lambda p: keyOnFirstUser(p[0],p[1])).groupByKey().map(\n", 176 | " lambda p: nearestNeighbors(p[0],p[1],50))\n", 177 | "\n", 178 | " ''' \n", 179 | " 对每个用户的打分记录整理成如下形式\n", 180 | " user_id -> [(item_id_1, rating_1),\n", 181 | " [(item_id_2, rating_2),\n", 182 | " ...]\n", 183 | " '''\n", 184 | "\n", 185 | " user_item_hist = lines.map(parseVectorOnUser).groupByKey().collect()\n", 186 | "\n", 187 | " ui_dict = {}\n", 188 | " for (user,items) in user_item_hist: \n", 189 | " ui_dict[user] = items\n", 190 | "\n", 191 | " uib = 
sc.broadcast(ui_dict)\n", 192 | "\n", 193 | " '''\n", 194 | " 为每个用户计算Top N的推荐\n", 195 | " user_id -> [item1,item2,item3,...]\n", 196 | " '''\n", 197 | " user_item_recs = user_sims.map(lambda p: topNRecommendations(p[0],p[1],uib.value,100)).collect()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### item-based协同过滤" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "#-*- coding:utf8 -*-\n", 216 | "# pySpark实现的基于物品的协同过滤\n", 217 | "\n", 218 | "import sys\n", 219 | "from collections import defaultdict\n", 220 | "from itertools import combinations\n", 221 | "import numpy as np\n", 222 | "import random\n", 223 | "import csv\n", 224 | "import pdb\n", 225 | "\n", 226 | "from pyspark import SparkContext\n", 227 | "\n", 228 | "def parseVector(line):\n", 229 | " '''\n", 230 | " 解析数据,key是item,后面是user和打分\n", 231 | " '''\n", 232 | " line = line.split(\"|\")\n", 233 | " return line[0],(line[1],float(line[2]))\n", 234 | "\n", 235 | "def sampleInteractions(user_id,items_with_rating,n):\n", 236 | " '''\n", 237 | " 如果某个用户打分行为特别多,可以选择适当做点下采样\n", 238 | " '''\n", 239 | " if len(items_with_rating) > n:\n", 240 | " return user_id, random.sample(items_with_rating,n)\n", 241 | " else:\n", 242 | " return user_id, items_with_rating\n", 243 | "\n", 244 | "def findItemPairs(user_id,items_with_rating):\n", 245 | " '''\n", 246 | " 对每个用户的打分item,组对\n", 247 | " '''\n", 248 | " for item1,item2 in combinations(items_with_rating,2):\n", 249 | " return (item1[0],item2[0]),(item1[1],item2[1])\n", 250 | "\n", 251 | "def calcSim(item_pair,rating_pairs):\n", 252 | " ''' \n", 253 | " 对每个item对,根据打分计算余弦距离,并返回共同打分的user个数\n", 254 | " '''\n", 255 | " sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)\n", 256 | " \n", 257 | " for rating_pair in rating_pairs:\n", 258 | " sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])\n", 259 | " sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])\n", 260 | " sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])\n", 261 | " # sum_y += rt[1]\n", 262 | " # sum_x += rt[0]\n", 263 | " n += 1\n", 264 | "\n", 265 | " cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))\n", 266 | " return item_pair, (cos_sim,n)\n", 267 | "\n", 268 | "def cosine(dot_product,rating_norm_squared,rating2_norm_squared):\n", 269 | " '''\n", 270 | " The cosine between two vectors A, B\n", 271 | " dotProduct(A, B) / (norm(A) * norm(B))\n", 272 | " '''\n", 273 | " numerator = dot_product\n", 274 | " denominator = rating_norm_squared * rating2_norm_squared\n", 275 | " return (numerator / (float(denominator))) if denominator else 0.0\n", 276 | "\n", 277 | "def correlation(size, dot_product, rating_sum, \\\n", 278 | " rating2sum, rating_norm_squared, rating2_norm_squared):\n", 279 | " '''\n", 280 | " 2个向量A和B的相似度\n", 281 | " [n * dotProduct(A, B) - sum(A) * sum(B)] /\n", 282 | " sqrt{ [n * norm(A)^2 - sum(A)^2] [n * norm(B)^2 - sum(B)^2] }\n", 283 | "\n", 284 | " '''\n", 285 | " numerator = size * dot_product - rating_sum * rating2sum\n", 286 | " denominator = sqrt(size * rating_norm_squared - rating_sum * rating_sum) * \\\n", 287 | " sqrt(size * rating2_norm_squared - rating2sum * rating2sum)\n", 288 | "\n", 289 | " return (numerator / (float(denominator))) if denominator else 0.0\n", 290 | "\n", 291 | "def keyOnFirstItem(item_pair,item_sim_data):\n", 292 | " '''\n", 293 | " 
对于每个item-item对,用第一个item做key(好像有点粗暴...)\n", 294 | " '''\n", 295 | " (item1_id,item2_id) = item_pair\n", 296 | " return item1_id,(item2_id,item_sim_data)\n", 297 | "\n", 298 | "def nearestNeighbors(item_id,items_and_sims,n):\n", 299 | " '''\n", 300 | " 排序选出相似度最高的N个邻居\n", 301 | " '''\n", 302 | " items_and_sims.sort(key=lambda x: x[1][0],reverse=True)\n", 303 | " return item_id, items_and_sims[:n]\n", 304 | "\n", 305 | "def topNRecommendations(user_id,items_with_rating,item_sims,n):\n", 306 | " '''\n", 307 | " 根据最近的N个邻居进行推荐\n", 308 | " '''\n", 309 | " \n", 310 | " totals = defaultdict(int)\n", 311 | " sim_sums = defaultdict(int)\n", 312 | "\n", 313 | " for (item,rating) in items_with_rating:\n", 314 | "\n", 315 | " # 遍历item的邻居\n", 316 | " nearest_neighbors = item_sims.get(item,None)\n", 317 | "\n", 318 | " if nearest_neighbors:\n", 319 | " for (neighbor,(sim,count)) in nearest_neighbors:\n", 320 | " if neighbor != item:\n", 321 | "\n", 322 | " # 更新推荐度和相近度\n", 323 | " totals[neighbor] += sim * rating\n", 324 | " sim_sums[neighbor] += sim\n", 325 | "\n", 326 | " # 归一化\n", 327 | " scored_items = [(total/sim_sums[item],item) for item,total in totals.items()]\n", 328 | "\n", 329 | " # 按照推荐度降序排列\n", 330 | " scored_items.sort(reverse=True)\n", 331 | "\n", 332 | " ranked_items = [x[1] for x in scored_items]\n", 333 | "\n", 334 | " return user_id,ranked_items[:n]\n", 335 | "\n", 336 | "if __name__ == \"__main__\":\n", 337 | " if len(sys.argv) < 3:\n", 338 | " print >> sys.stderr, \\\n", 339 | " \"Usage: PythonItemCF \"\n", 340 | " exit(-1)\n", 341 | "\n", 342 | " sc = SparkContext(sys.argv[1], \"PythonItemCF\")\n", 343 | " lines = sc.textFile(sys.argv[2])\n", 344 | "\n", 345 | " ''' \n", 346 | " 处理数据,获得稀疏user-item矩阵:\n", 347 | " user_id -> [(item_id_1, rating_1),\n", 348 | " [(item_id_2, rating_2),\n", 349 | " ...]\n", 350 | " '''\n", 351 | " user_item_pairs = lines.map(parseVector).groupByKey().map(\n", 352 | " lambda p: sampleInteractions(p[0],p[1],500)).cache()\n", 353 | "\n", 354 | " '''\n", 355 | " 获取所有item-item组合对\n", 356 | " (item1,item2) -> [(item1_rating,item2_rating),\n", 357 | " (item1_rating,item2_rating),\n", 358 | " ...]\n", 359 | " '''\n", 360 | "\n", 361 | " pairwise_items = user_item_pairs.filter(\n", 362 | " lambda p: len(p[1]) > 1).map(\n", 363 | " lambda p: findItemPairs(p[0],p[1])).groupByKey()\n", 364 | "\n", 365 | " '''\n", 366 | " 计算余弦相似度,找到最近的N个邻居:\n", 367 | " (item1,item2) -> (similarity,co_raters_count)\n", 368 | " '''\n", 369 | "\n", 370 | " item_sims = pairwise_items.map(\n", 371 | " lambda p: calcSim(p[0],p[1])).map(\n", 372 | " lambda p: keyOnFirstItem(p[0],p[1])).groupByKey().map(\n", 373 | " lambda p: nearestNeighbors(p[0],p[1],50)).collect()\n", 374 | "\n", 375 | "\n", 376 | " item_sim_dict = {}\n", 377 | " for (item,data) in item_sims: \n", 378 | " item_sim_dict[item] = data\n", 379 | "\n", 380 | " isb = sc.broadcast(item_sim_dict)\n", 381 | "\n", 382 | " '''\n", 383 | " 计算最佳的N个推荐结果\n", 384 | " user_id -> [item1,item2,item3,...]\n", 385 | " '''\n", 386 | " user_item_recs = user_item_pairs.map(lambda p: topNRecommendations(p[0],p[1],isb.value,500)).collect()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Spark推荐系统" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### spark自带了用于推荐的算法" 410 | ] 
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Spark recommendation system"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark ships with built-in recommendation algorithms"
   ]
  },
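  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before the full script, a minimal sketch of the `pyspark.mllib.recommendation.ALS` calls it relies on. The in-memory ratings are made up, the `rank`/`iterations`/`lambda_` values are illustrative only, and `recommendProducts` requires Spark >= 1.4:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.mllib.recommendation import ALS, Rating\n",
    "\n",
    "sc = SparkContext('local', 'ALSMinimalSketch')\n",
    "\n",
    "# four made-up (user, item, rating) triples\n",
    "ratings = sc.parallelize([Rating(0, 1, 4.0), Rating(0, 2, 1.0),\n",
    "                          Rating(1, 1, 5.0), Rating(1, 3, 3.0)])\n",
    "\n",
    "# factorise the rating matrix into rank-2 user and item factors\n",
    "model = ALS.train(ratings, rank=2, iterations=5, lambda_=0.1)\n",
    "\n",
    "print(model.predict(0, 3))            # predicted rating of item 3 by user 0\n",
    "print(model.recommendProducts(0, 2))  # top-2 items for user 0\n",
    "sc.stop()"
   ]
  },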
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#!/usr/bin/env python\n",
    "# An ALS-based recommender on Spark, applied to the MovieLens movie-rating data\n",
    "# Edit: 寒小阳 (hanxiaoyang.ml@gmail.com)\n",
    "\n",
    "import sys\n",
    "import itertools\n",
    "from math import sqrt\n",
    "from operator import add\n",
    "from os.path import join, isfile\n",
    "\n",
    "from pyspark import SparkConf, SparkContext\n",
    "from pyspark.mllib.recommendation import ALS\n",
    "\n",
    "def parseRating(line):\n",
    "    \"\"\"\n",
    "    MovieLens ratings come as userId::movieId::rating::timestamp;\n",
    "    parse into (last digit of timestamp, (userId, movieId, rating))\n",
    "    \"\"\"\n",
    "    fields = line.strip().split(\"::\")\n",
    "    return int(fields[3]) % 10, (int(fields[0]), int(fields[1]), float(fields[2]))\n",
    "\n",
    "def parseMovie(line):\n",
    "    \"\"\"\n",
    "    The movie file is formatted as movieId::movieTitle;\n",
    "    parse into (int id, title)\n",
    "    \"\"\"\n",
    "    fields = line.strip().split(\"::\")\n",
    "    return int(fields[0]), fields[1]\n",
    "\n",
    "def loadRatings(ratingsFile):\n",
    "    \"\"\"\n",
    "    Load the personal ratings file\n",
    "    \"\"\"\n",
    "    if not isfile(ratingsFile):\n",
    "        print(\"File %s does not exist.\" % ratingsFile)\n",
    "        sys.exit(1)\n",
    "    # in Python 3 a filter object is always truthy, so build a list instead\n",
    "    with open(ratingsFile, 'r') as f:\n",
    "        ratings = [r for r in (parseRating(line)[1] for line in f) if r[2] > 0]\n",
    "    if not ratings:\n",
    "        print(\"No ratings provided.\")\n",
    "        sys.exit(1)\n",
    "    return ratings\n",
    "\n",
    "def computeRmse(model, data, n):\n",
    "    \"\"\"\n",
    "    Compute the root-mean-square error, used for evaluation\n",
    "    \"\"\"\n",
    "    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))\n",
    "    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \\\n",
    "      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \\\n",
    "      .values()\n",
    "    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    if len(sys.argv) != 3:\n",
    "        print(\"Usage: /path/to/spark/bin/spark-submit --driver-memory 2g \" +\n",
    "              \"MovieLensALS.py movieLensDataDir personalRatingsFile\")\n",
    "        sys.exit(1)\n",
    "\n",
    "    # set up the environment\n",
    "    conf = SparkConf() \\\n",
    "      .setAppName(\"MovieLensALS\") \\\n",
    "      .set(\"spark.executor.memory\", \"2g\")\n",
    "    sc = SparkContext(conf=conf)\n",
    "\n",
    "    # load the personal ratings (assumed to carry user id 0)\n",
    "    myRatings = loadRatings(sys.argv[2])\n",
    "    myRatingsRDD = sc.parallelize(myRatings, 1)\n",
    "\n",
    "    movieLensHomeDir = sys.argv[1]\n",
    "\n",
    "    # ratings is an RDD of (last digit of timestamp, (userId, movieId, rating))\n",
    "    ratings = sc.textFile(join(movieLensHomeDir, \"ratings.dat\")).map(parseRating)\n",
    "\n",
    "    # movies is a dict of movieId -> movieTitle\n",
    "    movies = dict(sc.textFile(join(movieLensHomeDir, \"movies.dat\")).map(parseMovie).collect())\n",
    "\n",
    "    numRatings = ratings.count()\n",
    "    numUsers = ratings.values().map(lambda r: r[0]).distinct().count()\n",
    "    numMovies = ratings.values().map(lambda r: r[1]).distinct().count()\n",
    "\n",
    "    print(\"Got %d ratings from %d users on %d movies.\" % (numRatings, numUsers, numMovies))\n",
    "\n",
    "    # split the data by the last digit of the timestamp into training (60%),\n",
    "    # validation (20%) and test (20%) sets;\n",
    "    # each is an RDD of (userId, movieId, rating)\n",
    "\n",
    "    numPartitions = 4\n",
    "    training = ratings.filter(lambda x: x[0] < 6) \\\n",
    "      .values() \\\n",
    "      .union(myRatingsRDD) \\\n",
    "      .repartition(numPartitions) \\\n",
    "      .cache()\n",
    "\n",
    "    validation = ratings.filter(lambda x: x[0] >= 6 and x[0] < 8) \\\n",
    "      .values() \\\n",
    "      .repartition(numPartitions) \\\n",
    "      .cache()\n",
    "\n",
    "    test = ratings.filter(lambda x: x[0] >= 8).values().cache()\n",
    "\n",
    "    numTraining = training.count()\n",
    "    numValidation = validation.count()\n",
    "    numTest = test.count()\n",
    "\n",
    "    print(\"Training: %d, validation: %d, test: %d\" % (numTraining, numValidation, numTest))\n",
    "\n",
    "    # train models over a small grid and compare them on the validation set\n",
    "\n",
    "    ranks = [8, 12]\n",
    "    lambdas = [0.1, 10.0]\n",
    "    numIters = [10, 20]\n",
    "    bestModel = None\n",
    "    bestValidationRmse = float(\"inf\")\n",
    "    bestRank = 0\n",
    "    bestLambda = -1.0\n",
    "    bestNumIter = -1\n",
    "\n",
    "    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):\n",
    "        model = ALS.train(training, rank, numIter, lmbda)\n",
    "        validationRmse = computeRmse(model, validation, numValidation)\n",
    "        print(\"RMSE (validation) = %f for the model trained with \" % validationRmse +\n",
    "              \"rank = %d, lambda = %.1f, and numIter = %d.\" % (rank, lmbda, numIter))\n",
    "        if validationRmse < bestValidationRmse:\n",
    "            bestModel = model\n",
    "            bestValidationRmse = validationRmse\n",
    "            bestRank = rank\n",
    "            bestLambda = lmbda\n",
    "            bestNumIter = numIter\n",
    "\n",
    "    testRmse = computeRmse(bestModel, test, numTest)\n",
    "\n",
    "    # evaluate the model that won on the validation set against the test set\n",
    "    print(\"The best model was trained with rank = %d and lambda = %.1f, \" % (bestRank, bestLambda) +\n",
    "          \"and numIter = %d, and its RMSE on the test set is %f.\" % (bestNumIter, testRmse))\n",
    "\n",
    "    # the baseline model always predicts the mean rating\n",
    "    meanRating = training.union(validation).map(lambda x: x[2]).mean()\n",
    "    baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)\n",
    "    improvement = (baselineRmse - testRmse) / baselineRmse * 100\n",
    "    print(\"The best model improves the baseline by %.2f%%.\" % improvement)\n",
    "\n",
    "    # personalised recommendations for the personal-ratings user (id 0)\n",
    "\n",
    "    myRatedMovieIds = set([x[1] for x in myRatings])\n",
    "    candidates = sc.parallelize([m for m in movies if m not in myRatedMovieIds])\n",
    "    predictions = bestModel.predictAll(candidates.map(lambda x: (0, x))).collect()\n",
    "    recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:50]\n",
    "\n",
    "    print(\"Movies recommended for you:\")\n",
    "    for i in range(len(recommendations)):\n",
    "        print(\"%2d: %s\" % (i + 1, movies[recommendations[i][1]]))\n",
    "\n",
    "    # clean up\n",
    "    sc.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 596 | "version": "3.5.2" 597 | } 598 | }, 599 | "nbformat": 4, 600 | "nbformat_minor": 2 601 | } 602 | -------------------------------------------------------------------------------- /images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microw/Music_recommendation/348852716f9aa619e24890953853dbac8f141a5c/images/1.jpg -------------------------------------------------------------------------------- /images/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microw/Music_recommendation/348852716f9aa619e24890953853dbac8f141a5c/images/2.jpg --------------------------------------------------------------------------------