├── README.md └── music_box ├── profit_curve.png ├── 1. Unzip & Prepare Raw Data.ipynb ├── Data Preparation-Churn labeling and Downsampling3.ipynb └── 4Cleansing.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Music_box_project 2 | 3 | extract data file 4 | data processing 5 | EDA 6 | modeling 7 | -------------------------------------------------------------------------------- /music_box/profit_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhijingye1992/Music_box_project/HEAD/music_box/profit_curve.png -------------------------------------------------------------------------------- /music_box/1. Unzip & Prepare Raw Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "# In the macOS command shell, copy and run the commands below to unzip and clean the data.\n", 9 | "\n", 10 | "## first cd into the data folder so that it is the working directory\n", 11 | "\n", 12 | "## unzip uid\n", 13 | "cp ../data/raw/3_1.uids.gz ../data/all_uid.txt.gz\n", 14 | "\n", 15 | "gunzip ../data/all_uid.txt.gz\n", 16 | "\n", 17 | "## unzip play log\n", 18 | "for f in ../data/raw/*_play.log.tar.gz\n", 19 | "\n", 20 | "do\n", 21 | " \n", 22 | " echo \"Processing $f\"\n", 23 | " \n", 24 | " tar -xvzf $f\n", 25 | "\n", 26 | "done\n", 27 | "\n", 28 | "## make a play folder\n", 29 | "mkdir play\n", 30 | "\n", 31 | "## move play.log files to play folder\n", 32 | "mv *_play.log ../data/play/\n", 33 | "\n", 34 | "cp ../data/raw/*_play.log.gz ../data/play/ \n", 35 | "\n", 36 | "gunzip ../data/play/*.gz\n", 37 | "\n", 38 | "## append file_name to each row (will be used for date)\n", 39 | "cd ../data/play/\n", 40 | "\n", 41 | "for f in *.log\n", 42 | "\n", 43 | "do\n", 44 | "\n", 45 | " echo \"Processing $f\"\n", 46 | " \n", 47 | " awk -v var=\"$f\" '{print 
$0,\"\\t\",var}' $f > ${f}.fn\n", 48 | "\n", 49 | "done\n", 50 | "\n", 51 | "## cat all log with filename to one file\n", 52 | "\n", 53 | "cat ../data/play/*.log.fn > /Users/Xiaoxi/Desktop/BitTiger/Capstone/data/all_play.log.fn\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 2", 87 | "language": "python", 88 | "name": "python2" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 2 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython2", 100 | "version": "2.7.13" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /music_box/Data Preparation-Churn labeling and Downsampling3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data Preparation: Churn labeling and Downsampling\n", 8 | "\n", 9 | "#### 1. Read the *play.log files line by line, and write only the user ID, device and date of log into a separate file.\n", 10 | "\n", 11 | "#### 2. Label churn users: those who played at least three times before the cutoff day but had no activity after the cutoff.\n", 12 | "\n", 13 | "#### 3. Down sampling is necessary. 
There are more than 50,000 users and 15 GB log data, which is not necessary for the churn prediction and which costs too much time to process. I used a down sampling ratio of 1/10, to only include 1/10 users from the active and churn users for the churn prediction model." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### 1. Churn labeling\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "import glob\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "138" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'\n", 52 | "files = glob.glob(filepath)\n", 53 | "# amount of files\n", 54 | "len(files)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "'/Users/ZhijingYe/Desktop/data/play/20170410_2_play.log'" 66 | ] 67 | }, 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "# take a look at one of the files\n", 75 | "files[0]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "1149628" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "# get an idea how many lines are in one .log file\n", 96 | "with open(files[0],'r') as f:\n", 97 | " lines = f.readlines()\n", 98 | " log_lines = len(lines)\n", 99 | "log_lines" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | 
"execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "'168071768\\tar\\t1248464\\t0\\t\\xe6\\x88\\x90\\xe7\\x8e\\x8b\\xe8\\xb4\\xa5\\xe5\\xaf\\x87\\t\\xe9\\x99\\x88\\xe5\\xb0\\x8f\\xe6\\x98\\xa5\\t187\\t187\\t0\\n'" 111 | ] 112 | }, 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "# Check one line\n", 120 | "lines[3]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "['168071768',\n", 132 | " 'ar',\n", 133 | " '1248464',\n", 134 | " '0',\n", 135 | " '\\xe6\\x88\\x90\\xe7\\x8e\\x8b\\xe8\\xb4\\xa5\\xe5\\xaf\\x87',\n", 136 | " '\\xe9\\x99\\x88\\xe5\\xb0\\x8f\\xe6\\x98\\xa5',\n", 137 | " '187',\n", 138 | " '187',\n", 139 | " '0',\n", 140 | " '20170410_2_play.log']" 141 | ] 142 | }, 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "test_list = lines[3].strip('\\n').split('\\t')\n", 150 | "test_list.append(files[0].split('/')[-1])\n", 151 | "test_list" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stderr", 161 | "output_type": "stream", 162 | "text": [ 163 | "/Users/ZhijingYe/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,2,7) have mixed types. Specify dtype option on import or set low_memory=False.\n", 164 | " data = self._reader.read(nrows)\n" 165 | ] 166 | }, 167 | { 168 | "data": { 169 | "text/html": [ 170 | "
\n", 171 | "\n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | "
uiddevicesong_idsong_typesong_namesingerplay_timesong_lengthpaid_flagfile_name
0 1.683352e+08 ar 6429024 0 相对湿度 郑希怡 238 238 0NaN
1 1.683105e+08 ar 3348254 0 曾经心痛 袁娅维 21 312 0NaN
2 1.683082e+08 ar 5436214 0 Dream A Little Dream Robbie Williams 246 247 0NaN
3 1.680718e+08 ar 1248464 0 成王败寇 陈小春 187 187 0NaN
4 1.684808e+08 ar 317412 0 Kissy Kissy Smile.DK 188 189 0NaN
\n", 255 | "
" 256 | ], 257 | "text/plain": [ 258 | " uid device song_id song_type song_name \\\n", 259 | "0 1.683352e+08 ar 6429024 0 相对湿度 \n", 260 | "1 1.683105e+08 ar 3348254 0 曾经心痛 \n", 261 | "2 1.683082e+08 ar 5436214 0 Dream A Little Dream \n", 262 | "3 1.680718e+08 ar 1248464 0 成王败寇 \n", 263 | "4 1.684808e+08 ar 317412 0 Kissy Kissy \n", 264 | "\n", 265 | " singer play_time song_length paid_flag file_name \n", 266 | "0 郑希怡 238 238 0 NaN \n", 267 | "1 袁娅维 21 312 0 NaN \n", 268 | "2 Robbie Williams 246 247 0 NaN \n", 269 | "3 陈小春 187 187 0 NaN \n", 270 | "4 Smile.DK 188 189 0 NaN " 271 | ] 272 | }, 273 | "execution_count": 7, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "import pandas as pd\n", 280 | "import numpy as np\n", 281 | "\n", 282 | "schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name']\n", 283 | "df = pd.read_csv(files[0], sep='\\t',header=None,index_col=None,names=schema )\n", 284 | "df.head()\n", 285 | "# Note the file_name will be added later" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### Save reduced play logs to two log files.\n", 302 | "Only the first two items of each line, user id and device, and the date of the log are saved, so it's called reduced play logs." 
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 8, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "# 04/22 is the cutoff date for labeling churns\n", 314 | "cutoff = '20170422'" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 9, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "# destination file names to save the reduced logs.\n", 326 | "first_period_log = '/Users/ZhijingYe/Desktop/data/output/play_till_cutoff.log'\n", 327 | "second_period_log = '/Users/ZhijingYe/Desktop/data/output/play_after_cutoff.log'" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 10, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "processing file: 20170410_2_play.log\n", 340 | "...costs 3.02 seconds\n", 341 | "processing file: 20170410_3_play.log\n", 342 | "...costs 2.89 seconds\n", 343 | "processing file: 20170427_3_play.log\n", 344 | "...costs 2.90 seconds\n", 345 | "processing file: 20170427_2_play.log\n", 346 | "...costs 2.27 seconds\n", 347 | "processing file: 20170504_3_play.log\n", 348 | "...costs 2.05 seconds\n", 349 | "processing file: 20170504_2_play.log\n", 350 | "...costs 1.93 seconds\n", 351 | "processing file: 20170508_1_play.log\n", 352 | "...costs 2.01 seconds\n", 353 | "processing file: 20170505_1_play.log\n", 354 | "...costs 2.07 seconds\n", 355 | "processing file: 20170411_1_play.log\n", 356 | "...costs 2.74 seconds\n", 357 | "processing file: 20170426_1_play.log\n", 358 | "...costs 2.23 seconds\n", 359 | "processing file: 20170509_3_play.log\n", 360 | "...costs 1.81 seconds\n", 361 | "processing file: 20170509_2_play.log\n", 362 | "...costs 1.93 seconds\n", 363 | "processing file: 20170401_2_play.log\n", 364 | "...costs 4.37 seconds\n", 365 | "processing file: 20170401_3_play.log\n", 366 | "...costs 5.01 
seconds\n", 367 | "processing file: 20170423_1_play.log\n", 368 | "...costs 2.54 seconds\n", 369 | "processing file: 20170414_1_play.log\n", 370 | "...costs 1.88 seconds\n", 371 | "processing file: 20170418_2_play.log\n", 372 | "...costs 2.39 seconds\n", 373 | "processing file: 20170418_3_play.log\n", 374 | "...costs 2.35 seconds\n", 375 | "processing file: 20170408_1_play.log\n", 376 | "...costs 3.25 seconds\n", 377 | "processing file: 20170404_2_play.log\n", 378 | "...costs 3.94 seconds\n", 379 | "processing file: 20170404_3_play.log\n", 380 | "...costs 3.89 seconds\n", 381 | "processing file: 20170510_3_play.log\n", 382 | "...costs 1.98 seconds\n", 383 | "processing file: 20170308_1_play.log\n", 384 | "...costs 3.13 seconds\n", 385 | "processing file: 20170510_2_play.log\n", 386 | "...costs 1.88 seconds\n", 387 | "processing file: 20170501_3_play.log\n", 388 | "...costs 2.21 seconds\n", 389 | "processing file: 20170501_2_play.log\n", 390 | "...costs 2.12 seconds\n", 391 | "processing file: 20170422_3_play.log\n", 392 | "...costs 2.34 seconds\n", 393 | "processing file: 20170422_2_play.log\n", 394 | "...costs 2.33 seconds\n", 395 | "processing file: 20170415_2_play.log\n", 396 | "...costs 2.56 seconds\n", 397 | "processing file: 20170415_3_play.log\n", 398 | "...costs 2.65 seconds\n", 399 | "processing file: 20170419_1_play.log\n", 400 | "...costs 2.25 seconds\n", 401 | "processing file: 20170409_2_play.log\n", 402 | "...costs 3.18 seconds\n", 403 | "processing file: 20170409_3_play.log\n", 404 | "...costs 3.18 seconds\n", 405 | "processing file: 20170305_1_play.log\n", 406 | "...costs 3.99 seconds\n", 407 | "processing file: 20170511_1_play.log\n", 408 | "...costs 2.23 seconds\n", 409 | "processing file: 20170405_1_play.log\n", 410 | "...costs 2.99 seconds\n", 411 | "processing file: 20170302_1_play.log\n", 412 | "...costs 6.22 seconds\n", 413 | "processing file: 20170402_1_play.log\n", 414 | "...costs 4.54 seconds\n", 415 | "processing file: 
20170425_2_play.log\n", 416 | "...costs 2.09 seconds\n", 417 | "processing file: 20170425_3_play.log\n", 418 | "...costs 2.31 seconds\n", 419 | "processing file: 20170412_3_play.log\n", 420 | "...costs 2.76 seconds\n", 421 | "processing file: 20170412_2_play.log\n", 422 | "...costs 2.81 seconds\n", 423 | "processing file: 20170506_2_play.log\n", 424 | "...costs 2.34 seconds\n", 425 | "processing file: 20170506_3_play.log\n", 426 | "...costs 2.26 seconds\n", 427 | "processing file: 20170429_1_play.log\n", 428 | "...costs 2.55 seconds\n", 429 | "processing file: 20170403_2_play.log\n", 430 | "...costs 4.32 seconds\n", 431 | "processing file: 20170507_1_play.log\n", 432 | "...costs 2.19 seconds\n", 433 | "processing file: 20170424_1_play.log\n", 434 | "...costs 1.34 seconds\n", 435 | "processing file: 20170413_1_play.log\n", 436 | "...costs 2.59 seconds\n", 437 | "processing file: 20170428_2_play.log\n", 438 | "...costs 2.48 seconds\n", 439 | "processing file: 20170428_3_play.log\n", 440 | "...costs 2.34 seconds\n", 441 | "processing file: 20170331_2_play.log\n", 442 | "...costs 5.28 seconds\n", 443 | "processing file: 20170331_3_play.log\n", 444 | "...costs 7.89 seconds\n", 445 | "processing file: 20170406_3_play.log\n", 446 | "...costs 3.33 seconds\n", 447 | "processing file: 20170406_2_play.log\n", 448 | "...costs 3.08 seconds\n", 449 | "processing file: 20170512_2_play.log\n", 450 | "...costs 2.07 seconds\n", 451 | "processing file: 20170512_3_play.log\n", 452 | "...costs 1.92 seconds\n", 453 | "processing file: 20170416_1_play.log\n", 454 | "...costs 2.89 seconds\n", 455 | "processing file: 20170421_1_play.log\n", 456 | "...costs 2.38 seconds\n", 457 | "processing file: 20170502_1_play.log\n", 458 | "...costs 2.10 seconds\n", 459 | "processing file: 20170307_1_play.log\n", 460 | "...costs 3.02 seconds\n", 461 | "processing file: 20170430_1_play.log\n", 462 | "...costs 2.43 seconds\n", 463 | "processing file: 20170407_1_play.log\n", 464 | "...costs 2.90 
seconds\n", 465 | "processing file: 20170503_2_play.log\n", 466 | "...costs 1.95 seconds\n", 467 | "processing file: 20170503_3_play.log\n", 468 | "...costs 1.95 seconds\n", 469 | "processing file: 20170417_3_play.log\n", 470 | "...costs 2.40 seconds\n", 471 | "processing file: 20170417_2_play.log\n", 472 | "...costs 2.43 seconds\n", 473 | "processing file: 20170420_2_play.log\n", 474 | "...costs 2.32 seconds\n", 475 | "processing file: 20170420_3_play.log\n", 476 | "...costs 2.18 seconds\n", 477 | "processing file: 20170422_1_play.log\n", 478 | "...costs 2.57 seconds\n", 479 | "processing file: 20170415_1_play.log\n", 480 | "...costs 2.78 seconds\n", 481 | "processing file: 20170501_1_play.log\n", 482 | "...costs 2.41 seconds\n", 483 | "processing file: 20170419_3_play.log\n", 484 | "...costs 2.56 seconds\n", 485 | "processing file: 20170419_2_play.log\n", 486 | "...costs 2.61 seconds\n", 487 | "processing file: 20170409_1_play.log\n", 488 | "...costs 3.09 seconds\n", 489 | "processing file: 20170405_3_play.log\n", 490 | "...costs 3.17 seconds\n", 491 | "processing file: 20170405_2_play.log\n", 492 | "...costs 3.11 seconds\n", 493 | "processing file: 20170309_1_play.log\n", 494 | "...costs 2.89 seconds\n", 495 | "processing file: 20170511_2_play.log\n", 496 | "...costs 1.89 seconds\n", 497 | "processing file: 20170511_3_play.log\n", 498 | "...costs 1.89 seconds\n", 499 | "processing file: 20170423_2_play.log\n", 500 | "...costs 2.45 seconds\n", 501 | "processing file: 20170423_3_play.log\n", 502 | "...costs 2.47 seconds\n", 503 | "processing file: 20170414_3_play.log\n", 504 | "...costs 1.81 seconds\n", 505 | "processing file: 20170414_2_play.log\n", 506 | "...costs 1.89 seconds\n", 507 | "processing file: 20170418_1_play.log\n", 508 | "...costs 2.35 seconds\n", 509 | "processing file: 20170301_play.log\n", 510 | "...costs 8.80 seconds\n", 511 | "processing file: 20170408_3_play.log\n", 512 | "...costs 3.43 seconds\n", 513 | "processing file: 
20170408_2_play.log\n", 514 | "...costs 3.13 seconds\n", 515 | "processing file: 20170304_1_play.log\n", 516 | "...costs 4.05 seconds\n", 517 | "processing file: 20170510_1_play.log\n", 518 | "...costs 2.04 seconds\n", 519 | "processing file: 20170404_1_play.log\n", 520 | "...costs 4.52 seconds\n", 521 | "processing file: 20170411_2_play.log\n", 522 | "...costs 2.94 seconds\n", 523 | "processing file: 20170426_2_play.log\n", 524 | "...costs 2.28 seconds\n", 525 | "processing file: 20170426_3_play.log\n", 526 | "...costs 2.49 seconds\n", 527 | "processing file: 20170505_2_play.log\n", 528 | "...costs 2.09 seconds\n", 529 | "processing file: 20170505_3_play.log\n", 530 | "...costs 1.88 seconds\n", 531 | "processing file: 20170509_1_play.log\n", 532 | "...costs 2.10 seconds\n", 533 | "processing file: 20170401_1_play.log\n", 534 | "...costs 3.91 seconds\n", 535 | "processing file: 20170504_1_play.log\n", 536 | "...costs 2.01 seconds\n", 537 | "processing file: 20170410_1_play.log\n", 538 | "...costs 2.69 seconds\n", 539 | "processing file: 20170427_1_play.log\n", 540 | "...costs 2.17 seconds\n", 541 | "processing file: 20170508_2_play.log\n", 542 | "...costs 1.90 seconds\n", 543 | "processing file: 20170508_3_play.log\n", 544 | "...costs 1.90 seconds\n", 545 | "processing file: 20170330_3_play.log\n", 546 | "...costs 12.15 seconds\n", 547 | "processing file: 20170430_3_play.log\n", 548 | "...costs 2.70 seconds\n", 549 | "processing file: 20170430_2_play.log\n", 550 | "...costs 2.59 seconds\n", 551 | "processing file: 20170407_2_play.log\n", 552 | "...costs 2.81 seconds\n", 553 | "processing file: 20170407_3_play.log\n", 554 | "...costs 2.87 seconds\n", 555 | "processing file: 20170417_1_play.log\n", 556 | "...costs 2.49 seconds\n", 557 | "processing file: 20170420_1_play.log\n", 558 | "...costs 2.35 seconds\n", 559 | "processing file: 20170503_1_play.log\n", 560 | "...costs 1.98 seconds\n", 561 | "processing file: 20170331_1_play.log\n", 562 | "...costs 4.26 
seconds\n", 563 | "processing file: 20170306_1_play.log\n", 564 | "...costs 3.23 seconds\n", 565 | "processing file: 20170512_1_play.log\n", 566 | "...costs 1.97 seconds\n", 567 | "processing file: 20170406_1_play.log\n", 568 | "...costs 3.00 seconds\n", 569 | "processing file: 20170502_3_play.log\n", 570 | "...costs 1.86 seconds\n", 571 | "processing file: 20170502_2_play.log\n", 572 | "...costs 1.87 seconds\n", 573 | "processing file: 20170416_2_play.log\n", 574 | "...costs 2.72 seconds\n", 575 | "processing file: 20170416_3_play.log\n", 576 | "...costs 2.72 seconds\n", 577 | "processing file: 20170421_3_play.log\n", 578 | "...costs 2.26 seconds\n", 579 | "processing file: 20170421_2_play.log\n", 580 | "...costs 2.30 seconds\n", 581 | "processing file: 20170303_1_play.log\n", 582 | "...costs 4.44 seconds\n", 583 | "processing file: 20170403_1_play.log\n", 584 | "...costs 4.09 seconds\n", 585 | "processing file: 20170424_3_play.log\n", 586 | "...costs 2.11 seconds\n", 587 | "processing file: 20170424_2_play.log\n", 588 | "...costs 0.34 seconds\n", 589 | "processing file: 20170413_2_play.log\n", 590 | "...costs 2.58 seconds\n", 591 | "processing file: 20170413_3_play.log\n", 592 | "...costs 2.67 seconds\n", 593 | "processing file: 20170507_3_play.log\n", 594 | "...costs 2.08 seconds\n", 595 | "processing file: 20170507_2_play.log\n", 596 | "...costs 2.16 seconds\n", 597 | "processing file: 20170428_1_play.log\n", 598 | "...costs 2.14 seconds\n", 599 | "processing file: 20170339_1_play.log\n", 600 | "...costs 5.11 seconds\n", 601 | "processing file: 20170402_2_play.log\n", 602 | "...costs 4.55 seconds\n", 603 | "processing file: 20170402_3_play.log\n", 604 | "...costs 4.52 seconds\n", 605 | "processing file: 20170506_1_play.log\n", 606 | "...costs 2.29 seconds\n", 607 | "processing file: 20170425_1_play.log\n", 608 | "...costs 2.27 seconds\n", 609 | "processing file: 20170412_1_play.log\n", 610 | "...costs 2.61 seconds\n", 611 | "processing file: 
20170429_3_play.log\n", 612 | "...costs 2.68 seconds\n", 613 | "processing file: 20170429_2_play.log\n", 614 | "...costs 2.60 seconds\n" 615 | ] 616 | } 617 | ], 618 | "source": [ 619 | "import time\n", 620 | "\n", 621 | "for each_file in files:\n", 622 | " current_time = time.clock()\n", 623 | "\n", 624 | " with open(each_file, 'r') as f:\n", 625 | " lines = f.readlines()\n", 626 | " filename = f.name.split('/')[-1]\n", 627 | " print('processing file: %s' % filename)\n", 628 | " #choose the output path\n", 629 | " if filename < cutoff:\n", 630 | " output_path = first_period_log\n", 631 | " else:\n", 632 | " output_path = second_period_log\n", 633 | " # write to the output file\n", 634 | " with open(output_path, 'a') as output:\n", 635 | " for line in lines:\n", 636 | " fields_to_keep = line.strip('\\n').split('\\t')[:2]\n", 637 | " fields_to_keep.append(filename)\n", 638 | " output.write('\\t'.join(fields_to_keep)+'\\n')\n", 639 | " print('...costs %.2f seconds' % (time.clock()-current_time))" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 11, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/html": [ 650 | "
\n", 651 | "\n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | "
uiddevicefile_name
0 168335198 ar 20170410_2_play.log
1 168310452 ar 20170410_2_play.log
2 168308159 ar 20170410_2_play.log
3 168071768 ar 20170410_2_play.log
4 168480816 ar 20170410_2_play.log
\n", 693 | "
" 694 | ], 695 | "text/plain": [ 696 | " uid device file_name\n", 697 | "0 168335198 ar 20170410_2_play.log\n", 698 | "1 168310452 ar 20170410_2_play.log\n", 699 | "2 168308159 ar 20170410_2_play.log\n", 700 | "3 168071768 ar 20170410_2_play.log\n", 701 | "4 168480816 ar 20170410_2_play.log" 702 | ] 703 | }, 704 | "execution_count": 11, 705 | "metadata": {}, 706 | "output_type": "execute_result" 707 | } 708 | ], 709 | "source": [ 710 | "import pandas as pd\n", 711 | "\n", 712 | "schema = ['uid','device','file_name']\n", 713 | "df_1 = pd.read_csv(first_period_log,delimiter='\\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})\n", 714 | "df_1.head()" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 12, 720 | "metadata": {}, 721 | "outputs": [ 722 | { 723 | "name": "stdout", 724 | "output_type": "stream", 725 | "text": [ 726 | "\n", 727 | "Int64Index: 321455544 entries, 0 to 321455543\n", 728 | "Data columns (total 3 columns):\n", 729 | "uid object\n", 730 | "device object\n", 731 | "file_name object\n", 732 | "dtypes: object(3)\n", 733 | "memory usage: 9.6+ GB\n" 734 | ] 735 | } 736 | ], 737 | "source": [ 738 | "df_1.info()" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 13, 744 | "metadata": { 745 | "collapsed": true 746 | }, 747 | "outputs": [], 748 | "source": [ 749 | "# change file_name to date\n", 750 | "def get_date(file_name):\n", 751 | " tmp_list = str(file_name).split('_')\n", 752 | " return tmp_list[0]\n", 753 | "df_1['date'] = df_1['file_name'].map(get_date)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 14, 759 | "metadata": { 760 | "collapsed": true 761 | }, 762 | "outputs": [], 763 | "source": [ 764 | "df_1 = df_1.drop(['file_name'], axis = 1)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": { 771 | "collapsed": true 772 | }, 773 | "outputs": [], 774 | "source": [] 775 | }, 776 | { 777 | 
"cell_type": "code", 778 | "execution_count": null, 779 | "metadata": { 780 | "collapsed": true 781 | }, 782 | "outputs": [], 783 | "source": [] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "### Explore the data" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 18, 795 | "metadata": {}, 796 | "outputs": [ 797 | { 798 | "data": { 799 | "text/plain": [ 800 | "847330" 801 | ] 802 | }, 803 | "execution_count": 18, 804 | "metadata": {}, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "len(df_1['uid'].unique())" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 19, 815 | "metadata": {}, 816 | "outputs": [ 817 | { 818 | "data": { 819 | "text/plain": [ 820 | "array(['ar', 'ip', 'mc', 'wp', 'ar ', 'ip ', '20170302_1_play.log',\n", 821 | " '168589573', '20170301_play.log', nan, '20170303_1_play.log',\n", 822 | " '20170339_1_play.log'], dtype=object)" 823 | ] 824 | }, 825 | "execution_count": 19, 826 | "metadata": {}, 827 | "output_type": "execute_result" 828 | } 829 | ], 830 | "source": [ 831 | "df_1['device'].unique()" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": 21, 837 | "metadata": { 838 | "collapsed": true 839 | }, 840 | "outputs": [], 841 | "source": [ 842 | "# len(df_1['file_name'].unique())" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 22, 848 | "metadata": {}, 849 | "outputs": [ 850 | { 851 | "data": { 852 | "text/plain": [ 853 | "1685126 11778180\n", 854 | "37025504 8535228\n", 855 | "751824 6796068\n", 856 | "1791497 5987916\n", 857 | "497685 4519674\n", 858 | "1062806 3776580\n", 859 | "736305 2829009\n", 860 | "1685126 1884981\n", 861 | "0 1815633\n", 862 | "37025504 1381125\n", 863 | "1749320 1207488\n", 864 | "1679121 784674\n", 865 | "46532274 756681\n", 866 | "28638487 634440\n", 867 | "637650 350460\n", 868 | "...\n", 869 | "167679654 3\n", 870 | "168963526 
3\n", 871 | "168327556 3\n", 872 | "154699061 3\n", 873 | "168963528 3\n", 874 | "168761341 3\n", 875 | "154652167 3\n", 876 | "168686496 3\n", 877 | "154828622 3\n", 878 | "154494259 3\n", 879 | "154502301 3\n", 880 | "168280933 3\n", 881 | "154426629 3\n", 882 | "167932419 3\n", 883 | "168891406 3\n", 884 | "Length: 847329, dtype: int64" 885 | ] 886 | }, 887 | "execution_count": 22, 888 | "metadata": {}, 889 | "output_type": "execute_result" 890 | } 891 | ], 892 | "source": [ 893 | "df_1.uid.value_counts()" 894 | ] 895 | }, 896 | { 897 | "cell_type": "markdown", 898 | "metadata": {}, 899 | "source": [ 900 | "It looks like uid = 0 is a test ID, and the uids with more log entries than uid = 0 may be bots. Check the device type of these IDs; these user IDs will be deleted later. Note: every count shown above is a multiple of 3 and the minimum is 3, which suggests the reduced log was written three times (the write loop opens its output file in append mode); confirm this before relying on the >= 3 threshold below.\n" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": {}, 906 | "source": [ 907 | "### Criteria of active user: number of activities before cutoff date >= 3\n", 908 | "### Criteria of churn user: active users that have no activity after cutoff date\n", 909 | "### Criteria of loyal user: a user who has >= 3 activities before the cutoff date and has activity after the cutoff date" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 23, 915 | "metadata": {}, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/plain": [ 920 | "(847329, 0)" 921 | ] 922 | }, 923 | "execution_count": 23, 924 | "metadata": {}, 925 | "output_type": "execute_result" 926 | } 927 | ], 928 | "source": [ 929 | "# total number of active users and inactive users before the cutoff date\n", 930 | "active = df_1.uid.value_counts()>=3\n", 931 | "sum(active),sum(active==0)" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 24, 937 | "metadata": { 938 | "collapsed": true 939 | }, 940 | "outputs": [], 941 | "source": [ 942 | "active_users = [active.index[i] for i in xrange(len(active)) if active[i]]" 943 | ] 944 | }, 945 | { 946 | "cell_type": 
"code", 947 | "execution_count": 25, 948 | "metadata": {}, 949 | "outputs": [ 950 | { 951 | "data": { 952 | "text/plain": [ 953 | "847329" 954 | ] 955 | }, 956 | "execution_count": 25, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "len(active_users)" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 26, 968 | "metadata": { 969 | "collapsed": true 970 | }, 971 | "outputs": [], 972 | "source": [ 973 | "active_set = set(active_users)" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 27, 979 | "metadata": {}, 980 | "outputs": [ 981 | { 982 | "data": { 983 | "text/plain": [ 984 | "847329" 985 | ] 986 | }, 987 | "execution_count": 27, 988 | "metadata": {}, 989 | "output_type": "execute_result" 990 | } 991 | ], 992 | "source": [ 993 | "len(active_set)" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": 28, 999 | "metadata": {}, 1000 | "outputs": [ 1001 | { 1002 | "data": { 1003 | "text/html": [ 1004 | "
\n", 1005 | "\n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | "
uiddevicefile_name
0 169026646 ar 20170427_3_play.log
1 168553991 ar 20170427_3_play.log
2 1685126 ar 20170427_3_play.log
3 168845172 ar 20170427_3_play.log
4 168538454 ar 20170427_3_play.log
\n", 1047 | "
" 1048 | ], 1049 | "text/plain": [ 1050 | " uid device file_name\n", 1051 | "0 169026646 ar 20170427_3_play.log\n", 1052 | "1 168553991 ar 20170427_3_play.log\n", 1053 | "2 1685126 ar 20170427_3_play.log\n", 1054 | "3 168845172 ar 20170427_3_play.log\n", 1055 | "4 168538454 ar 20170427_3_play.log" 1056 | ] 1057 | }, 1058 | "execution_count": 28, 1059 | "metadata": {}, 1060 | "output_type": "execute_result" 1061 | } 1062 | ], 1063 | "source": [ 1064 | "# Now process the recent play.log file to get recent users.\n", 1065 | "df_2 = pd.read_csv(second_period_log,delimiter='\\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})\n", 1066 | "df_2.head()" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "execution_count": 29, 1072 | "metadata": {}, 1073 | "outputs": [ 1074 | { 1075 | "name": "stdout", 1076 | "output_type": "stream", 1077 | "text": [ 1078 | "\n", 1079 | "Int64Index: 172407126 entries, 0 to 172407125\n", 1080 | "Data columns (total 3 columns):\n", 1081 | "uid object\n", 1082 | "device object\n", 1083 | "file_name object\n", 1084 | "dtypes: object(3)\n", 1085 | "memory usage: 5.1+ GB\n" 1086 | ] 1087 | } 1088 | ], 1089 | "source": [ 1090 | "df_2.info()\n" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": 30, 1096 | "metadata": { 1097 | "collapsed": true 1098 | }, 1099 | "outputs": [], 1100 | "source": [ 1101 | "active_recent = df_2.uid.value_counts()" 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": 31, 1107 | "metadata": {}, 1108 | "outputs": [ 1109 | { 1110 | "data": { 1111 | "text/plain": [ 1112 | "273222" 1113 | ] 1114 | }, 1115 | "execution_count": 31, 1116 | "metadata": {}, 1117 | "output_type": "execute_result" 1118 | } 1119 | ], 1120 | "source": [ 1121 | "len(active_recent) " 1122 | ] 1123 | }, 1124 | { 1125 | "cell_type": "code", 1126 | "execution_count": 32, 1127 | "metadata": {}, 1128 | "outputs": [ 1129 | { 1130 | "data": { 1131 | "text/plain": [ 1132 | 
"numpy.int64" 1133 | ] 1134 | }, 1135 | "execution_count": 32, 1136 | "metadata": {}, 1137 | "output_type": "execute_result" 1138 | } 1139 | ], 1140 | "source": [ 1141 | "type(active_recent[0])" 1142 | ] 1143 | }, 1144 | { 1145 | "cell_type": "code", 1146 | "execution_count": 33, 1147 | "metadata": {}, 1148 | "outputs": [ 1149 | { 1150 | "data": { 1151 | "text/plain": [ 1152 | "273222" 1153 | ] 1154 | }, 1155 | "execution_count": 33, 1156 | "metadata": {}, 1157 | "output_type": "execute_result" 1158 | } 1159 | ], 1160 | "source": [ 1161 | "active_set_recent = set(active_recent.index)\n", 1162 | "len(active_set_recent)" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": 34, 1168 | "metadata": {}, 1169 | "outputs": [ 1170 | { 1171 | "data": { 1172 | "text/plain": [ 1173 | "598465" 1174 | ] 1175 | }, 1176 | "execution_count": 34, 1177 | "metadata": {}, 1178 | "output_type": "execute_result" 1179 | } 1180 | ], 1181 | "source": [ 1182 | "# Churn user set:\n", 1183 | "churn_set = active_set - active_set_recent\n", 1184 | "len(churn_set)" 1185 | ] 1186 | }, 1187 | { 1188 | "cell_type": "code", 1189 | "execution_count": 35, 1190 | "metadata": {}, 1191 | "outputs": [ 1192 | { 1193 | "data": { 1194 | "text/plain": [ 1195 | "248864" 1196 | ] 1197 | }, 1198 | "execution_count": 35, 1199 | "metadata": {}, 1200 | "output_type": "execute_result" 1201 | } 1202 | ], 1203 | "source": [ 1204 | "# Loyal user set:\n", 1205 | "loyal_set = active_set & active_set_recent\n", 1206 | "len(loyal_set)" 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "markdown", 1211 | "metadata": {}, 1212 | "source": [ 1213 | "### Down sample and save reduced dataframe" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "code", 1218 | "execution_count": 36, 1219 | "metadata": { 1220 | "collapsed": true 1221 | }, 1222 | "outputs": [], 1223 | "source": [ 1224 | "import random\n", 1225 | "\n", 1226 | "random.seed(42)" 1227 | ] 1228 | }, 1229 | { 1230 | "cell_type": "code", 1231 | 
"execution_count": 37, 1232 | "metadata": {}, 1233 | "outputs": [ 1234 | { 1235 | "data": { 1236 | "text/plain": [ 1237 | "24886" 1238 | ] 1239 | }, 1240 | "execution_count": 37, 1241 | "metadata": {}, 1242 | "output_type": "execute_result" 1243 | } 1244 | ], 1245 | "source": [ 1246 | "loyal_sample = random.sample(loyal_set,len(loyal_set)/10)\n", 1247 | "len(loyal_sample)" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | "execution_count": 38, 1253 | "metadata": {}, 1254 | "outputs": [ 1255 | { 1256 | "data": { 1257 | "text/plain": [ 1258 | "59846" 1259 | ] 1260 | }, 1261 | "execution_count": 38, 1262 | "metadata": {}, 1263 | "output_type": "execute_result" 1264 | } 1265 | ], 1266 | "source": [ 1267 | "churn_sample = random.sample(churn_set,len(churn_set)/10)\n", 1268 | "len(churn_sample)" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": 39, 1274 | "metadata": { 1275 | "collapsed": true 1276 | }, 1277 | "outputs": [], 1278 | "source": [ 1279 | "churn_sample_list = list(churn_sample)" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": 40, 1285 | "metadata": { 1286 | "collapsed": true 1287 | }, 1288 | "outputs": [], 1289 | "source": [ 1290 | "loyal_sample_list = list(loyal_sample)" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "code", 1295 | "execution_count": 41, 1296 | "metadata": { 1297 | "collapsed": true 1298 | }, 1299 | "outputs": [], 1300 | "source": [ 1301 | "outfile = open(\"/Users/ZhijingYe/Desktop/data/output/churn_sample_list.pkl\",\"w\") " 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": 42, 1307 | "metadata": { 1308 | "collapsed": true 1309 | }, 1310 | "outputs": [], 1311 | "source": [ 1312 | "import numpy as np\n", 1313 | "np.save(\"/Users/ZhijingYe/Desktop/data/output/churn_sample_list\",churn_sample_list)" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "execution_count": 43, 1319 | "metadata": { 1320 | "collapsed": true 
1321 | }, 1322 | "outputs": [], 1323 | "source": [ 1324 | "np.save(\"/Users/ZhijingYe/Desktop/data/output/loyal_sample_list\",loyal_sample_list)" 1325 | ] 1326 | }, 1327 | { 1328 | "cell_type": "code", 1329 | "execution_count": null, 1330 | "metadata": { 1331 | "collapsed": true 1332 | }, 1333 | "outputs": [], 1334 | "source": [] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": null, 1339 | "metadata": { 1340 | "collapsed": true 1341 | }, 1342 | "outputs": [], 1343 | "source": [] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": null, 1348 | "metadata": { 1349 | "collapsed": true 1350 | }, 1351 | "outputs": [], 1352 | "source": [] 1353 | }, 1354 | { 1355 | "cell_type": "code", 1356 | "execution_count": null, 1357 | "metadata": { 1358 | "collapsed": true 1359 | }, 1360 | "outputs": [], 1361 | "source": [] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": null, 1366 | "metadata": { 1367 | "collapsed": true 1368 | }, 1369 | "outputs": [], 1370 | "source": [] 1371 | }, 1372 | { 1373 | "cell_type": "code", 1374 | "execution_count": 44, 1375 | "metadata": { 1376 | "collapsed": true 1377 | }, 1378 | "outputs": [], 1379 | "source": [ 1380 | "df_churn = df_1.loc[df_1.uid.isin(churn_sample),:]" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "code", 1385 | "execution_count": 45, 1386 | "metadata": {}, 1387 | "outputs": [ 1388 | { 1389 | "data": { 1390 | "text/plain": [ 1391 | "(9814302, 3)" 1392 | ] 1393 | }, 1394 | "execution_count": 45, 1395 | "metadata": {}, 1396 | "output_type": "execute_result" 1397 | } 1398 | ], 1399 | "source": [ 1400 | "df_churn.shape" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": 46, 1406 | "metadata": { 1407 | "collapsed": true 1408 | }, 1409 | "outputs": [], 1410 | "source": [ 1411 | "df_loyal_log = df_1.loc[df_1.uid.isin(loyal_sample),:]" 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": 47, 1417 | "metadata": 
{}, 1418 | "outputs": [ 1419 | { 1420 | "data": { 1421 | "text/plain": [ 1422 | "(35070042, 3)" 1423 | ] 1424 | }, 1425 | "execution_count": 47, 1426 | "metadata": {}, 1427 | "output_type": "execute_result" 1428 | } 1429 | ], 1430 | "source": [ 1431 | "df_loyal_log.shape" 1432 | ] 1433 | }, 1434 | { 1435 | "cell_type": "code", 1436 | "execution_count": 48, 1437 | "metadata": { 1438 | "collapsed": true 1439 | }, 1440 | "outputs": [], 1441 | "source": [ 1442 | "df_churn.to_csv('/Users/ZhijingYe/Desktop/data/output/churn_df_sample.csv',sep='\\t', encoding='utf-8')\n", 1443 | "df_loyal_log.to_csv('/Users/ZhijingYe/Desktop/data/output/loyal_df_sample.csv',sep='\\t', encoding='utf-8')" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "markdown", 1448 | "metadata": {}, 1449 | "source": [] 1450 | }, 1451 | { 1452 | "cell_type": "code", 1453 | "execution_count": 49, 1454 | "metadata": { 1455 | "collapsed": true 1456 | }, 1457 | "outputs": [], 1458 | "source": [ 1459 | "import glob" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": 50, 1465 | "metadata": {}, 1466 | "outputs": [ 1467 | { 1468 | "data": { 1469 | "text/plain": [ 1470 | "138" 1471 | ] 1472 | }, 1473 | "execution_count": 50, 1474 | "metadata": {}, 1475 | "output_type": "execute_result" 1476 | } 1477 | ], 1478 | "source": [ 1479 | "filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'\n", 1480 | "files = glob.glob(filepath)\n", 1481 | "# amount of files\n", 1482 | "len(files)" 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "code", 1487 | "execution_count": 51, 1488 | "metadata": { 1489 | "collapsed": true 1490 | }, 1491 | "outputs": [], 1492 | "source": [ 1493 | "schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name','label']" 1494 | ] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": 52, 1499 | "metadata": { 1500 | "collapsed": true 1501 | }, 1502 | "outputs": [], 1503 | "source": [ 1504 | 
"output = open('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log','a')" 1505 | ] 1506 | }, 1507 | { 1508 | "cell_type": "code", 1509 | "execution_count": null, 1510 | "metadata": {}, 1511 | "outputs": [ 1512 | { 1513 | "name": "stdout", 1514 | "output_type": "stream", 1515 | "text": [ 1516 | "processing file: 20170410_2_play.log\n" 1517 | ] 1518 | } 1519 | ], 1520 | "source": [ 1521 | "import time\n", 1522 | "\n", 1523 | "\n", 1524 | "for the_file in files:\n", 1525 | " current_time = time.clock()\n", 1526 | "\n", 1527 | " with open(the_file, 'r') as f:\n", 1528 | " lines = f.readlines()\n", 1529 | " file_name = f.name.split('/')[-1]\n", 1530 | " print('processing file: %s' % file_name)\n", 1531 | " for line in lines:\n", 1532 | " user_id = line.strip('\\n').split('\\t')[0]\n", 1533 | " if user_id in churn_sample:\n", 1534 | " contents_to_wirte = line.strip('\\n').split('\\t')\n", 1535 | " contents_to_wirte.extend((file_name, '1'))\n", 1536 | " elif user_id in loyal_sample:\n", 1537 | " contents_to_wirte = line.strip('\\n').split('\\t')\n", 1538 | " contents_to_wirte.extend((file_name, '0'))\n", 1539 | " else:\n", 1540 | " continue \n", 1541 | " output.write('\\t'.join(contents_to_wirte)+'\\n')\n", 1542 | " print('...costs %.2f seconds' % (time.clock()-current_time))" 1543 | ] 1544 | }, 1545 | { 1546 | "cell_type": "code", 1547 | "execution_count": null, 1548 | "metadata": { 1549 | "collapsed": true 1550 | }, 1551 | "outputs": [], 1552 | "source": [ 1553 | "output.close()" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "code", 1558 | "execution_count": null, 1559 | "metadata": { 1560 | "collapsed": true 1561 | }, 1562 | "outputs": [], 1563 | "source": [ 1564 | "df_play = pd.read_csv('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log',\n", 1565 | " delimiter='\\t',header=None,index_col=None,names = schema,\n", 1566 | " dtype = {'uid':'str', 'song_id':'str','song_type' : 'str'})\n", 1567 | "df_play.head()" 1568 | ] 1569 | }, 1570 | { 1571 | 
"cell_type": "code", 1572 | "execution_count": null, 1573 | "metadata": { 1574 | "collapsed": true 1575 | }, 1576 | "outputs": [], 1577 | "source": [ 1578 | "df_play.info()" 1579 | ] 1580 | } 1581 | ], 1582 | "metadata": { 1583 | "kernelspec": { 1584 | "display_name": "Python 3", 1585 | "language": "python", 1586 | "name": "python3" 1587 | }, 1588 | "language_info": { 1589 | "codemirror_mode": { 1590 | "name": "ipython", 1591 | "version": 3 1592 | }, 1593 | "file_extension": ".py", 1594 | "mimetype": "text/x-python", 1595 | "name": "python", 1596 | "nbconvert_exporter": "python", 1597 | "pygments_lexer": "ipython3", 1598 | "version": "3.6.2" 1599 | } 1600 | }, 1601 | "nbformat": 4, 1602 | "nbformat_minor": 2 1603 | } 1604 | -------------------------------------------------------------------------------- /music_box/4Cleansing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# Load saved list for churn and loyal sample users: churn_sample_list, loyal_sample_list\n", 28 | "churn_list = np.load('/Users/ZhijingYe/Desktop/data/output/churn_sample_list.npy').tolist()\n", 29 | "loyal_list = np.load('/Users/ZhijingYe/Desktop/data/output/loyal_sample_list.npy').tolist()\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "(59846, 24886)" 41 | ] 42 | }, 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | 
"source": [ 49 | "len(churn_list), len(loyal_list)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "set()" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "set(churn_list) & set(loyal_list)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "59846" 81 | ] 82 | }, 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "len(set(churn_list) - set(loyal_list))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Read in the other columns line by line by matching uid with user_set" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "import glob" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "138" 119 | ] 120 | }, 121 | "execution_count": 7, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'\n", 128 | "files = glob.glob(filepath)\n", 129 | "# amount of files\n", 130 | "len(files)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 8, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name','label']" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 9, 147 | "metadata": { 148 | 
"collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "output = open('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log','a')" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 10, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "churn_set = set(churn_list)\n", 164 | "loyal_set = set(loyal_list)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "processing file: 20170410_2_play.log\n", 177 | "...costs 1.45 seconds\n", 178 | "processing file: 20170410_3_play.log\n", 179 | "...costs 1.54 seconds\n", 180 | "processing file: 20170427_3_play.log\n", 181 | "...costs 1.64 seconds\n", 182 | "processing file: 20170427_2_play.log\n", 183 | "...costs 1.22 seconds\n", 184 | "processing file: 20170504_3_play.log\n", 185 | "...costs 1.15 seconds\n", 186 | "processing file: 20170504_2_play.log\n", 187 | "...costs 1.12 seconds\n", 188 | "processing file: 20170508_1_play.log\n", 189 | "...costs 1.19 seconds\n", 190 | "processing file: 20170505_1_play.log\n", 191 | "...costs 1.14 seconds\n", 192 | "processing file: 20170411_1_play.log\n", 193 | "...costs 1.51 seconds\n", 194 | "processing file: 20170426_1_play.log\n", 195 | "...costs 1.25 seconds\n", 196 | "processing file: 20170509_3_play.log\n", 197 | "...costs 1.06 seconds\n", 198 | "processing file: 20170509_2_play.log\n", 199 | "...costs 1.08 seconds\n", 200 | "processing file: 20170401_2_play.log\n", 201 | "...costs 2.43 seconds\n", 202 | "processing file: 20170401_3_play.log\n", 203 | "...costs 2.74 seconds\n", 204 | "processing file: 20170423_1_play.log\n", 205 | "...costs 1.46 seconds\n", 206 | "processing file: 20170414_1_play.log\n", 207 | "...costs 1.03 seconds\n", 208 | "processing file: 20170418_2_play.log\n", 209 | "...costs 1.30 seconds\n", 210 | "processing 
file: 20170418_3_play.log\n", 211 | "...costs 1.28 seconds\n", 212 | "processing file: 20170408_1_play.log\n", 213 | "...costs 1.73 seconds\n", 214 | "processing file: 20170404_2_play.log\n", 215 | "...costs 2.19 seconds\n", 216 | "processing file: 20170404_3_play.log\n", 217 | "...costs 2.16 seconds\n", 218 | "processing file: 20170510_3_play.log\n", 219 | "...costs 1.11 seconds\n", 220 | "processing file: 20170308_1_play.log\n", 221 | "...costs 1.48 seconds\n", 222 | "processing file: 20170510_2_play.log\n", 223 | "...costs 1.12 seconds\n", 224 | "processing file: 20170501_3_play.log\n", 225 | "...costs 1.35 seconds\n", 226 | "processing file: 20170501_2_play.log\n", 227 | "...costs 1.33 seconds\n", 228 | "processing file: 20170422_3_play.log\n", 229 | "...costs 1.49 seconds\n", 230 | "processing file: 20170422_2_play.log\n", 231 | "...costs 1.44 seconds\n", 232 | "processing file: 20170415_2_play.log\n", 233 | "...costs 1.66 seconds\n", 234 | "processing file: 20170415_3_play.log\n", 235 | "...costs 1.65 seconds\n", 236 | "processing file: 20170419_1_play.log\n", 237 | "...costs 1.41 seconds\n", 238 | "processing file: 20170409_2_play.log\n", 239 | "...costs 1.82 seconds\n", 240 | "processing file: 20170409_3_play.log\n", 241 | "...costs 1.81 seconds\n", 242 | "processing file: 20170305_1_play.log\n", 243 | "...costs 2.02 seconds\n", 244 | "processing file: 20170511_1_play.log\n", 245 | "...costs 1.12 seconds\n", 246 | "processing file: 20170405_1_play.log\n", 247 | "...costs 1.64 seconds\n", 248 | "processing file: 20170302_1_play.log\n", 249 | "...costs 3.06 seconds\n", 250 | "processing file: 20170402_1_play.log\n", 251 | "...costs 2.36 seconds\n", 252 | "processing file: 20170425_2_play.log\n", 253 | "...costs 1.22 seconds\n", 254 | "processing file: 20170425_3_play.log\n", 255 | "...costs 1.22 seconds\n", 256 | "processing file: 20170412_3_play.log\n", 257 | "...costs 1.54 seconds\n", 258 | "processing file: 20170412_2_play.log\n", 259 | "...costs 1.51 
seconds\n", 260 | "processing file: 20170506_2_play.log\n", 261 | "...costs 1.29 seconds\n", 262 | "processing file: 20170506_3_play.log\n", 263 | "...costs 1.29 seconds\n", 264 | "processing file: 20170429_1_play.log\n", 265 | "...costs 1.39 seconds\n", 266 | "processing file: 20170403_2_play.log\n", 267 | "...costs 2.37 seconds\n", 268 | "processing file: 20170507_1_play.log\n", 269 | "...costs 1.24 seconds\n", 270 | "processing file: 20170424_1_play.log\n", 271 | "...costs 0.77 seconds\n", 272 | "processing file: 20170413_1_play.log\n", 273 | "...costs 1.43 seconds\n", 274 | "processing file: 20170428_2_play.log\n", 275 | "...costs 1.28 seconds\n", 276 | "processing file: 20170428_3_play.log\n", 277 | "...costs 1.28 seconds\n", 278 | "processing file: 20170331_2_play.log\n", 279 | "...costs 2.72 seconds\n", 280 | "processing file: 20170331_3_play.log\n", 281 | "...costs 3.77 seconds\n", 282 | "processing file: 20170406_3_play.log\n", 283 | "...costs 1.80 seconds\n", 284 | "processing file: 20170406_2_play.log\n", 285 | "...costs 1.64 seconds\n", 286 | "processing file: 20170512_2_play.log\n", 287 | "...costs 1.11 seconds\n", 288 | "processing file: 20170512_3_play.log\n", 289 | "...costs 1.07 seconds\n", 290 | "processing file: 20170416_1_play.log\n", 291 | "...costs 1.54 seconds\n", 292 | "processing file: 20170421_1_play.log\n", 293 | "...costs 1.31 seconds\n", 294 | "processing file: 20170502_1_play.log\n", 295 | "...costs 1.10 seconds\n", 296 | "processing file: 20170307_1_play.log\n", 297 | "...costs 1.53 seconds\n", 298 | "processing file: 20170430_1_play.log\n", 299 | "...costs 1.35 seconds\n", 300 | "processing file: 20170407_1_play.log\n", 301 | "...costs 1.59 seconds\n", 302 | "processing file: 20170503_2_play.log\n", 303 | "...costs 1.13 seconds\n", 304 | "processing file: 20170503_3_play.log\n", 305 | "...costs 1.10 seconds\n", 306 | "processing file: 20170417_3_play.log\n", 307 | "...costs 1.43 seconds\n", 308 | "processing file: 
20170417_2_play.log\n", 309 | "...costs 1.43 seconds\n", 310 | "processing file: 20170420_2_play.log\n", 311 | "...costs 1.42 seconds\n", 312 | "processing file: 20170420_3_play.log\n", 313 | "...costs 1.45 seconds\n", 314 | "processing file: 20170422_1_play.log\n", 315 | "...costs 1.57 seconds\n", 316 | "processing file: 20170415_1_play.log\n", 317 | "...costs 1.69 seconds\n", 318 | "processing file: 20170501_1_play.log\n", 319 | "...costs 1.42 seconds\n", 320 | "processing file: 20170419_3_play.log\n", 321 | "...costs 1.40 seconds\n", 322 | "processing file: 20170419_2_play.log\n", 323 | "...costs 1.46 seconds\n", 324 | "processing file: 20170409_1_play.log\n", 325 | "...costs 1.82 seconds\n", 326 | "processing file: 20170405_3_play.log\n", 327 | "...costs 1.86 seconds\n", 328 | "processing file: 20170405_2_play.log\n", 329 | "...costs 1.72 seconds\n", 330 | "processing file: 20170309_1_play.log\n", 331 | "...costs 1.46 seconds\n", 332 | "processing file: 20170511_2_play.log\n", 333 | "...costs 1.11 seconds\n", 334 | "processing file: 20170511_3_play.log\n", 335 | "...costs 1.13 seconds\n", 336 | "processing file: 20170423_2_play.log\n", 337 | "...costs 1.46 seconds\n", 338 | "processing file: 20170423_3_play.log\n", 339 | "...costs 1.52 seconds\n", 340 | "processing file: 20170414_3_play.log\n", 341 | "...costs 1.13 seconds\n", 342 | "processing file: 20170414_2_play.log\n", 343 | "...costs 1.10 seconds\n", 344 | "processing file: 20170418_1_play.log\n", 345 | "...costs 1.46 seconds\n", 346 | "processing file: 20170301_play.log\n", 347 | "...costs 4.65 seconds\n", 348 | "processing file: 20170408_3_play.log\n", 349 | "...costs 2.06 seconds\n", 350 | "processing file: 20170408_2_play.log\n", 351 | "...costs 1.92 seconds\n", 352 | "processing file: 20170304_1_play.log\n", 353 | "...costs 2.24 seconds\n", 354 | "processing file: 20170510_1_play.log\n", 355 | "...costs 1.21 seconds\n", 356 | "processing file: 20170404_1_play.log\n", 357 | "...costs 2.25 seconds\n", 
358 | "processing file: 20170411_2_play.log\n", 359 | "...costs 1.60 seconds\n", 360 | "processing file: 20170426_2_play.log\n", 361 | "...costs 1.31 seconds\n", 362 | "processing file: 20170426_3_play.log\n", 363 | "...costs 1.25 seconds\n", 364 | "processing file: 20170505_2_play.log\n", 365 | "...costs 1.23 seconds\n", 366 | "processing file: 20170505_3_play.log\n", 367 | "...costs 1.19 seconds\n", 368 | "processing file: 20170509_1_play.log\n", 369 | "...costs 1.21 seconds\n", 370 | "processing file: 20170401_1_play.log\n", 371 | "...costs 2.38 seconds\n", 372 | "processing file: 20170504_1_play.log\n", 373 | "...costs 1.19 seconds\n", 374 | "processing file: 20170410_1_play.log\n", 375 | "...costs 1.55 seconds\n", 376 | "processing file: 20170427_1_play.log\n", 377 | "...costs 1.30 seconds\n", 378 | "processing file: 20170508_2_play.log\n", 379 | "...costs 1.15 seconds\n", 380 | "processing file: 20170508_3_play.log\n", 381 | "...costs 1.15 seconds\n", 382 | "processing file: 20170330_3_play.log\n", 383 | "...costs 6.34 seconds\n", 384 | "processing file: 20170430_3_play.log\n", 385 | "...costs 1.65 seconds\n", 386 | "processing file: 20170430_2_play.log\n", 387 | "...costs 1.45 seconds\n", 388 | "processing file: 20170407_2_play.log\n", 389 | "...costs 1.71 seconds\n", 390 | "processing file: 20170407_3_play.log\n", 391 | "...costs 1.82 seconds\n", 392 | "processing file: 20170417_1_play.log\n", 393 | "...costs 1.41 seconds\n", 394 | "processing file: 20170420_1_play.log\n", 395 | "...costs 1.37 seconds\n", 396 | "processing file: 20170503_1_play.log\n", 397 | "...costs 1.21 seconds\n", 398 | "processing file: 20170331_1_play.log\n", 399 | "...costs 2.58 seconds\n", 400 | "processing file: 20170306_1_play.log\n", 401 | "...costs 1.78 seconds\n", 402 | "processing file: 20170512_1_play.log\n", 403 | "...costs 1.23 seconds\n", 404 | "processing file: 20170406_1_play.log\n", 405 | "...costs 1.65 seconds\n", 406 | "processing file: 20170502_3_play.log\n", 407 | 
"...costs 1.09 seconds\n", 408 | "processing file: 20170502_2_play.log\n", 409 | "...costs 1.15 seconds\n", 410 | "processing file: 20170416_2_play.log\n", 411 | "...costs 1.71 seconds\n", 412 | "processing file: 20170416_3_play.log\n", 413 | "...costs 1.70 seconds\n", 414 | "processing file: 20170421_3_play.log\n", 415 | "...costs 1.41 seconds\n", 416 | "processing file: 20170421_2_play.log\n", 417 | "...costs 1.41 seconds\n", 418 | "processing file: 20170303_1_play.log\n", 419 | "...costs 2.44 seconds\n", 420 | "processing file: 20170403_1_play.log\n", 421 | "...costs 2.35 seconds\n", 422 | "processing file: 20170424_3_play.log\n", 423 | "...costs 1.31 seconds\n", 424 | "processing file: 20170424_2_play.log\n", 425 | "...costs 0.26 seconds\n", 426 | "processing file: 20170413_2_play.log\n", 427 | "...costs 1.51 seconds\n", 428 | "processing file: 20170413_3_play.log\n", 429 | "...costs 1.60 seconds\n", 430 | "processing file: 20170507_3_play.log\n", 431 | "...costs 1.32 seconds\n", 432 | "processing file: 20170507_2_play.log\n", 433 | "...costs 1.35 seconds\n", 434 | "processing file: 20170428_1_play.log\n", 435 | "...costs 1.28 seconds\n", 436 | "processing file: 20170339_1_play.log\n", 437 | "...costs 2.89 seconds\n", 438 | "processing file: 20170402_2_play.log\n", 439 | "...costs 2.67 seconds\n", 440 | "processing file: 20170402_3_play.log\n", 441 | "...costs 2.71 seconds\n", 442 | "processing file: 20170506_1_play.log\n", 443 | "...costs 1.41 seconds\n", 444 | "processing file: 20170425_1_play.log\n", 445 | "...costs 1.36 seconds\n", 446 | "processing file: 20170412_1_play.log\n", 447 | "...costs 1.64 seconds\n", 448 | "processing file: 20170429_3_play.log\n", 449 | "...costs 1.74 seconds\n", 450 | "processing file: 20170429_2_play.log\n", 451 | "...costs 1.60 seconds\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "import time\n", 457 | "\n", 458 | "\n", 459 | "for the_file in files:\n", 460 | " current_time = time.clock()\n", 461 | "\n", 462 | " with 
open(the_file, 'r') as f:\n", 463 | " lines = f.readlines()\n", 464 | " file_name = f.name.split('/')[-1]\n", 465 | " print('processing file: %s' % file_name)\n", 466 | " for line in lines:\n", 467 | " user_id = line.split('\\t')[0]\n", 468 | " if user_id in churn_set:\n", 469 | " contents_to_wirte = line.strip('\\n').split('\\t')\n", 470 | " contents_to_wirte.extend((file_name, '1'))\n", 471 | " elif user_id in loyal_set:\n", 472 | " contents_to_wirte = line.strip('\\n').split('\\t')\n", 473 | " contents_to_wirte.extend((file_name, '0'))\n", 474 | " else:\n", 475 | " continue \n", 476 | " output.write('\\t'.join(contents_to_wirte)+'\\n')\n", 477 | " print('...costs %.2f seconds' % (time.clock()-current_time))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 12, 483 | "metadata": { 484 | "collapsed": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "output.close()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 13, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "name": "stderr", 498 | "output_type": "stream", 499 | "text": [ 500 | "/Users/ZhijingYe/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (7,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", 501 | " data = self._reader.read(nrows)\n" 502 | ] 503 | }, 504 | { 505 | "data": { 506 | "text/html": [ 507 | "
\n", 508 | "\n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | "
uiddevicesong_idsong_typesong_namesingerplay_timesong_lengthpaid_flagfile_namelabel
0 168308107 ar 162455 0 最初的梦想 范玮琪 296 296 0 20170410_2_play.log 0
1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 0 0 20170410_2_play.log 0
2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 156 0 20170410_2_play.log 0
3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 0 0 20170410_2_play.log 0
4 168274411 ar 176292 0 爱不爱我 零点乐队 333 334 0 20170410_2_play.log 0
\n", 598 | "
" 599 | ], 600 | "text/plain": [ 601 | " uid device song_id song_type song_name singer play_time \\\n", 602 | "0 168308107 ar 162455 0 最初的梦想 范玮琪 296 \n", 603 | "1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 \n", 604 | "2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 \n", 605 | "3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 \n", 606 | "4 168274411 ar 176292 0 爱不爱我 零点乐队 333 \n", 607 | "\n", 608 | " song_length paid_flag file_name label \n", 609 | "0 296 0 20170410_2_play.log 0 \n", 610 | "1 0 0 20170410_2_play.log 0 \n", 611 | "2 156 0 20170410_2_play.log 0 \n", 612 | "3 0 0 20170410_2_play.log 0 \n", 613 | "4 334 0 20170410_2_play.log 0 " 614 | ] 615 | }, 616 | "execution_count": 13, 617 | "metadata": {}, 618 | "output_type": "execute_result" 619 | } 620 | ], 621 | "source": [ 622 | "df_play = pd.read_csv('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log',\n", 623 | " delimiter='\\t',header=None,index_col=None,names = schema,\n", 624 | " dtype = {'uid':'str', 'song_id':'str','song_type' : 'str'})\n", 625 | "df_play.head()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 14, 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "\n", 638 | "Int64Index: 39701302 entries, 0 to 39701301\n", 639 | "Data columns (total 11 columns):\n", 640 | "uid object\n", 641 | "device object\n", 642 | "song_id object\n", 643 | "song_type object\n", 644 | "song_name object\n", 645 | "singer object\n", 646 | "play_time object\n", 647 | "song_length object\n", 648 | "paid_flag object\n", 649 | "file_name object\n", 650 | "label object\n", 651 | "dtypes: object(11)\n", 652 | "memory usage: 3.5+ GB\n" 653 | ] 654 | } 655 | ], 656 | "source": [ 657 | "df_play.info()" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 15, 663 | "metadata": { 664 | "collapsed": true 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "user_set = set(df_play['uid'])" 669 | ] 670 | }, 671 | { 672 
| "cell_type": "code", 673 | "execution_count": 16, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/plain": [ 679 | "150884" 680 | ] 681 | }, 682 | "execution_count": 16, 683 | "metadata": {}, 684 | "output_type": "execute_result" 685 | } 686 | ], 687 | "source": [ 688 | "len(user_set)" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 17, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "-30" 700 | ] 701 | }, 702 | "execution_count": 17, 703 | "metadata": {}, 704 | "output_type": "execute_result" 705 | } 706 | ], 707 | "source": [ 708 | "len(user_set & churn_set)-len(churn_set)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 18, 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/plain": [ 719 | "0" 720 | ] 721 | }, 722 | "execution_count": 18, 723 | "metadata": {}, 724 | "output_type": "execute_result" 725 | } 726 | ], 727 | "source": [ 728 | "len(user_set & loyal_set)-len(loyal_set)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 747 | "source": [ 748 | "# Sampled uids found in neither churn_set nor loyal_set; a non-empty result means unlabeled users\nuser_set - churn_set.union(loyal_set)" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 20, 754 | "metadata": {}, 755 | "outputs": [ 756 | { 757 | "data": { 758 | "text/plain": [ 759 | "uid 9426\n", 760 | "device 0\n", 761 | "song_id 7510\n", 762 | "song_type 23109\n", 763 | "song_name 5524\n", 764 | "singer 49189\n", 765 | "play_time 160232\n", 766 | "song_length 20717\n", 767 | "paid_flag 51979\n", 768 | "file_name 51980\n", 769 | "label 51980\n", 770 | "dtype: int64" 771 | ] 772 | }, 773 | "execution_count": 20, 774 | "metadata": {}, 775 | "output_type": "execute_result" 776 | 
} 777 | ], 778 | "source": [ 779 | "df_play.isnull().sum(axis = 0)" 780 | ] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "metadata": {}, 785 | "source": [ 786 | "### Missing values\n" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 21, 792 | "metadata": {}, 793 | "outputs": [ 794 | { 795 | "data": { 796 | "text/html": [ 797 | "
\n", 798 | "\n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | 
" \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " 
\n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | 
" \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 
| " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 
1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " 
\n", 1669 | " \n", 1670 | " \n", 1671 | "
uiddevicesong_idsong_typesong_namesingerplay_timesong_lengthpaid_flagfile_namelabel
3164 NaN 朴树 2 23 0 20170410_2_play.log 0 NaN NaN NaN NaN
5618 NaN 刘涛 31 30 0 20170410_2_play.log 0 NaN NaN NaN NaN
5690 NaN 刘涛 51 51 0 20170410_2_play.log 0 NaN NaN NaN NaN
6643 NaN 薛之谦 19 41 0 20170410_2_play.log 0 NaN NaN NaN NaN
7879 NaN EXO 26 26 0 20170410_2_play.log 0 NaN NaN NaN NaN
11400 NaN 佛教音乐 51 51 0 20170410_2_play.log 0 NaN NaN NaN NaN
11918 NaN 佛教音乐 51 51 0 20170410_2_play.log 0 NaN NaN NaN NaN
19485 NaN 杨洋 0 29 0 20170410_2_play.log 0 NaN NaN NaN NaN
21698 NaN 左宏元&amp;张慧清 0 46 0 20170410_2_play.log 0 NaN NaN NaN NaN
25202 NaN 好想好想(51秒铃声版) 0 52 0 20170410_2_play.log 0 NaN NaN NaN NaN
25782 NaN 汤晶锦 40 40 0 20170410_2_play.log 0 NaN NaN NaN NaN
40061 NaN 本兮 26 26 0 20170410_2_play.log 0 NaN NaN NaN NaN
40706 NaN 云菲菲 30 30 0 20170410_2_play.log 0 NaN NaN NaN NaN
48130 NaN 庄心妍 56 56 0 20170410_2_play.log 0 NaN NaN NaN NaN
51385 NaN 冷漠 30 30 0 20170410_2_play.log 0 NaN NaN NaN NaN
51818 NaN 薛之谦 130 41 0 20170410_2_play.log 0 NaN NaN NaN NaN
59378 NaN 阿悄 27 28 0 20170410_2_play.log 0 NaN NaN NaN NaN
69626 NaN 魏栾 30 29 0 20170410_2_play.log 0 NaN NaN NaN NaN
74606 NaN 马頔 29 29 0 20170410_2_play.log 0 NaN NaN NaN NaN
74608 NaN Beyond 27 27 0 20170410_2_play.log 0 NaN NaN NaN NaN
74612 NaN 云菲菲 30 30 0 20170410_2_play.log 0 NaN NaN NaN NaN
75153 NaN Delacey 5 35 0 20170410_2_play.log 1 NaN NaN NaN NaN
76095 NaN Fall Out Boy 30 30 0 20170410_2_play.log 1 NaN NaN NaN NaN
76368 NaN 庄心妍 15 36 0 20170410_2_play.log 0 NaN NaN NaN NaN
76661 NaN 冷漠 30 30 0 20170410_2_play.log 0 NaN NaN NaN NaN
78444 NaN 佛教音乐 50 51 0 20170410_2_play.log 0 NaN NaN NaN NaN
79352 NaN 佛教音乐 50 51 0 20170410_2_play.log 0 NaN NaN NaN NaN
80956 NaN 佛教音乐 50 51 0 20170410_2_play.log 0 NaN NaN NaN NaN
81504 NaN 刁寒 30 30 0 20170410_2_play.log 0 NaN NaN NaN NaN
84367 NaN 唐古&amp;蔡晓 24 24 0 20170410_2_play.log 0 NaN NaN NaN NaN
....................................
39571851 NaN 曹龙 1 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39580074 NaN 铃声 0 60 0 20170429_2_play.log 0 NaN NaN NaN NaN
39587988 NaN 陈小云 39 39 0 20170429_2_play.log 0 NaN NaN NaN NaN
39604491 NaN Delacey 35 35 0 20170429_2_play.log 0 NaN NaN NaN NaN
39614509 NaN 后弦 52 53 0 20170429_2_play.log 0 NaN NaN NaN NaN
39627721 NaN 王馨平 27 28 0 20170429_2_play.log 0 NaN NaN NaN NaN
39628315 NaN 还舍不得离别(36秒铃声版)-(电视剧《美丽的秘密》插曲) 36 36 0 20170429_2_play.log 0 NaN NaN NaN NaN
39658465 NaN Beyond 0 27 0 20170429_2_play.log 0 NaN NaN NaN NaN
39658467 NaN 朴翔 0 47 0 20170429_2_play.log 0 NaN NaN NaN NaN
39663960 NaN 何鹏[男] 30 29 0 20170429_2_play.log 0 NaN NaN NaN NaN
39678847 NaN 任妙音&amp;何鹏 33 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39679672 NaN 汪苏泷 33 34 0 20170429_2_play.log 0 NaN NaN NaN NaN
39679732 NaN 汪苏泷 0 34 0 20170429_2_play.log 0 NaN NaN NaN NaN
39679736 NaN 汪苏泷 1 34 0 20170429_2_play.log 0 NaN NaN NaN NaN
39679739 NaN 汪苏泷 33 34 0 20170429_2_play.log 0 NaN NaN NaN NaN
39679745 NaN 汪苏泷 0 34 0 20170429_2_play.log 0 NaN NaN NaN NaN
39679759 NaN 汪苏泷 7 34 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681793 NaN 林俊杰 0 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681801 NaN 杨洋 0 29 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681828 NaN 林俊杰 0 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681834 NaN 林俊杰 0 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681840 NaN 林俊杰 0 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681857 NaN 林俊杰 0 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39681866 NaN 杨洋 1 29 0 20170429_2_play.log 0 NaN NaN NaN NaN
39682159 NaN 林俊杰 0 33 0 20170429_2_play.log 0 NaN NaN NaN NaN
39684928 NaN 王俊凯 1 46 0 20170429_2_play.log 0 NaN NaN NaN NaN
39685029 NaN 王俊凯 0 46 0 20170429_2_play.log 0 NaN NaN NaN NaN
39701112 NaN 任妙音 39 40 0 20170429_2_play.log 0 NaN NaN NaN NaN
39701132 NaN 任妙音 39 40 0 20170429_2_play.log 0 NaN NaN NaN NaN
39701169 NaN 任妙音 39 40 0 20170429_2_play.log 0 NaN NaN NaN NaN
\n", 1672 | "

9426 rows × 11 columns

\n", 1673 | "
" 1674 | ], 1675 | "text/plain": [ 1676 | " uid device song_id song_type song_name \\\n", 1677 | "3164 NaN 朴树 2 23 0 \n", 1678 | "5618 NaN 刘涛 31 30 0 \n", 1679 | "5690 NaN 刘涛 51 51 0 \n", 1680 | "6643 NaN 薛之谦 19 41 0 \n", 1681 | "7879 NaN EXO 26 26 0 \n", 1682 | "11400 NaN 佛教音乐 51 51 0 \n", 1683 | "11918 NaN 佛教音乐 51 51 0 \n", 1684 | "19485 NaN 杨洋 0 29 0 \n", 1685 | "21698 NaN 左宏元&张慧清 0 46 0 \n", 1686 | "25202 NaN 好想好想(51秒铃声版) 0 52 0 \n", 1687 | "25782 NaN 汤晶锦 40 40 0 \n", 1688 | "40061 NaN 本兮 26 26 0 \n", 1689 | "40706 NaN 云菲菲 30 30 0 \n", 1690 | "48130 NaN 庄心妍 56 56 0 \n", 1691 | "51385 NaN 冷漠 30 30 0 \n", 1692 | "51818 NaN 薛之谦 130 41 0 \n", 1693 | "59378 NaN 阿悄 27 28 0 \n", 1694 | "69626 NaN 魏栾 30 29 0 \n", 1695 | "74606 NaN 马頔 29 29 0 \n", 1696 | "74608 NaN Beyond 27 27 0 \n", 1697 | "74612 NaN 云菲菲 30 30 0 \n", 1698 | "75153 NaN Delacey 5 35 0 \n", 1699 | "76095 NaN Fall Out Boy 30 30 0 \n", 1700 | "76368 NaN 庄心妍 15 36 0 \n", 1701 | "76661 NaN 冷漠 30 30 0 \n", 1702 | "78444 NaN 佛教音乐 50 51 0 \n", 1703 | "79352 NaN 佛教音乐 50 51 0 \n", 1704 | "80956 NaN 佛教音乐 50 51 0 \n", 1705 | "81504 NaN 刁寒 30 30 0 \n", 1706 | "84367 NaN 唐古&蔡晓 24 24 0 \n", 1707 | "... ... ... ... ... ... 
\n", 1708 | "39571851 NaN 曹龙 1 33 0 \n", 1709 | "39580074 NaN 铃声 0 60 0 \n", 1710 | "39587988 NaN 陈小云 39 39 0 \n", 1711 | "39604491 NaN Delacey 35 35 0 \n", 1712 | "39614509 NaN 后弦 52 53 0 \n", 1713 | "39627721 NaN 王馨平 27 28 0 \n", 1714 | "39628315 NaN 还舍不得离别(36秒铃声版)-(电视剧《美丽的秘密》插曲) 36 36 0 \n", 1715 | "39658465 NaN Beyond 0 27 0 \n", 1716 | "39658467 NaN 朴翔 0 47 0 \n", 1717 | "39663960 NaN 何鹏[男] 30 29 0 \n", 1718 | "39678847 NaN 任妙音&何鹏 33 33 0 \n", 1719 | "39679672 NaN 汪苏泷 33 34 0 \n", 1720 | "39679732 NaN 汪苏泷 0 34 0 \n", 1721 | "39679736 NaN 汪苏泷 1 34 0 \n", 1722 | "39679739 NaN 汪苏泷 33 34 0 \n", 1723 | "39679745 NaN 汪苏泷 0 34 0 \n", 1724 | "39679759 NaN 汪苏泷 7 34 0 \n", 1725 | "39681793 NaN 林俊杰 0 33 0 \n", 1726 | "39681801 NaN 杨洋 0 29 0 \n", 1727 | "39681828 NaN 林俊杰 0 33 0 \n", 1728 | "39681834 NaN 林俊杰 0 33 0 \n", 1729 | "39681840 NaN 林俊杰 0 33 0 \n", 1730 | "39681857 NaN 林俊杰 0 33 0 \n", 1731 | "39681866 NaN 杨洋 1 29 0 \n", 1732 | "39682159 NaN 林俊杰 0 33 0 \n", 1733 | "39684928 NaN 王俊凯 1 46 0 \n", 1734 | "39685029 NaN 王俊凯 0 46 0 \n", 1735 | "39701112 NaN 任妙音 39 40 0 \n", 1736 | "39701132 NaN 任妙音 39 40 0 \n", 1737 | "39701169 NaN 任妙音 39 40 0 \n", 1738 | "\n", 1739 | " singer play_time song_length paid_flag file_name label \n", 1740 | "3164 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1741 | "5618 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1742 | "5690 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1743 | "6643 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1744 | "7879 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1745 | "11400 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1746 | "11918 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1747 | "19485 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1748 | "21698 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1749 | "25202 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1750 | "25782 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1751 | "40061 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1752 | "40706 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1753 | "48130 
20170410_2_play.log 0 NaN NaN NaN NaN \n", 1754 | "51385 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1755 | "51818 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1756 | "59378 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1757 | "69626 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1758 | "74606 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1759 | "74608 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1760 | "74612 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1761 | "75153 20170410_2_play.log 1 NaN NaN NaN NaN \n", 1762 | "76095 20170410_2_play.log 1 NaN NaN NaN NaN \n", 1763 | "76368 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1764 | "76661 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1765 | "78444 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1766 | "79352 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1767 | "80956 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1768 | "81504 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1769 | "84367 20170410_2_play.log 0 NaN NaN NaN NaN \n", 1770 | "... ... ... ... ... ... ... \n", 1771 | "39571851 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1772 | "39580074 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1773 | "39587988 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1774 | "39604491 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1775 | "39614509 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1776 | "39627721 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1777 | "39628315 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1778 | "39658465 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1779 | "39658467 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1780 | "39663960 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1781 | "39678847 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1782 | "39679672 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1783 | "39679732 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1784 | "39679736 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1785 | "39679739 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1786 | "39679745 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1787 | "39679759 20170429_2_play.log 0 NaN 
NaN NaN NaN \n", 1788 | "39681793 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1789 | "39681801 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1790 | "39681828 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1791 | "39681834 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1792 | "39681840 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1793 | "39681857 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1794 | "39681866 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1795 | "39682159 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1796 | "39684928 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1797 | "39685029 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1798 | "39701112 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1799 | "39701132 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1800 | "39701169 20170429_2_play.log 0 NaN NaN NaN NaN \n", 1801 | "\n", 1802 | "[9426 rows x 11 columns]" 1803 | ] 1804 | }, 1805 | "execution_count": 21, 1806 | "metadata": {}, 1807 | "output_type": "execute_result" 1808 | } 1809 | ], 1810 | "source": [ 1811 | "# Check those logs with missing uid\n", 1812 | "df_play[df_play['uid'].isnull()]" 1813 | ] 1814 | }, 1815 | { 1816 | "cell_type": "code", 1817 | "execution_count": 22, 1818 | "metadata": { 1819 | "collapsed": true 1820 | }, 1821 | "outputs": [], 1822 | "source": [ 1823 | "# These logs look suspicious (many of their fields are missing), so drop them\n", 1824 | "df_play = df_play.loc[df_play.uid.notnull()]\n" 1825 | ] 1826 | }, 1827 | { 1828 | "cell_type": "code", 1829 | "execution_count": 23, 1830 | "metadata": {}, 1831 | "outputs": [ 1832 | { 1833 | "data": { 1834 | "text/plain": [ 1835 | "uid 0\n", 1836 | "device 0\n", 1837 | "song_id 7510\n", 1838 | "song_type 23109\n", 1839 | "song_name 5524\n", 1840 | "singer 49164\n", 1841 | "play_time 160207\n", 1842 | "song_length 11291\n", 1843 | "paid_flag 42553\n", 1844 | "file_name 42554\n", 1845 | "label 42554\n", 1846 | "dtype: int64" 1847 | ] 1848 | }, 1849 | "execution_count": 23, 1850 | "metadata": {}, 1851 | "output_type": 
"execute_result" 1852 | } 1853 | ], 1854 | "source": [ 1855 | "df_play.isnull().sum(axis = 0)" 1856 | ] 1857 | }, 1858 | { 1859 | "cell_type": "code", 1860 | "execution_count": 24, 1861 | "metadata": {}, 1862 | "outputs": [ 1863 | { 1864 | "data": { 1865 | "text/plain": [ 1866 | "0 29022270\n", 1867 | "1 6836180\n", 1868 | "0 2932205\n", 1869 | "2 474861\n", 1870 | "1 325546\n", 1871 | "2 32711\n", 1872 | " 9908\n", 1873 | "3 4658\n", 1874 | "刚好遇见你 129\n", 1875 | "没有你陪伴真的好孤单 93\n", 1876 | "暧昧 76\n", 1877 | "逆流成河 67\n", 1878 | "演员 62\n", 1879 | "3 55\n", 1880 | "走着走着就散了 52\n", 1881 | "...\n", 1882 | "扎西哥哥(DJ版) 1\n", 1883 | "女孩你知道吗 1\n", 1884 | "爱的魔法 1\n", 1885 | "拥抱你 1\n", 1886 | "太阳雨 1\n", 1887 | "克罗地亚狂想曲 1\n", 1888 | "情翼 1\n", 1889 | "腐草为萤 1\n", 1890 | "Can You Feel The Love Tonight 1\n", 1891 | "164水浒全传 1\n", 1892 | "The wheels on the bus 1\n", 1893 | "十年戎马心孤单 - KTV版伴奏 1\n", 1894 | "Звезда 1\n", 1895 | "The Sounds Of Silence 1\n", 1896 | "续小八义043 1\n", 1897 | "Length: 12334, dtype: int64" 1898 | ] 1899 | }, 1900 | "execution_count": 24, 1901 | "metadata": {}, 1902 | "output_type": "execute_result" 1903 | } 1904 | ], 1905 | "source": [ 1906 | "df_play.song_type.value_counts()" 1907 | ] 1908 | }, 1909 | { 1910 | "cell_type": "code", 1911 | "execution_count": 25, 1912 | "metadata": {}, 1913 | "outputs": [ 1914 | { 1915 | "data": { 1916 | "text/plain": [ 1917 | "0.0 39583273\n", 1918 | "0 65467\n", 1919 | "415.0 427\n", 1920 | "430.0 32\n", 1921 | "252.0 26\n", 1922 | "219.0 19\n", 1923 | "3.0 13\n", 1924 | "259.0 6\n", 1925 | "245.0 6\n", 1926 | "169.0 6\n", 1927 | "375.0 4\n", 1928 | "209.0 4\n", 1929 | "241.0 4\n", 1930 | "9.0 3\n", 1931 | "7.0 3\n", 1932 | "191.0 3\n", 1933 | "200.0 3\n", 1934 | "128.0 2\n", 1935 | "260.0 2\n", 1936 | "248.0 2\n", 1937 | "237.0 2\n", 1938 | "8.0 1\n", 1939 | "6.0 1\n", 1940 | "183.0 1\n", 1941 | "1026.0 1\n", 1942 | "211.0 1\n", 1943 | "235.0 1\n", 1944 | "725.0 1\n", 1945 | "爱上一匹野马 1\n", 1946 | "278.0 1\n", 1947 | "289.0 1\n", 
1948 | "292.0 1\n", 1949 | "385.0 1\n", 1950 | "473.0 1\n", 1951 | "666.0 1\n", 1952 | "683.0 1\n", 1953 | "247.0 1\n", 1954 | "dtype: int64" 1955 | ] 1956 | }, 1957 | "execution_count": 25, 1958 | "metadata": {}, 1959 | "output_type": "execute_result" 1960 | } 1961 | ], 1962 | "source": [ 1963 | "df_play.paid_flag.value_counts()" 1964 | ] 1965 | }, 1966 | { 1967 | "cell_type": "code", 1968 | "execution_count": 26, 1969 | "metadata": {}, 1970 | "outputs": [ 1971 | { 1972 | "data": { 1973 | "text/html": [ 1974 | "
\n", 1975 | "\n", 1976 | " \n", 1977 | " \n", 1978 | " \n", 1979 | " \n", 1980 | " \n", 1981 | " \n", 1982 | " \n", 1983 | " \n", 1984 | " \n", 1985 | " \n", 1986 | " \n", 1987 | " \n", 1988 | " \n", 1989 | " \n", 1990 | " \n", 1991 | " \n", 1992 | " \n", 1993 | " \n", 1994 | " \n", 1995 | " \n", 1996 | " \n", 1997 | " \n", 1998 | " \n", 1999 | " \n", 2000 | " \n", 2001 | " \n", 2002 | " \n", 2003 | " \n", 2004 | " \n", 2005 | " \n", 2006 | " \n", 2007 | " \n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | " \n", 2063 | " \n", 2064 | " \n", 2065 | " \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | " \n", 2096 | " \n", 2097 | " \n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | 
" \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | " \n", 2175 | " \n", 2176 | " \n", 2177 | " \n", 2178 | " \n", 2179 | " \n", 2180 | " \n", 2181 | " \n", 2182 | " \n", 2183 | " \n", 2184 | " \n", 2185 | " \n", 2186 | " \n", 2187 | " \n", 2188 | " \n", 2189 | " \n", 2190 | " \n", 2191 | " \n", 2192 | " \n", 2193 | " \n", 2194 | " \n", 2195 | " \n", 2196 | " \n", 2197 | " \n", 2198 | " \n", 2199 | " \n", 2200 | " \n", 2201 | " \n", 2202 | " \n", 2203 | " \n", 2204 | " \n", 2205 | " \n", 2206 | " \n", 2207 | " \n", 2208 | " \n", 2209 | " \n", 2210 | " \n", 2211 | " \n", 2212 | " \n", 2213 | " \n", 2214 | " \n", 2215 | " \n", 2216 | " \n", 2217 | " \n", 2218 | " \n", 2219 | " \n", 2220 | " \n", 2221 | " \n", 2222 | " \n", 2223 | " \n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | " \n", 2241 | " \n", 2242 | " \n", 2243 | " \n", 2244 | " \n", 2245 | " \n", 2246 | " \n", 2247 | " \n", 2248 | " \n", 2249 | " \n", 2250 | " \n", 2251 | " \n", 2252 | " \n", 2253 | " \n", 2254 | " \n", 2255 | " \n", 2256 | " \n", 2257 | " \n", 2258 | " \n", 2259 | " \n", 2260 
| " \n", 2261 | " \n", 2262 | " \n", 2263 | " \n", 2264 | " \n", 2265 | " \n", 2266 | " \n", 2267 | " \n", 2268 | " \n", 2269 | " \n", 2270 | " \n", 2271 | " \n", 2272 | " \n", 2273 | " \n", 2274 | " \n", 2275 | " \n", 2276 | " \n", 2277 | " \n", 2278 | " \n", 2279 | " \n", 2280 | " \n", 2281 | " \n", 2282 | " \n", 2283 | " \n", 2284 | " \n", 2285 | " \n", 2286 | " \n", 2287 | " \n", 2288 | " \n", 2289 | " \n", 2290 | " \n", 2291 | " \n", 2292 | " \n", 2293 | " \n", 2294 | " \n", 2295 | " \n", 2296 | " \n", 2297 | " \n", 2298 | " \n", 2299 | " \n", 2300 | " \n", 2301 | " \n", 2302 | " \n", 2303 | " \n", 2304 | " \n", 2305 | " \n", 2306 | " \n", 2307 | " \n", 2308 | " \n", 2309 | " \n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 
2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | " \n", 2435 | " \n", 2436 | " \n", 2437 | " \n", 2438 | " \n", 2439 | " \n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | " \n", 2446 | " \n", 2447 | " \n", 2448 | " \n", 2449 | " \n", 2450 | " \n", 2451 | " \n", 2452 | " \n", 2453 | " \n", 2454 | " \n", 2455 | " \n", 2456 | " \n", 2457 | " \n", 2458 | " \n", 2459 | " \n", 2460 | " \n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | " \n", 2467 | " \n", 2468 | " \n", 2469 | " \n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " \n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | " \n", 2514 | " \n", 2515 | " \n", 2516 | " \n", 2517 | " \n", 2518 | " \n", 2519 | " \n", 2520 | " \n", 2521 | " \n", 2522 | " \n", 2523 | " \n", 2524 | " \n", 2525 | " \n", 2526 | " \n", 2527 | " \n", 2528 | " \n", 2529 | " \n", 2530 | " \n", 2531 | " \n", 2532 | " \n", 2533 | " \n", 2534 | " \n", 2535 | " \n", 2536 | " \n", 2537 | " \n", 2538 | " \n", 2539 | " \n", 2540 | " \n", 2541 | " \n", 2542 | " \n", 2543 | " \n", 2544 | " \n", 2545 | " 
\n", 2546 | " \n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | " \n", 2581 | " \n", 2582 | " \n", 2583 | " \n", 2584 | " \n", 2585 | " \n", 2586 | " \n", 2587 | " \n", 2588 | " \n", 2589 | " \n", 2590 | " \n", 2591 | " \n", 2592 | " \n", 2593 | " \n", 2594 | " \n", 2595 | " \n", 2596 | " \n", 2597 | " \n", 2598 | " \n", 2599 | " \n", 2600 | " \n", 2601 | " \n", 2602 | " \n", 2603 | " \n", 2604 | " \n", 2605 | " \n", 2606 | " \n", 2607 | " \n", 2608 | " \n", 2609 | " \n", 2610 | " \n", 2611 | " \n", 2612 | " \n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | " \n", 2656 | " \n", 2657 | " \n", 2658 | " \n", 2659 | " \n", 2660 | " \n", 2661 | " \n", 2662 | " \n", 2663 | " \n", 2664 | " \n", 2665 | " \n", 2666 | " \n", 2667 | " \n", 2668 | " \n", 2669 | " \n", 2670 | " \n", 2671 | " \n", 2672 | " \n", 2673 | " \n", 2674 | " \n", 2675 | " \n", 2676 | " \n", 2677 | " \n", 2678 | " \n", 2679 | " \n", 2680 | " \n", 2681 | " \n", 2682 | " \n", 2683 | " \n", 2684 | " \n", 2685 | " \n", 2686 | " \n", 2687 | " \n", 2688 | 
" \n", 2689 | " \n", 2690 | " \n", 2691 | " \n", 2692 | " \n", 2693 | " \n", 2694 | " \n", 2695 | " \n", 2696 | " \n", 2697 | " \n", 2698 | " \n", 2699 | " \n", 2700 | " \n", 2701 | " \n", 2702 | " \n", 2703 | " \n", 2704 | " \n", 2705 | " \n", 2706 | " \n", 2707 | " \n", 2708 | " \n", 2709 | " \n", 2710 | " \n", 2711 | " \n", 2712 | " \n", 2713 | " \n", 2714 | " \n", 2715 | " \n", 2716 | " \n", 2717 | " \n", 2718 | " \n", 2719 | " \n", 2720 | " \n", 2721 | " \n", 2722 | " \n", 2723 | " \n", 2724 | " \n", 2725 | " \n", 2726 | " \n", 2727 | " \n", 2728 | " \n", 2729 | " \n", 2730 | " \n", 2731 | " \n", 2732 | " \n", 2733 | " \n", 2734 | " \n", 2735 | " \n", 2736 | " \n", 2737 | " \n", 2738 | " \n", 2739 | " \n", 2740 | " \n", 2741 | " \n", 2742 | " \n", 2743 | " \n", 2744 | " \n", 2745 | " \n", 2746 | " \n", 2747 | " \n", 2748 | " \n", 2749 | " \n", 2750 | " \n", 2751 | " \n", 2752 | " \n", 2753 | " \n", 2754 | " \n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 | " \n", 2763 | " \n", 2764 | " \n", 2765 | " \n", 2766 | " \n", 2767 | " \n", 2768 | " \n", 2769 | " \n", 2770 | " \n", 2771 | " \n", 2772 | " \n", 2773 | " \n", 2774 | " \n", 2775 | " \n", 2776 | " \n", 2777 | " \n", 2778 | " \n", 2779 | " \n", 2780 | " \n", 2781 | " \n", 2782 | " \n", 2783 | " \n", 2784 | " \n", 2785 | " \n", 2786 | " \n", 2787 | " \n", 2788 | " \n", 2789 | " \n", 2790 | " \n", 2791 | " \n", 2792 | " \n", 2793 | " \n", 2794 | " \n", 2795 | " \n", 2796 | " \n", 2797 | " \n", 2798 | " \n", 2799 | " \n", 2800 | " \n", 2801 | " \n", 2802 | " \n", 2803 | " \n", 2804 | " \n", 2805 | " \n", 2806 | " \n", 2807 | " \n", 2808 | " \n", 2809 | " \n", 2810 | " \n", 2811 | " \n", 2812 | " \n", 2813 | " \n", 2814 | " \n", 2815 | " \n", 2816 | " \n", 2817 | " \n", 2818 | " \n", 2819 | " \n", 2820 | " \n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | " \n", 2825 | " \n", 2826 | " \n", 2827 | " \n", 2828 | " \n", 2829 | " \n", 2830 | " \n", 2831 
| " \n", 2832 | " \n", 2833 | " \n", 2834 | " \n", 2835 | " \n", 2836 | " \n", 2837 | " \n", 2838 | " \n", 2839 | " \n", 2840 | " \n", 2841 | " \n", 2842 | " \n", 2843 | " \n", 2844 | " \n", 2845 | " \n", 2846 | " \n", 2847 | " \n", 2848 | "
uiddevicesong_idsong_typesong_namesingerplay_timesong_lengthpaid_flagfile_namelabel
58059 168146144 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 412 415 0 20170410_2_play.log
58741 168146144 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 237>\u000f}(222.219.141.68)TM 430 0 20170410_2_play.log
164818 168700735 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 249 415 0 20170410_3_play.log
178183 168647140 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 258 415 0 20170410_3_play.log
178398 168647140 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 103 415 0 20170410_3_play.log
433073 0 ar 235500 1 分裂 [内地版 周杰伦 104931 252 0 20170504_3_play.log
447433 168647140 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 211 415 0 20170504_3_play.log
469348 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 414 415 0 20170504_2_play.log
486671 168271854 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 1 415 0 20170504_2_play.log
505531 0 ar 235500 1 分裂 [内地版 周杰伦 104931 252 0 20170504_2_play.log
506282 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 415 415 0 20170504_2_play.log
506327 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 0 415 0 20170504_2_play.log
506369 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 0 415 0 20170504_2_play.log
506412 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 415 415 0 20170504_2_play.log
506466 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 415 415 0 20170504_2_play.log
506513 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 415 415 0 20170504_2_play.log
510426 137084142 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 415 415 0 20170504_2_play.log
532688 168164990 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 345 415 0 20170504_2_play.log
586643 168263591 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 414 415 0 20170508_1_play.log
655350 167636505 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 0 415 0 20170505_1_play.log
682423 168263591 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 121 415 0 20170505_1_play.log
760977 167988542 ar 235500 0 分裂 [内地版 周杰伦 13 252 0 20170411_1_play.log
801213 167636505 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 0 415 0 20170426_1_play.log
806276 167894057 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 207 415 0 20170426_1_play.log
819955 167721050 ar 5989117 0 38. Hurry or you'll be late for school. (快点儿,上学该迟到了。) 分级加字幕轻松练听力 3 9 0 20170426_1_play.log
895604 168820890 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 58 430 0 20170509_3_play.log
896653 168820890 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 409 430 0 20170509_3_play.log
910055 161741167 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 47>\u000f}(223.104.38.39)TM 430 0 20170509_3_play.log
910056 161741167 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 5>\u000f}(223.104.38.39)TM 430 0 20170509_3_play.log
910057 161741167 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 5>=(223.104.38.39)TM 430 0 20170509_3_play.log
....................................
37008272 167778188 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 415 415 0 20170403_1_play.log
37178187 168725697 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 50 415 0 20170424_3_play.log
37292655 1685126 ar 0 1 LOSER Bigbang NaN 219336 219 0 20170413_2_play.log
37313545 168248860 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 213 415 0 20170413_2_play.log
37465800 1685126 ar 0 1 LOSER Bigbang NaN 219336 219 0 20170413_3_play.log
37612451 168851902 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 414 415 0 20170507_3_play.log
37920919 167775288 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 48 415 0 20170428_1_play.log
38044243 167775288 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 414 415 0 20170428_1_play.log
38063913 167632676 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 414 415 0 20170339_1_play.log
38128310 167819640 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 0 415 0 20170339_1_play.log
38166424 168318095 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 6 415 0 20170339_1_play.log
38226566 167998697 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 414 415 0 20170339_1_play.log
38231484 167604900 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 164 415 0 20170339_1_play.log
38348109 168257965 ip 235500 0 分裂 [内地版 周杰伦 0 252 0 20170402_2_play.log
38348113 168257965 ip 235500 0 分裂 [内地版 周杰伦 0 252 0 20170402_2_play.log
38505140 168521650 ar 7080647 0 曹云金、刘云天《奋斗》(2012) NaN 春晚相声集锦 219 725 0 20170402_2_play.log
38518065 168851902 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 414 415 0 20170402_3_play.log
38519838 168851902 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 415 415 0 20170402_3_play.log
38679451 169001484 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 400 415 0 20170402_3_play.log
38842511 167947839 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 3 415 0 20170506_1_play.log
38867412 167922308 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 414 415 0 20170506_1_play.log
38870804 167922308 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 415 415 0 20170506_1_play.log
38990351 167775288 ar 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 414 415 0 20170425_1_play.log
39088806 167947839 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 0 415 0 20170425_1_play.log
39191178 167947839 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 29 415 0 20170412_1_play.log
39230197 167778188 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 9>\u000f}(183.214.21.145)TM 430 0 20170412_1_play.log
39291614 167906765 ip 6916311 0 多少的爱都不要(???? ???????? Ten Nararak 5 415 0 20170412_1_play.log
39439120 168697697 ar 6916311 0 多少的爱都不要(???? ???????? ) Ten Nararak 4 415 0 20170429_3_play.log
39534457 168084784 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 191>(39.88.19.100)TM 430 0 20170429_2_play.log
39540010 168084784 ar 6916311 2 多少的爱都不要(???? ???????? ) Ten Nararak 19>(39.88.19.100)TM 430 0 20170429_2_play.log
\n", 2849 | "

66050 rows × 11 columns

\n", 2850 | "
" 2851 | ], 2852 | "text/plain": [ 2853 | " uid device song_id song_type song_name \\\n", 2854 | "58059 168146144 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2855 | "58741 168146144 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2856 | "164818 168700735 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2857 | "178183 168647140 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2858 | "178398 168647140 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2859 | "433073 0 ar 235500 1 分裂 \n", 2860 | "447433 168647140 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2861 | "469348 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2862 | "486671 168271854 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2863 | "505531 0 ar 235500 1 分裂 \n", 2864 | "506282 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2865 | "506327 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2866 | "506369 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2867 | "506412 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2868 | "506466 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2869 | "506513 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2870 | "510426 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2871 | "532688 168164990 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2872 | "586643 168263591 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2873 | "655350 167636505 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2874 | "682423 168263591 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2875 | "760977 167988542 ar 235500 0 分裂 \n", 2876 | "801213 167636505 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2877 | "806276 167894057 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2878 | "819955 167721050 ar 5989117 0 38. \n", 2879 | "895604 168820890 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2880 | "896653 168820890 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2881 | "910055 161741167 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2882 | "910056 161741167 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2883 | "910057 161741167 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2884 | "... ... ... ... ... ... 
\n", 2885 | "37008272 167778188 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2886 | "37178187 168725697 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2887 | "37292655 1685126 ar 0 1 LOSER \n", 2888 | "37313545 168248860 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2889 | "37465800 1685126 ar 0 1 LOSER \n", 2890 | "37612451 168851902 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2891 | "37920919 167775288 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2892 | "38044243 167775288 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2893 | "38063913 167632676 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2894 | "38128310 167819640 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2895 | "38166424 168318095 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2896 | "38226566 167998697 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2897 | "38231484 167604900 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2898 | "38348109 168257965 ip 235500 0 分裂 \n", 2899 | "38348113 168257965 ip 235500 0 分裂 \n", 2900 | "38505140 168521650 ar 7080647 0 曹云金、刘云天《奋斗》(2012) \n", 2901 | "38518065 168851902 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2902 | "38519838 168851902 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2903 | "38679451 169001484 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2904 | "38842511 167947839 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2905 | "38867412 167922308 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2906 | "38870804 167922308 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2907 | "38990351 167775288 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2908 | "39088806 167947839 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2909 | "39191178 167947839 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2910 | "39230197 167778188 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2911 | "39291614 167906765 ip 6916311 0 多少的爱都不要(???? ???????? \n", 2912 | "39439120 168697697 ar 6916311 0 多少的爱都不要(???? ???????? \n", 2913 | "39534457 168084784 ar 6916311 2 多少的爱都不要(???? ???????? \n", 2914 | "39540010 168084784 ar 6916311 2 多少的爱都不要(???? ???????? 
\n", 2915 | "\n", 2916 | " singer play_time \\\n", 2917 | "58059 ) Ten Nararak \n", 2918 | "58741 ) Ten Nararak \n", 2919 | "164818 ) Ten Nararak \n", 2920 | "178183 ) Ten Nararak \n", 2921 | "178398 ) Ten Nararak \n", 2922 | "433073 [内地版 周杰伦 \n", 2923 | "447433 ) Ten Nararak \n", 2924 | "469348 ) Ten Nararak \n", 2925 | "486671 ) Ten Nararak \n", 2926 | "505531 [内地版 周杰伦 \n", 2927 | "506282 ) Ten Nararak \n", 2928 | "506327 ) Ten Nararak \n", 2929 | "506369 ) Ten Nararak \n", 2930 | "506412 ) Ten Nararak \n", 2931 | "506466 ) Ten Nararak \n", 2932 | "506513 ) Ten Nararak \n", 2933 | "510426 ) Ten Nararak \n", 2934 | "532688 ) Ten Nararak \n", 2935 | "586643 ) Ten Nararak \n", 2936 | "655350 ) Ten Nararak \n", 2937 | "682423 ) Ten Nararak \n", 2938 | "760977 [内地版 周杰伦 \n", 2939 | "801213 ) Ten Nararak \n", 2940 | "806276 ) Ten Nararak \n", 2941 | "819955 Hurry or you'll be late for school. (快点儿,上学该迟到了。) 分级加字幕轻松练听力 \n", 2942 | "895604 ) Ten Nararak \n", 2943 | "896653 ) Ten Nararak \n", 2944 | "910055 ) Ten Nararak \n", 2945 | "910056 ) Ten Nararak \n", 2946 | "910057 ) Ten Nararak \n", 2947 | "... ... ... 
\n", 2948 | "37008272 ) Ten Nararak \n", 2949 | "37178187 ) Ten Nararak \n", 2950 | "37292655 Bigbang NaN \n", 2951 | "37313545 ) Ten Nararak \n", 2952 | "37465800 Bigbang NaN \n", 2953 | "37612451 ) Ten Nararak \n", 2954 | "37920919 ) Ten Nararak \n", 2955 | "38044243 ) Ten Nararak \n", 2956 | "38063913 ) Ten Nararak \n", 2957 | "38128310 ) Ten Nararak \n", 2958 | "38166424 ) Ten Nararak \n", 2959 | "38226566 ) Ten Nararak \n", 2960 | "38231484 ) Ten Nararak \n", 2961 | "38348109 [内地版 周杰伦 \n", 2962 | "38348113 [内地版 周杰伦 \n", 2963 | "38505140 NaN 春晚相声集锦 \n", 2964 | "38518065 ) Ten Nararak \n", 2965 | "38519838 ) Ten Nararak \n", 2966 | "38679451 ) Ten Nararak \n", 2967 | "38842511 ) Ten Nararak \n", 2968 | "38867412 ) Ten Nararak \n", 2969 | "38870804 ) Ten Nararak \n", 2970 | "38990351 ) Ten Nararak \n", 2971 | "39088806 ) Ten Nararak \n", 2972 | "39191178 ) Ten Nararak \n", 2973 | "39230197 ) Ten Nararak \n", 2974 | "39291614 ) Ten Nararak \n", 2975 | "39439120 ) Ten Nararak \n", 2976 | "39534457 ) Ten Nararak \n", 2977 | "39540010 ) Ten Nararak \n", 2978 | "\n", 2979 | " song_length paid_flag file_name label \n", 2980 | "58059 412 415 0 20170410_2_play.log \n", 2981 | "58741 237>\u000f}(222.219.141.68)TM 430 0 20170410_2_play.log \n", 2982 | "164818 249 415 0 20170410_3_play.log \n", 2983 | "178183 258 415 0 20170410_3_play.log \n", 2984 | "178398 103 415 0 20170410_3_play.log \n", 2985 | "433073 104931 252 0 20170504_3_play.log \n", 2986 | "447433 211 415 0 20170504_3_play.log \n", 2987 | "469348 414 415 0 20170504_2_play.log \n", 2988 | "486671 1 415 0 20170504_2_play.log \n", 2989 | "505531 104931 252 0 20170504_2_play.log \n", 2990 | "506282 415 415 0 20170504_2_play.log \n", 2991 | "506327 0 415 0 20170504_2_play.log \n", 2992 | "506369 0 415 0 20170504_2_play.log \n", 2993 | "506412 415 415 0 20170504_2_play.log \n", 2994 | "506466 415 415 0 20170504_2_play.log \n", 2995 | "506513 415 415 0 20170504_2_play.log \n", 2996 | "510426 415 415 0 
20170504_2_play.log \n", 2997 | "532688 345 415 0 20170504_2_play.log \n", 2998 | "586643 414 415 0 20170508_1_play.log \n", 2999 | "655350 0 415 0 20170505_1_play.log \n", 3000 | "682423 121 415 0 20170505_1_play.log \n", 3001 | "760977 13 252 0 20170411_1_play.log \n", 3002 | "801213 0 415 0 20170426_1_play.log \n", 3003 | "806276 207 415 0 20170426_1_play.log \n", 3004 | "819955 3 9 0 20170426_1_play.log \n", 3005 | "895604 58 430 0 20170509_3_play.log \n", 3006 | "896653 409 430 0 20170509_3_play.log \n", 3007 | "910055 47>\u000f}(223.104.38.39)TM 430 0 20170509_3_play.log \n", 3008 | "910056 5>\u000f}(223.104.38.39)TM 430 0 20170509_3_play.log \n", 3009 | "910057 5>=(223.104.38.39)TM 430 0 20170509_3_play.log \n", 3010 | "... ... ... ... ... \n", 3011 | "37008272 415 415 0 20170403_1_play.log \n", 3012 | "37178187 50 415 0 20170424_3_play.log \n", 3013 | "37292655 219336 219 0 20170413_2_play.log \n", 3014 | "37313545 213 415 0 20170413_2_play.log \n", 3015 | "37465800 219336 219 0 20170413_3_play.log \n", 3016 | "37612451 414 415 0 20170507_3_play.log \n", 3017 | "37920919 48 415 0 20170428_1_play.log \n", 3018 | "38044243 414 415 0 20170428_1_play.log \n", 3019 | "38063913 414 415 0 20170339_1_play.log \n", 3020 | "38128310 0 415 0 20170339_1_play.log \n", 3021 | "38166424 6 415 0 20170339_1_play.log \n", 3022 | "38226566 414 415 0 20170339_1_play.log \n", 3023 | "38231484 164 415 0 20170339_1_play.log \n", 3024 | "38348109 0 252 0 20170402_2_play.log \n", 3025 | "38348113 0 252 0 20170402_2_play.log \n", 3026 | "38505140 219 725 0 20170402_2_play.log \n", 3027 | "38518065 414 415 0 20170402_3_play.log \n", 3028 | "38519838 415 415 0 20170402_3_play.log \n", 3029 | "38679451 400 415 0 20170402_3_play.log \n", 3030 | "38842511 3 415 0 20170506_1_play.log \n", 3031 | "38867412 414 415 0 20170506_1_play.log \n", 3032 | "38870804 415 415 0 20170506_1_play.log \n", 3033 | "38990351 414 415 0 20170425_1_play.log \n", 3034 | "39088806 0 415 0 20170425_1_play.log 
\n", 3035 | "39191178 29 415 0 20170412_1_play.log \n", 3036 | "39230197 9>\u000f}(183.214.21.145)TM 430 0 20170412_1_play.log \n", 3037 | "39291614 5 415 0 20170412_1_play.log \n", 3038 | "39439120 4 415 0 20170429_3_play.log \n", 3039 | "39534457 191>(39.88.19.100)TM 430 0 20170429_2_play.log \n", 3040 | "39540010 19>(39.88.19.100)TM 430 0 20170429_2_play.log \n", 3041 | "\n", 3042 | "[66050 rows x 11 columns]" 3043 | ] 3044 | }, 3045 | "execution_count": 26, 3046 | "metadata": {}, 3047 | "output_type": "execute_result" 3048 | } 3049 | ], 3050 | "source": [ 3051 | "df_play[df_play.paid_flag>0]" 3052 | ] 3053 | }, 3054 | { 3055 | "cell_type": "markdown", 3056 | "metadata": {}, 3057 | "source": [ 3058 | "#### Logs with paid_flag >0 are actually logs with input format errors and paid_flag =0\n", 3059 | "#### There are no real inputs with paid_flag >0" 3060 | ] 3061 | }, 3062 | { 3063 | "cell_type": "markdown", 3064 | "metadata": {}, 3065 | "source": [ 3066 | "The important columns for analysis are: song_id, play_time, song_length and file_name so drop the rows with missing values in these four fields." 
3067 | ] 3068 | }, 3069 | { 3070 | "cell_type": "code", 3071 | "execution_count": 27, 3072 | "metadata": { 3073 | "collapsed": true 3074 | }, 3075 | "outputs": [], 3076 | "source": [ 3077 | "df_play = df_play.loc[df_play.file_name.notnull() & \n", 3078 | " df_play.play_time.notnull() & \n", 3079 | " df_play.song_id.notnull() &\n", 3080 | " df_play.song_length.notnull()]\n" 3081 | ] 3082 | }, 3083 | { 3084 | "cell_type": "code", 3085 | "execution_count": 28, 3086 | "metadata": {}, 3087 | "outputs": [ 3088 | { 3089 | "data": { 3090 | "text/plain": [ 3091 | "uid 0\n", 3092 | "device 0\n", 3093 | "song_id 0\n", 3094 | "song_type 12913\n", 3095 | "song_name 771\n", 3096 | "singer 37883\n", 3097 | "play_time 0\n", 3098 | "song_length 0\n", 3099 | "paid_flag 0\n", 3100 | "file_name 0\n", 3101 | "label 0\n", 3102 | "dtype: int64" 3103 | ] 3104 | }, 3105 | "execution_count": 28, 3106 | "metadata": {}, 3107 | "output_type": "execute_result" 3108 | } 3109 | ], 3110 | "source": [ 3111 | "df_play.isnull().sum(axis = 0)" 3112 | ] 3113 | }, 3114 | { 3115 | "cell_type": "markdown", 3116 | "metadata": {}, 3117 | "source": [ 3118 | "#### Now assign int 0 to the missing values of song_type, which is the most populated song_type" 3119 | ] 3120 | }, 3121 | { 3122 | "cell_type": "code", 3123 | "execution_count": 29, 3124 | "metadata": {}, 3125 | "outputs": [ 3126 | { 3127 | "data": { 3128 | "text/plain": [ 3129 | "uid 0\n", 3130 | "device 0\n", 3131 | "song_id 0\n", 3132 | "song_type 0\n", 3133 | "song_name 771\n", 3134 | "singer 37883\n", 3135 | "play_time 0\n", 3136 | "song_length 0\n", 3137 | "paid_flag 0\n", 3138 | "file_name 0\n", 3139 | "label 0\n", 3140 | "dtype: int64" 3141 | ] 3142 | }, 3143 | "execution_count": 29, 3144 | "metadata": {}, 3145 | "output_type": "execute_result" 3146 | } 3147 | ], 3148 | "source": [ 3149 | "df_play.loc[df_play.song_type.isnull(),'song_type'] = 0\n", 3150 | "df_play.isnull().sum(axis = 0)" 3151 | ] 3152 | }, 3153 | { 3154 | "cell_type": "code", 
3155 | "execution_count": 30, 3156 | "metadata": {}, 3157 | "outputs": [ 3158 | { 3159 | "data": { 3160 | "text/plain": [ 3161 | "(39492892, 11)" 3162 | ] 3163 | }, 3164 | "execution_count": 30, 3165 | "metadata": {}, 3166 | "output_type": "execute_result" 3167 | } 3168 | ], 3169 | "source": [ 3170 | "df_play.shape" 3171 | ] 3172 | }, 3173 | { 3174 | "cell_type": "code", 3175 | "execution_count": 31, 3176 | "metadata": {}, 3177 | "outputs": [ 3178 | { 3179 | "data": { 3180 | "text/plain": [ 3181 | "uid 0\n", 3182 | "device 0\n", 3183 | "song_id 0\n", 3184 | "song_type 0\n", 3185 | "song_name 205\n", 3186 | "singer 37715\n", 3187 | "play_time 0\n", 3188 | "song_length 0\n", 3189 | "paid_flag 0\n", 3190 | "file_name 0\n", 3191 | "label 0\n", 3192 | "dtype: int64" 3193 | ] 3194 | }, 3195 | "execution_count": 31, 3196 | "metadata": {}, 3197 | "output_type": "execute_result" 3198 | } 3199 | ], 3200 | "source": [ 3201 | "# As song_length will be used later, delete logs that have both a null song_name and song_length <= 0\n", 3202 | "df_play = df_play.loc[df_play.song_name.notnull() | (df_play.song_length > 0)]\n", 3203 | "df_play.isnull().sum(axis = 0)" 3204 | ] 3205 | }, 3206 | { 3207 | "cell_type": "code", 3208 | "execution_count": 32, 3209 | "metadata": { 3210 | "collapsed": true 3211 | }, 3212 | "outputs": [], 3213 | "source": [ 3214 | "df_play = df_play.reset_index()" 3215 | ] 3216 | }, 3217 | { 3218 | "cell_type": "code", 3219 | "execution_count": 33, 3220 | "metadata": {}, 3221 | "outputs": [ 3222 | { 3223 | "data": { 3224 | "text/html": [ 3225 | "
\n", 3226 | "\n", 3227 | " \n", 3228 | " \n", 3229 | " \n", 3230 | " \n", 3231 | " \n", 3232 | " \n", 3233 | " \n", 3234 | " \n", 3235 | " \n", 3236 | " \n", 3237 | " \n", 3238 | " \n", 3239 | " \n", 3240 | " \n", 3241 | " \n", 3242 | " \n", 3243 | " \n", 3244 | " \n", 3245 | " \n", 3246 | " \n", 3247 | " \n", 3248 | " \n", 3249 | " \n", 3250 | " \n", 3251 | " \n", 3252 | " \n", 3253 | " \n", 3254 | " \n", 3255 | " \n", 3256 | " \n", 3257 | " \n", 3258 | " \n", 3259 | " \n", 3260 | " \n", 3261 | " \n", 3262 | " \n", 3263 | " \n", 3264 | " \n", 3265 | " \n", 3266 | " \n", 3267 | " \n", 3268 | " \n", 3269 | " \n", 3270 | " \n", 3271 | " \n", 3272 | " \n", 3273 | " \n", 3274 | " \n", 3275 | " \n", 3276 | " \n", 3277 | " \n", 3278 | " \n", 3279 | " \n", 3280 | " \n", 3281 | " \n", 3282 | " \n", 3283 | " \n", 3284 | " \n", 3285 | " \n", 3286 | " \n", 3287 | " \n", 3288 | " \n", 3289 | " \n", 3290 | " \n", 3291 | " \n", 3292 | " \n", 3293 | " \n", 3294 | " \n", 3295 | " \n", 3296 | " \n", 3297 | " \n", 3298 | " \n", 3299 | " \n", 3300 | " \n", 3301 | " \n", 3302 | " \n", 3303 | " \n", 3304 | " \n", 3305 | " \n", 3306 | " \n", 3307 | " \n", 3308 | " \n", 3309 | " \n", 3310 | " \n", 3311 | " \n", 3312 | " \n", 3313 | " \n", 3314 | " \n", 3315 | " \n", 3316 | " \n", 3317 | " \n", 3318 | " \n", 3319 | " \n", 3320 | " \n", 3321 | " \n", 3322 | " \n", 3323 | " \n", 3324 | " \n", 3325 | " \n", 3326 | " \n", 3327 | " \n", 3328 | " \n", 3329 | " \n", 3330 | " \n", 3331 | " \n", 3332 | " \n", 3333 | " \n", 3334 | " \n", 3335 | " \n", 3336 | " \n", 3337 | " \n", 3338 | " \n", 3339 | " \n", 3340 | " \n", 3341 | " \n", 3342 | " \n", 3343 | " \n", 3344 | " \n", 3345 | " \n", 3346 | " \n", 3347 | " \n", 3348 | " \n", 3349 | " \n", 3350 | " \n", 3351 | " \n", 3352 | " \n", 3353 | " \n", 3354 | " \n", 3355 | " \n", 3356 | " \n", 3357 | " \n", 3358 | " \n", 3359 | " \n", 3360 | " \n", 3361 | " \n", 3362 | " \n", 3363 | " \n", 3364 | " \n", 3365 | " \n", 3366 | " \n", 3367 | " \n", 3368 | 
" \n", 3369 | " \n", 3370 | " \n", 3371 | " \n", 3372 | " \n", 3373 | " \n", 3374 | " \n", 3375 | " \n", 3376 | " \n", 3377 | " \n", 3378 | " \n", 3379 | " \n", 3380 | " \n", 3381 | " \n", 3382 | " \n", 3383 | " \n", 3384 | " \n", 3385 | " \n", 3386 | " \n", 3387 | " \n", 3388 | " \n", 3389 | " \n", 3390 | " \n", 3391 | " \n", 3392 | " \n", 3393 | " \n", 3394 | " \n", 3395 | " \n", 3396 | "
indexuiddevicesong_idsong_typesong_namesingerplay_timesong_lengthpaid_flagfile_namelabel
0 0 168308107 ar 162455 0 最初的梦想 范玮琪 296 296 0 20170410_2_play.log 0
1 1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 0 0 20170410_2_play.log 0
2 2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 156 0 20170410_2_play.log 0
3 3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 0 0 20170410_2_play.log 0
4 4 168274411 ar 176292 0 爱不爱我 零点乐队 333 334 0 20170410_2_play.log 0
5 5 168274411 ar 22833011 0 宽恕 宽恕乐队 155 156 0 20170410_2_play.log 0
6 6 168274411 ar 105279 0 曲终人散 张宇 0 0 0 20170410_2_play.log 0
7 7 168274411 ar 176292 0 爱不爱我 零点乐队 0 0 0 20170410_2_play.log 0
8 8 168515688 ip 6586179 0 漂洋过海来看你 孙露 326 326 0 20170410_2_play.log 0
9 9 32166203 ar 1034767 0 好想再爱你 颜亚涛 270 0 0 20170410_2_play.log 0
\n", 3397 | "
" 3398 | ], 3399 | "text/plain": [ 3400 | " index uid device song_id song_type song_name singer play_time \\\n", 3401 | "0 0 168308107 ar 162455 0 最初的梦想 范玮琪 296 \n", 3402 | "1 1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 \n", 3403 | "2 2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 \n", 3404 | "3 3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 \n", 3405 | "4 4 168274411 ar 176292 0 爱不爱我 零点乐队 333 \n", 3406 | "5 5 168274411 ar 22833011 0 宽恕 宽恕乐队 155 \n", 3407 | "6 6 168274411 ar 105279 0 曲终人散 张宇 0 \n", 3408 | "7 7 168274411 ar 176292 0 爱不爱我 零点乐队 0 \n", 3409 | "8 8 168515688 ip 6586179 0 漂洋过海来看你 孙露 326 \n", 3410 | "9 9 32166203 ar 1034767 0 好想再爱你 颜亚涛 270 \n", 3411 | "\n", 3412 | " song_length paid_flag file_name label \n", 3413 | "0 296 0 20170410_2_play.log 0 \n", 3414 | "1 0 0 20170410_2_play.log 0 \n", 3415 | "2 156 0 20170410_2_play.log 0 \n", 3416 | "3 0 0 20170410_2_play.log 0 \n", 3417 | "4 334 0 20170410_2_play.log 0 \n", 3418 | "5 156 0 20170410_2_play.log 0 \n", 3419 | "6 0 0 20170410_2_play.log 0 \n", 3420 | "7 0 0 20170410_2_play.log 0 \n", 3421 | "8 326 0 20170410_2_play.log 0 \n", 3422 | "9 0 0 20170410_2_play.log 0 " 3423 | ] 3424 | }, 3425 | "execution_count": 33, 3426 | "metadata": {}, 3427 | "output_type": "execute_result" 3428 | } 3429 | ], 3430 | "source": [ 3431 | "df_play.head(10)" 3432 | ] 3433 | }, 3434 | { 3435 | "cell_type": "code", 3436 | "execution_count": 34, 3437 | "metadata": { 3438 | "collapsed": true 3439 | }, 3440 | "outputs": [], 3441 | "source": [ 3442 | "# add date column\n", 3443 | "def get_date(file_name):\n", 3444 | " tmp_list = str(file_name).split('_')\n", 3445 | " return tmp_list[0]" 3446 | ] 3447 | }, 3448 | { 3449 | "cell_type": "code", 3450 | "execution_count": 35, 3451 | "metadata": { 3452 | "collapsed": true 3453 | }, 3454 | "outputs": [], 3455 | "source": [ 3456 | "df_play['date'] = df_play['file_name'].map(get_date)" 3457 | ] 3458 | }, 3459 | { 3460 | "cell_type": "code", 3461 | "execution_count": 36, 3462 | "metadata": {}, 3463 | 
"outputs": [ 3464 | { 3465 | "data": { 3466 | "text/plain": [ 3467 | "ar 32365780\n", 3468 | "ip 3826808\n", 3469 | "ar 2744122\n", 3470 | "ip 555615\n", 3471 | "168589573 1\n", 3472 | "dtype: int64" 3473 | ] 3474 | }, 3475 | "execution_count": 36, 3476 | "metadata": {}, 3477 | "output_type": "execute_result" 3478 | } 3479 | ], 3480 | "source": [ 3481 | "df_play.device.value_counts()" 3482 | ] 3483 | }, 3484 | { 3485 | "cell_type": "code", 3486 | "execution_count": 37, 3487 | "metadata": {}, 3488 | "outputs": [ 3489 | { 3490 | "data": { 3491 | "text/plain": [ 3492 | "array(['ar', 'ip', 'ip ', 'ar ', '168589573'], dtype=object)" 3493 | ] 3494 | }, 3495 | "execution_count": 37, 3496 | "metadata": {}, 3497 | "output_type": "execute_result" 3498 | } 3499 | ], 3500 | "source": [ 3501 | "# Looks like almost all selected rows use android or iphone; some values carry trailing spaces and one is a stray number.\n", 3502 | "df_play.device.unique()" 3503 | ] 3504 | }, 3505 | { 3506 | "cell_type": "code", 3507 | "execution_count": 38, 3508 | "metadata": { 3509 | "collapsed": true 3510 | }, 3511 | "outputs": [], 3512 | "source": [ 3513 | "def remove_space(word):\n", 3514 | " word = str(word).rstrip()\n", 3515 | " return word\n", 3516 | "df_play['device'] = df_play['device'].map(remove_space)" 3517 | ] 3518 | }, 3519 | { 3520 | "cell_type": "code", 3521 | "execution_count": 39, 3522 | "metadata": {}, 3523 | "outputs": [ 3524 | { 3525 | "data": { 3526 | "text/plain": [ 3527 | "array(['ar', 'ip', '168589573'], dtype=object)" 3528 | ] 3529 | }, 3530 | "execution_count": 39, 3531 | "metadata": {}, 3532 | "output_type": "execute_result" 3533 | } 3534 | ], 3535 | "source": [ 3536 | "df_play.device.unique()" 3537 | ] 3538 | }, 3539 | { 3540 | "cell_type": "code", 3541 | "execution_count": 40, 3542 | "metadata": {}, 3543 | "outputs": [ 3544 | { 3545 | "data": { 3546 | "text/html": [ 3547 | "
\n", 3548 | "\n", 3549 | " \n", 3550 | " \n", 3551 | " \n", 3552 | " \n", 3553 | " \n", 3554 | " \n", 3555 | " \n", 3556 | " \n", 3557 | " \n", 3558 | " \n", 3559 | " \n", 3560 | " \n", 3561 | " \n", 3562 | " \n", 3563 | " \n", 3564 | " \n", 3565 | " \n", 3566 | " \n", 3567 | " \n", 3568 | " \n", 3569 | " \n", 3570 | " \n", 3571 | " \n", 3572 | " \n", 3573 | " \n", 3574 | " \n", 3575 | " \n", 3576 | " \n", 3577 | " \n", 3578 | " \n", 3579 | " \n", 3580 | " \n", 3581 | " \n", 3582 | " \n", 3583 | " \n", 3584 | " \n", 3585 | " \n", 3586 | " \n", 3587 | " \n", 3588 | " \n", 3589 | " \n", 3590 | " \n", 3591 | " \n", 3592 | " \n", 3593 | " \n", 3594 | " \n", 3595 | " \n", 3596 | " \n", 3597 | " \n", 3598 | " \n", 3599 | " \n", 3600 | " \n", 3601 | " \n", 3602 | " \n", 3603 | " \n", 3604 | " \n", 3605 | " \n", 3606 | " \n", 3607 | " \n", 3608 | " \n", 3609 | " \n", 3610 | " \n", 3611 | " \n", 3612 | " \n", 3613 | " \n", 3614 | " \n", 3615 | " \n", 3616 | " \n", 3617 | " \n", 3618 | " \n", 3619 | " \n", 3620 | " \n", 3621 | " \n", 3622 | " \n", 3623 | " \n", 3624 | " \n", 3625 | " \n", 3626 | " \n", 3627 | " \n", 3628 | " \n", 3629 | " \n", 3630 | " \n", 3631 | " \n", 3632 | " \n", 3633 | " \n", 3634 | " \n", 3635 | " \n", 3636 | " \n", 3637 | " \n", 3638 | " \n", 3639 | " \n", 3640 | " \n", 3641 | " \n", 3642 | " \n", 3643 | " \n", 3644 | " \n", 3645 | " \n", 3646 | " \n", 3647 | " \n", 3648 | " \n", 3649 | "
indexuiddevicesong_idsong_typesong_namesingerplay_timesong_lengthpaid_flagfile_namelabeldate
0 0 168308107 ar 162455 0 最初的梦想 范玮琪 296 296 0 20170410_2_play.log 0 20170410
1 1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 0 0 20170410_2_play.log 0 20170410
2 2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 156 0 20170410_2_play.log 0 20170410
3 3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 0 0 20170410_2_play.log 0 20170410
4 4 168274411 ar 176292 0 爱不爱我 零点乐队 333 334 0 20170410_2_play.log 0 20170410
\n", 3650 | "
" 3651 | ], 3652 | "text/plain": [ 3653 | " index uid device song_id song_type song_name singer play_time \\\n", 3654 | "0 0 168308107 ar 162455 0 最初的梦想 范玮琪 296 \n", 3655 | "1 1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 \n", 3656 | "2 2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 \n", 3657 | "3 3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 \n", 3658 | "4 4 168274411 ar 176292 0 爱不爱我 零点乐队 333 \n", 3659 | "\n", 3660 | " song_length paid_flag file_name label date \n", 3661 | "0 296 0 20170410_2_play.log 0 20170410 \n", 3662 | "1 0 0 20170410_2_play.log 0 20170410 \n", 3663 | "2 156 0 20170410_2_play.log 0 20170410 \n", 3664 | "3 0 0 20170410_2_play.log 0 20170410 \n", 3665 | "4 334 0 20170410_2_play.log 0 20170410 " 3666 | ] 3667 | }, 3668 | "execution_count": 40, 3669 | "metadata": {}, 3670 | "output_type": "execute_result" 3671 | } 3672 | ], 3673 | "source": [ 3674 | "df_play.head()" 3675 | ] 3676 | }, 3677 | { 3678 | "cell_type": "markdown", 3679 | "metadata": {}, 3680 | "source": [ 3681 | "### save file" 3682 | ] 3683 | }, 3684 | { 3685 | "cell_type": "code", 3686 | "execution_count": 41, 3687 | "metadata": { 3688 | "collapsed": true 3689 | }, 3690 | "outputs": [], 3691 | "source": [ 3692 | "df_play.to_csv('/Users/ZhijingYe/Desktop/data/output/play_sample_log.csv',sep='\\t')" 3693 | ] 3694 | }, 3695 | { 3696 | "cell_type": "code", 3697 | "execution_count": null, 3698 | "metadata": { 3699 | "collapsed": true 3700 | }, 3701 | "outputs": [], 3702 | "source": [] 3703 | } 3704 | ], 3705 | "metadata": { 3706 | "kernelspec": { 3707 | "display_name": "python27", 3708 | "language": "python", 3709 | "name": "python27" 3710 | }, 3711 | "language_info": { 3712 | "codemirror_mode": { 3713 | "name": "ipython", 3714 | "version": 2 3715 | }, 3716 | "file_extension": ".py", 3717 | "mimetype": "text/x-python", 3718 | "name": "python", 3719 | "nbconvert_exporter": "python", 3720 | "pygments_lexer": "ipython2", 3721 | "version": "2.7.14" 3722 | } 3723 | }, 3724 | "nbformat": 4, 3725 | 
"nbformat_minor": 2 3726 | } 3727 | --------------------------------------------------------------------------------