├── README.md
└── music_box
├── profit_curve.png
├── 1. Unzip & Prepare Raw Data.ipynb
├── Data Preparation-Churn labeling and Downsampling3.ipynb
└── 4Cleansing.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Music_box_project
2 |
3 | extract data file
4 | data processing
5 | EDA
6 | modeling
7 |
--------------------------------------------------------------------------------
/music_box/profit_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhijingye1992/Music_box_project/HEAD/music_box/profit_curve.png
--------------------------------------------------------------------------------
/music_box/1. Unzip & Prepare Raw Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "# In a macOS terminal, copy and run the commands below to unzip and clean the data.\n",
9 | "\n",
10 | "## First, cd into the data folder so that it is the working directory\n",
11 | "\n",
12 | "## unzip uid\n",
13 | "cp ../data/raw/3_1.uids.gz ../data/all_uid.txt.gz\n",
14 | "\n",
15 | "gunzip ../data/all_uid.txt.gz\n",
16 | "\n",
17 | "## unzip play log (archives are extracted into the current working directory)\n",
18 | "for f in ../data/raw/*_play.log.tar.gz\n",
19 | "\n",
20 | "do\n",
21 | " \n",
22 | "    echo \"Processing $f\"\n",
23 | " \n",
24 | "    tar -xvzf \"$f\"\n",
25 | "\n",
26 | "done\n",
27 | "\n",
28 | "## make a play folder (-p: no error if it already exists; same path the mv below targets)\n",
29 | "mkdir -p ../data/play\n",
30 | "\n",
31 | "## move play.log files to play folder\n",
32 | "mv *_play.log ../data/play/\n",
33 | "\n",
34 | "cp ../data/raw/*_play.log.gz ../data/play/\n",
35 | "\n",
36 | "gunzip ../data/play/*.gz\n",
37 | "\n",
38 | "## append file_name to each row (will be used for date)\n",
39 | "cd ../data/play/\n",
40 | "\n",
41 | "for f in *.log\n",
42 | "\n",
43 | "do\n",
44 | "\n",
45 | "    echo \"Processing $f\"\n",
46 | " \n",
47 | "    awk -v var=\"$f\" '{print $0,\"\\t\",var}' \"$f\" > \"${f}.fn\"\n",
48 | "\n",
49 | "done\n",
50 | "\n",
51 | "## cat all logs with filename into one file (the working directory is already the play folder after the cd above)\n",
52 | "\n",
53 | "cat *.log.fn > /Users/Xiaoxi/Desktop/BitTiger/Capstone/data/all_play.log.fn\n"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": []
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {
69 | "collapsed": true
70 | },
71 | "outputs": [],
72 | "source": []
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": []
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 2",
87 | "language": "python",
88 | "name": "python2"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 2
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython2",
100 | "version": "2.7.13"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 2
105 | }
106 |
--------------------------------------------------------------------------------
/music_box/Data Preparation-Churn labeling and Downsampling3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Data Preparation: Churn labeling and Downsampling\n",
8 | "\n",
9 | "#### 1. Read the *play.log files line by line, and write only the user ID, device and date of log into a separate file.\n",
10 | "\n",
11 | "#### 2. Label churn users: those who played at least three times before the cutoff day but had no activity after the cutoff.\n",
12 | "\n",
13 | "#### 3. Downsampling is necessary. There are more than 50,000 users and 15 GB of log data, which is more than the churn prediction needs and takes too long to process in full. I used a downsampling ratio of 1/10, so that only 1/10 of the active and churn users are included in the churn prediction model."
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "### 1. Churn labeling\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "import glob\n"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "138"
43 | ]
44 | },
45 | "execution_count": 2,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'\n",
52 | "files = glob.glob(filepath)\n",
53 | "# amount of files\n",
54 | "len(files)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "'/Users/ZhijingYe/Desktop/data/play/20170410_2_play.log'"
66 | ]
67 | },
68 | "execution_count": 3,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "# take a look at one of the files\n",
75 | "files[0]"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 4,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "1149628"
87 | ]
88 | },
89 | "execution_count": 4,
90 | "metadata": {},
91 | "output_type": "execute_result"
92 | }
93 | ],
94 | "source": [
95 | "# Read the first log fully into memory (later cells index into `lines`) and show its line count.\n",
96 | "with open(files[0], 'r') as f:\n",
97 | "    lines = f.readlines()\n",
98 | "log_lines = len(lines)\n",
99 | "log_lines"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "data": {
109 | "text/plain": [
110 | "'168071768\\tar\\t1248464\\t0\\t\\xe6\\x88\\x90\\xe7\\x8e\\x8b\\xe8\\xb4\\xa5\\xe5\\xaf\\x87\\t\\xe9\\x99\\x88\\xe5\\xb0\\x8f\\xe6\\x98\\xa5\\t187\\t187\\t0\\n'"
111 | ]
112 | },
113 | "execution_count": 5,
114 | "metadata": {},
115 | "output_type": "execute_result"
116 | }
117 | ],
118 | "source": [
119 | "# Check one line\n",
120 | "lines[3]"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 6,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "['168071768',\n",
132 | " 'ar',\n",
133 | " '1248464',\n",
134 | " '0',\n",
135 | " '\\xe6\\x88\\x90\\xe7\\x8e\\x8b\\xe8\\xb4\\xa5\\xe5\\xaf\\x87',\n",
136 | " '\\xe9\\x99\\x88\\xe5\\xb0\\x8f\\xe6\\x98\\xa5',\n",
137 | " '187',\n",
138 | " '187',\n",
139 | " '0',\n",
140 | " '20170410_2_play.log']"
141 | ]
142 | },
143 | "execution_count": 6,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "# Split one raw line on tabs and add the base name of its source file as an extra trailing field.\n",
150 | "test_list = lines[3].strip('\\n').split('\\t') + [files[0].split('/')[-1]]\n",
151 | "test_list"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 7,
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "name": "stderr",
161 | "output_type": "stream",
162 | "text": [
163 | "/Users/ZhijingYe/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,2,7) have mixed types. Specify dtype option on import or set low_memory=False.\n",
164 | " data = self._reader.read(nrows)\n"
165 | ]
166 | },
167 | {
168 | "data": {
169 | "text/html": [
170 | "
\n",
171 | "
\n",
172 | " \n",
173 | " \n",
174 | " | \n",
175 | " uid | \n",
176 | " device | \n",
177 | " song_id | \n",
178 | " song_type | \n",
179 | " song_name | \n",
180 | " singer | \n",
181 | " play_time | \n",
182 | " song_length | \n",
183 | " paid_flag | \n",
184 | " file_name | \n",
185 | "
\n",
186 | " \n",
187 | " \n",
188 | " \n",
189 | " | 0 | \n",
190 | " 1.683352e+08 | \n",
191 | " ar | \n",
192 | " 6429024 | \n",
193 | " 0 | \n",
194 | " 相对湿度 | \n",
195 | " 郑希怡 | \n",
196 | " 238 | \n",
197 | " 238 | \n",
198 | " 0 | \n",
199 | " NaN | \n",
200 | "
\n",
201 | " \n",
202 | " | 1 | \n",
203 | " 1.683105e+08 | \n",
204 | " ar | \n",
205 | " 3348254 | \n",
206 | " 0 | \n",
207 | " 曾经心痛 | \n",
208 | " 袁娅维 | \n",
209 | " 21 | \n",
210 | " 312 | \n",
211 | " 0 | \n",
212 | " NaN | \n",
213 | "
\n",
214 | " \n",
215 | " | 2 | \n",
216 | " 1.683082e+08 | \n",
217 | " ar | \n",
218 | " 5436214 | \n",
219 | " 0 | \n",
220 | " Dream A Little Dream | \n",
221 | " Robbie Williams | \n",
222 | " 246 | \n",
223 | " 247 | \n",
224 | " 0 | \n",
225 | " NaN | \n",
226 | "
\n",
227 | " \n",
228 | " | 3 | \n",
229 | " 1.680718e+08 | \n",
230 | " ar | \n",
231 | " 1248464 | \n",
232 | " 0 | \n",
233 | " 成王败寇 | \n",
234 | " 陈小春 | \n",
235 | " 187 | \n",
236 | " 187 | \n",
237 | " 0 | \n",
238 | " NaN | \n",
239 | "
\n",
240 | " \n",
241 | " | 4 | \n",
242 | " 1.684808e+08 | \n",
243 | " ar | \n",
244 | " 317412 | \n",
245 | " 0 | \n",
246 | " Kissy Kissy | \n",
247 | " Smile.DK | \n",
248 | " 188 | \n",
249 | " 189 | \n",
250 | " 0 | \n",
251 | " NaN | \n",
252 | "
\n",
253 | " \n",
254 | "
\n",
255 | "
"
256 | ],
257 | "text/plain": [
258 | " uid device song_id song_type song_name \\\n",
259 | "0 1.683352e+08 ar 6429024 0 相对湿度 \n",
260 | "1 1.683105e+08 ar 3348254 0 曾经心痛 \n",
261 | "2 1.683082e+08 ar 5436214 0 Dream A Little Dream \n",
262 | "3 1.680718e+08 ar 1248464 0 成王败寇 \n",
263 | "4 1.684808e+08 ar 317412 0 Kissy Kissy \n",
264 | "\n",
265 | " singer play_time song_length paid_flag file_name \n",
266 | "0 郑希怡 238 238 0 NaN \n",
267 | "1 袁娅维 21 312 0 NaN \n",
268 | "2 Robbie Williams 246 247 0 NaN \n",
269 | "3 陈小春 187 187 0 NaN \n",
270 | "4 Smile.DK 188 189 0 NaN "
271 | ]
272 | },
273 | "execution_count": 7,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "import pandas as pd\n",
280 | "import numpy as np\n",
281 | "# Preview one raw play log. The raw file has no file_name column (it shows as NaN here); it is added later.\n",
282 | "# uid is read as text, like the reduced-log reads further down, so ids are not displayed as floats;\n",
283 | "# low_memory=False parses the file in one pass, which avoids the mixed-type DtypeWarning.\n",
284 | "schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name']\n",
285 | "df = pd.read_csv(files[0], sep='\\t', header=None, index_col=None, names=schema, dtype={'uid': str}, low_memory=False)\n",
286 | "df.head()"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {
292 | "collapsed": true
293 | },
294 | "outputs": [],
295 | "source": []
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {},
300 | "source": [
301 | "### Save reduced play logs to two log files.\n",
302 | "Only the first two fields of each line (user id and device) plus the name of the source log file, which carries the log date, are saved; this is why they are called reduced play logs."
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 8,
308 | "metadata": {
309 | "collapsed": true
310 | },
311 | "outputs": [],
312 | "source": [
313 | "# 04/22 is the cutoff date for labeling churns\n",
314 | "cutoff = '20170422'"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 9,
320 | "metadata": {
321 | "collapsed": true
322 | },
323 | "outputs": [],
324 | "source": [
325 | "# destination file names to save the reduced logs.\n",
326 | "first_period_log = '/Users/ZhijingYe/Desktop/data/output/play_till_cutoff.log'\n",
327 | "second_period_log = '/Users/ZhijingYe/Desktop/data/output/play_after_cutoff.log'"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 10,
333 | "metadata": {},
334 | "outputs": [
335 | {
336 | "name": "stdout",
337 | "output_type": "stream",
338 | "text": [
339 | "processing file: 20170410_2_play.log\n",
340 | "...costs 3.02 seconds\n",
341 | "processing file: 20170410_3_play.log\n",
342 | "...costs 2.89 seconds\n",
343 | "processing file: 20170427_3_play.log\n",
344 | "...costs 2.90 seconds\n",
345 | "processing file: 20170427_2_play.log\n",
346 | "...costs 2.27 seconds\n",
347 | "processing file: 20170504_3_play.log\n",
348 | "...costs 2.05 seconds\n",
349 | "processing file: 20170504_2_play.log\n",
350 | "...costs 1.93 seconds\n",
351 | "processing file: 20170508_1_play.log\n",
352 | "...costs 2.01 seconds\n",
353 | "processing file: 20170505_1_play.log\n",
354 | "...costs 2.07 seconds\n",
355 | "processing file: 20170411_1_play.log\n",
356 | "...costs 2.74 seconds\n",
357 | "processing file: 20170426_1_play.log\n",
358 | "...costs 2.23 seconds\n",
359 | "processing file: 20170509_3_play.log\n",
360 | "...costs 1.81 seconds\n",
361 | "processing file: 20170509_2_play.log\n",
362 | "...costs 1.93 seconds\n",
363 | "processing file: 20170401_2_play.log\n",
364 | "...costs 4.37 seconds\n",
365 | "processing file: 20170401_3_play.log\n",
366 | "...costs 5.01 seconds\n",
367 | "processing file: 20170423_1_play.log\n",
368 | "...costs 2.54 seconds\n",
369 | "processing file: 20170414_1_play.log\n",
370 | "...costs 1.88 seconds\n",
371 | "processing file: 20170418_2_play.log\n",
372 | "...costs 2.39 seconds\n",
373 | "processing file: 20170418_3_play.log\n",
374 | "...costs 2.35 seconds\n",
375 | "processing file: 20170408_1_play.log\n",
376 | "...costs 3.25 seconds\n",
377 | "processing file: 20170404_2_play.log\n",
378 | "...costs 3.94 seconds\n",
379 | "processing file: 20170404_3_play.log\n",
380 | "...costs 3.89 seconds\n",
381 | "processing file: 20170510_3_play.log\n",
382 | "...costs 1.98 seconds\n",
383 | "processing file: 20170308_1_play.log\n",
384 | "...costs 3.13 seconds\n",
385 | "processing file: 20170510_2_play.log\n",
386 | "...costs 1.88 seconds\n",
387 | "processing file: 20170501_3_play.log\n",
388 | "...costs 2.21 seconds\n",
389 | "processing file: 20170501_2_play.log\n",
390 | "...costs 2.12 seconds\n",
391 | "processing file: 20170422_3_play.log\n",
392 | "...costs 2.34 seconds\n",
393 | "processing file: 20170422_2_play.log\n",
394 | "...costs 2.33 seconds\n",
395 | "processing file: 20170415_2_play.log\n",
396 | "...costs 2.56 seconds\n",
397 | "processing file: 20170415_3_play.log\n",
398 | "...costs 2.65 seconds\n",
399 | "processing file: 20170419_1_play.log\n",
400 | "...costs 2.25 seconds\n",
401 | "processing file: 20170409_2_play.log\n",
402 | "...costs 3.18 seconds\n",
403 | "processing file: 20170409_3_play.log\n",
404 | "...costs 3.18 seconds\n",
405 | "processing file: 20170305_1_play.log\n",
406 | "...costs 3.99 seconds\n",
407 | "processing file: 20170511_1_play.log\n",
408 | "...costs 2.23 seconds\n",
409 | "processing file: 20170405_1_play.log\n",
410 | "...costs 2.99 seconds\n",
411 | "processing file: 20170302_1_play.log\n",
412 | "...costs 6.22 seconds\n",
413 | "processing file: 20170402_1_play.log\n",
414 | "...costs 4.54 seconds\n",
415 | "processing file: 20170425_2_play.log\n",
416 | "...costs 2.09 seconds\n",
417 | "processing file: 20170425_3_play.log\n",
418 | "...costs 2.31 seconds\n",
419 | "processing file: 20170412_3_play.log\n",
420 | "...costs 2.76 seconds\n",
421 | "processing file: 20170412_2_play.log\n",
422 | "...costs 2.81 seconds\n",
423 | "processing file: 20170506_2_play.log\n",
424 | "...costs 2.34 seconds\n",
425 | "processing file: 20170506_3_play.log\n",
426 | "...costs 2.26 seconds\n",
427 | "processing file: 20170429_1_play.log\n",
428 | "...costs 2.55 seconds\n",
429 | "processing file: 20170403_2_play.log\n",
430 | "...costs 4.32 seconds\n",
431 | "processing file: 20170507_1_play.log\n",
432 | "...costs 2.19 seconds\n",
433 | "processing file: 20170424_1_play.log\n",
434 | "...costs 1.34 seconds\n",
435 | "processing file: 20170413_1_play.log\n",
436 | "...costs 2.59 seconds\n",
437 | "processing file: 20170428_2_play.log\n",
438 | "...costs 2.48 seconds\n",
439 | "processing file: 20170428_3_play.log\n",
440 | "...costs 2.34 seconds\n",
441 | "processing file: 20170331_2_play.log\n",
442 | "...costs 5.28 seconds\n",
443 | "processing file: 20170331_3_play.log\n",
444 | "...costs 7.89 seconds\n",
445 | "processing file: 20170406_3_play.log\n",
446 | "...costs 3.33 seconds\n",
447 | "processing file: 20170406_2_play.log\n",
448 | "...costs 3.08 seconds\n",
449 | "processing file: 20170512_2_play.log\n",
450 | "...costs 2.07 seconds\n",
451 | "processing file: 20170512_3_play.log\n",
452 | "...costs 1.92 seconds\n",
453 | "processing file: 20170416_1_play.log\n",
454 | "...costs 2.89 seconds\n",
455 | "processing file: 20170421_1_play.log\n",
456 | "...costs 2.38 seconds\n",
457 | "processing file: 20170502_1_play.log\n",
458 | "...costs 2.10 seconds\n",
459 | "processing file: 20170307_1_play.log\n",
460 | "...costs 3.02 seconds\n",
461 | "processing file: 20170430_1_play.log\n",
462 | "...costs 2.43 seconds\n",
463 | "processing file: 20170407_1_play.log\n",
464 | "...costs 2.90 seconds\n",
465 | "processing file: 20170503_2_play.log\n",
466 | "...costs 1.95 seconds\n",
467 | "processing file: 20170503_3_play.log\n",
468 | "...costs 1.95 seconds\n",
469 | "processing file: 20170417_3_play.log\n",
470 | "...costs 2.40 seconds\n",
471 | "processing file: 20170417_2_play.log\n",
472 | "...costs 2.43 seconds\n",
473 | "processing file: 20170420_2_play.log\n",
474 | "...costs 2.32 seconds\n",
475 | "processing file: 20170420_3_play.log\n",
476 | "...costs 2.18 seconds\n",
477 | "processing file: 20170422_1_play.log\n",
478 | "...costs 2.57 seconds\n",
479 | "processing file: 20170415_1_play.log\n",
480 | "...costs 2.78 seconds\n",
481 | "processing file: 20170501_1_play.log\n",
482 | "...costs 2.41 seconds\n",
483 | "processing file: 20170419_3_play.log\n",
484 | "...costs 2.56 seconds\n",
485 | "processing file: 20170419_2_play.log\n",
486 | "...costs 2.61 seconds\n",
487 | "processing file: 20170409_1_play.log\n",
488 | "...costs 3.09 seconds\n",
489 | "processing file: 20170405_3_play.log\n",
490 | "...costs 3.17 seconds\n",
491 | "processing file: 20170405_2_play.log\n",
492 | "...costs 3.11 seconds\n",
493 | "processing file: 20170309_1_play.log\n",
494 | "...costs 2.89 seconds\n",
495 | "processing file: 20170511_2_play.log\n",
496 | "...costs 1.89 seconds\n",
497 | "processing file: 20170511_3_play.log\n",
498 | "...costs 1.89 seconds\n",
499 | "processing file: 20170423_2_play.log\n",
500 | "...costs 2.45 seconds\n",
501 | "processing file: 20170423_3_play.log\n",
502 | "...costs 2.47 seconds\n",
503 | "processing file: 20170414_3_play.log\n",
504 | "...costs 1.81 seconds\n",
505 | "processing file: 20170414_2_play.log\n",
506 | "...costs 1.89 seconds\n",
507 | "processing file: 20170418_1_play.log\n",
508 | "...costs 2.35 seconds\n",
509 | "processing file: 20170301_play.log\n",
510 | "...costs 8.80 seconds\n",
511 | "processing file: 20170408_3_play.log\n",
512 | "...costs 3.43 seconds\n",
513 | "processing file: 20170408_2_play.log\n",
514 | "...costs 3.13 seconds\n",
515 | "processing file: 20170304_1_play.log\n",
516 | "...costs 4.05 seconds\n",
517 | "processing file: 20170510_1_play.log\n",
518 | "...costs 2.04 seconds\n",
519 | "processing file: 20170404_1_play.log\n",
520 | "...costs 4.52 seconds\n",
521 | "processing file: 20170411_2_play.log\n",
522 | "...costs 2.94 seconds\n",
523 | "processing file: 20170426_2_play.log\n",
524 | "...costs 2.28 seconds\n",
525 | "processing file: 20170426_3_play.log\n",
526 | "...costs 2.49 seconds\n",
527 | "processing file: 20170505_2_play.log\n",
528 | "...costs 2.09 seconds\n",
529 | "processing file: 20170505_3_play.log\n",
530 | "...costs 1.88 seconds\n",
531 | "processing file: 20170509_1_play.log\n",
532 | "...costs 2.10 seconds\n",
533 | "processing file: 20170401_1_play.log\n",
534 | "...costs 3.91 seconds\n",
535 | "processing file: 20170504_1_play.log\n",
536 | "...costs 2.01 seconds\n",
537 | "processing file: 20170410_1_play.log\n",
538 | "...costs 2.69 seconds\n",
539 | "processing file: 20170427_1_play.log\n",
540 | "...costs 2.17 seconds\n",
541 | "processing file: 20170508_2_play.log\n",
542 | "...costs 1.90 seconds\n",
543 | "processing file: 20170508_3_play.log\n",
544 | "...costs 1.90 seconds\n",
545 | "processing file: 20170330_3_play.log\n",
546 | "...costs 12.15 seconds\n",
547 | "processing file: 20170430_3_play.log\n",
548 | "...costs 2.70 seconds\n",
549 | "processing file: 20170430_2_play.log\n",
550 | "...costs 2.59 seconds\n",
551 | "processing file: 20170407_2_play.log\n",
552 | "...costs 2.81 seconds\n",
553 | "processing file: 20170407_3_play.log\n",
554 | "...costs 2.87 seconds\n",
555 | "processing file: 20170417_1_play.log\n",
556 | "...costs 2.49 seconds\n",
557 | "processing file: 20170420_1_play.log\n",
558 | "...costs 2.35 seconds\n",
559 | "processing file: 20170503_1_play.log\n",
560 | "...costs 1.98 seconds\n",
561 | "processing file: 20170331_1_play.log\n",
562 | "...costs 4.26 seconds\n",
563 | "processing file: 20170306_1_play.log\n",
564 | "...costs 3.23 seconds\n",
565 | "processing file: 20170512_1_play.log\n",
566 | "...costs 1.97 seconds\n",
567 | "processing file: 20170406_1_play.log\n",
568 | "...costs 3.00 seconds\n",
569 | "processing file: 20170502_3_play.log\n",
570 | "...costs 1.86 seconds\n",
571 | "processing file: 20170502_2_play.log\n",
572 | "...costs 1.87 seconds\n",
573 | "processing file: 20170416_2_play.log\n",
574 | "...costs 2.72 seconds\n",
575 | "processing file: 20170416_3_play.log\n",
576 | "...costs 2.72 seconds\n",
577 | "processing file: 20170421_3_play.log\n",
578 | "...costs 2.26 seconds\n",
579 | "processing file: 20170421_2_play.log\n",
580 | "...costs 2.30 seconds\n",
581 | "processing file: 20170303_1_play.log\n",
582 | "...costs 4.44 seconds\n",
583 | "processing file: 20170403_1_play.log\n",
584 | "...costs 4.09 seconds\n",
585 | "processing file: 20170424_3_play.log\n",
586 | "...costs 2.11 seconds\n",
587 | "processing file: 20170424_2_play.log\n",
588 | "...costs 0.34 seconds\n",
589 | "processing file: 20170413_2_play.log\n",
590 | "...costs 2.58 seconds\n",
591 | "processing file: 20170413_3_play.log\n",
592 | "...costs 2.67 seconds\n",
593 | "processing file: 20170507_3_play.log\n",
594 | "...costs 2.08 seconds\n",
595 | "processing file: 20170507_2_play.log\n",
596 | "...costs 2.16 seconds\n",
597 | "processing file: 20170428_1_play.log\n",
598 | "...costs 2.14 seconds\n",
599 | "processing file: 20170339_1_play.log\n",
600 | "...costs 5.11 seconds\n",
601 | "processing file: 20170402_2_play.log\n",
602 | "...costs 4.55 seconds\n",
603 | "processing file: 20170402_3_play.log\n",
604 | "...costs 4.52 seconds\n",
605 | "processing file: 20170506_1_play.log\n",
606 | "...costs 2.29 seconds\n",
607 | "processing file: 20170425_1_play.log\n",
608 | "...costs 2.27 seconds\n",
609 | "processing file: 20170412_1_play.log\n",
610 | "...costs 2.61 seconds\n",
611 | "processing file: 20170429_3_play.log\n",
612 | "...costs 2.68 seconds\n",
613 | "processing file: 20170429_2_play.log\n",
614 | "...costs 2.60 seconds\n"
615 | ]
616 | }
617 | ],
618 | "source": [
619 | "import os\n",
620 | "import time\n",
621 | "\n",
622 | "# Write a reduced copy (uid, device, source file name) of every raw log into one of two files,\n",
623 | "# chosen by comparing the file name with the cutoff. Both outputs are opened once in 'w' mode,\n",
624 | "# so re-running this cell overwrites them instead of appending duplicate rows.\n",
625 | "with open(first_period_log, 'w') as first_out, open(second_period_log, 'w') as second_out:\n",
626 | "    for each_file in files:\n",
627 | "        start_time = time.time()  # wall-clock time; time.clock() was removed in Python 3.8\n",
628 | "        filename = os.path.basename(each_file)\n",
629 | "        print('processing file: %s' % filename)\n",
630 | "        # String comparison: 'YYYYMMDD_n_play.log' sorts after the bare 'YYYYMMDD' cutoff,\n",
631 | "        # so logs of the cutoff day itself go to the second (after-cutoff) file.\n",
632 | "        output = first_out if filename < cutoff else second_out\n",
633 | "        with open(each_file, 'r') as f:\n",
634 | "            for line in f:  # stream the file instead of readlines() to keep memory flat\n",
635 | "                fields_to_keep = line.strip('\\n').split('\\t')[:2]\n",
636 | "                # NOTE(review): a line with fewer than two fields ends up with the file name in the device column.\n",
637 | "                fields_to_keep.append(filename)\n",
638 | "                output.write('\\t'.join(fields_to_keep) + '\\n')\n",
639 | "        print('...costs %.2f seconds' % (time.time() - start_time))"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 11,
645 | "metadata": {},
646 | "outputs": [
647 | {
648 | "data": {
649 | "text/html": [
650 | "\n",
651 | "
\n",
652 | " \n",
653 | " \n",
654 | " | \n",
655 | " uid | \n",
656 | " device | \n",
657 | " file_name | \n",
658 | "
\n",
659 | " \n",
660 | " \n",
661 | " \n",
662 | " | 0 | \n",
663 | " 168335198 | \n",
664 | " ar | \n",
665 | " 20170410_2_play.log | \n",
666 | "
\n",
667 | " \n",
668 | " | 1 | \n",
669 | " 168310452 | \n",
670 | " ar | \n",
671 | " 20170410_2_play.log | \n",
672 | "
\n",
673 | " \n",
674 | " | 2 | \n",
675 | " 168308159 | \n",
676 | " ar | \n",
677 | " 20170410_2_play.log | \n",
678 | "
\n",
679 | " \n",
680 | " | 3 | \n",
681 | " 168071768 | \n",
682 | " ar | \n",
683 | " 20170410_2_play.log | \n",
684 | "
\n",
685 | " \n",
686 | " | 4 | \n",
687 | " 168480816 | \n",
688 | " ar | \n",
689 | " 20170410_2_play.log | \n",
690 | "
\n",
691 | " \n",
692 | "
\n",
693 | "
"
694 | ],
695 | "text/plain": [
696 | " uid device file_name\n",
697 | "0 168335198 ar 20170410_2_play.log\n",
698 | "1 168310452 ar 20170410_2_play.log\n",
699 | "2 168308159 ar 20170410_2_play.log\n",
700 | "3 168071768 ar 20170410_2_play.log\n",
701 | "4 168480816 ar 20170410_2_play.log"
702 | ]
703 | },
704 | "execution_count": 11,
705 | "metadata": {},
706 | "output_type": "execute_result"
707 | }
708 | ],
709 | "source": [
710 | "import pandas as pd\n",
711 | "# Load the reduced log written to first_period_log; uid is read as text rather than inferred as a number.\n",
712 | "schema = ['uid', 'device', 'file_name']\n",
713 | "df_1 = pd.read_csv(first_period_log, sep='\\t', names=schema, header=None, index_col=None, dtype={'uid': str})\n",
714 | "df_1.head()"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": 12,
720 | "metadata": {},
721 | "outputs": [
722 | {
723 | "name": "stdout",
724 | "output_type": "stream",
725 | "text": [
726 | "\n",
727 | "Int64Index: 321455544 entries, 0 to 321455543\n",
728 | "Data columns (total 3 columns):\n",
729 | "uid object\n",
730 | "device object\n",
731 | "file_name object\n",
732 | "dtypes: object(3)\n",
733 | "memory usage: 9.6+ GB\n"
734 | ]
735 | }
736 | ],
737 | "source": [
738 | "df_1.info()"
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": 13,
744 | "metadata": {
745 | "collapsed": true
746 | },
747 | "outputs": [],
748 | "source": [
749 | "def get_date(file_name):\n",
750 | "    \"\"\"Return the text before the first underscore of a log file name (its date stamp).\"\"\"\n",
751 | "    return str(file_name).partition('_')[0]\n",
752 | "\n",
753 | "df_1['date'] = df_1['file_name'].map(get_date)"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": 14,
759 | "metadata": {
760 | "collapsed": true
761 | },
762 | "outputs": [],
763 | "source": [
764 | "df_1 = df_1.drop(['file_name'], axis = 1)"
765 | ]
766 | },
767 | {
768 | "cell_type": "code",
769 | "execution_count": null,
770 | "metadata": {
771 | "collapsed": true
772 | },
773 | "outputs": [],
774 | "source": []
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": null,
779 | "metadata": {
780 | "collapsed": true
781 | },
782 | "outputs": [],
783 | "source": []
784 | },
785 | {
786 | "cell_type": "markdown",
787 | "metadata": {},
788 | "source": [
789 | "### Explore the data"
790 | ]
791 | },
792 | {
793 | "cell_type": "code",
794 | "execution_count": 18,
795 | "metadata": {},
796 | "outputs": [
797 | {
798 | "data": {
799 | "text/plain": [
800 | "847330"
801 | ]
802 | },
803 | "execution_count": 18,
804 | "metadata": {},
805 | "output_type": "execute_result"
806 | }
807 | ],
808 | "source": [
809 | "len(df_1['uid'].unique())"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": 19,
815 | "metadata": {},
816 | "outputs": [
817 | {
818 | "data": {
819 | "text/plain": [
820 | "array(['ar', 'ip', 'mc', 'wp', 'ar ', 'ip ', '20170302_1_play.log',\n",
821 | " '168589573', '20170301_play.log', nan, '20170303_1_play.log',\n",
822 | " '20170339_1_play.log'], dtype=object)"
823 | ]
824 | },
825 | "execution_count": 19,
826 | "metadata": {},
827 | "output_type": "execute_result"
828 | }
829 | ],
830 | "source": [
831 | "df_1['device'].unique()"
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "execution_count": 21,
837 | "metadata": {
838 | "collapsed": true
839 | },
840 | "outputs": [],
841 | "source": [
842 | "# len(df_1['file_name'].unique())"
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": 22,
848 | "metadata": {},
849 | "outputs": [
850 | {
851 | "data": {
852 | "text/plain": [
853 | "1685126 11778180\n",
854 | "37025504 8535228\n",
855 | "751824 6796068\n",
856 | "1791497 5987916\n",
857 | "497685 4519674\n",
858 | "1062806 3776580\n",
859 | "736305 2829009\n",
860 | "1685126 1884981\n",
861 | "0 1815633\n",
862 | "37025504 1381125\n",
863 | "1749320 1207488\n",
864 | "1679121 784674\n",
865 | "46532274 756681\n",
866 | "28638487 634440\n",
867 | "637650 350460\n",
868 | "...\n",
869 | "167679654 3\n",
870 | "168963526 3\n",
871 | "168327556 3\n",
872 | "154699061 3\n",
873 | "168963528 3\n",
874 | "168761341 3\n",
875 | "154652167 3\n",
876 | "168686496 3\n",
877 | "154828622 3\n",
878 | "154494259 3\n",
879 | "154502301 3\n",
880 | "168280933 3\n",
881 | "154426629 3\n",
882 | "167932419 3\n",
883 | "168891406 3\n",
884 | "Length: 847329, dtype: int64"
885 | ]
886 | },
887 | "execution_count": 22,
888 | "metadata": {},
889 | "output_type": "execute_result"
890 | }
891 | ],
892 | "source": [
893 | "df_1.uid.value_counts()"
894 | ]
895 | },
896 | {
897 | "cell_type": "markdown",
898 | "metadata": {},
899 | "source": [
900 | "It looks like uid = 0 is a test id, and the uids with more log rows than uid = 0 may be robots. Check the device type of these ids. These user ids will be deleted later.\n"
901 | ]
902 | },
903 | {
904 | "cell_type": "markdown",
905 | "metadata": {},
906 | "source": [
907 | "### Criteria of active user: number of activities before cutoff date >= 3\n",
908 | "### Criteria of churn user: active users that have no activity after cutoff date\n",
909 | "### Criteria of loyal user: an active user (>= 3 activities before the cutoff date) who also has activity after the cutoff date"
910 | ]
911 | },
912 | {
913 | "cell_type": "code",
914 | "execution_count": 23,
915 | "metadata": {},
916 | "outputs": [
917 | {
918 | "data": {
919 | "text/plain": [
920 | "(847329, 0)"
921 | ]
922 | },
923 | "execution_count": 23,
924 | "metadata": {},
925 | "output_type": "execute_result"
926 | }
927 | ],
928 | "source": [
929 | "# Active = at least 3 rows before the cutoff; display (active, inactive) user counts. NOTE(review): the saved output shows 0 inactive users and only multiples of 3 in the displayed value_counts() -- check the reduced log for triplicated rows.\n",
930 | "active = df_1.uid.value_counts() >= 3\n",
931 | "active.sum(), (~active).sum()"
932 | ]
933 | },
934 | {
935 | "cell_type": "code",
936 | "execution_count": 24,
937 | "metadata": {
938 | "collapsed": true
939 | },
940 | "outputs": [],
941 | "source": [
942 | "active_users = active.index[active.values].tolist()  # uids flagged True in `active`; boolean mask replaces the per-position loop (no xrange, no integer lookup on a uid-labelled Series)"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 25,
948 | "metadata": {},
949 | "outputs": [
950 | {
951 | "data": {
952 | "text/plain": [
953 | "847329"
954 | ]
955 | },
956 | "execution_count": 25,
957 | "metadata": {},
958 | "output_type": "execute_result"
959 | }
960 | ],
961 | "source": [
962 | "len(active_users)"
963 | ]
964 | },
965 | {
966 | "cell_type": "code",
967 | "execution_count": 26,
968 | "metadata": {
969 | "collapsed": true
970 | },
971 | "outputs": [],
972 | "source": [
973 | "active_set = set(active_users)"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 27,
979 | "metadata": {},
980 | "outputs": [
981 | {
982 | "data": {
983 | "text/plain": [
984 | "847329"
985 | ]
986 | },
987 | "execution_count": 27,
988 | "metadata": {},
989 | "output_type": "execute_result"
990 | }
991 | ],
992 | "source": [
993 | "len(active_set)"
994 | ]
995 | },
996 | {
997 | "cell_type": "code",
998 | "execution_count": 28,
999 | "metadata": {},
1000 | "outputs": [
1001 | {
1002 | "data": {
1003 | "text/html": [
1004 | "\n",
1005 | "
\n",
1006 | " \n",
1007 | " \n",
1008 | " | \n",
1009 | " uid | \n",
1010 | " device | \n",
1011 | " file_name | \n",
1012 | "
\n",
1013 | " \n",
1014 | " \n",
1015 | " \n",
1016 | " | 0 | \n",
1017 | " 169026646 | \n",
1018 | " ar | \n",
1019 | " 20170427_3_play.log | \n",
1020 | "
\n",
1021 | " \n",
1022 | " | 1 | \n",
1023 | " 168553991 | \n",
1024 | " ar | \n",
1025 | " 20170427_3_play.log | \n",
1026 | "
\n",
1027 | " \n",
1028 | " | 2 | \n",
1029 | " 1685126 | \n",
1030 | " ar | \n",
1031 | " 20170427_3_play.log | \n",
1032 | "
\n",
1033 | " \n",
1034 | " | 3 | \n",
1035 | " 168845172 | \n",
1036 | " ar | \n",
1037 | " 20170427_3_play.log | \n",
1038 | "
\n",
1039 | " \n",
1040 | " | 4 | \n",
1041 | " 168538454 | \n",
1042 | " ar | \n",
1043 | " 20170427_3_play.log | \n",
1044 | "
\n",
1045 | " \n",
1046 | "
\n",
1047 | "
"
1048 | ],
1049 | "text/plain": [
1050 | " uid device file_name\n",
1051 | "0 169026646 ar 20170427_3_play.log\n",
1052 | "1 168553991 ar 20170427_3_play.log\n",
1053 | "2 1685126 ar 20170427_3_play.log\n",
1054 | "3 168845172 ar 20170427_3_play.log\n",
1055 | "4 168538454 ar 20170427_3_play.log"
1056 | ]
1057 | },
1058 | "execution_count": 28,
1059 | "metadata": {},
1060 | "output_type": "execute_result"
1061 | }
1062 | ],
1063 | "source": [
1064 | "# Now process the recent play.log file to get recent users.\n",
1065 | "df_2 = pd.read_csv(second_period_log,delimiter='\\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})\n",
1066 | "df_2.head()"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "execution_count": 29,
1072 | "metadata": {},
1073 | "outputs": [
1074 | {
1075 | "name": "stdout",
1076 | "output_type": "stream",
1077 | "text": [
1078 | "\n",
1079 | "Int64Index: 172407126 entries, 0 to 172407125\n",
1080 | "Data columns (total 3 columns):\n",
1081 | "uid object\n",
1082 | "device object\n",
1083 | "file_name object\n",
1084 | "dtypes: object(3)\n",
1085 | "memory usage: 5.1+ GB\n"
1086 | ]
1087 | }
1088 | ],
1089 | "source": [
1090 | "df_2.info()\n"
1091 | ]
1092 | },
1093 | {
1094 | "cell_type": "code",
1095 | "execution_count": 30,
1096 | "metadata": {
1097 | "collapsed": true
1098 | },
1099 | "outputs": [],
1100 | "source": [
1101 | "active_recent = df_2.uid.value_counts()"
1102 | ]
1103 | },
1104 | {
1105 | "cell_type": "code",
1106 | "execution_count": 31,
1107 | "metadata": {},
1108 | "outputs": [
1109 | {
1110 | "data": {
1111 | "text/plain": [
1112 | "273222"
1113 | ]
1114 | },
1115 | "execution_count": 31,
1116 | "metadata": {},
1117 | "output_type": "execute_result"
1118 | }
1119 | ],
1120 | "source": [
1121 | "len(active_recent) "
1122 | ]
1123 | },
1124 | {
1125 | "cell_type": "code",
1126 | "execution_count": 32,
1127 | "metadata": {},
1128 | "outputs": [
1129 | {
1130 | "data": {
1131 | "text/plain": [
1132 | "numpy.int64"
1133 | ]
1134 | },
1135 | "execution_count": 32,
1136 | "metadata": {},
1137 | "output_type": "execute_result"
1138 | }
1139 | ],
1140 | "source": [
1141 | "type(active_recent[0])"
1142 | ]
1143 | },
1144 | {
1145 | "cell_type": "code",
1146 | "execution_count": 33,
1147 | "metadata": {},
1148 | "outputs": [
1149 | {
1150 | "data": {
1151 | "text/plain": [
1152 | "273222"
1153 | ]
1154 | },
1155 | "execution_count": 33,
1156 | "metadata": {},
1157 | "output_type": "execute_result"
1158 | }
1159 | ],
1160 | "source": [
1161 | "active_set_recent = set(active_recent.index)\n",
1162 | "len(active_set_recent)"
1163 | ]
1164 | },
1165 | {
1166 | "cell_type": "code",
1167 | "execution_count": 34,
1168 | "metadata": {},
1169 | "outputs": [
1170 | {
1171 | "data": {
1172 | "text/plain": [
1173 | "598465"
1174 | ]
1175 | },
1176 | "execution_count": 34,
1177 | "metadata": {},
1178 | "output_type": "execute_result"
1179 | }
1180 | ],
1181 | "source": [
1182 | "# Churn user set:\n",
1183 | "churn_set = active_set - active_set_recent\n",
1184 | "len(churn_set)"
1185 | ]
1186 | },
1187 | {
1188 | "cell_type": "code",
1189 | "execution_count": 35,
1190 | "metadata": {},
1191 | "outputs": [
1192 | {
1193 | "data": {
1194 | "text/plain": [
1195 | "248864"
1196 | ]
1197 | },
1198 | "execution_count": 35,
1199 | "metadata": {},
1200 | "output_type": "execute_result"
1201 | }
1202 | ],
1203 | "source": [
1204 | "# Loyal user set:\n",
1205 | "loyal_set = active_set & active_set_recent\n",
1206 | "len(loyal_set)"
1207 | ]
1208 | },
1209 | {
1210 | "cell_type": "markdown",
1211 | "metadata": {},
1212 | "source": [
1213 | "### Downsample and save the reduced dataframes"
1214 | ]
1215 | },
1216 | {
1217 | "cell_type": "code",
1218 | "execution_count": 36,
1219 | "metadata": {
1220 | "collapsed": true
1221 | },
1222 | "outputs": [],
1223 | "source": [
1224 | "import random\n",
1225 | "\n",
1226 | "random.seed(42)"
1227 | ]
1228 | },
1229 | {
1230 | "cell_type": "code",
1231 | "execution_count": 37,
1232 | "metadata": {},
1233 | "outputs": [
1234 | {
1235 | "data": {
1236 | "text/plain": [
1237 | "24886"
1238 | ]
1239 | },
1240 | "execution_count": 37,
1241 | "metadata": {},
1242 | "output_type": "execute_result"
1243 | }
1244 | ],
1245 | "source": [
1246 | "loyal_sample = random.sample(sorted(loyal_set), len(loyal_set) // 10)  # 10% of loyal uids; sorted() gives a seed-reproducible sequence, // keeps the size an int on Python 3\n",
1247 | "len(loyal_sample)"
1248 | ]
1249 | },
1250 | {
1251 | "cell_type": "code",
1252 | "execution_count": 38,
1253 | "metadata": {},
1254 | "outputs": [
1255 | {
1256 | "data": {
1257 | "text/plain": [
1258 | "59846"
1259 | ]
1260 | },
1261 | "execution_count": 38,
1262 | "metadata": {},
1263 | "output_type": "execute_result"
1264 | }
1265 | ],
1266 | "source": [
1267 | "churn_sample = random.sample(sorted(churn_set), len(churn_set) // 10)  # 10% of churned uids; sorted() gives a seed-reproducible sequence, // keeps the size an int on Python 3\n",
1268 | "len(churn_sample)"
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "code",
1273 | "execution_count": 39,
1274 | "metadata": {
1275 | "collapsed": true
1276 | },
1277 | "outputs": [],
1278 | "source": [
1279 | "churn_sample_list = list(churn_sample)"
1280 | ]
1281 | },
1282 | {
1283 | "cell_type": "code",
1284 | "execution_count": 40,
1285 | "metadata": {
1286 | "collapsed": true
1287 | },
1288 | "outputs": [],
1289 | "source": [
1290 | "loyal_sample_list = list(loyal_sample)"
1291 | ]
1292 | },
1293 | {
1294 | "cell_type": "code",
1295 | "execution_count": 41,
1296 | "metadata": {
1297 | "collapsed": true
1298 | },
1299 | "outputs": [],
1300 | "source": [
1301 | "outfile = open(\"/Users/ZhijingYe/Desktop/data/output/churn_sample_list.pkl\",\"w\"); outfile.close()  # nothing is written through this handle in the following cells (np.save is used instead), so close it right away rather than leak it"
1302 | ]
1303 | },
1304 | {
1305 | "cell_type": "code",
1306 | "execution_count": 42,
1307 | "metadata": {
1308 | "collapsed": true
1309 | },
1310 | "outputs": [],
1311 | "source": [
1312 | "import numpy as np\n",
1313 | "np.save(\"/Users/ZhijingYe/Desktop/data/output/churn_sample_list\",churn_sample_list)"
1314 | ]
1315 | },
1316 | {
1317 | "cell_type": "code",
1318 | "execution_count": 43,
1319 | "metadata": {
1320 | "collapsed": true
1321 | },
1322 | "outputs": [],
1323 | "source": [
1324 | "np.save(\"/Users/ZhijingYe/Desktop/data/output/loyal_sample_list\",loyal_sample_list)"
1325 | ]
1326 | },
1327 | {
1328 | "cell_type": "code",
1329 | "execution_count": null,
1330 | "metadata": {
1331 | "collapsed": true
1332 | },
1333 | "outputs": [],
1334 | "source": []
1335 | },
1336 | {
1337 | "cell_type": "code",
1338 | "execution_count": null,
1339 | "metadata": {
1340 | "collapsed": true
1341 | },
1342 | "outputs": [],
1343 | "source": []
1344 | },
1345 | {
1346 | "cell_type": "code",
1347 | "execution_count": null,
1348 | "metadata": {
1349 | "collapsed": true
1350 | },
1351 | "outputs": [],
1352 | "source": []
1353 | },
1354 | {
1355 | "cell_type": "code",
1356 | "execution_count": null,
1357 | "metadata": {
1358 | "collapsed": true
1359 | },
1360 | "outputs": [],
1361 | "source": []
1362 | },
1363 | {
1364 | "cell_type": "code",
1365 | "execution_count": null,
1366 | "metadata": {
1367 | "collapsed": true
1368 | },
1369 | "outputs": [],
1370 | "source": []
1371 | },
1372 | {
1373 | "cell_type": "code",
1374 | "execution_count": 44,
1375 | "metadata": {
1376 | "collapsed": true
1377 | },
1378 | "outputs": [],
1379 | "source": [
1380 | "df_churn = df_1.loc[df_1.uid.isin(churn_sample),:]"
1381 | ]
1382 | },
1383 | {
1384 | "cell_type": "code",
1385 | "execution_count": 45,
1386 | "metadata": {},
1387 | "outputs": [
1388 | {
1389 | "data": {
1390 | "text/plain": [
1391 | "(9814302, 3)"
1392 | ]
1393 | },
1394 | "execution_count": 45,
1395 | "metadata": {},
1396 | "output_type": "execute_result"
1397 | }
1398 | ],
1399 | "source": [
1400 | "df_churn.shape"
1401 | ]
1402 | },
1403 | {
1404 | "cell_type": "code",
1405 | "execution_count": 46,
1406 | "metadata": {
1407 | "collapsed": true
1408 | },
1409 | "outputs": [],
1410 | "source": [
1411 | "df_loyal_log = df_1.loc[df_1.uid.isin(loyal_sample),:]"
1412 | ]
1413 | },
1414 | {
1415 | "cell_type": "code",
1416 | "execution_count": 47,
1417 | "metadata": {},
1418 | "outputs": [
1419 | {
1420 | "data": {
1421 | "text/plain": [
1422 | "(35070042, 3)"
1423 | ]
1424 | },
1425 | "execution_count": 47,
1426 | "metadata": {},
1427 | "output_type": "execute_result"
1428 | }
1429 | ],
1430 | "source": [
1431 | "df_loyal_log.shape"
1432 | ]
1433 | },
1434 | {
1435 | "cell_type": "code",
1436 | "execution_count": 48,
1437 | "metadata": {
1438 | "collapsed": true
1439 | },
1440 | "outputs": [],
1441 | "source": [
1442 | "df_churn.to_csv('/Users/ZhijingYe/Desktop/data/output/churn_df_sample.csv',sep='\\t', encoding='utf-8')\n",
1443 | "df_loyal_log.to_csv('/Users/ZhijingYe/Desktop/data/output/loyal_df_sample.csv',sep='\\t', encoding='utf-8')"
1444 | ]
1445 | },
1446 | {
1447 | "cell_type": "markdown",
1448 | "metadata": {},
1449 | "source": []
1450 | },
1451 | {
1452 | "cell_type": "code",
1453 | "execution_count": 49,
1454 | "metadata": {
1455 | "collapsed": true
1456 | },
1457 | "outputs": [],
1458 | "source": [
1459 | "import glob"
1460 | ]
1461 | },
1462 | {
1463 | "cell_type": "code",
1464 | "execution_count": 50,
1465 | "metadata": {},
1466 | "outputs": [
1467 | {
1468 | "data": {
1469 | "text/plain": [
1470 | "138"
1471 | ]
1472 | },
1473 | "execution_count": 50,
1474 | "metadata": {},
1475 | "output_type": "execute_result"
1476 | }
1477 | ],
1478 | "source": [
1479 | "filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'\n",
1480 | "files = glob.glob(filepath)\n",
1481 | "# number of play-log files matched by the glob pattern\n",
1482 | "len(files)"
1483 | ]
1484 | },
1485 | {
1486 | "cell_type": "code",
1487 | "execution_count": 51,
1488 | "metadata": {
1489 | "collapsed": true
1490 | },
1491 | "outputs": [],
1492 | "source": [
1493 | "schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name','label']"
1494 | ]
1495 | },
1496 | {
1497 | "cell_type": "code",
1498 | "execution_count": 52,
1499 | "metadata": {
1500 | "collapsed": true
1501 | },
1502 | "outputs": [],
1503 | "source": [
1504 | "output = open('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log','w')  # 'w' rather than 'a': truncate first so re-running the extraction cannot append duplicate rows"
1505 | ]
1506 | },
1507 | {
1508 | "cell_type": "code",
1509 | "execution_count": null,
1510 | "metadata": {},
1511 | "outputs": [
1512 | {
1513 | "name": "stdout",
1514 | "output_type": "stream",
1515 | "text": [
1516 | "processing file: 20170410_2_play.log\n"
1517 | ]
1518 | }
1519 | ],
1520 | "source": [
1521 | "import time\n",
1522 | "\n",
1523 | "# random.sample returned lists; build sets once so each per-line uid lookup is O(1) instead of a linear scan\n",
1524 | "churn_lookup, loyal_lookup = set(churn_sample), set(loyal_sample)\n",
1525 | "for the_file in files:\n",
1526 | "    current_time = time.time()  # time.clock() was removed in Python 3.8\n",
1527 | "    with open(the_file, 'r') as f:\n",
1528 | "        file_name = f.name.split('/')[-1]\n",
1529 | "        print('processing file: %s' % file_name)\n",
1530 | "        for line in f:  # stream the log instead of loading it all with readlines()\n",
1531 | "            fields = line.strip('\\n').split('\\t')\n",
1532 | "            # first field is the uid: label '1' for sampled churn users, '0' for sampled loyal users\n",
1533 | "            if fields[0] in churn_lookup:\n",
1534 | "                label = '1'\n",
1535 | "            elif fields[0] in loyal_lookup:\n",
1536 | "                label = '0'\n",
1537 | "            else:\n",
1538 | "                continue  # uid is in neither sample: drop the row\n",
1539 | "            # append the source file name and the label as two extra columns\n",
1540 | "            fields.extend((file_name, label))\n",
1541 | "            output.write('\\t'.join(fields) + '\\n')\n",
1542 | "    print('...costs %.2f seconds' % (time.time() - current_time))"
1543 | ]
1544 | },
1545 | {
1546 | "cell_type": "code",
1547 | "execution_count": null,
1548 | "metadata": {
1549 | "collapsed": true
1550 | },
1551 | "outputs": [],
1552 | "source": [
1553 | "output.close()"
1554 | ]
1555 | },
1556 | {
1557 | "cell_type": "code",
1558 | "execution_count": null,
1559 | "metadata": {
1560 | "collapsed": true
1561 | },
1562 | "outputs": [],
1563 | "source": [
1564 | "df_play = pd.read_csv('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log',\n",
1565 | " delimiter='\\t',header=None,index_col=None,names = schema,\n",
1566 | " dtype = {'uid':'str', 'song_id':'str','song_type' : 'str'})\n",
1567 | "df_play.head()"
1568 | ]
1569 | },
1570 | {
1571 | "cell_type": "code",
1572 | "execution_count": null,
1573 | "metadata": {
1574 | "collapsed": true
1575 | },
1576 | "outputs": [],
1577 | "source": [
1578 | "df_play.info()"
1579 | ]
1580 | }
1581 | ],
1582 | "metadata": {
1583 | "kernelspec": {
1584 | "display_name": "Python 3",
1585 | "language": "python",
1586 | "name": "python3"
1587 | },
1588 | "language_info": {
1589 | "codemirror_mode": {
1590 | "name": "ipython",
1591 | "version": 3
1592 | },
1593 | "file_extension": ".py",
1594 | "mimetype": "text/x-python",
1595 | "name": "python",
1596 | "nbconvert_exporter": "python",
1597 | "pygments_lexer": "ipython3",
1598 | "version": "3.6.2"
1599 | }
1600 | },
1601 | "nbformat": 4,
1602 | "nbformat_minor": 2
1603 | }
1604 |
--------------------------------------------------------------------------------
/music_box/4Cleansing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "\n",
14 | "import matplotlib.pyplot as plt\n",
15 | "\n",
16 | "%matplotlib inline"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {
23 | "collapsed": true
24 | },
25 | "outputs": [],
26 | "source": [
27 | "# Load saved list for churn and loyal sample users: churn_sample_list, loyal_sample_list\n",
28 | "churn_list = np.load('/Users/ZhijingYe/Desktop/data/output/churn_sample_list.npy').tolist()\n",
29 | "loyal_list = np.load('/Users/ZhijingYe/Desktop/data/output/loyal_sample_list.npy').tolist()\n"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/plain": [
40 | "(59846, 24886)"
41 | ]
42 | },
43 | "execution_count": 3,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "len(churn_list), len(loyal_list)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 4,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "set()"
61 | ]
62 | },
63 | "execution_count": 4,
64 | "metadata": {},
65 | "output_type": "execute_result"
66 | }
67 | ],
68 | "source": [
69 | "set(churn_list) & set(loyal_list)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 5,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/plain": [
80 | "59846"
81 | ]
82 | },
83 | "execution_count": 5,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "len(set(churn_list) - set(loyal_list))"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "### Read in the remaining columns line by line, keeping rows whose uid is in the sampled churn or loyal set"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {
103 | "collapsed": true
104 | },
105 | "outputs": [],
106 | "source": [
107 | "import glob"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 7,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/plain": [
118 | "138"
119 | ]
120 | },
121 | "execution_count": 7,
122 | "metadata": {},
123 | "output_type": "execute_result"
124 | }
125 | ],
126 | "source": [
127 | "filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'\n",
128 | "files = glob.glob(filepath)\n",
129 | "# number of play-log files matched by the glob pattern\n",
130 | "len(files)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 8,
136 | "metadata": {
137 | "collapsed": true
138 | },
139 | "outputs": [],
140 | "source": [
141 | "schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name','label']"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 9,
147 | "metadata": {
148 | "collapsed": true
149 | },
150 | "outputs": [],
151 | "source": [
152 | "output = open('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log','w')  # 'w' rather than 'a': truncate first so re-running the extraction cannot append duplicate rows"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 10,
158 | "metadata": {
159 | "collapsed": true
160 | },
161 | "outputs": [],
162 | "source": [
163 | "churn_set = set(churn_list)\n",
164 | "loyal_set = set(loyal_list)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 11,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "processing file: 20170410_2_play.log\n",
177 | "...costs 1.45 seconds\n",
178 | "processing file: 20170410_3_play.log\n",
179 | "...costs 1.54 seconds\n",
180 | "processing file: 20170427_3_play.log\n",
181 | "...costs 1.64 seconds\n",
182 | "processing file: 20170427_2_play.log\n",
183 | "...costs 1.22 seconds\n",
184 | "processing file: 20170504_3_play.log\n",
185 | "...costs 1.15 seconds\n",
186 | "processing file: 20170504_2_play.log\n",
187 | "...costs 1.12 seconds\n",
188 | "processing file: 20170508_1_play.log\n",
189 | "...costs 1.19 seconds\n",
190 | "processing file: 20170505_1_play.log\n",
191 | "...costs 1.14 seconds\n",
192 | "processing file: 20170411_1_play.log\n",
193 | "...costs 1.51 seconds\n",
194 | "processing file: 20170426_1_play.log\n",
195 | "...costs 1.25 seconds\n",
196 | "processing file: 20170509_3_play.log\n",
197 | "...costs 1.06 seconds\n",
198 | "processing file: 20170509_2_play.log\n",
199 | "...costs 1.08 seconds\n",
200 | "processing file: 20170401_2_play.log\n",
201 | "...costs 2.43 seconds\n",
202 | "processing file: 20170401_3_play.log\n",
203 | "...costs 2.74 seconds\n",
204 | "processing file: 20170423_1_play.log\n",
205 | "...costs 1.46 seconds\n",
206 | "processing file: 20170414_1_play.log\n",
207 | "...costs 1.03 seconds\n",
208 | "processing file: 20170418_2_play.log\n",
209 | "...costs 1.30 seconds\n",
210 | "processing file: 20170418_3_play.log\n",
211 | "...costs 1.28 seconds\n",
212 | "processing file: 20170408_1_play.log\n",
213 | "...costs 1.73 seconds\n",
214 | "processing file: 20170404_2_play.log\n",
215 | "...costs 2.19 seconds\n",
216 | "processing file: 20170404_3_play.log\n",
217 | "...costs 2.16 seconds\n",
218 | "processing file: 20170510_3_play.log\n",
219 | "...costs 1.11 seconds\n",
220 | "processing file: 20170308_1_play.log\n",
221 | "...costs 1.48 seconds\n",
222 | "processing file: 20170510_2_play.log\n",
223 | "...costs 1.12 seconds\n",
224 | "processing file: 20170501_3_play.log\n",
225 | "...costs 1.35 seconds\n",
226 | "processing file: 20170501_2_play.log\n",
227 | "...costs 1.33 seconds\n",
228 | "processing file: 20170422_3_play.log\n",
229 | "...costs 1.49 seconds\n",
230 | "processing file: 20170422_2_play.log\n",
231 | "...costs 1.44 seconds\n",
232 | "processing file: 20170415_2_play.log\n",
233 | "...costs 1.66 seconds\n",
234 | "processing file: 20170415_3_play.log\n",
235 | "...costs 1.65 seconds\n",
236 | "processing file: 20170419_1_play.log\n",
237 | "...costs 1.41 seconds\n",
238 | "processing file: 20170409_2_play.log\n",
239 | "...costs 1.82 seconds\n",
240 | "processing file: 20170409_3_play.log\n",
241 | "...costs 1.81 seconds\n",
242 | "processing file: 20170305_1_play.log\n",
243 | "...costs 2.02 seconds\n",
244 | "processing file: 20170511_1_play.log\n",
245 | "...costs 1.12 seconds\n",
246 | "processing file: 20170405_1_play.log\n",
247 | "...costs 1.64 seconds\n",
248 | "processing file: 20170302_1_play.log\n",
249 | "...costs 3.06 seconds\n",
250 | "processing file: 20170402_1_play.log\n",
251 | "...costs 2.36 seconds\n",
252 | "processing file: 20170425_2_play.log\n",
253 | "...costs 1.22 seconds\n",
254 | "processing file: 20170425_3_play.log\n",
255 | "...costs 1.22 seconds\n",
256 | "processing file: 20170412_3_play.log\n",
257 | "...costs 1.54 seconds\n",
258 | "processing file: 20170412_2_play.log\n",
259 | "...costs 1.51 seconds\n",
260 | "processing file: 20170506_2_play.log\n",
261 | "...costs 1.29 seconds\n",
262 | "processing file: 20170506_3_play.log\n",
263 | "...costs 1.29 seconds\n",
264 | "processing file: 20170429_1_play.log\n",
265 | "...costs 1.39 seconds\n",
266 | "processing file: 20170403_2_play.log\n",
267 | "...costs 2.37 seconds\n",
268 | "processing file: 20170507_1_play.log\n",
269 | "...costs 1.24 seconds\n",
270 | "processing file: 20170424_1_play.log\n",
271 | "...costs 0.77 seconds\n",
272 | "processing file: 20170413_1_play.log\n",
273 | "...costs 1.43 seconds\n",
274 | "processing file: 20170428_2_play.log\n",
275 | "...costs 1.28 seconds\n",
276 | "processing file: 20170428_3_play.log\n",
277 | "...costs 1.28 seconds\n",
278 | "processing file: 20170331_2_play.log\n",
279 | "...costs 2.72 seconds\n",
280 | "processing file: 20170331_3_play.log\n",
281 | "...costs 3.77 seconds\n",
282 | "processing file: 20170406_3_play.log\n",
283 | "...costs 1.80 seconds\n",
284 | "processing file: 20170406_2_play.log\n",
285 | "...costs 1.64 seconds\n",
286 | "processing file: 20170512_2_play.log\n",
287 | "...costs 1.11 seconds\n",
288 | "processing file: 20170512_3_play.log\n",
289 | "...costs 1.07 seconds\n",
290 | "processing file: 20170416_1_play.log\n",
291 | "...costs 1.54 seconds\n",
292 | "processing file: 20170421_1_play.log\n",
293 | "...costs 1.31 seconds\n",
294 | "processing file: 20170502_1_play.log\n",
295 | "...costs 1.10 seconds\n",
296 | "processing file: 20170307_1_play.log\n",
297 | "...costs 1.53 seconds\n",
298 | "processing file: 20170430_1_play.log\n",
299 | "...costs 1.35 seconds\n",
300 | "processing file: 20170407_1_play.log\n",
301 | "...costs 1.59 seconds\n",
302 | "processing file: 20170503_2_play.log\n",
303 | "...costs 1.13 seconds\n",
304 | "processing file: 20170503_3_play.log\n",
305 | "...costs 1.10 seconds\n",
306 | "processing file: 20170417_3_play.log\n",
307 | "...costs 1.43 seconds\n",
308 | "processing file: 20170417_2_play.log\n",
309 | "...costs 1.43 seconds\n",
310 | "processing file: 20170420_2_play.log\n",
311 | "...costs 1.42 seconds\n",
312 | "processing file: 20170420_3_play.log\n",
313 | "...costs 1.45 seconds\n",
314 | "processing file: 20170422_1_play.log\n",
315 | "...costs 1.57 seconds\n",
316 | "processing file: 20170415_1_play.log\n",
317 | "...costs 1.69 seconds\n",
318 | "processing file: 20170501_1_play.log\n",
319 | "...costs 1.42 seconds\n",
320 | "processing file: 20170419_3_play.log\n",
321 | "...costs 1.40 seconds\n",
322 | "processing file: 20170419_2_play.log\n",
323 | "...costs 1.46 seconds\n",
324 | "processing file: 20170409_1_play.log\n",
325 | "...costs 1.82 seconds\n",
326 | "processing file: 20170405_3_play.log\n",
327 | "...costs 1.86 seconds\n",
328 | "processing file: 20170405_2_play.log\n",
329 | "...costs 1.72 seconds\n",
330 | "processing file: 20170309_1_play.log\n",
331 | "...costs 1.46 seconds\n",
332 | "processing file: 20170511_2_play.log\n",
333 | "...costs 1.11 seconds\n",
334 | "processing file: 20170511_3_play.log\n",
335 | "...costs 1.13 seconds\n",
336 | "processing file: 20170423_2_play.log\n",
337 | "...costs 1.46 seconds\n",
338 | "processing file: 20170423_3_play.log\n",
339 | "...costs 1.52 seconds\n",
340 | "processing file: 20170414_3_play.log\n",
341 | "...costs 1.13 seconds\n",
342 | "processing file: 20170414_2_play.log\n",
343 | "...costs 1.10 seconds\n",
344 | "processing file: 20170418_1_play.log\n",
345 | "...costs 1.46 seconds\n",
346 | "processing file: 20170301_play.log\n",
347 | "...costs 4.65 seconds\n",
348 | "processing file: 20170408_3_play.log\n",
349 | "...costs 2.06 seconds\n",
350 | "processing file: 20170408_2_play.log\n",
351 | "...costs 1.92 seconds\n",
352 | "processing file: 20170304_1_play.log\n",
353 | "...costs 2.24 seconds\n",
354 | "processing file: 20170510_1_play.log\n",
355 | "...costs 1.21 seconds\n",
356 | "processing file: 20170404_1_play.log\n",
357 | "...costs 2.25 seconds\n",
358 | "processing file: 20170411_2_play.log\n",
359 | "...costs 1.60 seconds\n",
360 | "processing file: 20170426_2_play.log\n",
361 | "...costs 1.31 seconds\n",
362 | "processing file: 20170426_3_play.log\n",
363 | "...costs 1.25 seconds\n",
364 | "processing file: 20170505_2_play.log\n",
365 | "...costs 1.23 seconds\n",
366 | "processing file: 20170505_3_play.log\n",
367 | "...costs 1.19 seconds\n",
368 | "processing file: 20170509_1_play.log\n",
369 | "...costs 1.21 seconds\n",
370 | "processing file: 20170401_1_play.log\n",
371 | "...costs 2.38 seconds\n",
372 | "processing file: 20170504_1_play.log\n",
373 | "...costs 1.19 seconds\n",
374 | "processing file: 20170410_1_play.log\n",
375 | "...costs 1.55 seconds\n",
376 | "processing file: 20170427_1_play.log\n",
377 | "...costs 1.30 seconds\n",
378 | "processing file: 20170508_2_play.log\n",
379 | "...costs 1.15 seconds\n",
380 | "processing file: 20170508_3_play.log\n",
381 | "...costs 1.15 seconds\n",
382 | "processing file: 20170330_3_play.log\n",
383 | "...costs 6.34 seconds\n",
384 | "processing file: 20170430_3_play.log\n",
385 | "...costs 1.65 seconds\n",
386 | "processing file: 20170430_2_play.log\n",
387 | "...costs 1.45 seconds\n",
388 | "processing file: 20170407_2_play.log\n",
389 | "...costs 1.71 seconds\n",
390 | "processing file: 20170407_3_play.log\n",
391 | "...costs 1.82 seconds\n",
392 | "processing file: 20170417_1_play.log\n",
393 | "...costs 1.41 seconds\n",
394 | "processing file: 20170420_1_play.log\n",
395 | "...costs 1.37 seconds\n",
396 | "processing file: 20170503_1_play.log\n",
397 | "...costs 1.21 seconds\n",
398 | "processing file: 20170331_1_play.log\n",
399 | "...costs 2.58 seconds\n",
400 | "processing file: 20170306_1_play.log\n",
401 | "...costs 1.78 seconds\n",
402 | "processing file: 20170512_1_play.log\n",
403 | "...costs 1.23 seconds\n",
404 | "processing file: 20170406_1_play.log\n",
405 | "...costs 1.65 seconds\n",
406 | "processing file: 20170502_3_play.log\n",
407 | "...costs 1.09 seconds\n",
408 | "processing file: 20170502_2_play.log\n",
409 | "...costs 1.15 seconds\n",
410 | "processing file: 20170416_2_play.log\n",
411 | "...costs 1.71 seconds\n",
412 | "processing file: 20170416_3_play.log\n",
413 | "...costs 1.70 seconds\n",
414 | "processing file: 20170421_3_play.log\n",
415 | "...costs 1.41 seconds\n",
416 | "processing file: 20170421_2_play.log\n",
417 | "...costs 1.41 seconds\n",
418 | "processing file: 20170303_1_play.log\n",
419 | "...costs 2.44 seconds\n",
420 | "processing file: 20170403_1_play.log\n",
421 | "...costs 2.35 seconds\n",
422 | "processing file: 20170424_3_play.log\n",
423 | "...costs 1.31 seconds\n",
424 | "processing file: 20170424_2_play.log\n",
425 | "...costs 0.26 seconds\n",
426 | "processing file: 20170413_2_play.log\n",
427 | "...costs 1.51 seconds\n",
428 | "processing file: 20170413_3_play.log\n",
429 | "...costs 1.60 seconds\n",
430 | "processing file: 20170507_3_play.log\n",
431 | "...costs 1.32 seconds\n",
432 | "processing file: 20170507_2_play.log\n",
433 | "...costs 1.35 seconds\n",
434 | "processing file: 20170428_1_play.log\n",
435 | "...costs 1.28 seconds\n",
436 | "processing file: 20170339_1_play.log\n",
437 | "...costs 2.89 seconds\n",
438 | "processing file: 20170402_2_play.log\n",
439 | "...costs 2.67 seconds\n",
440 | "processing file: 20170402_3_play.log\n",
441 | "...costs 2.71 seconds\n",
442 | "processing file: 20170506_1_play.log\n",
443 | "...costs 1.41 seconds\n",
444 | "processing file: 20170425_1_play.log\n",
445 | "...costs 1.36 seconds\n",
446 | "processing file: 20170412_1_play.log\n",
447 | "...costs 1.64 seconds\n",
448 | "processing file: 20170429_3_play.log\n",
449 | "...costs 1.74 seconds\n",
450 | "processing file: 20170429_2_play.log\n",
451 | "...costs 1.60 seconds\n"
452 | ]
453 | }
454 | ],
455 | "source": [
456 | "import time\n",
457 | "\n",
458 | "# Copy every play record of a sampled user to `output`, adding the source file name and a churn label.\n",
459 | "for the_file in files:\n",
460 | "    current_time = time.time()  # time.clock() was removed in Python 3.8\n",
461 | "    with open(the_file, 'r') as f:\n",
462 | "        file_name = f.name.split('/')[-1]\n",
463 | "        print('processing file: %s' % file_name)\n",
464 | "        for line in f:  # stream the log instead of loading it all with readlines()\n",
465 | "            user_id = line.split('\\t', 1)[0]\n",
466 | "            # label '1' for uids in churn_set, '0' for uids in loyal_set; other rows are dropped\n",
467 | "            if user_id in churn_set:\n",
468 | "                label = '1'\n",
469 | "            elif user_id in loyal_set:\n",
470 | "                label = '0'\n",
471 | "            else:\n",
472 | "                continue\n",
473 | "            fields = line.strip('\\n').split('\\t')\n",
474 | "            fields.extend((file_name, label))\n",
475 | "            output.write('\\t'.join(fields) + '\\n')\n",
476 | "    # elapsed time spent on this file\n",
477 | "    print('...costs %.2f seconds' % (time.time() - current_time))"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 12,
483 | "metadata": {
484 | "collapsed": true
485 | },
486 | "outputs": [],
487 | "source": [
488 | "output.close()"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 13,
494 | "metadata": {},
495 | "outputs": [
496 | {
497 | "name": "stderr",
498 | "output_type": "stream",
499 | "text": [
500 | "/Users/ZhijingYe/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (7,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
501 | " data = self._reader.read(nrows)\n"
502 | ]
503 | },
504 | {
505 | "data": {
506 | "text/html": [
507 | "\n",
508 | "
\n",
509 | " \n",
510 | " \n",
511 | " | \n",
512 | " uid | \n",
513 | " device | \n",
514 | " song_id | \n",
515 | " song_type | \n",
516 | " song_name | \n",
517 | " singer | \n",
518 | " play_time | \n",
519 | " song_length | \n",
520 | " paid_flag | \n",
521 | " file_name | \n",
522 | " label | \n",
523 | "
\n",
524 | " \n",
525 | " \n",
526 | " \n",
527 | " | 0 | \n",
528 | " 168308107 | \n",
529 | " ar | \n",
530 | " 162455 | \n",
531 | " 0 | \n",
532 | " 最初的梦想 | \n",
533 | " 范玮琪 | \n",
534 | " 296 | \n",
535 | " 296 | \n",
536 | " 0 | \n",
537 | " 20170410_2_play.log | \n",
538 | " 0 | \n",
539 | "
\n",
540 | " \n",
541 | " | 1 | \n",
542 | " 168112765 | \n",
543 | " ar | \n",
544 | " 4393501 | \n",
545 | " 0 | \n",
546 | " 喜欢你(f101 粤) | \n",
547 | " Beyond | \n",
548 | " 272 | \n",
549 | " 0 | \n",
550 | " 0 | \n",
551 | " 20170410_2_play.log | \n",
552 | " 0 | \n",
553 | "
\n",
554 | " \n",
555 | " | 2 | \n",
556 | " 168274411 | \n",
557 | " ar | \n",
558 | " 22833011 | \n",
559 | " 0 | \n",
560 | " 宽恕 | \n",
561 | " 宽恕乐队 | \n",
562 | " 24 | \n",
563 | " 156 | \n",
564 | " 0 | \n",
565 | " 20170410_2_play.log | \n",
566 | " 0 | \n",
567 | "
\n",
568 | " \n",
569 | " | 3 | \n",
570 | " 0 | \n",
571 | " ar | \n",
572 | " 4266814 | \n",
573 | " 1 | \n",
574 | " 天使的翅膀 | \n",
575 | " 徐誉滕 | \n",
576 | " 214384 | \n",
577 | " 0 | \n",
578 | " 0 | \n",
579 | " 20170410_2_play.log | \n",
580 | " 0 | \n",
581 | "
\n",
582 | " \n",
583 | " | 4 | \n",
584 | " 168274411 | \n",
585 | " ar | \n",
586 | " 176292 | \n",
587 | " 0 | \n",
588 | " 爱不爱我 | \n",
589 | " 零点乐队 | \n",
590 | " 333 | \n",
591 | " 334 | \n",
592 | " 0 | \n",
593 | " 20170410_2_play.log | \n",
594 | " 0 | \n",
595 | "
\n",
596 | " \n",
597 | "
\n",
598 | "
"
599 | ],
600 | "text/plain": [
601 | " uid device song_id song_type song_name singer play_time \\\n",
602 | "0 168308107 ar 162455 0 最初的梦想 范玮琪 296 \n",
603 | "1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 \n",
604 | "2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 \n",
605 | "3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 \n",
606 | "4 168274411 ar 176292 0 爱不爱我 零点乐队 333 \n",
607 | "\n",
608 | " song_length paid_flag file_name label \n",
609 | "0 296 0 20170410_2_play.log 0 \n",
610 | "1 0 0 20170410_2_play.log 0 \n",
611 | "2 156 0 20170410_2_play.log 0 \n",
612 | "3 0 0 20170410_2_play.log 0 \n",
613 | "4 334 0 20170410_2_play.log 0 "
614 | ]
615 | },
616 | "execution_count": 13,
617 | "metadata": {},
618 | "output_type": "execute_result"
619 | }
620 | ],
621 | "source": [
622 | "df_play = pd.read_csv('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log',\n",
623 | "                      sep='\\t', names=schema, header=None, index_col=None,\n",
624 | "                      dtype={'uid': 'str', 'song_id': 'str', 'song_type': 'str'})  # keep ids as strings\n",
625 | "df_play.head()"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 14,
631 | "metadata": {},
632 | "outputs": [
633 | {
634 | "name": "stdout",
635 | "output_type": "stream",
636 | "text": [
637 | "\n",
638 | "Int64Index: 39701302 entries, 0 to 39701301\n",
639 | "Data columns (total 11 columns):\n",
640 | "uid object\n",
641 | "device object\n",
642 | "song_id object\n",
643 | "song_type object\n",
644 | "song_name object\n",
645 | "singer object\n",
646 | "play_time object\n",
647 | "song_length object\n",
648 | "paid_flag object\n",
649 | "file_name object\n",
650 | "label object\n",
651 | "dtypes: object(11)\n",
652 | "memory usage: 3.5+ GB\n"
653 | ]
654 | }
655 | ],
656 | "source": [
657 | "df_play.info()"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": 15,
663 | "metadata": {
664 | "collapsed": true
665 | },
666 | "outputs": [],
667 | "source": [
668 | "user_set = set(df_play.uid)  # distinct uid values present in the play log"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 16,
674 | "metadata": {},
675 | "outputs": [
676 | {
677 | "data": {
678 | "text/plain": [
679 | "150884"
680 | ]
681 | },
682 | "execution_count": 16,
683 | "metadata": {},
684 | "output_type": "execute_result"
685 | }
686 | ],
687 | "source": [
688 | "len(user_set)"
689 | ]
690 | },
691 | {
692 | "cell_type": "code",
693 | "execution_count": 17,
694 | "metadata": {},
695 | "outputs": [
696 | {
697 | "data": {
698 | "text/plain": [
699 | "-30"
700 | ]
701 | },
702 | "execution_count": 17,
703 | "metadata": {},
704 | "output_type": "execute_result"
705 | }
706 | ],
707 | "source": [
708 | "-len(churn_set - user_set)  # minus the count of churn_set uids that never appear in the play log"
709 | ]
710 | },
711 | {
712 | "cell_type": "code",
713 | "execution_count": 18,
714 | "metadata": {},
715 | "outputs": [
716 | {
717 | "data": {
718 | "text/plain": [
719 | "0"
720 | ]
721 | },
722 | "execution_count": 18,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "-len(loyal_set - user_set)  # minus the count of loyal_set uids that never appear in the play log"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 19,
734 | "metadata": {},
735 | "outputs": [
736 | {
737 | "data": {
738 | "text/plain": [
739 | "set()"
740 | ]
741 | },
742 | "execution_count": 19,
743 | "metadata": {},
744 | "output_type": "execute_result"
745 | }
746 | ],
747 | "source": [
748 | "user_set - churn_set.union(loyal_set)  # uids in the play log that are in neither churn_set nor loyal_set"
749 | ]
750 | },
751 | {
752 | "cell_type": "code",
753 | "execution_count": 20,
754 | "metadata": {},
755 | "outputs": [
756 | {
757 | "data": {
758 | "text/plain": [
759 | "uid 9426\n",
760 | "device 0\n",
761 | "song_id 7510\n",
762 | "song_type 23109\n",
763 | "song_name 5524\n",
764 | "singer 49189\n",
765 | "play_time 160232\n",
766 | "song_length 20717\n",
767 | "paid_flag 51979\n",
768 | "file_name 51980\n",
769 | "label 51980\n",
770 | "dtype: int64"
771 | ]
772 | },
773 | "execution_count": 20,
774 | "metadata": {},
775 | "output_type": "execute_result"
776 | }
777 | ],
778 | "source": [
779 | "df_play.isnull().sum()  # missing values per column"
780 | ]
781 | },
782 | {
783 | "cell_type": "markdown",
784 | "metadata": {},
785 | "source": [
786 | "### Missing values\n"
787 | ]
788 | },
789 | {
790 | "cell_type": "code",
791 | "execution_count": 21,
792 | "metadata": {},
793 | "outputs": [
794 | {
795 | "data": {
796 | "text/html": [
797 | "\n",
798 | "
\n",
799 | " \n",
800 | " \n",
801 | " | \n",
802 | " uid | \n",
803 | " device | \n",
804 | " song_id | \n",
805 | " song_type | \n",
806 | " song_name | \n",
807 | " singer | \n",
808 | " play_time | \n",
809 | " song_length | \n",
810 | " paid_flag | \n",
811 | " file_name | \n",
812 | " label | \n",
813 | "
\n",
814 | " \n",
815 | " \n",
816 | " \n",
817 | " | 3164 | \n",
818 | " NaN | \n",
819 | " 朴树 | \n",
820 | " 2 | \n",
821 | " 23 | \n",
822 | " 0 | \n",
823 | " 20170410_2_play.log | \n",
824 | " 0 | \n",
825 | " NaN | \n",
826 | " NaN | \n",
827 | " NaN | \n",
828 | " NaN | \n",
829 | "
\n",
830 | " \n",
831 | " | 5618 | \n",
832 | " NaN | \n",
833 | " 刘涛 | \n",
834 | " 31 | \n",
835 | " 30 | \n",
836 | " 0 | \n",
837 | " 20170410_2_play.log | \n",
838 | " 0 | \n",
839 | " NaN | \n",
840 | " NaN | \n",
841 | " NaN | \n",
842 | " NaN | \n",
843 | "
\n",
844 | " \n",
845 | " | 5690 | \n",
846 | " NaN | \n",
847 | " 刘涛 | \n",
848 | " 51 | \n",
849 | " 51 | \n",
850 | " 0 | \n",
851 | " 20170410_2_play.log | \n",
852 | " 0 | \n",
853 | " NaN | \n",
854 | " NaN | \n",
855 | " NaN | \n",
856 | " NaN | \n",
857 | "
\n",
858 | " \n",
859 | " | 6643 | \n",
860 | " NaN | \n",
861 | " 薛之谦 | \n",
862 | " 19 | \n",
863 | " 41 | \n",
864 | " 0 | \n",
865 | " 20170410_2_play.log | \n",
866 | " 0 | \n",
867 | " NaN | \n",
868 | " NaN | \n",
869 | " NaN | \n",
870 | " NaN | \n",
871 | "
\n",
872 | " \n",
873 | " | 7879 | \n",
874 | " NaN | \n",
875 | " EXO | \n",
876 | " 26 | \n",
877 | " 26 | \n",
878 | " 0 | \n",
879 | " 20170410_2_play.log | \n",
880 | " 0 | \n",
881 | " NaN | \n",
882 | " NaN | \n",
883 | " NaN | \n",
884 | " NaN | \n",
885 | "
\n",
886 | " \n",
887 | " | 11400 | \n",
888 | " NaN | \n",
889 | " 佛教音乐 | \n",
890 | " 51 | \n",
891 | " 51 | \n",
892 | " 0 | \n",
893 | " 20170410_2_play.log | \n",
894 | " 0 | \n",
895 | " NaN | \n",
896 | " NaN | \n",
897 | " NaN | \n",
898 | " NaN | \n",
899 | "
\n",
900 | " \n",
901 | " | 11918 | \n",
902 | " NaN | \n",
903 | " 佛教音乐 | \n",
904 | " 51 | \n",
905 | " 51 | \n",
906 | " 0 | \n",
907 | " 20170410_2_play.log | \n",
908 | " 0 | \n",
909 | " NaN | \n",
910 | " NaN | \n",
911 | " NaN | \n",
912 | " NaN | \n",
913 | "
\n",
914 | " \n",
915 | " | 19485 | \n",
916 | " NaN | \n",
917 | " 杨洋 | \n",
918 | " 0 | \n",
919 | " 29 | \n",
920 | " 0 | \n",
921 | " 20170410_2_play.log | \n",
922 | " 0 | \n",
923 | " NaN | \n",
924 | " NaN | \n",
925 | " NaN | \n",
926 | " NaN | \n",
927 | "
\n",
928 | " \n",
929 | " | 21698 | \n",
930 | " NaN | \n",
931 | " 左宏元&张慧清 | \n",
932 | " 0 | \n",
933 | " 46 | \n",
934 | " 0 | \n",
935 | " 20170410_2_play.log | \n",
936 | " 0 | \n",
937 | " NaN | \n",
938 | " NaN | \n",
939 | " NaN | \n",
940 | " NaN | \n",
941 | "
\n",
942 | " \n",
943 | " | 25202 | \n",
944 | " NaN | \n",
945 | " 好想好想(51秒铃声版) | \n",
946 | " 0 | \n",
947 | " 52 | \n",
948 | " 0 | \n",
949 | " 20170410_2_play.log | \n",
950 | " 0 | \n",
951 | " NaN | \n",
952 | " NaN | \n",
953 | " NaN | \n",
954 | " NaN | \n",
955 | "
\n",
956 | " \n",
957 | " | 25782 | \n",
958 | " NaN | \n",
959 | " 汤晶锦 | \n",
960 | " 40 | \n",
961 | " 40 | \n",
962 | " 0 | \n",
963 | " 20170410_2_play.log | \n",
964 | " 0 | \n",
965 | " NaN | \n",
966 | " NaN | \n",
967 | " NaN | \n",
968 | " NaN | \n",
969 | "
\n",
970 | " \n",
971 | " | 40061 | \n",
972 | " NaN | \n",
973 | " 本兮 | \n",
974 | " 26 | \n",
975 | " 26 | \n",
976 | " 0 | \n",
977 | " 20170410_2_play.log | \n",
978 | " 0 | \n",
979 | " NaN | \n",
980 | " NaN | \n",
981 | " NaN | \n",
982 | " NaN | \n",
983 | "
\n",
984 | " \n",
985 | " | 40706 | \n",
986 | " NaN | \n",
987 | " 云菲菲 | \n",
988 | " 30 | \n",
989 | " 30 | \n",
990 | " 0 | \n",
991 | " 20170410_2_play.log | \n",
992 | " 0 | \n",
993 | " NaN | \n",
994 | " NaN | \n",
995 | " NaN | \n",
996 | " NaN | \n",
997 | "
\n",
998 | " \n",
999 | " | 48130 | \n",
1000 | " NaN | \n",
1001 | " 庄心妍 | \n",
1002 | " 56 | \n",
1003 | " 56 | \n",
1004 | " 0 | \n",
1005 | " 20170410_2_play.log | \n",
1006 | " 0 | \n",
1007 | " NaN | \n",
1008 | " NaN | \n",
1009 | " NaN | \n",
1010 | " NaN | \n",
1011 | "
\n",
1012 | " \n",
1013 | " | 51385 | \n",
1014 | " NaN | \n",
1015 | " 冷漠 | \n",
1016 | " 30 | \n",
1017 | " 30 | \n",
1018 | " 0 | \n",
1019 | " 20170410_2_play.log | \n",
1020 | " 0 | \n",
1021 | " NaN | \n",
1022 | " NaN | \n",
1023 | " NaN | \n",
1024 | " NaN | \n",
1025 | "
\n",
1026 | " \n",
1027 | " | 51818 | \n",
1028 | " NaN | \n",
1029 | " 薛之谦 | \n",
1030 | " 130 | \n",
1031 | " 41 | \n",
1032 | " 0 | \n",
1033 | " 20170410_2_play.log | \n",
1034 | " 0 | \n",
1035 | " NaN | \n",
1036 | " NaN | \n",
1037 | " NaN | \n",
1038 | " NaN | \n",
1039 | "
\n",
1040 | " \n",
1041 | " | 59378 | \n",
1042 | " NaN | \n",
1043 | " 阿悄 | \n",
1044 | " 27 | \n",
1045 | " 28 | \n",
1046 | " 0 | \n",
1047 | " 20170410_2_play.log | \n",
1048 | " 0 | \n",
1049 | " NaN | \n",
1050 | " NaN | \n",
1051 | " NaN | \n",
1052 | " NaN | \n",
1053 | "
\n",
1054 | " \n",
1055 | " | 69626 | \n",
1056 | " NaN | \n",
1057 | " 魏栾 | \n",
1058 | " 30 | \n",
1059 | " 29 | \n",
1060 | " 0 | \n",
1061 | " 20170410_2_play.log | \n",
1062 | " 0 | \n",
1063 | " NaN | \n",
1064 | " NaN | \n",
1065 | " NaN | \n",
1066 | " NaN | \n",
1067 | "
\n",
1068 | " \n",
1069 | " | 74606 | \n",
1070 | " NaN | \n",
1071 | " 马頔 | \n",
1072 | " 29 | \n",
1073 | " 29 | \n",
1074 | " 0 | \n",
1075 | " 20170410_2_play.log | \n",
1076 | " 0 | \n",
1077 | " NaN | \n",
1078 | " NaN | \n",
1079 | " NaN | \n",
1080 | " NaN | \n",
1081 | "
\n",
1082 | " \n",
1083 | " | 74608 | \n",
1084 | " NaN | \n",
1085 | " Beyond | \n",
1086 | " 27 | \n",
1087 | " 27 | \n",
1088 | " 0 | \n",
1089 | " 20170410_2_play.log | \n",
1090 | " 0 | \n",
1091 | " NaN | \n",
1092 | " NaN | \n",
1093 | " NaN | \n",
1094 | " NaN | \n",
1095 | "
\n",
1096 | " \n",
1097 | " | 74612 | \n",
1098 | " NaN | \n",
1099 | " 云菲菲 | \n",
1100 | " 30 | \n",
1101 | " 30 | \n",
1102 | " 0 | \n",
1103 | " 20170410_2_play.log | \n",
1104 | " 0 | \n",
1105 | " NaN | \n",
1106 | " NaN | \n",
1107 | " NaN | \n",
1108 | " NaN | \n",
1109 | "
\n",
1110 | " \n",
1111 | " | 75153 | \n",
1112 | " NaN | \n",
1113 | " Delacey | \n",
1114 | " 5 | \n",
1115 | " 35 | \n",
1116 | " 0 | \n",
1117 | " 20170410_2_play.log | \n",
1118 | " 1 | \n",
1119 | " NaN | \n",
1120 | " NaN | \n",
1121 | " NaN | \n",
1122 | " NaN | \n",
1123 | "
\n",
1124 | " \n",
1125 | " | 76095 | \n",
1126 | " NaN | \n",
1127 | " Fall Out Boy | \n",
1128 | " 30 | \n",
1129 | " 30 | \n",
1130 | " 0 | \n",
1131 | " 20170410_2_play.log | \n",
1132 | " 1 | \n",
1133 | " NaN | \n",
1134 | " NaN | \n",
1135 | " NaN | \n",
1136 | " NaN | \n",
1137 | "
\n",
1138 | " \n",
1139 | " | 76368 | \n",
1140 | " NaN | \n",
1141 | " 庄心妍 | \n",
1142 | " 15 | \n",
1143 | " 36 | \n",
1144 | " 0 | \n",
1145 | " 20170410_2_play.log | \n",
1146 | " 0 | \n",
1147 | " NaN | \n",
1148 | " NaN | \n",
1149 | " NaN | \n",
1150 | " NaN | \n",
1151 | "
\n",
1152 | " \n",
1153 | " | 76661 | \n",
1154 | " NaN | \n",
1155 | " 冷漠 | \n",
1156 | " 30 | \n",
1157 | " 30 | \n",
1158 | " 0 | \n",
1159 | " 20170410_2_play.log | \n",
1160 | " 0 | \n",
1161 | " NaN | \n",
1162 | " NaN | \n",
1163 | " NaN | \n",
1164 | " NaN | \n",
1165 | "
\n",
1166 | " \n",
1167 | " | 78444 | \n",
1168 | " NaN | \n",
1169 | " 佛教音乐 | \n",
1170 | " 50 | \n",
1171 | " 51 | \n",
1172 | " 0 | \n",
1173 | " 20170410_2_play.log | \n",
1174 | " 0 | \n",
1175 | " NaN | \n",
1176 | " NaN | \n",
1177 | " NaN | \n",
1178 | " NaN | \n",
1179 | "
\n",
1180 | " \n",
1181 | " | 79352 | \n",
1182 | " NaN | \n",
1183 | " 佛教音乐 | \n",
1184 | " 50 | \n",
1185 | " 51 | \n",
1186 | " 0 | \n",
1187 | " 20170410_2_play.log | \n",
1188 | " 0 | \n",
1189 | " NaN | \n",
1190 | " NaN | \n",
1191 | " NaN | \n",
1192 | " NaN | \n",
1193 | "
\n",
1194 | " \n",
1195 | " | 80956 | \n",
1196 | " NaN | \n",
1197 | " 佛教音乐 | \n",
1198 | " 50 | \n",
1199 | " 51 | \n",
1200 | " 0 | \n",
1201 | " 20170410_2_play.log | \n",
1202 | " 0 | \n",
1203 | " NaN | \n",
1204 | " NaN | \n",
1205 | " NaN | \n",
1206 | " NaN | \n",
1207 | "
\n",
1208 | " \n",
1209 | " | 81504 | \n",
1210 | " NaN | \n",
1211 | " 刁寒 | \n",
1212 | " 30 | \n",
1213 | " 30 | \n",
1214 | " 0 | \n",
1215 | " 20170410_2_play.log | \n",
1216 | " 0 | \n",
1217 | " NaN | \n",
1218 | " NaN | \n",
1219 | " NaN | \n",
1220 | " NaN | \n",
1221 | "
\n",
1222 | " \n",
1223 | " | 84367 | \n",
1224 | " NaN | \n",
1225 | " 唐古&蔡晓 | \n",
1226 | " 24 | \n",
1227 | " 24 | \n",
1228 | " 0 | \n",
1229 | " 20170410_2_play.log | \n",
1230 | " 0 | \n",
1231 | " NaN | \n",
1232 | " NaN | \n",
1233 | " NaN | \n",
1234 | " NaN | \n",
1235 | "
\n",
1236 | " \n",
1237 | " | ... | \n",
1238 | " ... | \n",
1239 | " ... | \n",
1240 | " ... | \n",
1241 | " ... | \n",
1242 | " ... | \n",
1243 | " ... | \n",
1244 | " ... | \n",
1245 | " ... | \n",
1246 | " ... | \n",
1247 | " ... | \n",
1248 | " ... | \n",
1249 | "
\n",
1250 | " \n",
1251 | " | 39571851 | \n",
1252 | " NaN | \n",
1253 | " 曹龙 | \n",
1254 | " 1 | \n",
1255 | " 33 | \n",
1256 | " 0 | \n",
1257 | " 20170429_2_play.log | \n",
1258 | " 0 | \n",
1259 | " NaN | \n",
1260 | " NaN | \n",
1261 | " NaN | \n",
1262 | " NaN | \n",
1263 | "
\n",
1264 | " \n",
1265 | " | 39580074 | \n",
1266 | " NaN | \n",
1267 | " 铃声 | \n",
1268 | " 0 | \n",
1269 | " 60 | \n",
1270 | " 0 | \n",
1271 | " 20170429_2_play.log | \n",
1272 | " 0 | \n",
1273 | " NaN | \n",
1274 | " NaN | \n",
1275 | " NaN | \n",
1276 | " NaN | \n",
1277 | "
\n",
1278 | " \n",
1279 | " | 39587988 | \n",
1280 | " NaN | \n",
1281 | " 陈小云 | \n",
1282 | " 39 | \n",
1283 | " 39 | \n",
1284 | " 0 | \n",
1285 | " 20170429_2_play.log | \n",
1286 | " 0 | \n",
1287 | " NaN | \n",
1288 | " NaN | \n",
1289 | " NaN | \n",
1290 | " NaN | \n",
1291 | "
\n",
1292 | " \n",
1293 | " | 39604491 | \n",
1294 | " NaN | \n",
1295 | " Delacey | \n",
1296 | " 35 | \n",
1297 | " 35 | \n",
1298 | " 0 | \n",
1299 | " 20170429_2_play.log | \n",
1300 | " 0 | \n",
1301 | " NaN | \n",
1302 | " NaN | \n",
1303 | " NaN | \n",
1304 | " NaN | \n",
1305 | "
\n",
1306 | " \n",
1307 | " | 39614509 | \n",
1308 | " NaN | \n",
1309 | " 后弦 | \n",
1310 | " 52 | \n",
1311 | " 53 | \n",
1312 | " 0 | \n",
1313 | " 20170429_2_play.log | \n",
1314 | " 0 | \n",
1315 | " NaN | \n",
1316 | " NaN | \n",
1317 | " NaN | \n",
1318 | " NaN | \n",
1319 | "
\n",
1320 | " \n",
1321 | " | 39627721 | \n",
1322 | " NaN | \n",
1323 | " 王馨平 | \n",
1324 | " 27 | \n",
1325 | " 28 | \n",
1326 | " 0 | \n",
1327 | " 20170429_2_play.log | \n",
1328 | " 0 | \n",
1329 | " NaN | \n",
1330 | " NaN | \n",
1331 | " NaN | \n",
1332 | " NaN | \n",
1333 | "
\n",
1334 | " \n",
1335 | " | 39628315 | \n",
1336 | " NaN | \n",
1337 | " 还舍不得离别(36秒铃声版)-(电视剧《美丽的秘密》插曲) | \n",
1338 | " 36 | \n",
1339 | " 36 | \n",
1340 | " 0 | \n",
1341 | " 20170429_2_play.log | \n",
1342 | " 0 | \n",
1343 | " NaN | \n",
1344 | " NaN | \n",
1345 | " NaN | \n",
1346 | " NaN | \n",
1347 | "
\n",
1348 | " \n",
1349 | " | 39658465 | \n",
1350 | " NaN | \n",
1351 | " Beyond | \n",
1352 | " 0 | \n",
1353 | " 27 | \n",
1354 | " 0 | \n",
1355 | " 20170429_2_play.log | \n",
1356 | " 0 | \n",
1357 | " NaN | \n",
1358 | " NaN | \n",
1359 | " NaN | \n",
1360 | " NaN | \n",
1361 | "
\n",
1362 | " \n",
1363 | " | 39658467 | \n",
1364 | " NaN | \n",
1365 | " 朴翔 | \n",
1366 | " 0 | \n",
1367 | " 47 | \n",
1368 | " 0 | \n",
1369 | " 20170429_2_play.log | \n",
1370 | " 0 | \n",
1371 | " NaN | \n",
1372 | " NaN | \n",
1373 | " NaN | \n",
1374 | " NaN | \n",
1375 | "
\n",
1376 | " \n",
1377 | " | 39663960 | \n",
1378 | " NaN | \n",
1379 | " 何鹏[男] | \n",
1380 | " 30 | \n",
1381 | " 29 | \n",
1382 | " 0 | \n",
1383 | " 20170429_2_play.log | \n",
1384 | " 0 | \n",
1385 | " NaN | \n",
1386 | " NaN | \n",
1387 | " NaN | \n",
1388 | " NaN | \n",
1389 | "
\n",
1390 | " \n",
1391 | " | 39678847 | \n",
1392 | " NaN | \n",
1393 | " 任妙音&何鹏 | \n",
1394 | " 33 | \n",
1395 | " 33 | \n",
1396 | " 0 | \n",
1397 | " 20170429_2_play.log | \n",
1398 | " 0 | \n",
1399 | " NaN | \n",
1400 | " NaN | \n",
1401 | " NaN | \n",
1402 | " NaN | \n",
1403 | "
\n",
1404 | " \n",
1405 | " | 39679672 | \n",
1406 | " NaN | \n",
1407 | " 汪苏泷 | \n",
1408 | " 33 | \n",
1409 | " 34 | \n",
1410 | " 0 | \n",
1411 | " 20170429_2_play.log | \n",
1412 | " 0 | \n",
1413 | " NaN | \n",
1414 | " NaN | \n",
1415 | " NaN | \n",
1416 | " NaN | \n",
1417 | "
\n",
1418 | " \n",
1419 | " | 39679732 | \n",
1420 | " NaN | \n",
1421 | " 汪苏泷 | \n",
1422 | " 0 | \n",
1423 | " 34 | \n",
1424 | " 0 | \n",
1425 | " 20170429_2_play.log | \n",
1426 | " 0 | \n",
1427 | " NaN | \n",
1428 | " NaN | \n",
1429 | " NaN | \n",
1430 | " NaN | \n",
1431 | "
\n",
1432 | " \n",
1433 | " | 39679736 | \n",
1434 | " NaN | \n",
1435 | " 汪苏泷 | \n",
1436 | " 1 | \n",
1437 | " 34 | \n",
1438 | " 0 | \n",
1439 | " 20170429_2_play.log | \n",
1440 | " 0 | \n",
1441 | " NaN | \n",
1442 | " NaN | \n",
1443 | " NaN | \n",
1444 | " NaN | \n",
1445 | "
\n",
1446 | " \n",
1447 | " | 39679739 | \n",
1448 | " NaN | \n",
1449 | " 汪苏泷 | \n",
1450 | " 33 | \n",
1451 | " 34 | \n",
1452 | " 0 | \n",
1453 | " 20170429_2_play.log | \n",
1454 | " 0 | \n",
1455 | " NaN | \n",
1456 | " NaN | \n",
1457 | " NaN | \n",
1458 | " NaN | \n",
1459 | "
\n",
1460 | " \n",
1461 | " | 39679745 | \n",
1462 | " NaN | \n",
1463 | " 汪苏泷 | \n",
1464 | " 0 | \n",
1465 | " 34 | \n",
1466 | " 0 | \n",
1467 | " 20170429_2_play.log | \n",
1468 | " 0 | \n",
1469 | " NaN | \n",
1470 | " NaN | \n",
1471 | " NaN | \n",
1472 | " NaN | \n",
1473 | "
\n",
1474 | " \n",
1475 | " | 39679759 | \n",
1476 | " NaN | \n",
1477 | " 汪苏泷 | \n",
1478 | " 7 | \n",
1479 | " 34 | \n",
1480 | " 0 | \n",
1481 | " 20170429_2_play.log | \n",
1482 | " 0 | \n",
1483 | " NaN | \n",
1484 | " NaN | \n",
1485 | " NaN | \n",
1486 | " NaN | \n",
1487 | "
\n",
1488 | " \n",
1489 | " | 39681793 | \n",
1490 | " NaN | \n",
1491 | " 林俊杰 | \n",
1492 | " 0 | \n",
1493 | " 33 | \n",
1494 | " 0 | \n",
1495 | " 20170429_2_play.log | \n",
1496 | " 0 | \n",
1497 | " NaN | \n",
1498 | " NaN | \n",
1499 | " NaN | \n",
1500 | " NaN | \n",
1501 | "
\n",
1502 | " \n",
1503 | " | 39681801 | \n",
1504 | " NaN | \n",
1505 | " 杨洋 | \n",
1506 | " 0 | \n",
1507 | " 29 | \n",
1508 | " 0 | \n",
1509 | " 20170429_2_play.log | \n",
1510 | " 0 | \n",
1511 | " NaN | \n",
1512 | " NaN | \n",
1513 | " NaN | \n",
1514 | " NaN | \n",
1515 | "
\n",
1516 | " \n",
1517 | " | 39681828 | \n",
1518 | " NaN | \n",
1519 | " 林俊杰 | \n",
1520 | " 0 | \n",
1521 | " 33 | \n",
1522 | " 0 | \n",
1523 | " 20170429_2_play.log | \n",
1524 | " 0 | \n",
1525 | " NaN | \n",
1526 | " NaN | \n",
1527 | " NaN | \n",
1528 | " NaN | \n",
1529 | "
\n",
1530 | " \n",
1531 | " | 39681834 | \n",
1532 | " NaN | \n",
1533 | " 林俊杰 | \n",
1534 | " 0 | \n",
1535 | " 33 | \n",
1536 | " 0 | \n",
1537 | " 20170429_2_play.log | \n",
1538 | " 0 | \n",
1539 | " NaN | \n",
1540 | " NaN | \n",
1541 | " NaN | \n",
1542 | " NaN | \n",
1543 | "
\n",
1544 | " \n",
1545 | " | 39681840 | \n",
1546 | " NaN | \n",
1547 | " 林俊杰 | \n",
1548 | " 0 | \n",
1549 | " 33 | \n",
1550 | " 0 | \n",
1551 | " 20170429_2_play.log | \n",
1552 | " 0 | \n",
1553 | " NaN | \n",
1554 | " NaN | \n",
1555 | " NaN | \n",
1556 | " NaN | \n",
1557 | "
\n",
1558 | " \n",
1559 | " | 39681857 | \n",
1560 | " NaN | \n",
1561 | " 林俊杰 | \n",
1562 | " 0 | \n",
1563 | " 33 | \n",
1564 | " 0 | \n",
1565 | " 20170429_2_play.log | \n",
1566 | " 0 | \n",
1567 | " NaN | \n",
1568 | " NaN | \n",
1569 | " NaN | \n",
1570 | " NaN | \n",
1571 | "
\n",
1572 | " \n",
1573 | " | 39681866 | \n",
1574 | " NaN | \n",
1575 | " 杨洋 | \n",
1576 | " 1 | \n",
1577 | " 29 | \n",
1578 | " 0 | \n",
1579 | " 20170429_2_play.log | \n",
1580 | " 0 | \n",
1581 | " NaN | \n",
1582 | " NaN | \n",
1583 | " NaN | \n",
1584 | " NaN | \n",
1585 | "
\n",
1586 | " \n",
1587 | " | 39682159 | \n",
1588 | " NaN | \n",
1589 | " 林俊杰 | \n",
1590 | " 0 | \n",
1591 | " 33 | \n",
1592 | " 0 | \n",
1593 | " 20170429_2_play.log | \n",
1594 | " 0 | \n",
1595 | " NaN | \n",
1596 | " NaN | \n",
1597 | " NaN | \n",
1598 | " NaN | \n",
1599 | "
\n",
1600 | " \n",
1601 | " | 39684928 | \n",
1602 | " NaN | \n",
1603 | " 王俊凯 | \n",
1604 | " 1 | \n",
1605 | " 46 | \n",
1606 | " 0 | \n",
1607 | " 20170429_2_play.log | \n",
1608 | " 0 | \n",
1609 | " NaN | \n",
1610 | " NaN | \n",
1611 | " NaN | \n",
1612 | " NaN | \n",
1613 | "
\n",
1614 | " \n",
1615 | " | 39685029 | \n",
1616 | " NaN | \n",
1617 | " 王俊凯 | \n",
1618 | " 0 | \n",
1619 | " 46 | \n",
1620 | " 0 | \n",
1621 | " 20170429_2_play.log | \n",
1622 | " 0 | \n",
1623 | " NaN | \n",
1624 | " NaN | \n",
1625 | " NaN | \n",
1626 | " NaN | \n",
1627 | "
\n",
1628 | " \n",
1629 | " | 39701112 | \n",
1630 | " NaN | \n",
1631 | " 任妙音 | \n",
1632 | " 39 | \n",
1633 | " 40 | \n",
1634 | " 0 | \n",
1635 | " 20170429_2_play.log | \n",
1636 | " 0 | \n",
1637 | " NaN | \n",
1638 | " NaN | \n",
1639 | " NaN | \n",
1640 | " NaN | \n",
1641 | "
\n",
1642 | " \n",
1643 | " | 39701132 | \n",
1644 | " NaN | \n",
1645 | " 任妙音 | \n",
1646 | " 39 | \n",
1647 | " 40 | \n",
1648 | " 0 | \n",
1649 | " 20170429_2_play.log | \n",
1650 | " 0 | \n",
1651 | " NaN | \n",
1652 | " NaN | \n",
1653 | " NaN | \n",
1654 | " NaN | \n",
1655 | "
\n",
1656 | " \n",
1657 | " | 39701169 | \n",
1658 | " NaN | \n",
1659 | " 任妙音 | \n",
1660 | " 39 | \n",
1661 | " 40 | \n",
1662 | " 0 | \n",
1663 | " 20170429_2_play.log | \n",
1664 | " 0 | \n",
1665 | " NaN | \n",
1666 | " NaN | \n",
1667 | " NaN | \n",
1668 | " NaN | \n",
1669 | "
\n",
1670 | " \n",
1671 | "
\n",
1672 | "
9426 rows × 11 columns
\n",
1673 | "
"
1674 | ],
1675 | "text/plain": [
1676 | " uid device song_id song_type song_name \\\n",
1677 | "3164 NaN 朴树 2 23 0 \n",
1678 | "5618 NaN 刘涛 31 30 0 \n",
1679 | "5690 NaN 刘涛 51 51 0 \n",
1680 | "6643 NaN 薛之谦 19 41 0 \n",
1681 | "7879 NaN EXO 26 26 0 \n",
1682 | "11400 NaN 佛教音乐 51 51 0 \n",
1683 | "11918 NaN 佛教音乐 51 51 0 \n",
1684 | "19485 NaN 杨洋 0 29 0 \n",
1685 | "21698 NaN 左宏元&张慧清 0 46 0 \n",
1686 | "25202 NaN 好想好想(51秒铃声版) 0 52 0 \n",
1687 | "25782 NaN 汤晶锦 40 40 0 \n",
1688 | "40061 NaN 本兮 26 26 0 \n",
1689 | "40706 NaN 云菲菲 30 30 0 \n",
1690 | "48130 NaN 庄心妍 56 56 0 \n",
1691 | "51385 NaN 冷漠 30 30 0 \n",
1692 | "51818 NaN 薛之谦 130 41 0 \n",
1693 | "59378 NaN 阿悄 27 28 0 \n",
1694 | "69626 NaN 魏栾 30 29 0 \n",
1695 | "74606 NaN 马頔 29 29 0 \n",
1696 | "74608 NaN Beyond 27 27 0 \n",
1697 | "74612 NaN 云菲菲 30 30 0 \n",
1698 | "75153 NaN Delacey 5 35 0 \n",
1699 | "76095 NaN Fall Out Boy 30 30 0 \n",
1700 | "76368 NaN 庄心妍 15 36 0 \n",
1701 | "76661 NaN 冷漠 30 30 0 \n",
1702 | "78444 NaN 佛教音乐 50 51 0 \n",
1703 | "79352 NaN 佛教音乐 50 51 0 \n",
1704 | "80956 NaN 佛教音乐 50 51 0 \n",
1705 | "81504 NaN 刁寒 30 30 0 \n",
1706 | "84367 NaN 唐古&蔡晓 24 24 0 \n",
1707 | "... ... ... ... ... ... \n",
1708 | "39571851 NaN 曹龙 1 33 0 \n",
1709 | "39580074 NaN 铃声 0 60 0 \n",
1710 | "39587988 NaN 陈小云 39 39 0 \n",
1711 | "39604491 NaN Delacey 35 35 0 \n",
1712 | "39614509 NaN 后弦 52 53 0 \n",
1713 | "39627721 NaN 王馨平 27 28 0 \n",
1714 | "39628315 NaN 还舍不得离别(36秒铃声版)-(电视剧《美丽的秘密》插曲) 36 36 0 \n",
1715 | "39658465 NaN Beyond 0 27 0 \n",
1716 | "39658467 NaN 朴翔 0 47 0 \n",
1717 | "39663960 NaN 何鹏[男] 30 29 0 \n",
1718 | "39678847 NaN 任妙音&何鹏 33 33 0 \n",
1719 | "39679672 NaN 汪苏泷 33 34 0 \n",
1720 | "39679732 NaN 汪苏泷 0 34 0 \n",
1721 | "39679736 NaN 汪苏泷 1 34 0 \n",
1722 | "39679739 NaN 汪苏泷 33 34 0 \n",
1723 | "39679745 NaN 汪苏泷 0 34 0 \n",
1724 | "39679759 NaN 汪苏泷 7 34 0 \n",
1725 | "39681793 NaN 林俊杰 0 33 0 \n",
1726 | "39681801 NaN 杨洋 0 29 0 \n",
1727 | "39681828 NaN 林俊杰 0 33 0 \n",
1728 | "39681834 NaN 林俊杰 0 33 0 \n",
1729 | "39681840 NaN 林俊杰 0 33 0 \n",
1730 | "39681857 NaN 林俊杰 0 33 0 \n",
1731 | "39681866 NaN 杨洋 1 29 0 \n",
1732 | "39682159 NaN 林俊杰 0 33 0 \n",
1733 | "39684928 NaN 王俊凯 1 46 0 \n",
1734 | "39685029 NaN 王俊凯 0 46 0 \n",
1735 | "39701112 NaN 任妙音 39 40 0 \n",
1736 | "39701132 NaN 任妙音 39 40 0 \n",
1737 | "39701169 NaN 任妙音 39 40 0 \n",
1738 | "\n",
1739 | " singer play_time song_length paid_flag file_name label \n",
1740 | "3164 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1741 | "5618 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1742 | "5690 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1743 | "6643 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1744 | "7879 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1745 | "11400 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1746 | "11918 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1747 | "19485 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1748 | "21698 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1749 | "25202 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1750 | "25782 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1751 | "40061 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1752 | "40706 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1753 | "48130 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1754 | "51385 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1755 | "51818 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1756 | "59378 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1757 | "69626 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1758 | "74606 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1759 | "74608 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1760 | "74612 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1761 | "75153 20170410_2_play.log 1 NaN NaN NaN NaN \n",
1762 | "76095 20170410_2_play.log 1 NaN NaN NaN NaN \n",
1763 | "76368 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1764 | "76661 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1765 | "78444 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1766 | "79352 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1767 | "80956 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1768 | "81504 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1769 | "84367 20170410_2_play.log 0 NaN NaN NaN NaN \n",
1770 | "... ... ... ... ... ... ... \n",
1771 | "39571851 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1772 | "39580074 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1773 | "39587988 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1774 | "39604491 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1775 | "39614509 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1776 | "39627721 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1777 | "39628315 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1778 | "39658465 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1779 | "39658467 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1780 | "39663960 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1781 | "39678847 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1782 | "39679672 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1783 | "39679732 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1784 | "39679736 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1785 | "39679739 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1786 | "39679745 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1787 | "39679759 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1788 | "39681793 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1789 | "39681801 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1790 | "39681828 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1791 | "39681834 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1792 | "39681840 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1793 | "39681857 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1794 | "39681866 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1795 | "39682159 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1796 | "39684928 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1797 | "39685029 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1798 | "39701112 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1799 | "39701132 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1800 | "39701169 20170429_2_play.log 0 NaN NaN NaN NaN \n",
1801 | "\n",
1802 | "[9426 rows x 11 columns]"
1803 | ]
1804 | },
1805 | "execution_count": 21,
1806 | "metadata": {},
1807 | "output_type": "execute_result"
1808 | }
1809 | ],
1810 | "source": [
1811 | "# Inspect the play-log rows that have no uid\n",
1812 | "df_play.loc[df_play.uid.isnull()]"
1813 | ]
1814 | },
1815 | {
1816 | "cell_type": "code",
1817 | "execution_count": 22,
1818 | "metadata": {
1819 | "collapsed": true
1820 | },
1821 | "outputs": [],
1822 | "source": [
1823 | "# Rows without a uid look suspicious (several of their other fields are missing too), so drop them\n",
1824 | "df_play = df_play.dropna(subset=['uid'])\n"
1825 | ]
1826 | },
1827 | {
1828 | "cell_type": "code",
1829 | "execution_count": 23,
1830 | "metadata": {},
1831 | "outputs": [
1832 | {
1833 | "data": {
1834 | "text/plain": [
1835 | "uid 0\n",
1836 | "device 0\n",
1837 | "song_id 7510\n",
1838 | "song_type 23109\n",
1839 | "song_name 5524\n",
1840 | "singer 49164\n",
1841 | "play_time 160207\n",
1842 | "song_length 11291\n",
1843 | "paid_flag 42553\n",
1844 | "file_name 42554\n",
1845 | "label 42554\n",
1846 | "dtype: int64"
1847 | ]
1848 | },
1849 | "execution_count": 23,
1850 | "metadata": {},
1851 | "output_type": "execute_result"
1852 | }
1853 | ],
1854 | "source": [
1855 | "df_play.isnull().sum()  # missing values per column after dropping the rows without a uid"
1856 | ]
1857 | },
1858 | {
1859 | "cell_type": "code",
1860 | "execution_count": 24,
1861 | "metadata": {},
1862 | "outputs": [
1863 | {
1864 | "data": {
1865 | "text/plain": [
1866 | "0 29022270\n",
1867 | "1 6836180\n",
1868 | "0 2932205\n",
1869 | "2 474861\n",
1870 | "1 325546\n",
1871 | "2 32711\n",
1872 | " 9908\n",
1873 | "3 4658\n",
1874 | "刚好遇见你 129\n",
1875 | "没有你陪伴真的好孤单 93\n",
1876 | "暧昧 76\n",
1877 | "逆流成河 67\n",
1878 | "演员 62\n",
1879 | "3 55\n",
1880 | "走着走着就散了 52\n",
1881 | "...\n",
1882 | "扎西哥哥(DJ版) 1\n",
1883 | "女孩你知道吗 1\n",
1884 | "爱的魔法 1\n",
1885 | "拥抱你 1\n",
1886 | "太阳雨 1\n",
1887 | "克罗地亚狂想曲 1\n",
1888 | "情翼 1\n",
1889 | "腐草为萤 1\n",
1890 | "Can You Feel The Love Tonight 1\n",
1891 | "164水浒全传 1\n",
1892 | "The wheels on the bus 1\n",
1893 | "十年戎马心孤单 - KTV版伴奏 1\n",
1894 | "Звезда 1\n",
1895 | "The Sounds Of Silence 1\n",
1896 | "续小八义043 1\n",
1897 | "Length: 12334, dtype: int64"
1898 | ]
1899 | },
1900 | "execution_count": 24,
1901 | "metadata": {},
1902 | "output_type": "execute_result"
1903 | }
1904 | ],
1905 | "source": [
1906 | "df_play['song_type'].value_counts()  # frequency of each distinct song_type value"
1907 | ]
1908 | },
1909 | {
1910 | "cell_type": "code",
1911 | "execution_count": 25,
1912 | "metadata": {},
1913 | "outputs": [
1914 | {
1915 | "data": {
1916 | "text/plain": [
1917 | "0.0 39583273\n",
1918 | "0 65467\n",
1919 | "415.0 427\n",
1920 | "430.0 32\n",
1921 | "252.0 26\n",
1922 | "219.0 19\n",
1923 | "3.0 13\n",
1924 | "259.0 6\n",
1925 | "245.0 6\n",
1926 | "169.0 6\n",
1927 | "375.0 4\n",
1928 | "209.0 4\n",
1929 | "241.0 4\n",
1930 | "9.0 3\n",
1931 | "7.0 3\n",
1932 | "191.0 3\n",
1933 | "200.0 3\n",
1934 | "128.0 2\n",
1935 | "260.0 2\n",
1936 | "248.0 2\n",
1937 | "237.0 2\n",
1938 | "8.0 1\n",
1939 | "6.0 1\n",
1940 | "183.0 1\n",
1941 | "1026.0 1\n",
1942 | "211.0 1\n",
1943 | "235.0 1\n",
1944 | "725.0 1\n",
1945 | "爱上一匹野马 1\n",
1946 | "278.0 1\n",
1947 | "289.0 1\n",
1948 | "292.0 1\n",
1949 | "385.0 1\n",
1950 | "473.0 1\n",
1951 | "666.0 1\n",
1952 | "683.0 1\n",
1953 | "247.0 1\n",
1954 | "dtype: int64"
1955 | ]
1956 | },
1957 | "execution_count": 25,
1958 | "metadata": {},
1959 | "output_type": "execute_result"
1960 | }
1961 | ],
1962 | "source": [
1963 | "df_play['paid_flag'].value_counts()  # frequency of each distinct paid_flag value"
1964 | ]
1965 | },
1966 | {
1967 | "cell_type": "code",
1968 | "execution_count": 26,
1969 | "metadata": {},
1970 | "outputs": [
1971 | {
1972 | "data": {
1973 | "text/html": [
1974 | "\n",
1975 | "
\n",
1976 | " \n",
1977 | " \n",
1978 | " | \n",
1979 | " uid | \n",
1980 | " device | \n",
1981 | " song_id | \n",
1982 | " song_type | \n",
1983 | " song_name | \n",
1984 | " singer | \n",
1985 | " play_time | \n",
1986 | " song_length | \n",
1987 | " paid_flag | \n",
1988 | " file_name | \n",
1989 | " label | \n",
1990 | "
\n",
1991 | " \n",
1992 | " \n",
1993 | " \n",
1994 | " | 58059 | \n",
1995 | " 168146144 | \n",
1996 | " ar | \n",
1997 | " 6916311 | \n",
1998 | " 0 | \n",
1999 | " 多少的爱都不要(???? ???????? | \n",
2000 | " ) | \n",
2001 | " Ten Nararak | \n",
2002 | " 412 | \n",
2003 | " 415 | \n",
2004 | " 0 | \n",
2005 | " 20170410_2_play.log | \n",
2006 | "
\n",
2007 | " \n",
2008 | " | 58741 | \n",
2009 | " 168146144 | \n",
2010 | " ar | \n",
2011 | " 6916311 | \n",
2012 | " 2 | \n",
2013 | " 多少的爱都不要(???? ???????? | \n",
2014 | " ) | \n",
2015 | " Ten Nararak | \n",
2016 | " 237>\u000f}(222.219.141.68)TM | \n",
2017 | " 430 | \n",
2018 | " 0 | \n",
2019 | " 20170410_2_play.log | \n",
2020 | "
\n",
2021 | " \n",
2022 | " | 164818 | \n",
2023 | " 168700735 | \n",
2024 | " ar | \n",
2025 | " 6916311 | \n",
2026 | " 0 | \n",
2027 | " 多少的爱都不要(???? ???????? | \n",
2028 | " ) | \n",
2029 | " Ten Nararak | \n",
2030 | " 249 | \n",
2031 | " 415 | \n",
2032 | " 0 | \n",
2033 | " 20170410_3_play.log | \n",
2034 | "
\n",
2035 | " \n",
2036 | " | 178183 | \n",
2037 | " 168647140 | \n",
2038 | " ar | \n",
2039 | " 6916311 | \n",
2040 | " 0 | \n",
2041 | " 多少的爱都不要(???? ???????? | \n",
2042 | " ) | \n",
2043 | " Ten Nararak | \n",
2044 | " 258 | \n",
2045 | " 415 | \n",
2046 | " 0 | \n",
2047 | " 20170410_3_play.log | \n",
2048 | "
\n",
2049 | " \n",
2050 | " | 178398 | \n",
2051 | " 168647140 | \n",
2052 | " ar | \n",
2053 | " 6916311 | \n",
2054 | " 0 | \n",
2055 | " 多少的爱都不要(???? ???????? | \n",
2056 | " ) | \n",
2057 | " Ten Nararak | \n",
2058 | " 103 | \n",
2059 | " 415 | \n",
2060 | " 0 | \n",
2061 | " 20170410_3_play.log | \n",
2062 | "
\n",
2063 | " \n",
2064 | " | 433073 | \n",
2065 | " 0 | \n",
2066 | " ar | \n",
2067 | " 235500 | \n",
2068 | " 1 | \n",
2069 | " 分裂 | \n",
2070 | " [内地版 | \n",
2071 | " 周杰伦 | \n",
2072 | " 104931 | \n",
2073 | " 252 | \n",
2074 | " 0 | \n",
2075 | " 20170504_3_play.log | \n",
2076 | "
\n",
2077 | " \n",
2078 | " | 447433 | \n",
2079 | " 168647140 | \n",
2080 | " ar | \n",
2081 | " 6916311 | \n",
2082 | " 0 | \n",
2083 | " 多少的爱都不要(???? ???????? | \n",
2084 | " ) | \n",
2085 | " Ten Nararak | \n",
2086 | " 211 | \n",
2087 | " 415 | \n",
2088 | " 0 | \n",
2089 | " 20170504_3_play.log | \n",
2090 | "
\n",
2091 | " \n",
2092 | " | 469348 | \n",
2093 | " 137084142 | \n",
2094 | " ar | \n",
2095 | " 6916311 | \n",
2096 | " 0 | \n",
2097 | " 多少的爱都不要(???? ???????? | \n",
2098 | " ) | \n",
2099 | " Ten Nararak | \n",
2100 | " 414 | \n",
2101 | " 415 | \n",
2102 | " 0 | \n",
2103 | " 20170504_2_play.log | \n",
2104 | "
\n",
2105 | " \n",
2106 | " | 486671 | \n",
2107 | " 168271854 | \n",
2108 | " ip | \n",
2109 | " 6916311 | \n",
2110 | " 0 | \n",
2111 | " 多少的爱都不要(???? ???????? | \n",
2112 | " ) | \n",
2113 | " Ten Nararak | \n",
2114 | " 1 | \n",
2115 | " 415 | \n",
2116 | " 0 | \n",
2117 | " 20170504_2_play.log | \n",
2118 | "
\n",
2119 | " \n",
2120 | " | 505531 | \n",
2121 | " 0 | \n",
2122 | " ar | \n",
2123 | " 235500 | \n",
2124 | " 1 | \n",
2125 | " 分裂 | \n",
2126 | " [内地版 | \n",
2127 | " 周杰伦 | \n",
2128 | " 104931 | \n",
2129 | " 252 | \n",
2130 | " 0 | \n",
2131 | " 20170504_2_play.log | \n",
2132 | "
\n",
2133 | " \n",
2134 | " | 506282 | \n",
2135 | " 137084142 | \n",
2136 | " ar | \n",
2137 | " 6916311 | \n",
2138 | " 0 | \n",
2139 | " 多少的爱都不要(???? ???????? | \n",
2140 | " ) | \n",
2141 | " Ten Nararak | \n",
2142 | " 415 | \n",
2143 | " 415 | \n",
2144 | " 0 | \n",
2145 | " 20170504_2_play.log | \n",
2146 | "
\n",
2147 | " \n",
2148 | " | 506327 | \n",
2149 | " 137084142 | \n",
2150 | " ar | \n",
2151 | " 6916311 | \n",
2152 | " 0 | \n",
2153 | " 多少的爱都不要(???? ???????? | \n",
2154 | " ) | \n",
2155 | " Ten Nararak | \n",
2156 | " 0 | \n",
2157 | " 415 | \n",
2158 | " 0 | \n",
2159 | " 20170504_2_play.log | \n",
2160 | "
\n",
2161 | " \n",
2162 | " | 506369 | \n",
2163 | " 137084142 | \n",
2164 | " ar | \n",
2165 | " 6916311 | \n",
2166 | " 0 | \n",
2167 | " 多少的爱都不要(???? ???????? | \n",
2168 | " ) | \n",
2169 | " Ten Nararak | \n",
2170 | " 0 | \n",
2171 | " 415 | \n",
2172 | " 0 | \n",
2173 | " 20170504_2_play.log | \n",
2174 | "
\n",
2175 | " \n",
2176 | " | 506412 | \n",
2177 | " 137084142 | \n",
2178 | " ar | \n",
2179 | " 6916311 | \n",
2180 | " 0 | \n",
2181 | " 多少的爱都不要(???? ???????? | \n",
2182 | " ) | \n",
2183 | " Ten Nararak | \n",
2184 | " 415 | \n",
2185 | " 415 | \n",
2186 | " 0 | \n",
2187 | " 20170504_2_play.log | \n",
2188 | "
\n",
2189 | " \n",
2190 | " | 506466 | \n",
2191 | " 137084142 | \n",
2192 | " ar | \n",
2193 | " 6916311 | \n",
2194 | " 0 | \n",
2195 | " 多少的爱都不要(???? ???????? | \n",
2196 | " ) | \n",
2197 | " Ten Nararak | \n",
2198 | " 415 | \n",
2199 | " 415 | \n",
2200 | " 0 | \n",
2201 | " 20170504_2_play.log | \n",
2202 | "
\n",
2203 | " \n",
2204 | " | 506513 | \n",
2205 | " 137084142 | \n",
2206 | " ar | \n",
2207 | " 6916311 | \n",
2208 | " 0 | \n",
2209 | " 多少的爱都不要(???? ???????? | \n",
2210 | " ) | \n",
2211 | " Ten Nararak | \n",
2212 | " 415 | \n",
2213 | " 415 | \n",
2214 | " 0 | \n",
2215 | " 20170504_2_play.log | \n",
2216 | "
\n",
2217 | " \n",
2218 | " | 510426 | \n",
2219 | " 137084142 | \n",
2220 | " ar | \n",
2221 | " 6916311 | \n",
2222 | " 0 | \n",
2223 | " 多少的爱都不要(???? ???????? | \n",
2224 | " ) | \n",
2225 | " Ten Nararak | \n",
2226 | " 415 | \n",
2227 | " 415 | \n",
2228 | " 0 | \n",
2229 | " 20170504_2_play.log | \n",
2230 | "
\n",
2231 | " \n",
2232 | " | 532688 | \n",
2233 | " 168164990 | \n",
2234 | " ar | \n",
2235 | " 6916311 | \n",
2236 | " 0 | \n",
2237 | " 多少的爱都不要(???? ???????? | \n",
2238 | " ) | \n",
2239 | " Ten Nararak | \n",
2240 | " 345 | \n",
2241 | " 415 | \n",
2242 | " 0 | \n",
2243 | " 20170504_2_play.log | \n",
2244 | "
\n",
2245 | " \n",
2246 | " | 586643 | \n",
2247 | " 168263591 | \n",
2248 | " ar | \n",
2249 | " 6916311 | \n",
2250 | " 0 | \n",
2251 | " 多少的爱都不要(???? ???????? | \n",
2252 | " ) | \n",
2253 | " Ten Nararak | \n",
2254 | " 414 | \n",
2255 | " 415 | \n",
2256 | " 0 | \n",
2257 | " 20170508_1_play.log | \n",
2258 | "
\n",
2259 | " \n",
2260 | " | 655350 | \n",
2261 | " 167636505 | \n",
2262 | " ar | \n",
2263 | " 6916311 | \n",
2264 | " 0 | \n",
2265 | " 多少的爱都不要(???? ???????? | \n",
2266 | " ) | \n",
2267 | " Ten Nararak | \n",
2268 | " 0 | \n",
2269 | " 415 | \n",
2270 | " 0 | \n",
2271 | " 20170505_1_play.log | \n",
2272 | "
\n",
2273 | " \n",
2274 | " | 682423 | \n",
2275 | " 168263591 | \n",
2276 | " ar | \n",
2277 | " 6916311 | \n",
2278 | " 0 | \n",
2279 | " 多少的爱都不要(???? ???????? | \n",
2280 | " ) | \n",
2281 | " Ten Nararak | \n",
2282 | " 121 | \n",
2283 | " 415 | \n",
2284 | " 0 | \n",
2285 | " 20170505_1_play.log | \n",
2286 | "
\n",
2287 | " \n",
2288 | " | 760977 | \n",
2289 | " 167988542 | \n",
2290 | " ar | \n",
2291 | " 235500 | \n",
2292 | " 0 | \n",
2293 | " 分裂 | \n",
2294 | " [内地版 | \n",
2295 | " 周杰伦 | \n",
2296 | " 13 | \n",
2297 | " 252 | \n",
2298 | " 0 | \n",
2299 | " 20170411_1_play.log | \n",
2300 | "
\n",
2301 | " \n",
2302 | " | 801213 | \n",
2303 | " 167636505 | \n",
2304 | " ar | \n",
2305 | " 6916311 | \n",
2306 | " 0 | \n",
2307 | " 多少的爱都不要(???? ???????? | \n",
2308 | " ) | \n",
2309 | " Ten Nararak | \n",
2310 | " 0 | \n",
2311 | " 415 | \n",
2312 | " 0 | \n",
2313 | " 20170426_1_play.log | \n",
2314 | "
\n",
2315 | " \n",
2316 | " | 806276 | \n",
2317 | " 167894057 | \n",
2318 | " ar | \n",
2319 | " 6916311 | \n",
2320 | " 0 | \n",
2321 | " 多少的爱都不要(???? ???????? | \n",
2322 | " ) | \n",
2323 | " Ten Nararak | \n",
2324 | " 207 | \n",
2325 | " 415 | \n",
2326 | " 0 | \n",
2327 | " 20170426_1_play.log | \n",
2328 | "
\n",
2329 | " \n",
2330 | " | 819955 | \n",
2331 | " 167721050 | \n",
2332 | " ar | \n",
2333 | " 5989117 | \n",
2334 | " 0 | \n",
2335 | " 38. | \n",
2336 | " Hurry or you'll be late for school. (快点儿,上学该迟到了。) | \n",
2337 | " 分级加字幕轻松练听力 | \n",
2338 | " 3 | \n",
2339 | " 9 | \n",
2340 | " 0 | \n",
2341 | " 20170426_1_play.log | \n",
2342 | "
\n",
2343 | " \n",
2344 | " | 895604 | \n",
2345 | " 168820890 | \n",
2346 | " ar | \n",
2347 | " 6916311 | \n",
2348 | " 2 | \n",
2349 | " 多少的爱都不要(???? ???????? | \n",
2350 | " ) | \n",
2351 | " Ten Nararak | \n",
2352 | " 58 | \n",
2353 | " 430 | \n",
2354 | " 0 | \n",
2355 | " 20170509_3_play.log | \n",
2356 | "
\n",
2357 | " \n",
2358 | " | 896653 | \n",
2359 | " 168820890 | \n",
2360 | " ar | \n",
2361 | " 6916311 | \n",
2362 | " 2 | \n",
2363 | " 多少的爱都不要(???? ???????? | \n",
2364 | " ) | \n",
2365 | " Ten Nararak | \n",
2366 | " 409 | \n",
2367 | " 430 | \n",
2368 | " 0 | \n",
2369 | " 20170509_3_play.log | \n",
2370 | "
\n",
2371 | " \n",
2372 | " | 910055 | \n",
2373 | " 161741167 | \n",
2374 | " ar | \n",
2375 | " 6916311 | \n",
2376 | " 2 | \n",
2377 | " 多少的爱都不要(???? ???????? | \n",
2378 | " ) | \n",
2379 | " Ten Nararak | \n",
2380 | " 47>\u000f}(223.104.38.39)TM | \n",
2381 | " 430 | \n",
2382 | " 0 | \n",
2383 | " 20170509_3_play.log | \n",
2384 | "
\n",
2385 | " \n",
2386 | " | 910056 | \n",
2387 | " 161741167 | \n",
2388 | " ar | \n",
2389 | " 6916311 | \n",
2390 | " 2 | \n",
2391 | " 多少的爱都不要(???? ???????? | \n",
2392 | " ) | \n",
2393 | " Ten Nararak | \n",
2394 | " 5>\u000f}(223.104.38.39)TM | \n",
2395 | " 430 | \n",
2396 | " 0 | \n",
2397 | " 20170509_3_play.log | \n",
2398 | "
\n",
2399 | " \n",
2400 | " | 910057 | \n",
2401 | " 161741167 | \n",
2402 | " ar | \n",
2403 | " 6916311 | \n",
2404 | " 2 | \n",
2405 | " 多少的爱都不要(???? ???????? | \n",
2406 | " ) | \n",
2407 | " Ten Nararak | \n",
2408 | " 5>=(223.104.38.39)TM | \n",
2409 | " 430 | \n",
2410 | " 0 | \n",
2411 | " 20170509_3_play.log | \n",
2412 | "
\n",
2413 | " \n",
2414 | " | ... | \n",
2415 | " ... | \n",
2416 | " ... | \n",
2417 | " ... | \n",
2418 | " ... | \n",
2419 | " ... | \n",
2420 | " ... | \n",
2421 | " ... | \n",
2422 | " ... | \n",
2423 | " ... | \n",
2424 | " ... | \n",
2425 | " ... | \n",
2426 | "
\n",
2427 | " \n",
2428 | " | 37008272 | \n",
2429 | " 167778188 | \n",
2430 | " ar | \n",
2431 | " 6916311 | \n",
2432 | " 0 | \n",
2433 | " 多少的爱都不要(???? ???????? | \n",
2434 | " ) | \n",
2435 | " Ten Nararak | \n",
2436 | " 415 | \n",
2437 | " 415 | \n",
2438 | " 0 | \n",
2439 | " 20170403_1_play.log | \n",
2440 | "
\n",
2441 | " \n",
2442 | " | 37178187 | \n",
2443 | " 168725697 | \n",
2444 | " ar | \n",
2445 | " 6916311 | \n",
2446 | " 0 | \n",
2447 | " 多少的爱都不要(???? ???????? | \n",
2448 | " ) | \n",
2449 | " Ten Nararak | \n",
2450 | " 50 | \n",
2451 | " 415 | \n",
2452 | " 0 | \n",
2453 | " 20170424_3_play.log | \n",
2454 | "
\n",
2455 | " \n",
2456 | " | 37292655 | \n",
2457 | " 1685126 | \n",
2458 | " ar | \n",
2459 | " 0 | \n",
2460 | " 1 | \n",
2461 | " LOSER | \n",
2462 | " Bigbang | \n",
2463 | " NaN | \n",
2464 | " 219336 | \n",
2465 | " 219 | \n",
2466 | " 0 | \n",
2467 | " 20170413_2_play.log | \n",
2468 | "
\n",
2469 | " \n",
2470 | " | 37313545 | \n",
2471 | " 168248860 | \n",
2472 | " ar | \n",
2473 | " 6916311 | \n",
2474 | " 0 | \n",
2475 | " 多少的爱都不要(???? ???????? | \n",
2476 | " ) | \n",
2477 | " Ten Nararak | \n",
2478 | " 213 | \n",
2479 | " 415 | \n",
2480 | " 0 | \n",
2481 | " 20170413_2_play.log | \n",
2482 | "
\n",
2483 | " \n",
2484 | " | 37465800 | \n",
2485 | " 1685126 | \n",
2486 | " ar | \n",
2487 | " 0 | \n",
2488 | " 1 | \n",
2489 | " LOSER | \n",
2490 | " Bigbang | \n",
2491 | " NaN | \n",
2492 | " 219336 | \n",
2493 | " 219 | \n",
2494 | " 0 | \n",
2495 | " 20170413_3_play.log | \n",
2496 | "
\n",
2497 | " \n",
2498 | " | 37612451 | \n",
2499 | " 168851902 | \n",
2500 | " ar | \n",
2501 | " 6916311 | \n",
2502 | " 0 | \n",
2503 | " 多少的爱都不要(???? ???????? | \n",
2504 | " ) | \n",
2505 | " Ten Nararak | \n",
2506 | " 414 | \n",
2507 | " 415 | \n",
2508 | " 0 | \n",
2509 | " 20170507_3_play.log | \n",
2510 | "
\n",
2511 | " \n",
2512 | " | 37920919 | \n",
2513 | " 167775288 | \n",
2514 | " ar | \n",
2515 | " 6916311 | \n",
2516 | " 0 | \n",
2517 | " 多少的爱都不要(???? ???????? | \n",
2518 | " ) | \n",
2519 | " Ten Nararak | \n",
2520 | " 48 | \n",
2521 | " 415 | \n",
2522 | " 0 | \n",
2523 | " 20170428_1_play.log | \n",
2524 | "
\n",
2525 | " \n",
2526 | " | 38044243 | \n",
2527 | " 167775288 | \n",
2528 | " ar | \n",
2529 | " 6916311 | \n",
2530 | " 0 | \n",
2531 | " 多少的爱都不要(???? ???????? | \n",
2532 | " ) | \n",
2533 | " Ten Nararak | \n",
2534 | " 414 | \n",
2535 | " 415 | \n",
2536 | " 0 | \n",
2537 | " 20170428_1_play.log | \n",
2538 | "
\n",
2539 | " \n",
2540 | " | 38063913 | \n",
2541 | " 167632676 | \n",
2542 | " ip | \n",
2543 | " 6916311 | \n",
2544 | " 0 | \n",
2545 | " 多少的爱都不要(???? ???????? | \n",
2546 | " ) | \n",
2547 | " Ten Nararak | \n",
2548 | " 414 | \n",
2549 | " 415 | \n",
2550 | " 0 | \n",
2551 | " 20170339_1_play.log | \n",
2552 | "
\n",
2553 | " \n",
2554 | " | 38128310 | \n",
2555 | " 167819640 | \n",
2556 | " ar | \n",
2557 | " 6916311 | \n",
2558 | " 0 | \n",
2559 | " 多少的爱都不要(???? ???????? | \n",
2560 | " ) | \n",
2561 | " Ten Nararak | \n",
2562 | " 0 | \n",
2563 | " 415 | \n",
2564 | " 0 | \n",
2565 | " 20170339_1_play.log | \n",
2566 | "
\n",
2567 | " \n",
2568 | " | 38166424 | \n",
2569 | " 168318095 | \n",
2570 | " ar | \n",
2571 | " 6916311 | \n",
2572 | " 0 | \n",
2573 | " 多少的爱都不要(???? ???????? | \n",
2574 | " ) | \n",
2575 | " Ten Nararak | \n",
2576 | " 6 | \n",
2577 | " 415 | \n",
2578 | " 0 | \n",
2579 | " 20170339_1_play.log | \n",
2580 | "
\n",
2581 | " \n",
2582 | " | 38226566 | \n",
2583 | " 167998697 | \n",
2584 | " ar | \n",
2585 | " 6916311 | \n",
2586 | " 0 | \n",
2587 | " 多少的爱都不要(???? ???????? | \n",
2588 | " ) | \n",
2589 | " Ten Nararak | \n",
2590 | " 414 | \n",
2591 | " 415 | \n",
2592 | " 0 | \n",
2593 | " 20170339_1_play.log | \n",
2594 | "
\n",
2595 | " \n",
2596 | " | 38231484 | \n",
2597 | " 167604900 | \n",
2598 | " ip | \n",
2599 | " 6916311 | \n",
2600 | " 0 | \n",
2601 | " 多少的爱都不要(???? ???????? | \n",
2602 | " ) | \n",
2603 | " Ten Nararak | \n",
2604 | " 164 | \n",
2605 | " 415 | \n",
2606 | " 0 | \n",
2607 | " 20170339_1_play.log | \n",
2608 | "
\n",
2609 | " \n",
2610 | " | 38348109 | \n",
2611 | " 168257965 | \n",
2612 | " ip | \n",
2613 | " 235500 | \n",
2614 | " 0 | \n",
2615 | " 分裂 | \n",
2616 | " [内地版 | \n",
2617 | " 周杰伦 | \n",
2618 | " 0 | \n",
2619 | " 252 | \n",
2620 | " 0 | \n",
2621 | " 20170402_2_play.log | \n",
2622 | "
\n",
2623 | " \n",
2624 | " | 38348113 | \n",
2625 | " 168257965 | \n",
2626 | " ip | \n",
2627 | " 235500 | \n",
2628 | " 0 | \n",
2629 | " 分裂 | \n",
2630 | " [内地版 | \n",
2631 | " 周杰伦 | \n",
2632 | " 0 | \n",
2633 | " 252 | \n",
2634 | " 0 | \n",
2635 | " 20170402_2_play.log | \n",
2636 | "
\n",
2637 | " \n",
2638 | " | 38505140 | \n",
2639 | " 168521650 | \n",
2640 | " ar | \n",
2641 | " 7080647 | \n",
2642 | " 0 | \n",
2643 | " 曹云金、刘云天《奋斗》(2012) | \n",
2644 | " NaN | \n",
2645 | " 春晚相声集锦 | \n",
2646 | " 219 | \n",
2647 | " 725 | \n",
2648 | " 0 | \n",
2649 | " 20170402_2_play.log | \n",
2650 | "
\n",
2651 | " \n",
2652 | " | 38518065 | \n",
2653 | " 168851902 | \n",
2654 | " ar | \n",
2655 | " 6916311 | \n",
2656 | " 0 | \n",
2657 | " 多少的爱都不要(???? ???????? | \n",
2658 | " ) | \n",
2659 | " Ten Nararak | \n",
2660 | " 414 | \n",
2661 | " 415 | \n",
2662 | " 0 | \n",
2663 | " 20170402_3_play.log | \n",
2664 | "
\n",
2665 | " \n",
2666 | " | 38519838 | \n",
2667 | " 168851902 | \n",
2668 | " ar | \n",
2669 | " 6916311 | \n",
2670 | " 0 | \n",
2671 | " 多少的爱都不要(???? ???????? | \n",
2672 | " ) | \n",
2673 | " Ten Nararak | \n",
2674 | " 415 | \n",
2675 | " 415 | \n",
2676 | " 0 | \n",
2677 | " 20170402_3_play.log | \n",
2678 | "
\n",
2679 | " \n",
2680 | " | 38679451 | \n",
2681 | " 169001484 | \n",
2682 | " ar | \n",
2683 | " 6916311 | \n",
2684 | " 0 | \n",
2685 | " 多少的爱都不要(???? ???????? | \n",
2686 | " ) | \n",
2687 | " Ten Nararak | \n",
2688 | " 400 | \n",
2689 | " 415 | \n",
2690 | " 0 | \n",
2691 | " 20170402_3_play.log | \n",
2692 | "
\n",
2693 | " \n",
2694 | " | 38842511 | \n",
2695 | " 167947839 | \n",
2696 | " ip | \n",
2697 | " 6916311 | \n",
2698 | " 0 | \n",
2699 | " 多少的爱都不要(???? ???????? | \n",
2700 | " ) | \n",
2701 | " Ten Nararak | \n",
2702 | " 3 | \n",
2703 | " 415 | \n",
2704 | " 0 | \n",
2705 | " 20170506_1_play.log | \n",
2706 | "
\n",
2707 | " \n",
2708 | " | 38867412 | \n",
2709 | " 167922308 | \n",
2710 | " ar | \n",
2711 | " 6916311 | \n",
2712 | " 0 | \n",
2713 | " 多少的爱都不要(???? ???????? | \n",
2714 | " ) | \n",
2715 | " Ten Nararak | \n",
2716 | " 414 | \n",
2717 | " 415 | \n",
2718 | " 0 | \n",
2719 | " 20170506_1_play.log | \n",
2720 | "
\n",
2721 | " \n",
2722 | " | 38870804 | \n",
2723 | " 167922308 | \n",
2724 | " ar | \n",
2725 | " 6916311 | \n",
2726 | " 0 | \n",
2727 | " 多少的爱都不要(???? ???????? | \n",
2728 | " ) | \n",
2729 | " Ten Nararak | \n",
2730 | " 415 | \n",
2731 | " 415 | \n",
2732 | " 0 | \n",
2733 | " 20170506_1_play.log | \n",
2734 | "
\n",
2735 | " \n",
2736 | " | 38990351 | \n",
2737 | " 167775288 | \n",
2738 | " ar | \n",
2739 | " 6916311 | \n",
2740 | " 0 | \n",
2741 | " 多少的爱都不要(???? ???????? | \n",
2742 | " ) | \n",
2743 | " Ten Nararak | \n",
2744 | " 414 | \n",
2745 | " 415 | \n",
2746 | " 0 | \n",
2747 | " 20170425_1_play.log | \n",
2748 | "
\n",
2749 | " \n",
2750 | " | 39088806 | \n",
2751 | " 167947839 | \n",
2752 | " ip | \n",
2753 | " 6916311 | \n",
2754 | " 0 | \n",
2755 | " 多少的爱都不要(???? ???????? | \n",
2756 | " ) | \n",
2757 | " Ten Nararak | \n",
2758 | " 0 | \n",
2759 | " 415 | \n",
2760 | " 0 | \n",
2761 | " 20170425_1_play.log | \n",
2762 | "
\n",
2763 | " \n",
2764 | " | 39191178 | \n",
2765 | " 167947839 | \n",
2766 | " ip | \n",
2767 | " 6916311 | \n",
2768 | " 0 | \n",
2769 | " 多少的爱都不要(???? ???????? | \n",
2770 | " ) | \n",
2771 | " Ten Nararak | \n",
2772 | " 29 | \n",
2773 | " 415 | \n",
2774 | " 0 | \n",
2775 | " 20170412_1_play.log | \n",
2776 | "
\n",
2777 | " \n",
2778 | " | 39230197 | \n",
2779 | " 167778188 | \n",
2780 | " ar | \n",
2781 | " 6916311 | \n",
2782 | " 2 | \n",
2783 | " 多少的爱都不要(???? ???????? | \n",
2784 | " ) | \n",
2785 | " Ten Nararak | \n",
2786 | " 9>\u000f}(183.214.21.145)TM | \n",
2787 | " 430 | \n",
2788 | " 0 | \n",
2789 | " 20170412_1_play.log | \n",
2790 | "
\n",
2791 | " \n",
2792 | " | 39291614 | \n",
2793 | " 167906765 | \n",
2794 | " ip | \n",
2795 | " 6916311 | \n",
2796 | " 0 | \n",
2797 | " 多少的爱都不要(???? ???????? | \n",
2798 | " ) | \n",
2799 | " Ten Nararak | \n",
2800 | " 5 | \n",
2801 | " 415 | \n",
2802 | " 0 | \n",
2803 | " 20170412_1_play.log | \n",
2804 | "
\n",
2805 | " \n",
2806 | " | 39439120 | \n",
2807 | " 168697697 | \n",
2808 | " ar | \n",
2809 | " 6916311 | \n",
2810 | " 0 | \n",
2811 | " 多少的爱都不要(???? ???????? | \n",
2812 | " ) | \n",
2813 | " Ten Nararak | \n",
2814 | " 4 | \n",
2815 | " 415 | \n",
2816 | " 0 | \n",
2817 | " 20170429_3_play.log | \n",
2818 | "
\n",
2819 | " \n",
2820 | " | 39534457 | \n",
2821 | " 168084784 | \n",
2822 | " ar | \n",
2823 | " 6916311 | \n",
2824 | " 2 | \n",
2825 | " 多少的爱都不要(???? ???????? | \n",
2826 | " ) | \n",
2827 | " Ten Nararak | \n",
2828 | " 191>(39.88.19.100)TM | \n",
2829 | " 430 | \n",
2830 | " 0 | \n",
2831 | " 20170429_2_play.log | \n",
2832 | "
\n",
2833 | " \n",
2834 | " | 39540010 | \n",
2835 | " 168084784 | \n",
2836 | " ar | \n",
2837 | " 6916311 | \n",
2838 | " 2 | \n",
2839 | " 多少的爱都不要(???? ???????? | \n",
2840 | " ) | \n",
2841 | " Ten Nararak | \n",
2842 | " 19>(39.88.19.100)TM | \n",
2843 | " 430 | \n",
2844 | " 0 | \n",
2845 | " 20170429_2_play.log | \n",
2846 | "
\n",
2847 | " \n",
2848 | "
\n",
2849 | "
66050 rows × 11 columns
\n",
2850 | "
"
2851 | ],
2852 | "text/plain": [
2853 | " uid device song_id song_type song_name \\\n",
2854 | "58059 168146144 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2855 | "58741 168146144 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2856 | "164818 168700735 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2857 | "178183 168647140 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2858 | "178398 168647140 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2859 | "433073 0 ar 235500 1 分裂 \n",
2860 | "447433 168647140 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2861 | "469348 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2862 | "486671 168271854 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2863 | "505531 0 ar 235500 1 分裂 \n",
2864 | "506282 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2865 | "506327 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2866 | "506369 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2867 | "506412 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2868 | "506466 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2869 | "506513 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2870 | "510426 137084142 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2871 | "532688 168164990 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2872 | "586643 168263591 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2873 | "655350 167636505 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2874 | "682423 168263591 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2875 | "760977 167988542 ar 235500 0 分裂 \n",
2876 | "801213 167636505 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2877 | "806276 167894057 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2878 | "819955 167721050 ar 5989117 0 38. \n",
2879 | "895604 168820890 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2880 | "896653 168820890 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2881 | "910055 161741167 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2882 | "910056 161741167 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2883 | "910057 161741167 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2884 | "... ... ... ... ... ... \n",
2885 | "37008272 167778188 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2886 | "37178187 168725697 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2887 | "37292655 1685126 ar 0 1 LOSER \n",
2888 | "37313545 168248860 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2889 | "37465800 1685126 ar 0 1 LOSER \n",
2890 | "37612451 168851902 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2891 | "37920919 167775288 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2892 | "38044243 167775288 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2893 | "38063913 167632676 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2894 | "38128310 167819640 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2895 | "38166424 168318095 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2896 | "38226566 167998697 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2897 | "38231484 167604900 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2898 | "38348109 168257965 ip 235500 0 分裂 \n",
2899 | "38348113 168257965 ip 235500 0 分裂 \n",
2900 | "38505140 168521650 ar 7080647 0 曹云金、刘云天《奋斗》(2012) \n",
2901 | "38518065 168851902 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2902 | "38519838 168851902 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2903 | "38679451 169001484 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2904 | "38842511 167947839 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2905 | "38867412 167922308 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2906 | "38870804 167922308 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2907 | "38990351 167775288 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2908 | "39088806 167947839 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2909 | "39191178 167947839 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2910 | "39230197 167778188 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2911 | "39291614 167906765 ip 6916311 0 多少的爱都不要(???? ???????? \n",
2912 | "39439120 168697697 ar 6916311 0 多少的爱都不要(???? ???????? \n",
2913 | "39534457 168084784 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2914 | "39540010 168084784 ar 6916311 2 多少的爱都不要(???? ???????? \n",
2915 | "\n",
2916 | " singer play_time \\\n",
2917 | "58059 ) Ten Nararak \n",
2918 | "58741 ) Ten Nararak \n",
2919 | "164818 ) Ten Nararak \n",
2920 | "178183 ) Ten Nararak \n",
2921 | "178398 ) Ten Nararak \n",
2922 | "433073 [内地版 周杰伦 \n",
2923 | "447433 ) Ten Nararak \n",
2924 | "469348 ) Ten Nararak \n",
2925 | "486671 ) Ten Nararak \n",
2926 | "505531 [内地版 周杰伦 \n",
2927 | "506282 ) Ten Nararak \n",
2928 | "506327 ) Ten Nararak \n",
2929 | "506369 ) Ten Nararak \n",
2930 | "506412 ) Ten Nararak \n",
2931 | "506466 ) Ten Nararak \n",
2932 | "506513 ) Ten Nararak \n",
2933 | "510426 ) Ten Nararak \n",
2934 | "532688 ) Ten Nararak \n",
2935 | "586643 ) Ten Nararak \n",
2936 | "655350 ) Ten Nararak \n",
2937 | "682423 ) Ten Nararak \n",
2938 | "760977 [内地版 周杰伦 \n",
2939 | "801213 ) Ten Nararak \n",
2940 | "806276 ) Ten Nararak \n",
2941 | "819955 Hurry or you'll be late for school. (快点儿,上学该迟到了。) 分级加字幕轻松练听力 \n",
2942 | "895604 ) Ten Nararak \n",
2943 | "896653 ) Ten Nararak \n",
2944 | "910055 ) Ten Nararak \n",
2945 | "910056 ) Ten Nararak \n",
2946 | "910057 ) Ten Nararak \n",
2947 | "... ... ... \n",
2948 | "37008272 ) Ten Nararak \n",
2949 | "37178187 ) Ten Nararak \n",
2950 | "37292655 Bigbang NaN \n",
2951 | "37313545 ) Ten Nararak \n",
2952 | "37465800 Bigbang NaN \n",
2953 | "37612451 ) Ten Nararak \n",
2954 | "37920919 ) Ten Nararak \n",
2955 | "38044243 ) Ten Nararak \n",
2956 | "38063913 ) Ten Nararak \n",
2957 | "38128310 ) Ten Nararak \n",
2958 | "38166424 ) Ten Nararak \n",
2959 | "38226566 ) Ten Nararak \n",
2960 | "38231484 ) Ten Nararak \n",
2961 | "38348109 [内地版 周杰伦 \n",
2962 | "38348113 [内地版 周杰伦 \n",
2963 | "38505140 NaN 春晚相声集锦 \n",
2964 | "38518065 ) Ten Nararak \n",
2965 | "38519838 ) Ten Nararak \n",
2966 | "38679451 ) Ten Nararak \n",
2967 | "38842511 ) Ten Nararak \n",
2968 | "38867412 ) Ten Nararak \n",
2969 | "38870804 ) Ten Nararak \n",
2970 | "38990351 ) Ten Nararak \n",
2971 | "39088806 ) Ten Nararak \n",
2972 | "39191178 ) Ten Nararak \n",
2973 | "39230197 ) Ten Nararak \n",
2974 | "39291614 ) Ten Nararak \n",
2975 | "39439120 ) Ten Nararak \n",
2976 | "39534457 ) Ten Nararak \n",
2977 | "39540010 ) Ten Nararak \n",
2978 | "\n",
2979 | " song_length paid_flag file_name label \n",
2980 | "58059 412 415 0 20170410_2_play.log \n",
2981 | "58741 237>\u000f}(222.219.141.68)TM 430 0 20170410_2_play.log \n",
2982 | "164818 249 415 0 20170410_3_play.log \n",
2983 | "178183 258 415 0 20170410_3_play.log \n",
2984 | "178398 103 415 0 20170410_3_play.log \n",
2985 | "433073 104931 252 0 20170504_3_play.log \n",
2986 | "447433 211 415 0 20170504_3_play.log \n",
2987 | "469348 414 415 0 20170504_2_play.log \n",
2988 | "486671 1 415 0 20170504_2_play.log \n",
2989 | "505531 104931 252 0 20170504_2_play.log \n",
2990 | "506282 415 415 0 20170504_2_play.log \n",
2991 | "506327 0 415 0 20170504_2_play.log \n",
2992 | "506369 0 415 0 20170504_2_play.log \n",
2993 | "506412 415 415 0 20170504_2_play.log \n",
2994 | "506466 415 415 0 20170504_2_play.log \n",
2995 | "506513 415 415 0 20170504_2_play.log \n",
2996 | "510426 415 415 0 20170504_2_play.log \n",
2997 | "532688 345 415 0 20170504_2_play.log \n",
2998 | "586643 414 415 0 20170508_1_play.log \n",
2999 | "655350 0 415 0 20170505_1_play.log \n",
3000 | "682423 121 415 0 20170505_1_play.log \n",
3001 | "760977 13 252 0 20170411_1_play.log \n",
3002 | "801213 0 415 0 20170426_1_play.log \n",
3003 | "806276 207 415 0 20170426_1_play.log \n",
3004 | "819955 3 9 0 20170426_1_play.log \n",
3005 | "895604 58 430 0 20170509_3_play.log \n",
3006 | "896653 409 430 0 20170509_3_play.log \n",
3007 | "910055 47>\u000f}(223.104.38.39)TM 430 0 20170509_3_play.log \n",
3008 | "910056 5>\u000f}(223.104.38.39)TM 430 0 20170509_3_play.log \n",
3009 | "910057 5>=(223.104.38.39)TM 430 0 20170509_3_play.log \n",
3010 | "... ... ... ... ... \n",
3011 | "37008272 415 415 0 20170403_1_play.log \n",
3012 | "37178187 50 415 0 20170424_3_play.log \n",
3013 | "37292655 219336 219 0 20170413_2_play.log \n",
3014 | "37313545 213 415 0 20170413_2_play.log \n",
3015 | "37465800 219336 219 0 20170413_3_play.log \n",
3016 | "37612451 414 415 0 20170507_3_play.log \n",
3017 | "37920919 48 415 0 20170428_1_play.log \n",
3018 | "38044243 414 415 0 20170428_1_play.log \n",
3019 | "38063913 414 415 0 20170339_1_play.log \n",
3020 | "38128310 0 415 0 20170339_1_play.log \n",
3021 | "38166424 6 415 0 20170339_1_play.log \n",
3022 | "38226566 414 415 0 20170339_1_play.log \n",
3023 | "38231484 164 415 0 20170339_1_play.log \n",
3024 | "38348109 0 252 0 20170402_2_play.log \n",
3025 | "38348113 0 252 0 20170402_2_play.log \n",
3026 | "38505140 219 725 0 20170402_2_play.log \n",
3027 | "38518065 414 415 0 20170402_3_play.log \n",
3028 | "38519838 415 415 0 20170402_3_play.log \n",
3029 | "38679451 400 415 0 20170402_3_play.log \n",
3030 | "38842511 3 415 0 20170506_1_play.log \n",
3031 | "38867412 414 415 0 20170506_1_play.log \n",
3032 | "38870804 415 415 0 20170506_1_play.log \n",
3033 | "38990351 414 415 0 20170425_1_play.log \n",
3034 | "39088806 0 415 0 20170425_1_play.log \n",
3035 | "39191178 29 415 0 20170412_1_play.log \n",
3036 | "39230197 9>\u000f}(183.214.21.145)TM 430 0 20170412_1_play.log \n",
3037 | "39291614 5 415 0 20170412_1_play.log \n",
3038 | "39439120 4 415 0 20170429_3_play.log \n",
3039 | "39534457 191>(39.88.19.100)TM 430 0 20170429_2_play.log \n",
3040 | "39540010 19>(39.88.19.100)TM 430 0 20170429_2_play.log \n",
3041 | "\n",
3042 | "[66050 rows x 11 columns]"
3043 | ]
3044 | },
3045 | "execution_count": 26,
3046 | "metadata": {},
3047 | "output_type": "execute_result"
3048 | }
3049 | ],
3050 | "source": [
3051 | "df_play[df_play.paid_flag>0]"
3052 | ]
3053 | },
3054 | {
3055 | "cell_type": "markdown",
3056 | "metadata": {},
3057 | "source": [
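3058 | "#### Rows with paid_flag > 0 are actually malformed log lines: their fields are shifted to the right (in the table above the file name ends up under `label` and the real paid_flag value, 0, ends up under `file_name`).\n",
3059 | "#### There are no genuine records with paid_flag > 0."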
3060 | ]
3061 | },
3062 | {
3063 | "cell_type": "markdown",
3064 | "metadata": {},
3065 | "source": [
3066 | "The important columns for analysis are: song_id, play_time, song_length and file_name, so drop the rows with missing values in any of these four fields."
3067 | ]
3068 | },
3069 | {
3070 | "cell_type": "code",
3071 | "execution_count": 27,
3072 | "metadata": {
3073 | "collapsed": true
3074 | },
3075 | "outputs": [],
3076 | "source": [
3077 | "df_play = df_play.loc[df_play.file_name.notnull() & \n",
3078 | " df_play.play_time.notnull() & \n",
3079 | " df_play.song_id.notnull() &\n",
3080 | " df_play.song_length.notnull()]\n"
3081 | ]
3082 | },
3083 | {
3084 | "cell_type": "code",
3085 | "execution_count": 28,
3086 | "metadata": {},
3087 | "outputs": [
3088 | {
3089 | "data": {
3090 | "text/plain": [
3091 | "uid 0\n",
3092 | "device 0\n",
3093 | "song_id 0\n",
3094 | "song_type 12913\n",
3095 | "song_name 771\n",
3096 | "singer 37883\n",
3097 | "play_time 0\n",
3098 | "song_length 0\n",
3099 | "paid_flag 0\n",
3100 | "file_name 0\n",
3101 | "label 0\n",
3102 | "dtype: int64"
3103 | ]
3104 | },
3105 | "execution_count": 28,
3106 | "metadata": {},
3107 | "output_type": "execute_result"
3108 | }
3109 | ],
3110 | "source": [
3111 | "df_play.isnull().sum(axis = 0)"
3112 | ]
3113 | },
3114 | {
3115 | "cell_type": "markdown",
3116 | "metadata": {},
3117 | "source": [
3118 | "#### Now fill the missing values of song_type with int 0, which is the most common song_type"
3119 | ]
3120 | },
3121 | {
3122 | "cell_type": "code",
3123 | "execution_count": 29,
3124 | "metadata": {},
3125 | "outputs": [
3126 | {
3127 | "data": {
3128 | "text/plain": [
3129 | "uid 0\n",
3130 | "device 0\n",
3131 | "song_id 0\n",
3132 | "song_type 0\n",
3133 | "song_name 771\n",
3134 | "singer 37883\n",
3135 | "play_time 0\n",
3136 | "song_length 0\n",
3137 | "paid_flag 0\n",
3138 | "file_name 0\n",
3139 | "label 0\n",
3140 | "dtype: int64"
3141 | ]
3142 | },
3143 | "execution_count": 29,
3144 | "metadata": {},
3145 | "output_type": "execute_result"
3146 | }
3147 | ],
3148 | "source": [
3149 | "df_play.loc[df_play.song_type.isnull(),'song_type'] = 0\n",
3150 | "df_play.isnull().sum(axis = 0)"
3151 | ]
3152 | },
3153 | {
3154 | "cell_type": "code",
3155 | "execution_count": 30,
3156 | "metadata": {},
3157 | "outputs": [
3158 | {
3159 | "data": {
3160 | "text/plain": [
3161 | "(39492892, 11)"
3162 | ]
3163 | },
3164 | "execution_count": 30,
3165 | "metadata": {},
3166 | "output_type": "execute_result"
3167 | }
3168 | ],
3169 | "source": [
3170 | "df_play.shape"
3171 | ]
3172 | },
3173 | {
3174 | "cell_type": "code",
3175 | "execution_count": 31,
3176 | "metadata": {},
3177 | "outputs": [
3178 | {
3179 | "data": {
3180 | "text/plain": [
3181 | "uid 0\n",
3182 | "device 0\n",
3183 | "song_id 0\n",
3184 | "song_type 0\n",
3185 | "song_name 205\n",
3186 | "singer 37715\n",
3187 | "play_time 0\n",
3188 | "song_length 0\n",
3189 | "paid_flag 0\n",
3190 | "file_name 0\n",
3191 | "label 0\n",
3192 | "dtype: int64"
3193 | ]
3194 | },
3195 | "execution_count": 31,
3196 | "metadata": {},
3197 | "output_type": "execute_result"
3198 | }
3199 | ],
3200 | "source": [
3201 | "# As song_length will be used later, drop logs that have both a null song_name and a non-positive song_length\n",
3202 | "df_play = df_play.loc[df_play.song_name.notnull() | (df_play.song_length > 0)]\n",
3203 | "df_play.isnull().sum(axis = 0)"
3204 | ]
3205 | },
3206 | {
3207 | "cell_type": "code",
3208 | "execution_count": 32,
3209 | "metadata": {
3210 | "collapsed": true
3211 | },
3212 | "outputs": [],
3213 | "source": [
3214 | "df_play = df_play.reset_index()"
3215 | ]
3216 | },
3217 | {
3218 | "cell_type": "code",
3219 | "execution_count": 33,
3220 | "metadata": {},
3221 | "outputs": [
3222 | {
3223 | "data": {
3224 | "text/html": [
3225 | "\n",
3226 | "
\n",
3227 | " \n",
3228 | " \n",
3229 | " | \n",
3230 | " index | \n",
3231 | " uid | \n",
3232 | " device | \n",
3233 | " song_id | \n",
3234 | " song_type | \n",
3235 | " song_name | \n",
3236 | " singer | \n",
3237 | " play_time | \n",
3238 | " song_length | \n",
3239 | " paid_flag | \n",
3240 | " file_name | \n",
3241 | " label | \n",
3242 | "
\n",
3243 | " \n",
3244 | " \n",
3245 | " \n",
3246 | " | 0 | \n",
3247 | " 0 | \n",
3248 | " 168308107 | \n",
3249 | " ar | \n",
3250 | " 162455 | \n",
3251 | " 0 | \n",
3252 | " 最初的梦想 | \n",
3253 | " 范玮琪 | \n",
3254 | " 296 | \n",
3255 | " 296 | \n",
3256 | " 0 | \n",
3257 | " 20170410_2_play.log | \n",
3258 | " 0 | \n",
3259 | "
\n",
3260 | " \n",
3261 | " | 1 | \n",
3262 | " 1 | \n",
3263 | " 168112765 | \n",
3264 | " ar | \n",
3265 | " 4393501 | \n",
3266 | " 0 | \n",
3267 | " 喜欢你(f101 粤) | \n",
3268 | " Beyond | \n",
3269 | " 272 | \n",
3270 | " 0 | \n",
3271 | " 0 | \n",
3272 | " 20170410_2_play.log | \n",
3273 | " 0 | \n",
3274 | "
\n",
3275 | " \n",
3276 | " | 2 | \n",
3277 | " 2 | \n",
3278 | " 168274411 | \n",
3279 | " ar | \n",
3280 | " 22833011 | \n",
3281 | " 0 | \n",
3282 | " 宽恕 | \n",
3283 | " 宽恕乐队 | \n",
3284 | " 24 | \n",
3285 | " 156 | \n",
3286 | " 0 | \n",
3287 | " 20170410_2_play.log | \n",
3288 | " 0 | \n",
3289 | "
\n",
3290 | " \n",
3291 | " | 3 | \n",
3292 | " 3 | \n",
3293 | " 0 | \n",
3294 | " ar | \n",
3295 | " 4266814 | \n",
3296 | " 1 | \n",
3297 | " 天使的翅膀 | \n",
3298 | " 徐誉滕 | \n",
3299 | " 214384 | \n",
3300 | " 0 | \n",
3301 | " 0 | \n",
3302 | " 20170410_2_play.log | \n",
3303 | " 0 | \n",
3304 | "
\n",
3305 | " \n",
3306 | " | 4 | \n",
3307 | " 4 | \n",
3308 | " 168274411 | \n",
3309 | " ar | \n",
3310 | " 176292 | \n",
3311 | " 0 | \n",
3312 | " 爱不爱我 | \n",
3313 | " 零点乐队 | \n",
3314 | " 333 | \n",
3315 | " 334 | \n",
3316 | " 0 | \n",
3317 | " 20170410_2_play.log | \n",
3318 | " 0 | \n",
3319 | "
\n",
3320 | " \n",
3321 | " | 5 | \n",
3322 | " 5 | \n",
3323 | " 168274411 | \n",
3324 | " ar | \n",
3325 | " 22833011 | \n",
3326 | " 0 | \n",
3327 | " 宽恕 | \n",
3328 | " 宽恕乐队 | \n",
3329 | " 155 | \n",
3330 | " 156 | \n",
3331 | " 0 | \n",
3332 | " 20170410_2_play.log | \n",
3333 | " 0 | \n",
3334 | "
\n",
3335 | " \n",
3336 | " | 6 | \n",
3337 | " 6 | \n",
3338 | " 168274411 | \n",
3339 | " ar | \n",
3340 | " 105279 | \n",
3341 | " 0 | \n",
3342 | " 曲终人散 | \n",
3343 | " 张宇 | \n",
3344 | " 0 | \n",
3345 | " 0 | \n",
3346 | " 0 | \n",
3347 | " 20170410_2_play.log | \n",
3348 | " 0 | \n",
3349 | "
\n",
3350 | " \n",
3351 | " | 7 | \n",
3352 | " 7 | \n",
3353 | " 168274411 | \n",
3354 | " ar | \n",
3355 | " 176292 | \n",
3356 | " 0 | \n",
3357 | " 爱不爱我 | \n",
3358 | " 零点乐队 | \n",
3359 | " 0 | \n",
3360 | " 0 | \n",
3361 | " 0 | \n",
3362 | " 20170410_2_play.log | \n",
3363 | " 0 | \n",
3364 | "
\n",
3365 | " \n",
3366 | " | 8 | \n",
3367 | " 8 | \n",
3368 | " 168515688 | \n",
3369 | " ip | \n",
3370 | " 6586179 | \n",
3371 | " 0 | \n",
3372 | " 漂洋过海来看你 | \n",
3373 | " 孙露 | \n",
3374 | " 326 | \n",
3375 | " 326 | \n",
3376 | " 0 | \n",
3377 | " 20170410_2_play.log | \n",
3378 | " 0 | \n",
3379 | "
\n",
3380 | " \n",
3381 | " | 9 | \n",
3382 | " 9 | \n",
3383 | " 32166203 | \n",
3384 | " ar | \n",
3385 | " 1034767 | \n",
3386 | " 0 | \n",
3387 | " 好想再爱你 | \n",
3388 | " 颜亚涛 | \n",
3389 | " 270 | \n",
3390 | " 0 | \n",
3391 | " 0 | \n",
3392 | " 20170410_2_play.log | \n",
3393 | " 0 | \n",
3394 | "
\n",
3395 | " \n",
3396 | "
\n",
3397 | "
"
3398 | ],
3399 | "text/plain": [
3400 | " index uid device song_id song_type song_name singer play_time \\\n",
3401 | "0 0 168308107 ar 162455 0 最初的梦想 范玮琪 296 \n",
3402 | "1 1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 \n",
3403 | "2 2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 \n",
3404 | "3 3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 \n",
3405 | "4 4 168274411 ar 176292 0 爱不爱我 零点乐队 333 \n",
3406 | "5 5 168274411 ar 22833011 0 宽恕 宽恕乐队 155 \n",
3407 | "6 6 168274411 ar 105279 0 曲终人散 张宇 0 \n",
3408 | "7 7 168274411 ar 176292 0 爱不爱我 零点乐队 0 \n",
3409 | "8 8 168515688 ip 6586179 0 漂洋过海来看你 孙露 326 \n",
3410 | "9 9 32166203 ar 1034767 0 好想再爱你 颜亚涛 270 \n",
3411 | "\n",
3412 | " song_length paid_flag file_name label \n",
3413 | "0 296 0 20170410_2_play.log 0 \n",
3414 | "1 0 0 20170410_2_play.log 0 \n",
3415 | "2 156 0 20170410_2_play.log 0 \n",
3416 | "3 0 0 20170410_2_play.log 0 \n",
3417 | "4 334 0 20170410_2_play.log 0 \n",
3418 | "5 156 0 20170410_2_play.log 0 \n",
3419 | "6 0 0 20170410_2_play.log 0 \n",
3420 | "7 0 0 20170410_2_play.log 0 \n",
3421 | "8 326 0 20170410_2_play.log 0 \n",
3422 | "9 0 0 20170410_2_play.log 0 "
3423 | ]
3424 | },
3425 | "execution_count": 33,
3426 | "metadata": {},
3427 | "output_type": "execute_result"
3428 | }
3429 | ],
3430 | "source": [
3431 | "df_play.head(10)"
3432 | ]
3433 | },
3434 | {
3435 | "cell_type": "code",
3436 | "execution_count": 34,
3437 | "metadata": {
3438 | "collapsed": true
3439 | },
3440 | "outputs": [],
3441 | "source": [
3442 | "# add date column\n",
3443 | "def get_date(file_name):\n",
3444 | "    '''Return the text before the first underscore of file_name, e.g. 20170410_2_play.log -> 20170410.'''\n",
3445 | "    return str(file_name).split('_', 1)[0]"
3446 | ]
3447 | },
3448 | {
3449 | "cell_type": "code",
3450 | "execution_count": 35,
3451 | "metadata": {
3452 | "collapsed": true
3453 | },
3454 | "outputs": [],
3455 | "source": [
3456 | "df_play['date'] = df_play['file_name'].map(get_date)"
3457 | ]
3458 | },
3459 | {
3460 | "cell_type": "code",
3461 | "execution_count": 36,
3462 | "metadata": {},
3463 | "outputs": [
3464 | {
3465 | "data": {
3466 | "text/plain": [
3467 | "ar 32365780\n",
3468 | "ip 3826808\n",
3469 | "ar 2744122\n",
3470 | "ip 555615\n",
3471 | "168589573 1\n",
3472 | "dtype: int64"
3473 | ]
3474 | },
3475 | "execution_count": 36,
3476 | "metadata": {},
3477 | "output_type": "execute_result"
3478 | }
3479 | ],
3480 | "source": [
3481 | "df_play.device.value_counts()"
3482 | ]
3483 | },
3484 | {
3485 | "cell_type": "code",
3486 | "execution_count": 37,
3487 | "metadata": {},
3488 | "outputs": [
3489 | {
3490 | "data": {
3491 | "text/plain": [
3492 | "array(['ar', 'ip', 'ip ', 'ar ', '168589573'], dtype=object)"
3493 | ]
3494 | },
3495 | "execution_count": 37,
3496 | "metadata": {},
3497 | "output_type": "execute_result"
3498 | }
3499 | ],
3500 | "source": [
3501 | "# Looks like almost all selected rows come from android ('ar') or iphone ('ip') devices; some values carry a trailing space and one is a stray number.\n",
3502 | "df_play.device.unique()"
3503 | ]
3504 | },
3505 | {
3506 | "cell_type": "code",
3507 | "execution_count": 38,
3508 | "metadata": {
3509 | "collapsed": true
3510 | },
3511 | "outputs": [],
3512 | "source": [
3513 | "def remove_space(word):\n",
3514 | "    '''Return str(word) with trailing whitespace stripped (leading whitespace is kept).'''\n",
3515 | "    return str(word).rstrip()\n",
3516 | "df_play['device'] = df_play['device'].map(remove_space)"
3517 | ]
3518 | },
3519 | {
3520 | "cell_type": "code",
3521 | "execution_count": 39,
3522 | "metadata": {},
3523 | "outputs": [
3524 | {
3525 | "data": {
3526 | "text/plain": [
3527 | "array(['ar', 'ip', '168589573'], dtype=object)"
3528 | ]
3529 | },
3530 | "execution_count": 39,
3531 | "metadata": {},
3532 | "output_type": "execute_result"
3533 | }
3534 | ],
3535 | "source": [
3536 | "df_play.device.unique()"
3537 | ]
3538 | },
3539 | {
3540 | "cell_type": "code",
3541 | "execution_count": 40,
3542 | "metadata": {},
3543 | "outputs": [
3544 | {
3545 | "data": {
3546 | "text/html": [
3547 | "\n",
3548 | "
\n",
3549 | " \n",
3550 | " \n",
3551 | " | \n",
3552 | " index | \n",
3553 | " uid | \n",
3554 | " device | \n",
3555 | " song_id | \n",
3556 | " song_type | \n",
3557 | " song_name | \n",
3558 | " singer | \n",
3559 | " play_time | \n",
3560 | " song_length | \n",
3561 | " paid_flag | \n",
3562 | " file_name | \n",
3563 | " label | \n",
3564 | " date | \n",
3565 | "
\n",
3566 | " \n",
3567 | " \n",
3568 | " \n",
3569 | " | 0 | \n",
3570 | " 0 | \n",
3571 | " 168308107 | \n",
3572 | " ar | \n",
3573 | " 162455 | \n",
3574 | " 0 | \n",
3575 | " 最初的梦想 | \n",
3576 | " 范玮琪 | \n",
3577 | " 296 | \n",
3578 | " 296 | \n",
3579 | " 0 | \n",
3580 | " 20170410_2_play.log | \n",
3581 | " 0 | \n",
3582 | " 20170410 | \n",
3583 | "
\n",
3584 | " \n",
3585 | " | 1 | \n",
3586 | " 1 | \n",
3587 | " 168112765 | \n",
3588 | " ar | \n",
3589 | " 4393501 | \n",
3590 | " 0 | \n",
3591 | " 喜欢你(f101 粤) | \n",
3592 | " Beyond | \n",
3593 | " 272 | \n",
3594 | " 0 | \n",
3595 | " 0 | \n",
3596 | " 20170410_2_play.log | \n",
3597 | " 0 | \n",
3598 | " 20170410 | \n",
3599 | "
\n",
3600 | " \n",
3601 | " | 2 | \n",
3602 | " 2 | \n",
3603 | " 168274411 | \n",
3604 | " ar | \n",
3605 | " 22833011 | \n",
3606 | " 0 | \n",
3607 | " 宽恕 | \n",
3608 | " 宽恕乐队 | \n",
3609 | " 24 | \n",
3610 | " 156 | \n",
3611 | " 0 | \n",
3612 | " 20170410_2_play.log | \n",
3613 | " 0 | \n",
3614 | " 20170410 | \n",
3615 | "
\n",
3616 | " \n",
3617 | " | 3 | \n",
3618 | " 3 | \n",
3619 | " 0 | \n",
3620 | " ar | \n",
3621 | " 4266814 | \n",
3622 | " 1 | \n",
3623 | " 天使的翅膀 | \n",
3624 | " 徐誉滕 | \n",
3625 | " 214384 | \n",
3626 | " 0 | \n",
3627 | " 0 | \n",
3628 | " 20170410_2_play.log | \n",
3629 | " 0 | \n",
3630 | " 20170410 | \n",
3631 | "
\n",
3632 | " \n",
3633 | " | 4 | \n",
3634 | " 4 | \n",
3635 | " 168274411 | \n",
3636 | " ar | \n",
3637 | " 176292 | \n",
3638 | " 0 | \n",
3639 | " 爱不爱我 | \n",
3640 | " 零点乐队 | \n",
3641 | " 333 | \n",
3642 | " 334 | \n",
3643 | " 0 | \n",
3644 | " 20170410_2_play.log | \n",
3645 | " 0 | \n",
3646 | " 20170410 | \n",
3647 | "
\n",
3648 | " \n",
3649 | "
\n",
3650 | "
"
3651 | ],
3652 | "text/plain": [
3653 | " index uid device song_id song_type song_name singer play_time \\\n",
3654 | "0 0 168308107 ar 162455 0 最初的梦想 范玮琪 296 \n",
3655 | "1 1 168112765 ar 4393501 0 喜欢你(f101 粤) Beyond 272 \n",
3656 | "2 2 168274411 ar 22833011 0 宽恕 宽恕乐队 24 \n",
3657 | "3 3 0 ar 4266814 1 天使的翅膀 徐誉滕 214384 \n",
3658 | "4 4 168274411 ar 176292 0 爱不爱我 零点乐队 333 \n",
3659 | "\n",
3660 | " song_length paid_flag file_name label date \n",
3661 | "0 296 0 20170410_2_play.log 0 20170410 \n",
3662 | "1 0 0 20170410_2_play.log 0 20170410 \n",
3663 | "2 156 0 20170410_2_play.log 0 20170410 \n",
3664 | "3 0 0 20170410_2_play.log 0 20170410 \n",
3665 | "4 334 0 20170410_2_play.log 0 20170410 "
3666 | ]
3667 | },
3668 | "execution_count": 40,
3669 | "metadata": {},
3670 | "output_type": "execute_result"
3671 | }
3672 | ],
3673 | "source": [
3674 | "df_play.head()"
3675 | ]
3676 | },
3677 | {
3678 | "cell_type": "markdown",
3679 | "metadata": {},
3680 | "source": [
3681 | "### save file"
3682 | ]
3683 | },
3684 | {
3685 | "cell_type": "code",
3686 | "execution_count": 41,
3687 | "metadata": {
3688 | "collapsed": true
3689 | },
3690 | "outputs": [],
3691 | "source": [
3692 | "df_play.to_csv('/Users/ZhijingYe/Desktop/data/output/play_sample_log.csv',sep='\\t')"
3693 | ]
3694 | },
3695 | {
3696 | "cell_type": "code",
3697 | "execution_count": null,
3698 | "metadata": {
3699 | "collapsed": true
3700 | },
3701 | "outputs": [],
3702 | "source": []
3703 | }
3704 | ],
3705 | "metadata": {
3706 | "kernelspec": {
3707 | "display_name": "python27",
3708 | "language": "python",
3709 | "name": "python27"
3710 | },
3711 | "language_info": {
3712 | "codemirror_mode": {
3713 | "name": "ipython",
3714 | "version": 2
3715 | },
3716 | "file_extension": ".py",
3717 | "mimetype": "text/x-python",
3718 | "name": "python",
3719 | "nbconvert_exporter": "python",
3720 | "pygments_lexer": "ipython2",
3721 | "version": "2.7.14"
3722 | }
3723 | },
3724 | "nbformat": 4,
3725 | "nbformat_minor": 2
3726 | }
3727 |
--------------------------------------------------------------------------------