├── README.md ├── python操作 ├── python操作.ipynb └── read_split_csv.py ├── 数据分析实践 ├── task1_数据探索分析.ipynb ├── task2_特征衍生和选择.ipynb ├── task3_建模和评分.ipynb ├── task4_模型评估.ipynb ├── task4_模型评估(整理版).ipynb ├── task5_模型调优.ipynb ├── task6_模型融合.ipynb └── 多分类的评估问题.ipynb └── 爬虫与网页分析 ├── data ├── 1396354 ├── 1404079 └── actor_info1 └── 网页解析.py /README.md: -------------------------------------------------------------------------------- 1 | # python_data_analyse 2 | python常用数据分析代码和技巧 3 | 4 | 1.python操作: 5 | 目的:积累python操作 6 | 内容: 7 | 8 | 2.爬虫与网页分析: 9 | 目的:爬虫与网页分析 10 | 内容:(1)网页解析.py:把爬取的人物豆瓣网页和数据库的人物百度百科信息合并起来。 11 | 用BeautifulSoup做豆瓣网页的解析和信息提取。 12 | 数据在data/下面 13 | 14 | 3.数据分析实践: 15 | 目的:数据分析组队学习内容和数据竞赛代码积累 16 | 17 | -------------------------------------------------------------------------------- /python操作/python操作.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### bz2压缩文件的操作" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# 对于bz2压缩文件的操作\n", 19 | "\n", 20 | "import bz2\n", 21 | "with bz2.BZ2File(txtpath,'r') as f:\n", 22 | " for line in f:\n", 23 | " line = str(line, encoding = \"utf8\") #必须要加,否则\n", 24 | " \n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 对于json的操作\n", 32 | "\n", 33 | "一、json是一种编码格式,并不是dict\n", 34 | "1. json.dump() 和 json.load() 来编码和解码JSON数据,用于处理文件 \n", 35 | "2. json.dumps将一个Python数据结构转换为JSON,json.loads将一个JSON编码的字符串转换回一个Python数据结构:\n", 36 | "3. ensure_ascii=False 用在dump的时候,中文不乱码\n", 37 | "\n", 38 | "\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import json\n", 50 | "\n", 51 | "# 1. 
将整个数据结构写入json文件或者从json文件读出:\n", 52 | "\n", 53 | "with open('test.json','w',encoding = 'utf-8') as fw:\n", 54 | " json.dump(data_dict, fw, ensure_ascii=False)\n", 55 | "with open('test.json','r',encoding = 'utf-8') as fr:\n", 56 | " date = json.load(fr)\n", 57 | " \n", 58 | "# 2 . 将数据结构一行行json编码写入文件,或者从文件读出来\n", 59 | " #一行行写入\n", 60 | "tmp_dict = {'question':'fasdfa','head':'fadf'}\n", 61 | "result_f.write(json.dumps(tmp_dict,ensure_ascii=False) + '\\n')\n", 62 | " # 一行行读出来\n", 63 | " with open('ner_result.txt','r',encoding='utf-8') as fr:\n", 64 | " for line in fr:\n", 65 | " line = line.strip()\n", 66 | " tmp_dict = json.loads(line)\n", 67 | "\n", 68 | "\n", 69 | "\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### 文件操作" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "### 1 . 同时对两个文件操作\n", 88 | "with open('a1.txt','r',encoding='utf-8') as fr, open('a2.txt','a',encoding='utf-8') as fw:\n", 89 | " for line in fr: # 读文件\n", 90 | " line = line.strip() # 惯用处理,可以去掉头尾空格和换行符\n", 91 | " # strip() 方法用于移除字符串头尾指定的字符(默认为空格或换行符,包括'\\n', '\\r', '\\t', ' ')\n", 92 | " part = line.split('\\t') # split \n", 93 | " if len(part) == 2:\n", 94 | " key = part[0]\n", 95 | " value = part[1]\n", 96 | " \n", 97 | " \n", 98 | "# 2 . 
把一个数组遍历写入文件 writelines: 可以写迭代结构, write只能写入str\n", 99 | "\"\"\"读+写\"\"\"\n", 100 | "#读文件,做处理,再写入文件\n", 101 | "li = []\n", 102 | "with open(\"./data/test\",encoding='utf-8') as fr:\n", 103 | " for line in fr:\n", 104 | " \n", 105 | " line =line.strip()\n", 106 | " li.append('1'+'\\t'+ line)#做处理 ,放到列表中\n", 107 | " \n", 108 | "with open('data/test_xkj1','a',encoding='utf-8') as fw:\n", 109 | " fw.writelines(e + '\\n' for e in li) " 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### 字符串操作" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 20, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "ename": "TypeError", 126 | "evalue": "write() argument must be str, not generator", 127 | "output_type": "error", 128 | "traceback": [ 129 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 130 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 131 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mli\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'a'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'b'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'c'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'data/test_xkj2'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'a'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'utf-8'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfw\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mfw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m'\\n'\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0me\u001b[0m 
\u001b[1;32min\u001b[0m \u001b[0mli\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 132 | "\u001b[1;31mTypeError\u001b[0m: write() argument must be str, not generator" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# \n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### 字典 dict操作" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 22, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "dict_keys(['b', 'c'])\n", 157 | "['b', 'c']\n", 158 | "dict_values([2, 3])\n", 159 | "dict_items([('b', 2), ('c', 3)])\n" 160 | ] 161 | }, 162 | { 163 | "ename": "NameError", 164 | "evalue": "name 'cmp' is not defined", 165 | "output_type": "error", 166 | "traceback": [ 167 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 168 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 169 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_dict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcmp\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_dict\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdata_dict2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 170 | "\u001b[1;31mNameError\u001b[0m: name 'cmp' is not defined" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "data_dict = {}\n", 176 | "data_dict2 = {\"a\":1,\"b\":2, \"c\":3}\n", 177 | "# 增加\n", 178 | "data_dict.update({\"a\":1,\"b\":2, \"c\":3})\n", 
179 | "\n", 180 | "# 删除某个键值\n", 181 | "del data_dict[\"a\"]\n", 182 | "\n", 183 | "# 判断键值在不在\n", 184 | "if \"a\" in data_dict:\n", 185 | " print(\"a\")\n", 186 | "\n", 187 | "# 返回key值\n", 188 | "print(data_dict.keys()) #不是list类型要强制转换,\n", 189 | "print(list(data_dict.keys()))\n", 190 | "\n", 191 | "# 返回value值\n", 192 | "print(data_dict.values())\n", 193 | "\n", 194 | "# 返回可遍历的元组形式\n", 195 | "print(data_dict.items())\n", 196 | "\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### 条件判断" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "# if not array\n", 215 | "\n", 216 | "# 哪些东西相当于空" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.6.9" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 2 241 | } 242 | -------------------------------------------------------------------------------- /python操作/read_split_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import pickle 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.cross_validation import train_test_split 7 | 8 | """ 9 | python将大csv文件划分成小csv文件做训练集和测试集及对应label文件 10 | """ 11 | 12 | def read_data(test_data='input/train.csv', n=0, label=1): 13 | ''' 14 | 加载数据的功能 15 | n:特征数据起始位 16 | label:是否是监督样本数据 17 | ''' 18 | csv_reader = csv.reader(open(test_data, encoding="utf8", errors="ignore")) 19 | data_list = [] 20 | for one_line in csv_reader: 21 | 
data_list.append(one_line) 22 | x_list = [] 23 | y_list = [] 24 | for one_line in data_list[1:]: 25 | if label == 1:#如果是监督样本数据 26 | y_list.append(int(one_line[-1])) # 标志位(最后一位都是标签位) 27 | one_list = [o for o in one_line[n:-1]] 28 | x_list.append(one_list) 29 | else: 30 | one_list = [o for o in one_line[n:]] 31 | x_list.append(one_list) 32 | return x_list, y_list 33 | 34 | def split_data(data_list, y_list, ratio=0.30):#70%训练集,30%测试集: 914285,391837 35 | ''' 36 | 按照指定的比例,划分样本数据集 37 | ratio: 测试数据的比率 38 | ''' 39 | X_train, X_test, y_train, y_test = train_test_split(data_list, y_list, test_size=ratio, random_state=50) 40 | 41 | """训练集""" 42 | with open('input/sub_train.csv', 'w', encoding="utf8",newline="", errors="ignore") as csvfile:#不加newline=""的话会空一行出来 43 | fieldnames = ['qid', 'question_text','target'] 44 | write = csv.DictWriter(csvfile,fieldnames=fieldnames) 45 | write.writeheader()#写表头 46 | for i in range(len(X_train)): 47 | write.writerow({'qid':X_train[i][0],'question_text':X_train[i][1],'target':y_train[i]}) 48 | 49 | """测试集""" 50 | #标签文件 51 | with open('input/sub_test_y', 'w') as fp: 52 | json.dump(y_test, fp) 53 | #测试csv 54 | with open('input/sub_test_x.csv', 'w', encoding="utf8",newline="", errors="ignore") as csvfile:#不加newline=""的话会空一行出来 55 | fieldnames = ['qid', 'question_text'] 56 | write = csv.DictWriter(csvfile,fieldnames=fieldnames) 57 | write.writeheader()#写表头 58 | for i in range(len(X_test)): 59 | write.writerow({'qid':X_test[i][0],'question_text':X_test[i][1]}) 60 | return X_train, X_test, y_train, y_test 61 | 62 | if __name__ == '__main__': 63 | """获取大文件的数据""" 64 | x_list, y_list=read_data() 65 | """划分为训练集和测试集及label文件""" 66 | split_data(x_list,y_list) 67 | -------------------------------------------------------------------------------- /数据分析实践/task1_数据探索分析.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 题目: 
\n", 8 | "这份数据集是金融数据(非原始数据,已经处理过了),我们要做的是预测贷款用户是否会逾期。表格中 \"status\" 是结果标签:0表示未逾期,1表示逾期。\n", 9 | "### 要求: \n", 10 | "数据切分方式 - 三七分,其中测试集30%,训练集70%,随机种子设置为2018\n", 11 | "### 任务1: \n", 12 | "对数据进行探索和分析。 \n", 13 | "\n", 14 | "数据类型的分析 \n", 15 | "无关特征删除 \n", 16 | "数据类型转换 \n", 17 | "缺失值处理 \n", 18 | "……以及你能想到和借鉴的数据分析处理" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### 一、 观察数据" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
Unnamed: 0custidtrade_nobank_card_nolow_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_day...loans_max_limitloans_avg_limitconsfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_day
05279185820180507115231274000000023057383卡号10.010.9900.900.550.313...2900.01688.01200.075.01.02.01200.01200.012.018.0
11053404720180507121002192000000023073000卡号10.020.9420001.281.000.458...3500.01758.015100.080.05.06.022800.09360.04.02.0
212284978720180507125159718000000023114911卡号10.040.9601.001.000.114...1600.01250.04200.087.01.01.04200.04200.02.06.0
313180970820180507121358683000000388283484卡号10.000.9620000.130.570.777...3200.01541.016300.080.05.05.030000.012180.02.04.0
414249982920180507115448545000000388205844卡号10.010.9900.461.000.175...2300.01630.08300.079.02.02.08400.08250.022.0120.0
\n", 202 | "

5 rows × 90 columns

\n", 203 | "
" 204 | ], 205 | "text/plain": [ 206 | " Unnamed: 0 custid trade_no bank_card_no \\\n", 207 | "0 5 2791858 20180507115231274000000023057383 卡号1 \n", 208 | "1 10 534047 20180507121002192000000023073000 卡号1 \n", 209 | "2 12 2849787 20180507125159718000000023114911 卡号1 \n", 210 | "3 13 1809708 20180507121358683000000388283484 卡号1 \n", 211 | "4 14 2499829 20180507115448545000000388205844 卡号1 \n", 212 | "\n", 213 | " low_volume_percent middle_volume_percent \\\n", 214 | "0 0.01 0.99 \n", 215 | "1 0.02 0.94 \n", 216 | "2 0.04 0.96 \n", 217 | "3 0.00 0.96 \n", 218 | "4 0.01 0.99 \n", 219 | "\n", 220 | " take_amount_in_later_12_month_highest trans_amount_increase_rate_lately \\\n", 221 | "0 0 0.90 \n", 222 | "1 2000 1.28 \n", 223 | "2 0 1.00 \n", 224 | "3 2000 0.13 \n", 225 | "4 0 0.46 \n", 226 | "\n", 227 | " trans_activity_month trans_activity_day ... \\\n", 228 | "0 0.55 0.313 ... \n", 229 | "1 1.00 0.458 ... \n", 230 | "2 1.00 0.114 ... \n", 231 | "3 0.57 0.777 ... \n", 232 | "4 1.00 0.175 ... 
\n", 233 | "\n", 234 | " loans_max_limit loans_avg_limit consfin_credit_limit \\\n", 235 | "0 2900.0 1688.0 1200.0 \n", 236 | "1 3500.0 1758.0 15100.0 \n", 237 | "2 1600.0 1250.0 4200.0 \n", 238 | "3 3200.0 1541.0 16300.0 \n", 239 | "4 2300.0 1630.0 8300.0 \n", 240 | "\n", 241 | " consfin_credibility consfin_org_count_current consfin_product_count \\\n", 242 | "0 75.0 1.0 2.0 \n", 243 | "1 80.0 5.0 6.0 \n", 244 | "2 87.0 1.0 1.0 \n", 245 | "3 80.0 5.0 5.0 \n", 246 | "4 79.0 2.0 2.0 \n", 247 | "\n", 248 | " consfin_max_limit consfin_avg_limit latest_query_day loans_latest_day \n", 249 | "0 1200.0 1200.0 12.0 18.0 \n", 250 | "1 22800.0 9360.0 4.0 2.0 \n", 251 | "2 4200.0 4200.0 2.0 6.0 \n", 252 | "3 30000.0 12180.0 2.0 4.0 \n", 253 | "4 8400.0 8250.0 22.0 120.0 \n", 254 | "\n", 255 | "[5 rows x 90 columns]" 256 | ] 257 | }, 258 | "execution_count": 1, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "import pandas as pd \n", 265 | "\n", 266 | "df = pd.read_csv('data.csv',encoding = 'gbk')\n", 267 | "df.head(5) " 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 2, 273 | "metadata": { 274 | "collapsed": false, 275 | "scrolled": true 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "\n", 283 | "RangeIndex: 4754 entries, 0 to 4753\n", 284 | "Data columns (total 90 columns):\n", 285 | "Unnamed: 0 4754 non-null int64\n", 286 | "custid 4754 non-null int64\n", 287 | "trade_no 4754 non-null object\n", 288 | "bank_card_no 4754 non-null object\n", 289 | "low_volume_percent 4752 non-null float64\n", 290 | "middle_volume_percent 4752 non-null float64\n", 291 | "take_amount_in_later_12_month_highest 4754 non-null int64\n", 292 | "trans_amount_increase_rate_lately 4751 non-null float64\n", 293 | "trans_activity_month 4752 non-null float64\n", 294 | "trans_activity_day 4752 non-null float64\n", 295 | "transd_mcc 4752 non-null float64\n", 296 
| "trans_days_interval_filter 4746 non-null float64\n", 297 | "trans_days_interval 4752 non-null float64\n", 298 | "regional_mobility 4752 non-null float64\n", 299 | "student_feature 1756 non-null float64\n", 300 | "repayment_capability 4754 non-null int64\n", 301 | "is_high_user 4754 non-null int64\n", 302 | "number_of_trans_from_2011 4752 non-null float64\n", 303 | "first_transaction_time 4752 non-null float64\n", 304 | "historical_trans_amount 4754 non-null int64\n", 305 | "historical_trans_day 4752 non-null float64\n", 306 | "rank_trad_1_month 4752 non-null float64\n", 307 | "trans_amount_3_month 4754 non-null int64\n", 308 | "avg_consume_less_12_valid_month 4752 non-null float64\n", 309 | "abs 4754 non-null int64\n", 310 | "top_trans_count_last_1_month 4752 non-null float64\n", 311 | "avg_price_last_12_month 4754 non-null int64\n", 312 | "avg_price_top_last_12_valid_month 4650 non-null float64\n", 313 | "reg_preference_for_trad 4752 non-null object\n", 314 | "trans_top_time_last_1_month 4746 non-null float64\n", 315 | "trans_top_time_last_6_month 4746 non-null float64\n", 316 | "consume_top_time_last_1_month 4746 non-null float64\n", 317 | "consume_top_time_last_6_month 4746 non-null float64\n", 318 | "cross_consume_count_last_1_month 4328 non-null float64\n", 319 | "trans_fail_top_count_enum_last_1_month 4738 non-null float64\n", 320 | "trans_fail_top_count_enum_last_6_month 4738 non-null float64\n", 321 | "trans_fail_top_count_enum_last_12_month 4738 non-null float64\n", 322 | "consume_mini_time_last_1_month 4728 non-null float64\n", 323 | "max_cumulative_consume_later_1_month 4754 non-null int64\n", 324 | "max_consume_count_later_6_month 4746 non-null float64\n", 325 | "railway_consume_count_last_12_month 4742 non-null float64\n", 326 | "pawns_auctions_trusts_consume_last_1_month 4754 non-null int64\n", 327 | "pawns_auctions_trusts_consume_last_6_month 4754 non-null int64\n", 328 | "jewelry_consume_count_last_6_month 4742 non-null float64\n", 329 | "status 
4754 non-null int64\n", 330 | "source 4754 non-null object\n", 331 | "first_transaction_day 4752 non-null float64\n", 332 | "trans_day_last_12_month 4752 non-null float64\n", 333 | "id_name 4478 non-null object\n", 334 | "apply_score 4450 non-null float64\n", 335 | "apply_credibility 4450 non-null float64\n", 336 | "query_org_count 4450 non-null float64\n", 337 | "query_finance_count 4450 non-null float64\n", 338 | "query_cash_count 4450 non-null float64\n", 339 | "query_sum_count 4450 non-null float64\n", 340 | "latest_query_time 4450 non-null object\n", 341 | "latest_one_month_apply 4450 non-null float64\n", 342 | "latest_three_month_apply 4450 non-null float64\n", 343 | "latest_six_month_apply 4450 non-null float64\n", 344 | "loans_score 4457 non-null float64\n", 345 | "loans_credibility_behavior 4457 non-null float64\n", 346 | "loans_count 4457 non-null float64\n", 347 | "loans_settle_count 4457 non-null float64\n", 348 | "loans_overdue_count 4457 non-null float64\n", 349 | "loans_org_count_behavior 4457 non-null float64\n", 350 | "consfin_org_count_behavior 4457 non-null float64\n", 351 | "loans_cash_count 4457 non-null float64\n", 352 | "latest_one_month_loan 4457 non-null float64\n", 353 | "latest_three_month_loan 4457 non-null float64\n", 354 | "latest_six_month_loan 4457 non-null float64\n", 355 | "history_suc_fee 4457 non-null float64\n", 356 | "history_fail_fee 4457 non-null float64\n", 357 | "latest_one_month_suc 4457 non-null float64\n", 358 | "latest_one_month_fail 4457 non-null float64\n", 359 | "loans_long_time 4457 non-null float64\n", 360 | "loans_latest_time 4457 non-null object\n", 361 | "loans_credit_limit 4457 non-null float64\n", 362 | "loans_credibility_limit 4457 non-null float64\n", 363 | "loans_org_count_current 4457 non-null float64\n", 364 | "loans_product_count 4457 non-null float64\n", 365 | "loans_max_limit 4457 non-null float64\n", 366 | "loans_avg_limit 4457 non-null float64\n", 367 | "consfin_credit_limit 4457 non-null float64\n", 
368 | "consfin_credibility 4457 non-null float64\n", 369 | "consfin_org_count_current 4457 non-null float64\n", 370 | "consfin_product_count 4457 non-null float64\n", 371 | "consfin_max_limit 4457 non-null float64\n", 372 | "consfin_avg_limit 4457 non-null float64\n", 373 | "latest_query_day 4450 non-null float64\n", 374 | "loans_latest_day 4457 non-null float64\n", 375 | "dtypes: float64(70), int64(13), object(7)\n", 376 | "memory usage: 3.3+ MB\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "df.info() #返回df的所有信息\n", 382 | "#我们可以知道 有4754条数据,有90个特征(列名)和他们的类别(有些特征的数据不够4754,但是也没看到空值这是咋回事?)\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 3, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/html": [ 395 | "
\n", 396 | "\n", 409 | "\n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " 
\n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | "
Unnamed: 0custidlow_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_daytransd_mcctrans_days_interval_filter...loans_max_limitloans_avg_limitconsfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_day
count4754.0000004.754000e+034752.0000004752.0000004754.0000004751.0000004752.0000004752.0000004752.0000004746.000000...4457.0000004457.0000004457.0000004457.0000004457.0000004457.0000004457.0000004457.0000004450.0000004457.000000
mean6008.4141781.690993e+060.0218060.9012941940.19772814.1606740.8044110.36542517.50294629.029920...3390.0381421820.3578649187.00919976.0426304.7323315.22750716153.6908238007.69688124.11280955.181512
std3452.0714281.034235e+060.0415270.1448563923.971494694.1804730.1969200.1701964.47561622.722432...1474.206546583.4182917371.25704314.5368192.9745963.40929214301.0376285679.41858537.72572453.486408
min5.0000001.140000e+020.0000000.0000000.0000000.0000000.1200000.0330002.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000-2.000000-2.000000
25%3106.0000007.593358e+050.0100000.8800000.0000000.6150000.6700000.23300015.00000016.000000...2300.0000001535.0000004800.00000077.0000002.0000003.0000007800.0000004737.0000005.00000010.000000
50%6006.5000001.634942e+060.0100000.960000500.0000000.9700000.8600000.35000017.00000023.000000...3100.0000001810.0000007700.00000079.0000004.0000005.00000013800.0000007050.00000014.00000036.000000
75%8999.0000002.597905e+060.0200000.9900002000.0000001.6000001.0000000.48000020.00000032.000000...4300.0000002100.00000011700.00000080.0000007.0000007.00000020400.00000010000.00000024.00000091.000000
max11992.0000004.004694e+061.0000001.00000068000.00000047596.7400001.0000000.94100042.000000285.000000...10000.0000006900.00000087100.00000087.00000018.00000020.000000266400.00000082800.000000360.000000323.000000
\n", 631 | "

8 rows × 83 columns

\n", 632 | "
" 633 | ], 634 | "text/plain": [ 635 | " Unnamed: 0 custid low_volume_percent middle_volume_percent \\\n", 636 | "count 4754.000000 4.754000e+03 4752.000000 4752.000000 \n", 637 | "mean 6008.414178 1.690993e+06 0.021806 0.901294 \n", 638 | "std 3452.071428 1.034235e+06 0.041527 0.144856 \n", 639 | "min 5.000000 1.140000e+02 0.000000 0.000000 \n", 640 | "25% 3106.000000 7.593358e+05 0.010000 0.880000 \n", 641 | "50% 6006.500000 1.634942e+06 0.010000 0.960000 \n", 642 | "75% 8999.000000 2.597905e+06 0.020000 0.990000 \n", 643 | "max 11992.000000 4.004694e+06 1.000000 1.000000 \n", 644 | "\n", 645 | " take_amount_in_later_12_month_highest \\\n", 646 | "count 4754.000000 \n", 647 | "mean 1940.197728 \n", 648 | "std 3923.971494 \n", 649 | "min 0.000000 \n", 650 | "25% 0.000000 \n", 651 | "50% 500.000000 \n", 652 | "75% 2000.000000 \n", 653 | "max 68000.000000 \n", 654 | "\n", 655 | " trans_amount_increase_rate_lately trans_activity_month \\\n", 656 | "count 4751.000000 4752.000000 \n", 657 | "mean 14.160674 0.804411 \n", 658 | "std 694.180473 0.196920 \n", 659 | "min 0.000000 0.120000 \n", 660 | "25% 0.615000 0.670000 \n", 661 | "50% 0.970000 0.860000 \n", 662 | "75% 1.600000 1.000000 \n", 663 | "max 47596.740000 1.000000 \n", 664 | "\n", 665 | " trans_activity_day transd_mcc trans_days_interval_filter \\\n", 666 | "count 4752.000000 4752.000000 4746.000000 \n", 667 | "mean 0.365425 17.502946 29.029920 \n", 668 | "std 0.170196 4.475616 22.722432 \n", 669 | "min 0.033000 2.000000 0.000000 \n", 670 | "25% 0.233000 15.000000 16.000000 \n", 671 | "50% 0.350000 17.000000 23.000000 \n", 672 | "75% 0.480000 20.000000 32.000000 \n", 673 | "max 0.941000 42.000000 285.000000 \n", 674 | "\n", 675 | " ... loans_max_limit loans_avg_limit \\\n", 676 | "count ... 4457.000000 4457.000000 \n", 677 | "mean ... 3390.038142 1820.357864 \n", 678 | "std ... 1474.206546 583.418291 \n", 679 | "min ... 0.000000 0.000000 \n", 680 | "25% ... 2300.000000 1535.000000 \n", 681 | "50% ... 
3100.000000 1810.000000 \n", 682 | "75% ... 4300.000000 2100.000000 \n", 683 | "max ... 10000.000000 6900.000000 \n", 684 | "\n", 685 | " consfin_credit_limit consfin_credibility consfin_org_count_current \\\n", 686 | "count 4457.000000 4457.000000 4457.000000 \n", 687 | "mean 9187.009199 76.042630 4.732331 \n", 688 | "std 7371.257043 14.536819 2.974596 \n", 689 | "min 0.000000 0.000000 0.000000 \n", 690 | "25% 4800.000000 77.000000 2.000000 \n", 691 | "50% 7700.000000 79.000000 4.000000 \n", 692 | "75% 11700.000000 80.000000 7.000000 \n", 693 | "max 87100.000000 87.000000 18.000000 \n", 694 | "\n", 695 | " consfin_product_count consfin_max_limit consfin_avg_limit \\\n", 696 | "count 4457.000000 4457.000000 4457.000000 \n", 697 | "mean 5.227507 16153.690823 8007.696881 \n", 698 | "std 3.409292 14301.037628 5679.418585 \n", 699 | "min 0.000000 0.000000 0.000000 \n", 700 | "25% 3.000000 7800.000000 4737.000000 \n", 701 | "50% 5.000000 13800.000000 7050.000000 \n", 702 | "75% 7.000000 20400.000000 10000.000000 \n", 703 | "max 20.000000 266400.000000 82800.000000 \n", 704 | "\n", 705 | " latest_query_day loans_latest_day \n", 706 | "count 4450.000000 4457.000000 \n", 707 | "mean 24.112809 55.181512 \n", 708 | "std 37.725724 53.486408 \n", 709 | "min -2.000000 -2.000000 \n", 710 | "25% 5.000000 10.000000 \n", 711 | "50% 14.000000 36.000000 \n", 712 | "75% 24.000000 91.000000 \n", 713 | "max 360.000000 323.000000 \n", 714 | "\n", 715 | "[8 rows x 83 columns]" 716 | ] 717 | }, 718 | "execution_count": 3, 719 | "metadata": {}, 720 | "output_type": "execute_result" 721 | } 722 | ], 723 | "source": [ 724 | "df.describe() #可以看到统计信息" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 4, 730 | "metadata": { 731 | "collapsed": false 732 | }, 733 | "outputs": [ 734 | { 735 | "name": "stdout", 736 | "output_type": "stream", 737 | "text": [ 738 | "we have 70 columns in type float64, they are ['low_volume_percent', 'middle_volume_percent', 
'trans_amount_increase_rate_lately', 'trans_activity_month', 'trans_activity_day', 'transd_mcc', 'trans_days_interval_filter', 'trans_days_interval', 'regional_mobility', 'student_feature', 'number_of_trans_from_2011', 'first_transaction_time', 'historical_trans_day', 'rank_trad_1_month', 'avg_consume_less_12_valid_month', 'top_trans_count_last_1_month', 'avg_price_top_last_12_valid_month', 'trans_top_time_last_1_month', 'trans_top_time_last_6_month', 'consume_top_time_last_1_month', 'consume_top_time_last_6_month', 'cross_consume_count_last_1_month', 'trans_fail_top_count_enum_last_1_month', 'trans_fail_top_count_enum_last_6_month', 'trans_fail_top_count_enum_last_12_month', 'consume_mini_time_last_1_month', 'max_consume_count_later_6_month', 'railway_consume_count_last_12_month', 'jewelry_consume_count_last_6_month', 'first_transaction_day', 'trans_day_last_12_month', 'apply_score', 'apply_credibility', 'query_org_count', 'query_finance_count', 'query_cash_count', 'query_sum_count', 'latest_one_month_apply', 'latest_three_month_apply', 'latest_six_month_apply', 'loans_score', 'loans_credibility_behavior', 'loans_count', 'loans_settle_count', 'loans_overdue_count', 'loans_org_count_behavior', 'consfin_org_count_behavior', 'loans_cash_count', 'latest_one_month_loan', 'latest_three_month_loan', 'latest_six_month_loan', 'history_suc_fee', 'history_fail_fee', 'latest_one_month_suc', 'latest_one_month_fail', 'loans_long_time', 'loans_credit_limit', 'loans_credibility_limit', 'loans_org_count_current', 'loans_product_count', 'loans_max_limit', 'loans_avg_limit', 'consfin_credit_limit', 'consfin_credibility', 'consfin_org_count_current', 'consfin_product_count', 'consfin_max_limit', 'consfin_avg_limit', 'latest_query_day', 'loans_latest_day']\\\n", 739 | "we have 13 columns in type int64, they are ['Unnamed: 0', 'custid', 'take_amount_in_later_12_month_highest', 'repayment_capability', 'is_high_user', 'historical_trans_amount', 'trans_amount_3_month', 'abs', 
'avg_price_last_12_month', 'max_cumulative_consume_later_1_month', 'pawns_auctions_trusts_consume_last_1_month', 'pawns_auctions_trusts_consume_last_6_month', 'status']\\\n", 740 | "we have 7 columns in type object, they are ['trade_no', 'bank_card_no', 'reg_preference_for_trad', 'source', 'id_name', 'latest_query_time', 'loans_latest_time']\\\n" 741 | ] 742 | } 743 | ], 744 | "source": [ 745 | "#提取我们需要的信息,特别是每个特征的类别信息\n", 746 | "\n", 747 | "def get_data_type(df):\n", 748 | " typedic= {} # 类型字典\n", 749 | " for name in df.columns:\n", 750 | " typedic[str(df[name].dtype)] = typedic.get(str(df[name].dtype),[])+[name]\n", 751 | " for key,value in typedic.items():\n", 752 | " # print('we have {} columns in type {}'.format(len(value),key))\n", 753 | " print('we have {} columns in type {}, they are {}\\\\'.format(len(value),key,value))\n", 754 | "\n", 755 | "get_data_type(df)" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "### 二、数据预处理 \n", 763 | "无关特征删除 \n", 764 | "数据类型转换 \n", 765 | "缺失值处理等等" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": {}, 771 | "source": [ 772 | "### 1 . 
无关特征删除(每个特征过一下) \n", 773 | "有90个特征,选出应该去除的\n", 774 | "\n", 775 | "先对 类型为object的查看一下:['trade_no', 'bank_card_no', 'reg_preference_for_trad', 'source', 'id_name', 'latest_query_time', 'loans_latest_time']\n", 776 | "'bank_card_no' :取值都是卡号1,去掉\n", 777 | "'id_name':客户名字,去掉\n", 778 | "\n", 779 | "\n", 780 | "Unnamed: 0: 应该是原来的数据序号,删掉一些无用数据,造成序号不连续。可以去掉 \n", 781 | "custid : 顾客id号,没有分析价值,可以去掉\n", 782 | " \n", 783 | " \n" 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": 5, 789 | "metadata": { 790 | "collapsed": false, 791 | "scrolled": true 792 | }, 793 | "outputs": [ 794 | { 795 | "data": { 796 | "text/plain": [ 797 | "Index(['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no',\n", 798 | " 'low_volume_percent', 'middle_volume_percent',\n", 799 | " 'take_amount_in_later_12_month_highest',\n", 800 | " 'trans_amount_increase_rate_lately', 'trans_activity_month',\n", 801 | " 'trans_activity_day', 'transd_mcc', 'trans_days_interval_filter',\n", 802 | " 'trans_days_interval', 'regional_mobility', 'student_feature',\n", 803 | " 'repayment_capability', 'is_high_user', 'number_of_trans_from_2011',\n", 804 | " 'first_transaction_time', 'historical_trans_amount',\n", 805 | " 'historical_trans_day', 'rank_trad_1_month', 'trans_amount_3_month',\n", 806 | " 'avg_consume_less_12_valid_month', 'abs',\n", 807 | " 'top_trans_count_last_1_month', 'avg_price_last_12_month',\n", 808 | " 'avg_price_top_last_12_valid_month', 'reg_preference_for_trad',\n", 809 | " 'trans_top_time_last_1_month', 'trans_top_time_last_6_month',\n", 810 | " 'consume_top_time_last_1_month', 'consume_top_time_last_6_month',\n", 811 | " 'cross_consume_count_last_1_month',\n", 812 | " 'trans_fail_top_count_enum_last_1_month',\n", 813 | " 'trans_fail_top_count_enum_last_6_month',\n", 814 | " 'trans_fail_top_count_enum_last_12_month',\n", 815 | " 'consume_mini_time_last_1_month',\n", 816 | " 'max_cumulative_consume_later_1_month',\n", 817 | " 'max_consume_count_later_6_month',\n", 818 | " 
'railway_consume_count_last_12_month',\n", 819 | " 'pawns_auctions_trusts_consume_last_1_month',\n", 820 | " 'pawns_auctions_trusts_consume_last_6_month',\n", 821 | " 'jewelry_consume_count_last_6_month', 'status', 'source',\n", 822 | " 'first_transaction_day', 'trans_day_last_12_month', 'id_name',\n", 823 | " 'apply_score', 'apply_credibility', 'query_org_count',\n", 824 | " 'query_finance_count', 'query_cash_count', 'query_sum_count',\n", 825 | " 'latest_query_time', 'latest_one_month_apply',\n", 826 | " 'latest_three_month_apply', 'latest_six_month_apply', 'loans_score',\n", 827 | " 'loans_credibility_behavior', 'loans_count', 'loans_settle_count',\n", 828 | " 'loans_overdue_count', 'loans_org_count_behavior',\n", 829 | " 'consfin_org_count_behavior', 'loans_cash_count',\n", 830 | " 'latest_one_month_loan', 'latest_three_month_loan',\n", 831 | " 'latest_six_month_loan', 'history_suc_fee', 'history_fail_fee',\n", 832 | " 'latest_one_month_suc', 'latest_one_month_fail', 'loans_long_time',\n", 833 | " 'loans_latest_time', 'loans_credit_limit', 'loans_credibility_limit',\n", 834 | " 'loans_org_count_current', 'loans_product_count', 'loans_max_limit',\n", 835 | " 'loans_avg_limit', 'consfin_credit_limit', 'consfin_credibility',\n", 836 | " 'consfin_org_count_current', 'consfin_product_count',\n", 837 | " 'consfin_max_limit', 'consfin_avg_limit', 'latest_query_day',\n", 838 | " 'loans_latest_day'],\n", 839 | " dtype='object')" 840 | ] 841 | }, 842 | "execution_count": 5, 843 | "metadata": {}, 844 | "output_type": "execute_result" 845 | } 846 | ], 847 | "source": [ 848 | "df.columns" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 6, 854 | "metadata": { 855 | "collapsed": false, 856 | "scrolled": true 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "0 20180507115231274000000023057383\n", 863 | "1 20180507121002192000000023073000\n", 864 | "2 20180507125159718000000023114911\n", 865 | "3 
20180507121358683000000388283484\n", 866 | "4 20180507115448545000000388205844\n", 867 | "Name: trade_no, dtype: object" 868 | ] 869 | }, 870 | "execution_count": 6, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "df['trade_no'].head(5)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "交易号,有时间信息,但后面也还有时间的特征,这里可以去掉" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 7, 889 | "metadata": { 890 | "collapsed": false, 891 | "scrolled": true 892 | }, 893 | "outputs": [ 894 | { 895 | "data": { 896 | "text/plain": [ 897 | "0 卡号1\n", 898 | "1 卡号1\n", 899 | "2 卡号1\n", 900 | "3 卡号1\n", 901 | "4 卡号1\n", 902 | "Name: bank_card_no, dtype: object" 903 | ] 904 | }, 905 | "execution_count": 7, 906 | "metadata": {}, 907 | "output_type": "execute_result" 908 | } 909 | ], 910 | "source": [ 911 | "df['bank_card_no'].head()" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": 8, 917 | "metadata": { 918 | "collapsed": false 919 | }, 920 | "outputs": [ 921 | { 922 | "data": { 923 | "text/plain": [ 924 | "xs 4754\n", 925 | "Name: source, dtype: int64" 926 | ] 927 | }, 928 | "execution_count": 8, 929 | "metadata": {}, 930 | "output_type": "execute_result" 931 | } 932 | ], 933 | "source": [ 934 | "#查看重复的值就可以去掉了\n", 935 | "\n", 936 | "df['source'].value_counts() #railway_consume_count_last_12_month\n", 937 | "\n", 938 | "# df['source'].duplicated().sum() #查看不同元素的数量" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "都是一样,删除" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": 9, 951 | "metadata": { 952 | "collapsed": false, 953 | "scrolled": true 954 | }, 955 | "outputs": [ 956 | { 957 | "data": { 958 | "text/plain": [ 959 | "0.0 4651\n", 960 | "1.0 72\n", 961 | "2.0 13\n", 962 | "4.0 3\n", 963 | "3.0 2\n", 964 | "30.0 1\n", 965 | "Name: 
railway_consume_count_last_12_month, dtype: int64" 966 | ] 967 | }, 968 | "execution_count": 9, 969 | "metadata": {}, 970 | "output_type": "execute_result" 971 | } 972 | ], 973 | "source": [ 974 | "df['railway_consume_count_last_12_month'].value_counts()" 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": {}, 980 | "source": [ 981 | "### 综上,删掉的值有'Unnamed: 0','custid','trade_no','bank_card_no' ,'source','id_name'" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": 10, 987 | "metadata": { 988 | "collapsed": false, 989 | "scrolled": true 990 | }, 991 | "outputs": [ 992 | { 993 | "name": "stdout", 994 | "output_type": "stream", 995 | "text": [ 996 | "84\n" 997 | ] 998 | }, 999 | { 1000 | "data": { 1001 | "text/plain": [ 1002 | "Index(['low_volume_percent', 'middle_volume_percent',\n", 1003 | " 'take_amount_in_later_12_month_highest',\n", 1004 | " 'trans_amount_increase_rate_lately', 'trans_activity_month',\n", 1005 | " 'trans_activity_day', 'transd_mcc', 'trans_days_interval_filter',\n", 1006 | " 'trans_days_interval', 'regional_mobility', 'student_feature',\n", 1007 | " 'repayment_capability', 'is_high_user', 'number_of_trans_from_2011',\n", 1008 | " 'first_transaction_time', 'historical_trans_amount',\n", 1009 | " 'historical_trans_day', 'rank_trad_1_month', 'trans_amount_3_month',\n", 1010 | " 'avg_consume_less_12_valid_month', 'abs',\n", 1011 | " 'top_trans_count_last_1_month', 'avg_price_last_12_month',\n", 1012 | " 'avg_price_top_last_12_valid_month', 'reg_preference_for_trad',\n", 1013 | " 'trans_top_time_last_1_month', 'trans_top_time_last_6_month',\n", 1014 | " 'consume_top_time_last_1_month', 'consume_top_time_last_6_month',\n", 1015 | " 'cross_consume_count_last_1_month',\n", 1016 | " 'trans_fail_top_count_enum_last_1_month',\n", 1017 | " 'trans_fail_top_count_enum_last_6_month',\n", 1018 | " 'trans_fail_top_count_enum_last_12_month',\n", 1019 | " 'consume_mini_time_last_1_month',\n", 1020 | " 
'max_cumulative_consume_later_1_month',\n", 1021 | " 'max_consume_count_later_6_month',\n", 1022 | " 'railway_consume_count_last_12_month',\n", 1023 | " 'pawns_auctions_trusts_consume_last_1_month',\n", 1024 | " 'pawns_auctions_trusts_consume_last_6_month',\n", 1025 | " 'jewelry_consume_count_last_6_month', 'status', 'first_transaction_day',\n", 1026 | " 'trans_day_last_12_month', 'apply_score', 'apply_credibility',\n", 1027 | " 'query_org_count', 'query_finance_count', 'query_cash_count',\n", 1028 | " 'query_sum_count', 'latest_query_time', 'latest_one_month_apply',\n", 1029 | " 'latest_three_month_apply', 'latest_six_month_apply', 'loans_score',\n", 1030 | " 'loans_credibility_behavior', 'loans_count', 'loans_settle_count',\n", 1031 | " 'loans_overdue_count', 'loans_org_count_behavior',\n", 1032 | " 'consfin_org_count_behavior', 'loans_cash_count',\n", 1033 | " 'latest_one_month_loan', 'latest_three_month_loan',\n", 1034 | " 'latest_six_month_loan', 'history_suc_fee', 'history_fail_fee',\n", 1035 | " 'latest_one_month_suc', 'latest_one_month_fail', 'loans_long_time',\n", 1036 | " 'loans_latest_time', 'loans_credit_limit', 'loans_credibility_limit',\n", 1037 | " 'loans_org_count_current', 'loans_product_count', 'loans_max_limit',\n", 1038 | " 'loans_avg_limit', 'consfin_credit_limit', 'consfin_credibility',\n", 1039 | " 'consfin_org_count_current', 'consfin_product_count',\n", 1040 | " 'consfin_max_limit', 'consfin_avg_limit', 'latest_query_day',\n", 1041 | " 'loans_latest_day'],\n", 1042 | " dtype='object')" 1043 | ] 1044 | }, 1045 | "execution_count": 10, 1046 | "metadata": {}, 1047 | "output_type": "execute_result" 1048 | } 1049 | ], 1050 | "source": [ 1051 | "df.drop(['Unnamed: 0','custid','trade_no','bank_card_no' ,'source','id_name'],inplace =True,axis = 1)\n", 1052 | "print(len(df.columns))\n", 1053 | "df.columns" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "markdown", 1058 | "metadata": { 1059 | "collapsed": true 1060 | }, 1061 | "source": [ 1062 | 
"### 2.数值类型转换" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 11, 1068 | "metadata": { 1069 | "collapsed": false 1070 | }, 1071 | "outputs": [ 1072 | { 1073 | "name": "stdout", 1074 | "output_type": "stream", 1075 | "text": [ 1076 | "we have 3 columns in type object, they are ['reg_preference_for_trad', 'latest_query_time', 'loans_latest_time']\\\n", 1077 | "we have 11 columns in type int64, they are ['take_amount_in_later_12_month_highest', 'repayment_capability', 'is_high_user', 'historical_trans_amount', 'trans_amount_3_month', 'abs', 'avg_price_last_12_month', 'max_cumulative_consume_later_1_month', 'pawns_auctions_trusts_consume_last_1_month', 'pawns_auctions_trusts_consume_last_6_month', 'status']\\\n", 1078 | "we have 70 columns in type float64, they are ['low_volume_percent', 'middle_volume_percent', 'trans_amount_increase_rate_lately', 'trans_activity_month', 'trans_activity_day', 'transd_mcc', 'trans_days_interval_filter', 'trans_days_interval', 'regional_mobility', 'student_feature', 'number_of_trans_from_2011', 'first_transaction_time', 'historical_trans_day', 'rank_trad_1_month', 'avg_consume_less_12_valid_month', 'top_trans_count_last_1_month', 'avg_price_top_last_12_valid_month', 'trans_top_time_last_1_month', 'trans_top_time_last_6_month', 'consume_top_time_last_1_month', 'consume_top_time_last_6_month', 'cross_consume_count_last_1_month', 'trans_fail_top_count_enum_last_1_month', 'trans_fail_top_count_enum_last_6_month', 'trans_fail_top_count_enum_last_12_month', 'consume_mini_time_last_1_month', 'max_consume_count_later_6_month', 'railway_consume_count_last_12_month', 'jewelry_consume_count_last_6_month', 'first_transaction_day', 'trans_day_last_12_month', 'apply_score', 'apply_credibility', 'query_org_count', 'query_finance_count', 'query_cash_count', 'query_sum_count', 'latest_one_month_apply', 'latest_three_month_apply', 'latest_six_month_apply', 'loans_score', 'loans_credibility_behavior', 'loans_count', 
'loans_settle_count', 'loans_overdue_count', 'loans_org_count_behavior', 'consfin_org_count_behavior', 'loans_cash_count', 'latest_one_month_loan', 'latest_three_month_loan', 'latest_six_month_loan', 'history_suc_fee', 'history_fail_fee', 'latest_one_month_suc', 'latest_one_month_fail', 'loans_long_time', 'loans_credit_limit', 'loans_credibility_limit', 'loans_org_count_current', 'loans_product_count', 'loans_max_limit', 'loans_avg_limit', 'consfin_credit_limit', 'consfin_credibility', 'consfin_org_count_current', 'consfin_product_count', 'consfin_max_limit', 'consfin_avg_limit', 'latest_query_day', 'loans_latest_day']\\\n" 1079 | ] 1080 | } 1081 | ], 1082 | "source": [ 1083 | "get_data_type(df)" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "markdown", 1088 | "metadata": {}, 1089 | "source": [ 1090 | "#### 非数值型的有三个 ['reg_preference_for_trad', 'latest_query_time', 'loans_latest_time']" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": 12, 1096 | "metadata": { 1097 | "collapsed": false 1098 | }, 1099 | "outputs": [ 1100 | { 1101 | "data": { 1102 | "text/plain": [ 1103 | "一线城市 3403\n", 1104 | "三线城市 1064\n", 1105 | "境外 150\n", 1106 | "二线城市 131\n", 1107 | "其他城市 4\n", 1108 | "NaN 2\n", 1109 | "Name: reg_preference_for_trad, dtype: int64" 1110 | ] 1111 | }, 1112 | "execution_count": 12, 1113 | "metadata": {}, 1114 | "output_type": "execute_result" 1115 | } 1116 | ], 1117 | "source": [ 1118 | "# 城市类型\n", 1119 | "df['reg_preference_for_trad'].value_counts(dropna = False) #" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "markdown", 1124 | "metadata": {}, 1125 | "source": [ 1126 | "将未知的填充为其他城市,并且转为数字" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": 13, 1132 | "metadata": { 1133 | "collapsed": false 1134 | }, 1135 | "outputs": [], 1136 | "source": [ 1137 | "df['reg_preference_for_trad'].fillna('其他城市',inplace = True)\n", 1138 | 
"df['reg_preference_for_trad'].replace({'一线城市':1,'二线城市':2,'三线城市':3,'境外':4,'其他城市':5},inplace = True)" 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "markdown", 1143 | "metadata": {}, 1144 | "source": [ 1145 | "'latest_query_time', 'loans_latest_time'为日期类型,先不动" 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "markdown", 1150 | "metadata": {}, 1151 | "source": [ 1152 | "### 3.缺失值处理" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 14, 1158 | "metadata": { 1159 | "collapsed": false, 1160 | "scrolled": true 1161 | }, 1162 | "outputs": [ 1163 | { 1164 | "data": { 1165 | "text/plain": [ 1166 | "low_volume_percent 2\n", 1167 | "middle_volume_percent 2\n", 1168 | "take_amount_in_later_12_month_highest 0\n", 1169 | "trans_amount_increase_rate_lately 3\n", 1170 | "trans_activity_month 2\n", 1171 | "trans_activity_day 2\n", 1172 | "transd_mcc 2\n", 1173 | "trans_days_interval_filter 8\n", 1174 | "trans_days_interval 2\n", 1175 | "regional_mobility 2\n", 1176 | "student_feature 2998\n", 1177 | "repayment_capability 0\n", 1178 | "is_high_user 0\n", 1179 | "number_of_trans_from_2011 2\n", 1180 | "first_transaction_time 2\n", 1181 | "historical_trans_amount 0\n", 1182 | "historical_trans_day 2\n", 1183 | "rank_trad_1_month 2\n", 1184 | "trans_amount_3_month 0\n", 1185 | "avg_consume_less_12_valid_month 2\n", 1186 | "abs 0\n", 1187 | "top_trans_count_last_1_month 2\n", 1188 | "avg_price_last_12_month 0\n", 1189 | "avg_price_top_last_12_valid_month 104\n", 1190 | "reg_preference_for_trad 0\n", 1191 | "trans_top_time_last_1_month 8\n", 1192 | "trans_top_time_last_6_month 8\n", 1193 | "consume_top_time_last_1_month 8\n", 1194 | "consume_top_time_last_6_month 8\n", 1195 | "cross_consume_count_last_1_month 426\n", 1196 | " ... 
\n", 1197 | "loans_credibility_behavior 297\n", 1198 | "loans_count 297\n", 1199 | "loans_settle_count 297\n", 1200 | "loans_overdue_count 297\n", 1201 | "loans_org_count_behavior 297\n", 1202 | "consfin_org_count_behavior 297\n", 1203 | "loans_cash_count 297\n", 1204 | "latest_one_month_loan 297\n", 1205 | "latest_three_month_loan 297\n", 1206 | "latest_six_month_loan 297\n", 1207 | "history_suc_fee 297\n", 1208 | "history_fail_fee 297\n", 1209 | "latest_one_month_suc 297\n", 1210 | "latest_one_month_fail 297\n", 1211 | "loans_long_time 297\n", 1212 | "loans_latest_time 297\n", 1213 | "loans_credit_limit 297\n", 1214 | "loans_credibility_limit 297\n", 1215 | "loans_org_count_current 297\n", 1216 | "loans_product_count 297\n", 1217 | "loans_max_limit 297\n", 1218 | "loans_avg_limit 297\n", 1219 | "consfin_credit_limit 297\n", 1220 | "consfin_credibility 297\n", 1221 | "consfin_org_count_current 297\n", 1222 | "consfin_product_count 297\n", 1223 | "consfin_max_limit 297\n", 1224 | "consfin_avg_limit 297\n", 1225 | "latest_query_day 304\n", 1226 | "loans_latest_day 297\n", 1227 | "Length: 84, dtype: int64" 1228 | ] 1229 | }, 1230 | "execution_count": 14, 1231 | "metadata": {}, 1232 | "output_type": "execute_result" 1233 | } 1234 | ], 1235 | "source": [ 1236 | "df.isnull().sum() #查看缺失值" 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "markdown", 1241 | "metadata": {}, 1242 | "source": [ 1243 | "学生缺的比较多" 1244 | ] 1245 | }, 1246 | { 1247 | "cell_type": "code", 1248 | "execution_count": 15, 1249 | "metadata": { 1250 | "collapsed": false, 1251 | "scrolled": true 1252 | }, 1253 | "outputs": [ 1254 | { 1255 | "data": { 1256 | "text/plain": [ 1257 | "NaN 2998\n", 1258 | " 1.0 1754\n", 1259 | " 2.0 2\n", 1260 | "Name: student_feature, dtype: int64" 1261 | ] 1262 | }, 1263 | "execution_count": 15, 1264 | "metadata": {}, 1265 | "output_type": "execute_result" 1266 | } 1267 | ], 1268 | "source": [ 1269 | "df['student_feature'].value_counts(dropna=False) 
#dropna默认为True,会把缺失值(NaN、None等)去掉" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "markdown", 1274 | "metadata": {}, 1275 | "source": [ 1276 | "#### 这里缺失可能表示不是学生,暂时把NA看作0吧" 1277 | ] 1278 | }, 1279 | { 1280 | "cell_type": "code", 1281 | "execution_count": 16, 1282 | "metadata": { 1283 | "collapsed": false 1284 | }, 1285 | "outputs": [ 1286 | { 1287 | "data": { 1288 | "text/plain": [ 1289 | "0.0 2998\n", 1290 | "1.0 1754\n", 1291 | "2.0 2\n", 1292 | "Name: student_feature, dtype: int64" 1293 | ] 1294 | }, 1295 | "execution_count": 16, 1296 | "metadata": {}, 1297 | "output_type": "execute_result" 1298 | } 1299 | ], 1300 | "source": [ 1301 | "df['student_feature'].fillna(0,inplace=True) #inplace=True代表直接修改源数据\n", 1302 | "df['student_feature'].value_counts(dropna=False)" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "markdown", 1307 | "metadata": {}, 1308 | "source": [ 1309 | "#### 其余特征缺失的话可以用众数填充 \n", 1310 | "也有说法是 \n", 1311 | "数值型取中位数 \n", 1312 | "日期取众数 " 1313 | ] 1314 | }, 1315 | { 1316 | "cell_type": "code", 1317 | "execution_count": 17, 1318 | "metadata": { 1319 | "collapsed": false 1320 | }, 1321 | "outputs": [ 1322 | { 1323 | "data": { 1324 | "text/html": [ 1325 | "
\n", 1326 | "\n", 1339 | "\n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " 
\n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | "
low_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_daytransd_mcctrans_days_interval_filtertrans_days_intervalregional_mobility...loans_max_limitloans_avg_limitconsfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_day
00.010.9900.900.550.31317.027.026.03.0...2900.01688.01200.075.01.02.01200.01200.012.018.0
10.020.9420001.281.000.45819.030.014.04.0...3500.01758.015100.080.05.06.022800.09360.04.02.0
20.040.9601.001.000.11413.068.022.01.0...1600.01250.04200.087.01.01.04200.04200.02.06.0
30.000.9620000.130.570.77722.014.06.03.0...3200.01541.016300.080.05.05.030000.012180.02.04.0
40.010.9900.461.000.17513.066.042.01.0...2300.01630.08300.079.02.02.08400.08250.022.0120.0
\n", 1489 | "

5 rows × 84 columns

\n", 1490 | "
" 1491 | ], 1492 | "text/plain": [ 1493 | " low_volume_percent middle_volume_percent \\\n", 1494 | "0 0.01 0.99 \n", 1495 | "1 0.02 0.94 \n", 1496 | "2 0.04 0.96 \n", 1497 | "3 0.00 0.96 \n", 1498 | "4 0.01 0.99 \n", 1499 | "\n", 1500 | " take_amount_in_later_12_month_highest trans_amount_increase_rate_lately \\\n", 1501 | "0 0 0.90 \n", 1502 | "1 2000 1.28 \n", 1503 | "2 0 1.00 \n", 1504 | "3 2000 0.13 \n", 1505 | "4 0 0.46 \n", 1506 | "\n", 1507 | " trans_activity_month trans_activity_day transd_mcc \\\n", 1508 | "0 0.55 0.313 17.0 \n", 1509 | "1 1.00 0.458 19.0 \n", 1510 | "2 1.00 0.114 13.0 \n", 1511 | "3 0.57 0.777 22.0 \n", 1512 | "4 1.00 0.175 13.0 \n", 1513 | "\n", 1514 | " trans_days_interval_filter trans_days_interval regional_mobility \\\n", 1515 | "0 27.0 26.0 3.0 \n", 1516 | "1 30.0 14.0 4.0 \n", 1517 | "2 68.0 22.0 1.0 \n", 1518 | "3 14.0 6.0 3.0 \n", 1519 | "4 66.0 42.0 1.0 \n", 1520 | "\n", 1521 | " ... loans_max_limit loans_avg_limit consfin_credit_limit \\\n", 1522 | "0 ... 2900.0 1688.0 1200.0 \n", 1523 | "1 ... 3500.0 1758.0 15100.0 \n", 1524 | "2 ... 1600.0 1250.0 4200.0 \n", 1525 | "3 ... 3200.0 1541.0 16300.0 \n", 1526 | "4 ... 
2300.0 1630.0 8300.0 \n", 1527 | "\n", 1528 | " consfin_credibility consfin_org_count_current consfin_product_count \\\n", 1529 | "0 75.0 1.0 2.0 \n", 1530 | "1 80.0 5.0 6.0 \n", 1531 | "2 87.0 1.0 1.0 \n", 1532 | "3 80.0 5.0 5.0 \n", 1533 | "4 79.0 2.0 2.0 \n", 1534 | "\n", 1535 | " consfin_max_limit consfin_avg_limit latest_query_day loans_latest_day \n", 1536 | "0 1200.0 1200.0 12.0 18.0 \n", 1537 | "1 22800.0 9360.0 4.0 2.0 \n", 1538 | "2 4200.0 4200.0 2.0 6.0 \n", 1539 | "3 30000.0 12180.0 2.0 4.0 \n", 1540 | "4 8400.0 8250.0 22.0 120.0 \n", 1541 | "\n", 1542 | "[5 rows x 84 columns]" 1543 | ] 1544 | }, 1545 | "execution_count": 17, 1546 | "metadata": {}, 1547 | "output_type": "execute_result" 1548 | } 1549 | ], 1550 | "source": [ 1551 | "# print(type(df.columns[1]))\n", 1552 | "for i in df.columns:\n", 1553 | " df[i].fillna(df[i].mode()[0],inplace = True) #加[0]是因为众数可能有多个,返回不是一个数字\n", 1554 | "df.head()" 1555 | ] 1556 | }, 1557 | { 1558 | "cell_type": "markdown", 1559 | "metadata": { 1560 | "collapsed": true 1561 | }, 1562 | "source": [ 1563 | "### 三、数据集切分" 1564 | ] 1565 | }, 1566 | { 1567 | "cell_type": "code", 1568 | "execution_count": 27, 1569 | "metadata": { 1570 | "collapsed": false 1571 | }, 1572 | "outputs": [ 1573 | { 1574 | "name": "stdout", 1575 | "output_type": "stream", 1576 | "text": [ 1577 | "(3327, 83) (1427, 83) (3327,) (1427,)\n" 1578 | ] 1579 | } 1580 | ], 1581 | "source": [ 1582 | "import numpy as np\n", 1583 | "from sklearn.model_selection import train_test_split\n", 1584 | "\n", 1585 | "Y=df['status']\n", 1586 | "X=df.drop('status',axis=1)\n", 1587 | "X_train,X_test,Y_train,Y_test =train_test_split(X,Y,test_size=0.3,random_state = 2018) \n", 1588 | "print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "execution_count": null, 1594 | "metadata": { 1595 | "collapsed": true 1596 | }, 1597 | "outputs": [], 1598 | "source": [] 1599 | } 1600 | ], 1601 | "metadata": { 1602 
| "anaconda-cloud": {}, 1603 | "kernelspec": { 1604 | "display_name": "Python [conda env:tensorflow-gpu]", 1605 | "language": "python", 1606 | "name": "conda-env-tensorflow-gpu-py" 1607 | }, 1608 | "language_info": { 1609 | "codemirror_mode": { 1610 | "name": "ipython", 1611 | "version": 3 1612 | }, 1613 | "file_extension": ".py", 1614 | "mimetype": "text/x-python", 1615 | "name": "python", 1616 | "nbconvert_exporter": "python", 1617 | "pygments_lexer": "ipython3", 1618 | "version": "3.5.4" 1619 | } 1620 | }, 1621 | "nbformat": 4, 1622 | "nbformat_minor": 1 1623 | } 1624 | -------------------------------------------------------------------------------- /数据分析实践/task2_特征衍生和选择.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 【Task 2】\n", 8 | "特征衍生 \n", 9 | "特征挑选 分别用IV值和随机森林等进行特征选择 \n", 10 | "……以及你能想到特征工程处理" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### iv值特征选择" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "def CalcIV(Xvar,Yvar):\n", 29 | " N_0=np.sum(Yvar==0)\n", 30 | " N_1=np.sum(Yvar==1)\n", 31 | " N_0_group=np.zeros(np.unique(Xvar).shape)\n", 32 | " \n", 33 | " N_1_group=np.zeros(np.unique(Xvar).shape)\n", 34 | " for i in range(len(np.unique(Xvar))):\n", 35 | " N_0_group[i] = Yvar[(Xvar==np.unique(Xvar)[i])&(Yvar==0)].count()\n", 36 | " N_1_group[i] = Yvar[(Xvar==np.unique(Xvar)[i])&(Yvar==1)].count()\n", 37 | " iv = np.sum((N_0_group/N_0-N_1_group/N_1)*np.log((N_0_group/N_0)/(N_1_group/N_1)))\n", 38 | " if iv>=1.0:## 处理极端值\n", 39 | " iv=1\n", 40 | " return iv\n", 41 | "\n", 42 | "def caliv_batch(df,Yvar):\n", 43 | " ivlist=[]\n", 44 | " for col in df.columns:\n", 45 | " iv=CalcIV(df[col],Yvar)\n", 46 | " ivlist.append(iv)\n", 47 | " 
names=list(df.columns)\n", 48 | " iv_df=pd.DataFrame({'Var':names,'Iv':ivlist},columns=['Var','Iv'])\n", 49 | "\n", 50 | " return iv_df,ivlist\n", 51 | "im_iv, ivl = caliv_batch(data_prepared.iloc[:,:-1],data_prepared.iloc[:,-1])" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### 随机森林选择特征" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "from sklearn.ensemble import RandomForestClassifier\n", 70 | "x_train = data_prepared.iloc[:,:-1]\n", 71 | "y_train = data_prepared.iloc[:,-1]\n", 72 | "feat_lables = x_train.columns\n", 73 | "forest = RandomForestClassifier(n_estimators=10000, random_state=0,n_jobs=1)\n", 74 | "forest.fit(x_train, y_train)\n", 75 | "importance = forest.feature_importances_\n", 76 | "imp_result = np.argsort(importance)[::-1]\n", 77 | "\n", 78 | "for i in range(x_train.shape[1]):\n", 79 | " print(\"%2d. 
%-*s %f\"%(i+1, 30, feat_lables[i], importance[imp_result[i]]))" 80 | ] 81 | } 82 | ], 83 | "metadata": { 84 | "anaconda-cloud": {}, 85 | "kernelspec": { 86 | "display_name": "Python [conda env:tensorflow-gpu]", 87 | "language": "python", 88 | "name": "conda-env-tensorflow-gpu-py" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.5.4" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 1 105 | } 106 | -------------------------------------------------------------------------------- /数据分析实践/task3_建模和评分.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 任务3 - 建模(2天)\n", 8 | "用逻辑回归、svm和决策树;随机森林和XGBoost进行模型构建,评分方式任意,如准确率等。(不需要考虑模型调参) \n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import warnings\n", 21 | "from sklearn.preprocessing import scale\n", 22 | "from sklearn.model_selection import cross_val_score\n", 23 | "from sklearn.linear_model import LogisticRegression\n", 24 | "from sklearn.tree import DecisionTreeClassifier\n", 25 | "from sklearn.svm import SVC\n", 26 | "from sklearn.ensemble import RandomForestClassifier\n", 27 | "from sklearn.ensemble import GradientBoostingClassifier\n", 28 | "from xgboost.sklearn import XGBClassifier\n", 29 | "import lightgbm as lgb\n", 30 | "\n", 31 | "\n", 32 | "# 读取数据集\n", 33 | "data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')\n", 34 | "\n", 35 | "# 划分为5折交叉验证数据集\n", 36 | "df_y=data_all['status']\n", 37 | "df_X=data_all.drop(columns=['status'])\n", 38 | 
"df_X=scale(df_X,axis=0) #将数据转化为标准数据\n", 39 | "#构建模型\n", 40 | "\n", 41 | "lr = LogisticRegression(random_state=2018,tol=1e-6) # 逻辑回归模型\n", 42 | "\n", 43 | "tree = DecisionTreeClassifier(random_state=2018) #决策树模型\n", 44 | "\n", 45 | "svm = SVC(probability=True,random_state=2018,tol=1e-6) # SVM模型\n", 46 | "\n", 47 | "forest=RandomForestClassifier(n_estimators=100,random_state=2018) # 随机森林\n", 48 | "\n", 49 | "Gbdt=GradientBoostingClassifier(random_state=2018) #CBDT\n", 50 | "\n", 51 | "Xgbc=XGBClassifier(random_state=2018) #Xgbc\n", 52 | "\n", 53 | "gbm=lgb.LGBMClassifier(random_state=2018) #lgb\n", 54 | "\n", 55 | "\n", 56 | "\n", 57 | "def muti_score(model):\n", 58 | " warnings.filterwarnings('ignore')\n", 59 | " accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5)\n", 60 | " precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5)\n", 61 | " recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5)\n", 62 | " f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5)\n", 63 | " auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5)\n", 64 | " print(\"准确率:\",accuracy.mean())\n", 65 | " print(\"精确率:\",precision.mean())\n", 66 | " print(\"召回率:\",recall.mean())\n", 67 | " print(\"F1_score:\",f1_score.mean())\n", 68 | " print(\"AUC:\",auc.mean())\n", 69 | "\n", 70 | "\n", 71 | "\n", 72 | "model_name=[\"lr\",\"tree\",\"svm\",\"forest\",\"Gbdt\",\"Xgbc\",\"gbm\"]\n", 73 | "for name in model_name:\n", 74 | " model=eval(name)\n", 75 | " print(name)\n", 76 | " muti_score(model)\n", 77 | "\n", 78 | "\n", 79 | "'''\n", 80 | "lr\n", 81 | "准确率: 0.7890191148682617\n", 82 | "精确率: 0.6542724662896913\n", 83 | "召回率: 0.3377975457965613\n", 84 | "F1_score: 0.44525012166067884\n", 85 | "AUC: 0.7840451024530857\n", 86 | "tree\n", 87 | "准确率: 0.6962524533638791\n", 88 | "精确率: 0.39920670173446693\n", 89 | "召回率: 0.4157413593052284\n", 90 | "F1_score: 0.40705496051057793\n", 91 | "AUC: 0.6029856787858856\n", 92 | "svm\n", 93 | 
"准确率: 0.787758390223099\n", 94 | "精确率: 0.7351623295760905\n", 95 | "召回率: 0.24060335431243626\n", 96 | "F1_score: 0.36179547264664874\n", 97 | "AUC: 0.7640376541388867\n", 98 | "forest\n", 99 | "准确率: 0.7921756804332226\n", 100 | "精确率: 0.7135700690071172\n", 101 | "召回率: 0.2867128441334693\n", 102 | "F1_score: 0.40835414886475174\n", 103 | "AUC: 0.7752164698827589\n", 104 | "Gbdt\n", 105 | "准确率: 0.7938590063951863\n", 106 | "精确率: 0.6604108594441386\n", 107 | "召回率: 0.36633732991104395\n", 108 | "F1_score: 0.4708811551285791\n", 109 | "AUC: 0.7888240065764295\n", 110 | "Xgbc\n", 111 | "准确率: 0.7982740847293591\n", 112 | "精确率: 0.6829783239831001\n", 113 | "召回率: 0.3663162336064133\n", 114 | "F1_score: 0.47673826685376613\n", 115 | "AUC: 0.7914190511145234\n", 116 | "gbm\n", 117 | "准确率: 0.79049080811139\n", 118 | "精确率: 0.6421783397519263\n", 119 | "召回率: 0.3730354066312717\n", 120 | "F1_score: 0.47150438344663004\n", 121 | "AUC: 0.7776116341798183\n", 122 | "'''" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "anaconda-cloud": {}, 128 | "kernelspec": { 129 | "display_name": "Python [conda env:tensorflow-gpu]", 130 | "language": "python", 131 | "name": "conda-env-tensorflow-gpu-py" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.5.4" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 1 148 | } 149 | -------------------------------------------------------------------------------- /数据分析实践/task5_模型调优.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "【Task 5】 模型调优(2天) \n", 8 | "任务5:使用网格搜索法对5个模型进行调优(调参时采用五折交叉验证的方式),并进行模型评估,记得展示代码的运行结果。 \n", 9 | "时间:2天 " 10 | ] 11 | }, 12 | { 13 | 
"cell_type": "code", 14 | "execution_count": 39, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd \n", 21 | "from sklearn.model_selection import train_test_split\n", 22 | "\n", 23 | "from sklearn.preprocessing import StandardScaler\n", 24 | "from sklearn.linear_model import LogisticRegression\n", 25 | "from sklearn.svm import LinearSVC\n", 26 | "from sklearn.tree import DecisionTreeClassifier\n", 27 | "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n", 28 | "from sklearn.metrics import classification_report\n", 29 | "import xgboost as xgb\n", 30 | "import lightgbm as lgb\n", 31 | "from pandas import DataFrame,Series\n", 32 | "\n", 33 | "df = pd.read_csv('data.csv',encoding = 'gbk')\n", 34 | "\n", 35 | "\"\"\"\n", 36 | "数据处理\n", 37 | "\"\"\"\n", 38 | "\n", 39 | "### drop irrelevant features ###\n", 40 | "df.drop(['Unnamed: 0','custid','trade_no','bank_card_no' ,'source','id_name'],inplace =True,axis = 1)\n", 41 | "\n", 42 | "### type conversion ### (mainly for object, i.e. text, columns); assign back instead of chained inplace so df is really updated\n", 43 | "df['reg_preference_for_trad'] = df['reg_preference_for_trad'].fillna('其他城市')\n", 44 | "df['reg_preference_for_trad'] = df['reg_preference_for_trad'].replace({'一线城市':1,'二线城市':2,'三线城市':3,'境外':4,'其他城市':5})\n", 45 | "\n", 46 | "# date columns 'latest_query_time', 'loans_latest_time' (dropped for now???)\n", 47 | "\n", 48 | "df.drop(['latest_query_time', 'loans_latest_time'],inplace =True,axis = 1)\n", 49 | "\n", 50 | "\n", 51 | "### missing values ###\n", 52 | "df['student_feature'] = df['student_feature'].fillna(0)\n", 53 | "for i in df.columns:\n", 54 | "    df[i] = df[i].fillna(df[i].mode()[0]) # [0]: mode() can return several values rather than a single number" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### 评估方法一:切分数据集" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 40, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "### split the dataset ###\n", 73 | "\n", 74 | "y=df['status']\n", 75 | 
"x=df.drop('status',axis=1)\n", 76 | "x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.3,random_state = 2018) \n", 77 | "\n", 78 | "features = x_train.columns # currently 83 features\n", 79 | "scaler = StandardScaler()\n", 80 | "x_train = scaler.fit_transform(x_train)\n", 81 | "x_test = scaler.transform(x_test) # reuse the mean/std fitted on x_train; refitting on the test split scales it differently" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 41, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | " precision recall f1-score support\n", 96 | "\n", 97 | " 0 0.78588 0.92790 0.85101 1068\n", 98 | " 1 0.53614 0.24791 0.33905 359\n", 99 | "\n", 100 | "avg / total 0.72306 0.75683 0.72221 1427\n", 101 | "\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# random forest\n", 107 | "clf_rf = RandomForestClassifier()\n", 108 | "clf_rf.fit(x_train, y_train)\n", 109 | "rf_y_pred = clf_rf.predict(x_test)\n", 110 | "\n", 111 | "# evaluation\n", 112 | "ans = classification_report(y_test,rf_y_pred,digits=5)\n", 113 | "print(ans)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### 评估方法二:交叉验证" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 42, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "[ 0.77521008 0.78338591 0.77707676 0.76526316 0.77473684] 0.775134550981\n", 135 | " precision recall f1-score support\n", 136 | "\n", 137 | " 0 0.79206 0.94664 0.86248 3561\n", 138 | " 1 0.61847 0.25817 0.36428 1193\n", 139 | "\n", 140 | "avg / total 0.74850 0.77387 0.73746 4754\n", 141 | "\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "# Cross-validation evaluates the model directly; the full dataset is used here rather than the training split\n", 147 | "from sklearn.model_selection import cross_val_predict,cross_val_score\n", 148 | "\n", 149 | "# standardize first (makes no difference here either way)\n", 150 | "scaler = StandardScaler()\n", 151 | "x = scaler.fit_transform(x)\n", 
152 | "x= scaler.fit_transform(x)\n", 153 | "\n", 154 | "model_rf = RandomForestClassifier()\n", 155 | "score_rf = cross_val_score(model_rf, x,y,cv = 5)\n", 156 | "pred_rf = cross_val_predict(model_rf, x,y,cv = 5)\n", 157 | "print(score_rf,score_rf.mean())\n", 158 | "ans1 = classification_report(y,pred_rf,digits=5)\n", 159 | "print(ans1)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## 模型微调" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### 方法一:网格搜索" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 46, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "GridSearchCV(cv=5, error_score='raise',\n", 187 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 188 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 189 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 190 | " min_samples_leaf=1, min_samples_split=2,\n", 191 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 192 | " oob_score=False, random_state=None, verbose=0,\n", 193 | " warm_start=False),\n", 194 | " fit_params=None, iid=True, n_jobs=1,\n", 195 | " param_grid=[{'max_features': [2, 4, 6, 8], 'n_estimators': [3, 10, 30]}, {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}],\n", 196 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", 197 | " scoring=None, verbose=0)" 198 | ] 199 | }, 200 | "execution_count": 46, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "from sklearn.model_selection import GridSearchCV\n", 207 | "param_grid = [\n", 208 | "{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n", 209 | "{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n", 210 | "]\n", 211 | "rf_model 
= RandomForestClassifier()\n", 212 | "grid_search = GridSearchCV(rf_model, param_grid, cv=5)\n", 213 | "grid_search.fit(x,y)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 52, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "最高得分:0.78776\n", 228 | "最优参数:n_estimators:30 max_features:8 bootstrap:True\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "# 搜索结果\n", 234 | "print('最高得分:%.5f'% grid_search.best_score_)\n", 235 | "print('最优参数:n_estimators:{} max_features:{} bootstrap:{}'.format(grid_search.best_estimator_.n_estimators,\\\n", 236 | " grid_search.best_estimator_.max_features,grid_search.best_estimator_.bootstrap))" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "anaconda-cloud": {}, 251 | "kernelspec": { 252 | "display_name": "Python [default]", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.5.2" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 1 271 | } 272 | -------------------------------------------------------------------------------- /数据分析实践/task6_模型融合.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 【Task 6】 模型融合(2天) \n", 8 | "模型融合方式任意,并结合Task5给出你的最优结果。 \n", 9 | "时间:2天 \n", 10 | "例如Stacking融合,用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分结果。 " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | 
"execution_count": 5, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd \n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "\n", 22 | "from sklearn.preprocessing import StandardScaler\n", 23 | "from sklearn.linear_model import LogisticRegression\n", 24 | "from sklearn.svm import SVC\n", 25 | "from sklearn.tree import DecisionTreeClassifier\n", 26 | "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n", 27 | "from sklearn.metrics import classification_report\n", 28 | "import xgboost as xgb\n", 29 | "import lightgbm as lgb\n", 30 | "from pandas import DataFrame,Series\n", 31 | "\n", 32 | "df = pd.read_csv('data.csv',encoding = 'gbk')\n", 33 | "\n", 34 | "\"\"\"\n", 35 | "数据处理\n", 36 | "\"\"\"\n", 37 | "\n", 38 | "### drop irrelevant features ###\n", 39 | "df.drop(['Unnamed: 0','custid','trade_no','bank_card_no' ,'source','id_name'],inplace =True,axis = 1)\n", 40 | "\n", 41 | "### type conversion ### (mainly for object, i.e. text, columns); assign back instead of chained inplace so df is really updated\n", 42 | "df['reg_preference_for_trad'] = df['reg_preference_for_trad'].fillna('其他城市')\n", 43 | "df['reg_preference_for_trad'] = df['reg_preference_for_trad'].replace({'一线城市':1,'二线城市':2,'三线城市':3,'境外':4,'其他城市':5})\n", 44 | "\n", 45 | "# date columns 'latest_query_time', 'loans_latest_time' (dropped for now???)\n", 46 | "\n", 47 | "df.drop(['latest_query_time', 'loans_latest_time'],inplace =True,axis = 1)\n", 48 | "\n", 49 | "\n", 50 | "### missing values ###\n", 51 | "df['student_feature'] = df['student_feature'].fillna(0)\n", 52 | "for i in df.columns:\n", 53 | "    df[i] = df[i].fillna(df[i].mode()[0]) # [0]: mode() can return several values rather than a single number\n", 54 | " \n", 55 | "### split the dataset ###\n", 56 | "\n", 57 | "y=df['status']\n", 58 | "x=df.drop('status',axis=1)\n", 59 | "x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.3,random_state = 2018) \n", 60 | "\n", 61 | "\n", 62 | "### standardization ###\n", 63 | "features = x_train.columns # currently 83 features\n", 64 | "scaler = StandardScaler()\n", 65 | "x_train = scaler.fit_transform(x_train)\n", 66 | "x_test = 
scaler.transform(x_test) # reuse the mean/std fitted on x_train; refitting on the test split scales it differently\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### 选三个效果好的模型" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 6, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | " precision recall f1-score support\n", 86 | "\n", 87 | " 0 0.81006 0.93446 0.86783 1068\n", 88 | " 1 0.64103 0.34819 0.45126 359\n", 89 | "\n", 90 | " accuracy 0.78697 1427\n", 91 | " macro avg 0.72555 0.64132 0.65954 1427\n", 92 | "weighted avg 0.76754 0.78697 0.76303 1427\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "name": "stderr", 98 | "output_type": "stream", 99 | "text": [ 100 | "D:\\MyCodeEnvironment\\Anaconda3\\envs\\p3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 101 | " FutureWarning)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# LR\n", 107 | "LR = LogisticRegression()\n", 108 | "LR.fit(x_train, y_train)\n", 109 | "lr_y_pred = LR.predict(x_test)\n", 110 | "ans = classification_report(y_test,lr_y_pred,digits=5)\n", 111 | "print(ans)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": { 118 | "scrolled": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | " precision recall f1-score support\n", 126 | "\n", 127 | " 0 0.78895 0.96255 0.86714 1068\n", 128 | " 1 0.67742 0.23398 0.34783 359\n", 129 | "\n", 130 | " accuracy 0.77926 1427\n", 131 | " macro avg 0.73318 0.59827 0.60749 1427\n", 132 | "weighted avg 0.76089 0.77926 0.73650 1427\n", 133 | "\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "#svm\n", 139 | "SVM = SVC(gamma='auto',probability=True)\n", 140 | "SVM.fit(x_train, y_train)\n", 141 | "svm_y_pred = SVM.predict(x_test)\n", 142 | "svm_y_pro = 
SVM.predict_proba(x_test)\n", 143 | "# print(svm_y_pro)\n", 144 | "ans = classification_report(y_test,svm_y_pred,digits=5)\n", 145 | "print(ans)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | " precision recall f1-score support\n", 158 | "\n", 159 | " 0 0.79694 0.92603 0.85665 1068\n", 160 | " 1 0.57527 0.29805 0.39266 359\n", 161 | "\n", 162 | " accuracy 0.76804 1427\n", 163 | " macro avg 0.68610 0.61204 0.62465 1427\n", 164 | "weighted avg 0.74117 0.76804 0.73992 1427\n", 165 | "\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "# lgbm\n", 171 | "LGBM = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=250,\n", 172 | " max_bin=255, subsample_for_bin=200000, objective=None, min_split_gain=0.0, min_child_weight=0.001,\n", 173 | " min_child_samples=20, subsample=1.0, subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,\n", 174 | " reg_lambda=0.5, random_state=None, n_jobs=-1, silent=True)\n", 175 | "LGBM.fit(x_train, y_train)\n", 176 | "lgbm_y_pred = LGBM.predict(x_test)\n", 177 | "\n", 178 | "ans = classification_report(y_test, lgbm_y_pred,digits=5)\n", 179 | "print(ans)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | " precision recall f1-score support\n", 192 | "\n", 193 | " 0 0.79433 0.94382 0.86264 1068\n", 194 | " 1 0.62025 0.27298 0.37911 359\n", 195 | "\n", 196 | " accuracy 0.77505 1427\n", 197 | " macro avg 0.70729 0.60840 0.62088 1427\n", 198 | "weighted avg 0.75053 0.77505 0.74100 1427\n", 199 | "\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# XGB\n", 205 | "XGB = xgb.XGBClassifier(max_depth=6, num_class =2,learning_rate=0.1, n_estimators=100, silent=True, 
objective='multi:softmax',\n", 206 | " nthread=32, gamma=0.1, min_child_weight=3, max_delta_step=0, subsample=1, colsample_bytree=1,\n", 207 | " colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=2018,\n", 208 | " missing=None)\n", 209 | "XGB.fit(x_train, y_train)\n", 210 | "xgb2_y_pred = XGB.predict(x_test)\n", 211 | "\n", 212 | "ans = classification_report(y_test, xgb2_y_pred,digits=5)\n", 213 | "print(ans)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | " precision recall f1-score support\n", 226 | "\n", 227 | " 0 0.79709 0.76873 0.78265 1068\n", 228 | " 1 0.37783 0.41783 0.39683 359\n", 229 | "\n", 230 | " accuracy 0.68045 1427\n", 231 | " macro avg 0.58746 0.59328 0.58974 1427\n", 232 | "weighted avg 0.69161 0.68045 0.68559 1427\n", 233 | "\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "DT = DecisionTreeClassifier()\n", 239 | "DT.fit(x_train, y_train)\n", 240 | "dt_y_pred = DT.predict(x_test)\n", 241 | "dt_y_pro = DT.predict_proba(x_test)\n", 242 | "# print(svm_y_pro)\n", 243 | "ans = classification_report(y_test,dt_y_pred,digits=5)\n", 244 | "print(ans)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "### 模型融合" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stderr", 261 | "output_type": "stream", 262 | "text": [ 263 | "D:\\MyCodeEnvironment\\Anaconda3\\envs\\p3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", 264 | " FutureWarning)\n" 265 | ] 266 | }, 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | " precision recall f1-score support\n", 272 | "\n", 273 | " 0 0.79652 0.94195 0.86315 1068\n", 274 | " 1 0.62195 0.28412 0.39006 359\n", 275 | "\n", 276 | " accuracy 0.77645 1427\n", 277 | " macro avg 0.70923 0.61304 0.62660 1427\n", 278 | "weighted avg 0.75260 0.77645 0.74413 1427\n", 279 | "\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "from mlxtend.classifier import StackingCVClassifier\n", 285 | "StackingModel = StackingCVClassifier(classifiers=[XGB,LGBM, SVM],\n", 286 | " use_probas=True, \n", 287 | " meta_classifier=LR,\n", 288 | " cv=5,\n", 289 | " )\n", 290 | "StackingModel.fit(x_train, y_train)\n", 291 | "sm_y_pred = StackingModel.predict(x_test)\n", 292 | "ans = classification_report(y_test,sm_y_pred,digits=5)\n", 293 | "print(ans)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [] 304 | } 305 | ], 306 | "metadata": { 307 | "anaconda-cloud": {}, 308 | "kernelspec": { 309 | "display_name": "Python 3", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.6.9" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 1 328 | } 329 | -------------------------------------------------------------------------------- /数据分析实践/多分类的评估问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 52, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | 
{ 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "[[ 8.79681649e-01 1.20307538e-01 1.08131372e-05]\n", 15 | " [ 7.99706325e-01 2.00263292e-01 3.03825365e-05]\n", 16 | " [ 8.53796795e-01 1.46177302e-01 2.59031285e-05]\n", 17 | " [ 8.25383127e-01 1.74558937e-01 5.79356669e-05]\n", 18 | " [ 8.97323628e-01 1.02665167e-01 1.12050036e-05]\n", 19 | " [ 9.26986574e-01 7.30004562e-02 1.29693872e-05]\n", 20 | " [ 8.95064974e-01 1.04895775e-01 3.92506205e-05]\n", 21 | " [ 8.61839691e-01 1.38141399e-01 1.89095833e-05]\n", 22 | " [ 8.03156719e-01 1.96758495e-01 8.47861140e-05]\n", 23 | " [ 7.95421554e-01 2.04552763e-01 2.56832240e-05]\n", 24 | " [ 8.92083069e-01 1.07910759e-01 6.17176870e-06]\n", 25 | " [ 8.63364991e-01 1.36600589e-01 3.44201798e-05]\n", 26 | " [ 7.88177618e-01 2.11794929e-01 2.74526810e-05]\n", 27 | " [ 8.35079702e-01 1.64888155e-01 3.21426418e-05]\n", 28 | " [ 9.28349898e-01 7.16491356e-02 9.66254924e-07]\n", 29 | " [ 9.64535656e-01 3.54620850e-02 2.25877936e-06]\n", 30 | " [ 9.40906153e-01 5.90890027e-02 4.84421830e-06]\n", 31 | " [ 8.91740161e-01 1.08245661e-01 1.41772124e-05]\n", 32 | " [ 8.96525617e-01 1.03467608e-01 6.77567332e-06]\n", 33 | " [ 9.23615524e-01 7.63726510e-02 1.18248373e-05]\n", 34 | " [ 8.30668332e-01 1.69316458e-01 1.52093733e-05]\n", 35 | " [ 9.21914602e-01 7.80675598e-02 1.78384021e-05]\n", 36 | " [ 9.26584671e-01 7.34068679e-02 8.46162713e-06]\n", 37 | " [ 8.67785629e-01 1.32146178e-01 6.81931916e-05]\n", 38 | " [ 8.41271506e-01 1.58655904e-01 7.25903122e-05]\n", 39 | " [ 7.77263282e-01 2.22695181e-01 4.15365716e-05]\n", 40 | " [ 8.81389224e-01 1.18568969e-01 4.18075826e-05]\n", 41 | " [ 8.69974782e-01 1.30013638e-01 1.15794893e-05]\n", 42 | " [ 8.60034106e-01 1.39955486e-01 1.04082979e-05]\n", 43 | " [ 8.32052869e-01 1.67892968e-01 5.41625519e-05]\n", 44 | " [ 8.07811588e-01 1.92136477e-01 5.19350231e-05]\n", 45 | " [ 8.72544939e-01 1.27438925e-01 1.61360155e-05]\n", 46 | " [ 9.33948477e-01 6.60477336e-02 
3.78900866e-06]\n", 47 | " [ 9.46250501e-01 5.37475145e-02 1.98493064e-06]\n", 48 | " [ 7.95421554e-01 2.04552763e-01 2.56832240e-05]\n", 49 | " [ 8.47610513e-01 1.52377535e-01 1.19520539e-05]\n", 50 | " [ 8.70019435e-01 1.29976367e-01 4.19728170e-06]\n", 51 | " [ 7.95421554e-01 2.04552763e-01 2.56832240e-05]\n", 52 | " [ 8.31024910e-01 1.68917216e-01 5.78737851e-05]\n", 53 | " [ 8.57737250e-01 1.42246900e-01 1.58501104e-05]\n", 54 | " [ 9.00222082e-01 9.97646975e-02 1.32206853e-05]\n", 55 | " [ 6.90741687e-01 3.09094698e-01 1.63615590e-04]\n", 56 | " [ 8.66068303e-01 1.33887708e-01 4.39884356e-05]\n", 57 | " [ 9.16308833e-01 8.36288777e-02 6.22895883e-05]\n", 58 | " [ 9.15519114e-01 8.44392129e-02 4.16734713e-05]\n", 59 | " [ 8.20309627e-01 1.79642381e-01 4.79919885e-05]\n", 60 | " [ 9.09855663e-01 9.01327650e-02 1.15724381e-05]\n", 61 | " [ 8.51214451e-01 1.48746052e-01 3.94971199e-05]\n", 62 | " [ 8.95519736e-01 1.04472911e-01 7.35323849e-06]\n", 63 | " [ 8.51563342e-01 1.48419676e-01 1.69821772e-05]\n", 64 | " [ 2.98900777e-02 8.60393138e-01 1.09716785e-01]\n", 65 | " [ 3.74487166e-02 7.05572459e-01 2.56978825e-01]\n", 66 | " [ 1.17957675e-02 7.48252356e-01 2.39951876e-01]\n", 67 | " [ 1.32920493e-02 6.51770445e-01 3.34937506e-01]\n", 68 | " [ 1.09868088e-02 6.98832091e-01 2.90181101e-01]\n", 69 | " [ 1.07669519e-02 5.83013186e-01 4.06219862e-01]\n", 70 | " [ 2.15200540e-02 5.37732882e-01 4.40747064e-01]\n", 71 | " [ 1.08418544e-01 7.68766189e-01 1.22815267e-01]\n", 72 | " [ 1.77270021e-02 8.27562690e-01 1.54710308e-01]\n", 73 | " [ 3.30493839e-02 5.28708770e-01 4.38241846e-01]\n", 74 | " [ 2.93117962e-02 7.72717609e-01 1.97970595e-01]\n", 75 | " [ 4.09569813e-02 6.19765980e-01 3.39277039e-01]\n", 76 | " [ 1.95378252e-02 8.79697992e-01 1.00764183e-01]\n", 77 | " [ 8.73285529e-03 5.96503817e-01 3.94763328e-01]\n", 78 | " [ 1.67434866e-01 7.12756209e-01 1.19808925e-01]\n", 79 | " [ 4.75535678e-02 8.43626581e-01 1.08819852e-01]\n", 80 | " [ 1.22530319e-02 
4.23869480e-01 5.63877488e-01]\n", 81 | " [ 3.84753639e-02 8.50175432e-01 1.11349204e-01]\n", 82 | " [ 3.09968794e-03 5.96264678e-01 4.00635634e-01]\n", 83 | " [ 3.59781700e-02 8.08752206e-01 1.55269624e-01]\n", 84 | " [ 6.20745751e-03 2.73106189e-01 7.20686354e-01]\n", 85 | " [ 5.81151228e-02 8.19701311e-01 1.22183566e-01]\n", 86 | " [ 1.95840574e-03 5.33800891e-01 4.64240703e-01]\n", 87 | " [ 8.77628703e-03 7.04654010e-01 2.86569703e-01]\n", 88 | " [ 3.69274341e-02 8.38990091e-01 1.24082475e-01]\n", 89 | " [ 3.61807169e-02 8.28744840e-01 1.35074443e-01]\n", 90 | " [ 8.14489700e-03 7.77156946e-01 2.14698157e-01]\n", 91 | " [ 4.64006697e-03 5.23164549e-01 4.72195384e-01]\n", 92 | " [ 1.33500103e-02 5.63205976e-01 4.23444014e-01]\n", 93 | " [ 1.28473017e-01 8.31361691e-01 4.01652917e-02]\n", 94 | " [ 3.60902230e-02 8.03217466e-01 1.60692311e-01]\n", 95 | " [ 5.05096042e-02 8.46149445e-01 1.03340951e-01]\n", 96 | " [ 5.69724571e-02 8.11250984e-01 1.31776559e-01]\n", 97 | " [ 1.22453086e-03 3.99201919e-01 5.99573550e-01]\n", 98 | " [ 1.03123407e-02 3.65034695e-01 6.24652965e-01]\n", 99 | " [ 4.17476538e-02 4.77844283e-01 4.80408063e-01]\n", 100 | " [ 1.90525287e-02 7.45629538e-01 2.35317933e-01]\n", 101 | " [ 7.05352060e-03 7.56932682e-01 2.36013798e-01]\n", 102 | " [ 5.57541864e-02 6.67410837e-01 2.76834977e-01]\n", 103 | " [ 2.10790319e-02 6.62362244e-01 3.16558724e-01]\n", 104 | " [ 8.98003281e-03 5.99716389e-01 3.91303578e-01]\n", 105 | " [ 1.52196906e-02 6.32329159e-01 3.52451150e-01]\n", 106 | " [ 3.47695685e-02 7.98625645e-01 1.66604786e-01]\n", 107 | " [ 9.15416570e-02 7.95877151e-01 1.12581192e-01]\n", 108 | " [ 1.98418694e-02 6.40871800e-01 3.39286330e-01]\n", 109 | " [ 4.81040905e-02 7.31039981e-01 2.20855929e-01]\n", 110 | " [ 3.44565240e-02 6.77463657e-01 2.88079819e-01]\n", 111 | " [ 3.38822929e-02 7.96899915e-01 1.69217792e-01]\n", 112 | " [ 2.54574647e-01 6.90791330e-01 5.46340233e-02]\n", 113 | " [ 3.63488963e-02 7.04234211e-01 2.59416893e-01]\n", 114 
| " [ 1.86036022e-04 1.48760823e-01 8.51053141e-01]\n", 115 | " [ 8.09069371e-04 2.94422745e-01 7.04768186e-01]\n", 116 | " [ 2.78126551e-04 3.30535386e-01 6.69186488e-01]\n", 117 | " [ 4.56288643e-04 3.38732197e-01 6.60811514e-01]\n", 118 | " [ 2.51393977e-04 2.57092194e-01 7.42656412e-01]\n", 119 | " [ 6.03186905e-05 3.82744333e-01 6.17195349e-01]\n", 120 | " [ 2.04838186e-03 2.81103453e-01 7.16848165e-01]\n", 121 | " [ 1.23247784e-04 4.24393655e-01 5.75483097e-01]\n", 122 | " [ 1.59929758e-04 4.23195996e-01 5.76644074e-01]\n", 123 | " [ 3.56390886e-04 1.52542892e-01 8.47100717e-01]\n", 124 | " [ 2.99635433e-03 2.78024684e-01 7.18978962e-01]\n", 125 | " [ 6.45242833e-04 3.55681241e-01 6.43673516e-01]\n", 126 | " [ 6.81029987e-04 2.98859721e-01 7.00459249e-01]\n", 127 | " [ 6.28418142e-04 2.96807692e-01 7.02563890e-01]\n", 128 | " [ 6.10997845e-04 1.74593604e-01 8.24795398e-01]\n", 129 | " [ 1.09757190e-03 1.73257823e-01 8.25644605e-01]\n", 130 | " [ 7.99254871e-04 3.48929847e-01 6.50270898e-01]\n", 131 | " [ 1.93443479e-04 2.38473708e-01 7.61332849e-01]\n", 132 | " [ 1.30064976e-05 4.20137191e-01 5.79849802e-01]\n", 133 | " [ 6.81548718e-04 4.69975854e-01 5.29342597e-01]\n", 134 | " [ 5.04477452e-04 2.25292722e-01 7.74202801e-01]\n", 135 | " [ 1.33913767e-03 2.30143290e-01 7.68517573e-01]\n", 136 | " [ 3.82097113e-05 4.28006955e-01 5.71954836e-01]\n", 137 | " [ 2.05299242e-03 4.00421888e-01 5.97525119e-01]\n", 138 | " [ 6.77847072e-04 2.37204010e-01 7.62118143e-01]\n", 139 | " [ 4.56383243e-04 3.97527741e-01 6.02015876e-01]\n", 140 | " [ 3.19858866e-03 3.83866887e-01 6.12934525e-01]\n", 141 | " [ 3.42364119e-03 3.27541103e-01 6.69035256e-01]\n", 142 | " [ 3.00544917e-04 2.98288662e-01 7.01410793e-01]\n", 143 | " [ 6.78376797e-04 5.10705151e-01 4.88616472e-01]\n", 144 | " [ 1.61719140e-04 4.27941843e-01 5.71896438e-01]\n", 145 | " [ 6.44775841e-04 3.44845359e-01 6.54509865e-01]\n", 146 | " [ 2.75279882e-04 2.78027400e-01 7.21697320e-01]\n", 147 | " [ 
2.07731418e-03 4.90652652e-01 5.07270034e-01]\n", 148 | " [ 3.54683506e-04 4.42580814e-01 5.57064503e-01]\n", 149 | " [ 1.82017584e-04 3.42008155e-01 6.57809828e-01]\n", 150 | " [ 6.30908753e-04 1.28602511e-01 8.70766580e-01]\n", 151 | " [ 9.21940559e-04 3.20888055e-01 6.78190005e-01]\n", 152 | " [ 4.29311663e-03 3.18426266e-01 6.77280618e-01]\n", 153 | " [ 1.16680587e-03 3.00989509e-01 6.97843685e-01]\n", 154 | " [ 4.46290865e-04 2.02461924e-01 7.97091785e-01]\n", 155 | " [ 2.15227432e-03 2.48822456e-01 7.49025270e-01]\n", 156 | " [ 8.09069371e-04 2.94422745e-01 7.04768186e-01]\n", 157 | " [ 2.91162367e-04 2.24919706e-01 7.74789132e-01]\n", 158 | " [ 4.50477099e-04 1.53984748e-01 8.45564775e-01]\n", 159 | " [ 1.15724730e-03 2.33616548e-01 7.65226205e-01]\n", 160 | " [ 9.19025197e-04 3.79220387e-01 6.19860588e-01]\n", 161 | " [ 1.45811816e-03 2.98379693e-01 7.00162189e-01]\n", 162 | " [ 1.09779827e-03 1.31785617e-01 8.67116585e-01]\n", 163 | " [ 1.68397530e-03 2.81057800e-01 7.17258224e-01]] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 164 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", 165 | " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n", 166 | " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", 167 | " 2 2]\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "from sklearn import datasets\n", 173 | "import numpy as np\n", 174 | "from sklearn.preprocessing import label_binarize\n", 175 | "from sklearn.linear_model import LogisticRegression\n", 176 | "from sklearn.metrics import confusion_matrix, precision_score, accuracy_score,recall_score, f1_score,roc_auc_score\n", 177 | "\n", 178 | "\n", 179 | "iris = datasets.load_iris()\n", 180 | "x, y = iris.data, iris.target\n", 181 | "n_class = len(set(iris.target))\n", 182 | "y_one_hot = label_binarize(y, classes=np.arange(n_class)) # classes passed by keyword; newer scikit-learn rejects it positionally\n", 183 | "\n", 184 | "# alpha = np.logspace(-2, 2, 20) # hyper-parameter range\n", 185 
| "# model = LogisticRegressionCV(Cs = alpha, cv = 3, penalty = 'l2') # use L2 regularisation\n", 186 | "model = LogisticRegression()\n", 187 | "model.fit(x, y)\n", 188 | "y_score = model.predict(x)\n", 189 | "y_score_pro = model.predict_proba(x) # per-class probabilities, one row per sample\n", 190 | "y_score_one_hot = label_binarize(y_score, classes=np.arange(n_class))\n", 191 | "print(y_score_pro,y)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 53, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "confusion_matrix\n", 206 | " [[50 0 0]\n", 207 | " [ 0 45 5]\n", 208 | " [ 0 1 49]]\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "from sklearn.metrics import confusion_matrix, precision_score, accuracy_score,recall_score, f1_score,roc_auc_score\n", 214 | "\n", 215 | "obj1 = confusion_matrix(y, y_score)\n", 216 | "print('confusion_matrix\\n', obj1)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 54, 222 | "metadata": { 223 | "collapsed": false, 224 | "scrolled": true 225 | }, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 232 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", 233 | " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n", 234 | " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", 235 | " 2 2]\n", 236 | "accuracy:0.96\n", 237 | "precision:0.96\n", 238 | "recall:0.96\n", 239 | "f1-score:0.96\n", 240 | "AUC:0.97\n", 241 | "\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "print(y)\n", 247 | "print('accuracy:{}'.format(accuracy_score(y, y_score)))\n", 248 | "print('precision:{}'.format(precision_score(y, y_score,average='micro')))\n", 249 | "print('recall:{}'.format(recall_score(y, y_score,average='micro')))\n", 250 | 
"print('f1-score:{}'.format(f1_score(y, y_score,average='micro')))\n", 251 | "print('AUC:{}\\n'.format(roc_auc_score(y_one_hot, y_score_one_hot,average='micro')))\n", 252 | " #形式一:原始值(0或1或2)\n", 253 | " #形式二:各类概率值\n", 254 | " # 形式三:one-hot值" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 55, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "accuracy:0.96\n", 269 | "\n", 270 | "precision:0.96\n", 271 | "\n", 272 | "recall:0.96\n", 273 | "\n", 274 | "f1-score:0.96\n", 275 | "\n", 276 | "AUC:0.97\n", 277 | "\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "print('accuracy:{}\\n'.format(accuracy_score(y_one_hot, y_score_one_hot)))\n", 283 | "print('precision:{}\\n'.format(precision_score(y_one_hot, y_score_one_hot,average='micro')))\n", 284 | "print('recall:{}\\n'.format(recall_score(y_one_hot, y_score_one_hot,average='micro')))\n", 285 | "print('f1-score:{}\\n'.format(f1_score(y_one_hot, y_score_one_hot,average='micro')))\n", 286 | "print('AUC:{}\\n'.format(roc_auc_score(y_one_hot, y_score_one_hot,average='micro')))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 56, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XucTfX+x/HXJ0KoFLpRhyLHMDPuJDmkRBe6KIp0c0tK\n/Ajdy6mjc3ROSop0v6Akow6lFCndlOuM1JDQjejizpjP74+97TNpzGzMnrX3zPv5eOzHmb322mu9\n1zitz3zX5bPM3REREQE4JOgAIiISP1QUREQkQkVBREQiVBRERCRCRUFERCJUFEREJEJFQUREIlQU\nREQkQkVBREQiSgYdYH9VqlTJq1WrFnQMEZGE8vnnn//s7pXzmy/hikK1atWYP39+0DFERBKKmX0b\nzXw6fCQiIhEqCiIiEqGiICIiESoKIiISoaIgIiIRKgoiIhKhoiAiIhEqCiIiEqGiICIiESoKIiIS\noaIgIiIRKgoiIhKhoiAiIhEqCiIiEhGzomBmT5nZOjNbuo/PzcweNrNMM1tsZg1ilUVERKITy5HC\nM0C7PD5vD9QMv3oBj8Uwi4iIRCFmRcHd3wc25jFLR+A5D/kYqGBmx8cqj4iI5C/IJ69VAdbkeL82\nPO2HYOLEl2ue/pT3lq8POoaIFJCs337Cs3ZxaMWqrBpxXtBx9ikhHsdpZr0IHWLipJNOKtR1a+cs\nIgfDs3ez6Yv/8uv7z1Hq+FM57vL7g46UpyCLwnfAiTneVw1P+xN3HweMA2jUqJHHPtr/BFkQWteq\nzNPXNAls/SJycJYtW0aPHj2YN28e7dq1Y+zYsYX+h+3+CrIoTAP6mdlEoCnwm7sHdugovxFBPA/3\nRCT+bN26lZYtW5Kdnc1zzz1Ht27dMLOgY+UrZkXBzCYArYBKZrYWuAs4FMDdHwemA+cCmcBW4JpY\nZdnjQA8Fta5VOQZpRKQo+vLLL6lVqxZly5blxRdfJDU1lWOPPTboWFGLWVFw98vz+dyBGwpqfQVx\n7F+Ha0TkQG3bto27776bkSNH8uyzz9KtWzfatm0bdKz9lhAnmqMRbUHQjl9ECtr7779Pjx49+Prr\nr+nRowfnn39+0JEOWJEoCtc8/WnkZx37F5HCdM8993D33XdTvXp13nnnHdq0aRN0pIOS8L2Pch42\n0rF/ESksoSPg0KhRIwYMGMCSJUsSviAA2J4NSxSNGjXy+fPnA38uCDosJCKx9vPPPzNgwABq1qzJ\nnXfeGXScqJnZ5+7eKL/5EnqkoIIgIoXF3Xn55ZdJSkpi4sSJHHJIQu8+96lInFNQQRCRWPr+++/p\n27cvaWlpNGrUiHfeeYeUlJSgY8VE0Sx1IiIF6Mcff+Tdd9/lX//6Fx999FGRLQhQREYKIiIFbeXK\nlUybNo2bb76ZBg0asHr1aipUqBB0rJjTSEFEJIfdu3fzn//8h7p163LXXXfx448/AhSLggAqCiIi\nEenp6Zx++ukMHDiQM888k/T0dI477rigYxUqHT4SESHUwO5vf/sbZsZLL71Ely5dEqKBXUFTURCR\nYi0jI4PatWtTtmxZJk6cSGpqKpUrF98bYXX4SESKpa1btzJ48GCSk5N54YUXADjrrLOKdUGABB4p\n5Ox3JCKyP2bPnk3Pnj3JzMykd+/edOjQIehIcSNhRwrqdyQiB+Kuu+6idevWuDvvvvsujz/+OEce\neWTQseJGwhaFPXQ3s4hEY0+ftyZNmvB///d/LF68mNatWwecKv4kfFEQEcnL+vXrueKKK7j33nsB\nOO+88xg5ciRly5YNOFl8UlEQkSLJ3XnppZeoXbs2kydPplSpUkFHSggqCiJS5Kxdu5YOHTrQtWtX\natSowYIFCxg2bFjQsRKCioKIFDnr16/n/fff59///jcffvghderUCTpSwkjYS1JFRHLKzMzk9ddf\nZ8CAAdSvX581a9ZwxBFHBB0r4WikICIJLSsri5EjR5KcnMw99
9zDTz/9BKCCcIBUFEQkYS1ZsoTm\nzZszePBg2rZtS3p6Oscee2zQsRKaDh+JSELaunUrrVu35pBDDmHixIlcdtllxbKBXUFLyKKgFhci\nxdfSpUupU6cOZcuWZdKkSaSmplKpUqWgYxUZCXn4SC0uRIqfLVu2MHDgQFJSUiIN7Nq0aaOCUMAS\ncqSwh1pciBQPs2bNomfPnnzzzTf07duXjh07Bh2pyErIkYKIFB933HEHZ511FiVLlmTOnDk8+uij\nurIohlQURCQuZWdnA9C8eXNuueUWFi1aRMuWLQNOVfSpKIhIXFm3bh1dunThnnvuAaB9+/Y88MAD\nHHbYYQEnKx5UFEQkLrg7L7zwArVr1+a1115TF9OAxLQomFk7M1tuZplmNjSXz480s9fNbJGZpZvZ\nNbHMIyLxac2aNZx//vlceeWV1KpViwULFjBkyJCgYxVLMSsKZlYCeBRoDyQBl5tZ0l6z3QBkuHsq\n0Ap40Mzy7G+76uctMUgrIkHasGEDH374IaNGjWLu3LkkJe29q5DCEstLUpsAme6+EsDMJgIdgYwc\n8zhwuIVuQywPbASy8lroph1ZlEf3KIgkuq+++opp06YxaNAg6tWrx5o1azj88MODjlXsxfLwURVg\nTY73a8PTchoN1Aa+B5YA/d09O5qF6x4FkcSUlZXFAw88QEpKCvfdd1+kgZ0KQnwI+kTzOcBC4ASg\nHjDazP50AbKZ9TKz+WY2v7ADikjBWbRoEU2bNmXo0KGce+65ZGRkqIFdnIllUfgOODHH+6rhaTld\nA0zxkEzgG+Cvey/I3ce5eyN3bxSztCISU1u3bqVNmzZ89913TJ48mSlTpnD88ccHHUv2Esui8BlQ\n08yqh08edwGm7TXPaqANgJkdC9QCVsYwk4gUssWLF+PulC1blldeeYWMjAwuueSSoGPJPsSsKLh7\nFtAPeAtYBrzs7ulm1sfM+oRnGw40N7MlwCxgiLv/HKtMIlJ4Nm/eTP/+/alXrx7PP/88AK1bt+bo\no48OOJnkJaYN8dx9OjB9r2mP5/j5e6BtLDOISOF7++236dWrF6tWraJfv35cdNFFQUeSKEU1UjCz\nUmZWI9ZhRCTx3XbbbbRt25bSpUszd+5cHnnkEV1ZlEDyLQpmdh6hy0XfDr+vZ2avxTqYiCSWPQ3s\nWrRowbBhw1i4cCEtWrQIOJXsr2hGCvcCTYFfAdx9IaBRg4gA8OOPP9KpUyfuvvtuINTA7v7776dM\nmTLBBpMDEk1R2OXuv+41zWMRRkQSh7vzzDPPkJSUxBtvvKFnHBQR0ZxoXmZmlwGHmFl14Cbg49jG\nEpF49u2339KrVy9mzpxJixYtGD9+PLVq1Qo6lhSAaEYK/YCGQDYwBdgB9I9lKBGJb7/++iufffYZ\no0ePZs6cOSoIRUg0I4Vz3H0IEOlja2YXEyoQIlJMLF++nGnTpjF48GBSU1NZvXo15cuXDzqWFLBo\nRgq35zLttoIOIiLxadeuXfzjH/8gNTWVESNGsG7dOgAVhCJqnyMFMzsHaAdUMbN/5/joCEKHkkSk\niFuwYAHXXXcdCxYsoFOnTowePZpjjjkm6FgSQ3kdPloHLAW2A+k5pm8C/vQUNREpWrZu3crZZ5/N\noYceyquvvsrFF18cdCQpBPssCu6+AFhgZi+6+/ZCzCQiAVqwYAH16tWjbNmyTJ48mdTUVI466qig\nY0khieacQhUzm2hmi83sqz2vmCcTkUK1adMm+vXrR4MGDSIN7Fq1aqWCUMxEUxSeAZ4GjNDzll8G\nJsUwk4gUsjfffJO6desyZswY+vfvr0NFxVg0RaGsu78F4O4r3P12QsVBRIqAYcOG0b59e8qVK8eH\nH37IQw89pCuLirFo7lPYYWaHACvCz0H4DlDLQ5EEt3v3bkqUKEGrVq0oWbIkt99+O6VLlw46lgQs\nmqIwAChHqL3FfcCRwLWxD
CUisfPDDz9www03UKdOHYYPH84555zDOeecE3QsiRP5Hj5y90/cfZO7\nr3b3K929A7Aq9tFEpCC5O08//TRJSUnMmDFDJ5AlV3kWBTNrbGYXmlml8Ps6ZvYc8EmhpBORArFq\n1Sratm3LtddeS3JyMosWLWLgwIFBx5I4tM+iYGb/AF4EugJvmtndwHvAIuDUQkknIgXit99+44sv\nvmDMmDHMnj2bU0/Vf8KSu7zOKXQEUt19m5kdDawBkt19ZeFEE5GDkZGRwbRp0xg6dGikgV25cuWC\njiVxLq/DR9vdfRuAu28EvlJBEIl/O3fu5O9//zv169dn5MiRkQZ2KggSjbxGCieb2Z722AZUz/Ee\nd9fdLSJxZv78+Vx33XUsXryYLl26MGrUKDWwk/2SV1G4ZK/3o2MZREQOzpYtWzjnnHMoU6YMaWlp\ndOjQIehIkoDyaog3qzCDiMiB+eKLL6hXrx7lypXjtddeIyUlhQoVKgQdSxJUNG0uRCQO/f777/Tt\n25eGDRvywgsvANCyZUsVBDko0dzRLCJxZvr06fTu3Zvvv/+egQMHcsklex/tFTkwUY8UzExNUUTi\nwJAhQzjvvPM44ogjmDdvHg8++KCuLJICk+9IwcyaAE8S6nl0kpmlAj3c/cZYhxOREHcnOzubEiVK\n0KZNG8qUKcOtt96qBnZS4KIZKTwMnA9sAHD3RUDrWIYSkf/57rvvuPDCC7nrrrsAaNu2Lffcc48K\ngsRENEXhEHf/dq9pu2MRRkT+x9154oknSEpKYubMmVSqVCnoSFIMRHOieU34EJKbWQngRkCP4xSJ\noW+++YbrrruO9957j1atWvHEE09Qo0aNoGNJMRDNSOF6YCBwEvAT0Cw8LV9m1s7MlptZppkN3cc8\nrcxsoZmlm9mcaIOLFGWbN29m8eLFjB07llmzZqkgSKGJZqSQ5e5d9nfB4VHFo8DZwFrgMzOb5u4Z\nOeapAIwB2rn7ajPT/fhSbC1dupRp06Zx6623kpyczOrVqylbtmzQsaSYiWak8JmZTTezq8xsfx7D\n2QTIdPeV7r4TmEio82pOVwBT3H01gLuv24/lixQJO3fu5J577qFBgwb85z//iTSwU0GQIETz5LVT\ngL8DDYElZjbVzKIZOVQh1G57j7XhaTmdChxlZrPN7HMz657bgsysl5nNN7P5UaxXJGF89tlnNGzY\nkLvvvptLL72UjIwMNbCTQEV185q7z3P3m4AGwO+EHr5TEEoSKjbnAecAd5jZn57+4e7j3L2Ruzcq\noPWKBG7Lli20a9eOX375hWnTpvHiiy9SuXLloGNJMRfNzWvlCR326QLUBtKA5lEs+zvgxBzvq4an\n5bQW2ODuW4AtZvY+kEo+Vze1rqX/cCRxzZ8/nwYNGlCuXDnS0tJITk7myCOPDDqWCBDdSGEpoSuO\n/unuNdz9/9w9mmc0fwbUNLPqZlaKUFGZttc8aUALMytpZmWBpsCy/Bb89DVNoli9SHz57bff6N27\nN40bN440sGvRooUKgsSVaK4+Otnds/d3we6eZWb9gLeAEsBT7p5uZn3Cnz/u7svM7E1gMZANjHf3\npfu7LpF49/rrr9OnTx9+/PFHBg0aRKdOnYKOJJIrc/fcPzB70N3/z8xeA/40U1BPXit9fE3f8cPX\nQaxa5IAMHjyYkSNHkpyczJNPPknjxo2DjiTFkJl9Hs152bxGCpPC/6snronsJ3dn9+7dlCxZkrZt\n23LEEUcwZMgQSpUqFXQ0kTztc6QQmcGsn7uPzm9aYdFIQeLd2rVruf7660lJSeG+++4LOo4IEP1I\nIZoTzdfmMu26/Y8kUrRlZ2czduxYkpKSePfddznuuOOCjiSy3/Z5+MjMOhO6Yqi6mU3J8dHhwK+x\nDiaSSFauXMm1117LnDlzaNOmDePGjePkk08OOpbIfsvrnMKnhJ6hUJVQD6M9NgELYhlKJNF
s2bKF\njIwMxo8fz7XXXouZBR1J5IDke04h3uicgsSLJUuWkJaWxu233w7Atm3bOOywwwJOJZK7gz6nsKeN\ntZn9YmYbc7x+MbONBRlWJJHs2LGDO++8kwYNGvDwww9HGtipIEhRkNeJ5j2P3KwEVM7x2vNepNj5\n+OOPadCgAcOHD+fyyy9n2bJlamAnRco+zynkuIv5ROB7d99pZi2AFOAFQo3xRIqNLVu2cN5551Gu\nXDmmT59O+/btg44kUuCiuSR1KqFHcZ4CPA3UBF6KaSqROPLJJ5+QnZ1NuXLleP3110lPT1dBkCIr\nmqKQ7e67gIuBR9x9AH9+LoJIkfPrr7/So0cPmjVrFmlg17x5cw4/fH+eNSWSWKJ6HKeZXQpcCVwY\nnnZo7CKJBG/q1Kn07duXdevWMWTIEC699NKgI4kUimjvaG5NqHX2SjOrDkyIbSyR4AwcOJCLLrqI\nY445hk8++YQRI0boyiIpNvIdKbj7UjO7CahhZn8l9NxlNXSRIiVnA7tzzz2XihUrcsstt3DooRoU\nS/ESTUO8M4DnCT01zYDjgCvd/cPYx/sz3bwmBW316tX06dOH+vXrq4GdFFkF2RDvP8C57n66uzcn\n9DzlUQcbUCRo2dnZjBkzhjp16jBnzhxOOOGEoCOJBC6aE82l3D1jz5vw09LUFF4SWmZmJtdeey1z\n587l7LPPZty4cVSrVi3oWCKBi6YofGFmjxO6YQ2gK2qIJwlu+/btfPXVVzz99NNcddVVamAnEhbN\nOYUywE1Ai/CkuYTuV9ge42y50jkFOVALFy4kLS2Nu+66CwgVhjJlygScSqRwRHtOIc+iYGbJwClA\nurvHxZ5YRUH21/bt2xk+fDgPPPAAlSpVYvHixepXJMVOQXRJvZVQi4uuwNtmltsT2ETi2rx586hf\nvz73338/3bp1IyMjQwVBJA95nVPoCqS4+xYzqwxMB54qnFgiB2/Lli1ccMEFlC9fnjfffJNzzjkn\n6EgicS+vorDD3bcAuPt6M4vm8lWRwH300Uc0bdqUcuXK8cYbb1C3bl31KxKJUl47+pPNbEr49Rpw\nSo73U/L4nkggfvnlF6699lqaN2/O888/D8Bpp52mgiCyH/IaKVyy1/vRsQwicjCmTJnCDTfcwPr1\n6xk2bBidO3cOOpJIQsrrITuzCjOIyIEaMGAADz30EPXq1WP69OnUr18/6EgiCSuam9dE4k7OBnbn\nn38+xxxzDIMGDVIDO5GDlO/Na/FG9ynIqlWr6N27Nw0aNOAf//hH0HFEEkJBNsTbs8DSBxdJ5OBk\nZ2fzyCOPULduXebNm8df/vKXoCOJFDn5FgUza2JmS4Cvw+9TzeyRmCcTyeHrr7+mZcuW3HTTTZxx\nxhksXbqUPn36BB1LpMiJZqTwMHA+sAHA3RcRehJbvsysnZktN7NMMxuax3yNzSzLzDpFs1wpfnbu\n3MmKFSt47rnnmD59ukYJIjESTVE4xN2/3Wva7vy+ZGYlgEeB9kAScLmZJe1jvgeAmVFkkWJkwYIF\n3H333QDUqVOHVatWceWVV6qjqUgMRVMU1phZE8DNrISZ3Qx8FcX3mhB6dOdKd98JTAQ65jLfjcCr\nwLpoQ0vRtn37doYNG0bjxo0ZO3Ys69evB6B0aZ3WEom1aIrC9cBA4CTgJ6BZeFp+qgBrcrxfG54W\nYWZVgIuAx6IJK0XfBx98QGpqKiNGjKB79+5kZGRQuXLloGOJFBv53qfg7uuALjFa/0PAEHfPzuuQ\ngJn1AnoBlDquRoyiSNA2b95Mx44dOeKII5g5cyZnn3120JFEip18i4KZPQH86WYGd++Vz1e/A07M\n8b5qeFpOjYCJ4YJQCTjXzLLcfepe6xoHjIPQfQr5ZZbE8sEHH9C8eXPKly/Pf//7X+rWrUv58uWD\njiVSLEVz+OgdYFb49SFwDLAjiu99BtQ0s+rhZzp3Aab
lnMHdq7t7NXevBkwG+u5dEKTo2rBhA927\nd+eMM86INLBr1qyZCoJIgKI5fDQp53szex74IIrvZZlZP+AtoATwlLunm1mf8OePH1hkSXTuzuTJ\nk+nXrx8bN27kjjvuoEuXWB2hFJH9cSC9j6oDx0Yzo7tPJ/RwnpzTci0G7n71AWSRBDRgwABGjRpF\nw4YNmTlzJqmpqUFHEpGwaM4p/ML/zikcAmwE9nkjmkhu3J2srCwOPfRQOnTowAknnMDAgQMpWVI9\nGUXiSZ4N8Sx0BvhE/neCONsD7qCnhniJ55tvvqFXr140bNiQESNGBB1HpFgqkIZ44QIw3d13h1+6\n8keitnv3bkaNGkXdunX55JNPOPnkk4OOJCL5iGbsvtDM6rv7gpinkSLjq6++4uqrr+ajjz6iffv2\njB07lhNPPDH/L4pIoPZZFMyspLtnAfWBz8xsBbAFMEKDiAaFlFESUFZWFt9++y0vvPACV1xxhfoV\niSSIvEYKnwINgA6FlEUS3Pz580lLS2P48OEkJSWxcuVK9SsSSTB5nVMwAHdfkdurkPJJAti2bRu3\n3HILTZs25amnnlIDO5EEltdIobKZDdzXh+7+7xjkkQQzZ84cevToQWZmJj179uSf//wnFSpUCDqW\niBygvIpCCaA84RGDyN42b97MxRdfTIUKFZg1axZnnnlm0JFE5CDlVRR+cPd7Cy2JJIy5c+dy+umn\nU758eWbMmEGdOnUoV65c0LFEpADke05BZI+ff/6Zbt260bJly0gDuyZNmqggiBQheY0U2hRaColr\n7s7LL7/MjTfeyC+//MJdd92lBnYiRdQ+i4K7byzMIBK/+vfvzyOPPELjxo2ZNWsWycnJQUcSkRhR\nNzLJlbuza9cuSpUqxUUXXcRf/vIXbr75ZkqUKBF0NBGJoTwb4sUjNcSLvRUrVtCzZ08aNWrEP//5\nz6DjiEgBKJCGeFK87N69m3//+98kJyfz+eefU6tWraAjiUgh0+EjAeDLL7/kqquu4tNPP+WCCy7g\nscceo0qVKkHHEpFCpqIgAGRnZ/P9998zYcIEOnfurAZ2IsWUikIx9umnn5KWlsZ9991HUlISK1as\noFSpUkHHEpEA6ZxCMbR161YGDRrEaaedxrPPPhtpYKeCICIqCsXMe++9R3JyMg8++CA9e/YkPT2d\nypUrBx1LROKEDh8VI5s3b+bSSy+lQoUKvPfee7Rq1SroSCISZzRSKAZmz55NdnZ2pIHd4sWLVRBE\nJFcqCkXY+vXrufzyy2ndujUvvPACAI0bN6Zs2bIBJxOReKXDR0WQuzNhwgRuuukmNm3axPDhw9XA\nTkSioqJQBN144408+uijNGvWjCeffJKkpKSgI4lIglBRKCKys7PJysqiVKlSdOrUiRo1anDjjTeq\ngZ2I7Bc1xCsCvv76a3r27Enjxo3517/+FXQcEYlDaohXDGRlZTFy5EhSUlJYuHAhtWvXDjqSiCQ4\nHT5KUMuWLaN79+7Mnz+fjh07MmbMGE444YSgY4lIglNRSGA//fQTkyZN4tJLL1UDOxEpEDE9fGRm\n7cxsuZllmtnQXD7vamaLzWyJmc0zs9RY5kl0H3/8McOGDQOgdu3arFixgssuu0wFQUQKTMyKgpmV\nAB4F2gNJwOVmtve1kd8Af3P3ZGA4MC5WeRLZli1bGDBgAM2bN+fFF1+MNLA79NBDA04mIkVNLEcK\nTYBMd1/p7juBiUDHnDO4+zx3/yX89mOgagzzJKR33nmHunXr8tBDD9G3b181sBORmIrlOYUqwJoc\n79cCTfOY/zpgRm4fmFkvoBdAqeNqFFS+uLd582a6dOnC0Ucfzfvvv88ZZ5wRdCQRKeLi4pJUM2tN\nqCgMye1zdx/n7o2iuca2KHj33XfZvXs35cuX56233mLRokUqCCJSKGJZFL4DTszxvmp42h+YWQow\nHujo7htimCfu/fT
TT1x22WW0adMm0sCuYcOGHHbYYQEnE5HiIpZF4TOgpplVN7NSQBdgWs4ZzOwk\nYApwpbt/FcMscc3def7550lKSoo8HvOKK64IOpaIFEMxO6fg7llm1g94CygBPOXu6WbWJ/z548Cd\nQEVgTPiyyqzicogopxtuuIHHHnuM0047jSeffFJ3JotIYNT7KCDZ2dns2rWL0qVLM2fOHBYvXkzf\nvn3VwE5EYiLa3kcqCgFYvnw5PXr0oGnTpowcOTLoOCJSDKghXhzatWsXI0aMIDU1laVLl5KcnBx0\nJBGRP1Dvo0KSnp7OlVdeyYIFC7j44ot59NFHOe6444KOJSLyByoKhaREiRJs3LiRyZMnc8kllwQd\nR0QkVzp8FEPz5s1jyJDQ/Xh//etfyczMVEEQkbimohADmzdv5qabbqJFixZMmjSJn3/+GYCSJTUw\nE5H4pqJQwGbOnEndunUZPXo0/fr1Y+nSpVSqVCnoWCIiUdGfrgVo8+bNdO3alYoVKzJ37lxOP/30\noCOJiOwXjRQKwNtvvx1pYDdz5kwWLlyogiAiCUlF4SD88MMPXHLJJbRt25YXX3wRgPr161OmTJmA\nk4mIHBgVhQPg7jzzzDMkJSXx3//+lxEjRqiBnYgUCTqncACuv/56xo4dS4sWLRg/fjy1atUKOpJI\nXNq1axdr165l+/btQUcpNsqUKUPVqlUP+HG9KgpRytnA7oorriAlJYU+ffpwyCEabInsy9q1azn8\n8MOpVq0a4U7IEkPuzoYNG1i7di3Vq1c/oGVojxaFZcuWccYZZ3DrrbcC0LJlS/r27auCIJKP7du3\nU7FiRRWEQmJmVKxY8aBGZtqr5WHXrl3cf//91KtXjy+//JL69esHHUkk4aggFK6D/X2rKOxDeno6\nTZo04bbbbqNjx45kZGTQrVu3oGOJyAGYOnUqZsaXX34JwOzZszn//PP/MM/VV1/N5MmTgdAfhEOH\nDqVmzZo0aNCA0047jRkzZkS1rh07dtC5c2dq1KhB06ZNWbVqVa7zTZo0iZSUFOrUqRNphwPw7bff\n0qZNG1JSUmjVqhVr166NfFaiRAnq1atHvXr16NChw/78CqKmorAPJUuW5LfffmPKlCm8/PLLHHvs\nsUFHEpEDNGHCBFq0aMGECROimv+OO+7ghx9+YOnSpXzxxRdMnTqVTZs2RfXdJ598kqOOOorMzEwG\nDBjwhx3+Hhs2bGDw4MHMmjWL9PR0fvzxR2bNmgXAoEGD6N69O4sXL+bOO+9k2LBhke8ddthhLFy4\nkIULFzLOlts2AAANDElEQVRt2rQ/LbcgqCjkMHfuXAYNGgRArVq1+Oqrr7jooosCTiUiB2Pz5s18\n8MEHPPnkk0ycODHf+bdu3coTTzzBI488QunSpQE49thjueyyy6JaX1paGldddRUAnTp1YtasWez9\nMLOVK1dSs2ZNKleuDMBZZ53Fq6++CkBGRgZnnnkmAK1btyYtLS26DS0guvoI2LRpE0OHDmXMmDFU\nr16doUOHUqlSJTWwEylA1Yb+NybLXTXivDw/T0tLo127dpx66qlUrFiRzz//PM/5MzMzOemkkzji\niCNy/bxz584sX778T9MHDhxI9+7d+e677zjxxBOB0BGHI488kg0bNvyhB1qNGjVYvnw5q1atomrV\nqkydOpWdO3cCkJqaypQpU+jfvz+vvfYamzZtYsOGDZETyA0aNKBUqVIMHTqUCy+8MM9tORDFfq83\nY8YMevfuzdq1a7n55pv5+9//Trly5YKOJSIFZMKECfTv3x+ALl26MGHCBC644IJc543mJO2kSZMO\nOtNRRx3FY489RufOnTnkkENo3rw5K1asAGDkyJH069ePZ555hpYtW1KlSpXIs9u//fZbqlSpwsqV\nKznzzDNJTk7mlFNOOeg8ORXrorBp0ya6d+/OMcccw7x582jWrFnQkUSKrPz+oo+Fj
Rs38u6777Jk\nyRLMjN27d2NmXHXVVfzyyy9/mrdSpUrUqFGD1atX8/vvv+c6WshvpFClShXWrFlD1apVycrK4rff\nfqNixYp/mv+CCy6IFKdx48ZFdvwnnHACU6ZMAUKHvl599VUqVKgAQJUqVQA4+eSTadWqFQsWLCjw\nooC7J9Sr1HE1/GBkZ2f7jBkzPCsry93dFy5c6Nu3bz+oZYpI7jIyMgJd/9ixY71Xr15/mNayZUuf\nPXu2V6tWLZJv1apVftJJJ/mvv/7q7u6DBw/2q6++2nfs2OHu7uvWrfOXX345qnWOHj3ae/fu7e7u\nEyZM8EsvvTTX+X766Sd3d9+4caOnpqb68uXL3d19/fr1vnv3bnd3v/XWW/2OO+6IzLdnX7V+/Xqv\nUaOGp6en57rs3H7vwHyPYh8b+E5+f18HUxS+//57v/DCCx3wZ5999oCXIyLRCbootGrVymfMmPGH\naaNGjfI+ffr4Bx984E2bNvXU1FRv1KiRz5w5MzLPjh07fPDgwX7KKad4nTp1vEmTJv7mm29Gtc5t\n27Z5p06d/JRTTvHGjRv7ihUrIp+lpqZGfu7SpYvXrl3ba9eu7RMmTIhMf+WVV7xGjRpes2ZNv+66\n6yKF4MMPP/S6det6SkqK161b18ePH7/PDAdTFMz3Oise70ofX9N3/PD1fn3H3Xn66acZOHAgO3bs\n4N5772XAgAE6kSwSY8uWLaN27dpBxyh2cvu9m9nn7t4ov+8Wi71inz59GDduHC1btmT8+PHUrFkz\n6EgiInGpyBaF3bt3s2vXLsqUKUO3bt2oX78+vXr1Ur8iEZE8FMk9ZHp6Oqeffnqkgd0ZZ5yhjqYi\nIlEoUnvJnTt3Mnz4cOrXr09mZiaNGzcOOpJIsZdo5y0T3cH+vovM4aMlS5bQtWtXlixZQpcuXXj4\n4Ycjt5CLSDDKlCkTuRtX3VJjz8PPUziYRwIXmaJQqlQptm7dSlpaWsy6B4rI/qlatSpr165l/fr1\nQUcpNvY8ee1AxfSSVDNrB4wCSgDj3X3EXp9b+PNzga3A1e7+RV7LzHlJ6pw5c5g2bRoPPvggEDq5\nvOeuQBER+Z9oL0mN2TkFMysBPAq0B5KAy80saa/Z2gM1w69ewGPRLPv333/n+uuvp1WrVkydOpWf\nf/4ZQAVBROQgxfJEcxMg091XuvtOYCLQca95OgLPhW+4+xioYGbH57XQ7B1bqFOnDuPGjWPgwIEs\nWbLkD90HRUTkwMXynEIVYE2O92uBplHMUwX4YZ9L3bSeI0+oxeTJk2nadO/FiYjIwUiIE81m1ovQ\n4SWAHenp6UuLWUfTSsDPQYcoZNrm4kHbXHj+Es1MsSwK3wEn5nhfNTxtf+fB3ccB4wDMbH40J0uK\nEm1z8aBtLh7ifZtjeU7hM6CmmVU3s1JAF2Dvh4pOA7pbSDPgN3ff96EjERGJqZiNFNw9y8z6AW8R\nuiT1KXdPN7M+4c8fB6YTuhw1k9AlqdfEKo+IiOQvpucU3H06oR1/zmmP5/jZgRv2c7HjCiBaotE2\nFw/a5uIhrrc54Z6nICIisVOkGuKJiMjBiduiYGbtzGy5mWWa2dBcPjczezj8+WIzaxBEzoIUxTZ3\nDW/rEjObZ2apQeQsSPltc475GptZlpl1Ksx8BS2a7TWzVma20MzSzWxOYWcsaFH8//pIM3vdzBaF\ntznhzy2a2VNmts7Mlu7j8/jdf0XzzM7CfhE6Mb0COBkoBSwCkvaa51xgBmBAM+CToHMXwjY3B44K\n/9y+OGxzjvneJXR+qlPQuWP8b1wByABOCr8/JujchbDNtwIPhH+uDGwESgWd/SC3uyXQAFi6j8/j\ndv8VryOFmLTIiHP5brO7z3P3X8JvPyZ0X0cii+bfGeBG4FVgXWGGi4FotvcKYIq7rwZw9+KwzQ4c\nHm6QWZ5QUcgq3JgFy93fJ7Qd+xK3+694LQr7a
n+xv/Mkkv3dnusI/aWRyPLdZjOrAlxElM0S41w0\n/8anAkeZ2Wwz+9zMuhdautiIZptHA7WB74ElQH93zy6ceIGJ2/1XQrS5kD8ys9aEikKLoLMUgoeA\nIe6eXUwe0lISaAi0AQ4DPjKzj939q2BjxdQ5wELgTOAU4G0zm+vuvwcbq3iK16JQYC0yEkhU22Nm\nKcB4oL27byikbLESzTY3AiaGC0Il4Fwzy3L3qYUTsUBFs71rgQ3uvgXYYmbvA6lAohaFaLb5GmCE\nhw62Z5rZN8BfgU8LJ2Ig4nb/Fa+Hj4pji4x8t9nMTgKmAFcWkb8c891md6/u7tXcvRowGeiboAUB\novv/dRrQwsxKmllZQp2FlxVyzoIUzTavJjQywsyOBWoBKws1ZeGL2/1XXI4UvBi2yIhym+8EKgJj\nwn85Z3kcN9bKT5TbXGREs73uvszM3gQWA9mEnliY62WNiSDKf+PhwDNmtoTQ1ThD3D2hO6ea2QSg\nFVDJzNYCdwGHQvzvv3RHs4iIRMTr4SMREQmAioKIiESoKIiISISKgoiIRKgoiIhIhIqCxB0z2x3u\nErrnVS2PeavtqxPlfq5zdriT5yIz+9DMah3AMi40s6Qc7+81s7Py+c50M6twgDk/M7N6UXzn5vA9\nDyL5UlGQeLTN3evleK0qpPV2dfdU4FngXwfw/QuBSFFw9zvd/Z28vuDu57r7r/u5nj05xxBdzpsB\nFQWJioqCJITwiGCumX0RfjXPZZ46ZvZpeHSx2Mxqhqd3yzF9rJmVyGd17wM1wt9tY2YLLPQMi6fM\nrHR4+ggzywivZ2Q4TwfgX+H1nGJmz5hZJws9T+CVHDlbmdkb4Z9XmVmlA8z5ETmaqJnZY2Y230LP\nJLgnPO0m4ATgPTN7LzytrZl9FP49vmJm5fNZjxQjKgoSjw7LcejotfC0dcDZ7t4A6Aw8nMv3+gCj\n3L0eoZ5Ja82sdnj+08PTdwNd81n/BcASMysDPAN0dvdkQh0ArjezioQ6t9Zx9xTg7+4+j1DrgsHh\n0c2KHMt7B2hqZuXC7zsTaiEdcYA52wE5W37cFr7DPQX4m5mluPvDhLqPtnb31uECdDtwVvh3OR8Y\nmM96pBiJyzYXUuxtC+8YczoUGB0+hr6bUIvpvX0E3GZmVQk9k+BrM2tDqOvoZ+HWIIex7+cyvGhm\n24BVhJ7hUAv4JkefqWeBGwi1et4OPBn+i/+NvDYm3OrhTeACM5sMnAfcstds+5uzFKFnD+T8PV1m\nZr0I/Xd9PKFDWYv3+m6z8PQPw+spRej3JgKoKEjiGAD8RKhj6CGEdsp/4O4vmdknhHa6082sN6Fe\nOs+6+7Ao1tHV3efveWNmR+c2U3gn34TQjrwT0I9Q2+e8TAzPtxGY7+6b9vp8v3ICnxM6n/AIcLGZ\nVQcGAY3d/RczewYok8t3DXjb3S+PYj1SDOnwkSSKI4Efwg9fuZJQc7U/MLOTgZXhQyZphA6jzAI6\nmdkx4XmONrO/RLnO5UA1M6sRfn8lMCd8DP5Id59OqFjteVb2JuDwfSxrDqHHM/Zkr0NHYfuVM9xm\n+g6gmZn9FTgC2AL8ZqFOo+1zzJ4z18fA6Xu2yczKmVluoy4pplQUJFGMAa4ys0WEeu1vyWWey4Cl\nZrYQqEvocYcZhI6hzzSzxcDbhA6t5MvdtxPqXvlKuINnNvA4oR3sG+HlfcD/jslPBAaHT0yfstey\ndhM6zNSeXA43HUhOd98GPEjoPMYiYAHwJfAS8GGOWccBb5rZe+6+HrgamBBez0eEfp8igLqkiohI\nDhopiIhIhIqCiIhEqCiIiEiEioKIiESoKIiISISKgoiIRKgoiIhIhIqCiIhE/D+sEC/LFFLCXQAA\nAABJRU5ErkJggg==\n", 299 | "text/plain": [ 300 | "" 301 | ] 302 | }, 303 | "metadata": 
{}, 304 | "output_type": "display_data" 305 | } 306 | ], 307 | "source": [ 308 | "import matplotlib.pyplot as plt\n", 309 | "from sklearn.metrics import roc_curve\n", 310 | "%matplotlib inline\n", 311 | "auc = roc_auc_score(y_one_hot, y_score_pro,average='micro')\n", 312 | "fpr, tpr, thresholds = roc_curve(y_one_hot.ravel(),y_score_pro.ravel()) # ravel()表示平铺开来\n", 313 | "plt.plot(fpr, tpr, linewidth = 2,label='AUC=%.3f' % auc)\n", 314 | "plt.plot([0,1],[0,1], 'k--')\n", 315 | "plt.axis([0,1.1,0,1.1])\n", 316 | "plt.xlabel('False Postivie Rate')\n", 317 | "plt.ylabel('True Positive Rate')\n", 318 | "plt.legend()\n", 319 | "plt.show()\n" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "collapsed": true 327 | }, 328 | "outputs": [], 329 | "source": [] 330 | } 331 | ], 332 | "metadata": { 333 | "kernelspec": { 334 | "display_name": "Python [default]", 335 | "language": "python", 336 | "name": "python3" 337 | }, 338 | "language_info": { 339 | "codemirror_mode": { 340 | "name": "ipython", 341 | "version": 3 342 | }, 343 | "file_extension": ".py", 344 | "mimetype": "text/x-python", 345 | "name": "python", 346 | "nbconvert_exporter": "python", 347 | "pygments_lexer": "ipython3", 348 | "version": "3.5.2" 349 | } 350 | }, 351 | "nbformat": 4, 352 | "nbformat_minor": 1 353 | } 354 | -------------------------------------------------------------------------------- /爬虫与网页分析/data/1396354: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 孙敬季 (豆瓣) 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 40 | 41 | 42 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 147 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 226 | 227 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 
| 264 |
265 | 266 | 267 | 268 |
269 | 270 |

孙敬季 Jingji Sun

271 | 272 |
273 | 274 | 275 |
276 | 277 | 278 | 279 | 280 | 281 |
282 |
283 | 284 | 孙敬季 Jingji Sun 287 | 288 | 289 | 增改描述、换头像 290 | 291 |
292 |
293 | 294 |
    295 | 296 |
  • 297 | 性别: 298 | 男 299 |
  • 300 | 301 |
  • 302 | 星座: 303 | 天蝎座 304 |
  • 305 | 306 |
  • 307 | 出生日期: 308 | 1985-10-27 309 |
  • 310 | 311 |
  • 312 | 出生地: 313 | 中国,吉林,长春 314 |
  • 315 | 316 |
  • 317 | 职业: 318 | 演员 319 |
  • 320 | 321 | 322 |
  • 323 | imdb编号: 324 | nm9914238 325 |
  • 326 | 327 |
328 | 329 |
330 |
331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 |
342 |
343 | 344 | 345 | 346 |
347 | 348 |
349 | 350 | 351 | 352 | 353 | 354 | 361 | 分享到 362 |    363 | 364 | 365 | 397 | 398 | 399 |
400 |
401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 |
413 | 414 | 461 | 462 | 463 | 464 | 465 | 466 | 467 |
468 | 469 | 470 | 471 | 472 | 473 |
474 |
475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 |
488 |
489 |

490 | 影人简介 491 |  · · · · · · 492 |

493 |
494 |
495 |   孙敬季,1985年10月27日出生,吉林长春人。毕业于北京舞蹈学院音乐剧系。全能艺人,表演、声乐、舞蹈、武术等。是中国内地童星男演员。童年主演电影《九香》 电视剧《后妈》而被人熟知。2002年参加周星驰全国选星活动脱颖而出并荣获冠军。2005年出演由周星驰监制的电视剧《功夫状元》表现不俗。2006年参加第一届北京大学生戏剧节荣获“优秀男演员奖”。2009年参演小柯原创音乐剧《凭什么爱我》等脍炙人口的作品。 496 |
497 |
498 | 499 | 500 | 501 |
502 |
503 |

504 | 影人图片 505 |  · · · · · · 506 |  ( 507 | 508 | 全部0张 · 上传照片 509 | ) 510 |

511 |
512 | 513 |
    514 |
515 | 516 |
517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 |
528 |
529 | 530 |

531 | 最近的5部作品(已上映) 532 |  · · · · · · 533 |  ( 534 | 535 | 全部 536 | ) 537 |

538 | 539 |
540 |
541 | 573 |
574 |
575 | 576 | 577 | 578 | 579 | 580 |
581 |
582 |

583 | 最受好评的5部作品 584 |  · · · · · · 585 |  ( 586 | 587 | 全部 588 | ) 589 |

590 |
591 |
592 | 624 |
625 |
626 | 627 | 628 | 629 | 630 | 631 | 632 |
633 |
634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 |
647 | 648 | 649 | 650 | 651 | 652 |
653 |
654 |

655 | 孙敬季的影迷(0) 656 |  · · · · · · 657 |  ( 658 | 659 | 全部 660 | ) 661 |

662 |
663 | 664 |
    665 |
666 | 667 |
668 | 669 |
670 | 671 |
672 |
673 | 674 |
675 | 676 |
677 |
678 |
679 | 680 | 681 | 702 | 703 |
704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 797 | 798 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | -------------------------------------------------------------------------------- /爬虫与网页分析/data/1404079: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 托马斯·埃尔姆斯 (豆瓣) 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 40 | 41 | 42 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 147 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 226 | 227 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 |
265 | 266 | 267 | 268 |
269 | 270 |

托马斯·埃尔姆斯 Thomas Elms

271 | 272 |
273 | 274 | 275 |
276 | 277 | 278 | 279 | 280 | 281 |
282 |
283 | 284 | 托马斯·埃尔姆斯 Thomas Elms 287 | 288 | 289 | 增改描述、换头像 290 | 291 |
292 |
293 | 294 |
    295 | 296 |
  • 297 | 性别: 298 | 男 299 |
  • 300 | 301 | 302 | 303 | 304 |
  • 305 | 职业: 306 | 演员 307 |
  • 308 | 309 | 310 | 311 | 312 |
  • 313 | imdb编号: 314 | nm7402258 315 |
  • 316 | 317 |
318 | 319 |
320 |
321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 |
332 |
333 | 334 | 335 | 336 |
337 | 338 |
339 | 340 | 341 | 342 | 343 | 344 | 351 | 分享到 352 |    353 | 354 | 355 | 387 | 388 | 389 |
390 |
391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 |
403 | 404 | 451 | 452 | 453 | 454 | 455 | 456 | 457 |
458 | 459 | 460 | 461 | 462 | 463 |
464 |
465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 |
478 |
479 |

480 | 影人简介 481 |  · · · · · · 482 |

483 |
484 |
485 |    486 |
487 |
488 | 489 | 490 | 491 |
492 |
493 |

494 | 影人图片 495 |  · · · · · · 496 |  ( 497 | 498 | 全部8张 · 上传照片 499 | ) 500 |

501 |
502 | 503 | 530 | 531 |
532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 |
543 |
544 | 545 |

546 | 最近的5部作品(已上映) 547 |  · · · · · · 548 |  ( 549 | 550 | 全部 551 | ) 552 |

553 | 554 |
555 |
556 | 588 |
589 |
590 | 591 | 592 | 593 | 594 | 595 |
596 |
597 |

598 | 最受好评的5部作品 599 |  · · · · · · 600 |  ( 601 | 602 | 全部 603 | ) 604 |

605 |
606 |
607 | 639 |
640 |
641 | 642 | 643 | 644 | 645 | 646 | 647 |
648 |
649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 |
662 | 663 | 664 | 665 | 666 | 667 |
668 |
669 |

670 | 托马斯·埃尔姆斯的影迷(7) 671 |  · · · · · · 672 |  ( 673 | 674 | 全部 675 | ) 676 |

677 |
678 | 679 | 723 | 724 |
725 | 726 |
727 | 728 |
729 |
730 | 731 |
732 | 733 |
734 |
735 |
736 | 737 | 738 | 759 | 760 |
# -*- coding: utf-8 -*-
# File: 爬虫与网页分析/网页解析.py
# Author:Clark Xu, Hang Shang
# Time:2019/3/20 16:42
"""Merge saved Douban celebrity pages with actor records.

Each line of ``data/actor_info1`` is a tab-separated record::

    <chinese name> \\t english_name-<name> \\t douban_id-<id> \\t master_word-<movies>

where ``<movies>`` is a ``|x02``-separated list of ``<movie id>|x01<movie name>``
entries.  For every record whose Douban id matches a file name under ``data/``,
that saved page is parsed with BeautifulSoup, the extracted fields are merged
with the record, and the result is appended to ``final_result`` as one JSON
object per line.
"""
import requests  # not referenced below; kept because the original script imports it
import os
import time
import sys

if sys.version_info[0] == 2:
    # Python 2 only: make utf8 the implicit str<->unicode codec so the unicode
    # text returned by BeautifulSoup / json.dumps can be mixed with byte strings
    # and written to plain file objects.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

from bs4 import BeautifulSoup
import json

# Directory containing this script.  Retained for backward compatibility; the
# stale-output cleanup below now checks the working directory instead, because
# that is where main() writes "final_result".
current_path = sys.path[0]

# Maps the label shown in the page's info list to the key used internally.
_INFO_LABELS = {
    u"星座": 'star',
    u"出生日期": 'bd',
    u"出生地": 'bp',
    u"imdb编号": 'imdb',
}


def name_extract(soup1):
    '''Return the celebrity name shown in the page heading.

    Inputs:
        soup1 : the parsed page (BeautifulSoup object)

    Returns:
        The text of the first ``<h1>`` up to its first space, or "" when the
        page has no ``<h1>``.
    '''
    heading = soup1.find("h1")
    if heading is None:
        return ""
    return heading.text.split(" ")[0]


def basic_info_extract(soup1):
    '''Extract the personal details listed in the page's info box.

    Inputs:
        soup1 : the parsed page (BeautifulSoup object)

    Returns:
        A four-element list ``[star sign, birth date, birth place, imdb id]``.
        Fields that are not present on the page are "".
    '''
    info_divs = soup1.find_all("div", {"class": "info"})
    if not info_divs:
        return ["", "", "", ""]

    info_list = info_divs[0].find("ul")
    if info_list is None:
        return ["", "", "", ""]

    basic_info_dict = {'star': '', 'bd': '', 'bp': '', 'imdb': ''}
    for item in info_list.find_all("li"):
        text = item.text.replace(u" ", u"")
        if text == u"":
            continue
        # Each row reads "<label>:<value>".  partition() never raises when the
        # colon is missing (the value is then simply empty).
        label, _, value = text.replace(u"\n", u"").partition(u":")
        key = _INFO_LABELS.get(label)
        if key is not None:
            basic_info_dict[key] = value

    return [
        basic_info_dict['star'],
        basic_info_dict['bd'],
        basic_info_dict['bp'],
        basic_info_dict['imdb'],
    ]


# celebrity introduction
def intro_extract(soup1):
    '''Extract the celebrity introduction paragraph.

    Inputs:
        soup1 : the parsed page (BeautifulSoup object)

    Returns:
        The introduction text with spaces, newlines and ideographic spaces
        (U+3000) removed, or "" when the page has no introduction section.
    '''
    intro = soup1.find_all("div", attrs={"id": "intro", "class": "mod"})
    if not intro:
        return ""
    brief = intro[0].find_all("div", {"class": "bd"})
    if not brief:
        # The original returned 0 here; "" keeps the return type consistent.
        return ""
    text = brief[0].text
    # u"\u3000" must be a unicode literal: as a Python 2 byte string it is the
    # six characters backslash-u-3-0-0-0 and never matches the real character.
    for junk in (u" ", u"\n", u"\u3000"):
        text = text.replace(junk, u"")
    return text


def _section_movies(soup1, section_id):
    '''Return the movies listed in the ``<div id=section_id class="mod">`` block.

    Each movie is a dict ``{"name": <link title>, "douban": <movie id>}`` where
    the id is the second-to-last "/"-separated component of the link's href
    ('' when the link has no usable href).
    '''
    movies = []
    sections = soup1.find_all("div", attrs={"id": section_id, "class": "mod"})
    if not sections:
        return movies
    bodies = sections[0].find_all("div", {"class": "bd"})
    if not bodies:
        return movies
    movie_list = bodies[0].find("ul")
    if movie_list is None:
        return movies

    for item in movie_list.find_all("li"):
        info = item.find_all("div", attrs={"class": "info"})
        if not info:
            continue
        links = info[0].find_all("a")
        if not links:
            continue
        href = links[0].get('href')
        parts = href.split("/") if href is not None else []
        movie_id = parts[-2] if len(parts) >= 2 else ''
        movies.append({"name": links[0].get('title'), "douban": movie_id})
    return movies


def movies_extract(soup1):
    '''Collect the movies from the "recent" and "best rated" sections.

    Inputs:
        soup1 : the parsed page (BeautifulSoup object)

    Returns:
        A list of ``{"name": ..., "douban": ...}`` dicts: the recent movies
        followed by the best-rated movies.  Duplicates are not removed here;
        main() passes the merged list through del_dup().
    '''
    # NOTE(review): the original looked up id="recent_movies" for both passes
    # (and indexed the first result set in the second pass), so the best-rated
    # section was never read.  "best_movies" is assumed to be the id of that
    # section - confirm against a saved page.
    return (_section_movies(soup1, "recent_movies")
            + _section_movies(soup1, "best_movies"))


def extract_source(source_code):
    '''Parse one saved page into a structured record.

    Inputs:
        source_code : the web page source code

    Returns:
        A dict with keys c_name, e_name, star, birthdate, birthplace, db_id,
        intro and movie.  e_name is always ''; db_id holds the imdb number
        found on the page (main() later overwrites it with the Douban id).
    '''
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one if results must be reproducible across machines.
    soup = BeautifulSoup(source_code)
    c_name = name_extract(soup)
    star, birthdate, birthplace, imdb_id = basic_info_extract(soup)
    intro = intro_extract(soup)
    movies = movies_extract(soup)
    return {
        "c_name": c_name,
        "e_name": '',
        "star": star,
        "birthdate": birthdate,
        "birthplace": birthplace,
        "db_id": imdb_id,
        "intro": intro,
        "movie": movies,
    }


def del_dup(list1):
    '''Drop movies whose 'douban' id was already seen, keeping first occurrences.

    Inputs:
        list1 : list of dicts, each with a 'douban' key

    Returns:
        A new list in the original order without repeated 'douban' values.
    '''
    res = []
    seen = set()
    for movie in list1:
        if movie['douban'] not in seen:
            seen.add(movie['douban'])
            res.append(movie)
    return res


def main():
    '''Build ``final_result`` from ``data/actor_info1`` and the saved pages.

    Records with an empty name, with no matching page file under ``data/``,
    or with fewer than four tab-separated fields are skipped.
    '''
    path = "data/"
    file_list = set(os.listdir(path))
    with open("data/actor_info1", 'r') as f:
        for line in f:
            all_info = line.split('\t')
            if len(all_info) < 4:
                # Blank or truncated line: nothing to merge.
                continue

            c_name = all_info[0]
            e_name = all_info[1].replace('english_name-', '')
            douban_id = all_info[2].replace('douban_id-', '')
            if c_name == "" or douban_id not in file_list:
                continue

            master_work = all_info[3].replace('master_word-', '').replace('\n', '')
            master_work_list = []
            for entry in master_work.split('|x02'):
                if entry == "":
                    continue
                movie_id, _, movie_name = entry.partition('|x01')
                master_work_list.append({"name": movie_name, "douban": movie_id})

            # Close the page file promptly; the original leaked the handle.
            with open(path + douban_id, 'r') as page:
                source = page.read()

            final_info = extract_source(source)
            # Values from the record take precedence over the scraped ones.
            final_info["c_name"] = c_name
            final_info["e_name"] = e_name
            final_info["db_id"] = douban_id
            final_info["movie"] = del_dup(final_info["movie"] + master_work_list)

            with open("final_result", 'a') as f1:
                f1.write(json.dumps(final_info, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    # main() appends, so remove output left by a previous run.  Check the same
    # (working-directory) path that is removed and later written.
    if os.path.exists('final_result'):
        os.remove('final_result')

    print("%s start" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    main()
    print("%s end" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))