├── .gitattributes ├── README.md ├── coursera_deeplearning └── 第一课第二周编程作业 │ ├── assignment2_1.ipynb │ ├── assignment2_2.ipynb │ └── lr_utils.py ├── image ├── 1.png ├── 2.png └── 3.png ├── kaggle手写数字识别 ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── workspace.xml │ └── xgb_learn.iml ├── .ipynb_checkpoints │ └── xgboost_model_reload-checkpoint.ipynb ├── image │ ├── kaggle手写数字比赛.png │ ├── kaggle排名及得分.png │ ├── xgb_importance.png │ ├── xgb_tree.png │ └── 迭代次数及时间.png ├── model │ ├── dump.raw.txt │ └── 迭代次数及时间.png ├── submission_xgb_MultiSoftmax.csv ├── xgb_diginum.py └── xgboost_model_reload.ipynb ├── matplot_test.ipynb ├── numpy ├── np_test.py ├── np_test2.py ├── test.py └── 线下门店服务器安装部署手册.docx - 快捷方式.lnk ├── tensorflow ├── index.py ├── test2.py └── test3.py ├── tf衣服图片识别率提升 ├── CNN_digit.py ├── README.md ├── Text_classifier_v1.py └── Text_classifier_v2.py ├── 泰坦尼克生存预测案例 ├── Titanic.ipynb ├── test.csv └── train.csv ├── 蝴蝶花(iris)分类案例 ├── Iris.ipynb ├── Iris.py ├── Iris3.ipynb ├── IrisFishData.csv ├── Iris_clean.csv ├── iris2.ipynb └── iris2.py ├── 讯飞CTR预测 ├── DC讯飞比赛(EDA & Baseline ).html ├── RXY初版 │ ├── check_column.ipynb │ ├── feature_extract.ipynb │ ├── feature_extract_test.ipynb │ ├── feature_re_extract.ipynb │ ├── lambda_test.ipynb │ └── pandas_test.ipynb ├── digi_onehot.py ├── one_hot_test.py ├── 川哥版 │ ├── _1_extract_features.py │ ├── _2_train.py │ └── utils.py └── 鱼神大佬 │ └── kdxf_baseline.py └── 阿里天池o2o新人赛 ├── o2o_wepe_zhen.ipynb ├── wepe_o2o.ipynb └── xgb.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alibaba O2O Newcomer Competition 2 | 3 | Ranked 50th out of 3,800+ participating teams. 4 | 5 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/image/1.png) 6 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/image/2.png) 7 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/image/3.png) 8 | 9 | GPU acceleration is recommended; training runs much faster with it. 10 | 11 | # Kaggle Handwritten Digit Recognition 12 | 13 | ## 1. Competition page 14 | 15 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/kaggle%E6%89%8B%E5%86%99%E6%95%B0%E5%AD%97%E8%AF%86%E5%88%AB/image/kaggle%E6%89%8B%E5%86%99%E6%95%B0%E5%AD%97%E6%AF%94%E8%B5%9B.png) 16 | 17 | ## 2. Iteration count and training time 18 | 19 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/kaggle%E6%89%8B%E5%86%99%E6%95%B0%E5%AD%97%E8%AF%86%E5%88%AB/image/%E8%BF%AD%E4%BB%A3%E6%AC%A1%E6%95%B0%E5%8F%8A%E6%97%B6%E9%97%B4.png) 20 | 21 | ## 3. XGBoost feature-importance plot 22 | 23 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/kaggle%E6%89%8B%E5%86%99%E6%95%B0%E5%AD%97%E8%AF%86%E5%88%AB/image/xgb_importance.png) 24 | 25 | ## 4. Ranking and score 26 | 27 | ![Image text](https://github.com/naginoasukara/Data-mining/blob/master/kaggle%E6%89%8B%E5%86%99%E6%95%B0%E5%AD%97%E8%AF%86%E5%88%AB/image/kaggle%E6%8E%92%E5%90%8D%E5%8F%8A%E5%BE%97%E5%88%86.png) 28 | 29 | # Improving TF clothing-image recognition accuracy 30 | The network in the official TensorFlow tutorial flattens each image into one dimension up front and then passes it through two fully connected layers. 31 | 32 | My improvement first reshapes the data to four dimensions, [batch, h, w, c], then applies two convolutional layers (with max-pooling between them), followed by Dense 256, Dropout 0.2, Dense 128, Dense 64, Dropout 0.2, and a final Dense 10 (see CNN_digit.py). 33 | 34 | This lifts test accuracy from 0.86 to 0.91. 35 |
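For the clothing-CNN section above, a condensed Keras sketch of the described stack may help; the layer sizes are taken from tf衣服图片识别率提升/CNN_digit.py further down in this repo, so treat this as an outline rather than a tuned model:

```python
from tensorflow import keras

# Conv -> pool -> Conv front end over [batch, 28, 28, 1] inputs,
# then the dense tower described above, ending in 10 class logits.
model = keras.Sequential([
    keras.layers.Conv2D(32, (3, 3), activation='relu', padding='SAME',
                        input_shape=(28, 28, 1)),
    keras.layers.MaxPool2D(2),
    keras.layers.Conv2D(32, (2, 2), activation='relu', padding='SAME'),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(10),  # logits; pair with a from_logits=True loss
])
```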
-------------------------------------------------------------------------------- /coursera_deeplearning/第一课第二周编程作业/lr_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import h5py 3 | 4 | 5 | def load_dataset(): 6 | train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r") 7 | train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features 8 | train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels 9 | 10 | test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r") 11 | test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features 12 | test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels 13 | 14 | classes = np.array(test_dataset["list_classes"][:]) # the list of classes 15 | 16 | train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0])) 17 | test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0])) 18 | 19 | return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
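A minimal usage sketch for `load_dataset`, assuming the `datasets/` folder with the two `.h5` files sits next to the calling script; the shapes shown are for the standard Coursera cat/non-cat files:

```python
from lr_utils import load_dataset

train_x, train_y, test_x, test_y, classes = load_dataset()
print(train_x.shape)  # (209, 64, 64, 3): RGB training images
print(train_y.shape)  # (1, 209): labels reshaped to a row vector above
print(classes)        # [b'non-cat' b'cat']
```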
-------------------------------------------------------------------------------- /image/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/image/1.png -------------------------------------------------------------------------------- /image/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/image/2.png -------------------------------------------------------------------------------- /image/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/image/3.png -------------------------------------------------------------------------------- /kaggle手写数字识别/.idea/misc.xml: -------------------------------------------------------------------------------- (PyCharm project metadata; the XML tags were stripped to bare line numbers during text extraction and nothing recoverable remains) -------------------------------------------------------------------------------- /kaggle手写数字识别/.idea/modules.xml: -------------------------------------------------------------------------------- (stripped IDE XML, as above) -------------------------------------------------------------------------------- /kaggle手写数字识别/.idea/workspace.xml: -------------------------------------------------------------------------------- (stripped IDE XML; only a changelist timestamp, 1531275630108, survived extraction) -------------------------------------------------------------------------------- /kaggle手写数字识别/.idea/xgb_learn.iml: -------------------------------------------------------------------------------- (stripped IDE XML, as above) -------------------------------------------------------------------------------- /kaggle手写数字识别/image/kaggle手写数字比赛.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/kaggle手写数字比赛.png -------------------------------------------------------------------------------- /kaggle手写数字识别/image/kaggle排名及得分.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/kaggle排名及得分.png -------------------------------------------------------------------------------- /kaggle手写数字识别/image/xgb_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/xgb_importance.png -------------------------------------------------------------------------------- /kaggle手写数字识别/image/xgb_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/xgb_tree.png -------------------------------------------------------------------------------- /kaggle手写数字识别/image/迭代次数及时间.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/迭代次数及时间.png -------------------------------------------------------------------------------- /kaggle手写数字识别/model/dump.raw.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/model/dump.raw.txt -------------------------------------------------------------------------------- /kaggle手写数字识别/model/迭代次数及时间.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/model/迭代次数及时间.png -------------------------------------------------------------------------------- /kaggle手写数字识别/xgb_diginum.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | import pandas as pd 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | now = time.time() 9 | 10 | dataset = pd.read_csv("data/train.csv") 11 | 12 | train = dataset.iloc[:,1:].values 13 | labels = dataset.iloc[:,:1].values 14 | 15 | tests = pd.read_csv("data/test.csv") 16 | #test_id = range(len(tests)) 17 | test = tests.iloc[:,:].values 18 | 19 | 20 | params={ 21 | 'booster':'gbtree',
22 | # the digits here run 0-9, a multi-class problem, so the multi:softmax objective is used 23 | 'objective': 'multi:softmax', 24 | 'num_class':10, # number of classes; used together with multi:softmax 25 | 'gamma':0.05, # minimum loss reduction needed to make a further split on a leaf node; larger values make the model more conservative [0,inf) 26 | 'max_depth':12, # maximum depth of each tree [1,inf) 27 | #'lambda':450, # L2 regularization weight 28 | 'subsample':0.4, # fraction of training instances sampled per tree, here 40% chosen at random (0,1] 29 | 'colsample_bytree':0.7, # column subsampling ratio when building each tree (0,1] 30 | #'min_child_weight':12, # minimum sum of instance weight needed in a child node 31 | 'silent':1, 32 | 'eta': 0.005, # learning rate (shrinkage) 33 | 'seed':710, 34 | 'nthread':4, # number of CPU threads; adjust to your machine's core count 35 | } 36 | 37 | plst = list(params.items()) 38 | 39 | # Hold out the tail of the training set for early stopping. 40 | offset = 35000 # the Kaggle digit-recognizer training file has 42,000 rows: the first 35,000 train, the remaining 7,000 validate 41 | 42 | num_rounds = 30000 # number of boosting rounds 43 | xgtest = xgb.DMatrix(test) 44 | 45 | # split into training and validation sets 46 | xgtrain = xgb.DMatrix(train[:offset,:], label=labels[:offset]) 47 | xgval = xgb.DMatrix(train[offset:,:], label=labels[offset:]) 48 | 49 | # report training and validation error as the rounds proceed 50 | watchlist = [(xgtrain, 'train'),(xgval, 'val')] 51 | 52 | 53 | # training model 54 | # with a large num_rounds, early_stopping_rounds halts training once the validation metric fails to improve for that many rounds 55 | model = xgb.train(plst, xgtrain, num_rounds, watchlist,early_stopping_rounds=100) 56 | # save the trained model 57 | model.save_model('model/xgb.model') 58 | # dump the model to a readable text file 59 | model.dump_model('model/0p-dump.raw.txt') 60 | 61 | preds = model.predict(xgtest,ntree_limit=model.best_ntree_limit) 62 | 63 | 64 | # write the predictions to a submission file 65 | np.savetxt('submission_xgb_MultiSoftmax.csv',np.c_[range(1,len(test)+1),preds], 66 | delimiter=',',header='ImageId,Label',comments='',fmt='%d') 67 | 68 | 69 | cost_time = time.time()-now 70 | print("end ......",'\n',"cost time:",cost_time,"(s)......") 71 | # plot feature importance 72 | xgb.plot_importance(model) 73 | #xgb.plot_tree(model, num_trees=2) 74 | plt.show() # display the importance plot when running as a script
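Since the script saves the booster to `model/xgb.model`, it can later be reloaded without retraining; a sketch of what `xgboost_model_reload.ipynb` in this folder presumably does, reusing the paths from the script above:

```python
import xgboost as xgb
import pandas as pd

# Load the previously trained booster from disk.
bst = xgb.Booster(model_file='model/xgb.model')

# Score the Kaggle test set with the restored model.
test = pd.read_csv("data/test.csv").values
preds = bst.predict(xgb.DMatrix(test))
```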
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl8FfW5x/HPA4EEwpZACJEAAWQX2cKiVut1X3pLXeuG\niChqta33Wlu73Nba1uq11WoXlwqyKe5W6tVa3GqtsiRsshNZw5IFMIQlIctz/8hgUxpISHJylnzf\nr9d5nTkzvznzMJx8M/md38yYuyMiIrGrRbgLEBGR0FLQi4jEOAW9iEiMU9CLiMQ4Bb2ISIxT0IuI\nxDgFvYhIjFPQi4jEOAW9iEiMiwt3AQBdunTxjIyMcJchIhJVsrOzC909pbZ2ERH0GRkZZGVlhbsM\nEZGoYmab69JOXTciIjFOQS8iEuMU9CIiMU5BLyIS4xT0IiIxTkEvIhLjFPQiIjFOQS8iEiaPvbue\nVdv3hnw7EXHClIhIc/Ni1lYenreOkrIKBp/QIaTb0hG9iEgTW577OT/60wpOO7Ez/31u/5BvT0Ev\nItKEdu0r5dZZ2aS0i+e3V48krmXoY7jWLZhZgpktNLNlZrbSzH4azJ9uZhvNbGnwGB7MNzN7zMxy\nzGy5mY0M9T9CRCQalFdU8s05Syjcf4gnrhtFcmLrJtluXfroS4Gz3H2fmbUCPjKzt4Jld7v7y0e0\nvxDoFzzGAo8HzyIizdpDb6/l48928asrhjE0vWOTbbfWI3qvsi942Sp4+DFWGQ/MDNabD3Qys7SG\nlyoiEr3eWL6dJz/cwIRxvbh8VHqTbrtOnUNm1tLMlgL5wDx3XxAs+kXQPfOImcUH87oDW6utnhvM\nO/I9p5hZlpllFRQUNOCfICIS2dbuLOa7Ly9nVK8k/ucrg5t8+3UKenevcPfhQDowxsxOAr4PDARG\nA8nA94LmVtNb1PCeT7l7prtnpqTUet18EZGoVHSwjFtmZZEYH8cfrh1J67imHwNzXFt098+BD4AL\n3H1H0D1TCjwDjAma5QI9qq2WDmxvhFpFRKJKZaXz3y8sJXfPQR6/diSpHRLCUkddRt2kmFmnYLoN\ncA6w5nC/u5kZ8DVgRbDKXOD6YPTNOKDI3XeEpHoRkQj22HvreXdNPj/+z8FkZiSHrY66jLpJA2aY\nWUuqfjG86O5vmNl7ZpZCVVfNUuDWoP2bwEVADnAAmNT4ZYuIRLZ3V+fxm3fWc+nI7kwY1yustdQa\n9O6+HBhRw/yzjtLegdsbXpqISHTaWLifO19YypATOnD/JUOp6vgIH50ZKyLSiPaXlnPrrGziWhhP\nXDeKhFYtw12SLmomItJY3J3vvbKc9fnFzLhxDD2S24a7JEBH9CIijebpv2/kjeU7uPv8gZzeL3KG\njSvoRUQawcc5hfzyrdVceFI3bv1yn3CX8y8U9CIiDbTt84PcMWcJfVLa8dAVw8L+5euRFPQiIg1Q\nUlbBbbOzKSuv5MkJo2gXH3lffUZeRSIiUcLd+fHrK1ieW8RTE0bRN6VduEuqkY7oRUTq6bmFW3gx\nK5dvnnUi5w3pFu5yjkpBLyJSDws27OLeuSs5c0AKd54T+tsBNoSCXkTkOG3ZdYBbZ2fTI7ktj141\ngpYtIuvL1yMp6EVEjsPekjImz1hEpcPUiaPp2KZVuEuqlYJeRKSOyisq+eZzS9hYuJ/HrxtJ7y6J\n4S6pTjTqRkSkju5/cw1/W1fALy45iVP7dgl3OXWmI3oRkTp4bsEWpv1jI5NOy+DaseG97PDxUtCL\niNTi488K+fHrK/hy/xR+eNGgcJdz3BT0IiLHsKlwP7fNXkxGl0R+e80I4lpGX2xGX8UiIk2k6GDV\nCJsWBlMnZtIhIfJH2NREX8aKiNSgvKKSO55bzJbdB5g9eSy9OkfHCJua1OXm4AlmttDMlpnZSjP7\naTC/t5ktMLP1ZvaCmbUO5scHr3OC5Rmh/SeIiDS+n//fav6+vpCff+0kxvbpHO5yGqQuXTelwFnu\nPgwYDlxgZuOAB4FH3L0fsAeYHLSfDOxx9xOBR4J2IiJRY9b8zUz/eBM3n96br4/uGe5yGqzWoPcq\n+4KXrYKHA2cBLwfzZwBfC6bHB68Jlp9tkXZxZhGRo/hofSH3zl3JWQO7cs+F0TfCpiZ1+jLWzFqa\n2VIgH5gHfAZ87u7lQZNcoHsw3R3YChAsLwKi++8eEWkWNhTs4xvPZnNiSjsevWp4xF/Dpq7qFPTu\nXuHuw4F0YAxQ0685D55r2jN+5Awzm2JmWWaWVVBQUNd6RURCouhAGZNnZNGqZQuenphJ+ygdYVOT\n4xpe6e6fAx8A44BOZnZ41E46sD2YzgV6AATLOwK7a3ivp9w9090zU1Ii5ya6ItL8lFVU8o3nstm2\n5yBPTBhFj+S24S6pUdVl1E2KmXUKptsA5wCrgfeBy4NmE4HXg+m5wWuC5e+5+78d0YuIRAJ35965\nK/lHzi7uv3QoozOSw11So6vLOPo0YIaZtaTqF8OL7v6Gma0CnjeznwNLgKlB+6nALDPLoepI/qoQ\n1C0i0ihmfrKZZxds4dYv9+XyUenhLickag16d18OjKhh/gaq+uuPnF8CXNEo1YmIhNCH6wr46Z9X\ncu7gVL57/oBwlxMyugSCiDRLa3bu5fZnFzOgWwd+8/XhtIiRETY1UdCLSLOz/fOD3DBtEW3jW/L0\nxEwS42P7ajAKehFpVooOlnHDMwvZX1rO9Elj6N6pTbhLCrnY/jUmIlJNaXkFU2ZmsbFwPzMmjWFQ\nWodwl9QkFPQi0ixUVjp3vbiMBRt38+hVwzn1xOi5FWBDqetGRJqFX761mjeW7+CeCwcyfnj32leI\nIQp6EYl50z7ayB//vpGJp/TiljP6hLucJqegF5GY9uanO/jZ/63i/CGp/Pg/h9AcL6aroBeRmLVw\n427ufGEpI3sm8ehVI2LmapTHS0EvIjEpJ7+Ym2dmkZ7UhqevzyShVctwlxQ2CnoRiTl5e0uYOG0R\nrVq2YMakMSQltg53SWGloBeRmFJcUsYNzyzi8wOHmD5pdMxdcrg+NI5eRGLGofJKbpu9mPV5xUy9\nYTQnde8Y7pIigoJeRGKCu3PPK8v5KKeQhy4/mS/31w2NDlPXjYjEhIfeXsurS7Zx17n9uSKzR7jL\niSgKehGJerPmb+YPH3zG1WN6csdZJ4a7nIijoBeRqPbXlTv5yesrOGdQV342vnmeEFUbBb2IRK3s\nzXv45pwlDE3vxGNXjyCupSKtJnW5OXgPM3vfzFab2Uoz+3Yw/14z22ZmS4PHRdXW+b6Z5ZjZWjM7\nP5T/ABFpntbnFXPTjEWkdUxg2sRM2rbW2JKjqcueKQfucvfFZtYeyDazecGyR9z9V9Ubm9lgqm4I\nPgQ4AXjHzPq7e0VjFi4izdeWXQe49ukFVSdE3TiGzu3iw11SRKv1iN7dd7j74mC6GFgNHOsan+OB\n59291N03AjnUcBNxEZH62FlUwjVPz+dQRSWzbxpLr86J
4S4p4h1Xh5aZZQAjgAXBrDvMbLmZTTOz\npGBed2BrtdVyOfYvBhGROtm1r5Rrn57P5wfKmDFpDP1T24e7pKhQ56A3s3bAK8Cd7r4XeBzoCwwH\ndgC/Pty0htW9hvebYmZZZpZVUFBw3IWLSPOyt6SM66ctJHfPQaZOzGRYj07hLilq1CnozawVVSH/\nrLu/CuDuee5e4e6VwB/5Z/dMLlD9bIV0YPuR7+nuT7l7prtnpqToDDYROboDh8q58ZlFrMsr5okJ\noxjbp3O4S4oqdRl1Y8BUYLW7P1xtflq1ZpcAK4LpucBVZhZvZr2BfsDCxitZRJqT0vIKbpmVzeIt\ne/jN10fwHwO6hrukqFOXUTenAROAT81saTDvB8DVZjacqm6ZTcAtAO6+0sxeBFZRNWLndo24EZH6\nKK+o5FtzlvD39YX87+Unc/HJabWvJP+m1qB394+oud/9zWOs8wvgFw2oS0SaucpK57svL+ftlXn8\n+CuDuVLXr6k3nUYmIhHH3bn3zyt5dck2/vvc/tz4pd7hLimqKehFJOI89PZaZn6ymSln9OGbukhZ\ngynoRSSi/OGDnC+uRPn9CwfqImWNQEEvIhFj1ieb+N+/rOWrw07g5187SSHfSBT0IhIRXl2cy/+8\nvpJzBnXl11cOo2ULhXxjUdCLSNj9ZcVO7n55Oaf27czvrhlJK11uuFFpb4pIWH24roBvzVnCyekd\n+eP1mSS0ahnukmKOgl5EwiZr026mzMqiT0oi028YQ2K8rikfCgp6EQmLJVv2MOmZRaR1bMOsyWPp\n2LZVuEuKWQp6EWlyi7fs4fqpC0lKbM2zN40lpb1uHBJKCnoRaVLZm3dz/dSFJLdrzfNTxnFCpzbh\nLinmqUNMRJpM1qbdTJy2kK4dEphz8zi6dUwId0nNgo7oRaRJLNy4m+unLSRVId/kdEQvIiG3YMMu\nJk1fRLeOCTx/8zi6dlDINyUd0YtISH3y2S5ueGYRaR0TeH6KQj4cdEQvIiHzcU4hN85YRI+ktjx3\n8ziNrgkTBb2IhMQ/cgqZPGMRPZOrQr5LO4V8uKjrRkQa3UfrC7lx+iIyOicyRyEfdgp6EWlUH64r\nYPKMRfTuksizN42ls0I+7GoNejPrYWbvm9lqM1tpZt8O5ieb2TwzWx88JwXzzcweM7McM1tuZiND\n/Y8Qkcjwwdp8bpqZRd+Udjx38ziFfISoyxF9OXCXuw8CxgG3m9lg4B7gXXfvB7wbvAa4EOgXPKYA\njzd61SIScd5fk8+Umdn069qOZ28aS3Ji63CXJIFag97dd7j74mC6GFgNdAfGAzOCZjOArwXT44GZ\nXmU+0MnM0hq9chGJGO+tyeOWWdn071YV8kkK+YhyXH30ZpYBjAAWAKnuvgOqfhkAXYNm3YGt1VbL\nDeYd+V5TzCzLzLIKCgqOv3IRiQjvrKoK+YFp7Xl28jg6tVXIR5o6B72ZtQNeAe50973HalrDPP+3\nGe5PuXumu2empKTUtQwRiSDzVuVx27PZDE7roEsNR7A6Bb2ZtaIq5J9191eD2XmHu2SC5/xgfi7Q\no9rq6cD2xilXRCLF60u3cdvsbIac0JFZN42lYxuFfKSqy6gbA6YCq9394WqL5gITg+mJwOvV5l8f\njL4ZBxQd7uIRkdgw65NN3PnCUkb1SmLW5DF0SFDIR7K6nBl7GjAB+NTMlgbzfgA8ALxoZpOBLcAV\nwbI3gYuAHOAAMKlRKxaRsHF3Hns3h0feWcc5g1L53TUjdI/XKFBr0Lv7R9Tc7w5wdg3tHbi9gXWJ\nSISprHTue2MV0z/exGUj03nwsqHEtdQ5l9FA17oRkVqVVVRy90vL+NPS7dz0pd784KJBtGhxtOM/\niTQKehE5poOHKrj9ucW8tyafu88fwDfO7EvVV3cSLRT0InJURQfLuGnGIrI27+H+S4Zyzdie4S5J\n6kFBLyI1yi8u4fqpC/msYB+/v2YkFw3VCe7RSkEvIv9my64DXDd1AYX7Spl2w2hO76eTGqOZgl5E\n/sWanXuZMHUhZRWVPHvTWEb0TAp3SdJACnoR+UL25t1MemYRbVvH8dwtp9AvtX24S5JGoKAXEQDe\nX5vPbbOzSevYhlmTx5Ce1DbcJUkjUdCLCK8v3cZdLy5jQLf2zLhxjG79F2MU9CLN3MxPNvGTuSsZ\nk5HM0xMzaa/r1sQcBb1IM6Xr1jQfCnqRZuhQeSU/fO1TXsrO1XVrmgEFvUgzU3SgjFtnZ/PJhl18\n6+x+/Nc5/XRJgxinoBdpRjbv2s+k6YvYuvsAD185jEtHpoe7JGkCCnqRZiJr026mzMqm0p3Zk8cy\ntk/ncJckTURBL9IMvL50G3e/tJzuSW2YdsNoendJDHdJ0oQU9CIxrPrImjG9k3nyulEkJbYOd1nS\nxBT0IjGqtLyC77/yKa8u2calI7vzy0uHEh+n4ZPNUV1uDj7NzPLNbEW1efea2TYzWxo8Lqq27Ptm\nlmNma83s/FAVLiJHt2f/ISY8vZBXl2zjrnP78+srhinkm7G6HNFPB34HzDxi/iPu/qvqM8xsMHAV\nMAQ4AXjHzPq7e0Uj1CoidbChYB83Tl/E9qISHr1qOOOHdw93SRJmtR7Ru/uHwO46vt944Hl3L3X3\njUAOMKYB9YnIcZi/YReXPv4xe0vKmXPzWIW8AHUI+mO4w8yWB107hy9Y3R3YWq1NbjBPRELslexc\nJkxdQOfE1rz2jVMZ1Ss53CVJhKhv0D8O9AWGAzuAXwfzazq9zmt6AzObYmZZZpZVUFBQzzJExN15\n+K9rueulZYzOSObV206jV2cNn5R/qlfQu3ueu1e4eyXwR/7ZPZML9KjWNB3YfpT3eMrdM909MyVF\ntykTqY+Ssgq+9fxSHnsvhysz05k+aQwd2+rqk/Kv6hX0Zlb9LsGXAIdH5MwFrjKzeDPrDfQDFjas\nRBGpSX5xCdc+vYA/L9vOdy8YwIOXnUzrOF2YTP5draNuzGwOcCbQxcxygZ8AZ5rZcKq6ZTYBtwC4\n+0ozexFYBZQDt2vEjUjjy968m9tmL2ZvSRm/v2YkF5+cVvtK0myZe41d6E0qMzPTs7Kywl2GSMRz\nd2bN38x9f15F96Q2PHHdKAaldQh3WRImZpbt7pm1tdOZsSJR4uChCn74WtWZrmcN7MojXx9Oxzbq\nj5faKehFosCWXQe4ZXY2a3bu5b/O6c83zzqRFi10DXmpGwW9SIT7YG0+335+Ke7OtImj+Y+BXcNd\nkkQZBb1IhKqsdH7/fg4Pv7OOAanteXLCKI2Pl3pR0ItEoKKDZdz14lLeWZ3PJSO6c/8lQ2nTWhcl\nk/pR0ItEmLU7i7llVha5ew7y068O4fpTeumertIgCnqRCPLnZdv57svLaZcQx5wp4xidoevVSMMp\n6EUiQFlFJQ+8tYapH21kdEYSv79mJF07JIS7LIkRCnqRMCsoLuWO5xazYONubjg1gx9ePIhWLXUp\nA2k8CnqRMFq
8ZQ+3zc6m6GAZj3x9GJeMSA93SRKDFPQiYVBZ6Tz90QYeenstaR3b8OptYxh8gi5l\nIKGhoBdpYnl7S7jrxWV8lFPIBUO68eBlJ+vSwhJSCnqRJjRvVR7ffXkZJWWVPHDpUL4+uoeGTkrI\nKehFmkBJWQW/+L/VzJq/mSEndODRq0ZwYtd24S5LmgkFvUiIrd6xl2/NWcL6/H3cfHpvvnP+AOLj\ndJarNB0FvUiIuDvTP97EL99aQ8c2rZg1eQyn99NtM6XpKehFQqBwXynfeWkZH6wt4JxBXXnwspPp\n3C4+3GVJM6WgF2lkH6zN5zsvLaO4pJyfjR/CdeN0rRoJr1pPvzOzaWaWb2Yrqs1LNrN5ZrY+eE4K\n5puZPWZmOWa23MxGhrJ4kUhSUlbBfX9exQ3PLKJzYjxz7/gSE07JUMhL2NXlPOvpwAVHzLsHeNfd\n+wHvBq8BLgT6BY8pwOONU6ZIZFufV8wlf/iYaf/YyA2nZvD6HacxoFv7cJclAtSh68bdPzSzjCNm\njwfODKZnAB8A3wvmz/SqO47PN7NOZpbm7jsaq2CRSOLuPLtgCz97YxXt4uOYdkMmZw1MDXdZIv+i\nvn30qYfD2913mNnhe5t1B7ZWa5cbzFPQS8wpKC7lB699yrxVeZzRP4VfXXEyXdvripMSeRr7y9ia\nOiO9xoZmU6jq3qFnz56NXIZI6Lg7ry7exn1vrOJgWQU/ungQN57WWzfrlohV36DPO9wlY2ZpQH4w\nPxfoUa1dOrC9pjdw96eApwAyMzNr/GUgEmly9xzgB6+t4MN1BWT2SuKBy07WGa4S8eob9HOBicAD\nwfPr1ebfYWbPA2OBIvXPSyyorHRmL9jMg2+twYGffnUIE8b10lG8RIVag97M5lD1xWsXM8sFfkJV\nwL9oZpOBLcAVQfM3gYuAHOAAMCkENYs0qc8K9nHPK8tZtGkPZ/RP4f5LTiI9qW24yxKps7qMurn6\nKIvOrqGtA7c3tCiRSFBWUckf/76B37yznjatWvKrK4Zx2cjuGhcvUUdnxorUYMW2Ir73ynJWbt/L\nRUO7ce9Xh2hEjUQtBb1INSVlFTz27nqe/HADSW1b88R1I7ngpLRwlyXSIAp6kUDWpt1895XlbCjY\nzxWj0vnRxYN15yeJCQp6afb2lZbz0F/WMHP+Zk7o2IaZN47hjP66nLDEDgW9NGt/W1fAD179lO1F\nB5l4SgZ3nz+AxHj9WEhs0SdamqWtuw9w/5ureWvFTvqmJPLSLaeQmZEc7rJEQkJBL83KgUPlPP7B\nZzz54QZamnHXuf25+Yw+JLTSrf0kdinopVlwd+Yu284Db61hR1EJ44efwD0XDiStY5twlyYScgp6\niXkrthVx79yVZG3ew0ndO/Dbq0eom0aaFQW9xKzCfaX86u21vJC1leS2rXnwsqFcPqoHLXV9Gmlm\nFPQScw6VVzLzk008+u56Dh6qYPJpvfnWOf3okKAx8dI8KeglpnywNp/73ljFhoL9nDkghf/5ymD6\npugywtK8KeglJmws3M/P3ljFe2vy6d0lUbf0E6lGQS9RrbikjN+9l8O0f2wkPq4lP7hoIDec2pvW\ncXW5771I86Cgl6hUWl7BnAVb+N37n1G4r5QrRqVz9wUDdIVJkRoo6CWqlFVU8lJWLr97bz3bi0oY\n0zuZqRMzGdajU7hLE4lYCnqJCuUVlby2ZBuPvbeerbsPMrJnJx66Yhin9u2sG4GI1EJBLxGtstL5\n8/LtPPrOejYU7uek7h2474aTOHNAigJepI4U9BKR3J23V+7k4XnrWJe3j4Hd2vPkhFGcNzhVAS9y\nnBoU9Ga2CSgGKoByd880s2TgBSAD2ARc6e57GlamNBfuzntr8nl43jpWbt9L35REfnv1CC4emkYL\nndEqUi+NcUT/H+5eWO31PcC77v6Amd0TvP5eI2xHYpi78/f1hTw8bx1Lt35Oz+S2PHzlMMYP765L\nFog0UCi6bsYDZwbTM4APUNDLMczfsIuH/7qOhZt2071TGx64dCiXjUqnVUuNhRdpDA0Negf+amYO\nPOnuTwGp7r4DwN13mFnXhhYpscfd+SinkCf+9hn/yNlFaod4fjZ+CFeO7kF8nK4NL9KYGhr0p7n7\n9iDM55nZmrquaGZTgCkAPXv2bGAZEi1KyiqYu2w70z7ayJqdxaS0j+dHFw/iunG9dPMPkRBpUNC7\n+/bgOd/MXgPGAHlmlhYczacB+UdZ9yngKYDMzExvSB0S+XbtK2X2/C3Mmr+Jwn2HGNitPb+6Yhj/\nOSxNR/AiIVbvoDezRKCFuxcH0+cB9wFzgYnAA8Hz641RqESn9XnFTP1oI68u2cah8krOGtiVm77U\nm1N0opNIk2nIEX0q8FrwwxoHPOfufzGzRcCLZjYZ2AJc0fAyJZoc7n9/+u8b+du6AuLjWnD5qHRu\nPK03J3bVJYNFmlq9g97dNwDDapi/Czi7IUVJdCopq2Du0u1M/Wgja/Oq+t+/c15/rhnbi+TE1uEu\nT6TZ0pmx0mCF+0qZPX8zs+dvVv+7SARS0Eu9uDvLcouYs2ALry1V/7tIJFPQy3HJ21vCa0u28XJ2\nLjn5+0hopf53kUinoJdalZRV8M7qPF7OzuXDdQVUOozqlcQDlw7lopPTdNNtkQinoJcauTvLc4t4\nOTuXucu2U3SwjLSOCdx2Zl8uG5lOH91wWyRqKOjlX+QXl/CnoGtmXd4+4uNacP6Qblw+Kp3TTuyi\nC4yJRCEFvVBaXsF7q/N5KTuXv60roKLSGdmzE/dfMpSLT06jYxt1zYhEMwV9M1VeUcmiTXt4a8UO\n5i7bzucHykjtEM+UM/pw2ch0fbEqEkMU9M3IvtJyPlxXwLxVeby3Jp+ig2W0jmvBeYNTuXxUOqf3\nS1HXjEgMUtDHuLy9Jcxblce8VXl88tkuDlVU0qltK84e1JVzB6VyRv8UEuP1MRCJZfoJjzHuztq8\nYuatzOOd1Xksyy0CoGdyWyac0otzB6eS2SuJON3UQ6TZUNDHgPKKShZu2s07q/KZt3onW3cfBGBY\nj07cff4Azh2cSr+u7XS2qkgzpaCPQu7Opl0HWLRxNx9/Vsj7awu+6G8/rW9nbvvyiZwzqCtdOySE\nu1QRiQAK+ihQUems2bmXRRt3s2jTHhZu2k1BcSkAyYmtOXtQV84bnMrp/dTfLiL/TqkQgUrLK/g0\nt4iFm3azaONusjbvobikHIATOiZwWt/OjO6dzJiMZPqmtKOFRsqIyDEo6CPA/tJyFm/Zw8KNu1m4\ncTdLt35OaXklACd2bcdXTj6BMb2TGJ2RTHpS2zBXKyLRRkHfhNydgn2lrNu5j7V5xazbWczqnXtZ\nuX0vFZVOC4OTunfkunG9GJ2RzOiMJDq3iw932SIS5RT0IfL5gUOsy/tnoK/NK2ZdXjGfHyj7ok3n\nxNb0T23PN87sy+iMZEb2SqKd+thFpJGFLFXM7ALgUaAl8LS7PxCqbYXT
vtJy1gchvi5vH+vyilm7\ns5j84MtSgPYJcQxIbc+FJ6UxILUd/bu1p39qe7roaF1EmkBIgt7MWgK/B84FcoFFZjbX3VeFYnuh\nUFZRSeG+UnYWlZC3t5T84hLy9paws+if03l7Syk6+M8j9IRWLeif2p4z+qfQP7Ud/VPbM6Bbe7p1\nSNAYdhEJm1Ad0Y8BcoIbiGNmzwPjgSYJ+opKp6SsgtLySkrKKoJHJaXlVc8l5RWUBq/3l1YEwV1K\n/t4SdgYBvmt/Ke7/+r5xLYyu7ePp2iGB3l0SOaVPZ1I7JtCva3v6p7ajR1JbjYARkYgTqqDvDmyt\n9joXGNvYG/lgbT4/e2PVFyFeGoR4WYXXvvIRurRrTdf2CXTrmMDJ6R2/mE7tEE/X9gmkdkigc2Jr\nBbmIRJ1QBX1Nafgv6WtmU4ApAD179qzXRjq0acXAbh2Ib9WChFYtiY+rek6Ia0lCqxb/fN3q8OuW\nX7RNCKbbtm5J58R4Wsfp2i8iEptCFfS5QI9qr9OB7dUbuPtTwFMAmZmZx38IDozsmcTIa5PqW6OI\nSLMQqsPYRUA/M+ttZq2Bq4C5IdqWiIgcQ0iO6N293MzuAN6manjlNHdfGYptiYjIsYVsHL27vwm8\nGar3FxGNKOy7AAAFP0lEQVSRutE3kCIiMU5BLyIS4xT0IiIxTkEvIhLjFPQiIjHO/MgLuoSjCLMC\nYHM9V+8CFDZiOY0t0uuDyK9R9TWM6muYSK6vl7un1NYoIoK+Icwsy90zw13H0UR6fRD5Naq+hlF9\nDRPp9dWFum5ERGKcgl5EJMbFQtA/Fe4CahHp9UHk16j6Gkb1NUyk11erqO+jFxGRY4uFI3oRETmG\nqAl6M7vAzNaaWY6Z3VPD8ngzeyFYvsDMMpqwth5m9r6ZrTazlWb27RranGlmRWa2NHj8uKnqC7a/\nycw+DbadVcNyM7PHgv233MxGNmFtA6rtl6VmttfM7jyiTZPvPzObZmb5Zrai2rxkM5tnZuuD5xpv\niGBmE4M2681sYhPW95CZrQn+D18zs05HWfeYn4cQ1nevmW2r9v940VHWPebPewjre6FabZvMbOlR\n1g35/mtU7h7xD6oudfwZ0AdoDSwDBh/R5hvAE8H0VcALTVhfGjAymG4PrKuhvjOBN8K4DzcBXY6x\n/CLgLaruDjYOWBDG/+udVI0PDuv+A84ARgIrqs37X+CeYPoe4MEa1ksGNgTPScF0UhPVdx4QF0w/\nWFN9dfk8hLC+e4Hv1OEzcMyf91DVd8TyXwM/Dtf+a8xHtBzRf3GzcXc/BBy+2Xh144EZwfTLwNlm\n1iQ3eHX3He6+OJguBlZTdd/caDIemOlV5gOdzCwtDHWcDXzm7vU9ga7RuPuHwO4jZlf/nM0AvlbD\nqucD89x9t7vvAeYBFzRFfe7+V3cvD17Op+rubmFxlP1XF3X5eW+wY9UXZMeVwJzG3m44REvQ13Sz\n8SOD9Is2wQe9COjcJNVVE3QZjQAW1LD4FDNbZmZvmdmQJi2s6p69fzWz7OB+vUeqyz5uCldx9B+u\ncO6/w1LdfQdU/YIHutbQJlL25Y1U/ZVWk9o+D6F0R9C1NO0oXV+RsP9OB/Lcff1Rlodz/x23aAn6\nWm82Xsc2IWVm7YBXgDvdfe8RixdT1R0xDPgt8KemrA04zd1HAhcCt5vZGUcsj4T91xr4KvBSDYvD\nvf+ORyTsyx8C5cCzR2lS2+chVB4H+gLDgR1UdY8cKez7D7iaYx/Nh2v/1Uu0BH2tNxuv3sbM4oCO\n1O/Pxnoxs1ZUhfyz7v7qkcvdfa+77wum3wRamVmXpqrP3bcHz/nAa1T9eVxdXfZxqF0ILHb3vCMX\nhHv/VZN3uEsreM6voU1Y92Xw5e9XgGs96FA+Uh0+DyHh7nnuXuHulcAfj7LdcO+/OOBS4IWjtQnX\n/quvaAn6utxsfC5weHTD5cB7R/uQN7agP28qsNrdHz5Km26HvzMwszFU7ftdTVRfopm1PzxN1Rd2\nK45oNhe4Phh9Mw4oOtxF0YSOehQVzv13hOqfs4nA6zW0eRs4z8ySgq6J84J5IWdmFwDfA77q7geO\n0qYun4dQ1Vf9e59LjrLduvy8h9I5wBp3z61pYTj3X72F+9vguj6oGhWyjqpv438YzLuPqg80QAJV\nf/LnAAuBPk1Y25eo+tNyObA0eFwE3ArcGrS5A1hJ1QiC+cCpTVhfn2C7y4IaDu+/6vUZ8Ptg/34K\nZDbx/29bqoK7Y7V5Yd1/VP3S2QGUUXWUOZmq733eBdYHz8lB20zg6Wrr3hh8FnOASU1YXw5V/duH\nP4eHR6KdALx5rM9DE9U3K/h8LacqvNOOrC94/W8/701RXzB/+uHPXbW2Tb7/GvOhM2NFRGJctHTd\niIhIPSnoRURinIJeRCTGKehFRGKcgl5EJMYp6EVEYpyCXkQkxinoRURi3P8Dj787zxrXhFMAAAAA\nSUVORK5CYII=\n", 57 | "text/plain": [ 58 | "" 59 | ] 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | } 64 | ], 65 | "source": [ 66 | "plt.show()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 2, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | 
"execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.6.2" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /numpy/np_test.py: -------------------------------------------------------------------------------- 1 | #coding = utf-8 2 | import numpy as np 3 | 4 | 5 | a = np.array([[1, 2, 3],[4, 5, 6]], np.int32) 6 | print(a.shape) 7 | #print(a.flags) 8 | #print(a.data) 9 | #print(a.base) 10 | #print(a.item) 11 | print(a.tolist()) 12 | print(a.dumps()) 13 | 14 | b = np.arange(12).reshape(4, 3) 15 | print(b) 16 | print(a.reshape(3, 2)) 17 | 18 | print(np.int32) 19 | 20 | #index 21 | x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) 22 | print(x[1:7:2]) 23 | 24 | x = np.array([[[1],[2],[3]], [[4],[5],[6]]]) 25 | print(x.shape) 26 | print(x) 27 | 28 | x_part = np.array([[1], [2], [3]]) 29 | print(x_part.shape) 30 | -------------------------------------------------------------------------------- /numpy/np_test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | a = np.array([[56.0, 0.0, 4.4], 5 | [1.2, 104.0, 52.0]]) 6 | #print a 7 | 8 | cal = a.sum(axis=0) 9 | print cal 10 | 11 | percentage = 100*a/cal 12 | print percentage 13 | 14 | print np.array([[1,2],[3,4]])/np.array([1,2]) -------------------------------------------------------------------------------- /numpy/test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import numpy as np 3 | import sys 4 | 5 | 6 | def pythonsum(n): 7 | a = [i for i in range(n)] 8 | b = [i for i in range(n)] 9 | c = [] 10 | for i in range(len(a)): 11 | a[i] = i ** 2 12 | b[i] = i ** 3 13 | c.append(a[i]+b[i]) 14 | return c 15 | 16 | 17 | def numpysum(n): 18 | a = np.arange(n) ** 2 19 | b = np.arange(n) ** 3 20 | c = a + b 21 | return c 22 | 23 | 24 | if __name__ == '__main__': 25 | size = 1000 26 | start = datetime.now() 27 | c = pythonsum(size) 28 | delay = datetime.now() - start 29 | print("python运算次幂结果后三个:",c[-3:]) 30 | print("python运行时间:(毫秒)", delay.microseconds) 31 | 32 | start = datetime.now() 33 | c = numpysum(size) 34 | delay = datetime.now() - start 35 | print("numpy运算次幂结果后三个:",c[-3:]) 36 | print("numpy运行时间:(毫秒)", delay.microseconds) -------------------------------------------------------------------------------- /numpy/线下门店服务器安装部署手册.docx - 快捷方式.lnk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/numpy/线下门店服务器安装部署手册.docx - 快捷方式.lnk -------------------------------------------------------------------------------- /tensorflow/index.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import gzip 6 | import os 7 | import tempfile 8 | 9 | 
-------------------------------------------------------------------------------- /numpy/test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import numpy as np 3 | import sys 4 | 5 | 6 | def pythonsum(n): 7 | a = [i for i in range(n)] 8 | b = [i for i in range(n)] 9 | c = [] 10 | for i in range(len(a)): 11 | a[i] = i ** 2 12 | b[i] = i ** 3 13 | c.append(a[i]+b[i]) 14 | return c 15 | 16 | 17 | def numpysum(n): 18 | a = np.arange(n) ** 2 19 | b = np.arange(n) ** 3 20 | c = a + b 21 | return c 22 | 23 | 24 | if __name__ == '__main__': 25 | size = 1000 26 | start = datetime.now() 27 | c = pythonsum(size) 28 | delay = datetime.now() - start 29 | print("last three pure-Python power sums:",c[-3:]) 30 | print("pure-Python run time (microseconds):", delay.microseconds) 31 | 32 | start = datetime.now() 33 | c = numpysum(size) 34 | delay = datetime.now() - start 35 | print("last three NumPy power sums:",c[-3:]) 36 | print("NumPy run time (microseconds):", delay.microseconds) -------------------------------------------------------------------------------- /numpy/线下门店服务器安装部署手册.docx - 快捷方式.lnk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/numpy/线下门店服务器安装部署手册.docx - 快捷方式.lnk -------------------------------------------------------------------------------- /tensorflow/index.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import gzip 6 | import os 7 | import tempfile 8 | 9 | import numpy 10 | from six.moves import urllib 11 | from six.moves import xrange # pylint: disable=redefined-builtin 12 | import tensorflow as tf 13 | from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets 14 | from tensorflow.examples.tutorials.mnist import input_data 15 | 16 | 17 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 18 | x = tf.placeholder(tf.float32, [None, 784]) 19 | W = tf.Variable(tf.zeros([784,10])) 20 | b = tf.Variable(tf.zeros([10])) 21 | y = tf.nn.softmax(tf.matmul(x,W) + b) 22 | y_ = tf.placeholder("float", [None,10]) 23 | cross_entropy = -tf.reduce_sum(y_*tf.log(y)) 24 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) 25 | init = tf.initialize_all_variables() 26 | sess = tf.Session() 27 | sess.run(init) 28 | for i in range(1000): 29 | batch_xs, batch_ys = mnist.train.next_batch(100) 30 | sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) 31 | correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) 32 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 33 | print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
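One caveat about the script above: the hand-rolled `-tf.reduce_sum(y_*tf.log(y))` produces NaN as soon as a softmax output hits exactly 0. A numerically safer variant in the same TF1 style (a sketch, not a tested drop-in) fuses the softmax and the log into one op:

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.float32, [None, 10])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

logits = tf.matmul(x, W) + b  # raw scores; no explicit softmax here
# softmax + cross-entropy computed jointly, avoiding log(0)
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
```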
-------------------------------------------------------------------------------- /tensorflow/test2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | a = tf.add(3, 5) 5 | #sess = tf.Session() 6 | #print(sess.run(a)) 7 | #sess.close() 8 | with tf.Session() as sess: 9 | print(sess.run(a)) 10 | 11 | x = 2 12 | y = 3 13 | op1 = tf.add(x, y) 14 | op2 = tf.multiply(x, y) 15 | op3 = tf.pow(op2, op1) 16 | with tf.Session() as sess: 17 | op3 = sess.run(op3) 18 | print(op3) 19 | 20 | with tf.device('/CPU:0'): 21 | # once a GPU is set up, '/GPU:0' can be used here instead 22 | a = tf.constant([[1.0]], name='a') 23 | b = tf.constant([[1.0]], name='b') 24 | c = tf.matmul(a, b) 25 | 26 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) 27 | 28 | print(sess.run(c)) 29 | 30 | -------------------------------------------------------------------------------- /tensorflow/test3.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # helper that adds one layer and returns its output 5 | def add_layer(inputs, in_size, out_size, activation_function=None): 6 | # add one more layer and return the output of this layer 7 | Weights = tf.Variable(tf.random_normal([in_size, out_size])) 8 | biases = tf.Variable(tf.zeros([1, out_size]) + 0.1) 9 | Wx_plus_b = tf.matmul(inputs, Weights) + biases 10 | if activation_function is None: 11 | outputs = Wx_plus_b 12 | else: 13 | outputs = activation_function(Wx_plus_b) 14 | return outputs 15 | 16 | # 1. training data 17 | # Make up some real data 18 | x_data = np.linspace(-1,1,300)[:, np.newaxis] 19 | noise = np.random.normal(0, 0.05, x_data.shape) 20 | y_data = np.square(x_data) - 0.5 + noise 21 | 22 | # 2. placeholders that will receive the input data 23 | # define placeholder for inputs to network 24 | xs = tf.placeholder(tf.float32, [None, 1]) 25 | ys = tf.placeholder(tf.float32, [None, 1]) 26 | 27 | # 3. define the layers: one hidden layer and one output layer 28 | # add hidden layer: the input is xs and the hidden layer has 10 neurons 29 | l1 = add_layer(xs, 1, 10, activation_function=tf.nn.relu) 30 | # add output layer: the input is hidden layer l1 and the output layer emits 1 value 31 | prediction = add_layer(l1, 10, 1, activation_function=None) 32 | 33 | # 4. define the loss expression 34 | # the error between prediction and real data 35 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), 36 | reduction_indices=[1])) 37 | 38 | # 5. pick an optimizer that drives the loss down 39 | # this line chooses how the loss is reduced; the learning rate is 0.1 40 | train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss) 41 | 42 | 43 | # important step: initialize all variables 44 | init = tf.initialize_all_variables() 45 | sess = tf.Session() 46 | # nothing above has actually executed; computation only starts with sess.run 47 | sess.run(init) 48 | 49 | # train for 1000 iterations, calling sess.run on the optimizer each step 50 | for i in range(1000): 51 | # train_step and loss are built on placeholders, so the data must be fed in here 52 | sess.run(train_step, feed_dict={xs: x_data, ys: y_data}) 53 | if i % 50 == 0: 54 | # to see the step improvement 55 | print(sess.run(loss, feed_dict={xs: x_data, ys: y_data})) -------------------------------------------------------------------------------- /tf衣服图片识别率提升/CNN_digit.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | print(tf.__version__) 8 | fashion_mnist = keras.datasets.fashion_mnist 9 | (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data() 10 | x_train, x_test = np.squeeze(x_train), np.squeeze(x_test) 11 | x_train = x_train.reshape([60000, 28, 28, 1]) 12 | x_test = x_test.reshape([10000, 28, 28, 1]) 13 | 14 | class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 15 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'] 16 | 17 | # plt.figure() 18 | # plt.imshow(x_train[0]) 19 | # plt.show() 20 | 21 | model = keras.Sequential() 22 | model.add(keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', 23 | padding='SAME', input_shape=(28,28,1))) 24 | model.add(keras.layers.MaxPool2D(2)) 25 | model.add(keras.layers.Conv2D(filters=32, kernel_size=(2, 2), activation='relu', 26 | padding='SAME', input_shape=(28,28,1))) 27 | model.summary() 28 | model.add(keras.layers.Flatten()) 29 | model.add(keras.layers.Dense(256, activation='relu')) 30 | model.add(keras.layers.Dropout(0.2)) 31 | model.add(keras.layers.Dense(128, activation='relu')) 32 | model.add(keras.layers.Dense(64, activation='relu')) 33 | model.add(keras.layers.Dropout(0.2)) 34 | model.add(keras.layers.Dense(10)) 35 | model.summary() 36 | 37 | model.compile(optimizer='adam', 38 | loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 39 | metrics=['accuracy']) 40 | 41 | model.fit(x_train, y_train, epochs=10) 42 | test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2) 43 | 44 | print('\nTest accuracy:', test_acc) 45 | 46 |
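For comparison, the flat baseline that the README below measures the CNN against (this mirrors the official TensorFlow clothing-classification tutorial; a sketch under that assumption):

```python
from tensorflow import keras

# Tutorial-style baseline: flatten to 784 features, one hidden layer, logits.
baseline = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10),  # reaches ~0.86 test accuracy per the README
])
```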
-------------------------------------------------------------------------------- /tf衣服图片识别率提升/README.md: -------------------------------------------------------------------------------- 1 | # Improving TF clothing-image recognition accuracy 2 | The network in the official TensorFlow tutorial flattens each image into one dimension up front and then passes it through two fully connected layers. 3 | 4 | My improvement first reshapes the data to four dimensions, [batch, h, w, c], then applies two convolutional layers (with max-pooling between them), followed by Dense 256, Dropout 0.2, Dense 128, Dense 64, Dropout 0.2, and a final Dense 10 (see CNN_digit.py). 5 | 6 | This lifts test accuracy from 0.86 to 0.91. 7 | 8 | # TensorFlow text classification 9 | v1 first normalizes the token indices and feeds them through a few fully connected layers; the resulting accuracy is 0.5, i.e., random guessing. 10 | 11 | v2 replaces the fully connected front end with an embedding plus global average pooling, and accuracy rises to 0.87. In my view GAP's main contribution is cutting the parameter count of the following FC layer; it also aggregates global information and helps prevent overfitting. 12 |
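A quick parameter count makes the GAP claim concrete. With maxlen=256 and output_dim=16 (the values used in Text_classifier_v2.py below), flattening the embedding hands the next dense layer 256×16 = 4096 features, while global average pooling hands it only 16. A sketch of the comparison; the `input_length` argument is an assumption for Keras versions that still accept it:

```python
from tensorflow import keras

flat = keras.Sequential([
    keras.layers.Embedding(10000, 16, input_length=256),
    keras.layers.Flatten(),   # 256*16 = 4096 features out
    keras.layers.Dense(16),   # 4096*16 + 16 = 65,552 parameters
])
gap = keras.Sequential([
    keras.layers.Embedding(10000, 16, input_length=256),
    keras.layers.GlobalAveragePooling1D(),  # averages over time -> 16 features
    keras.layers.Dense(16),                 # 16*16 + 16 = 272 parameters
])
flat.summary()
gap.summary()
```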
-------------------------------------------------------------------------------- /tf衣服图片识别率提升/Text_classifier_v1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | 4 | 5 | def decode_review(texts): 6 | return ' '.join([reverse_word_index.get(i, '?') for i in texts]) 7 | 8 | imdb = keras.datasets.imdb 9 | (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000) 10 | print(x_train[0]) 11 | 12 | word_index = imdb.get_word_index() 13 | word_index = {k:(v+3) for k,v in word_index.items()} 14 | word_index["<PAD>"] = 0 15 | word_index["<START>"] = 1 16 | word_index["<UNK>"] = 2 # unknown 17 | word_index["<UNUSED>"] = 3 18 | 19 | reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) 20 | print(decode_review(x_train[0])) 21 | 22 | train_data = keras.preprocessing.sequence.pad_sequences(x_train, 23 | value=word_index["<PAD>"], 24 | padding='post', 25 | maxlen=256) 26 | 27 | test_data = keras.preprocessing.sequence.pad_sequences(x_test, 28 | value=word_index["<PAD>"], 29 | padding='post', 30 | maxlen=256) 31 | 32 | train_data = tf.nn.l2_normalize(train_data.astype('float')) 33 | test_data = tf.nn.l2_normalize(test_data.astype('float')) 34 | 35 | model = keras.Sequential() 36 | model.add(keras.layers.Dense(units=64, input_shape=(256,))) # one 256-long index vector per review, no batch dim in input_shape 37 | model.add(keras.layers.Dense(units=32)) 38 | model.add(keras.layers.Dense(units=16)) 39 | model.add(keras.layers.Dense(units=1, activation='sigmoid')) # binary_crossentropy expects probabilities 40 | model.summary() 41 | 42 | model.compile(optimizer='adam', 43 | loss='binary_crossentropy', 44 | metrics=['accuracy']) 45 | 46 | model.fit(train_data, y_train, epochs=5) # fit against the labels, not the test matrix 47 | loss_and_metrics = model.evaluate(test_data, y_test) 48 | print(loss_and_metrics) 49 | -------------------------------------------------------------------------------- /tf衣服图片识别率提升/Text_classifier_v2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | 4 | 5 | def decode_review(texts): 6 | return ' '.join([reverse_word_index.get(i, '?') for i in texts]) 7 | 8 | imdb = keras.datasets.imdb 9 | (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000) 10 | print(x_train[0]) 11 | 12 | word_index = imdb.get_word_index() 13 | word_index = {k:(v+3) for k,v in word_index.items()} 14 | word_index["<PAD>"] = 0 15 | word_index["<START>"] = 1 16 | word_index["<UNK>"] = 2 # unknown 17 | word_index["<UNUSED>"] = 3 18 | 19 | reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) 20 | print(decode_review(x_train[0])) 21 | 22 | train_data = keras.preprocessing.sequence.pad_sequences(x_train, 23 | value=word_index["<PAD>"], 24 | padding='post', 25 | maxlen=256) 26 | 27 | test_data = keras.preprocessing.sequence.pad_sequences(x_test, 28 | value=word_index["<PAD>"], 29 | padding='post', 30 | maxlen=256) 31 | 32 | # l2 normalization (left over from v1; not needed once the embedding is used) 33 | # train_data = tf.nn.l2_normalize(train_data.astype('float')) 34 | # test_data = tf.nn.l2_normalize(test_data.astype('float')) 35 | 36 | x_val = train_data[:10000] 37 | partial_x_train = train_data[10000:] 38 | 39 | y_val = y_train[:10000] 40 | partial_y_train = y_train[10000:] 41 | 42 | model = keras.Sequential() 43 | model.add(keras.layers.Embedding(input_dim=10000, output_dim=16)) 44 | model.add(keras.layers.GlobalAveragePooling1D()) 45 | model.add(keras.layers.Dense(16, activation='relu')) 46 | model.add(keras.layers.Dense(1, activation='sigmoid')) 47 | model.summary() 48 | 49 | model.compile(optimizer='adam', 50 | loss='binary_crossentropy', 51 | metrics=['accuracy']) 52 | 53 | history = model.fit(partial_x_train, 54 | partial_y_train, 55 | epochs=40, 56 | batch_size=512, 57 | validation_data=(x_val, y_val), 58 | verbose=1) 59 | 60 | loss_and_metrics = model.evaluate(test_data, y_test) 61 | print(loss_and_metrics) 62 | -------------------------------------------------------------------------------- /泰坦尼克生存预测案例/test.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 2 | 892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q 3 | 893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S 4 | 894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q 5 | 895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S 6 | 896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S 7 | 897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S 8 | 898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q 9 | 899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S 10 | 900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C 11 | 901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S 12 | 902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S 13 | 903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S 14 | 904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S 15 | 905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S 16 | 906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S 17 | 907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C 18 | 908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q 19 | 909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C 20 | 910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S 21 | 911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C 22 | 912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C 23 | 913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S 24 | 914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S 25 | 915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C 26 | 916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C 27 | 917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S 28 | 918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C 29 | 919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C 30 | 920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S 31 | 921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C 32 | 922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S 33 | 923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S 34 | 924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S 35 | 925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S 36 | 926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C 37 | 927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C 38 | 928,3,"Roth, Miss. 
Sarah A",female,,0,0,342712,8.05,,S 39 | 929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S 40 | 930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S 41 | 931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S 42 | 932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C 43 | 933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S 44 | 934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S 45 | 935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S 46 | 936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S 47 | 937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S 48 | 938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C 49 | 939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q 50 | 940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C 51 | 941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S 52 | 942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S 53 | 943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C 54 | 944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S 55 | 945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S 56 | 946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C 57 | 947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q 58 | 948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S 59 | 949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S 60 | 950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S 61 | 951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C 62 | 952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S 63 | 953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S 64 | 954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S 65 | 955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q 66 | 956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C 67 | 957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S 68 | 958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q 69 | 959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S 70 | 960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C 71 | 961,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S 72 | 962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q 73 | 963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S 74 | 964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S 75 | 965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C 76 | 966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C 77 | 967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C 78 | 968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S 79 | 969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S 80 | 970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S 81 | 971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q 82 | 972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C 83 | 973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S 84 | 974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S 85 | 975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S 86 | 976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q 87 | 977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C 88 | 978,3,"Barry, Miss. 
Julia",female,27,0,0,330844,7.8792,,Q 89 | 979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S 90 | 980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q 91 | 981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S 92 | 982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S 93 | 983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S 94 | 984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S 95 | 985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S 96 | 986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C 97 | 987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S 98 | 988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S 99 | 989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S 100 | 990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S 101 | 991,3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S 102 | 992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C 103 | 993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S 104 | 994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q 105 | 995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26,0,0,347070,7.775,,S 106 | 996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C 107 | 997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S 108 | 998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q 109 | 999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q 110 | 1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S 111 | 1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S 112 | 1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C 113 | 1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q 114 | 1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C 115 | 1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q 116 | 1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S 117 | 1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C 118 | 1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C 119 | 1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S 120 | 1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C 121 | 1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S 122 | 1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S 123 | 1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q 124 | 1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C 125 | 1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S 126 | 1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q 127 | 1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S 128 | 1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S 129 | 1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q 130 | 1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S 131 | 1021,3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S 132 | 1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S 133 | 1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C 134 | 1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S 135 | 1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C 136 | 1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S 137 | 1027,3,"Carlsson, Mr. 
Carl Robert",male,24,0,0,350409,7.8542,,S 138 | 1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C 139 | 1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S 140 | 1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S 141 | 1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S 142 | 1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S 143 | 1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S 144 | 1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C 145 | 1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S 146 | 1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S 147 | 1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S 148 | 1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S 149 | 1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S 150 | 1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S 151 | 1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S 152 | 1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C 153 | 1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C 154 | 1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S 155 | 1045,3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S 156 | 1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S 157 | 1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S 158 | 1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S 159 | 1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S 160 | 1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S 161 | 1051,3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S 162 | 1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q 163 | 1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C 164 | 1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S 165 | 1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S 166 | 1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S 167 | 1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S 168 | 1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C 169 | 1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S 170 | 1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C 171 | 1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S 172 | 1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S 173 | 1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C 174 | 1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S 175 | 1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C 176 | 1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S 177 | 1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S 178 | 1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S 179 | 1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C 180 | 1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S 181 | 1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C 182 | 1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S 183 | 1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C 184 | 1074,1,"Marvin, Mrs. 
Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S 185 | 1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q 186 | 1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C 187 | 1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S 188 | 1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S 189 | 1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S 190 | 1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S 191 | 1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S 192 | 1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S 193 | 1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S 194 | 1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S 195 | 1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q 196 | 1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S 197 | 1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S 198 | 1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C 199 | 1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S 200 | 1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S 201 | 1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S 202 | 1092,3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q 203 | 1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S 204 | 1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C 205 | 1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S 206 | 1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S 207 | 1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C 208 | 1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q 209 | 1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S 210 | 1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C 211 | 1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S 212 | 1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S 213 | 1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S 214 | 1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S 215 | 1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S 216 | 1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S 217 | 1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S 218 | 1108,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q 219 | 1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S 220 | 1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C 221 | 1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S 222 | 1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C 223 | 1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S 224 | 1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S 225 | 1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S 226 | 1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C 227 | 1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C 228 | 1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S 229 | 1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q 230 | 1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S 231 | 1121,2,"Hocking, Mr. 
Samuel James Metcalfe",male,36,0,0,242963,13,,S 232 | 1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S 233 | 1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S 234 | 1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S 235 | 1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q 236 | 1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C 237 | 1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S 238 | 1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C 239 | 1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C 240 | 1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S 241 | 1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C 242 | 1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C 243 | 1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S 244 | 1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C 245 | 1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S 246 | 1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S 247 | 1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S 248 | 1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S 249 | 1139,2,"Drew, Mr. James Vivian",male,42,1,1,28220,32.5,,S 250 | 1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S 251 | 1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C 252 | 1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S 253 | 1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S 254 | 1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C 255 | 1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S 256 | 1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S 257 | 1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S 258 | 1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q 259 | 1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S 260 | 1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S 261 | 1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S 262 | 1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S 263 | 1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S 264 | 1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S 265 | 1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S 266 | 1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C 267 | 1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S 268 | 1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S 269 | 1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S 270 | 1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S 271 | 1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S 272 | 1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C 273 | 1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q 274 | 1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C 275 | 1165,3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q 276 | 1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C 277 | 1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S 278 | 1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S 279 | 1169,2,"Faunthorpe, Mr. 
Harry",male,40,1,0,2926,26,,S 280 | 1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S 281 | 1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S 282 | 1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S 283 | 1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S 284 | 1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q 285 | 1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C 286 | 1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S 287 | 1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S 288 | 1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S 289 | 1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S 290 | 1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C 291 | 1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S 292 | 1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S 293 | 1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q 294 | 1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C 295 | 1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S 296 | 1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S 297 | 1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S 298 | 1188,2,"Laroche, Miss. Louise",female,1,1,2,SC/Paris 2123,41.5792,,C 299 | 1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C 300 | 1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S 301 | 1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S 302 | 1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S 303 | 1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C 304 | 1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S 305 | 1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S 306 | 1196,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q 307 | 1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S 308 | 1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S 309 | 1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S 310 | 1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S 311 | 1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S 312 | 1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S 313 | 1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C 314 | 1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S 315 | 1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q 316 | 1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C 317 | 1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q 318 | 1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C 319 | 1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S 320 | 1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S 321 | 1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S 322 | 1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S 323 | 1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C 324 | 1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S 325 | 1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S 326 | 1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S 327 | 1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S 328 | 1218,2,"Becker, Miss. 
Ruth Elizabeth",female,12,2,1,230136,39,F4,S 329 | 1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C 330 | 1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S 331 | 1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S 332 | 1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S 333 | 1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C 334 | 1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C 335 | 1225,3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C 336 | 1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S 337 | 1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S 338 | 1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S 339 | 1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C 340 | 1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S 341 | 1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C 342 | 1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S 343 | 1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S 344 | 1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S 345 | 1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C 346 | 1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S 347 | 1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S 348 | 1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S 349 | 1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C 350 | 1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S 351 | 1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S 352 | 1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C 353 | 1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S 354 | 1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S 355 | 1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S 356 | 1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S 357 | 1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S 358 | 1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S 359 | 1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S 360 | 1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q 361 | 1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S 362 | 1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S 363 | 1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C 364 | 1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S 365 | 1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S 366 | 1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C 367 | 1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S 368 | 1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C 369 | 1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S 370 | 1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C 371 | 1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C 372 | 1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S 373 | 1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C 374 | 1264,1,"Ismay, Mr. 
Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S 375 | 1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S 376 | 1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S 377 | 1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C 378 | 1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S 379 | 1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S 380 | 1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S 381 | 1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S 382 | 1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q 383 | 1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q 384 | 1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S 385 | 1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S 386 | 1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S 387 | 1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S 388 | 1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S 389 | 1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S 390 | 1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q 391 | 1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S 392 | 1282,1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S 393 | 1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S 394 | 1284,3,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S 395 | 1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S 396 | 1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S 397 | 1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S 398 | 1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q 399 | 1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C 400 | 1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S 401 | 1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q 402 | 1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S 403 | 1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S 404 | 1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C 405 | 1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S 406 | 1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C 407 | 1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C 408 | 1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S 409 | 1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C 410 | 1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q 411 | 1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S 412 | 1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q 413 | 1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q 414 | 1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S 415 | 1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S 416 | 1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C 417 | 1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S 418 | 1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S 419 | 1309,3,"Peter, Master. 
Michael J",male,,1,1,2668,22.3583,,C 420 | -------------------------------------------------------------------------------- /蝴蝶花(iris)分类案例/Iris.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # 数据检查 5 | 6 | # In[5]: 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | # In[18]: 14 | 15 | 16 | df = pd.read_csv('IrisFishData.csv') 17 | df.head() 18 | 19 | 20 | # In[19]: 21 | 22 | 23 | df.describe() 24 | 25 | 26 | # In[20]: 27 | 28 | 29 | df.isnull().values.any() 30 | 31 | 32 | # In[45]: 33 | 34 | 35 | df = pd.read_csv('IrisFishData.csv', na_values=['NA']) 36 | 37 | 38 | # df.isnull().sum() 39 | 40 | # In[9]: 41 | 42 | 43 | get_ipython().magic('matplotlib inline') 44 | 45 | import matplotlib.pyplot as plt 46 | 47 | 48 | # In[2]: 49 | 50 | 51 | import seaborn as sb 52 | 53 | 54 | # In[24]: 55 | 56 | 57 | sb.pairplot(df.dropna(), hue = 'class') 58 | #每列的分布在对角线上画出 59 | 60 | 61 | # # 数据清洗 62 | 63 | # In[26]: 64 | 65 | 66 | df['class'].unique() 67 | 68 | 69 | # In[46]: 70 | 71 | 72 | df.loc[df['class'] == 'setossa', 'class'] = 'setosa' 73 | df['class'].unique() 74 | 75 | 76 | # ## 由图可知蓝色有个点总是在范围外 是不是可能有数据错误 77 | 78 | # In[55]: 79 | 80 | 81 | df.loc[df['class'] == 'setosa', 'sepal_width_cm'].describe() 82 | 83 | 84 | # In[76]: 85 | 86 | 87 | df = df.loc[(df['class'] != 'setosa') | ((df['class'] == 'setosa') & (df['sepal_width_cm'] >= 2.5))] 88 | #不是setosa的class的 不需要清洗 是setosa的数据需要过滤到sepal_width_cm 值为2.5以上的 89 | 90 | 91 | # In[75]: 92 | 93 | 94 | sub_df = df.loc[(df['class'] != 'setosa') | (df['sepal_width_cm'] >= 2.5)] 95 | sub_df.loc[sub_df['class'] == 'setosa', 'sepal_width_cm'].hist() 96 | 97 | 98 | # In[77]: 99 | 100 | 101 | df.to_csv('Iris_clean.csv', index=False) 102 | 103 | 104 | # In[6]: 105 | 106 | 107 | clean_df = pd.read_csv('Iris_clean.csv') 108 | 109 | 110 | # In[10]: 111 | 112 | 113 | sb.pairplot(clean_df, hue='class') 114 | 115 | 116 | # 117 | 118 | # In[ ]: 119 | 120 | 121 | 122 | 123 | 124 | # In[ ]: 125 | 126 | 127 | 128 | 129 | 130 | # In[ ]: 131 | 132 | 133 | 134 | 135 | 136 | # In[ ]: 137 | 138 | 139 | 140 | 141 | 142 | # In[ ]: 143 | 144 | 145 | 146 | 147 | 148 | # In[ ]: 149 | 150 | 151 | 152 | 153 | 154 | # In[ ]: 155 | 156 | 157 | 158 | 159 | 160 | # In[ ]: 161 | 162 | 163 | 164 | 165 | 166 | # In[ ]: 167 | 168 | 169 | 170 | 171 | 172 | # In[ ]: 173 | 174 | 175 | 176 | 177 | 178 | # In[ ]: 179 | 180 | 181 | 182 | 183 | 184 | # In[ ]: 185 | 186 | 187 | 188 | 189 | 190 | # In[ ]: 191 | 192 | 193 | 194 | 195 | 196 | # In[ ]: 197 | 198 | 199 | 200 | 201 | 202 | # In[ ]: 203 | 204 | 205 | 206 | 207 | 208 | # In[ ]: 209 | 210 | 211 | 212 | 213 | 214 | # In[ ]: 215 | 216 | 217 | 218 | 219 | 220 | # In[ ]: 221 | 222 | 223 | 224 | 225 | 226 | # In[ ]: 227 | 228 | 229 | 230 | 231 | 232 | # In[ ]: 233 | 234 | 235 | 236 | 237 | 238 | # In[ ]: 239 | 240 | 241 | 242 | 243 | 244 | # In[ ]: 245 | 246 | 247 | 248 | 249 | 250 | # In[ ]: 251 | 252 | 253 | 254 | 255 | 256 | # In[ ]: 257 | 258 | 259 | 260 | 261 | 262 | # In[ ]: 263 | 264 | 265 | 266 | 267 | 268 | # In[ ]: 269 | 270 | 271 | 272 | 273 | 274 | # In[ ]: 275 | 276 | 277 | 278 | 279 | 280 | # In[ ]: 281 | 282 | 283 | 284 | 285 | 286 | # In[ ]: 287 | 288 | 289 | 290 | 291 | 292 | # In[ ]: 293 | 294 | 295 | 296 | 297 | 298 | # In[ ]: 299 | 300 | 301 | 302 | 303 | 304 | # In[ ]: 305 | 306 | 307 | 308 | 309 | 310 | # In[ ]: 311 | 312 | 313 | 314 | 315 | 316 | # In[ ]: 317 | 318 | 319 | 320 | 321 | 322 | # In[ ]: 323 | 324 | 325 | 326 | 327 | 328 | # 
-------------------------------------------------------------------------------- /蝴蝶花(iris)分类案例/Iris3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [
12 | "[ 4.9 3.1 1.5 0.1]\t-->\tsetosa\t(Actual:setosa)\n", 13 | "[ 5.9 3. 5.1 1.8]\t-->\tvirginica\t(Actual:virginica)\n", 14 | "[ 5.8 2.7 3.9 1.2]\t-->\tversicolor\t(Actual:versicolor)\n", 15 | "[ 5.7 3. 4.2 1.2]\t-->\tversicolor\t(Actual:versicolor)\n", 16 | "[ 6.3 2.5 5. 1.9]\t-->\tvirginica\t(Actual:virginica)\n", 17 | "[ 4.8 3.4 1.6 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 18 | "[ 6.5 2.8 4.6 1.5]\t-->\tversicolor\t(Actual:versicolor)\n", 19 | "[ 5.6 2.9 3.6 1.3]\t-->\tversicolor\t(Actual:versicolor)\n", 20 | "[ 6.1 2.9 4.7 1.4]\t-->\tversicolor\t(Actual:versicolor)\n", 21 | "[ 4.6 3.2 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 22 | "[ 4.4 2.9 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 23 | "[ 4.3 3. 1.1 0.1]\t-->\tsetosa\t(Actual:setosa)\n", 24 | "[ 7.7 2.8 6.7 2. ]\t-->\tvirginica\t(Actual:virginica)\n", 25 | "[ 6.5 3. 5.5 1.8]\t-->\tvirginica\t(Actual:virginica)\n", 26 | "[ 6.7 3.3 5.7 2.5]\t-->\tvirginica\t(Actual:virginica)\n", 27 | "[ 4.7 3.2 1.6 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 28 | "[ 6. 2.7 5.1 1.6]\t-->\tversicolor\t(Actual:versicolor)\n", 29 | "[ 5.7 2.5 5. 2. ]\t-->\tvirginica\t(Actual:virginica)\n", 30 | "[ 6.9 3.1 5.1 2.3]\t-->\tvirginica\t(Actual:virginica)\n", 31 | "[ 5.7 2.9 4.2 1.3]\t-->\tversicolor\t(Actual:versicolor)\n", 32 | "[ 7.2 3.6 6.1 2.5]\t-->\tvirginica\t(Actual:virginica)\n", 33 | "[ 6.1 3. 4.9 1.8]\t-->\tvirginica\t(Actual:virginica)\n", 34 | "[ 5.5 2.3 4. 1.3]\t-->\tversicolor\t(Actual:versicolor)\n", 35 | "[ 6.7 2.5 5.8 1.8]\t-->\tvirginica\t(Actual:virginica)\n", 36 | "[ 5.7 2.8 4.5 1.3]\t-->\tversicolor\t(Actual:versicolor)\n", 37 | "[ 5.7 3.8 1.7 0.3]\t-->\tsetosa\t(Actual:setosa)\n", 38 | "[ 6.4 3.1 5.5 1.8]\t-->\tvirginica\t(Actual:virginica)\n", 39 | "[ 5.2 3.5 1.5 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 40 | "[ 6.7 3. 5.2 2.3]\t-->\tvirginica\t(Actual:virginica)\n", 41 | "[ 5.5 4.2 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 42 | "[ 4.9 3.1 1.5 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 43 | "[ 5.7 4.4 1.5 0.4]\t-->\tsetosa\t(Actual:setosa)\n", 44 | "[ 5. 2. 3.5 1. ]\t-->\tversicolor\t(Actual:versicolor)\n", 45 | "[ 5.4 3.7 1.5 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 46 | "[ 6.7 3.1 4.4 1.4]\t-->\tversicolor\t(Actual:versicolor)\n", 47 | "[ 5. 3.6 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n", 48 | "[ 5.2 4.1 1.5 0.1]\t-->\tsetosa\t(Actual:setosa)\n", 49 | "[ 6.2 2.2 4.5 1.5]\t-->\tversicolor\t(Actual:versicolor)\n" 50 | ] 51 | } 52 | ],
53 | "source": [ 54 | "%matplotlib inline\n", 55 | "\n", 56 | "import pandas as pd\n", 57 | "import numpy as np\n", 58 | "from sklearn.ensemble import RandomForestClassifier\n", 59 | "from sklearn.model_selection import train_test_split\n", 60 | "from sklearn.model_selection import cross_val_score\n", 61 | "\n", 62 | "df = pd.read_csv('Iris_clean.csv')\n", 63 | "\n", 64 | "training_set = df[['sepal_lenth_cm','sepal_width_cm','petal_length_cm','petal_width_cm']].values\n", 65 | "training_class = df['class'].values\n", 66 | "\n", 67 | "random_forest_classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 68 | "            max_depth=None, max_features=4, max_leaf_nodes=None,\n", 69 | "            min_impurity_decrease=0.0, min_impurity_split=None,\n", 70 | "            min_samples_leaf=1, min_samples_split=2,\n", 71 | "            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,\n", 72 | "            oob_score=False, random_state=None, verbose=0,\n", 73 | "            warm_start=False)\n", 74 | "\n", 75 | "cv_scores = cross_val_score(random_forest_classifier, training_set, training_class, cv=10)  # 10-fold CV accuracy scores\n", 76 | "\n", 77 | "(training_inputs,\n", 78 | "testing_inputs,\n", 79 | "training_classes,\n", 80 | "testing_classes) = train_test_split(training_set, training_class, train_size=0.75)\n", 81 | "\n", 82 | "random_forest_classifier.fit(training_inputs, training_classes)  # fit on the training split only, so the held-out rows stay unseen\n", 83 | "\n", 84 | "for input_feature, prediction, actual in zip(testing_inputs,\n", 85 | "                                             random_forest_classifier.predict(testing_inputs),\n", 86 | "                                             testing_classes):\n", 87 | "    print('{}\\t-->\\t{}\\t(Actual:{})'.format(input_feature, prediction, actual))" 88 | ] 89 | }
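, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# (added sketch) the cell above computes 10-fold cv_scores but never reports them;\n", "# this short cell surfaces the mean and spread of those scores\n", "print(cv_scores.mean(), cv_scores.std())" ] }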
90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.6.1" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /蝴蝶花(iris)分类案例/IrisFishData.csv: -------------------------------------------------------------------------------- 1 | sepal_lenth_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class 2 | 4.3,3,1.1,0.1,setosa 3 | 4.4,2.9,1.4,0.2,setosa 4 | 4.4,3,1.3,0.2,setosa 5 | 4.4,3.2,1.3,0.2,setossa 6 | 4.5,2.3,1.3,0.3,setosa 7 | 4.6,3.1,1.5,0.2,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 4.6,3.6,1,0.2,setosa 10 | 4.6,3.2,1.4,0.2,setosa 11 | 4.7,3.2,1.3,0.2,setosa 12 | 4.7,3.2,1.6,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.8,3.4,1.9,0.2,setosa 16 | 4.8,3.1,1.6,0.2,setosa 17 | 4.8,3,1.4,0.3,setosa 18 | 4.9,3,1.4,0.2,setosa 19 | 4.9,3.1,1.5,0.1,setosa 20 | 4.9,3.1,1.5,0.2,setosa 21 | 4.9,3.6,1.4,0.1,setosa 22 | 5,3.6,1.4,0.2,setosa 23 | 5,3.4,1.5,0.2,setosa 24 | 5,3,1.6,0.2,setosa 25 | 5,3.4,1.6,0.4,setosa 26 | 5,3.2,1.2,0.2,setosa 27 | 5,3.5,1.3,0.3,setosa 28 | 5,3.5,1.6,0.6,setosa 29 | 5,3.3,1.4,0.2,setosa 30 | 5.1,3.5,1.4,0.2,setosa 31 | 5.1,3.5,1.4,0.3,setosa 32 | 5.1,3.8,1.5,0.3,setosa 33 | 5.1,3.7,1.5,0.4,setosa 34 | 5.1,3.3,1.7,0.5,setosa 35 | 5.1,3.4,1.5,0.2,setosa 36 | 5.1,3.8,1.9,0.4,setosa 37 | 
5.1,3.8,1.6,0.2,setosa 38 | 5.2,3.5,1.5,0.2,setosa 39 | 5.2,3.4,1.4,0.2,setosa 40 | 5.2,4.1,1.5,0.1,setosa 41 | 5.3,3.7,1.5,0.2,setosa 42 | 5.4,3.9,1.7,0.4,setosa 43 | 5.4,3.7,1.5,0.2,setosa 44 | 5.4,3.9,1.3,0.4,setosa 45 | 5.4,3.4,1.7,0.2,setosa 46 | 5.4,3.4,1.5,0.4,setosa 47 | 5.5,4.2,1.4,0.2,setosa 48 | 5.5,3.5,1.3,0.2,setosa 49 | 5.7,4.4,1.5,0.4,setosa 50 | 5.7,3.8,1.7,0.3,setosa 51 | 5.8,4,1.2,0.2,setosa 52 | 4.9,2.4,3.3,1,versicolor 53 | 5,2,3.5,1,versicolor 54 | 5,2.3,3.3,1,versicolor 55 | 5.1,2.5,3,1.1,versicolor 56 | 5.2,2.7,3.9,1.4,versicolor 57 | 5.4,3,4.5,1.5,versicolor 58 | 5.5,2.3,4,1.3,versicolor 59 | 5.5,2.4,3.8,1.1,versicolor 60 | 5.5,2.4,3.7,1,versicolor 61 | 5.5,2.5,4,1.3,versicolor 62 | 5.5,2.6,4.4,1.2,versicolor 63 | 5.6,2.9,3.6,1.3,versicolor 64 | 5.6,3,4.5,1.5,versicolor 65 | 5.6,2.5,3.9,1.1,versicolor 66 | 5.6,3,4.1,1.3,versicolor 67 | 5.6,2.7,4.2,1.3,versicolor 68 | 5.7,2.8,4.5,1.3,versicolor 69 | 5.7,2.6,3.5,1,versicolor 70 | 5.7,3,4.2,1.2,versicolor 71 | 5.7,2.9,4.2,1.3,versicolor 72 | 5.7,2.8,4.1,1.3,versicolor 73 | 5.8,2.7,4.1,1,versicolor 74 | 5.8,2.7,3.9,1.2,versicolor 75 | 5.8,2.6,4,1.2,versicolor 76 | 5.9,3,4.2,1.5,versicolor 77 | 5.9,3.2,4.8,1.8,versicolor 78 | 6,2.2,4,1,versicolor 79 | 6,2.9,4.5,1.5,versicolor 80 | 6,2.7,5.1,1.6,versicolor 81 | 6,3.4,4.5,1.6,versicolor 82 | 6.1,2.9,4.7,1.4,versicolor 83 | 6.1,2.8,4,1.3,versicolor 84 | 6.1,2.8,4.7,1.2,versicolor 85 | 6.1,3,4.6,1.4,versicolor 86 | 6.2,2.2,4.5,1.5,versicolor 87 | 6.2,2.9,4.3,1.3,versicolor 88 | 6.3,3.3,4.7,1.6,versicolor 89 | 6.3,2.5,4.9,1.5,versicolor 90 | 6.3,2.3,4.4,1.3,versicolor 91 | 6.4,3.2,4.5,1.5,versicolor 92 | 6.4,2.9,4.3,1.3,versicolor 93 | 6.5,2.8,4.6,1.5,versicolor 94 | 6.6,2.9,4.6,1.3,versicolor 95 | 6.6,3,4.4,1.4,versicolor 96 | 6.7,3.1,4.4,1.4,versicolor 97 | 6.7,3,5,1.7,versicolor 98 | 6.7,3.1,4.7,1.5,versicolor 99 | 6.8,2.8,4.8,1.4,versicolor 100 | 6.9,3.1,4.9,1.5,versicolor 101 | 7,3.2,4.7,1.4,versicolor 102 | 4.9,2.5,4.5,1.7,virginica 103 | 5.6,2.8,4.9,2,virginica 104 | 5.7,2.5,5,2,virginica 105 | 5.8,2.7,5.1,1.9,virginica 106 | 5.8,2.8,5.1,2.4,virginica 107 | 5.8,2.7,5.1,1.9,virginica 108 | 5.9,3,5.1,1.8,virginica 109 | 6,2.2,5,1.5,virginica 110 | 6,3,4.8,1.8,virginica 111 | 6.1,3,4.9,1.8,virginica 112 | 6.1,2.6,5.6,1.4,virginica 113 | 6.2,2.8,4.8,1.8,virginica 114 | 6.2,3.4,5.4,2.3,virginica 115 | 6.3,3.3,6,2.5,virginica 116 | 6.3,2.9,5.6,1.8,virginica 117 | 6.3,2.7,4.9,1.8,virginica 118 | 6.3,2.8,5.1,1.5,virginica 119 | 6.3,3.4,5.6,2.4,virginica 120 | 6.3,2.5,5,1.9,virginica 121 | 6.4,2.7,5.3,1.9,virginica 122 | 6.4,3.2,5.3,2.3,virginica 123 | 6.4,2.8,5.6,2.1,virginica 124 | 6.4,2.8,5.6,2.2,virginica 125 | 6.4,3.1,5.5,1.8,virginica 126 | 6.5,3,5.8,2.2,virginica 127 | 6.5,3.2,5.1,2,virginica 128 | 6.5,3,5.5,1.8,virginica 129 | 6.5,3,5.2,2,virginica 130 | 6.7,2.5,5.8,1.8,virginica 131 | 6.7,3.3,5.7,2.1,virginica 132 | 6.7,3.1,5.6,2.4,virginica 133 | 6.7,3.3,5.7,2.5,virginica 134 | 6.7,3,5.2,2.3,virginica 135 | 6.8,3,5.5,2.1,virginica 136 | 6.8,3.2,5.9,2.3,virginica 137 | 6.9,3.2,5.7,2.3,virginica 138 | 6.9,3.1,5.4,2.1,virginica 139 | 6.9,3.1,5.1,2.3,virginica 140 | 7.1,3,5.9,2.1,virginica 141 | 7.2,3.6,6.1,2.5,virginica 142 | 7.2,3.2,6,1.8,virginica 143 | 7.2,3,5.8,1.6,virginica 144 | 7.3,2.9,6.3,1.8,virginica 145 | 7.4,2.8,6.1,1.9,virginica 146 | 7.6,3,6.6,2.1,virginica 147 | 7.7,3.8,6.7,2.2,virginica 148 | 7.7,2.6,6.9,2.3,virginica 149 | 7.7,2.8,6.7,2,virginica 150 | 7.7,3,6.1,2.3,virginica 151 | 7.9,3.8,6.4,2,virginica 152 | 
-------------------------------------------------------------------------------- /蝴蝶花(iris)分类案例/Iris_clean.csv: -------------------------------------------------------------------------------- 1 | sepal_lenth_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class 2 | 4.3,3.0,1.1,0.1,setosa 3 | 4.4,2.9,1.4,0.2,setosa 4 | 4.4,3.0,1.3,0.2,setosa 5 | 4.4,3.2,1.3,0.2,setosa 6 | 4.6,3.1,1.5,0.2,setosa 7 | 4.6,3.4,1.4,0.3,setosa 8 | 4.6,3.6,1.0,0.2,setosa 9 | 4.6,3.2,1.4,0.2,setosa 10 | 4.7,3.2,1.3,0.2,setosa 11 | 4.7,3.2,1.6,0.2,setosa 12 | 4.8,3.4,1.6,0.2,setosa 13 | 4.8,3.0,1.4,0.1,setosa 14 | 4.8,3.4,1.9,0.2,setosa 15 | 4.8,3.1,1.6,0.2,setosa 16 | 4.8,3.0,1.4,0.3,setosa 17 | 4.9,3.0,1.4,0.2,setosa 18 | 4.9,3.1,1.5,0.1,setosa 19 | 4.9,3.1,1.5,0.2,setosa 20 | 4.9,3.6,1.4,0.1,setosa 21 | 5.0,3.6,1.4,0.2,setosa 22 | 5.0,3.4,1.5,0.2,setosa 23 | 5.0,3.0,1.6,0.2,setosa 24 | 5.0,3.4,1.6,0.4,setosa 25 | 5.0,3.2,1.2,0.2,setosa 26 | 5.0,3.5,1.3,0.3,setosa 27 | 5.0,3.5,1.6,0.6,setosa 28 | 5.0,3.3,1.4,0.2,setosa 29 | 5.1,3.5,1.4,0.2,setosa 30 | 5.1,3.5,1.4,0.3,setosa 31 | 5.1,3.8,1.5,0.3,setosa 32 | 5.1,3.7,1.5,0.4,setosa 33 | 5.1,3.3,1.7,0.5,setosa 34 | 5.1,3.4,1.5,0.2,setosa 35 | 5.1,3.8,1.9,0.4,setosa 36 | 5.1,3.8,1.6,0.2,setosa 37 | 5.2,3.5,1.5,0.2,setosa 38 | 5.2,3.4,1.4,0.2,setosa 39 | 5.2,4.1,1.5,0.1,setosa 40 | 5.3,3.7,1.5,0.2,setosa 41 | 5.4,3.9,1.7,0.4,setosa 42 | 5.4,3.7,1.5,0.2,setosa 43 | 5.4,3.9,1.3,0.4,setosa 44 | 5.4,3.4,1.7,0.2,setosa 45 | 5.4,3.4,1.5,0.4,setosa 46 | 5.5,4.2,1.4,0.2,setosa 47 | 5.5,3.5,1.3,0.2,setosa 48 | 5.7,4.4,1.5,0.4,setosa 49 | 5.7,3.8,1.7,0.3,setosa 50 | 5.8,4.0,1.2,0.2,setosa 51 | 4.9,2.4,3.3,1.0,versicolor 52 | 5.0,2.0,3.5,1.0,versicolor 53 | 5.0,2.3,3.3,1.0,versicolor 54 | 5.1,2.5,3.0,1.1,versicolor 55 | 5.2,2.7,3.9,1.4,versicolor 56 | 5.4,3.0,4.5,1.5,versicolor 57 | 5.5,2.3,4.0,1.3,versicolor 58 | 5.5,2.4,3.8,1.1,versicolor 59 | 5.5,2.4,3.7,1.0,versicolor 60 | 5.5,2.5,4.0,1.3,versicolor 61 | 5.5,2.6,4.4,1.2,versicolor 62 | 5.6,2.9,3.6,1.3,versicolor 63 | 5.6,3.0,4.5,1.5,versicolor 64 | 5.6,2.5,3.9,1.1,versicolor 65 | 5.6,3.0,4.1,1.3,versicolor 66 | 5.6,2.7,4.2,1.3,versicolor 67 | 5.7,2.8,4.5,1.3,versicolor 68 | 5.7,2.6,3.5,1.0,versicolor 69 | 5.7,3.0,4.2,1.2,versicolor 70 | 5.7,2.9,4.2,1.3,versicolor 71 | 5.7,2.8,4.1,1.3,versicolor 72 | 5.8,2.7,4.1,1.0,versicolor 73 | 5.8,2.7,3.9,1.2,versicolor 74 | 5.8,2.6,4.0,1.2,versicolor 75 | 5.9,3.0,4.2,1.5,versicolor 76 | 5.9,3.2,4.8,1.8,versicolor 77 | 6.0,2.2,4.0,1.0,versicolor 78 | 6.0,2.9,4.5,1.5,versicolor 79 | 6.0,2.7,5.1,1.6,versicolor 80 | 6.0,3.4,4.5,1.6,versicolor 81 | 6.1,2.9,4.7,1.4,versicolor 82 | 6.1,2.8,4.0,1.3,versicolor 83 | 6.1,2.8,4.7,1.2,versicolor 84 | 6.1,3.0,4.6,1.4,versicolor 85 | 6.2,2.2,4.5,1.5,versicolor 86 | 6.2,2.9,4.3,1.3,versicolor 87 | 6.3,3.3,4.7,1.6,versicolor 88 | 6.3,2.5,4.9,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 6.4,3.2,4.5,1.5,versicolor 91 | 6.4,2.9,4.3,1.3,versicolor 92 | 6.5,2.8,4.6,1.5,versicolor 93 | 6.6,2.9,4.6,1.3,versicolor 94 | 6.6,3.0,4.4,1.4,versicolor 95 | 6.7,3.1,4.4,1.4,versicolor 96 | 6.7,3.0,5.0,1.7,versicolor 97 | 6.7,3.1,4.7,1.5,versicolor 98 | 6.8,2.8,4.8,1.4,versicolor 99 | 6.9,3.1,4.9,1.5,versicolor 100 | 7.0,3.2,4.7,1.4,versicolor 101 | 4.9,2.5,4.5,1.7,virginica 102 | 5.6,2.8,4.9,2.0,virginica 103 | 5.7,2.5,5.0,2.0,virginica 104 | 5.8,2.7,5.1,1.9,virginica 105 | 5.8,2.8,5.1,2.4,virginica 106 | 5.8,2.7,5.1,1.9,virginica 107 | 5.9,3.0,5.1,1.8,virginica 108 | 6.0,2.2,5.0,1.5,virginica 109 | 6.0,3.0,4.8,1.8,virginica 110 | 6.1,3.0,4.9,1.8,virginica 111 | 
6.1,2.6,5.6,1.4,virginica 112 | 6.2,2.8,4.8,1.8,virginica 113 | 6.2,3.4,5.4,2.3,virginica 114 | 6.3,3.3,6.0,2.5,virginica 115 | 6.3,2.9,5.6,1.8,virginica 116 | 6.3,2.7,4.9,1.8,virginica 117 | 6.3,2.8,5.1,1.5,virginica 118 | 6.3,3.4,5.6,2.4,virginica 119 | 6.3,2.5,5.0,1.9,virginica 120 | 6.4,2.7,5.3,1.9,virginica 121 | 6.4,3.2,5.3,2.3,virginica 122 | 6.4,2.8,5.6,2.1,virginica 123 | 6.4,2.8,5.6,2.2,virginica 124 | 6.4,3.1,5.5,1.8,virginica 125 | 6.5,3.0,5.8,2.2,virginica 126 | 6.5,3.2,5.1,2.0,virginica 127 | 6.5,3.0,5.5,1.8,virginica 128 | 6.5,3.0,5.2,2.0,virginica 129 | 6.7,2.5,5.8,1.8,virginica 130 | 6.7,3.3,5.7,2.1,virginica 131 | 6.7,3.1,5.6,2.4,virginica 132 | 6.7,3.3,5.7,2.5,virginica 133 | 6.7,3.0,5.2,2.3,virginica 134 | 6.8,3.0,5.5,2.1,virginica 135 | 6.8,3.2,5.9,2.3,virginica 136 | 6.9,3.2,5.7,2.3,virginica 137 | 6.9,3.1,5.4,2.1,virginica 138 | 6.9,3.1,5.1,2.3,virginica 139 | 7.1,3.0,5.9,2.1,virginica 140 | 7.2,3.6,6.1,2.5,virginica 141 | 7.2,3.2,6.0,1.8,virginica 142 | 7.2,3.0,5.8,1.6,virginica 143 | 7.3,2.9,6.3,1.8,virginica 144 | 7.4,2.8,6.1,1.9,virginica 145 | 7.6,3.0,6.6,2.1,virginica 146 | 7.7,3.8,6.7,2.2,virginica 147 | 7.7,2.6,6.9,2.3,virginica 148 | 7.7,2.8,6.7,2.0,virginica 149 | 7.7,3.0,6.1,2.3,virginica 150 | 7.9,3.8,6.4,2.0,virginica 151 | -------------------------------------------------------------------------------- /蝴蝶花(iris)分类案例/iris2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Data exploration 5 | 6 | # In[2]: 7 | 8 | 9 | get_ipython().magic('matplotlib inline') 10 | 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | import numpy as np 14 | import seaborn as sb 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | df = pd.read_csv('Iris_clean.csv') 21 | sb.pairplot(df) 22 | 23 | 24 | # In[4]: 25 | 26 | 27 | plt.figure(figsize=(10,10)) 28 | 29 | for column_index, column in enumerate(df.columns): 30 |     if column == 'class': 31 |         continue 32 |     # split into four subplots, indexed by column 33 |     plt.subplot(2, 2, column_index+1) 34 |     # draw each feature's violin plot in its own subplot 35 |     sb.violinplot(x='class', y=column, data=df) 36 | 37 | 38 | # ### Train/test split 39 | 40 | # In[5]: 41 | 42 | 43 | df = pd.read_csv('Iris_clean.csv') 44 | # scikit-learn expects numpy arrays as input 45 | training_set = df[['sepal_lenth_cm','sepal_width_cm','petal_length_cm','petal_width_cm']].values 46 | print(training_set[:5]) 47 | training_class = df['class'].values 48 | print(training_class[:5]) 49 | 50 | 51 | # In[11]: 52 | 53 | 54 | from sklearn.model_selection import train_test_split 55 | import warnings 56 | warnings.filterwarnings("ignore", category=DeprecationWarning) 57 | warnings.filterwarnings("ignore", category=RuntimeWarning) 58 | warnings.filterwarnings("ignore", category=FutureWarning) 59 | # silence these warnings before training so the output stays readable 60 | 61 | 62 | # In[12]: 63 | 64 | 65 | (training_inputs, 66 | testing_inputs, 67 | training_classes, 68 | testing_classes) = train_test_split(training_set, training_class, train_size=0.75, random_state=1) 69 | 70 | 71 | from sklearn.tree import DecisionTreeClassifier 72 | 73 | # create the classifier object 74 | tree_classifier = DecisionTreeClassifier() 75 | 76 | tree_classifier.fit(training_inputs, training_classes) 77 | tree_classifier.score(testing_inputs, testing_classes) 78 | 79 | # ## ~97% accuracy -- not bad 80 | 
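# A hedged follow-up sketch (added): a single 75/25 split can flatter the model, so it is
# worth confirming the ~97% figure with 10-fold cross-validation -- the same check
# Iris3.ipynb applies to its random forest. cv_scores is an illustrative name.

# In[ ]:


from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

cv_scores = cross_val_score(DecisionTreeClassifier(), training_set, training_class, cv=10)
print(cv_scores.mean(), cv_scores.std())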
-------------------------------------------------------------------------------- /讯飞CTR预测/RXY初版/feature_extract_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import xgboost as xgb" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "train_data = pd.read_table('./data/round1_iflyad_train.txt')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [
32 | " instance_id time city province \\\n", 33 | "0 86294719979897807 2190219034 137103102105100 137103102100100 \n", 34 | "1 2699289844928136052 2190221070 137105101100100 137105101100100 \n", 35 | "2 3117527168445845752 2190219793 137103104111100 137103104100100 \n", 36 | "3 3398484891050993371 2190221704 137103102113100 137103102100100 \n", 37 | "4 2035477570591176488 2190220024 137103102109100 137103102100100 \n", 38 | "\n", 39 | " user_tags carrier devtype \\\n", 40 | "0 NaN 1 2 \n", 41 | "1 2100191,2100078,3001825,,3001781,3001791,30017... 3 2 \n", 42 | "2 NaN 3 2 \n", 43 | "3 2100098,gd_2100000,3001791,3001795,3002193,300... 0 2 \n", 44 | "4 NaN 1 2 \n", 45 | "\n", 46 | " make model nnt ... creative_width creative_height \\\n", 47 | "0 HUAWEI HUAWEI-CAZ-AL10 1 ... 1280 720 \n", 48 | "1 Xiaomi Redmi Note 4 1 ... 960 640 \n", 49 | "2 OPPO OPPO+R11s 1 ... 960 640 \n", 50 | "3 NaN OPPO A57 1 ... 1280 720 \n", 51 | "4 Apple iPhone 7 3 ... 
960 640 \n", 52 | "\n", 53 | " creative_is_jump creative_is_download creative_is_js creative_is_voicead \\\n", 54 | "0 True False False False \n", 55 | "1 True False False False \n", 56 | "2 True False False False \n", 57 | "3 True False False False \n", 58 | "4 True False False False \n", 59 | "\n", 60 | " creative_has_deeplink app_paid advert_name click \n", 61 | "0 False False B4734117F35EE97F 0 \n", 62 | "1 False False B4734117F35EE97F 0 \n", 63 | "2 False False E257895F74792E81 0 \n", 64 | "3 False False 0A421D7B11EABFC5 0 \n", 65 | "4 False False B4734117F35EE97F 0 \n", 66 | "\n", 67 | "[5 rows x 35 columns]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "print(train_data.head())" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "test_data = pd.read_table('./data/round1_iflyad_test_feature.txt')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "scrolled": true 89 | }, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | " instance_id time city province \\\n", 96 | "0 6930856710792380886 2190675456 137103104101100 137103104100100 \n", 97 | "1 5460409694420131920 2190674821 137103104112100 137103104100100 \n", 98 | "2 982813438159141507 2190674111 137105103101100 137105103100100 \n", 99 | "3 529991959116679673 2190675256 137106101107100 137106101100100 \n", 100 | "4 5357053206615171780 2190673926 137103102101100 137103102100100 \n", 101 | "\n", 102 | " user_tags carrier devtype \\\n", 103 | "0 NaN 2 2 \n", 104 | "1 3004406,3004430,3004434 1 2 \n", 105 | "2 3003779,3003843,3003851,3003863,3003865,300386... 2 2 \n", 106 | "3 NaN 2 2 \n", 107 | "4 2100191,2100041,2100078,2100136,2100042,300182... 3 2 \n", 108 | "\n", 109 | " make model nnt ... creative_type \\\n", 110 | "0 Apple iPhone 8 Plus 1 ... 8 \n", 111 | "1 vivo vivo X9Plus 1 ... 8 \n", 112 | "2 OPPO A73t OPPO A73t 4 ... 5 \n", 113 | "3 vivo Z1 vivo Z1 4 ... 8 \n", 114 | "4 HUAWEI HUAWEI MLA-AL10 4 ... 
5 \n", 115 | "\n", 116 | " creative_width creative_height creative_is_jump creative_is_download \\\n", 117 | "0 960 640 True False \n", 118 | "1 960 640 True False \n", 119 | "2 160 640 True False \n", 120 | "3 960 640 True False \n", 121 | "4 320 480 True False \n", 122 | "\n", 123 | " creative_is_js creative_is_voicead creative_has_deeplink app_paid \\\n", 124 | "0 False False False False \n", 125 | "1 False False False False \n", 126 | "2 False False False False \n", 127 | "3 False False False False \n", 128 | "4 False False False False \n", 129 | "\n", 130 | " advert_name \n", 131 | "0 B4734117F35EE97F \n", 132 | "1 B4734117F35EE97F \n", 133 | "2 B4734117F35EE97F \n", 134 | "3 B4734117F35EE97F \n", 135 | "4 42A4CB9035B7F50E \n", 136 | "\n", 137 | "[5 rows x 34 columns]\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "print(test_data.head())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print(type(test_data))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "bool\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(test_data['creative_is_js'].dtype)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | " for u in test_data.columns:\n", 186 | " if test_data[u].dtype==bool:\n", 187 | " test_data[u]=test_data[u].astype('int')" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | " instance_id time city province \\\n", 200 | "0 6930856710792380886 2190675456 137103104101100 137103104100100 \n", 201 | "1 5460409694420131920 2190674821 137103104112100 137103104100100 \n", 202 | "2 982813438159141507 2190674111 137105103101100 137105103100100 \n", 203 | "3 529991959116679673 2190675256 137106101107100 137106101100100 \n", 204 | "4 5357053206615171780 2190673926 137103102101100 137103102100100 \n", 205 | "\n", 206 | " user_tags carrier devtype \\\n", 207 | "0 NaN 2 2 \n", 208 | "1 3004406,3004430,3004434 1 2 \n", 209 | "2 3003779,3003843,3003851,3003863,3003865,300386... 2 2 \n", 210 | "3 NaN 2 2 \n", 211 | "4 2100191,2100041,2100078,2100136,2100042,300182... 3 2 \n", 212 | "\n", 213 | " make model nnt ... creative_type \\\n", 214 | "0 Apple iPhone 8 Plus 1 ... 8 \n", 215 | "1 vivo vivo X9Plus 1 ... 8 \n", 216 | "2 OPPO A73t OPPO A73t 4 ... 5 \n", 217 | "3 vivo Z1 vivo Z1 4 ... 8 \n", 218 | "4 HUAWEI HUAWEI MLA-AL10 4 ... 
5 \n", 219 | "\n", 220 | " creative_width creative_height creative_is_jump creative_is_download \\\n", 221 | "0 960 640 1 0 \n", 222 | "1 960 640 1 0 \n", 223 | "2 160 640 1 0 \n", 224 | "3 960 640 1 0 \n", 225 | "4 320 480 1 0 \n", 226 | "\n", 227 | " creative_is_js creative_is_voicead creative_has_deeplink app_paid \\\n", 228 | "0 0 0 0 0 \n", 229 | "1 0 0 0 0 \n", 230 | "2 0 0 0 0 \n", 231 | "3 0 0 0 0 \n", 232 | "4 0 0 0 0 \n", 233 | "\n", 234 | " advert_name \n", 235 | "0 B4734117F35EE97F \n", 236 | "1 B4734117F35EE97F \n", 237 | "2 B4734117F35EE97F \n", 238 | "3 B4734117F35EE97F \n", 239 | "4 42A4CB9035B7F50E \n", 240 | "\n", 241 | "[5 rows x 34 columns]\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "print(test_data.head())" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 12, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "40024\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "print(len(test_data['user_tags']))" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": { 270 | "scrolled": true 271 | }, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "0 0\n", 278 | "1 0\n", 279 | "2 0\n", 280 | "3 0\n", 281 | "4 0\n", 282 | "Name: f_channel, dtype: object\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "test_data = test_data.fillna(0)\n", 288 | "print(test_data['f_channel'].head())" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 14, 294 | "metadata": { 295 | "scrolled": true 296 | }, 297 | "outputs": [ 298 | { 299 | "name": "stderr", 300 | "output_type": "stream", 301 | "text": [ 302 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:5: SettingWithCopyWarning: \n", 303 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 304 | "\n", 305 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 306 | " \"\"\"\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "#取f_channel字段中下划线后的数字\n", 312 | "for i in range(len(test_data['f_channel'])):\n", 313 | " if test_data['f_channel'][i] != 0:\n", 314 | " #print(i, test_data['f_channel'][i].split('_')[-1])\n", 315 | " test_data['f_channel'][i] = test_data['f_channel'][i].split('_')[-1]" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 27, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "23\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "print(len(test_data['user_tags'][1]))" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stderr", 342 | "output_type": "stream", 343 | "text": [ 344 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:5: SettingWithCopyWarning: \n", 345 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 346 | "\n", 347 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 348 | " \"\"\"\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "#取user_tags长度\n", 354 | "for i in range(len(test_data['user_tags'])):\n", 355 | " if type(test_data['user_tags'][i]) != int:\n", 356 | " #print(i, test_data['f_channel'][i].split('_')[-1])\n", 357 
| " test_data['user_tags'][i] = len(test_data['user_tags'][i])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "print(type(test_data['user_tags'][1]) == str)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 18, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "1\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "print(test_data['user_tags'][1])" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 22, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "for u in test_data['user_tags']:\n", 393 | " if type(u) != int:\n", 394 | " u = len(u)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 23, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "135" 406 | ] 407 | }, 408 | "execution_count": 23, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "len(test_data['user_tags'][2])" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 24, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/plain": [ 425 | "'3003779,3003843,3003851,3003863,3003865,3003869,3003875,3004059,3004081,3004089,3004153,3004214,3004266,3004430,3004434,3004500,3004506'" 426 | ] 427 | }, 428 | "execution_count": 24, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "test_data['user_tags'][2]" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 37, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stderr", 444 | "output_type": "stream", 445 | "text": [ 446 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n", 447 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 448 | "\n", 449 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 450 | " This is separate from the ipykernel package so we can avoid doing imports until\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "for u,i in zip(test_data['user_tags'], range(5)):\n", 456 | " if type(u) != int:\n", 457 | " test_data['user_tags'][i] = len(u)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 38, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "0 0\n", 470 | "1 1\n", 471 | "135 2\n", 472 | "0 3\n", 473 | "824 4\n" 474 | ] 475 | } 476 | ], 477 | "source": [ 478 | "for u,i in zip(test_data['user_tags'], range(5)):\n", 479 | " print(u, i)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 39, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "name": "stderr", 489 | "output_type": "stream", 490 | "text": [ 491 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n", 492 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 493 | "\n", 494 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 495 | " This is separate from the ipykernel package so we can avoid doing imports until\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "for 
u,i in zip(test_data['user_tags'], range(len(test_data['user_tags']))):\n", 501 | "    if type(u) != int:\n", 502 | "        test_data['user_tags'][i] = len(u)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [] 511 | } 512 | ], 513 | "metadata": { 514 | "kernelspec": { 515 | "display_name": "Python 3", 516 | "language": "python", 517 | "name": "python3" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.6.4" 530 | } 531 | }, 532 | "nbformat": 4, 533 | "nbformat_minor": 2 534 | } 535 | -------------------------------------------------------------------------------- /讯飞CTR预测/RXY初版/feature_re_extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# load the data produced in the previous step\n", 20 | "test_data = pd.read_csv('test_data.csv')\n", 21 | "train_data = pd.read_csv('train_data.csv')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# advert_industry_inner packs two IDs into one 'first_second' string\n", 31 | "def dataInterval(data1):\n", 32 | "    d1 = data1.split('_')[0]\n", 33 | "    return d1\n", 34 | "\n", 35 | "def getInterval(arrLike):   # row-wise helper returning the first ID\n", 36 | "    PublishedTime = arrLike['advert_industry_inner']\n", 37 | "#     print(PublishedTime.strip(),ReceivedTime.strip())\n", 38 | "    d1 = dataInterval(PublishedTime)  # first ID, still a string at this point\n", 39 | "    return d1\n", 40 | "\n", 41 | "def dataInterval2(data1):\n", 42 | "    d2 = data1.split('_')[1]\n", 43 | "    return d2\n", 44 | "\n", 45 | "def getInterval2(arrLike):   # row-wise helper returning the second ID\n", 46 | "    PublishedTime = arrLike['advert_industry_inner']\n", 47 | "#     print(PublishedTime.strip(),ReceivedTime.strip())\n", 48 | "    d2 = dataInterval2(PublishedTime)  # second ID, still a string at this point\n", 49 | "    return d2\n", 50 | "    \n", 51 | "# using apply is much faster (than looping over the rows)!\n", 52 | "test_data['advert_first'] = test_data.apply(getInterval , axis = 1)\n", 53 | "test_data['advert_second'] = test_data.apply(getInterval2 , axis = 1)\n", 54 | "\n", 55 | "train_data['advert_first'] = train_data.apply(getInterval , axis = 1)\n", 56 | "train_data['advert_second'] = train_data.apply(getInterval2 , axis = 1)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# also compute the gap between the two IDs\n", 66 | "def getDistance(arrLike):\n", 67 | "    delta = int(arrLike['advert_second']) - int(arrLike['advert_first'])\n", 68 | "    return delta\n", 69 | "\n", 70 | "test_data['advert_delta'] = test_data.apply(getDistance, axis = 1)\n", 71 | "train_data['advert_delta'] = train_data.apply(getDistance, axis = 1)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# drop the original advert_industry_inner column\n", 81 | "test_data.drop(columns=['advert_industry_inner'], inplace=True)\n", 82 | "train_data.drop(columns=['advert_industry_inner'], inplace=True)" 83 | ] 84 | }, 
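{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# (added sketch) a vectorized alternative to the row-wise apply above: pandas'\n", "# .str accessor splits the whole column in one pass and is usually faster still.\n", "# Guarded, because advert_industry_inner has already been dropped at this point;\n", "# it assumes every value has the form 'firstID_secondID'.\n", "if 'advert_industry_inner' in train_data.columns:\n", "    parts = train_data['advert_industry_inner'].str.split('_', expand=True)\n", "    train_data['advert_first'] = parts[0]\n", "    train_data['advert_second'] = parts[1]" ] },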
"code", 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "#将所有类型转换为可以训练的\n", 92 | "test_data['advert_first'] = test_data['advert_first'].astype('int64')\n", 93 | "train_data['advert_first'] = train_data['advert_first'].astype('int64')\n", 94 | "\n", 95 | "test_data['advert_second'] = test_data['advert_first'].astype('int64')\n", 96 | "train_data['advert_second'] = train_data['advert_first'].astype('int64')" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 45, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "Unnamed: 0 int64\n", 108 | "instance_id int64\n", 109 | "time int64\n", 110 | "city int64\n", 111 | "province int64\n", 112 | "user_tags int64\n", 113 | "carrier int64\n", 114 | "devtype int64\n", 115 | "nnt int64\n", 116 | "os int64\n", 117 | "os_name int64\n", 118 | "adid int64\n", 119 | "advert_id int64\n", 120 | "orderid int64\n", 121 | "campaign_id int64\n", 122 | "creative_id int64\n", 123 | "creative_tp_dnf int64\n", 124 | "app_cate_id float64\n", 125 | "f_channel int64\n", 126 | "app_id float64\n", 127 | "creative_type int64\n", 128 | "creative_width int64\n", 129 | "creative_height int64\n", 130 | "creative_is_jump int64\n", 131 | "creative_is_download int64\n", 132 | "creative_is_js int64\n", 133 | "creative_is_voicead int64\n", 134 | "creative_has_deeplink int64\n", 135 | "app_paid int64\n", 136 | "click int64\n", 137 | "advert_first int64\n", 138 | "advert_second int64\n", 139 | "advert_delta int64\n", 140 | "dtype: object" 141 | ] 142 | }, 143 | "execution_count": 45, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "train_data.dtypes" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 8, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "#将click这个label移到最后\n", 159 | "train_data = train_data[\n", 160 | " ['instance_id', 'time', 'city', 'province', 'user_tags', 'carrier',\n", 161 | " 'devtype', 'nnt', 'os', 'os_name', 'adid',\n", 162 | " 'advert_id', 'orderid', 'campaign_id',\n", 163 | " 'creative_id', 'creative_tp_dnf', 'app_cate_id', 'f_channel', 'app_id',\n", 164 | " 'creative_type', 'creative_width', 'creative_height',\n", 165 | " 'creative_is_jump', 'creative_is_download', 'creative_is_js',\n", 166 | " 'creative_is_voicead', 'creative_has_deeplink', 'app_paid','advert_first', 'advert_second', 'advert_delta',\n", 167 | " 'click']\n", 168 | "]" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 47, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/html": [ 179 | "
\n", 180 | "\n", 193 | "\n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | "
instance_idtimecityprovinceuser_tagscarrierdevtypenntosos_name...creative_typecreative_widthcreative_heightcreative_is_jumpcreative_is_downloadcreative_is_jscreative_is_voiceadcreative_has_deeplinkapp_paidclick
0862947199798978072190219034137103102105100137103102100100012121...812807201000000
12699289844928136052219022107013710510110010013710510110010078532121...89606401000000
231175271684458457522190219793137103104111100137103104100100032121...89606401000000
33398484891050993371219022170413710310211310013710310210010033902121...312807201000000
420354775705911764882190220024137103102109100137103102100100012312...89606401000000
\n", 343 | "

5 rows × 32 columns

\n", 344 | "
" 345 | ], 346 | "text/plain": [ 347 | " instance_id time city province \\\n", 348 | "0 86294719979897807 2190219034 137103102105100 137103102100100 \n", 349 | "1 2699289844928136052 2190221070 137105101100100 137105101100100 \n", 350 | "2 3117527168445845752 2190219793 137103104111100 137103104100100 \n", 351 | "3 3398484891050993371 2190221704 137103102113100 137103102100100 \n", 352 | "4 2035477570591176488 2190220024 137103102109100 137103102100100 \n", 353 | "\n", 354 | " user_tags carrier devtype nnt os os_name ... creative_type \\\n", 355 | "0 0 1 2 1 2 1 ... 8 \n", 356 | "1 785 3 2 1 2 1 ... 8 \n", 357 | "2 0 3 2 1 2 1 ... 8 \n", 358 | "3 339 0 2 1 2 1 ... 3 \n", 359 | "4 0 1 2 3 1 2 ... 8 \n", 360 | "\n", 361 | " creative_width creative_height creative_is_jump creative_is_download \\\n", 362 | "0 1280 720 1 0 \n", 363 | "1 960 640 1 0 \n", 364 | "2 960 640 1 0 \n", 365 | "3 1280 720 1 0 \n", 366 | "4 960 640 1 0 \n", 367 | "\n", 368 | " creative_is_js creative_is_voicead creative_has_deeplink app_paid click \n", 369 | "0 0 0 0 0 0 \n", 370 | "1 0 0 0 0 0 \n", 371 | "2 0 0 0 0 0 \n", 372 | "3 0 0 0 0 0 \n", 373 | "4 0 0 0 0 0 \n", 374 | "\n", 375 | "[5 rows x 32 columns]" 376 | ] 377 | }, 378 | "execution_count": 47, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "train_data.head()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 9, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "test_data.to_csv('./data/test_data.csv')\n", 394 | "train_data.to_csv('./data/train_data.csv')" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.6.4" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /讯飞CTR预测/RXY初版/lambda_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import time" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "train_path='./data/round1_iflyad_train.txt'\n", 21 | "test_path='./data/round1_iflyad_test_feature.txt'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 5, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "all_data=pd.read_table(train_path)\n", 31 | "#print(all_data.head(10))\n", 32 | "all_test=pd.read_table(test_path)\n", 33 | "#print(all_test.head(10))\n", 34 | "#将时间戳转化为正常时间\n", 35 | "all_data['time_string']=all_data[\"time\"].apply(lambda x:time.strftime(\"%Y-%m-%d %H:%M:%S\",time.localtime(x)))\n", 36 | "all_data['time_string']=pd.to_datetime(all_data[\"time_string\"])\n", 37 | "all_data[\"hour\"]=all_data[\"time_string\"].dt.hour\n", 38 | 
"all_data[\"day\"]=all_data[\"time_string\"].dt.day\n", 39 | "all_data[\"day\"]=all_data[\"day\"].apply(lambda x:x-27 if x>=27 else x+4)\n", 40 | "\n", 41 | "all_test['time_string']=all_test[\"time\"].apply(lambda x:time.strftime(\"%Y-%m-%d %H:%M:%S\",time.localtime(x)))\n", 42 | "all_test['time_string']=pd.to_datetime(all_test[\"time_string\"])\n", 43 | "all_test[\"hour\"]=all_test[\"time_string\"].dt.hour\n", 44 | "all_test[\"day\"]=all_test[\"time_string\"].dt.day\n", 45 | "all_test[\"day\"]=all_test[\"day\"].apply(lambda x:x-27 if x>=27 else x+4)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 8, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "all_data = all_data.fillna(0)\n", 55 | "all_test = all_test.fillna(0)\n", 56 | "\n", 57 | "all_data['user_tags']=all_data['user_tags'].apply(lambda x:len(x) if not x==0 else 0)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 9, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/html": [ 68 | "
\n", 69 | "\n", 82 | "\n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | "
instance_idtimecityprovinceuser_tagscarrierdevtypemakemodelnnt...creative_is_downloadcreative_is_jscreative_is_voiceadcreative_has_deeplinkapp_paidadvert_nameclicktime_stringhourday
0862947199798978072190219034137103102105100137103102100100012HUAWEIHUAWEI-CAZ-AL101...FalseFalseFalseFalseFalseB4734117F35EE97F02039-05-29 02:10:3422
12699289844928136052219022107013710510110010013710510110010078532XiaomiRedmi Note 41...FalseFalseFalseFalseFalseB4734117F35EE97F02039-05-29 02:44:3022
231175271684458457522190219793137103104111100137103104100100032OPPOOPPO+R11s1...FalseFalseFalseFalseFalseE257895F74792E8102039-05-29 02:23:1322
333984848910509933712190221704137103102113100137103102100100339020OPPO A571...FalseFalseFalseFalseFalse0A421D7B11EABFC502039-05-29 02:55:0422
420354775705911764882190220024137103102109100137103102100100012AppleiPhone 73...FalseFalseFalseFalseFalseB4734117F35EE97F02039-05-29 02:27:0422
52065527640347419040219022122813710410410410013710410410010027112Xiaomi,MI 6,sagitMI 61...FalseFalseFalseFalseFalse862FF2E9B0AD4C1402039-05-29 02:47:0822
\n", 256 | "

6 rows × 38 columns

\n", 257 | "
" 258 | ], 259 | "text/plain": [ 260 | " instance_id time city province \\\n", 261 | "0 86294719979897807 2190219034 137103102105100 137103102100100 \n", 262 | "1 2699289844928136052 2190221070 137105101100100 137105101100100 \n", 263 | "2 3117527168445845752 2190219793 137103104111100 137103104100100 \n", 264 | "3 3398484891050993371 2190221704 137103102113100 137103102100100 \n", 265 | "4 2035477570591176488 2190220024 137103102109100 137103102100100 \n", 266 | "5 2065527640347419040 2190221228 137104104104100 137104104100100 \n", 267 | "\n", 268 | " user_tags carrier devtype make model nnt ... \\\n", 269 | "0 0 1 2 HUAWEI HUAWEI-CAZ-AL10 1 ... \n", 270 | "1 785 3 2 Xiaomi Redmi Note 4 1 ... \n", 271 | "2 0 3 2 OPPO OPPO+R11s 1 ... \n", 272 | "3 339 0 2 0 OPPO A57 1 ... \n", 273 | "4 0 1 2 Apple iPhone 7 3 ... \n", 274 | "5 271 1 2 Xiaomi,MI 6,sagit MI 6 1 ... \n", 275 | "\n", 276 | " creative_is_download creative_is_js creative_is_voicead \\\n", 277 | "0 False False False \n", 278 | "1 False False False \n", 279 | "2 False False False \n", 280 | "3 False False False \n", 281 | "4 False False False \n", 282 | "5 False False False \n", 283 | "\n", 284 | " creative_has_deeplink app_paid advert_name click \\\n", 285 | "0 False False B4734117F35EE97F 0 \n", 286 | "1 False False B4734117F35EE97F 0 \n", 287 | "2 False False E257895F74792E81 0 \n", 288 | "3 False False 0A421D7B11EABFC5 0 \n", 289 | "4 False False B4734117F35EE97F 0 \n", 290 | "5 False False 862FF2E9B0AD4C14 0 \n", 291 | "\n", 292 | " time_string hour day \n", 293 | "0 2039-05-29 02:10:34 2 2 \n", 294 | "1 2039-05-29 02:44:30 2 2 \n", 295 | "2 2039-05-29 02:23:13 2 2 \n", 296 | "3 2039-05-29 02:55:04 2 2 \n", 297 | "4 2039-05-29 02:27:04 2 2 \n", 298 | "5 2039-05-29 02:47:08 2 2 \n", 299 | "\n", 300 | "[6 rows x 38 columns]" 301 | ] 302 | }, 303 | "execution_count": 9, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "all_data.head(6)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 10, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "(151301, 38)\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "features1=all_data[(all_data['day']>=0)&(all_data['day']<=4)]\n", 327 | "dataset1=all_data[all_data['day']==5]\n", 328 | "print(dataset1.shape)\n", 329 | "features2=all_data[(all_data['day']>=0)&(all_data['day']<=5)]\n", 330 | "dataset2=all_data[all_data['day']==6]\n", 331 | "features3=all_data[(all_data['day']>=0)&(all_data['day']<=6)]\n", 332 | "dataset3=all_test" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.6.4" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 2 364 | } 365 | -------------------------------------------------------------------------------- /讯飞CTR预测/RXY初版/pandas_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 46, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | " 0 1\n", 23 | "0 1900_2000 1960_2452\n", 24 | "1 1854_1965 2002_2150\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "a = pd.DataFrame([['1900_2000','1960_2452'],['1854_1965','2002_2150']])\n", 30 | "print(a)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 14, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "def dataInterval(data1,data2):\n", 40 | " d1 = datetime.datetime.strptime(data1, '%Y-%m-%d')\n", 41 | " d2 = datetime.datetime.strptime(data2, '%Y-%m-%d')\n", 42 | " delta = d1 - d2\n", 43 | " return delta.days\n", 44 | "\n", 45 | "def getInterval(arrLike): #用来计算日期间隔天数的调用的函数\n", 46 | " PublishedTime = arrLike['PublishedTime']\n", 47 | " ReceivedTime = arrLike['ReceivedTime']\n", 48 | "# print(PublishedTime.strip(),ReceivedTime.strip())\n", 49 | " days = dataInterval(PublishedTime.strip(),ReceivedTime.strip()) #注意去掉两端空白\n", 50 | " return days\n", 51 | "\n", 52 | "if __name__ == '__main__': \n", 53 | " fileName = \"NS_new.xls\";\n", 54 | " df = pd.read_excel(fileName) \n", 55 | " df['TimeInterval'] = df.apply(getInterval , axis = 1)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 50, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def dataInterval(data1):\n", 65 | " d1 = data1.split('_')[0]\n", 66 | " return d1\n", 67 | "\n", 68 | "def getInterval(arrLike): #用来计算日期间隔天数的调用的函数\n", 69 | " PublishedTime = arrLike[0]\n", 70 | "# print(PublishedTime.strip(),ReceivedTime.strip())\n", 71 | " d1 = dataInterval(PublishedTime) #注意去掉两端空白\n", 72 | " return d1\n", 73 | "\n", 74 | "if __name__ == '__main__':\n", 75 | " a['TimeInterval'] = a.apply(getInterval , axis = 1)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 51, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | " 0 1 TimeInterval\n", 88 | "0 1900_2000 1960_2452 1900\n", 89 | "1 1854_1965 2002_2150 1854\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "print(a)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 41, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "0 [da, ddasasd]\n", 106 | "1 [dsda, das]\n", 107 | "Name: 2, dtype: object" 108 | ] 109 | }, 110 | "execution_count": 41, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "a['2'].str.split('_')" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "import time\n", 124 | "for i in range:\n", 125 | " time.sleep(5)\n", 126 | " print(i)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 42, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "f = lambda x:x[0]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 45, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "['da', 'dsda']\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "print(list(map(f, a['2'].str.split('_'))))" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 53, 158 | "metadata": {}, 159 | "outputs": 
[ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "1\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print(1)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 56, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "2\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "print(2)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 60, 192 | "metadata": { 193 | "scrolled": true 194 | }, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "0\n", 201 | "1\n", 202 | "2\n" 203 | ] 204 | }, 205 | { 206 | "ename": "KeyboardInterrupt", 207 | "evalue": "", 208 | "output_type": "error", 209 | "traceback": [ 210 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 211 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 212 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10000\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 213 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "import time\n", 219 | "for i in range(10000):\n", 220 | " time.sleep(2)\n", 221 | " print(i)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 58, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "1\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "print(1)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 61, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | " 0 1 TimeInterval\n", 251 | "0 1900_2000 1960_2452 1900\n", 252 | "1 1854_1965 2002_2150 1854\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "print(a)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "Python 3", 271 | "language": "python", 272 | "name": "python3" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 3 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython3", 284 | "version": "3.6.4" 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | -------------------------------------------------------------------------------- /讯飞CTR预测/digi_onehot.py: -------------------------------------------------------------------------------- 1 | from numpy import argmax 2 | # define input string 3 | data = 'hello world' 4 | 
print(data) 5 | # define universe of possible input values 6 | alphabet = 'abcdefghijklmnopqrstuvwxyz ' 7 | # define a mapping of chars to integers 8 | char_to_int = dict((c, i) for i, c in enumerate(alphabet)) 9 | int_to_char = dict((i, c) for i, c in enumerate(alphabet)) 10 | # integer encode input data 11 | integer_encoded = [char_to_int[char] for char in data] 12 | print(integer_encoded) 13 | 14 | onehot_encoded = list() 15 | for value in integer_encoded: 16 | letter = [0 for _ in range(len(alphabet))] 17 | letter[value] = 1 18 | onehot_encoded.append(letter) 19 | print(onehot_encoded) 20 | 21 | inverted = int_to_char[argmax(onehot_encoded[0])] 22 | print(inverted) -------------------------------------------------------------------------------- /讯飞CTR预测/one_hot_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import OneHotEncoder 3 | 4 | df2 = pd.DataFrame({'id': [3566841, 6541227, 3512441], 5 | 'sex': [1, 2, 2], 6 | 'level': [3, 1, 2]}) 7 | 8 | id_data = df2.values[:, :1] 9 | transform_data = df2.values[:, 1:] 10 | 11 | enc = OneHotEncoder() 12 | df2_new = enc.fit_transform(transform_data).toarray() 13 | 14 | #zu he 15 | df2_all = pd.concat((pd.DataFrame(id_data),pd.DataFrame(df2_new)),axis=1) 16 | print(df2_all) -------------------------------------------------------------------------------- /讯飞CTR预测/川哥版/_1_extract_features.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import matplotlib as mpl 6 | from scipy import interpolate 7 | import seaborn as sns 8 | from scipy import interpolate 9 | import time 10 | from utils import * 11 | 12 | train_path='./data/round1_iflyad_train.txt' 13 | test_path='./data/round1_iflyad_test_feature.txt' 14 | 15 | 16 | all_data=pd.read_table(train_path) 17 | #print(all_data.head(10)) 18 | all_test=pd.read_table(test_path) 19 | #print(all_test.head(10)) 20 | #将时间戳转化为正常时间 21 | all_data['time_string']=all_data["time"].apply(lambda x:time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x))) 22 | all_data['time_string']=pd.to_datetime(all_data["time_string"]) 23 | all_data["hour"]=all_data["time_string"].dt.hour 24 | all_data["day"]=all_data["time_string"].dt.day 25 | all_data["day"]=all_data["day"].apply(lambda x:x-27 if x>=27 else x+4) 26 | 27 | all_test['time_string']=all_test["time"].apply(lambda x:time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x))) 28 | all_test['time_string']=pd.to_datetime(all_test["time_string"]) 29 | all_test["hour"]=all_test["time_string"].dt.hour 30 | all_test["day"]=all_test["time_string"].dt.day 31 | all_test["day"]=all_test["day"].apply(lambda x:x-27 if x>=27 else x+4) 32 | 33 | #划分训练集与测试集 34 | #27,28,29,30,31,1,2,3 35 | ''' 36 | feature_extract_dataset(day) label 37 | train1 0-4 5 38 | train2(offline_test) 1-5 6 39 | online_test 2-6 7 40 | ''' 41 | features1=all_data[(all_data['day']>=0)&(all_data['day']<=4)] 42 | dataset1=all_data[all_data['day']==5] 43 | print(dataset1.shape) 44 | features2=all_data[(all_data['day']>=1)&(all_data['day']<=5)] 45 | dataset2=all_data[all_data['day']==6] 46 | features3=all_data[(all_data['day']>=2)&(all_data['day']<=6)] 47 | dataset3=all_test 48 | ''' 49 | aa=all_data[['inner_slot_id']] 50 | bb=all_test[['inner_slot_id']] 51 | cc=pd.concat([aa,bb],axis=0) 52 | cc.drop_duplicates(inplace=True) 53 | df=pd.get_dummies(cc['inner_slot_id'],prefix='inner_slot_id') 54 | 
inner_slot_id_one_hot=pd.concat([cc,df],axis=1) 55 | ''' 56 | #构造特征 57 | #点击个数,转化率,转化个数 58 | t1=features1[['adid','click']] 59 | t1=get_type_features(t1,['adid'],'click',"sum",'adid_click_num') 60 | t1=get_type_features(t1,['adid'],'click',"count",'adid_click_cnt') 61 | t1=get_type_features(t1,['adid'],'click',"mean",'adid_click_radio') 62 | t11=t1[['adid','adid_click_num','adid_click_cnt','adid_click_radio']] 63 | t11.drop_duplicates(inplace=True) 64 | print(t1.head(10)) 65 | t2=features1[['app_id','click']] 66 | t2=get_type_features(t2,['app_id'],'click',"sum",'appid_click_num') 67 | t2=get_type_features(t2,['app_id'],'click',"count",'appid_click_cnt') 68 | t2=get_type_features(t2,['app_id'],'click',"mean",'appid_click_radio') 69 | t21=t2[['app_id','appid_click_num','appid_click_cnt','appid_click_radio']] 70 | t21.drop_duplicates(inplace=True) 71 | 72 | t3=features1[['app_id','adid','click']] 73 | t3=get_type_features(t3,['app_id','adid'],'click',"sum",'appid_ad_click_num') 74 | t3=get_type_features(t3,['app_id','adid'],'click',"count",'appid_ad_click_cnt') 75 | t3=get_type_features(t3,['app_id','adid'],'click',"mean",'appid_ad_click_radio') 76 | t31=t3[['app_id','adid','appid_ad_click_num','appid_ad_click_cnt','appid_ad_click_radio']] 77 | t31.drop_duplicates(inplace=True) 78 | 79 | t4=features1[['orderid','click']] 80 | t4=get_type_features(t4,['orderid'],'click',"sum",'orderid_click_num') 81 | t4=get_type_features(t4,['orderid'],'click',"count",'orderid_click_cnt') 82 | t4=get_type_features(t4,['orderid'],'click',"mean",'orderid_click_radio') 83 | t41=t4[['orderid','orderid_click_num','orderid_click_cnt','orderid_click_radio']] 84 | t41.drop_duplicates(inplace=True) 85 | 86 | t5=features1[['inner_slot_id','click']] 87 | t5=get_type_features(t5,['inner_slot_id'],'click',"sum",'inner_slot_id_click_num') 88 | t5=get_type_features(t5,['inner_slot_id'],'click',"count",'inner_slot_id_click_cnt') 89 | t5=get_type_features(t5,['inner_slot_id'],'click',"mean",'inner_slot_id_click_radio') 90 | t51=t5[['inner_slot_id','inner_slot_id_click_num','inner_slot_id_click_cnt','inner_slot_id_click_radio']] 91 | t51.drop_duplicates(inplace=True) 92 | 93 | t6=features1[['inner_slot_id','nnt','click']] 94 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"sum",'inner_slot_id_nnt_click_num') 95 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"count",'inner_slot_id_nnt_click_cnt') 96 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"mean",'inner_slot_id_nnt_click_radio') 97 | t61=t6[['inner_slot_id','nnt','inner_slot_id_nnt_click_num','inner_slot_id_nnt_click_cnt','inner_slot_id_nnt_click_radio']] 98 | t61.drop_duplicates(inplace=True) 99 | 100 | 101 | dataset1=dataset1.merge(t11,on=['adid'],how='left') 102 | dataset1=dataset1.merge(t21,on=['app_id'],how='left') 103 | dataset1=dataset1.merge(t31,on=['app_id','adid'],how='left') 104 | dataset1=dataset1.merge(t41,on=['orderid'],how='left') 105 | dataset1=dataset1.merge(t51,on=['inner_slot_id'],how='left') 106 | dataset1=dataset1.merge(t61,on=['inner_slot_id','nnt'],how='left') 107 | #dataset1=dataset1.merge(inner_slot_id_one_hot,on=['inner_slot_id'],how='left') 108 | #dataset1['creative_shape']=dataset1['creative_height']*dataset1['creative_width'] 109 | #保存提取的特征 110 | dataset1.to_csv('features/feature1.csv',index=None) 111 | ############################################################################################# 112 | #features2 113 | t1=features2[['adid','click']] 114 | 
t1=get_type_features(t1,['adid'],'click',"sum",'adid_click_num') 115 | t1=get_type_features(t1,['adid'],'click',"count",'adid_click_cnt') 116 | t1=get_type_features(t1,['adid'],'click',"mean",'adid_click_radio') 117 | t11=t1[['adid','adid_click_num','adid_click_cnt','adid_click_radio']] 118 | t11.drop_duplicates(inplace=True) 119 | #print(t1.head(10)) 120 | t2=features2[['app_id','click']] 121 | t2=get_type_features(t2,['app_id'],'click',"sum",'appid_click_num') 122 | t2=get_type_features(t2,['app_id'],'click',"count",'appid_click_cnt') 123 | t2=get_type_features(t2,['app_id'],'click',"mean",'appid_click_radio') 124 | t21=t2[['app_id','appid_click_num','appid_click_cnt','appid_click_radio']] 125 | t21.drop_duplicates(inplace=True) 126 | 127 | t3=features2[['app_id','adid','click']] 128 | t3=get_type_features(t3,['app_id','adid'],'click',"sum",'appid_ad_click_num') 129 | t3=get_type_features(t3,['app_id','adid'],'click',"count",'appid_ad_click_cnt') 130 | t3=get_type_features(t3,['app_id','adid'],'click',"mean",'appid_ad_click_radio') 131 | t31=t3[['app_id','adid','appid_ad_click_num','appid_ad_click_cnt','appid_ad_click_radio']] 132 | t31.drop_duplicates(inplace=True) 133 | 134 | t4=features2[['orderid','click']] 135 | t4=get_type_features(t4,['orderid'],'click',"sum",'orderid_click_num') 136 | t4=get_type_features(t4,['orderid'],'click',"count",'orderid_click_cnt') 137 | t4=get_type_features(t4,['orderid'],'click',"mean",'orderid_click_radio') 138 | t41=t4[['orderid','orderid_click_num','orderid_click_cnt','orderid_click_radio']] 139 | t41.drop_duplicates(inplace=True) 140 | 141 | t5=features2[['inner_slot_id','click']] 142 | t5=get_type_features(t5,['inner_slot_id'],'click',"sum",'inner_slot_id_click_num') 143 | t5=get_type_features(t5,['inner_slot_id'],'click',"count",'inner_slot_id_click_cnt') 144 | t5=get_type_features(t5,['inner_slot_id'],'click',"mean",'inner_slot_id_click_radio') 145 | t51=t5[['inner_slot_id','inner_slot_id_click_num','inner_slot_id_click_cnt','inner_slot_id_click_radio']] 146 | t51.drop_duplicates(inplace=True) 147 | 148 | t6=features2[['inner_slot_id','nnt','click']] 149 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"sum",'inner_slot_id_nnt_click_num') 150 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"count",'inner_slot_id_nnt_click_cnt') 151 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"mean",'inner_slot_id_nnt_click_radio') 152 | t61=t6[['inner_slot_id','nnt','inner_slot_id_nnt_click_num','inner_slot_id_nnt_click_cnt','inner_slot_id_nnt_click_radio']] 153 | t61.drop_duplicates(inplace=True) 154 | #one-hot 155 | #inner_slot_id_df=pd.get_dummies(dataset2['inner_slot_id'],prefix='inner_slot_id') 156 | dataset2=dataset2.merge(t11,on=['adid'],how='left') 157 | dataset2=dataset2.merge(t21,on=['app_id'],how='left') 158 | dataset2=dataset2.merge(t31,on=['app_id','adid'],how='left') 159 | dataset2=dataset2.merge(t41,on=['orderid'],how='left') 160 | dataset2=dataset2.merge(t51,on=['inner_slot_id'],how='left') 161 | dataset2=dataset2.merge(t61,on=['inner_slot_id','nnt'],how='left') 162 | #dataset2=dataset2.merge(inner_slot_id_one_hot,on=['inner_slot_id'],how='left') 163 | #dataset2['creative_shape']=dataset2['creative_height']*dataset2['creative_width'] 164 | 165 | dataset2.to_csv('features/feature2.csv',index=None) 166 | ################################################################################################## 167 | #test数据集 168 | #features3 169 | t1=features3[['adid','click']] 170 | 
t1=get_type_features(t1,['adid'],'click',"sum",'adid_click_num') 171 | t1=get_type_features(t1,['adid'],'click',"count",'adid_click_cnt') 172 | t1=get_type_features(t1,['adid'],'click',"mean",'adid_click_radio') 173 | t11=t1[['adid','adid_click_num','adid_click_cnt','adid_click_radio']] 174 | t11.drop_duplicates(inplace=True) 175 | print(t1.head(10)) 176 | t2=features3[['app_id','click']] 177 | t2=get_type_features(t2,['app_id'],'click',"sum",'appid_click_num') 178 | t2=get_type_features(t2,['app_id'],'click',"count",'appid_click_cnt') 179 | t2=get_type_features(t2,['app_id'],'click',"mean",'appid_click_radio') 180 | t21=t2[['app_id','appid_click_num','appid_click_cnt','appid_click_radio']] 181 | t21.drop_duplicates(inplace=True) 182 | 183 | t3=features3[['app_id','adid','click']] 184 | t3=get_type_features(t3,['app_id','adid'],'click',"sum",'appid_ad_click_num') 185 | t3=get_type_features(t3,['app_id','adid'],'click',"count",'appid_ad_click_cnt') 186 | t3=get_type_features(t3,['app_id','adid'],'click',"mean",'appid_ad_click_radio') 187 | t31=t3[['app_id','adid','appid_ad_click_num','appid_ad_click_cnt','appid_ad_click_radio']] 188 | t31.drop_duplicates(inplace=True) 189 | 190 | t4=features3[['orderid','click']] 191 | t4=get_type_features(t4,['orderid'],'click',"sum",'orderid_click_num') 192 | t4=get_type_features(t4,['orderid'],'click',"count",'orderid_click_cnt') 193 | t4=get_type_features(t4,['orderid'],'click',"mean",'orderid_click_radio') 194 | t41=t4[['orderid','orderid_click_num','orderid_click_cnt','orderid_click_radio']] 195 | t41.drop_duplicates(inplace=True) 196 | 197 | t5=features3[['inner_slot_id','click']] 198 | t5=get_type_features(t5,['inner_slot_id'],'click',"sum",'inner_slot_id_click_num') 199 | t5=get_type_features(t5,['inner_slot_id'],'click',"count",'inner_slot_id_click_cnt') 200 | t5=get_type_features(t5,['inner_slot_id'],'click',"mean",'inner_slot_id_click_radio') 201 | t51=t5[['inner_slot_id','inner_slot_id_click_num','inner_slot_id_click_cnt','inner_slot_id_click_radio']] 202 | t51.drop_duplicates(inplace=True) 203 | 204 | t6=features3[['inner_slot_id','nnt','click']] 205 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"sum",'inner_slot_id_nnt_click_num') 206 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"count",'inner_slot_id_nnt_click_cnt') 207 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"mean",'inner_slot_id_nnt_click_radio') 208 | t61=t6[['inner_slot_id','nnt','inner_slot_id_nnt_click_num','inner_slot_id_nnt_click_cnt','inner_slot_id_nnt_click_radio']] 209 | t61.drop_duplicates(inplace=True) 210 | 211 | dataset3=dataset3.merge(t11,on=['adid'],how='left') 212 | dataset3=dataset3.merge(t21,on=['app_id'],how='left') 213 | dataset3=dataset3.merge(t31,on=['app_id','adid'],how='left') 214 | dataset3=dataset3.merge(t41,on=['orderid'],how='left') 215 | dataset3=dataset3.merge(t51,on=['inner_slot_id'],how='left') 216 | dataset3=dataset3.merge(t61,on=['inner_slot_id','nnt'],how='left') 217 | #dataset3=dataset3.merge(inner_slot_id_one_hot,on=['inner_slot_id'],how='left') 218 | #dataset3['creative_shape']=dataset3['creative_height']*dataset3['creative_width'] 219 | 220 | dataset3.to_csv('features/online_test_features.csv',index=None) -------------------------------------------------------------------------------- /讯飞CTR预测/川哥版/_2_train.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import xgboost as xgb 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.preprocessing 
import MinMaxScaler 6 | import datetime 7 | 8 | #训练 9 | dataset1 = pd.read_csv('features/feature1.csv') 10 | #dataset1.click.replace(-1,0,inplace=True) 11 | dataset2 = pd.read_csv('features/feature2.csv') 12 | #dataset2.click.replace(-1,0,inplace=True) 13 | dataset3 = pd.read_csv('features/online_test_features.csv') 14 | 15 | dataset1.drop_duplicates(inplace=True) 16 | dataset2.drop_duplicates(inplace=True) 17 | dataset3.drop_duplicates(inplace=True) 18 | 19 | dataset1= dataset1.replace(np.nan,0) 20 | dataset2= dataset2.replace(np.nan,0) 21 | dataset3= dataset3.replace(np.nan,0) 22 | 23 | dataset12 = pd.concat([dataset1,dataset2],axis=0) 24 | 25 | dataset1_y = dataset1.click 26 | dataset1_x = dataset1.drop(['instance_id','click','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1) # 'day_gap_before','day_gap_after' cause overfitting, 0.77 27 | dataset2_y = dataset2.click 28 | dataset2_x = dataset2.drop(['instance_id','click','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1) 29 | dataset12_y = dataset12.click 30 | dataset12_x = dataset12.drop(['instance_id','click','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1) 31 | dataset3_preds = dataset3[['instance_id']] 32 | dataset3_x = dataset3.drop(['instance_id','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1) 33 | 34 | print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape) 35 | 36 | dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y) 37 | dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y) 38 | dataset12 = xgb.DMatrix(dataset12_x,label=dataset12_y) 39 | dataset3 = xgb.DMatrix(dataset3_x) 40 | 41 | params={'booster':'gbtree', 42 | 'objective': 'binary:logistic', 43 | 'eval_metric':'logloss', 44 | 'gamma':0.1, 45 | 'min_child_weight':1.1, 46 | 'max_depth':2, 47 | 'lambda':10, 48 | 'subsample':0.7, 49 | 'colsample_bytree':0.7, 50 | 'colsample_bylevel':0.7, 51 | 'eta': 0.2, 52 | 'tree_method':'exact', 53 | 'seed':0, 54 | 'nthread':12 55 | } 56 | 57 | #train on dataset1, evaluate on dataset2 58 | #watchlist = [(dataset1,'train'),(dataset2,'val')] 59 | #model = xgb.train(params,dataset1,num_boost_round=800,evals=watchlist,early_stopping_rounds=300) 60 | 61 | watchlist = [(dataset12,'train')] 62 | model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist) 63 | 64 | #predict test set 65 | dataset3_preds['predicted_score'] = model.predict(dataset3) 66 | #dataset3_preds.click = MinMaxScaler().fit_transform(dataset3_preds.click.reshape(-1, 1)) 67 | #dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True) 68 | dataset3_preds.to_csv("./result/xgb_preds.csv",index=None) 69 | print(dataset3_preds.describe()) 70 | 71 | #save feature score 72 | feature_score = model.get_fscore() 73 | feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True) 74 | fs = [] 75 | for (key,value) in feature_score: 76 | fs.append("{0},{1}\n".format(key,value)) 77 | 78 | with open('xgb_feature_score.csv','w') as f: 79 | f.writelines("feature,score\n") 80 | f.writelines(fs) -------------------------------------------------------------------------------- /讯飞CTR预测/川哥版/utils.py: -------------------------------------------------------------------------------- 1 | 
#coding=utf-8 2 | import pandas as pd 3 | 4 | # Aggregate statistics per key: impression count, conversion count, and conversion rate 5 | def get_type_features(df,columns,value,operation,rename): 6 |     if operation=="count":# impression count 7 |         add=pd.DataFrame(df.groupby(columns)[value].count()).reset_index() 8 |     if operation=="sum":# conversion count 9 |         add=pd.DataFrame(df.groupby(columns)[value].sum()).reset_index() 10 |     if operation=="mean":# conversion rate 11 |         add=pd.DataFrame(df.groupby(columns)[value].mean()).reset_index() 12 |     add.columns=columns+[rename] 13 |     df=df.merge(add,on=columns,how='left') 14 |     return df 15 | 16 | -------------------------------------------------------------------------------- /讯飞CTR预测/鱼神大佬/kdxf_baseline.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from sklearn.feature_selection import chi2, SelectPercentile 3 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 4 | from sklearn.model_selection import StratifiedKFold 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from scipy import sparse 7 | import lightgbm as lgb 8 | import warnings 9 | import time 10 | import pandas as pd 11 | import numpy as np 12 | import os 13 | 14 | path = '/Users/inf/PycharmProject/kaggle/kdxf/data' 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | train = pd.read_table(path + '/train.txt') 19 | test = pd.read_table(path + '/test.txt') 20 | data = pd.concat([train, test], axis=0, ignore_index=True) 21 | 22 | data = data.fillna(-1) 23 | 24 | data['day'] = data['time'].apply(lambda x: int(time.strftime("%d", time.localtime(x)))) 25 | data['hour'] = data['time'].apply(lambda x: int(time.strftime("%H", time.localtime(x)))) 26 | data['label'] = data.click.astype(int) 27 | del data['click'] 28 | 29 | bool_feature = ['creative_is_jump', 'creative_is_download', 'creative_is_js', 'creative_is_voicead', 30 |                 'creative_has_deeplink', 'app_paid'] 31 | for i in bool_feature: 32 |     data[i] = data[i].astype(int) 33 | 34 | data['advert_industry_inner_1'] = data['advert_industry_inner'].apply(lambda x: x.split('_')[0]) 35 | 36 | ad_cate_feature = ['adid', 'advert_id', 'orderid', 'advert_industry_inner_1', 'advert_industry_inner', 'advert_name', 37 |                    'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf', 'creative_has_deeplink', 38 |                    'creative_is_jump', 'creative_is_download'] 39 | 40 | media_cate_feature = ['app_cate_id', 'f_channel', 'app_id', 'inner_slot_id'] 41 | 42 | content_cate_feature = ['city', 'carrier', 'province', 'nnt', 'devtype', 'osv', 'os', 'make', 'model'] 43 | 44 | origin_cate_list = ad_cate_feature + media_cate_feature + content_cate_feature 45 | 46 | for i in origin_cate_list: 47 |     data[i] = data[i].map(dict(zip(data[i].unique(), range(0, data[i].nunique())))) 48 | 49 | cate_feature = origin_cate_list 50 | 51 | num_feature = ['creative_width', 'creative_height', 'hour'] 52 | 53 | feature = cate_feature + num_feature 54 | print(len(feature), feature) 55 | 56 | predict = data[data.label == -1] 57 | predict_result = predict[['instance_id']] 58 | predict_result['predicted_score'] = 0 59 | predict_x = predict.drop('label', axis=1) 60 | 61 | train_x = data[data.label != -1] 62 | train_y = data[data.label != -1].label.values 63 | 64 | # Cached features are loaded by default; if you add new cate features, change this to False to regenerate them 65 | if os.path.exists(path + '/feature/base_train_csr.npz') and True: 66 |     print('load_csr---------') 67 |     base_train_csr = sparse.load_npz(path + '/feature/base_train_csr.npz').tocsr().astype('bool') 68 |     base_predict_csr = sparse.load_npz(path + '/feature/base_predict_csr.npz').tocsr().astype('bool') 69 
| else: 70 | base_train_csr = sparse.csr_matrix((len(train), 0)) 71 | base_predict_csr = sparse.csr_matrix((len(predict_x), 0)) 72 | 73 | enc = OneHotEncoder() 74 | for feature in cate_feature: 75 | enc.fit(data[feature].values.reshape(-1, 1)) 76 | base_train_csr = sparse.hstack((base_train_csr, enc.transform(train_x[feature].values.reshape(-1, 1))), 'csr', 77 | 'bool') 78 | base_predict_csr = sparse.hstack((base_predict_csr, enc.transform(predict[feature].values.reshape(-1, 1))), 79 | 'csr', 80 | 'bool') 81 | print('one-hot prepared !') 82 | 83 | cv = CountVectorizer(min_df=20) 84 | for feature in ['user_tags']: 85 | data[feature] = data[feature].astype(str) 86 | cv.fit(data[feature]) 87 | base_train_csr = sparse.hstack((base_train_csr, cv.transform(train_x[feature].astype(str))), 'csr', 'bool') 88 | base_predict_csr = sparse.hstack((base_predict_csr, cv.transform(predict_x[feature].astype(str))), 'csr', 89 | 'bool') 90 | print('cv prepared !') 91 | 92 | sparse.save_npz(path + '/feature/base_train_csr.npz', base_train_csr) 93 | sparse.save_npz(path + '/feature/base_predict_csr.npz', base_predict_csr) 94 | 95 | train_csr = sparse.hstack( 96 | (sparse.csr_matrix(train_x[num_feature]), base_train_csr), 'csr').astype( 97 | 'float32') 98 | predict_csr = sparse.hstack( 99 | (sparse.csr_matrix(predict_x[num_feature]), base_predict_csr), 'csr').astype('float32') 100 | print(train_csr.shape) 101 | feature_select = SelectPercentile(chi2, percentile=95) 102 | feature_select.fit(train_csr, train_y) 103 | train_csr = feature_select.transform(train_csr) 104 | predict_csr = feature_select.transform(predict_csr) 105 | print('feature select') 106 | print(train_csr.shape) 107 | 108 | lgb_model = lgb.LGBMClassifier( 109 | boosting_type='gbdt', num_leaves=32, reg_alpha=0, reg_lambda=0.1, 110 | max_depth=-1, n_estimators=5000, objective='binary', 111 | subsample=0.7, colsample_bytree=0.7, subsample_freq=1, 112 | learning_rate=0.05, random_state=2018, n_jobs=-1 113 | ) 114 | 115 | skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True) 116 | best_score = [] 117 | for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)): 118 | lgb_model.fit(train_csr[train_index], train_y[train_index], 119 | eval_set=[(train_csr[train_index], train_y[train_index]), 120 | (train_csr[test_index], train_y[test_index])], early_stopping_rounds=100) 121 | best_score.append(lgb_model.best_score_['valid_1']['binary_logloss']) 122 | print(best_score) 123 | test_pred = lgb_model.predict_proba(predict_csr, num_iteration=lgb_model.best_iteration_)[:, 1] 124 | print('test mean:', test_pred.mean()) 125 | predict_result['predicted_score'] = predict_result['predicted_score'] + test_pred 126 | print(np.mean(best_score)) 127 | predict_result['predicted_score'] = predict_result['predicted_score'] / 5 128 | mean = predict_result['predicted_score'].mean() 129 | print('mean:', mean) 130 | now = datetime.datetime.now() 131 | now = now.strftime('%m-%d-%H-%M') 132 | predict_result[['instance_id', 'predicted_score']].to_csv(path + "/submission/lgb_baseline_%s.csv" % now, index=False) -------------------------------------------------------------------------------- /阿里天池o2o新人赛/wepe_o2o.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from datetime import date" 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2698: DtypeWarning: Columns (0,1) have mixed types. Specify dtype option on import or set low_memory=False.\n", 26 | " interactivity=interactivity, compiler=compiler, result=result)\n", 27 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2698: DtypeWarning: Columns (0,1,2) have mixed types. Specify dtype option on import or set low_memory=False.\n", 28 | " interactivity=interactivity, compiler=compiler, result=result)\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "#1754884 record,1053282 with coupon_id,9738 coupon. date_received:20160101~20160615,date:20160101~20160630, 539438 users, 8415 merchants\n", 34 | "off_train = pd.read_csv('data/ccf_offline_stage1_train.csv',header=None)\n", 35 | "off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']\n", 36 | "#2050 coupon_id. date_received:20160701~20160731, 76309 users(76307 in trainset, 35965 in online_trainset), 1559 merchants(1558 in trainset)\n", 37 | "off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv',header=None)\n", 38 | "off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']\n", 39 | "#11429826 record(872357 with coupon_id),762858 user(267448 in off_train)\n", 40 | "on_train = pd.read_csv('data/ccf_online_stage1_train.csv',header=None)\n", 41 | "on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "dataset3 = off_test\n", 53 | "feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))]\n", 54 | "dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')]\n", 55 | "feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]\n", 56 | "dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')]\n", 57 | "feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 15, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | " user_id this_month_user_receive_all_coupon_count\n", 70 | "0 1000020 1\n", 71 | "1 1000026 1\n", 72 | "2 1000452 1\n", 73 | "3 1000510 1\n", 74 | "4 100057 1\n" 75 | ] 76 | }, 77 | { 78 | "name": "stderr", 79 | "output_type": "stream", 80 | "text": [ 81 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 82 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 83 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 84 | "\n", 85 | "See 
the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 86 | " \n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "t = dataset3[['user_id']]\n", 92 | "t['this_month_user_receive_all_coupon_count'] = 1\n", 93 | "t = t.groupby('user_id').agg('sum').reset_index()\n", 94 | "print(t[:5])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 17, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | " user_id coupon_id this_month_user_receive_same_coupon_count\n", 107 | "0 1000020 13602 1\n", 108 | "1 1000026 13602 1\n", 109 | "2 1000452 9983 1\n", 110 | "3 1000510 10418 1\n", 111 | "4 100057 2601 1\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 119 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 120 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 121 | "\n", 122 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 123 | " \n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "t1 = dataset3[['user_id','coupon_id']]\n", 129 | "t1['this_month_user_receive_same_coupon_count'] = 1\n", 130 | "#按照user_id和coupon_id进行分组\n", 131 | "#统计每个用户,使用不同优惠券的次数\n", 132 | "t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()\n", 133 | "print(t1[:5])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 29, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stderr", 143 | "output_type": "stream", 144 | "text": [ 145 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\pandas\\core\\generic.py:2999: SettingWithCopyWarning: \n", 146 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 147 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 148 | "\n", 149 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 150 | " self[name] = value\n" 151 | ] 152 | }, 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | " user_id coupon_id date_received\n", 158 | "0 1000020 13602 20160731\n", 159 | "1 1000026 13602 20160729\n", 160 | "2 1000452 9983 20160727\n", 161 | "3 1000510 10418 20160701\n", 162 | "4 100057 2601 20160708\n", 163 | "5 1000651 13602 20160728\n", 164 | "6 1000884 10438 20160714\n", 165 | "7 1000907 1904 20160703\n", 166 | "8 1000936 4203 20160710\n", 167 | "9 1000986 12429 20160701\n", 168 | "10 1001023 13602 20160723\n", 169 | "11 1001176 13181 20160731\n", 170 | "12 1001176 361 20160709\n", 171 | "13 1001176 3992 20160731\n", 172 | "14 100122 12735 20160714\n", 173 | "15 100122 13602 20160727\n", 174 | "16 1001240 10418 20160706\n", 175 | "17 1001240 13602 20160711\n", 176 | "18 1001240 2978 20160706\n", 177 | "19 1001257 11799 20160711\n", 178 | "20 1001302 13602 20160726\n", 179 | "21 1001466 9983 20160717\n", 180 | "22 100150 13602 20160725\n", 181 | "23 1001505 13602 20160710\n", 182 | "24 1001505 4283 20160702\n", 183 | "25 1001525 10418 20160713\n", 184 | "26 1001729 10418 20160702\n", 185 | "27 1001729 10438 20160723\n", 186 | "28 1001729 13602 20160719\n", 187 | "29 1001729 2978 20160723\n", 188 | "... ... ... 
...\n", 189 | "105929 99721 10438 20160707\n", 190 | "105930 997367 13602 20160724\n", 191 | "105931 997367 8059 20160715\n", 192 | "105932 997367 9983 20160714\n", 193 | "105933 997426 13602 20160710\n", 194 | "105934 997688 13602 20160705\n", 195 | "105935 997751 13602 20160729\n", 196 | "105936 997802 3443 20160724:20160707\n", 197 | "105937 997802 6465 20160721\n", 198 | "105938 997802 7459 20160724\n", 199 | "105939 997846 613 20160729\n", 200 | "105940 997992 13602 20160707\n", 201 | "105941 998381 13602 20160728\n", 202 | "105942 998639 2978 20160729\n", 203 | "105943 998686 768 20160704\n", 204 | "105944 998717 13602 20160702\n", 205 | "105945 998717 9983 20160703\n", 206 | "105946 998773 4185 20160701\n", 207 | "105947 998807 10418 20160702\n", 208 | "105948 998945 13602 20160727\n", 209 | "105949 999137 9983 20160728\n", 210 | "105950 999350 13602 20160710\n", 211 | "105951 999659 13191 20160705\n", 212 | "105952 999659 7517 20160705\n", 213 | "105953 999781 12027 20160703\n", 214 | "105954 999781 1904 20160703\n", 215 | "105955 999842 9983 20160703\n", 216 | "105956 999931 13320 20160731\n", 217 | "105957 99996 13602 20160723\n", 218 | "105958 User_id Coupon_id Date_received\n", 219 | "\n", 220 | "[105959 rows x 3 columns]\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "t2 = dataset3[['user_id','coupon_id','date_received']]\n", 226 | "t2.date_received = t2.date_received.astype('str')\n", 227 | "# 按照user_id','coupon_id排序后,提出来date_received,进行agg运算\n", 228 | "# agg运算:用冒号连接起来\n", 229 | "t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()\n", 230 | "print(t2)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 30, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "#apply会返回每个优惠券的使用次数\n", 242 | "t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))\n", 243 | "#筛出使用次数大于1次的数据\n", 244 | "t2 = t2[t2.receive_number>1]\n", 245 | "#对max_date_received赋值为最近一次的使用时间\n", 246 | "t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))\n", 247 | "#对min_date_received赋值为最早一次的使用时间\n", 248 | "t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(d) for d in s.split(':')]))\n", 249 | "# 重新定义t2为以下项目\n", 250 | "t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 33, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "t3 = dataset3[['user_id','coupon_id','date_received']]\n", 262 | "#merge,将两个数据集合并\n", 263 | "#将t2和t3在['user_id','coupon_id']上进行左帧合并,即根据t3合并t2的user_id','coupon_id\n", 264 | "#t2[['user_id','coupon_id','max_date_received','min_date_received']]\n", 265 | "#t3[['user_id','coupon_id','date_received']]\n", 266 | "#因此合并方式为:找到每个用户每张优惠券的消费时间和对应券的max_date_received与min_date_received\n", 267 | "t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')\n", 268 | "#t3的this_month_user_receive_same_coupon_lastone项目设置为:此用户消费本张优惠券与最近一次消费本张优惠券的间隔\n", 269 | "t3 = t3.apply(pd.to_numeric, args=('coerce',))\n", 270 | "t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received\n", 271 | "#此用户消费本张优惠券与第一次消费本张优惠券的间隔\n", 272 | "t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received\n", 273 | "def is_firstlastone(x):\n", 274 | " if x==0:\n", 275 | " return 1\n", 276 | " elif x>0:\n", 277 | " return 0\n", 278 | " 
else:\n", 279 | " return -1 #those only receive once\n", 280 | " \n", 281 | "t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)\n", 282 | "t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)\n", 283 | "t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 34, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | " user_id coupon_id date_received \\\n", 296 | "0 NaN NaN NaN \n", 297 | "1 4129537.0 9983.0 20160712.0 \n", 298 | "2 6949378.0 3429.0 20160706.0 \n", 299 | "3 2166529.0 6928.0 20160727.0 \n", 300 | "4 2166529.0 1808.0 20160727.0 \n", 301 | "5 6172162.0 6500.0 20160708.0 \n", 302 | "6 4005121.0 9983.0 20160706.0 \n", 303 | "7 4347394.0 9983.0 20160716.0 \n", 304 | "8 3094273.0 13602.0 20160727.0 \n", 305 | "9 5139970.0 9983.0 20160729.0 \n", 306 | "10 3237121.0 13602.0 20160703.0 \n", 307 | "11 6224386.0 9983.0 20160716.0 \n", 308 | "12 6488578.0 13602.0 20160712.0 \n", 309 | "13 4164865.0 9983.0 20160703.0 \n", 310 | "14 4164865.0 8059.0 20160706.0 \n", 311 | "15 5468674.0 9983.0 20160713.0 \n", 312 | "16 6258178.0 9144.0 20160706.0 \n", 313 | "17 3659521.0 7341.0 20160727.0 \n", 314 | "18 3659521.0 13181.0 20160717.0 \n", 315 | "19 3659521.0 13602.0 20160718.0 \n", 316 | "20 7333378.0 13602.0 20160704.0 \n", 317 | "21 7333378.0 785.0 20160727.0 \n", 318 | "22 4454914.0 2978.0 20160711.0 \n", 319 | "23 6817282.0 8375.0 20160724.0 \n", 320 | "24 3149569.0 10418.0 20160721.0 \n", 321 | "25 6301186.0 1715.0 20160718.0 \n", 322 | "26 6301186.0 4203.0 20160708.0 \n", 323 | "27 2891521.0 13602.0 20160724.0 \n", 324 | "28 3422977.0 13602.0 20160727.0 \n", 325 | "29 4771330.0 13602.0 20160726.0 \n", 326 | "... ... ... ... 
\n", 327 | "113611 4194809.0 11799.0 20160717.0 \n", 328 | "113612 4194809.0 13602.0 20160713.0 \n", 329 | "113613 6062585.0 10438.0 20160719.0 \n", 330 | "113614 6062585.0 13602.0 20160719.0 \n", 331 | "113615 6074873.0 878.0 20160724.0 \n", 332 | "113616 6342137.0 13602.0 20160728.0 \n", 333 | "113617 6342137.0 8059.0 20160704.0 \n", 334 | "113618 6342137.0 9822.0 20160724.0 \n", 335 | "113619 6342137.0 3429.0 20160718.0 \n", 336 | "113620 4317689.0 5933.0 20160727.0 \n", 337 | "113621 5110265.0 13602.0 20160711.0 \n", 338 | "113622 5110265.0 9983.0 20160711.0 \n", 339 | "113623 6422009.0 5874.0 20160710.0 \n", 340 | "113624 4851197.0 9983.0 20160713.0 \n", 341 | "113625 4894205.0 2978.0 20160711.0 \n", 342 | "113626 7253501.0 13602.0 20160714.0 \n", 343 | "113627 6485501.0 13602.0 20160720.0 \n", 344 | "113628 4918781.0 10438.0 20160723.0 \n", 345 | "113629 6497789.0 9983.0 20160716.0 \n", 346 | "113630 7047677.0 2601.0 20160708.0 \n", 347 | "113631 6786557.0 9983.0 20160718.0 \n", 348 | "113632 6801917.0 13602.0 20160719.0 \n", 349 | "113633 7066109.0 9144.0 20160705.0 \n", 350 | "113634 4451837.0 13602.0 20160723.0 \n", 351 | "113635 5828093.0 2978.0 20160716.0 \n", 352 | "113636 5828093.0 10418.0 20160716.0 \n", 353 | "113637 6626813.0 7595.0 20160707.0 \n", 354 | "113638 6626813.0 7590.0 20160712.0 \n", 355 | "113639 4547069.0 13602.0 20160717.0 \n", 356 | "113640 6675965.0 613.0 20160728.0 \n", 357 | "\n", 358 | " this_month_user_receive_same_coupon_lastone \\\n", 359 | "0 -1 \n", 360 | "1 -1 \n", 361 | "2 -1 \n", 362 | "3 -1 \n", 363 | "4 -1 \n", 364 | "5 -1 \n", 365 | "6 -1 \n", 366 | "7 -1 \n", 367 | "8 -1 \n", 368 | "9 -1 \n", 369 | "10 -1 \n", 370 | "11 -1 \n", 371 | "12 -1 \n", 372 | "13 -1 \n", 373 | "14 -1 \n", 374 | "15 -1 \n", 375 | "16 -1 \n", 376 | "17 -1 \n", 377 | "18 -1 \n", 378 | "19 -1 \n", 379 | "20 -1 \n", 380 | "21 -1 \n", 381 | "22 -1 \n", 382 | "23 -1 \n", 383 | "24 -1 \n", 384 | "25 -1 \n", 385 | "26 -1 \n", 386 | "27 -1 \n", 387 | "28 -1 \n", 388 | "29 -1 \n", 389 | "... ... \n", 390 | "113611 -1 \n", 391 | "113612 -1 \n", 392 | "113613 -1 \n", 393 | "113614 -1 \n", 394 | "113615 -1 \n", 395 | "113616 -1 \n", 396 | "113617 -1 \n", 397 | "113618 -1 \n", 398 | "113619 -1 \n", 399 | "113620 -1 \n", 400 | "113621 -1 \n", 401 | "113622 -1 \n", 402 | "113623 -1 \n", 403 | "113624 -1 \n", 404 | "113625 -1 \n", 405 | "113626 -1 \n", 406 | "113627 -1 \n", 407 | "113628 -1 \n", 408 | "113629 -1 \n", 409 | "113630 -1 \n", 410 | "113631 -1 \n", 411 | "113632 -1 \n", 412 | "113633 -1 \n", 413 | "113634 -1 \n", 414 | "113635 -1 \n", 415 | "113636 -1 \n", 416 | "113637 -1 \n", 417 | "113638 -1 \n", 418 | "113639 -1 \n", 419 | "113640 -1 \n", 420 | "\n", 421 | " this_month_user_receive_same_coupon_firstone \n", 422 | "0 -1 \n", 423 | "1 -1 \n", 424 | "2 -1 \n", 425 | "3 -1 \n", 426 | "4 -1 \n", 427 | "5 -1 \n", 428 | "6 -1 \n", 429 | "7 -1 \n", 430 | "8 -1 \n", 431 | "9 -1 \n", 432 | "10 -1 \n", 433 | "11 -1 \n", 434 | "12 -1 \n", 435 | "13 -1 \n", 436 | "14 -1 \n", 437 | "15 -1 \n", 438 | "16 -1 \n", 439 | "17 -1 \n", 440 | "18 -1 \n", 441 | "19 -1 \n", 442 | "20 -1 \n", 443 | "21 -1 \n", 444 | "22 -1 \n", 445 | "23 -1 \n", 446 | "24 -1 \n", 447 | "25 -1 \n", 448 | "26 -1 \n", 449 | "27 -1 \n", 450 | "28 -1 \n", 451 | "29 -1 \n", 452 | "... ... 
\n", 453 | "113611 -1 \n", 454 | "113612 -1 \n", 455 | "113613 -1 \n", 456 | "113614 -1 \n", 457 | "113615 -1 \n", 458 | "113616 -1 \n", 459 | "113617 -1 \n", 460 | "113618 -1 \n", 461 | "113619 -1 \n", 462 | "113620 -1 \n", 463 | "113621 -1 \n", 464 | "113622 -1 \n", 465 | "113623 -1 \n", 466 | "113624 -1 \n", 467 | "113625 -1 \n", 468 | "113626 -1 \n", 469 | "113627 -1 \n", 470 | "113628 -1 \n", 471 | "113629 -1 \n", 472 | "113630 -1 \n", 473 | "113631 -1 \n", 474 | "113632 -1 \n", 475 | "113633 -1 \n", 476 | "113634 -1 \n", 477 | "113635 -1 \n", 478 | "113636 -1 \n", 479 | "113637 -1 \n", 480 | "113638 -1 \n", 481 | "113639 -1 \n", 482 | "113640 -1 \n", 483 | "\n", 484 | "[113641 rows x 5 columns]\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "print(t3)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "collapsed": true 497 | }, 498 | "outputs": [], 499 | "source": [] 500 | } 501 | ], 502 | "metadata": { 503 | "kernelspec": { 504 | "display_name": "Python 3", 505 | "language": "python", 506 | "name": "python3" 507 | }, 508 | "language_info": { 509 | "codemirror_mode": { 510 | "name": "ipython", 511 | "version": 3 512 | }, 513 | "file_extension": ".py", 514 | "mimetype": "text/x-python", 515 | "name": "python", 516 | "nbconvert_exporter": "python", 517 | "pygments_lexer": "ipython3", 518 | "version": "3.6.1" 519 | } 520 | }, 521 | "nbformat": 4, 522 | "nbformat_minor": 2 523 | } 524 | -------------------------------------------------------------------------------- /阿里天池o2o新人赛/xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import xgboost as xgb 3 | from sklearn.preprocessing import MinMaxScaler 4 | 5 | dataset1 = pd.read_csv('data/dataset1.csv') 6 | dataset1.label.replace(-1,0,inplace=True) 7 | dataset2 = pd.read_csv('data/dataset2.csv') 8 | dataset2.label.replace(-1,0,inplace=True) 9 | dataset3 = pd.read_csv('data/dataset3.csv') 10 | 11 | dataset1.drop_duplicates(inplace=True) 12 | dataset2.drop_duplicates(inplace=True) 13 | dataset3.drop_duplicates(inplace=True) 14 | 15 | dataset12 = pd.concat([dataset1,dataset2],axis=0) 16 | 17 | dataset1_y = dataset1.label 18 | dataset1_x = dataset1.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1) # 'day_gap_before','day_gap_after' cause overfitting, 0.77 19 | dataset2_y = dataset2.label 20 | dataset2_x = dataset2.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1) 21 | dataset12_y = dataset12.label 22 | dataset12_x = dataset12.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1) 23 | dataset3_preds = dataset3[['user_id','coupon_id','date_received']] 24 | dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1) 25 | 26 | print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape) 27 | 28 | dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y) 29 | dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y) 30 | dataset12 = xgb.DMatrix(dataset12_x,label=dataset12_y) 31 | dataset3 = xgb.DMatrix(dataset3_x) 32 | 33 | params={'booster':'gbtree', 34 | 'objective': 'rank:pairwise', 35 | 'eval_metric':'auc', 36 | 'gamma':0.1, 37 | 'min_child_weight':1.1, 38 | 'max_depth':5, 39 | 'lambda':10, 40 | 'subsample':0.7, 41 | 'colsample_bytree':0.7, 42 | 'colsample_bylevel':0.7, 43 | 'eta': 0.01, 44 | 'tree_method':'exact', 45 | 'seed':0, 46 | 'nthread':12 47 | } 48 | 49 | #train on dataset1, evaluate on dataset2 50 | #watchlist = 
[(dataset1,'train'),(dataset2,'val')]
51 | #model = xgb.train(params,dataset1,num_boost_round=3000,evals=watchlist,early_stopping_rounds=300)
52 | 
53 | watchlist = [(dataset12,'train')]
54 | model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist)
55 | 
56 | #predict test set
57 | dataset3_preds['label'] = model.predict(dataset3)
58 | dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.values.reshape(-1, 1)).ravel()  # scikit-learn scalers expect a 2-D array; rescale the ranking margins to [0,1]
59 | dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
60 | dataset3_preds.to_csv("xgb_preds.csv",index=None,header=None)
61 | print(dataset3_preds.describe())
62 | 
63 | #save feature score
64 | feature_score = model.get_fscore()  # get_fscore() counts how often each feature is used to split
65 | feature_score = sorted(feature_score.items(), key=lambda x:x[1], reverse=True)
66 | fs = []
67 | for (key,value) in feature_score:
68 |     fs.append("{0},{1}\n".format(key,value))
69 | 
70 | with open('xgb_feature_score.csv','w') as f:
71 |     f.writelines("feature,score\n")
72 |     f.writelines(fs)
73 | 
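74 | # --- Minimal sketch (editor's addition, not part of the original script): the
75 | # contest metric is the mean per-coupon AUC, so a rough offline check can be run
76 | # on dataset2 before submitting. Caveats: the final model was trained on
77 | # dataset12, which contains dataset2, so this is a smoke test rather than a true
78 | # validation score, and it assumes dataset2_x still carries a 'coupon_id' column.
79 | from sklearn.metrics import roc_auc_score
80 | 
81 | val = dataset2_x.copy()
82 | val['label'] = dataset2_y.values
83 | val['pred'] = model.predict(dataset2)  # dataset2 was converted to a DMatrix above
84 | aucs = [roc_auc_score(g.label, g.pred)
85 |         for _, g in val.groupby('coupon_id')
86 |         if g.label.nunique() == 2]  # AUC is undefined for single-class coupons
87 | print('mean per-coupon AUC: %.4f' % (sum(aucs) / len(aucs)))
--------------------------------------------------------------------------------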