├── .gitattributes
├── README.md
├── coursera_deeplearning
│   └── 第一课第二周编程作业
│       ├── assignment2_1.ipynb
│       ├── assignment2_2.ipynb
│       └── lr_utils.py
├── image
│   ├── 1.png
│   ├── 2.png
│   └── 3.png
├── kaggle手写数字识别
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── workspace.xml
│   │   └── xgb_learn.iml
│   ├── .ipynb_checkpoints
│   │   └── xgboost_model_reload-checkpoint.ipynb
│   ├── image
│   │   ├── kaggle手写数字比赛.png
│   │   ├── kaggle排名及得分.png
│   │   ├── xgb_importance.png
│   │   ├── xgb_tree.png
│   │   └── 迭代次数及时间.png
│   ├── model
│   │   ├── dump.raw.txt
│   │   └── 迭代次数及时间.png
│   ├── submission_xgb_MultiSoftmax.csv
│   ├── xgb_diginum.py
│   └── xgboost_model_reload.ipynb
├── matplot_test.ipynb
├── numpy
│   ├── np_test.py
│   ├── np_test2.py
│   ├── test.py
│   └── 线下门店服务器安装部署手册.docx - 快捷方式.lnk
├── tensorflow
│   ├── index.py
│   ├── test2.py
│   └── test3.py
├── tf衣服图片识别率提升
│   ├── CNN_digit.py
│   ├── README.md
│   ├── Text_classifier_v1.py
│   └── Text_classifier_v2.py
├── 泰坦尼克生存预测案例
│   ├── Titanic.ipynb
│   ├── test.csv
│   └── train.csv
├── 蝴蝶花(iris)分类案例
│   ├── Iris.ipynb
│   ├── Iris.py
│   ├── Iris3.ipynb
│   ├── IrisFishData.csv
│   ├── Iris_clean.csv
│   ├── iris2.ipynb
│   └── iris2.py
├── 讯飞CTR预测
│   ├── DC讯飞比赛(EDA & Baseline ).html
│   ├── RXY初版
│   │   ├── check_column.ipynb
│   │   ├── feature_extract.ipynb
│   │   ├── feature_extract_test.ipynb
│   │   ├── feature_re_extract.ipynb
│   │   ├── lambda_test.ipynb
│   │   └── pandas_test.ipynb
│   ├── digi_onehot.py
│   ├── one_hot_test.py
│   ├── 川哥版
│   │   ├── _1_extract_features.py
│   │   ├── _2_train.py
│   │   └── utils.py
│   └── 鱼神大佬
│       └── kdxf_baseline.py
└── 阿里天池o2o新人赛
    ├── o2o_wepe_zhen.ipynb
    ├── wepe_o2o.ipynb
    └── xgb.py
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Alibaba O2O rookie competition (阿里天池o2o新人赛)

Ranked 50th out of more than 3,800 participating teams.

![](image/1.png)
![](image/2.png)
![](image/3.png)

GPU acceleration is needed here; training is much faster with it.

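The xgboost scripts in this repo use the default (CPU) tree method; the following is only a hedged sketch of how xgboost training can be moved to the GPU via `tree_method`, and the objective shown is an assumption for the O2O coupon-usage task:

```python
import xgboost as xgb

# Hypothetical parameter set: 'gpu_hist' runs the histogram algorithm on the GPU
# (fall back to 'hist' on CPU-only machines).
params = {
    'objective': 'binary:logistic',  # assumption: coupon usage is a binary target
    'tree_method': 'gpu_hist',
}
# bst = xgb.train(params, dtrain, num_boost_round=1000)
```
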
# Kaggle handwritten digit recognition (kaggle手写数字识别)

## 1. Competition page

![](kaggle手写数字识别/image/kaggle手写数字比赛.png)

## 2. Iteration count and training time

![](kaggle手写数字识别/image/迭代次数及时间.png)

## 3. XGBoost plots

![](kaggle手写数字识别/image/xgb_importance.png)
![](kaggle手写数字识别/image/xgb_tree.png)

## 4. Ranking and score

![](kaggle手写数字识别/image/kaggle排名及得分.png)

# Improving clothing-image recognition with TensorFlow (tf衣服图片识别率提升)

The network in the official TensorFlow tutorial flattens each image to one dimension up front and then passes it through two fully connected layers.

My improvement is to first reshape the data to four dimensions, [batch, h, w, c], and then use: CNN1 + dropout + CNN2, Dense(256), dropout 0.2, Dense(128), Dense(64), dropout 0.2, Dense(10).

Accuracy improves from 0.86 to 0.91.

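A minimal sketch of the structure described above, mirroring CNN_digit.py in this repo:

```python
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same',
                        input_shape=(28, 28, 1)),
    keras.layers.MaxPool2D(2),
    keras.layers.Conv2D(32, (2, 2), activation='relu', padding='same'),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(10),  # logits; trained with SparseCategoricalCrossentropy(from_logits=True)
])
```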
--------------------------------------------------------------------------------
/coursera_deeplearning/第一课第二周编程作业/lr_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import h5py


def load_dataset():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # your train set labels

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # your test set labels

    classes = np.array(test_dataset["list_classes"][:])  # the list of classes

    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
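
# Example usage (a sketch; shapes assume the Coursera week-2 cat/non-cat dataset,
# where images are 64x64x3 and labels are reshaped to row vectors of shape (1, m)):
#   train_x, train_y, test_x, test_y, classes = load_dataset()
#   print(train_x.shape)  # e.g. (m_train, 64, 64, 3)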
--------------------------------------------------------------------------------
/image/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/image/1.png
--------------------------------------------------------------------------------
/image/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/image/2.png
--------------------------------------------------------------------------------
/image/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/image/3.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/kaggle手写数字识别/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/kaggle手写数字识别/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/kaggle手写数字识别/.idea/xgb_learn.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/kaggle手写数字识别/image/kaggle手写数字比赛.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/kaggle手写数字比赛.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/image/kaggle排名及得分.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/kaggle排名及得分.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/image/xgb_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/xgb_importance.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/image/xgb_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/xgb_tree.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/image/迭代次数及时间.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/image/迭代次数及时间.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/model/dump.raw.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/model/dump.raw.txt
--------------------------------------------------------------------------------
/kaggle手写数字识别/model/迭代次数及时间.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/kaggle手写数字识别/model/迭代次数及时间.png
--------------------------------------------------------------------------------
/kaggle手写数字识别/xgb_diginum.py:
--------------------------------------------------------------------------------
import xgboost as xgb
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt


now = time.time()

dataset = pd.read_csv("data/train.csv")

train = dataset.iloc[:, 1:].values
labels = dataset.iloc[:, 0].values

tests = pd.read_csv("data/test.csv")
#test_id = range(len(tests))
test = tests.iloc[:, :].values


params = {
    'booster': 'gbtree',
    # The handwritten digits are 0-9, a multi-class problem, so use the softmax multi-class objective.
    'objective': 'multi:softmax',
    'num_class': 10,          # number of classes; used together with multi:softmax
    'gamma': 0.05,            # minimum loss reduction to split a leaf further; larger is more conservative [0:]
    'max_depth': 12,          # maximum tree depth [1:]
    #'lambda': 450,           # L2 regularization weight
    'subsample': 0.4,         # subsample ratio of the training rows (0:1]
    'colsample_bytree': 0.7,  # subsample ratio of columns when constructing each tree (0:1]
    #'min_child_weight': 12,  # minimum sum of instance weight needed in a child node
    'silent': 1,
    'eta': 0.005,             # learning rate
    'seed': 710,
    'nthread': 4,             # number of CPU threads; adjust to your machine
}

plst = list(params.items())

# Hold out part of the training set for early stopping.
offset = 35000  # the Kaggle training set has 42,000 rows; use 35,000 for training, the rest for validation

num_rounds = 30000  # number of boosting rounds
xgtest = xgb.DMatrix(test)

# Split into training and validation sets.
xgtrain = xgb.DMatrix(train[:offset, :], label=labels[:offset])
xgval = xgb.DMatrix(train[offset:, :], label=labels[offset:])

# Report the training and validation error each round.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]


# Train the model.
# With a large num_rounds, early_stopping_rounds stops training once the
# validation metric has not improved for that many consecutive rounds.
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100)
# Save the trained model.
model.save_model('model/xgb.model')
# Dump the model in text form.
model.dump_model('model/0p-dump.raw.txt')

# Predict with the best round found by early stopping
# (best_ntree_limit accounts for the num_class trees built per round).
preds = model.predict(xgtest, ntree_limit=model.best_ntree_limit)


# Write the predictions to a submission file.
np.savetxt('submission_xgb_MultiSoftmax.csv', np.c_[range(1, len(test) + 1), preds],
           delimiter=',', header='ImageId,Label', comments='', fmt='%d')


cost_time = time.time() - now
print("end ......", '\n', "cost time:", cost_time, "(s)......")
# Plotting
xgb.plot_importance(model)
plt.show()
#xgb.plot_tree(model, num_trees=2)
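
# To reload the saved model later (a sketch; the repo's xgboost_model_reload.ipynb
# covers this step):
#   bst = xgb.Booster(model_file='model/xgb.model')
#   preds = bst.predict(xgb.DMatrix(test))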
--------------------------------------------------------------------------------
/matplot_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | "[]"
12 | ]
13 | },
14 | "execution_count": 4,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "import matplotlib.pyplot as plt\n",
21 | "import numpy as np\n",
22 | "\n",
23 | "x = np.arange(20)\n",
24 | "y = x**2\n",
25 | "\n",
26 | "plt.plot(x,y)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": []
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": []
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 5,
50 | "metadata": {
51 | "scrolled": true
52 | },
53 | "outputs": [
54 | {
55 | "data": {
56 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl8FfW5x/HPA4EEwpZACJEAAWQX2cKiVut1X3pLXeuG\niChqta33Wlu73Nba1uq11WoXlwqyKe5W6tVa3GqtsiRsshNZw5IFMIQlIctz/8hgUxpISHJylnzf\nr9d5nTkzvznzMJx8M/md38yYuyMiIrGrRbgLEBGR0FLQi4jEOAW9iEiMU9CLiMQ4Bb2ISIxT0IuI\nxDgFvYhIjFPQi4jEOAW9iEiMiwt3AQBdunTxjIyMcJchIhJVsrOzC909pbZ2ERH0GRkZZGVlhbsM\nEZGoYmab69JOXTciIjFOQS8iEuMU9CIiMU5BLyIS4xT0IiIxTkEvIhLjFPQiIjFOQS8iEiaPvbue\nVdv3hnw7EXHClIhIc/Ni1lYenreOkrIKBp/QIaTb0hG9iEgTW577OT/60wpOO7Ez/31u/5BvT0Ev\nItKEdu0r5dZZ2aS0i+e3V48krmXoY7jWLZhZgpktNLNlZrbSzH4azJ9uZhvNbGnwGB7MNzN7zMxy\nzGy5mY0M9T9CRCQalFdU8s05Syjcf4gnrhtFcmLrJtluXfroS4Gz3H2fmbUCPjKzt4Jld7v7y0e0\nvxDoFzzGAo8HzyIizdpDb6/l48928asrhjE0vWOTbbfWI3qvsi942Sp4+DFWGQ/MDNabD3Qys7SG\nlyoiEr3eWL6dJz/cwIRxvbh8VHqTbrtOnUNm1tLMlgL5wDx3XxAs+kXQPfOImcUH87oDW6utnhvM\nO/I9p5hZlpllFRQUNOCfICIS2dbuLOa7Ly9nVK8k/ucrg5t8+3UKenevcPfhQDowxsxOAr4PDARG\nA8nA94LmVtNb1PCeT7l7prtnpqTUet18EZGoVHSwjFtmZZEYH8cfrh1J67imHwNzXFt098+BD4AL\n3H1H0D1TCjwDjAma5QI9qq2WDmxvhFpFRKJKZaXz3y8sJXfPQR6/diSpHRLCUkddRt2kmFmnYLoN\ncA6w5nC/u5kZ8DVgRbDKXOD6YPTNOKDI3XeEpHoRkQj22HvreXdNPj/+z8FkZiSHrY66jLpJA2aY\nWUuqfjG86O5vmNl7ZpZCVVfNUuDWoP2bwEVADnAAmNT4ZYuIRLZ3V+fxm3fWc+nI7kwY1yustdQa\n9O6+HBhRw/yzjtLegdsbXpqISHTaWLifO19YypATOnD/JUOp6vgIH50ZKyLSiPaXlnPrrGziWhhP\nXDeKhFYtw12SLmomItJY3J3vvbKc9fnFzLhxDD2S24a7JEBH9CIijebpv2/kjeU7uPv8gZzeL3KG\njSvoRUQawcc5hfzyrdVceFI3bv1yn3CX8y8U9CIiDbTt84PcMWcJfVLa8dAVw8L+5euRFPQiIg1Q\nUlbBbbOzKSuv5MkJo2gXH3lffUZeRSIiUcLd+fHrK1ieW8RTE0bRN6VduEuqkY7oRUTq6bmFW3gx\nK5dvnnUi5w3pFu5yjkpBLyJSDws27OLeuSs5c0AKd54T+tsBNoSCXkTkOG3ZdYBbZ2fTI7ktj141\ngpYtIuvL1yMp6EVEjsPekjImz1hEpcPUiaPp2KZVuEuqlYJeRKSOyisq+eZzS9hYuJ/HrxtJ7y6J\n4S6pTjTqRkSkju5/cw1/W1fALy45iVP7dgl3OXWmI3oRkTp4bsEWpv1jI5NOy+DaseG97PDxUtCL\niNTi488K+fHrK/hy/xR+eNGgcJdz3BT0IiLHsKlwP7fNXkxGl0R+e80I4lpGX2xGX8UiIk2k6GDV\nCJsWBlMnZtIhIfJH2NREX8aKiNSgvKKSO55bzJbdB5g9eSy9OkfHCJua1OXm4AlmttDMlpnZSjP7\naTC/t5ktMLP1ZvaCmbUO5scHr3OC5Rmh/SeIiDS+n//fav6+vpCff+0kxvbpHO5yGqQuXTelwFnu\nPgwYDlxgZuOAB4FH3L0fsAeYHLSfDOxx9xOBR4J2IiJRY9b8zUz/eBM3n96br4/uGe5yGqzWoPcq\n+4KXrYKHA2cBLwfzZwBfC6bHB68Jlp9tkXZxZhGRo/hofSH3zl3JWQO7cs+F0TfCpiZ1+jLWzFqa\n2VIgH5gHfAZ87u7lQZNcoHsw3R3YChAsLwKi++8eEWkWNhTs4xvPZnNiSjsevWp4xF/Dpq7qFPTu\nXuHuw4F0YAxQ0685D55r2jN+5Awzm2JmWWaWVVBQUNd6RURCouhAGZNnZNGqZQuenphJ+ygdYVOT\n4xpe6e6fAx8A44BOZnZ41E46sD2YzgV6AATLOwK7a3ivp9w9090zU1Ii5ya6ItL8lFVU8o3nstm2\n5yBPTBhFj+S24S6pUdVl1E2KmXUKptsA5wCrgfeBy4NmE4HXg+m5wWuC5e+5+78d0YuIRAJ35965\nK/lHzi7uv3QoozOSw11So6vLOPo0YIaZtaTqF8OL7v6Gma0CnjeznwNLgKlB+6nALDPLoepI/qoQ\n1C0i0ihmfrKZZxds4dYv9+XyUenhLickag16d18OjKhh/gaq+uuPnF8CXNEo1YmIhNCH6wr46Z9X\ncu7gVL57/oBwlxMyugSCiDRLa3bu5fZnFzOgWwd+8/XhtIiRETY1UdCLSLOz/fOD3DBtEW3jW/L0\nxEwS42P7ajAKehFpVooOlnHDMwvZX1rO9Elj6N6pTbhLCrnY/jUmIlJNaXkFU2ZmsbFwPzMmjWFQ\nWodwl9QkFPQi0ixUVjp3vbiMBRt38+hVwzn1xOi5FWBDqetGRJqFX761mjeW7+CeCwcyfnj32leI\nIQp6EYl50z7ayB//vpGJp/TiljP6hLucJqegF5GY9uanO/jZ/63i/CGp/Pg/h9AcL6aroBeRmLVw\n427ufGEpI3sm8ehVI2LmapTHS0EvIjEpJ7+Ym2dmkZ7UhqevzyShVctwlxQ2CnoRiTl5e0uYOG0R\nrVq2YMakMSQltg53SWGloBeRmFJcUsYNzyzi8wOHmD5pdMxdcrg+NI5eRGLGofJKbpu9mPV5xUy9\nYTQnde8Y7pIigoJeRGKCu3PPK8v5KKeQhy4/mS/31w2NDlPXjYjEhIfeXsurS7Zx17n9uSKzR7jL\niSgKehGJerPmb+YPH3zG1WN6csdZJ4a7nIijoBeRqPbXlTv5yesrOGdQV342vnmeEFUbBb2IRK3s\nzXv45pwlDE3vxGNXjyCupSKtJnW5OXgPM3vfzFab2Uoz+3Yw/14z22ZmS4PHRdXW+b6Z5ZjZWjM7\nP5T/ABFpntbnFXPTjEWkdUxg2sRM2rbW2JKjqcueKQfucvfFZtYeyDazecGyR9z9V9Ubm9lgqm4I\nPgQ4AXjHzPq7e0VjFi4izdeWXQe49ukFVSdE3TiGzu3iw11SRKv1iN7dd7j74mC6GFgNHOsan+OB\n59291N03AjnUcBNxEZH62FlUwj
VPz+dQRSWzbxpLr86J4S4p4h1Xh5aZZQAjgAXBrDvMbLmZTTOz\npGBed2BrtdVyOfYvBhGROtm1r5Rrn57P5wfKmDFpDP1T24e7pKhQ56A3s3bAK8Cd7r4XeBzoCwwH\ndgC/Pty0htW9hvebYmZZZpZVUFBw3IWLSPOyt6SM66ctJHfPQaZOzGRYj07hLilq1CnozawVVSH/\nrLu/CuDuee5e4e6VwB/5Z/dMLlD9bIV0YPuR7+nuT7l7prtnpqToDDYROboDh8q58ZlFrMsr5okJ\noxjbp3O4S4oqdRl1Y8BUYLW7P1xtflq1ZpcAK4LpucBVZhZvZr2BfsDCxitZRJqT0vIKbpmVzeIt\ne/jN10fwHwO6hrukqFOXUTenAROAT81saTDvB8DVZjacqm6ZTcAtAO6+0sxeBFZRNWLndo24EZH6\nKK+o5FtzlvD39YX87+Unc/HJabWvJP+m1qB394+oud/9zWOs8wvgFw2oS0SaucpK57svL+ftlXn8\n+CuDuVLXr6k3nUYmIhHH3bn3zyt5dck2/vvc/tz4pd7hLimqKehFJOI89PZaZn6ymSln9OGbukhZ\ngynoRSSi/OGDnC+uRPn9CwfqImWNQEEvIhFj1ieb+N+/rOWrw07g5187SSHfSBT0IhIRXl2cy/+8\nvpJzBnXl11cOo2ULhXxjUdCLSNj9ZcVO7n55Oaf27czvrhlJK11uuFFpb4pIWH24roBvzVnCyekd\n+eP1mSS0ahnukmKOgl5EwiZr026mzMqiT0oi028YQ2K8rikfCgp6EQmLJVv2MOmZRaR1bMOsyWPp\n2LZVuEuKWQp6EWlyi7fs4fqpC0lKbM2zN40lpb1uHBJKCnoRaVLZm3dz/dSFJLdrzfNTxnFCpzbh\nLinmqUNMRJpM1qbdTJy2kK4dEphz8zi6dUwId0nNgo7oRaRJLNy4m+unLSRVId/kdEQvIiG3YMMu\nJk1fRLeOCTx/8zi6dlDINyUd0YtISH3y2S5ueGYRaR0TeH6KQj4cdEQvIiHzcU4hN85YRI+ktjx3\n8ziNrgkTBb2IhMQ/cgqZPGMRPZOrQr5LO4V8uKjrRkQa3UfrC7lx+iIyOicyRyEfdgp6EWlUH64r\nYPKMRfTuksizN42ls0I+7GoNejPrYWbvm9lqM1tpZt8O5ieb2TwzWx88JwXzzcweM7McM1tuZiND\n/Y8Qkcjwwdp8bpqZRd+Udjx38ziFfISoyxF9OXCXuw8CxgG3m9lg4B7gXXfvB7wbvAa4EOgXPKYA\njzd61SIScd5fk8+Umdn069qOZ28aS3Ji63CXJIFag97dd7j74mC6GFgNdAfGAzOCZjOArwXT44GZ\nXmU+0MnM0hq9chGJGO+tyeOWWdn071YV8kkK+YhyXH30ZpYBjAAWAKnuvgOqfhkAXYNm3YGt1VbL\nDeYd+V5TzCzLzLIKCgqOv3IRiQjvrKoK+YFp7Xl28jg6tVXIR5o6B72ZtQNeAe50973HalrDPP+3\nGe5PuXumu2empKTUtQwRiSDzVuVx27PZDE7roEsNR7A6Bb2ZtaIq5J9191eD2XmHu2SC5/xgfi7Q\no9rq6cD2xilXRCLF60u3cdvsbIac0JFZN42lYxuFfKSqy6gbA6YCq9394WqL5gITg+mJwOvV5l8f\njL4ZBxQd7uIRkdgw65NN3PnCUkb1SmLW5DF0SFDIR7K6nBl7GjAB+NTMlgbzfgA8ALxoZpOBLcAV\nwbI3gYuAHOAAMKlRKxaRsHF3Hns3h0feWcc5g1L53TUjdI/XKFBr0Lv7R9Tc7w5wdg3tHbi9gXWJ\nSISprHTue2MV0z/exGUj03nwsqHEtdQ5l9FA17oRkVqVVVRy90vL+NPS7dz0pd784KJBtGhxtOM/\niTQKehE5poOHKrj9ucW8tyafu88fwDfO7EvVV3cSLRT0InJURQfLuGnGIrI27+H+S4Zyzdie4S5J\n6kFBLyI1yi8u4fqpC/msYB+/v2YkFw3VCe7RSkEvIv9my64DXDd1AYX7Spl2w2hO76eTGqOZgl5E\n/sWanXuZMHUhZRWVPHvTWEb0TAp3SdJACnoR+UL25t1MemYRbVvH8dwtp9AvtX24S5JGoKAXEQDe\nX5vPbbOzSevYhlmTx5Ce1DbcJUkjUdCLCK8v3cZdLy5jQLf2zLhxjG79F2MU9CLN3MxPNvGTuSsZ\nk5HM0xMzaa/r1sQcBb1IM6Xr1jQfCnqRZuhQeSU/fO1TXsrO1XVrmgEFvUgzU3SgjFtnZ/PJhl18\n6+x+/Nc5/XRJgxinoBdpRjbv2s+k6YvYuvsAD185jEtHpoe7JGkCCnqRZiJr026mzMqm0p3Zk8cy\ntk/ncJckTURBL9IMvL50G3e/tJzuSW2YdsNoendJDHdJ0oQU9CIxrPrImjG9k3nyulEkJbYOd1nS\nxBT0IjGqtLyC77/yKa8u2calI7vzy0uHEh+n4ZPNUV1uDj7NzPLNbEW1efea2TYzWxo8Lqq27Ptm\nlmNma83s/FAVLiJHt2f/ISY8vZBXl2zjrnP78+srhinkm7G6HNFPB34HzDxi/iPu/qvqM8xsMHAV\nMAQ4AXjHzPq7e0Uj1CoidbChYB83Tl/E9qISHr1qOOOHdw93SRJmtR7Ru/uHwO46vt944Hl3L3X3\njUAOMKYB9YnIcZi/YReXPv4xe0vKmXPzWIW8AHUI+mO4w8yWB107hy9Y3R3YWq1NbjBPRELslexc\nJkxdQOfE1rz2jVMZ1Ss53CVJhKhv0D8O9AWGAzuAXwfzazq9zmt6AzObYmZZZpZVUFBQzzJExN15\n+K9rueulZYzOSObV206jV2cNn5R/qlfQu3ueu1e4eyXwR/7ZPZML9KjWNB3YfpT3eMrdM909MyVF\ntykTqY+Ssgq+9fxSHnsvhysz05k+aQwd2+rqk/Kv6hX0Zlb9LsGXAIdH5MwFrjKzeDPrDfQDFjas\nRBGpSX5xCdc+vYA/L9vOdy8YwIOXnUzrOF2YTP5draNuzGwOcCbQxcxygZ8AZ5rZcKq6ZTYBtwC4\n+0ozexFYBZQDt2vEjUjjy968m9tmL2ZvSRm/v2YkF5+cVvtK0myZe41d6E0qMzPTs7Kywl2GSMRz\nd2bN38x9f15F96Q2PHHdKAaldQh3WRImZpbt7pm1tdOZsSJR4uChCn74WtWZrmcN7MojXx9Oxzbq\nj5faKehFosCWXQe4ZXY2a3bu5b/O6c83zzqRFi10DXmpGwW9SIT7YG0+335+Ke7OtImj+Y+BXcNd\nkkQZBb1IhKqsdH7/fg4Pv7OOAanteXLCKI2Pl3pR0ItEoKKDZdz14lLeWZ3PJSO6c/8lQ2nTWhcl\nk/pR0ItEmLU7i7llVha5ew7y068O4fpTeumertIgCnqRCPLnZdv57svLaZcQx5wp4xidoevVSMMp\n6EUiQFlFJQ+8tYapH21kdEYSv79mJF07JIS7LIkRCnqRMCsoLuWO5xazYONubjg1gx9ePIh
WLXUp\nA2k8CnqRMFq8ZQ+3zc6m6GAZj3x9GJeMSA93SRKDFPQiYVBZ6Tz90QYeenstaR3b8OptYxh8gi5l\nIKGhoBdpYnl7S7jrxWV8lFPIBUO68eBlJ+vSwhJSCnqRJjRvVR7ffXkZJWWVPHDpUL4+uoeGTkrI\nKehFmkBJWQW/+L/VzJq/mSEndODRq0ZwYtd24S5LmgkFvUiIrd6xl2/NWcL6/H3cfHpvvnP+AOLj\ndJarNB0FvUiIuDvTP97EL99aQ8c2rZg1eQyn99NtM6XpKehFQqBwXynfeWkZH6wt4JxBXXnwspPp\n3C4+3GVJM6WgF2lkH6zN5zsvLaO4pJyfjR/CdeN0rRoJr1pPvzOzaWaWb2Yrqs1LNrN5ZrY+eE4K\n5puZPWZmOWa23MxGhrJ4kUhSUlbBfX9exQ3PLKJzYjxz7/gSE07JUMhL2NXlPOvpwAVHzLsHeNfd\n+wHvBq8BLgT6BY8pwOONU6ZIZFufV8wlf/iYaf/YyA2nZvD6HacxoFv7cJclAtSh68bdPzSzjCNm\njwfODKZnAB8A3wvmz/SqO47PN7NOZpbm7jsaq2CRSOLuPLtgCz97YxXt4uOYdkMmZw1MDXdZIv+i\nvn30qYfD2913mNnhe5t1B7ZWa5cbzFPQS8wpKC7lB699yrxVeZzRP4VfXXEyXdvripMSeRr7y9ia\nOiO9xoZmU6jq3qFnz56NXIZI6Lg7ry7exn1vrOJgWQU/ungQN57WWzfrlohV36DPO9wlY2ZpQH4w\nPxfoUa1dOrC9pjdw96eApwAyMzNr/GUgEmly9xzgB6+t4MN1BWT2SuKBy07WGa4S8eob9HOBicAD\nwfPr1ebfYWbPA2OBIvXPSyyorHRmL9jMg2+twYGffnUIE8b10lG8RIVag97M5lD1xWsXM8sFfkJV\nwL9oZpOBLcAVQfM3gYuAHOAAMCkENYs0qc8K9nHPK8tZtGkPZ/RP4f5LTiI9qW24yxKps7qMurn6\nKIvOrqGtA7c3tCiRSFBWUckf/76B37yznjatWvKrK4Zx2cjuGhcvUUdnxorUYMW2Ir73ynJWbt/L\nRUO7ce9Xh2hEjUQtBb1INSVlFTz27nqe/HADSW1b88R1I7ngpLRwlyXSIAp6kUDWpt1895XlbCjY\nzxWj0vnRxYN15yeJCQp6afb2lZbz0F/WMHP+Zk7o2IaZN47hjP66nLDEDgW9NGt/W1fAD179lO1F\nB5l4SgZ3nz+AxHj9WEhs0SdamqWtuw9w/5ureWvFTvqmJPLSLaeQmZEc7rJEQkJBL83KgUPlPP7B\nZzz54QZamnHXuf25+Yw+JLTSrf0kdinopVlwd+Yu284Db61hR1EJ44efwD0XDiStY5twlyYScgp6\niXkrthVx79yVZG3ew0ndO/Dbq0eom0aaFQW9xKzCfaX86u21vJC1leS2rXnwsqFcPqoHLXV9Gmlm\nFPQScw6VVzLzk008+u56Dh6qYPJpvfnWOf3okKAx8dI8KeglpnywNp/73ljFhoL9nDkghf/5ymD6\npugywtK8KeglJmws3M/P3ljFe2vy6d0lUbf0E6lGQS9RrbikjN+9l8O0f2wkPq4lP7hoIDec2pvW\ncXW5771I86Cgl6hUWl7BnAVb+N37n1G4r5QrRqVz9wUDdIVJkRoo6CWqlFVU8lJWLr97bz3bi0oY\n0zuZqRMzGdajU7hLE4lYCnqJCuUVlby2ZBuPvbeerbsPMrJnJx66Yhin9u2sG4GI1EJBLxGtstL5\n8/LtPPrOejYU7uek7h2474aTOHNAigJepI4U9BKR3J23V+7k4XnrWJe3j4Hd2vPkhFGcNzhVAS9y\nnBoU9Ga2CSgGKoByd880s2TgBSAD2ARc6e57GlamNBfuzntr8nl43jpWbt9L35REfnv1CC4emkYL\nndEqUi+NcUT/H+5eWO31PcC77v6Amd0TvP5eI2xHYpi78/f1hTw8bx1Lt35Oz+S2PHzlMMYP765L\nFog0UCi6bsYDZwbTM4APUNDLMczfsIuH/7qOhZt2071TGx64dCiXjUqnVUuNhRdpDA0Negf+amYO\nPOnuTwGp7r4DwN13mFnXhhYpscfd+SinkCf+9hn/yNlFaod4fjZ+CFeO7kF8nK4NL9KYGhr0p7n7\n9iDM55nZmrquaGZTgCkAPXv2bGAZEi1KyiqYu2w70z7ayJqdxaS0j+dHFw/iunG9dPMPkRBpUNC7\n+/bgOd/MXgPGAHlmlhYczacB+UdZ9yngKYDMzExvSB0S+XbtK2X2/C3Mmr+Jwn2HGNitPb+6Yhj/\nOSxNR/AiIVbvoDezRKCFuxcH0+cB9wFzgYnAA8Hz641RqESn9XnFTP1oI68u2cah8krOGtiVm77U\nm1N0opNIk2nIEX0q8FrwwxoHPOfufzGzRcCLZjYZ2AJc0fAyJZoc7n9/+u8b+du6AuLjWnD5qHRu\nPK03J3bVJYNFmlq9g97dNwDDapi/Czi7IUVJdCopq2Du0u1M/Wgja/Oq+t+/c15/rhnbi+TE1uEu\nT6TZ0pmx0mCF+0qZPX8zs+dvVv+7SARS0Eu9uDvLcouYs2ALry1V/7tIJFPQy3HJ21vCa0u28XJ2\nLjn5+0hopf53kUinoJdalZRV8M7qPF7OzuXDdQVUOozqlcQDlw7lopPTdNNtkQinoJcauTvLc4t4\nOTuXucu2U3SwjLSOCdx2Zl8uG5lOH91wWyRqKOjlX+QXl/CnoGtmXd4+4uNacP6Qblw+Kp3TTuyi\nC4yJRCEFvVBaXsF7q/N5KTuXv60roKLSGdmzE/dfMpSLT06jYxt1zYhEMwV9M1VeUcmiTXt4a8UO\n5i7bzucHykjtEM+UM/pw2ch0fbEqEkMU9M3IvtJyPlxXwLxVeby3Jp+ig2W0jmvBeYNTuXxUOqf3\nS1HXjEgMUtDHuLy9Jcxblce8VXl88tkuDlVU0qltK84e1JVzB6VyRv8UEuP1MRCJZfoJjzHuztq8\nYuatzOOd1Xksyy0CoGdyWyac0otzB6eS2SuJON3UQ6TZUNDHgPKKShZu2s07q/KZt3onW3cfBGBY\nj07cff4Azh2cSr+u7XS2qkgzpaCPQu7Opl0HWLRxNx9/Vsj7awu+6G8/rW9nbvvyiZwzqCtdOySE\nu1QRiQAK+ihQUems2bmXRRt3s2jTHhZu2k1BcSkAyYmtOXtQV84bnMrp/dTfLiL/TqkQgUrLK/g0\nt4iFm3azaONusjbvobikHIATOiZwWt/OjO6dzJiMZPqmtKOFRsqIyDEo6CPA/tJyFm/Zw8KNu1m4\ncTdLt35OaXklACd2bcdXTj6BMb2TGJ2RTHpS2zBXKyLRRkHfhNydgn2lrNu5j7V5xazbWczqnXtZ\nuX0vFZVOC4OTunfkunG9GJ2RzOiMJDq3iw932SIS5RT0IfL5gUOsy/tnoK/NK2ZdXjGfHyj7ok3n\nxNb0T23PN87sy+iMZEb2SqKd+thFpJGFLFXM7A
LgUaAl8LS7PxCqbYXTvtJy1gchvi5vH+vyilm7\ns5j84MtSgPYJcQxIbc+FJ6UxILUd/bu1p39qe7roaF1EmkBIgt7MWgK/B84FcoFFZjbX3VeFYnuh\nUFZRSeG+UnYWlZC3t5T84hLy9paws+if03l7Syk6+M8j9IRWLeif2p4z+qfQP7Ud/VPbM6Bbe7p1\nSNAYdhEJm1Ad0Y8BcoIbiGNmzwPjgSYJ+opKp6SsgtLySkrKKoJHJaXlVc8l5RWUBq/3l1YEwV1K\n/t4SdgYBvmt/Ke7/+r5xLYyu7ePp2iGB3l0SOaVPZ1I7JtCva3v6p7ajR1JbjYARkYgTqqDvDmyt\n9joXGNvYG/lgbT4/e2PVFyFeGoR4WYXXvvIRurRrTdf2CXTrmMDJ6R2/mE7tEE/X9gmkdkigc2Jr\nBbmIRJ1QBX1Nafgv6WtmU4ApAD179qzXRjq0acXAbh2Ib9WChFYtiY+rek6Ia0lCqxb/fN3q8OuW\nX7RNCKbbtm5J58R4Wsfp2i8iEptCFfS5QI9qr9OB7dUbuPtTwFMAmZmZx38IDozsmcTIa5PqW6OI\nSLMQqsPYRUA/M+ttZq2Bq4C5IdqWiIgcQ0iO6N293MzuAN6manjlNHdfGYptiYjIsYVsHL27vwm8\nGar3FxGNKOy7AAAFP0lEQVSRutE3kCIiMU5BLyIS4xT0IiIxTkEvIhLjFPQiIjHO/MgLuoSjCLMC\nYHM9V+8CFDZiOY0t0uuDyK9R9TWM6muYSK6vl7un1NYoIoK+Icwsy90zw13H0UR6fRD5Naq+hlF9\nDRPp9dWFum5ERGKcgl5EJMbFQtA/Fe4CahHp9UHk16j6Gkb1NUyk11erqO+jFxGRY4uFI3oRETmG\nqAl6M7vAzNaaWY6Z3VPD8ngzeyFYvsDMMpqwth5m9r6ZrTazlWb27RranGlmRWa2NHj8uKnqC7a/\nycw+DbadVcNyM7PHgv233MxGNmFtA6rtl6VmttfM7jyiTZPvPzObZmb5Zrai2rxkM5tnZuuD5xpv\niGBmE4M2681sYhPW95CZrQn+D18zs05HWfeYn4cQ1nevmW2r9v940VHWPebPewjre6FabZvMbOlR\n1g35/mtU7h7xD6oudfwZ0AdoDSwDBh/R5hvAE8H0VcALTVhfGjAymG4PrKuhvjOBN8K4DzcBXY6x\n/CLgLaruDjYOWBDG/+udVI0PDuv+A84ARgIrqs37X+CeYPoe4MEa1ksGNgTPScF0UhPVdx4QF0w/\nWFN9dfk8hLC+e4Hv1OEzcMyf91DVd8TyXwM/Dtf+a8xHtBzRf3GzcXc/BBy+2Xh144EZwfTLwNlm\n1iQ3eHX3He6+OJguBlZTdd/caDIemOlV5gOdzCwtDHWcDXzm7vU9ga7RuPuHwO4jZlf/nM0AvlbD\nqucD89x9t7vvAeYBFzRFfe7+V3cvD17Op+rubmFxlP1XF3X5eW+wY9UXZMeVwJzG3m44REvQ13Sz\n8SOD9Is2wQe9COjcJNVVE3QZjQAW1LD4FDNbZmZvmdmQJi2s6p69fzWz7OB+vUeqyz5uCldx9B+u\ncO6/w1LdfQdU/YIHutbQJlL25Y1U/ZVWk9o+D6F0R9C1NO0oXV+RsP9OB/Lcff1Rlodz/x23aAn6\nWm82Xsc2IWVm7YBXgDvdfe8RixdT1R0xDPgt8KemrA04zd1HAhcCt5vZGUcsj4T91xr4KvBSDYvD\nvf+ORyTsyx8C5cCzR2lS2+chVB4H+gLDgR1UdY8cKez7D7iaYx/Nh2v/1Uu0BH2tNxuv3sbM4oCO\n1O/Pxnoxs1ZUhfyz7v7qkcvdfa+77wum3wRamVmXpqrP3bcHz/nAa1T9eVxdXfZxqF0ILHb3vCMX\nhHv/VZN3uEsreM6voU1Y92Xw5e9XgGs96FA+Uh0+DyHh7nnuXuHulcAfj7LdcO+/OOBS4IWjtQnX\n/quvaAn6utxsfC5weHTD5cB7R/uQN7agP28qsNrdHz5Km26HvzMwszFU7ftdTVRfopm1PzxN1Rd2\nK45oNhe4Phh9Mw4oOtxF0YSOehQVzv13hOqfs4nA6zW0eRs4z8ySgq6J84J5IWdmFwDfA77q7geO\n0qYun4dQ1Vf9e59LjrLduvy8h9I5wBp3z61pYTj3X72F+9vguj6oGhWyjqpv438YzLuPqg80QAJV\nf/LnAAuBPk1Y25eo+tNyObA0eFwE3ArcGrS5A1hJ1QiC+cCpTVhfn2C7y4IaDu+/6vUZ8Ptg/34K\nZDbx/29bqoK7Y7V5Yd1/VP3S2QGUUXWUOZmq733eBdYHz8lB20zg6Wrr3hh8FnOASU1YXw5V/duH\nP4eHR6KdALx5rM9DE9U3K/h8LacqvNOOrC94/W8/701RXzB/+uHPXbW2Tb7/GvOhM2NFRGJctHTd\niIhIPSnoRURinIJeRCTGKehFRGKcgl5EJMYp6EVEYpyCXkQkxinoRURi3P8Dj787zxrXhFMAAAAA\nSUVORK5CYII=\n",
57 | "text/plain": [
58 | ""
59 | ]
60 | },
61 | "metadata": {},
62 | "output_type": "display_data"
63 | }
64 | ],
65 | "source": [
66 | "plt.show()"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": true
74 | },
75 | "outputs": [],
76 | "source": []
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "collapsed": true
83 | },
84 | "outputs": [],
85 | "source": []
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": []
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": true
101 | },
102 | "outputs": [],
103 | "source": []
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 2,
108 | "metadata": {
109 | "collapsed": true
110 | },
111 | "outputs": [],
112 | "source": []
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": true
119 | },
120 | "outputs": [],
121 | "source": []
122 | }
123 | ],
124 | "metadata": {
125 | "kernelspec": {
126 | "display_name": "Python 3",
127 | "language": "python",
128 | "name": "python3"
129 | },
130 | "language_info": {
131 | "codemirror_mode": {
132 | "name": "ipython",
133 | "version": 3
134 | },
135 | "file_extension": ".py",
136 | "mimetype": "text/x-python",
137 | "name": "python",
138 | "nbconvert_exporter": "python",
139 | "pygments_lexer": "ipython3",
140 | "version": "3.6.2"
141 | }
142 | },
143 | "nbformat": 4,
144 | "nbformat_minor": 2
145 | }
146 |
--------------------------------------------------------------------------------
/numpy/np_test.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


a = np.array([[1, 2, 3], [4, 5, 6]], np.int32)
print(a.shape)
#print(a.flags)
#print(a.data)
#print(a.base)
#print(a.item)
print(a.tolist())
print(a.dumps())

b = np.arange(12).reshape(4, 3)
print(b)
print(a.reshape(3, 2))

print(np.int32)

# indexing
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
print(x[1:7:2])

x = np.array([[[1], [2], [3]], [[4], [5], [6]]])
print(x.shape)
print(x)

x_part = np.array([[1], [2], [3]])
print(x_part.shape)
--------------------------------------------------------------------------------
/numpy/np_test2.py:
--------------------------------------------------------------------------------
import numpy as np


a = np.array([[56.0, 0.0, 4.4],
              [1.2, 104.0, 52.0]])
#print(a)

# Column sums (axis=0), then each entry as a percentage of its column total via broadcasting.
cal = a.sum(axis=0)
print(cal)

percentage = 100 * a / cal
print(percentage)

print(np.array([[1, 2], [3, 4]]) / np.array([1, 2]))
--------------------------------------------------------------------------------
/numpy/test.py:
--------------------------------------------------------------------------------
from datetime import datetime
import numpy as np
import sys


def pythonsum(n):
    a = [i for i in range(n)]
    b = [i for i in range(n)]
    c = []
    for i in range(len(a)):
        a[i] = i ** 2
        b[i] = i ** 3
        c.append(a[i] + b[i])
    return c


def numpysum(n):
    a = np.arange(n) ** 2
    b = np.arange(n) ** 3
    c = a + b
    return c


if __name__ == '__main__':
    size = 1000
    start = datetime.now()
    c = pythonsum(size)
    delay = datetime.now() - start
    print("last three elements of the pure-Python power sums:", c[-3:])
    print("pure-Python run time (microseconds):", delay.microseconds)

    start = datetime.now()
    c = numpysum(size)
    delay = datetime.now() - start
    print("last three elements of the numpy power sums:", c[-3:])
    print("numpy run time (microseconds):", delay.microseconds)
--------------------------------------------------------------------------------
/numpy/线下门店服务器安装部署手册.docx - 快捷方式.lnk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naginoa/Data-mining/3bc233461e4d88e7b4212303cc5574c8ebcf2523/numpy/线下门店服务器安装部署手册.docx - 快捷方式.lnk
--------------------------------------------------------------------------------
/tensorflow/index.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import tempfile

import numpy
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
from tensorflow.examples.tutorials.mnist import input_data


mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder("float", [None, 10])
cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
for i in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
--------------------------------------------------------------------------------
/tensorflow/test2.py:
--------------------------------------------------------------------------------
import tensorflow as tf


a = tf.add(3, 5)
#sess = tf.Session()
#print(sess.run(a))
#sess.close()
with tf.Session() as sess:
    print(sess.run(a))

x = 2
y = 3
op1 = tf.add(x, y)
op2 = tf.multiply(x, y)
op3 = tf.pow(op2, op1)
with tf.Session() as sess:
    op3 = sess.run(op3)
    print(op3)

with tf.device('/CPU:0'):
    # Once a GPU is installed, this can be placed on the GPU instead.
    a = tf.constant([[1.0]], name='a')
    b = tf.constant([[1.0]], name='b')
    c = tf.matmul(a, b)

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

print(sess.run(c))
--------------------------------------------------------------------------------
/tensorflow/test3.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np

# Add a layer and return its output.
def add_layer(inputs, in_size, out_size, activation_function=None):
    # add one more layer and return the output of this layer
    Weights = tf.Variable(tf.random_normal([in_size, out_size]))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs

# 1. Training data
# Make up some real data
x_data = np.linspace(-1, 1, 300)[:, np.newaxis]
noise = np.random.normal(0, 0.05, x_data.shape)
y_data = np.square(x_data) - 0.5 + noise

# 2. Define placeholders that will receive the data
xs = tf.placeholder(tf.float32, [None, 1])
ys = tf.placeholder(tf.float32, [None, 1])

# 3. Define the network: a hidden layer and an output layer
# Hidden layer: input is xs, with 10 neurons
l1 = add_layer(xs, 1, 10, activation_function=tf.nn.relu)
# Output layer: input is the hidden layer l1, producing 1 prediction
prediction = add_layer(l1, 10, 1, activation_function=None)

# 4. Define the loss expression
# The error between the prediction and the real data
loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction),
                                    reduction_indices=[1]))

# 5. Choose an optimizer to minimize the loss; the learning rate is 0.1
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)


# Important step: initialize all variables
init = tf.initialize_all_variables()
sess = tf.Session()
# Nothing defined above actually runs until sess.run is called
sess.run(init)

# Run 1000 training iterations
for i in range(1000):
    # train_step and loss depend on placeholders, so feed the data in here
    sess.run(train_step, feed_dict={xs: x_data, ys: y_data})
    if i % 50 == 0:
        # print the loss to see the step-by-step improvement
        print(sess.run(loss, feed_dict={xs: x_data, ys: y_data}))
--------------------------------------------------------------------------------
/tf衣服图片识别率提升/CNN_digit.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt


print(tf.__version__)
fashion_mnist = keras.datasets.fashion_mnist
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train, x_test = np.squeeze(x_train), np.squeeze(x_test)
# Reshape to 4-D [batch, height, width, channels] for the conv layers.
x_train = x_train.reshape([60000, 28, 28, 1])
x_test = x_test.reshape([10000, 28, 28, 1])

class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# plt.figure()
# plt.imshow(x_train[0])
# plt.show()

model = keras.Sequential()
model.add(keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu',
                              padding='SAME', input_shape=(28, 28, 1)))
model.add(keras.layers.MaxPool2D(2))
model.add(keras.layers.Conv2D(filters=32, kernel_size=(2, 2), activation='relu',
                              padding='SAME'))
model.summary()
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(256, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(10))
model.summary()

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=10)
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)

print('\nTest accuracy:', test_acc)
--------------------------------------------------------------------------------
/tf衣服图片识别率提升/README.md:
--------------------------------------------------------------------------------
# Improving clothing-image recognition with TensorFlow (tf衣服图片识别率提升)

The network in the official TensorFlow tutorial flattens each image to one dimension up front and then passes it through two fully connected layers.

My improvement is to first reshape the data to four dimensions, [batch, h, w, c], and then use: CNN1 + dropout + CNN2, Dense(256), dropout 0.2, Dense(128), Dense(64), dropout 0.2, Dense(10).

Accuracy improves from 0.86 to 0.91.

# TensorFlow text classification

In v1, the word indices of each text are normalized and fed through a few fully connected layers; the resulting accuracy is 0.5, i.e. pure guessing.

v2 uses global average pooling in place of the fully connected stack, and accuracy rises to 0.87. In my view, GAP mainly cuts the number of FC parameters, while also incorporating global information, which helps prevent overfitting. A parameter-count comparison is sketched below.

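A minimal sketch (my own comparison, assuming the same 10000-word vocabulary, 16-dim embedding, and length-256 inputs as Text_classifier_v2.py) of how GAP shrinks the first dense layer:

```python
from tensorflow import keras


def make_model(pool):
    # Embedding holds 10000 * 16 = 160,000 weights in both variants.
    return keras.Sequential([
        keras.Input(shape=(256,)),
        keras.layers.Embedding(input_dim=10000, output_dim=16),
        keras.layers.GlobalAveragePooling1D() if pool else keras.layers.Flatten(),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])

# GAP head: Dense(16) sees 16 inputs      -> 16*16 + 16   =    272 weights
# Flatten head: Dense(16) sees 256*16     -> 4096*16 + 16 = 65,552 weights
print(make_model(pool=True).count_params())   # 160,289
print(make_model(pool=False).count_params())  # 225,569
```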
--------------------------------------------------------------------------------
/tf衣服图片识别率提升/Text_classifier_v1.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras


def decode_review(texts):
    return ' '.join([reverse_word_index.get(i, '?') for i in texts])

imdb = keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
print(x_train[0])

word_index = imdb.get_word_index()
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
print(decode_review(x_train[0]))

train_data = keras.preprocessing.sequence.pad_sequences(x_train,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)

test_data = keras.preprocessing.sequence.pad_sequences(x_test,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)

# Normalize the raw word indices (the v1 idea; see the README).
train_data = tf.nn.l2_normalize(train_data.astype('float'))
test_data = tf.nn.l2_normalize(test_data.astype('float'))

model = keras.Sequential()
model.add(keras.layers.Dense(units=64, input_shape=(256,)))
model.add(keras.layers.Dense(units=32))
model.add(keras.layers.Dense(units=16))
model.add(keras.layers.Dense(units=1))
model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Fit against the training labels. As noted in the README, this setup
# only reaches about 0.5 accuracy, i.e. chance level.
model.fit(train_data, y_train, epochs=5)
loss_and_metrics = model.evaluate(test_data, y_test)
print(loss_and_metrics)
--------------------------------------------------------------------------------
/tf衣服图片识别率提升/Text_classifier_v2.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras


def decode_review(texts):
    return ' '.join([reverse_word_index.get(i, '?') for i in texts])

imdb = keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
print(x_train[0])

word_index = imdb.get_word_index()
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
print(decode_review(x_train[0]))

train_data = keras.preprocessing.sequence.pad_sequences(x_train,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)

test_data = keras.preprocessing.sequence.pad_sequences(x_test,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)

# Normalization (the v1 approach, disabled here)
# train_data = tf.nn.l2_normalize(train_data.astype('float'))
# test_data = tf.nn.l2_normalize(test_data.astype('float'))

x_val = train_data[:10000]
partial_x_train = train_data[10000:]

y_val = y_train[:10000]
partial_y_train = y_train[10000:]

model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=10000, output_dim=16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)

loss_and_metrics = model.evaluate(test_data, y_test)
print(loss_and_metrics)
--------------------------------------------------------------------------------
/泰坦尼克生存预测案例/test.csv:
--------------------------------------------------------------------------------
1 | PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2 | 892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
3 | 893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S
4 | 894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q
5 | 895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S
6 | 896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S
7 | 897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S
8 | 898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q
9 | 899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S
10 | 900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C
11 | 901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S
12 | 902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
13 | 903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S
14 | 904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S
15 | 905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S
16 | 906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S
17 | 907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C
18 | 908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q
19 | 909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C
20 | 910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S
21 | 911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C
22 | 912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C
23 | 913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S
24 | 914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
25 | 915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C
26 | 916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C
27 | 917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S
28 | 918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C
29 | 919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C
30 | 920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S
31 | 921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
32 | 922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S
33 | 923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S
34 | 924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S
35 | 925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S
36 | 926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C
37 | 927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C
38 | 928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S
39 | 929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S
40 | 930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S
41 | 931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S
42 | 932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C
43 | 933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S
44 | 934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S
45 | 935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S
46 | 936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S
47 | 937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S
48 | 938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C
49 | 939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q
50 | 940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C
51 | 941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S
52 | 942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S
53 | 943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C
54 | 944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S
55 | 945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S
56 | 946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C
57 | 947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q
58 | 948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S
59 | 949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S
60 | 950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S
61 | 951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C
62 | 952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S
63 | 953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S
64 | 954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S
65 | 955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q
66 | 956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C
67 | 957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S
68 | 958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q
69 | 959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S
70 | 960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C
71 | 961,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S
72 | 962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q
73 | 963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S
74 | 964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S
75 | 965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C
76 | 966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C
77 | 967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C
78 | 968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S
79 | 969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S
80 | 970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S
81 | 971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q
82 | 972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C
83 | 973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S
84 | 974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S
85 | 975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S
86 | 976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q
87 | 977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C
88 | 978,3,"Barry, Miss. Julia",female,27,0,0,330844,7.8792,,Q
89 | 979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S
90 | 980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q
91 | 981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S
92 | 982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S
93 | 983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S
94 | 984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S
95 | 985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S
96 | 986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C
97 | 987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S
98 | 988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S
99 | 989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S
100 | 990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S
101 | 991,3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S
102 | 992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C
103 | 993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S
104 | 994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q
105 | 995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26,0,0,347070,7.775,,S
106 | 996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C
107 | 997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S
108 | 998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q
109 | 999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q
110 | 1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S
111 | 1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S
112 | 1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C
113 | 1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q
114 | 1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C
115 | 1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q
116 | 1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S
117 | 1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C
118 | 1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C
119 | 1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S
120 | 1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C
121 | 1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S
122 | 1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S
123 | 1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q
124 | 1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C
125 | 1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S
126 | 1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q
127 | 1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S
128 | 1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S
129 | 1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q
130 | 1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S
131 | 1021,3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S
132 | 1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S
133 | 1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C
134 | 1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S
135 | 1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C
136 | 1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S
137 | 1027,3,"Carlsson, Mr. Carl Robert",male,24,0,0,350409,7.8542,,S
138 | 1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C
139 | 1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S
140 | 1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S
141 | 1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S
142 | 1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S
143 | 1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S
144 | 1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C
145 | 1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S
146 | 1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S
147 | 1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S
148 | 1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S
149 | 1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S
150 | 1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S
151 | 1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S
152 | 1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C
153 | 1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C
154 | 1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S
155 | 1045,3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S
156 | 1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S
157 | 1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S
158 | 1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S
159 | 1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S
160 | 1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S
161 | 1051,3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S
162 | 1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q
163 | 1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C
164 | 1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S
165 | 1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S
166 | 1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S
167 | 1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S
168 | 1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C
169 | 1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S
170 | 1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C
171 | 1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S
172 | 1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S
173 | 1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C
174 | 1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S
175 | 1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C
176 | 1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S
177 | 1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S
178 | 1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S
179 | 1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C
180 | 1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S
181 | 1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C
182 | 1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S
183 | 1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C
184 | 1074,1,"Marvin, Mrs. Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S
185 | 1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q
186 | 1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C
187 | 1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S
188 | 1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S
189 | 1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S
190 | 1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S
191 | 1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S
192 | 1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S
193 | 1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S
194 | 1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S
195 | 1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q
196 | 1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S
197 | 1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S
198 | 1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C
199 | 1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S
200 | 1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S
201 | 1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S
202 | 1092,3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q
203 | 1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S
204 | 1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C
205 | 1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S
206 | 1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S
207 | 1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C
208 | 1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q
209 | 1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S
210 | 1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C
211 | 1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S
212 | 1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S
213 | 1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S
214 | 1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S
215 | 1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S
216 | 1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S
217 | 1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S
218 | 1108,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q
219 | 1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S
220 | 1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C
221 | 1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S
222 | 1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C
223 | 1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S
224 | 1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S
225 | 1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S
226 | 1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C
227 | 1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C
228 | 1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S
229 | 1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q
230 | 1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S
231 | 1121,2,"Hocking, Mr. Samuel James Metcalfe",male,36,0,0,242963,13,,S
232 | 1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S
233 | 1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S
234 | 1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S
235 | 1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q
236 | 1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C
237 | 1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S
238 | 1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C
239 | 1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C
240 | 1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S
241 | 1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C
242 | 1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C
243 | 1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S
244 | 1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C
245 | 1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S
246 | 1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S
247 | 1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S
248 | 1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S
249 | 1139,2,"Drew, Mr. James Vivian",male,42,1,1,28220,32.5,,S
250 | 1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S
251 | 1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C
252 | 1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S
253 | 1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S
254 | 1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C
255 | 1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S
256 | 1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S
257 | 1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S
258 | 1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q
259 | 1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S
260 | 1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S
261 | 1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S
262 | 1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S
263 | 1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S
264 | 1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S
265 | 1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S
266 | 1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C
267 | 1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S
268 | 1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S
269 | 1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S
270 | 1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S
271 | 1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S
272 | 1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C
273 | 1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q
274 | 1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C
275 | 1165,3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q
276 | 1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C
277 | 1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S
278 | 1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S
279 | 1169,2,"Faunthorpe, Mr. Harry",male,40,1,0,2926,26,,S
280 | 1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S
281 | 1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S
282 | 1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S
283 | 1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S
284 | 1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q
285 | 1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C
286 | 1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S
287 | 1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S
288 | 1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
289 | 1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S
290 | 1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C
291 | 1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S
292 | 1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S
293 | 1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q
294 | 1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C
295 | 1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S
296 | 1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S
297 | 1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S
298 | 1188,2,"Laroche, Miss. Louise",female,1,1,2,SC/Paris 2123,41.5792,,C
299 | 1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C
300 | 1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S
301 | 1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S
302 | 1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S
303 | 1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C
304 | 1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S
305 | 1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S
306 | 1196,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q
307 | 1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S
308 | 1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S
309 | 1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S
310 | 1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S
311 | 1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S
312 | 1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S
313 | 1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C
314 | 1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S
315 | 1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q
316 | 1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C
317 | 1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q
318 | 1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C
319 | 1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S
320 | 1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S
321 | 1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S
322 | 1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S
323 | 1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C
324 | 1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S
325 | 1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S
326 | 1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S
327 | 1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S
328 | 1218,2,"Becker, Miss. Ruth Elizabeth",female,12,2,1,230136,39,F4,S
329 | 1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C
330 | 1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S
331 | 1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S
332 | 1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S
333 | 1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C
334 | 1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C
335 | 1225,3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C
336 | 1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S
337 | 1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S
338 | 1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S
339 | 1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C
340 | 1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S
341 | 1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C
342 | 1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S
343 | 1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S
344 | 1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S
345 | 1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C
346 | 1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S
347 | 1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S
348 | 1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S
349 | 1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C
350 | 1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S
351 | 1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S
352 | 1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C
353 | 1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S
354 | 1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S
355 | 1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S
356 | 1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S
357 | 1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S
358 | 1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S
359 | 1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S
360 | 1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q
361 | 1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S
362 | 1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S
363 | 1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C
364 | 1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S
365 | 1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S
366 | 1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C
367 | 1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S
368 | 1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C
369 | 1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S
370 | 1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C
371 | 1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C
372 | 1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S
373 | 1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C
374 | 1264,1,"Ismay, Mr. Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S
375 | 1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S
376 | 1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S
377 | 1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C
378 | 1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S
379 | 1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S
380 | 1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S
381 | 1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S
382 | 1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q
383 | 1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q
384 | 1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S
385 | 1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S
386 | 1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
387 | 1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S
388 | 1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S
389 | 1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S
390 | 1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q
391 | 1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S
392 | 1282,1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S
393 | 1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S
394 | 1284,3,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S
395 | 1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S
396 | 1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S
397 | 1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S
398 | 1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q
399 | 1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C
400 | 1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S
401 | 1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q
402 | 1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S
403 | 1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S
404 | 1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C
405 | 1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S
406 | 1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C
407 | 1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C
408 | 1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S
409 | 1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C
410 | 1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
411 | 1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S
412 | 1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q
413 | 1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q
414 | 1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S
415 | 1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
416 | 1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C
417 | 1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
418 | 1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
419 | 1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C
420 |
--------------------------------------------------------------------------------
/蝴蝶花(iris)分类案例/Iris.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Data inspection
5 |
6 | # In[5]:
7 |
8 |
9 | import pandas as pd
10 | import numpy as np
11 |
12 |
13 | # In[18]:
14 |
15 |
16 | df = pd.read_csv('IrisFishData.csv')
17 | df.head()
18 |
19 |
20 | # In[19]:
21 |
22 |
23 | df.describe()
24 |
25 |
26 | # In[20]:
27 |
28 |
29 | df.isnull().values.any()
30 |
31 |
32 | # In[45]:
33 |
34 |
35 | df = pd.read_csv('IrisFishData.csv', na_values=['NA'])
36 |
37 |
38 | # df.isnull().sum()
39 |
40 | # In[9]:
41 |
42 |
43 | get_ipython().magic('matplotlib inline')
44 |
45 | import matplotlib.pyplot as plt
46 |
47 |
48 | # In[2]:
49 |
50 |
51 | import seaborn as sb
52 |
53 |
54 | # In[24]:
55 |
56 |
57 | sb.pairplot(df.dropna(), hue = 'class')
58 | # the diagonal shows each column's distribution
59 |
60 |
61 | # # Data cleaning
62 |
63 | # In[26]:
64 |
65 |
66 | df['class'].unique()
67 |
68 |
69 | # In[46]:
70 |
71 |
72 | df.loc[df['class'] == 'setossa', 'class'] = 'setosa'
73 | df['class'].unique()
74 |
75 |
76 | # ## The pairplot shows one blue point that always falls outside its cluster; it may be a data-entry error
77 |
78 | # In[55]:
79 |
80 |
81 | df.loc[df['class'] == 'setosa', 'sepal_width_cm'].describe()
82 |
83 |
84 | # In[76]:
85 |
86 |
87 | df = df.loc[(df['class'] != 'setosa') | ((df['class'] == 'setosa') & (df['sepal_width_cm'] >= 2.5))]
88 | # rows whose class is not setosa need no cleaning; setosa rows are kept only when sepal_width_cm is at least 2.5
89 |
90 |
91 | # In[75]:
92 |
93 |
94 | sub_df = df.loc[(df['class'] != 'setosa') | (df['sepal_width_cm'] >= 2.5)]
95 | sub_df.loc[sub_df['class'] == 'setosa', 'sepal_width_cm'].hist()
96 |
97 |
98 | # In[77]:
99 |
100 |
101 | df.to_csv('Iris_clean.csv', index=False)
102 |
103 |
104 | # In[6]:
105 |
106 |
107 | clean_df = pd.read_csv('Iris_clean.csv')
108 |
109 |
110 | # In[10]:
111 |
112 |
113 | sb.pairplot(clean_df, hue='class')
114 |
115 |
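116 | # ## Added sanity check (a sketch, not part of the original notebook): reload the
117 | # ## cleaned file and confirm both fixes took hold
118 | 
119 | # In[ ]:
120 | 
121 | 
122 | check_df = pd.read_csv('Iris_clean.csv')  # check_df is an illustrative name
123 | assert not (check_df['class'] == 'setossa').any()
124 | assert (check_df.loc[check_df['class'] == 'setosa', 'sepal_width_cm'] >= 2.5).all()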
--------------------------------------------------------------------------------
/蝴蝶花(iris)分类案例/Iris3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "[ 4.9 3.1 1.5 0.1]\t-->\tsetosa\t(Actual:setosa)\n",
13 | "[ 5.9 3. 5.1 1.8]\t-->\tvirginica\t(Actual:virginica)\n",
14 | "[ 5.8 2.7 3.9 1.2]\t-->\tversicolor\t(Actual:versicolor)\n",
15 | "[ 5.7 3. 4.2 1.2]\t-->\tversicolor\t(Actual:versicolor)\n",
16 | "[ 6.3 2.5 5. 1.9]\t-->\tvirginica\t(Actual:virginica)\n",
17 | "[ 4.8 3.4 1.6 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
18 | "[ 6.5 2.8 4.6 1.5]\t-->\tversicolor\t(Actual:versicolor)\n",
19 | "[ 5.6 2.9 3.6 1.3]\t-->\tversicolor\t(Actual:versicolor)\n",
20 | "[ 6.1 2.9 4.7 1.4]\t-->\tversicolor\t(Actual:versicolor)\n",
21 | "[ 4.6 3.2 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
22 | "[ 4.4 2.9 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
23 | "[ 4.3 3. 1.1 0.1]\t-->\tsetosa\t(Actual:setosa)\n",
24 | "[ 7.7 2.8 6.7 2. ]\t-->\tvirginica\t(Actual:virginica)\n",
25 | "[ 6.5 3. 5.5 1.8]\t-->\tvirginica\t(Actual:virginica)\n",
26 | "[ 6.7 3.3 5.7 2.5]\t-->\tvirginica\t(Actual:virginica)\n",
27 | "[ 4.7 3.2 1.6 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
28 | "[ 6. 2.7 5.1 1.6]\t-->\tversicolor\t(Actual:versicolor)\n",
29 | "[ 5.7 2.5 5. 2. ]\t-->\tvirginica\t(Actual:virginica)\n",
30 | "[ 6.9 3.1 5.1 2.3]\t-->\tvirginica\t(Actual:virginica)\n",
31 | "[ 5.7 2.9 4.2 1.3]\t-->\tversicolor\t(Actual:versicolor)\n",
32 | "[ 7.2 3.6 6.1 2.5]\t-->\tvirginica\t(Actual:virginica)\n",
33 | "[ 6.1 3. 4.9 1.8]\t-->\tvirginica\t(Actual:virginica)\n",
34 | "[ 5.5 2.3 4. 1.3]\t-->\tversicolor\t(Actual:versicolor)\n",
35 | "[ 6.7 2.5 5.8 1.8]\t-->\tvirginica\t(Actual:virginica)\n",
36 | "[ 5.7 2.8 4.5 1.3]\t-->\tversicolor\t(Actual:versicolor)\n",
37 | "[ 5.7 3.8 1.7 0.3]\t-->\tsetosa\t(Actual:setosa)\n",
38 | "[ 6.4 3.1 5.5 1.8]\t-->\tvirginica\t(Actual:virginica)\n",
39 | "[ 5.2 3.5 1.5 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
40 | "[ 6.7 3. 5.2 2.3]\t-->\tvirginica\t(Actual:virginica)\n",
41 | "[ 5.5 4.2 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
42 | "[ 4.9 3.1 1.5 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
43 | "[ 5.7 4.4 1.5 0.4]\t-->\tsetosa\t(Actual:setosa)\n",
44 | "[ 5. 2. 3.5 1. ]\t-->\tversicolor\t(Actual:versicolor)\n",
45 | "[ 5.4 3.7 1.5 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
46 | "[ 6.7 3.1 4.4 1.4]\t-->\tversicolor\t(Actual:versicolor)\n",
47 | "[ 5. 3.6 1.4 0.2]\t-->\tsetosa\t(Actual:setosa)\n",
48 | "[ 5.2 4.1 1.5 0.1]\t-->\tsetosa\t(Actual:setosa)\n",
49 | "[ 6.2 2.2 4.5 1.5]\t-->\tversicolor\t(Actual:versicolor)\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "%matplotlib inline\n",
55 | "\n",
56 | "import pandas as pd\n",
57 | "import numpy as np\n",
58 | "from sklearn.ensemble import RandomForestClassifier\n",
59 |     "from sklearn.model_selection import train_test_split\n",
60 |     "from sklearn.model_selection import cross_val_score\n",
61 | "\n",
62 | "df = pd.read_csv('Iris_clean.csv')\n",
63 | "\n",
64 | "training_set = df[['sepal_lenth_cm','sepal_width_cm','petal_length_cm','petal_width_cm']].values\n",
65 | "training_class = df['class'].values\n",
66 | "\n",
67 | "random_forest_classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
68 | " max_depth=None, max_features=4, max_leaf_nodes=None,\n",
69 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
70 | " min_samples_leaf=1, min_samples_split=2,\n",
71 | " min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,\n",
72 | " oob_score=False, random_state=None, verbose=0,\n",
73 | " warm_start=False)\n",
74 | "\n",
75 | "cv_scores = cross_val_score(random_forest_classifier, training_set, training_class, cv=10)\n",
76 | "\n",
77 | "(training_inputs,\n",
78 | "testing_inputs,\n",
79 | "training_classes,\n",
80 | "testing_classes) = train_test_split(training_set, training_class, train_size=0.75)\n",
81 | "\n",
82 |     "random_forest_classifier.fit(training_inputs, training_classes)\n",
83 | "\n",
84 | "for input_feature, prediction, actual in zip(testing_inputs,\n",
85 | " random_forest_classifier.predict(testing_inputs),\n",
86 | " testing_classes):\n",
87 | " print('{}\\t-->\\t{}\\t(Actual:{})'.format(input_feature, prediction, actual))"
88 | ]
89 | }
90 | ],
91 | "metadata": {
92 | "kernelspec": {
93 | "display_name": "Python 3",
94 | "language": "python",
95 | "name": "python3"
96 | },
97 | "language_info": {
98 | "codemirror_mode": {
99 | "name": "ipython",
100 | "version": 3
101 | },
102 | "file_extension": ".py",
103 | "mimetype": "text/x-python",
104 | "name": "python",
105 | "nbconvert_exporter": "python",
106 | "pygments_lexer": "ipython3",
107 | "version": "3.6.1"
108 | }
109 | },
110 | "nbformat": 4,
111 | "nbformat_minor": 2
112 | }
113 |
--------------------------------------------------------------------------------
/蝴蝶花(iris)分类案例/IrisFishData.csv:
--------------------------------------------------------------------------------
1 | sepal_lenth_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
2 | 4.3,3,1.1,0.1,setosa
3 | 4.4,2.9,1.4,0.2,setosa
4 | 4.4,3,1.3,0.2,setosa
5 | 4.4,3.2,1.3,0.2,setossa
6 | 4.5,2.3,1.3,0.3,setosa
7 | 4.6,3.1,1.5,0.2,setosa
8 | 4.6,3.4,1.4,0.3,setosa
9 | 4.6,3.6,1,0.2,setosa
10 | 4.6,3.2,1.4,0.2,setosa
11 | 4.7,3.2,1.3,0.2,setosa
12 | 4.7,3.2,1.6,0.2,setosa
13 | 4.8,3.4,1.6,0.2,setosa
14 | 4.8,3,1.4,0.1,setosa
15 | 4.8,3.4,1.9,0.2,setosa
16 | 4.8,3.1,1.6,0.2,setosa
17 | 4.8,3,1.4,0.3,setosa
18 | 4.9,3,1.4,0.2,setosa
19 | 4.9,3.1,1.5,0.1,setosa
20 | 4.9,3.1,1.5,0.2,setosa
21 | 4.9,3.6,1.4,0.1,setosa
22 | 5,3.6,1.4,0.2,setosa
23 | 5,3.4,1.5,0.2,setosa
24 | 5,3,1.6,0.2,setosa
25 | 5,3.4,1.6,0.4,setosa
26 | 5,3.2,1.2,0.2,setosa
27 | 5,3.5,1.3,0.3,setosa
28 | 5,3.5,1.6,0.6,setosa
29 | 5,3.3,1.4,0.2,setosa
30 | 5.1,3.5,1.4,0.2,setosa
31 | 5.1,3.5,1.4,0.3,setosa
32 | 5.1,3.8,1.5,0.3,setosa
33 | 5.1,3.7,1.5,0.4,setosa
34 | 5.1,3.3,1.7,0.5,setosa
35 | 5.1,3.4,1.5,0.2,setosa
36 | 5.1,3.8,1.9,0.4,setosa
37 | 5.1,3.8,1.6,0.2,setosa
38 | 5.2,3.5,1.5,0.2,setosa
39 | 5.2,3.4,1.4,0.2,setosa
40 | 5.2,4.1,1.5,0.1,setosa
41 | 5.3,3.7,1.5,0.2,setosa
42 | 5.4,3.9,1.7,0.4,setosa
43 | 5.4,3.7,1.5,0.2,setosa
44 | 5.4,3.9,1.3,0.4,setosa
45 | 5.4,3.4,1.7,0.2,setosa
46 | 5.4,3.4,1.5,0.4,setosa
47 | 5.5,4.2,1.4,0.2,setosa
48 | 5.5,3.5,1.3,0.2,setosa
49 | 5.7,4.4,1.5,0.4,setosa
50 | 5.7,3.8,1.7,0.3,setosa
51 | 5.8,4,1.2,0.2,setosa
52 | 4.9,2.4,3.3,1,versicolor
53 | 5,2,3.5,1,versicolor
54 | 5,2.3,3.3,1,versicolor
55 | 5.1,2.5,3,1.1,versicolor
56 | 5.2,2.7,3.9,1.4,versicolor
57 | 5.4,3,4.5,1.5,versicolor
58 | 5.5,2.3,4,1.3,versicolor
59 | 5.5,2.4,3.8,1.1,versicolor
60 | 5.5,2.4,3.7,1,versicolor
61 | 5.5,2.5,4,1.3,versicolor
62 | 5.5,2.6,4.4,1.2,versicolor
63 | 5.6,2.9,3.6,1.3,versicolor
64 | 5.6,3,4.5,1.5,versicolor
65 | 5.6,2.5,3.9,1.1,versicolor
66 | 5.6,3,4.1,1.3,versicolor
67 | 5.6,2.7,4.2,1.3,versicolor
68 | 5.7,2.8,4.5,1.3,versicolor
69 | 5.7,2.6,3.5,1,versicolor
70 | 5.7,3,4.2,1.2,versicolor
71 | 5.7,2.9,4.2,1.3,versicolor
72 | 5.7,2.8,4.1,1.3,versicolor
73 | 5.8,2.7,4.1,1,versicolor
74 | 5.8,2.7,3.9,1.2,versicolor
75 | 5.8,2.6,4,1.2,versicolor
76 | 5.9,3,4.2,1.5,versicolor
77 | 5.9,3.2,4.8,1.8,versicolor
78 | 6,2.2,4,1,versicolor
79 | 6,2.9,4.5,1.5,versicolor
80 | 6,2.7,5.1,1.6,versicolor
81 | 6,3.4,4.5,1.6,versicolor
82 | 6.1,2.9,4.7,1.4,versicolor
83 | 6.1,2.8,4,1.3,versicolor
84 | 6.1,2.8,4.7,1.2,versicolor
85 | 6.1,3,4.6,1.4,versicolor
86 | 6.2,2.2,4.5,1.5,versicolor
87 | 6.2,2.9,4.3,1.3,versicolor
88 | 6.3,3.3,4.7,1.6,versicolor
89 | 6.3,2.5,4.9,1.5,versicolor
90 | 6.3,2.3,4.4,1.3,versicolor
91 | 6.4,3.2,4.5,1.5,versicolor
92 | 6.4,2.9,4.3,1.3,versicolor
93 | 6.5,2.8,4.6,1.5,versicolor
94 | 6.6,2.9,4.6,1.3,versicolor
95 | 6.6,3,4.4,1.4,versicolor
96 | 6.7,3.1,4.4,1.4,versicolor
97 | 6.7,3,5,1.7,versicolor
98 | 6.7,3.1,4.7,1.5,versicolor
99 | 6.8,2.8,4.8,1.4,versicolor
100 | 6.9,3.1,4.9,1.5,versicolor
101 | 7,3.2,4.7,1.4,versicolor
102 | 4.9,2.5,4.5,1.7,virginica
103 | 5.6,2.8,4.9,2,virginica
104 | 5.7,2.5,5,2,virginica
105 | 5.8,2.7,5.1,1.9,virginica
106 | 5.8,2.8,5.1,2.4,virginica
107 | 5.8,2.7,5.1,1.9,virginica
108 | 5.9,3,5.1,1.8,virginica
109 | 6,2.2,5,1.5,virginica
110 | 6,3,4.8,1.8,virginica
111 | 6.1,3,4.9,1.8,virginica
112 | 6.1,2.6,5.6,1.4,virginica
113 | 6.2,2.8,4.8,1.8,virginica
114 | 6.2,3.4,5.4,2.3,virginica
115 | 6.3,3.3,6,2.5,virginica
116 | 6.3,2.9,5.6,1.8,virginica
117 | 6.3,2.7,4.9,1.8,virginica
118 | 6.3,2.8,5.1,1.5,virginica
119 | 6.3,3.4,5.6,2.4,virginica
120 | 6.3,2.5,5,1.9,virginica
121 | 6.4,2.7,5.3,1.9,virginica
122 | 6.4,3.2,5.3,2.3,virginica
123 | 6.4,2.8,5.6,2.1,virginica
124 | 6.4,2.8,5.6,2.2,virginica
125 | 6.4,3.1,5.5,1.8,virginica
126 | 6.5,3,5.8,2.2,virginica
127 | 6.5,3.2,5.1,2,virginica
128 | 6.5,3,5.5,1.8,virginica
129 | 6.5,3,5.2,2,virginica
130 | 6.7,2.5,5.8,1.8,virginica
131 | 6.7,3.3,5.7,2.1,virginica
132 | 6.7,3.1,5.6,2.4,virginica
133 | 6.7,3.3,5.7,2.5,virginica
134 | 6.7,3,5.2,2.3,virginica
135 | 6.8,3,5.5,2.1,virginica
136 | 6.8,3.2,5.9,2.3,virginica
137 | 6.9,3.2,5.7,2.3,virginica
138 | 6.9,3.1,5.4,2.1,virginica
139 | 6.9,3.1,5.1,2.3,virginica
140 | 7.1,3,5.9,2.1,virginica
141 | 7.2,3.6,6.1,2.5,virginica
142 | 7.2,3.2,6,1.8,virginica
143 | 7.2,3,5.8,1.6,virginica
144 | 7.3,2.9,6.3,1.8,virginica
145 | 7.4,2.8,6.1,1.9,virginica
146 | 7.6,3,6.6,2.1,virginica
147 | 7.7,3.8,6.7,2.2,virginica
148 | 7.7,2.6,6.9,2.3,virginica
149 | 7.7,2.8,6.7,2,virginica
150 | 7.7,3,6.1,2.3,virginica
151 | 7.9,3.8,6.4,2,virginica
152 |
--------------------------------------------------------------------------------
/蝴蝶花(iris)分类案例/Iris_clean.csv:
--------------------------------------------------------------------------------
1 | sepal_lenth_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
2 | 4.3,3.0,1.1,0.1,setosa
3 | 4.4,2.9,1.4,0.2,setosa
4 | 4.4,3.0,1.3,0.2,setosa
5 | 4.4,3.2,1.3,0.2,setosa
6 | 4.6,3.1,1.5,0.2,setosa
7 | 4.6,3.4,1.4,0.3,setosa
8 | 4.6,3.6,1.0,0.2,setosa
9 | 4.6,3.2,1.4,0.2,setosa
10 | 4.7,3.2,1.3,0.2,setosa
11 | 4.7,3.2,1.6,0.2,setosa
12 | 4.8,3.4,1.6,0.2,setosa
13 | 4.8,3.0,1.4,0.1,setosa
14 | 4.8,3.4,1.9,0.2,setosa
15 | 4.8,3.1,1.6,0.2,setosa
16 | 4.8,3.0,1.4,0.3,setosa
17 | 4.9,3.0,1.4,0.2,setosa
18 | 4.9,3.1,1.5,0.1,setosa
19 | 4.9,3.1,1.5,0.2,setosa
20 | 4.9,3.6,1.4,0.1,setosa
21 | 5.0,3.6,1.4,0.2,setosa
22 | 5.0,3.4,1.5,0.2,setosa
23 | 5.0,3.0,1.6,0.2,setosa
24 | 5.0,3.4,1.6,0.4,setosa
25 | 5.0,3.2,1.2,0.2,setosa
26 | 5.0,3.5,1.3,0.3,setosa
27 | 5.0,3.5,1.6,0.6,setosa
28 | 5.0,3.3,1.4,0.2,setosa
29 | 5.1,3.5,1.4,0.2,setosa
30 | 5.1,3.5,1.4,0.3,setosa
31 | 5.1,3.8,1.5,0.3,setosa
32 | 5.1,3.7,1.5,0.4,setosa
33 | 5.1,3.3,1.7,0.5,setosa
34 | 5.1,3.4,1.5,0.2,setosa
35 | 5.1,3.8,1.9,0.4,setosa
36 | 5.1,3.8,1.6,0.2,setosa
37 | 5.2,3.5,1.5,0.2,setosa
38 | 5.2,3.4,1.4,0.2,setosa
39 | 5.2,4.1,1.5,0.1,setosa
40 | 5.3,3.7,1.5,0.2,setosa
41 | 5.4,3.9,1.7,0.4,setosa
42 | 5.4,3.7,1.5,0.2,setosa
43 | 5.4,3.9,1.3,0.4,setosa
44 | 5.4,3.4,1.7,0.2,setosa
45 | 5.4,3.4,1.5,0.4,setosa
46 | 5.5,4.2,1.4,0.2,setosa
47 | 5.5,3.5,1.3,0.2,setosa
48 | 5.7,4.4,1.5,0.4,setosa
49 | 5.7,3.8,1.7,0.3,setosa
50 | 5.8,4.0,1.2,0.2,setosa
51 | 4.9,2.4,3.3,1.0,versicolor
52 | 5.0,2.0,3.5,1.0,versicolor
53 | 5.0,2.3,3.3,1.0,versicolor
54 | 5.1,2.5,3.0,1.1,versicolor
55 | 5.2,2.7,3.9,1.4,versicolor
56 | 5.4,3.0,4.5,1.5,versicolor
57 | 5.5,2.3,4.0,1.3,versicolor
58 | 5.5,2.4,3.8,1.1,versicolor
59 | 5.5,2.4,3.7,1.0,versicolor
60 | 5.5,2.5,4.0,1.3,versicolor
61 | 5.5,2.6,4.4,1.2,versicolor
62 | 5.6,2.9,3.6,1.3,versicolor
63 | 5.6,3.0,4.5,1.5,versicolor
64 | 5.6,2.5,3.9,1.1,versicolor
65 | 5.6,3.0,4.1,1.3,versicolor
66 | 5.6,2.7,4.2,1.3,versicolor
67 | 5.7,2.8,4.5,1.3,versicolor
68 | 5.7,2.6,3.5,1.0,versicolor
69 | 5.7,3.0,4.2,1.2,versicolor
70 | 5.7,2.9,4.2,1.3,versicolor
71 | 5.7,2.8,4.1,1.3,versicolor
72 | 5.8,2.7,4.1,1.0,versicolor
73 | 5.8,2.7,3.9,1.2,versicolor
74 | 5.8,2.6,4.0,1.2,versicolor
75 | 5.9,3.0,4.2,1.5,versicolor
76 | 5.9,3.2,4.8,1.8,versicolor
77 | 6.0,2.2,4.0,1.0,versicolor
78 | 6.0,2.9,4.5,1.5,versicolor
79 | 6.0,2.7,5.1,1.6,versicolor
80 | 6.0,3.4,4.5,1.6,versicolor
81 | 6.1,2.9,4.7,1.4,versicolor
82 | 6.1,2.8,4.0,1.3,versicolor
83 | 6.1,2.8,4.7,1.2,versicolor
84 | 6.1,3.0,4.6,1.4,versicolor
85 | 6.2,2.2,4.5,1.5,versicolor
86 | 6.2,2.9,4.3,1.3,versicolor
87 | 6.3,3.3,4.7,1.6,versicolor
88 | 6.3,2.5,4.9,1.5,versicolor
89 | 6.3,2.3,4.4,1.3,versicolor
90 | 6.4,3.2,4.5,1.5,versicolor
91 | 6.4,2.9,4.3,1.3,versicolor
92 | 6.5,2.8,4.6,1.5,versicolor
93 | 6.6,2.9,4.6,1.3,versicolor
94 | 6.6,3.0,4.4,1.4,versicolor
95 | 6.7,3.1,4.4,1.4,versicolor
96 | 6.7,3.0,5.0,1.7,versicolor
97 | 6.7,3.1,4.7,1.5,versicolor
98 | 6.8,2.8,4.8,1.4,versicolor
99 | 6.9,3.1,4.9,1.5,versicolor
100 | 7.0,3.2,4.7,1.4,versicolor
101 | 4.9,2.5,4.5,1.7,virginica
102 | 5.6,2.8,4.9,2.0,virginica
103 | 5.7,2.5,5.0,2.0,virginica
104 | 5.8,2.7,5.1,1.9,virginica
105 | 5.8,2.8,5.1,2.4,virginica
106 | 5.8,2.7,5.1,1.9,virginica
107 | 5.9,3.0,5.1,1.8,virginica
108 | 6.0,2.2,5.0,1.5,virginica
109 | 6.0,3.0,4.8,1.8,virginica
110 | 6.1,3.0,4.9,1.8,virginica
111 | 6.1,2.6,5.6,1.4,virginica
112 | 6.2,2.8,4.8,1.8,virginica
113 | 6.2,3.4,5.4,2.3,virginica
114 | 6.3,3.3,6.0,2.5,virginica
115 | 6.3,2.9,5.6,1.8,virginica
116 | 6.3,2.7,4.9,1.8,virginica
117 | 6.3,2.8,5.1,1.5,virginica
118 | 6.3,3.4,5.6,2.4,virginica
119 | 6.3,2.5,5.0,1.9,virginica
120 | 6.4,2.7,5.3,1.9,virginica
121 | 6.4,3.2,5.3,2.3,virginica
122 | 6.4,2.8,5.6,2.1,virginica
123 | 6.4,2.8,5.6,2.2,virginica
124 | 6.4,3.1,5.5,1.8,virginica
125 | 6.5,3.0,5.8,2.2,virginica
126 | 6.5,3.2,5.1,2.0,virginica
127 | 6.5,3.0,5.5,1.8,virginica
128 | 6.5,3.0,5.2,2.0,virginica
129 | 6.7,2.5,5.8,1.8,virginica
130 | 6.7,3.3,5.7,2.1,virginica
131 | 6.7,3.1,5.6,2.4,virginica
132 | 6.7,3.3,5.7,2.5,virginica
133 | 6.7,3.0,5.2,2.3,virginica
134 | 6.8,3.0,5.5,2.1,virginica
135 | 6.8,3.2,5.9,2.3,virginica
136 | 6.9,3.2,5.7,2.3,virginica
137 | 6.9,3.1,5.4,2.1,virginica
138 | 6.9,3.1,5.1,2.3,virginica
139 | 7.1,3.0,5.9,2.1,virginica
140 | 7.2,3.6,6.1,2.5,virginica
141 | 7.2,3.2,6.0,1.8,virginica
142 | 7.2,3.0,5.8,1.6,virginica
143 | 7.3,2.9,6.3,1.8,virginica
144 | 7.4,2.8,6.1,1.9,virginica
145 | 7.6,3.0,6.6,2.1,virginica
146 | 7.7,3.8,6.7,2.2,virginica
147 | 7.7,2.6,6.9,2.3,virginica
148 | 7.7,2.8,6.7,2.0,virginica
149 | 7.7,3.0,6.1,2.3,virginica
150 | 7.9,3.8,6.4,2.0,virginica
151 |
--------------------------------------------------------------------------------
/蝴蝶花(iris)分类案例/iris2.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Data exploration
5 |
6 | # In[2]:
7 |
8 |
9 | get_ipython().magic('matplotlib inline')
10 |
11 | import matplotlib.pyplot as plt
12 | import pandas as pd
13 | import numpy as np
14 | import seaborn as sb
15 |
16 |
17 | # In[3]:
18 |
19 |
20 | df = pd.read_csv('Iris_clean.csv')
21 | sb.pairplot(df)
22 |
23 |
24 | # In[4]:
25 |
26 |
27 | plt.figure(figsize=(10,10))
28 |
29 | for column_index, column in enumerate(df.columns):
30 | if column == 'class':
31 | continue
32 |     # split into four subplots, one per feature column
33 |     plt.subplot(2, 2, column_index+1)
34 |     # draw the feature's per-class distribution in its subplot
35 | sb.violinplot(x='class', y=column, data=df)
36 |
37 |
38 | # ### Train/test split
39 |
40 | # In[5]:
41 |
42 |
43 | df = pd.read_csv('Iris_clean.csv')
44 | # scikit-learn expects numpy arrays as input
45 | training_set = df[['sepal_lenth_cm','sepal_width_cm','petal_length_cm','petal_width_cm']].values
46 | print(training_set[:5])
47 | training_class = df['class'].values
48 | print(training_class[:5])
49 |
50 |
51 | # In[11]:
52 |
53 |
54 | from sklearn.model_selection import train_test_split
55 | import warnings
56 | warnings.filterwarnings("ignore", category=DeprecationWarning)
57 | warnings.filterwarnings("ignore", category=RuntimeWarning)
58 | warnings.filterwarnings("ignore", category=FutureWarning)
59 | # silence these warnings so they don't clutter the training run
60 |
61 |
62 | # In[12]:
63 |
64 |
65 | (training_inputs,
66 | testing_inputs,
67 | training_classes,
68 | testing_classes) = train_test_split(training_set, training_class, train_size=0.75, random_state=1)
69 |
70 |
71 | from sklearn.tree import DecisionTreeClassifier
72 | 
73 | # create the classifier object
74 | tree_classifier = DecisionTreeClassifier()
75 | 
76 | tree_classifier.fit(training_inputs, training_classes)
77 | print(tree_classifier.score(testing_inputs, testing_classes))
78 |
79 | # ## About 97% accuracy, which is decent
80 |
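81 | # ## Added sketch (not in the original notebook): cross-validate the same tree for a
82 | # ## more stable accuracy estimate than a single 75/25 split
83 | 
84 | # In[ ]:
85 | 
86 | 
87 | from sklearn.model_selection import cross_val_score
88 | 
89 | # reuses training_set/training_class and the DecisionTreeClassifier import above
90 | cv_scores = cross_val_score(DecisionTreeClassifier(), training_set, training_class, cv=10)
91 | print(cv_scores.mean())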
--------------------------------------------------------------------------------
/讯飞CTR预测/RXY初版/feature_extract_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "import xgboost as xgb"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "train_data = pd.read_table('./data/round1_iflyad_train.txt')"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 3,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | " instance_id time city province \\\n",
33 | "0 86294719979897807 2190219034 137103102105100 137103102100100 \n",
34 | "1 2699289844928136052 2190221070 137105101100100 137105101100100 \n",
35 | "2 3117527168445845752 2190219793 137103104111100 137103104100100 \n",
36 | "3 3398484891050993371 2190221704 137103102113100 137103102100100 \n",
37 | "4 2035477570591176488 2190220024 137103102109100 137103102100100 \n",
38 | "\n",
39 | " user_tags carrier devtype \\\n",
40 | "0 NaN 1 2 \n",
41 | "1 2100191,2100078,3001825,,3001781,3001791,30017... 3 2 \n",
42 | "2 NaN 3 2 \n",
43 | "3 2100098,gd_2100000,3001791,3001795,3002193,300... 0 2 \n",
44 | "4 NaN 1 2 \n",
45 | "\n",
46 | " make model nnt ... creative_width creative_height \\\n",
47 | "0 HUAWEI HUAWEI-CAZ-AL10 1 ... 1280 720 \n",
48 | "1 Xiaomi Redmi Note 4 1 ... 960 640 \n",
49 | "2 OPPO OPPO+R11s 1 ... 960 640 \n",
50 | "3 NaN OPPO A57 1 ... 1280 720 \n",
51 | "4 Apple iPhone 7 3 ... 960 640 \n",
52 | "\n",
53 | " creative_is_jump creative_is_download creative_is_js creative_is_voicead \\\n",
54 | "0 True False False False \n",
55 | "1 True False False False \n",
56 | "2 True False False False \n",
57 | "3 True False False False \n",
58 | "4 True False False False \n",
59 | "\n",
60 | " creative_has_deeplink app_paid advert_name click \n",
61 | "0 False False B4734117F35EE97F 0 \n",
62 | "1 False False B4734117F35EE97F 0 \n",
63 | "2 False False E257895F74792E81 0 \n",
64 | "3 False False 0A421D7B11EABFC5 0 \n",
65 | "4 False False B4734117F35EE97F 0 \n",
66 | "\n",
67 | "[5 rows x 35 columns]\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "print(train_data.head())"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "test_data = pd.read_table('./data/round1_iflyad_test_feature.txt')"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 5,
87 | "metadata": {
88 | "scrolled": true
89 | },
90 | "outputs": [
91 | {
92 | "name": "stdout",
93 | "output_type": "stream",
94 | "text": [
95 | " instance_id time city province \\\n",
96 | "0 6930856710792380886 2190675456 137103104101100 137103104100100 \n",
97 | "1 5460409694420131920 2190674821 137103104112100 137103104100100 \n",
98 | "2 982813438159141507 2190674111 137105103101100 137105103100100 \n",
99 | "3 529991959116679673 2190675256 137106101107100 137106101100100 \n",
100 | "4 5357053206615171780 2190673926 137103102101100 137103102100100 \n",
101 | "\n",
102 | " user_tags carrier devtype \\\n",
103 | "0 NaN 2 2 \n",
104 | "1 3004406,3004430,3004434 1 2 \n",
105 | "2 3003779,3003843,3003851,3003863,3003865,300386... 2 2 \n",
106 | "3 NaN 2 2 \n",
107 | "4 2100191,2100041,2100078,2100136,2100042,300182... 3 2 \n",
108 | "\n",
109 | " make model nnt ... creative_type \\\n",
110 | "0 Apple iPhone 8 Plus 1 ... 8 \n",
111 | "1 vivo vivo X9Plus 1 ... 8 \n",
112 | "2 OPPO A73t OPPO A73t 4 ... 5 \n",
113 | "3 vivo Z1 vivo Z1 4 ... 8 \n",
114 | "4 HUAWEI HUAWEI MLA-AL10 4 ... 5 \n",
115 | "\n",
116 | " creative_width creative_height creative_is_jump creative_is_download \\\n",
117 | "0 960 640 True False \n",
118 | "1 960 640 True False \n",
119 | "2 160 640 True False \n",
120 | "3 960 640 True False \n",
121 | "4 320 480 True False \n",
122 | "\n",
123 | " creative_is_js creative_is_voicead creative_has_deeplink app_paid \\\n",
124 | "0 False False False False \n",
125 | "1 False False False False \n",
126 | "2 False False False False \n",
127 | "3 False False False False \n",
128 | "4 False False False False \n",
129 | "\n",
130 | " advert_name \n",
131 | "0 B4734117F35EE97F \n",
132 | "1 B4734117F35EE97F \n",
133 | "2 B4734117F35EE97F \n",
134 | "3 B4734117F35EE97F \n",
135 | "4 42A4CB9035B7F50E \n",
136 | "\n",
137 | "[5 rows x 34 columns]\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "print(test_data.head())"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 6,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "name": "stdout",
152 | "output_type": "stream",
153 | "text": [
154 |     "<class 'pandas.core.frame.DataFrame'>\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "print(type(test_data))"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 7,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | "bool\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "print(test_data['creative_is_js'].dtype)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 8,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 |     "for u in test_data.columns:\n",
186 | " if test_data[u].dtype==bool:\n",
187 | " test_data[u]=test_data[u].astype('int')"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 9,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | " instance_id time city province \\\n",
200 | "0 6930856710792380886 2190675456 137103104101100 137103104100100 \n",
201 | "1 5460409694420131920 2190674821 137103104112100 137103104100100 \n",
202 | "2 982813438159141507 2190674111 137105103101100 137105103100100 \n",
203 | "3 529991959116679673 2190675256 137106101107100 137106101100100 \n",
204 | "4 5357053206615171780 2190673926 137103102101100 137103102100100 \n",
205 | "\n",
206 | " user_tags carrier devtype \\\n",
207 | "0 NaN 2 2 \n",
208 | "1 3004406,3004430,3004434 1 2 \n",
209 | "2 3003779,3003843,3003851,3003863,3003865,300386... 2 2 \n",
210 | "3 NaN 2 2 \n",
211 | "4 2100191,2100041,2100078,2100136,2100042,300182... 3 2 \n",
212 | "\n",
213 | " make model nnt ... creative_type \\\n",
214 | "0 Apple iPhone 8 Plus 1 ... 8 \n",
215 | "1 vivo vivo X9Plus 1 ... 8 \n",
216 | "2 OPPO A73t OPPO A73t 4 ... 5 \n",
217 | "3 vivo Z1 vivo Z1 4 ... 8 \n",
218 | "4 HUAWEI HUAWEI MLA-AL10 4 ... 5 \n",
219 | "\n",
220 | " creative_width creative_height creative_is_jump creative_is_download \\\n",
221 | "0 960 640 1 0 \n",
222 | "1 960 640 1 0 \n",
223 | "2 160 640 1 0 \n",
224 | "3 960 640 1 0 \n",
225 | "4 320 480 1 0 \n",
226 | "\n",
227 | " creative_is_js creative_is_voicead creative_has_deeplink app_paid \\\n",
228 | "0 0 0 0 0 \n",
229 | "1 0 0 0 0 \n",
230 | "2 0 0 0 0 \n",
231 | "3 0 0 0 0 \n",
232 | "4 0 0 0 0 \n",
233 | "\n",
234 | " advert_name \n",
235 | "0 B4734117F35EE97F \n",
236 | "1 B4734117F35EE97F \n",
237 | "2 B4734117F35EE97F \n",
238 | "3 B4734117F35EE97F \n",
239 | "4 42A4CB9035B7F50E \n",
240 | "\n",
241 | "[5 rows x 34 columns]\n"
242 | ]
243 | }
244 | ],
245 | "source": [
246 | "print(test_data.head())"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 12,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | "40024\n"
259 | ]
260 | }
261 | ],
262 | "source": [
263 | "print(len(test_data['user_tags']))"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 13,
269 | "metadata": {
270 | "scrolled": true
271 | },
272 | "outputs": [
273 | {
274 | "name": "stdout",
275 | "output_type": "stream",
276 | "text": [
277 | "0 0\n",
278 | "1 0\n",
279 | "2 0\n",
280 | "3 0\n",
281 | "4 0\n",
282 | "Name: f_channel, dtype: object\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "test_data = test_data.fillna(0)\n",
288 | "print(test_data['f_channel'].head())"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 14,
294 | "metadata": {
295 | "scrolled": true
296 | },
297 | "outputs": [
298 | {
299 | "name": "stderr",
300 | "output_type": "stream",
301 | "text": [
302 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
303 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
304 | "\n",
305 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
306 | " \"\"\"\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 |     "# keep the digits after the underscore in the f_channel field\n",
312 | "for i in range(len(test_data['f_channel'])):\n",
313 | " if test_data['f_channel'][i] != 0:\n",
314 | " #print(i, test_data['f_channel'][i].split('_')[-1])\n",
315 | " test_data['f_channel'][i] = test_data['f_channel'][i].split('_')[-1]"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 27,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "23\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "print(len(test_data['user_tags'][1]))"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [
340 | {
341 | "name": "stderr",
342 | "output_type": "stream",
343 | "text": [
344 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
345 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
346 | "\n",
347 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
348 | " \"\"\"\n"
349 | ]
350 | }
351 | ],
352 | "source": [
353 |     "# use the length of the user_tags string as the feature value\n",
354 | "for i in range(len(test_data['user_tags'])):\n",
355 | " if type(test_data['user_tags'][i]) != int:\n",
356 | " #print(i, test_data['f_channel'][i].split('_')[-1])\n",
357 | " test_data['user_tags'][i] = len(test_data['user_tags'][i])"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "print(type(test_data['user_tags'][1]) == str)"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 18,
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "1\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "print(test_data['user_tags'][1])"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 22,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "for u in test_data['user_tags']:\n",
393 | " if type(u) != int:\n",
394 |     "        u = len(u)  # note: rebinding u leaves test_data unchanged"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 23,
400 | "metadata": {},
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/plain": [
405 | "135"
406 | ]
407 | },
408 | "execution_count": 23,
409 | "metadata": {},
410 | "output_type": "execute_result"
411 | }
412 | ],
413 | "source": [
414 | "len(test_data['user_tags'][2])"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 24,
420 | "metadata": {},
421 | "outputs": [
422 | {
423 | "data": {
424 | "text/plain": [
425 | "'3003779,3003843,3003851,3003863,3003865,3003869,3003875,3004059,3004081,3004089,3004153,3004214,3004266,3004430,3004434,3004500,3004506'"
426 | ]
427 | },
428 | "execution_count": 24,
429 | "metadata": {},
430 | "output_type": "execute_result"
431 | }
432 | ],
433 | "source": [
434 | "test_data['user_tags'][2]"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 37,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "name": "stderr",
444 | "output_type": "stream",
445 | "text": [
446 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
447 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
448 | "\n",
449 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
450 | " This is separate from the ipykernel package so we can avoid doing imports until\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "for u,i in zip(test_data['user_tags'], range(5)):\n",
456 | " if type(u) != int:\n",
457 | " test_data['user_tags'][i] = len(u)"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 38,
463 | "metadata": {},
464 | "outputs": [
465 | {
466 | "name": "stdout",
467 | "output_type": "stream",
468 | "text": [
469 | "0 0\n",
470 | "1 1\n",
471 | "135 2\n",
472 | "0 3\n",
473 | "824 4\n"
474 | ]
475 | }
476 | ],
477 | "source": [
478 | "for u,i in zip(test_data['user_tags'], range(5)):\n",
479 | " print(u, i)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 39,
485 | "metadata": {},
486 | "outputs": [
487 | {
488 | "name": "stderr",
489 | "output_type": "stream",
490 | "text": [
491 | "C:\\ProgramData\\Anaconda2\\envs\\py36\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
492 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
493 | "\n",
494 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
495 | " This is separate from the ipykernel package so we can avoid doing imports until\n"
496 | ]
497 | }
498 | ],
499 | "source": [
500 | "for u,i in zip(test_data['user_tags'], range(len(test_data['user_tags']))):\n",
501 | " if type(u) != int:\n",
502 | " test_data['user_tags'][i] = len(u)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": []
511 | }
512 | ],
513 | "metadata": {
514 | "kernelspec": {
515 | "display_name": "Python 3",
516 | "language": "python",
517 | "name": "python3"
518 | },
519 | "language_info": {
520 | "codemirror_mode": {
521 | "name": "ipython",
522 | "version": 3
523 | },
524 | "file_extension": ".py",
525 | "mimetype": "text/x-python",
526 | "name": "python",
527 | "nbconvert_exporter": "python",
528 | "pygments_lexer": "ipython3",
529 | "version": "3.6.4"
530 | }
531 | },
532 | "nbformat": 4,
533 | "nbformat_minor": 2
534 | }
535 |
--------------------------------------------------------------------------------
/讯飞CTR预测/RXY初版/feature_re_extract.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 |     "# load the data produced in the previous step\n",
20 | "test_data = pd.read_csv('test_data.csv')\n",
21 | "train_data = pd.read_csv('train_data.csv')"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 |     "# the advert_industry_inner field packs two IDs separated by an underscore\n",
31 |     "def dataInterval(data1):\n",
32 |     "    d1 = data1.split('_')[0]\n",
33 |     "    return d1\n",
34 |     "\n",
35 |     "def getInterval(arrLike):  # extracts the first ID from advert_industry_inner\n",
36 |     "    PublishedTime = arrLike['advert_industry_inner']\n",
37 |     "\n",
38 |     "    d1 = dataInterval(PublishedTime)\n",
39 |     "    return d1\n",
40 |     "\n",
41 |     "def dataInterval2(data1):\n",
42 |     "    d2 = data1.split('_')[1]\n",
43 |     "    return d2\n",
44 |     "\n",
45 |     "def getInterval2(arrLike):  # extracts the second ID from advert_industry_inner\n",
46 |     "    PublishedTime = arrLike['advert_industry_inner']\n",
47 |     "\n",
48 |     "    d2 = dataInterval2(PublishedTime)\n",
49 |     "    return d2\n",
50 | " \n",
51 |     "# using apply is much faster than iterating row by row!\n",
52 | "test_data['advert_first'] = test_data.apply(getInterval , axis = 1)\n",
53 | "test_data['advert_second'] = test_data.apply(getInterval2 , axis = 1)\n",
54 | "\n",
55 | "train_data['advert_first'] = train_data.apply(getInterval , axis = 1)\n",
56 | "train_data['advert_second'] = train_data.apply(getInterval2 , axis = 1)"
57 | ]
58 | },
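  {"cell_type": "markdown", "metadata": {}, "source": ["Added note, a minimal sketch rather than the notebook's own approach (the cell below was not part of the original run; `parts` is an illustrative name): pandas' vectorized `str.split` can produce both IDs at once, without row-wise `apply`."]},
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# vectorized alternative to the getInterval/getInterval2 apply calls above\n", "parts = train_data['advert_industry_inner'].str.split('_', expand=True)\n", "train_data['advert_first'] = parts[0]\n", "train_data['advert_second'] = parts[1]"]},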
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 |     "# while we are at it, compute the numeric gap between the two IDs\n",
66 | "def getDistance(arrLike):\n",
67 | " delta = int(arrLike['advert_second']) - int(arrLike['advert_first'])\n",
68 | " return delta\n",
69 | "\n",
70 | "test_data['advert_delta'] = test_data.apply(getDistance, axis = 1)\n",
71 | "train_data['advert_delta'] = train_data.apply(getDistance, axis = 1)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 5,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 |     "# drop the original advert_industry_inner field\n",
81 | "test_data.drop(columns=['advert_industry_inner'], inplace=True)\n",
82 | "train_data.drop(columns=['advert_industry_inner'], inplace=True)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 6,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 |     "# convert every column to a numeric type the model can train on\n",
92 |     "test_data['advert_first'] = test_data['advert_first'].astype('int64')\n",
93 |     "train_data['advert_first'] = train_data['advert_first'].astype('int64')\n",
94 |     "\n",
95 |     "test_data['advert_second'] = test_data['advert_second'].astype('int64')\n",
96 |     "train_data['advert_second'] = train_data['advert_second'].astype('int64')"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 45,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/plain": [
107 | "Unnamed: 0 int64\n",
108 | "instance_id int64\n",
109 | "time int64\n",
110 | "city int64\n",
111 | "province int64\n",
112 | "user_tags int64\n",
113 | "carrier int64\n",
114 | "devtype int64\n",
115 | "nnt int64\n",
116 | "os int64\n",
117 | "os_name int64\n",
118 | "adid int64\n",
119 | "advert_id int64\n",
120 | "orderid int64\n",
121 | "campaign_id int64\n",
122 | "creative_id int64\n",
123 | "creative_tp_dnf int64\n",
124 | "app_cate_id float64\n",
125 | "f_channel int64\n",
126 | "app_id float64\n",
127 | "creative_type int64\n",
128 | "creative_width int64\n",
129 | "creative_height int64\n",
130 | "creative_is_jump int64\n",
131 | "creative_is_download int64\n",
132 | "creative_is_js int64\n",
133 | "creative_is_voicead int64\n",
134 | "creative_has_deeplink int64\n",
135 | "app_paid int64\n",
136 | "click int64\n",
137 | "advert_first int64\n",
138 | "advert_second int64\n",
139 | "advert_delta int64\n",
140 | "dtype: object"
141 | ]
142 | },
143 | "execution_count": 45,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "train_data.dtypes"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 8,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 |     "# reorder the columns so the click label sits last\n",
159 | "train_data = train_data[\n",
160 | " ['instance_id', 'time', 'city', 'province', 'user_tags', 'carrier',\n",
161 | " 'devtype', 'nnt', 'os', 'os_name', 'adid',\n",
162 | " 'advert_id', 'orderid', 'campaign_id',\n",
163 | " 'creative_id', 'creative_tp_dnf', 'app_cate_id', 'f_channel', 'app_id',\n",
164 | " 'creative_type', 'creative_width', 'creative_height',\n",
165 | " 'creative_is_jump', 'creative_is_download', 'creative_is_js',\n",
166 | " 'creative_is_voicead', 'creative_has_deeplink', 'app_paid','advert_first', 'advert_second', 'advert_delta',\n",
167 | " 'click']\n",
168 | "]"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 47,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/html": [
179 | "\n",
180 | "\n",
193 | "
\n",
194 | " \n",
195 | " \n",
196 | " | \n",
197 | " instance_id | \n",
198 | " time | \n",
199 | " city | \n",
200 | " province | \n",
201 | " user_tags | \n",
202 | " carrier | \n",
203 | " devtype | \n",
204 | " nnt | \n",
205 | " os | \n",
206 | " os_name | \n",
207 | " ... | \n",
208 | " creative_type | \n",
209 | " creative_width | \n",
210 | " creative_height | \n",
211 | " creative_is_jump | \n",
212 | " creative_is_download | \n",
213 | " creative_is_js | \n",
214 | " creative_is_voicead | \n",
215 | " creative_has_deeplink | \n",
216 | " app_paid | \n",
217 | " click | \n",
218 | "
\n",
219 | " \n",
220 | " \n",
221 | " \n",
222 | " 0 | \n",
223 | " 86294719979897807 | \n",
224 | " 2190219034 | \n",
225 | " 137103102105100 | \n",
226 | " 137103102100100 | \n",
227 | " 0 | \n",
228 | " 1 | \n",
229 | " 2 | \n",
230 | " 1 | \n",
231 | " 2 | \n",
232 | " 1 | \n",
233 | " ... | \n",
234 | " 8 | \n",
235 | " 1280 | \n",
236 | " 720 | \n",
237 | " 1 | \n",
238 | " 0 | \n",
239 | " 0 | \n",
240 | " 0 | \n",
241 | " 0 | \n",
242 | " 0 | \n",
243 | " 0 | \n",
244 | "
\n",
245 | " \n",
246 | " 1 | \n",
247 | " 2699289844928136052 | \n",
248 | " 2190221070 | \n",
249 | " 137105101100100 | \n",
250 | " 137105101100100 | \n",
251 | " 785 | \n",
252 | " 3 | \n",
253 | " 2 | \n",
254 | " 1 | \n",
255 | " 2 | \n",
256 | " 1 | \n",
257 | " ... | \n",
258 | " 8 | \n",
259 | " 960 | \n",
260 | " 640 | \n",
261 | " 1 | \n",
262 | " 0 | \n",
263 | " 0 | \n",
264 | " 0 | \n",
265 | " 0 | \n",
266 | " 0 | \n",
267 | " 0 | \n",
268 | "
\n",
269 | " \n",
270 | " 2 | \n",
271 | " 3117527168445845752 | \n",
272 | " 2190219793 | \n",
273 | " 137103104111100 | \n",
274 | " 137103104100100 | \n",
275 | " 0 | \n",
276 | " 3 | \n",
277 | " 2 | \n",
278 | " 1 | \n",
279 | " 2 | \n",
280 | " 1 | \n",
281 | " ... | \n",
282 | " 8 | \n",
283 | " 960 | \n",
284 | " 640 | \n",
285 | " 1 | \n",
286 | " 0 | \n",
287 | " 0 | \n",
288 | " 0 | \n",
289 | " 0 | \n",
290 | " 0 | \n",
291 | " 0 | \n",
292 | "
\n",
293 | " \n",
294 | " 3 | \n",
295 | " 3398484891050993371 | \n",
296 | " 2190221704 | \n",
297 | " 137103102113100 | \n",
298 | " 137103102100100 | \n",
299 | " 339 | \n",
300 | " 0 | \n",
301 | " 2 | \n",
302 | " 1 | \n",
303 | " 2 | \n",
304 | " 1 | \n",
305 | " ... | \n",
306 | " 3 | \n",
307 | " 1280 | \n",
308 | " 720 | \n",
309 | " 1 | \n",
310 | " 0 | \n",
311 | " 0 | \n",
312 | " 0 | \n",
313 | " 0 | \n",
314 | " 0 | \n",
315 | " 0 | \n",
316 | "
\n",
317 | " \n",
318 | " 4 | \n",
319 | " 2035477570591176488 | \n",
320 | " 2190220024 | \n",
321 | " 137103102109100 | \n",
322 | " 137103102100100 | \n",
323 | " 0 | \n",
324 | " 1 | \n",
325 | " 2 | \n",
326 | " 3 | \n",
327 | " 1 | \n",
328 | " 2 | \n",
329 | " ... | \n",
330 | " 8 | \n",
331 | " 960 | \n",
332 | " 640 | \n",
333 | " 1 | \n",
334 | " 0 | \n",
335 | " 0 | \n",
336 | " 0 | \n",
337 | " 0 | \n",
338 | " 0 | \n",
339 | " 0 | \n",
340 | "
\n",
341 | " \n",
342 | "
\n",
343 | "
5 rows × 32 columns
\n",
344 | "
"
345 | ],
346 | "text/plain": [
347 | " instance_id time city province \\\n",
348 | "0 86294719979897807 2190219034 137103102105100 137103102100100 \n",
349 | "1 2699289844928136052 2190221070 137105101100100 137105101100100 \n",
350 | "2 3117527168445845752 2190219793 137103104111100 137103104100100 \n",
351 | "3 3398484891050993371 2190221704 137103102113100 137103102100100 \n",
352 | "4 2035477570591176488 2190220024 137103102109100 137103102100100 \n",
353 | "\n",
354 | " user_tags carrier devtype nnt os os_name ... creative_type \\\n",
355 | "0 0 1 2 1 2 1 ... 8 \n",
356 | "1 785 3 2 1 2 1 ... 8 \n",
357 | "2 0 3 2 1 2 1 ... 8 \n",
358 | "3 339 0 2 1 2 1 ... 3 \n",
359 | "4 0 1 2 3 1 2 ... 8 \n",
360 | "\n",
361 | " creative_width creative_height creative_is_jump creative_is_download \\\n",
362 | "0 1280 720 1 0 \n",
363 | "1 960 640 1 0 \n",
364 | "2 960 640 1 0 \n",
365 | "3 1280 720 1 0 \n",
366 | "4 960 640 1 0 \n",
367 | "\n",
368 | " creative_is_js creative_is_voicead creative_has_deeplink app_paid click \n",
369 | "0 0 0 0 0 0 \n",
370 | "1 0 0 0 0 0 \n",
371 | "2 0 0 0 0 0 \n",
372 | "3 0 0 0 0 0 \n",
373 | "4 0 0 0 0 0 \n",
374 | "\n",
375 | "[5 rows x 32 columns]"
376 | ]
377 | },
378 | "execution_count": 47,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "train_data.head()"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 9,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "test_data.to_csv('./data/test_data.csv')\n",
394 | "train_data.to_csv('./data/train_data.csv')"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": []
403 | }
404 | ],
405 | "metadata": {
406 | "kernelspec": {
407 | "display_name": "Python 3",
408 | "language": "python",
409 | "name": "python3"
410 | },
411 | "language_info": {
412 | "codemirror_mode": {
413 | "name": "ipython",
414 | "version": 3
415 | },
416 | "file_extension": ".py",
417 | "mimetype": "text/x-python",
418 | "name": "python",
419 | "nbconvert_exporter": "python",
420 | "pygments_lexer": "ipython3",
421 | "version": "3.6.4"
422 | }
423 | },
424 | "nbformat": 4,
425 | "nbformat_minor": 2
426 | }
427 |
--------------------------------------------------------------------------------
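
The getInterval/getInterval2 helpers above walk the frame row by row with apply. A vectorized sketch of the same split, assuming advert_industry_inner values look like "1001_2002" (the toy frame is illustrative, not the competition data):

```python
import pandas as pd

# Minimal sketch: str.split with expand=True does both splits in one pass.
df = pd.DataFrame({'advert_industry_inner': ['1001_2002', '1003_2004']})
parts = df['advert_industry_inner'].str.split('_', expand=True)
df['advert_first'] = parts[0].astype('int64')
df['advert_second'] = parts[1].astype('int64')
df['advert_delta'] = df['advert_second'] - df['advert_first']
print(df)
```
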
/讯飞CTR预测/RXY初版/lambda_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "import time"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "train_path='./data/round1_iflyad_train.txt'\n",
21 | "test_path='./data/round1_iflyad_test_feature.txt'"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 5,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "all_data=pd.read_table(train_path)\n",
31 | "#print(all_data.head(10))\n",
32 | "all_test=pd.read_table(test_path)\n",
33 | "#print(all_test.head(10))\n",
34 | "#将时间戳转化为正常时间\n",
35 | "all_data['time_string']=all_data[\"time\"].apply(lambda x:time.strftime(\"%Y-%m-%d %H:%M:%S\",time.localtime(x)))\n",
36 | "all_data['time_string']=pd.to_datetime(all_data[\"time_string\"])\n",
37 | "all_data[\"hour\"]=all_data[\"time_string\"].dt.hour\n",
38 | "all_data[\"day\"]=all_data[\"time_string\"].dt.day\n",
39 | "all_data[\"day\"]=all_data[\"day\"].apply(lambda x:x-27 if x>=27 else x+4)\n",
40 | "\n",
41 | "all_test['time_string']=all_test[\"time\"].apply(lambda x:time.strftime(\"%Y-%m-%d %H:%M:%S\",time.localtime(x)))\n",
42 | "all_test['time_string']=pd.to_datetime(all_test[\"time_string\"])\n",
43 | "all_test[\"hour\"]=all_test[\"time_string\"].dt.hour\n",
44 | "all_test[\"day\"]=all_test[\"time_string\"].dt.day\n",
45 | "all_test[\"day\"]=all_test[\"day\"].apply(lambda x:x-27 if x>=27 else x+4)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 8,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "all_data = all_data.fillna(0)\n",
55 | "all_test = all_test.fillna(0)\n",
56 | "\n",
57 | "all_data['user_tags']=all_data['user_tags'].apply(lambda x:len(x) if not x==0 else 0)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 9,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/html": [
68 | "\n",
69 | "\n",
82 | "
\n",
83 | " \n",
84 | " \n",
85 | " | \n",
86 | " instance_id | \n",
87 | " time | \n",
88 | " city | \n",
89 | " province | \n",
90 | " user_tags | \n",
91 | " carrier | \n",
92 | " devtype | \n",
93 | " make | \n",
94 | " model | \n",
95 | " nnt | \n",
96 | " ... | \n",
97 | " creative_is_download | \n",
98 | " creative_is_js | \n",
99 | " creative_is_voicead | \n",
100 | " creative_has_deeplink | \n",
101 | " app_paid | \n",
102 | " advert_name | \n",
103 | " click | \n",
104 | " time_string | \n",
105 | " hour | \n",
106 | " day | \n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " 0 | \n",
112 | " 86294719979897807 | \n",
113 | " 2190219034 | \n",
114 | " 137103102105100 | \n",
115 | " 137103102100100 | \n",
116 | " 0 | \n",
117 | " 1 | \n",
118 | " 2 | \n",
119 | " HUAWEI | \n",
120 | " HUAWEI-CAZ-AL10 | \n",
121 | " 1 | \n",
122 | " ... | \n",
123 | " False | \n",
124 | " False | \n",
125 | " False | \n",
126 | " False | \n",
127 | " False | \n",
128 | " B4734117F35EE97F | \n",
129 | " 0 | \n",
130 | " 2039-05-29 02:10:34 | \n",
131 | " 2 | \n",
132 | " 2 | \n",
133 | "
\n",
134 | " \n",
135 | " 1 | \n",
136 | " 2699289844928136052 | \n",
137 | " 2190221070 | \n",
138 | " 137105101100100 | \n",
139 | " 137105101100100 | \n",
140 | " 785 | \n",
141 | " 3 | \n",
142 | " 2 | \n",
143 | " Xiaomi | \n",
144 | " Redmi Note 4 | \n",
145 | " 1 | \n",
146 | " ... | \n",
147 | " False | \n",
148 | " False | \n",
149 | " False | \n",
150 | " False | \n",
151 | " False | \n",
152 | " B4734117F35EE97F | \n",
153 | " 0 | \n",
154 | " 2039-05-29 02:44:30 | \n",
155 | " 2 | \n",
156 | " 2 | \n",
157 | "
\n",
158 | " \n",
159 | " 2 | \n",
160 | " 3117527168445845752 | \n",
161 | " 2190219793 | \n",
162 | " 137103104111100 | \n",
163 | " 137103104100100 | \n",
164 | " 0 | \n",
165 | " 3 | \n",
166 | " 2 | \n",
167 | " OPPO | \n",
168 | " OPPO+R11s | \n",
169 | " 1 | \n",
170 | " ... | \n",
171 | " False | \n",
172 | " False | \n",
173 | " False | \n",
174 | " False | \n",
175 | " False | \n",
176 | " E257895F74792E81 | \n",
177 | " 0 | \n",
178 | " 2039-05-29 02:23:13 | \n",
179 | " 2 | \n",
180 | " 2 | \n",
181 | "
\n",
182 | " \n",
183 | " 3 | \n",
184 | " 3398484891050993371 | \n",
185 | " 2190221704 | \n",
186 | " 137103102113100 | \n",
187 | " 137103102100100 | \n",
188 | " 339 | \n",
189 | " 0 | \n",
190 | " 2 | \n",
191 | " 0 | \n",
192 | " OPPO A57 | \n",
193 | " 1 | \n",
194 | " ... | \n",
195 | " False | \n",
196 | " False | \n",
197 | " False | \n",
198 | " False | \n",
199 | " False | \n",
200 | " 0A421D7B11EABFC5 | \n",
201 | " 0 | \n",
202 | " 2039-05-29 02:55:04 | \n",
203 | " 2 | \n",
204 | " 2 | \n",
205 | "
\n",
206 | " \n",
207 | " 4 | \n",
208 | " 2035477570591176488 | \n",
209 | " 2190220024 | \n",
210 | " 137103102109100 | \n",
211 | " 137103102100100 | \n",
212 | " 0 | \n",
213 | " 1 | \n",
214 | " 2 | \n",
215 | " Apple | \n",
216 | " iPhone 7 | \n",
217 | " 3 | \n",
218 | " ... | \n",
219 | " False | \n",
220 | " False | \n",
221 | " False | \n",
222 | " False | \n",
223 | " False | \n",
224 | " B4734117F35EE97F | \n",
225 | " 0 | \n",
226 | " 2039-05-29 02:27:04 | \n",
227 | " 2 | \n",
228 | " 2 | \n",
229 | "
\n",
230 | " \n",
231 | " 5 | \n",
232 | " 2065527640347419040 | \n",
233 | " 2190221228 | \n",
234 | " 137104104104100 | \n",
235 | " 137104104100100 | \n",
236 | " 271 | \n",
237 | " 1 | \n",
238 | " 2 | \n",
239 | " Xiaomi,MI 6,sagit | \n",
240 | " MI 6 | \n",
241 | " 1 | \n",
242 | " ... | \n",
243 | " False | \n",
244 | " False | \n",
245 | " False | \n",
246 | " False | \n",
247 | " False | \n",
248 | " 862FF2E9B0AD4C14 | \n",
249 | " 0 | \n",
250 | " 2039-05-29 02:47:08 | \n",
251 | " 2 | \n",
252 | " 2 | \n",
253 | "
\n",
254 | " \n",
255 | "
\n",
256 | "
6 rows × 38 columns
\n",
257 | "
"
258 | ],
259 | "text/plain": [
260 | " instance_id time city province \\\n",
261 | "0 86294719979897807 2190219034 137103102105100 137103102100100 \n",
262 | "1 2699289844928136052 2190221070 137105101100100 137105101100100 \n",
263 | "2 3117527168445845752 2190219793 137103104111100 137103104100100 \n",
264 | "3 3398484891050993371 2190221704 137103102113100 137103102100100 \n",
265 | "4 2035477570591176488 2190220024 137103102109100 137103102100100 \n",
266 | "5 2065527640347419040 2190221228 137104104104100 137104104100100 \n",
267 | "\n",
268 | " user_tags carrier devtype make model nnt ... \\\n",
269 | "0 0 1 2 HUAWEI HUAWEI-CAZ-AL10 1 ... \n",
270 | "1 785 3 2 Xiaomi Redmi Note 4 1 ... \n",
271 | "2 0 3 2 OPPO OPPO+R11s 1 ... \n",
272 | "3 339 0 2 0 OPPO A57 1 ... \n",
273 | "4 0 1 2 Apple iPhone 7 3 ... \n",
274 | "5 271 1 2 Xiaomi,MI 6,sagit MI 6 1 ... \n",
275 | "\n",
276 | " creative_is_download creative_is_js creative_is_voicead \\\n",
277 | "0 False False False \n",
278 | "1 False False False \n",
279 | "2 False False False \n",
280 | "3 False False False \n",
281 | "4 False False False \n",
282 | "5 False False False \n",
283 | "\n",
284 | " creative_has_deeplink app_paid advert_name click \\\n",
285 | "0 False False B4734117F35EE97F 0 \n",
286 | "1 False False B4734117F35EE97F 0 \n",
287 | "2 False False E257895F74792E81 0 \n",
288 | "3 False False 0A421D7B11EABFC5 0 \n",
289 | "4 False False B4734117F35EE97F 0 \n",
290 | "5 False False 862FF2E9B0AD4C14 0 \n",
291 | "\n",
292 | " time_string hour day \n",
293 | "0 2039-05-29 02:10:34 2 2 \n",
294 | "1 2039-05-29 02:44:30 2 2 \n",
295 | "2 2039-05-29 02:23:13 2 2 \n",
296 | "3 2039-05-29 02:55:04 2 2 \n",
297 | "4 2039-05-29 02:27:04 2 2 \n",
298 | "5 2039-05-29 02:47:08 2 2 \n",
299 | "\n",
300 | "[6 rows x 38 columns]"
301 | ]
302 | },
303 | "execution_count": 9,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "all_data.head(6)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 10,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "name": "stdout",
319 | "output_type": "stream",
320 | "text": [
321 | "(151301, 38)\n"
322 | ]
323 | }
324 | ],
325 | "source": [
326 | "features1=all_data[(all_data['day']>=0)&(all_data['day']<=4)]\n",
327 | "dataset1=all_data[all_data['day']==5]\n",
328 | "print(dataset1.shape)\n",
329 | "features2=all_data[(all_data['day']>=0)&(all_data['day']<=5)]\n",
330 | "dataset2=all_data[all_data['day']==6]\n",
331 | "features3=all_data[(all_data['day']>=0)&(all_data['day']<=6)]\n",
332 | "dataset3=all_test"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": []
341 | }
342 | ],
343 | "metadata": {
344 | "kernelspec": {
345 | "display_name": "Python 3",
346 | "language": "python",
347 | "name": "python3"
348 | },
349 | "language_info": {
350 | "codemirror_mode": {
351 | "name": "ipython",
352 | "version": 3
353 | },
354 | "file_extension": ".py",
355 | "mimetype": "text/x-python",
356 | "name": "python",
357 | "nbconvert_exporter": "python",
358 | "pygments_lexer": "ipython3",
359 | "version": "3.6.4"
360 | }
361 | },
362 | "nbformat": 4,
363 | "nbformat_minor": 2
364 | }
365 |
--------------------------------------------------------------------------------
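
The per-row time.strftime plus pd.to_datetime round trip above can be collapsed into one vectorized call; a sketch (caveat: time.localtime uses the local timezone while unit='s' yields naive UTC, so the derived hours can differ):

```python
import pandas as pd

# Minimal sketch: convert Unix timestamps directly, then derive hour/day.
s = pd.Series([2190219034, 2190221070], name='time')  # sample values from the data
ts = pd.to_datetime(s, unit='s')                      # naive UTC, unlike time.localtime
hour = ts.dt.hour
day = ts.dt.day.apply(lambda x: x - 27 if x >= 27 else x + 4)  # same remapping as the notebook
print(ts.tolist(), hour.tolist(), day.tolist())
```
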
/讯飞CTR预测/RXY初版/pandas_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 46,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | " 0 1\n",
23 | "0 1900_2000 1960_2452\n",
24 | "1 1854_1965 2002_2150\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "a = pd.DataFrame([['1900_2000','1960_2452'],['1854_1965','2002_2150']])\n",
30 | "print(a)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 14,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "def dataInterval(data1,data2):\n",
40 | " d1 = datetime.datetime.strptime(data1, '%Y-%m-%d')\n",
41 | " d2 = datetime.datetime.strptime(data2, '%Y-%m-%d')\n",
42 | " delta = d1 - d2\n",
43 | " return delta.days\n",
44 | "\n",
45 | "def getInterval(arrLike): #用来计算日期间隔天数的调用的函数\n",
46 | " PublishedTime = arrLike['PublishedTime']\n",
47 | " ReceivedTime = arrLike['ReceivedTime']\n",
48 | "# print(PublishedTime.strip(),ReceivedTime.strip())\n",
49 | " days = dataInterval(PublishedTime.strip(),ReceivedTime.strip()) #注意去掉两端空白\n",
50 | " return days\n",
51 | "\n",
52 | "if __name__ == '__main__': \n",
53 | " fileName = \"NS_new.xls\";\n",
54 | " df = pd.read_excel(fileName) \n",
55 | " df['TimeInterval'] = df.apply(getInterval , axis = 1)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 50,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "def dataInterval(data1):\n",
65 | " d1 = data1.split('_')[0]\n",
66 | " return d1\n",
67 | "\n",
68 | "def getInterval(arrLike): #用来计算日期间隔天数的调用的函数\n",
69 | " PublishedTime = arrLike[0]\n",
70 | "# print(PublishedTime.strip(),ReceivedTime.strip())\n",
71 | " d1 = dataInterval(PublishedTime) #注意去掉两端空白\n",
72 | " return d1\n",
73 | "\n",
74 | "if __name__ == '__main__':\n",
75 | " a['TimeInterval'] = a.apply(getInterval , axis = 1)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 51,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | " 0 1 TimeInterval\n",
88 | "0 1900_2000 1960_2452 1900\n",
89 | "1 1854_1965 2002_2150 1854\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "print(a)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 41,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "0 [da, ddasasd]\n",
106 | "1 [dsda, das]\n",
107 | "Name: 2, dtype: object"
108 | ]
109 | },
110 | "execution_count": 41,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "a['2'].str.split('_')"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "import time\n",
124 | "for i in range:\n",
125 | " time.sleep(5)\n",
126 | " print(i)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 42,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "f = lambda x:x[0]"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 45,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "name": "stdout",
145 | "output_type": "stream",
146 | "text": [
147 | "['da', 'dsda']\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "print(list(map(f, a['2'].str.split('_'))))"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 53,
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "1\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "print(1)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 56,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "name": "stdout",
179 | "output_type": "stream",
180 | "text": [
181 | "2\n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "print(2)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 60,
192 | "metadata": {
193 | "scrolled": true
194 | },
195 | "outputs": [
196 | {
197 | "name": "stdout",
198 | "output_type": "stream",
199 | "text": [
200 | "0\n",
201 | "1\n",
202 | "2\n"
203 | ]
204 | },
205 | {
206 | "ename": "KeyboardInterrupt",
207 | "evalue": "",
208 | "output_type": "error",
209 | "traceback": [
210 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
211 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
212 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10000\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
213 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "import time\n",
219 | "for i in range(10000):\n",
220 | " time.sleep(2)\n",
221 | " print(i)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 58,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "name": "stdout",
231 | "output_type": "stream",
232 | "text": [
233 | "1\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "print(1)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 61,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "name": "stdout",
248 | "output_type": "stream",
249 | "text": [
250 | " 0 1 TimeInterval\n",
251 | "0 1900_2000 1960_2452 1900\n",
252 | "1 1854_1965 2002_2150 1854\n"
253 | ]
254 | }
255 | ],
256 | "source": [
257 | "print(a)"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": []
266 | }
267 | ],
268 | "metadata": {
269 | "kernelspec": {
270 | "display_name": "Python 3",
271 | "language": "python",
272 | "name": "python3"
273 | },
274 | "language_info": {
275 | "codemirror_mode": {
276 | "name": "ipython",
277 | "version": 3
278 | },
279 | "file_extension": ".py",
280 | "mimetype": "text/x-python",
281 | "name": "python",
282 | "nbconvert_exporter": "python",
283 | "pygments_lexer": "ipython3",
284 | "version": "3.6.4"
285 | }
286 | },
287 | "nbformat": 4,
288 | "nbformat_minor": 2
289 | }
290 |
--------------------------------------------------------------------------------
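
The apply/map experiments above can be written as one chain with the .str accessor; a sketch on the same toy frame:

```python
import pandas as pd

# Minimal sketch: split on '_' and keep the first piece, fully vectorized.
a = pd.DataFrame([['1900_2000', '1960_2452'], ['1854_1965', '2002_2150']])
a['TimeInterval'] = a[0].str.split('_').str[0]
print(a)  # same result as the apply-based getInterval above
```
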
/讯飞CTR预测/digi_onehot.py:
--------------------------------------------------------------------------------
1 | from numpy import argmax
2 | # define input string
3 | data = 'hello world'
4 | print(data)
5 | # define universe of possible input values
6 | alphabet = 'abcdefghijklmnopqrstuvwxyz '
7 | # define a mapping of chars to integers
8 | char_to_int = dict((c, i) for i, c in enumerate(alphabet))
9 | int_to_char = dict((i, c) for i, c in enumerate(alphabet))
10 | # integer encode input data
11 | integer_encoded = [char_to_int[char] for char in data]
12 | print(integer_encoded)
13 |
14 | onehot_encoded = list()
15 | for value in integer_encoded:
16 | letter = [0 for _ in range(len(alphabet))]
17 | letter[value] = 1
18 | onehot_encoded.append(letter)
19 | print(onehot_encoded)
20 |
21 | inverted = int_to_char[argmax(onehot_encoded[0])]
22 | print(inverted)
--------------------------------------------------------------------------------
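
The hand-rolled encode loop in digi_onehot.py can be replaced by a single np.eye row lookup; a sketch with the same alphabet:

```python
import numpy as np

# Minimal sketch: each integer indexes a row of the identity matrix,
# which is exactly that character's one-hot vector.
alphabet = 'abcdefghijklmnopqrstuvwxyz '
char_to_int = {c: i for i, c in enumerate(alphabet)}
integer_encoded = [char_to_int[ch] for ch in 'hello world']
onehot = np.eye(len(alphabet), dtype=int)[integer_encoded]
print(onehot.shape)  # (11, 27)
```
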
/讯飞CTR预测/one_hot_test.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.preprocessing import OneHotEncoder
3 |
4 | df2 = pd.DataFrame({'id': [3566841, 6541227, 3512441],
5 | 'sex': [1, 2, 2],
6 | 'level': [3, 1, 2]})
7 |
8 | id_data = df2.values[:, :1]
9 | transform_data = df2.values[:, 1:]
10 |
11 | enc = OneHotEncoder()
12 | df2_new = enc.fit_transform(transform_data).toarray()
13 |
14 | # combine the id column with the one-hot features
15 | df2_all = pd.concat((pd.DataFrame(id_data),pd.DataFrame(df2_new)),axis=1)
16 | print(df2_all)
--------------------------------------------------------------------------------
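
For labelled columns, pd.get_dummies builds the same encoding as one_hot_test.py without splitting off and re-concatenating the id column; a sketch on the same toy frame:

```python
import pandas as pd

# Minimal sketch: one-hot only the listed columns; id passes through untouched.
df2 = pd.DataFrame({'id': [3566841, 6541227, 3512441],
                    'sex': [1, 2, 2],
                    'level': [3, 1, 2]})
df2_all = pd.get_dummies(df2, columns=['sex', 'level'])
print(df2_all)
```
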
/讯飞CTR预测/川哥版/_1_extract_features.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import numpy as np
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | import matplotlib as mpl
6 | from scipy import interpolate
7 | import seaborn as sns
8 |
9 | import time
10 | from utils import *
11 |
12 | train_path='./data/round1_iflyad_train.txt'
13 | test_path='./data/round1_iflyad_test_feature.txt'
14 |
15 |
16 | all_data=pd.read_table(train_path)
17 | #print(all_data.head(10))
18 | all_test=pd.read_table(test_path)
19 | #print(all_test.head(10))
20 | # convert Unix timestamps to normal datetimes
21 | all_data['time_string']=all_data["time"].apply(lambda x:time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x)))
22 | all_data['time_string']=pd.to_datetime(all_data["time_string"])
23 | all_data["hour"]=all_data["time_string"].dt.hour
24 | all_data["day"]=all_data["time_string"].dt.day
25 | all_data["day"]=all_data["day"].apply(lambda x:x-27 if x>=27 else x+4)
26 |
27 | all_test['time_string']=all_test["time"].apply(lambda x:time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x)))
28 | all_test['time_string']=pd.to_datetime(all_test["time_string"])
29 | all_test["hour"]=all_test["time_string"].dt.hour
30 | all_test["day"]=all_test["time_string"].dt.day
31 | all_test["day"]=all_test["day"].apply(lambda x:x-27 if x>=27 else x+4)
32 |
33 | # split into training and test windows
34 | # raw days 27,28,29,30,31,1,2,3 map to 0..7 after the remapping above
35 | '''
36 | feature_extract_dataset(day) label
37 | train1 0-4 5
38 | train2(offline_test) 1-5 6
39 | online_test 2-6 7
40 | '''
41 | features1=all_data[(all_data['day']>=0)&(all_data['day']<=4)]
42 | dataset1=all_data[all_data['day']==5]
43 | print(dataset1.shape)
44 | features2=all_data[(all_data['day']>=1)&(all_data['day']<=5)]
45 | dataset2=all_data[all_data['day']==6]
46 | features3=all_data[(all_data['day']>=2)&(all_data['day']<=6)]
47 | dataset3=all_test
48 | '''
49 | aa=all_data[['inner_slot_id']]
50 | bb=all_test[['inner_slot_id']]
51 | cc=pd.concat([aa,bb],axis=0)
52 | cc.drop_duplicates(inplace=True)
53 | df=pd.get_dummies(cc['inner_slot_id'],prefix='inner_slot_id')
54 | inner_slot_id_one_hot=pd.concat([cc,df],axis=1)
55 | '''
56 | # build features:
57 | # click count, conversion rate, conversion count
58 | t1=features1[['adid','click']]
59 | t1=get_type_features(t1,['adid'],'click',"sum",'adid_click_num')
60 | t1=get_type_features(t1,['adid'],'click',"count",'adid_click_cnt')
61 | t1=get_type_features(t1,['adid'],'click',"mean",'adid_click_radio')
62 | t11=t1[['adid','adid_click_num','adid_click_cnt','adid_click_radio']]
63 | t11.drop_duplicates(inplace=True)
64 | print(t1.head(10))
65 | t2=features1[['app_id','click']]
66 | t2=get_type_features(t2,['app_id'],'click',"sum",'appid_click_num')
67 | t2=get_type_features(t2,['app_id'],'click',"count",'appid_click_cnt')
68 | t2=get_type_features(t2,['app_id'],'click',"mean",'appid_click_radio')
69 | t21=t2[['app_id','appid_click_num','appid_click_cnt','appid_click_radio']]
70 | t21.drop_duplicates(inplace=True)
71 |
72 | t3=features1[['app_id','adid','click']]
73 | t3=get_type_features(t3,['app_id','adid'],'click',"sum",'appid_ad_click_num')
74 | t3=get_type_features(t3,['app_id','adid'],'click',"count",'appid_ad_click_cnt')
75 | t3=get_type_features(t3,['app_id','adid'],'click',"mean",'appid_ad_click_radio')
76 | t31=t3[['app_id','adid','appid_ad_click_num','appid_ad_click_cnt','appid_ad_click_radio']]
77 | t31.drop_duplicates(inplace=True)
78 |
79 | t4=features1[['orderid','click']]
80 | t4=get_type_features(t4,['orderid'],'click',"sum",'orderid_click_num')
81 | t4=get_type_features(t4,['orderid'],'click',"count",'orderid_click_cnt')
82 | t4=get_type_features(t4,['orderid'],'click',"mean",'orderid_click_radio')
83 | t41=t4[['orderid','orderid_click_num','orderid_click_cnt','orderid_click_radio']]
84 | t41.drop_duplicates(inplace=True)
85 |
86 | t5=features1[['inner_slot_id','click']]
87 | t5=get_type_features(t5,['inner_slot_id'],'click',"sum",'inner_slot_id_click_num')
88 | t5=get_type_features(t5,['inner_slot_id'],'click',"count",'inner_slot_id_click_cnt')
89 | t5=get_type_features(t5,['inner_slot_id'],'click',"mean",'inner_slot_id_click_radio')
90 | t51=t5[['inner_slot_id','inner_slot_id_click_num','inner_slot_id_click_cnt','inner_slot_id_click_radio']]
91 | t51.drop_duplicates(inplace=True)
92 |
93 | t6=features1[['inner_slot_id','nnt','click']]
94 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"sum",'inner_slot_id_nnt_click_num')
95 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"count",'inner_slot_id_nnt_click_cnt')
96 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"mean",'inner_slot_id_nnt_click_radio')
97 | t61=t6[['inner_slot_id','nnt','inner_slot_id_nnt_click_num','inner_slot_id_nnt_click_cnt','inner_slot_id_nnt_click_radio']]
98 | t61.drop_duplicates(inplace=True)
99 |
100 |
101 | dataset1=dataset1.merge(t11,on=['adid'],how='left')
102 | dataset1=dataset1.merge(t21,on=['app_id'],how='left')
103 | dataset1=dataset1.merge(t31,on=['app_id','adid'],how='left')
104 | dataset1=dataset1.merge(t41,on=['orderid'],how='left')
105 | dataset1=dataset1.merge(t51,on=['inner_slot_id'],how='left')
106 | dataset1=dataset1.merge(t61,on=['inner_slot_id','nnt'],how='left')
107 | #dataset1=dataset1.merge(inner_slot_id_one_hot,on=['inner_slot_id'],how='left')
108 | #dataset1['creative_shape']=dataset1['creative_height']*dataset1['creative_width']
109 | # save the extracted features
110 | dataset1.to_csv('features/feature1.csv',index=None)
111 | #############################################################################################
112 | #features2
113 | t1=features2[['adid','click']]
114 | t1=get_type_features(t1,['adid'],'click',"sum",'adid_click_num')
115 | t1=get_type_features(t1,['adid'],'click',"count",'adid_click_cnt')
116 | t1=get_type_features(t1,['adid'],'click',"mean",'adid_click_radio')
117 | t11=t1[['adid','adid_click_num','adid_click_cnt','adid_click_radio']]
118 | t11.drop_duplicates(inplace=True)
119 | #print(t1.head(10))
120 | t2=features2[['app_id','click']]
121 | t2=get_type_features(t2,['app_id'],'click',"sum",'appid_click_num')
122 | t2=get_type_features(t2,['app_id'],'click',"count",'appid_click_cnt')
123 | t2=get_type_features(t2,['app_id'],'click',"mean",'appid_click_radio')
124 | t21=t2[['app_id','appid_click_num','appid_click_cnt','appid_click_radio']]
125 | t21.drop_duplicates(inplace=True)
126 |
127 | t3=features2[['app_id','adid','click']]
128 | t3=get_type_features(t3,['app_id','adid'],'click',"sum",'appid_ad_click_num')
129 | t3=get_type_features(t3,['app_id','adid'],'click',"count",'appid_ad_click_cnt')
130 | t3=get_type_features(t3,['app_id','adid'],'click',"mean",'appid_ad_click_radio')
131 | t31=t3[['app_id','adid','appid_ad_click_num','appid_ad_click_cnt','appid_ad_click_radio']]
132 | t31.drop_duplicates(inplace=True)
133 |
134 | t4=features2[['orderid','click']]
135 | t4=get_type_features(t4,['orderid'],'click',"sum",'orderid_click_num')
136 | t4=get_type_features(t4,['orderid'],'click',"count",'orderid_click_cnt')
137 | t4=get_type_features(t4,['orderid'],'click',"mean",'orderid_click_radio')
138 | t41=t4[['orderid','orderid_click_num','orderid_click_cnt','orderid_click_radio']]
139 | t41.drop_duplicates(inplace=True)
140 |
141 | t5=features2[['inner_slot_id','click']]
142 | t5=get_type_features(t5,['inner_slot_id'],'click',"sum",'inner_slot_id_click_num')
143 | t5=get_type_features(t5,['inner_slot_id'],'click',"count",'inner_slot_id_click_cnt')
144 | t5=get_type_features(t5,['inner_slot_id'],'click',"mean",'inner_slot_id_click_radio')
145 | t51=t5[['inner_slot_id','inner_slot_id_click_num','inner_slot_id_click_cnt','inner_slot_id_click_radio']]
146 | t51.drop_duplicates(inplace=True)
147 |
148 | t6=features2[['inner_slot_id','nnt','click']]
149 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"sum",'inner_slot_id_nnt_click_num')
150 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"count",'inner_slot_id_nnt_click_cnt')
151 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"mean",'inner_slot_id_nnt_click_radio')
152 | t61=t6[['inner_slot_id','nnt','inner_slot_id_nnt_click_num','inner_slot_id_nnt_click_cnt','inner_slot_id_nnt_click_radio']]
153 | t61.drop_duplicates(inplace=True)
154 | #one-hot
155 | #inner_slot_id_df=pd.get_dummies(dataset2['inner_slot_id'],prefix='inner_slot_id')
156 | dataset2=dataset2.merge(t11,on=['adid'],how='left')
157 | dataset2=dataset2.merge(t21,on=['app_id'],how='left')
158 | dataset2=dataset2.merge(t31,on=['app_id','adid'],how='left')
159 | dataset2=dataset2.merge(t41,on=['orderid'],how='left')
160 | dataset2=dataset2.merge(t51,on=['inner_slot_id'],how='left')
161 | dataset2=dataset2.merge(t61,on=['inner_slot_id','nnt'],how='left')
162 | #dataset2=dataset2.merge(inner_slot_id_one_hot,on=['inner_slot_id'],how='left')
163 | #dataset2['creative_shape']=dataset2['creative_height']*dataset2['creative_width']
164 |
165 | dataset2.to_csv('features/feature2.csv',index=None)
166 | ##################################################################################################
167 | # test set
168 | #features3
169 | t1=features3[['adid','click']]
170 | t1=get_type_features(t1,['adid'],'click',"sum",'adid_click_num')
171 | t1=get_type_features(t1,['adid'],'click',"count",'adid_click_cnt')
172 | t1=get_type_features(t1,['adid'],'click',"mean",'adid_click_radio')
173 | t11=t1[['adid','adid_click_num','adid_click_cnt','adid_click_radio']]
174 | t11.drop_duplicates(inplace=True)
175 | print(t1.head(10))
176 | t2=features3[['app_id','click']]
177 | t2=get_type_features(t2,['app_id'],'click',"sum",'appid_click_num')
178 | t2=get_type_features(t2,['app_id'],'click',"count",'appid_click_cnt')
179 | t2=get_type_features(t2,['app_id'],'click',"mean",'appid_click_radio')
180 | t21=t2[['app_id','appid_click_num','appid_click_cnt','appid_click_radio']]
181 | t21.drop_duplicates(inplace=True)
182 |
183 | t3=features3[['app_id','adid','click']]
184 | t3=get_type_features(t3,['app_id','adid'],'click',"sum",'appid_ad_click_num')
185 | t3=get_type_features(t3,['app_id','adid'],'click',"count",'appid_ad_click_cnt')
186 | t3=get_type_features(t3,['app_id','adid'],'click',"mean",'appid_ad_click_radio')
187 | t31=t3[['app_id','adid','appid_ad_click_num','appid_ad_click_cnt','appid_ad_click_radio']]
188 | t31.drop_duplicates(inplace=True)
189 |
190 | t4=features3[['orderid','click']]
191 | t4=get_type_features(t4,['orderid'],'click',"sum",'orderid_click_num')
192 | t4=get_type_features(t4,['orderid'],'click',"count",'orderid_click_cnt')
193 | t4=get_type_features(t4,['orderid'],'click',"mean",'orderid_click_radio')
194 | t41=t4[['orderid','orderid_click_num','orderid_click_cnt','orderid_click_radio']]
195 | t41.drop_duplicates(inplace=True)
196 |
197 | t5=features3[['inner_slot_id','click']]
198 | t5=get_type_features(t5,['inner_slot_id'],'click',"sum",'inner_slot_id_click_num')
199 | t5=get_type_features(t5,['inner_slot_id'],'click',"count",'inner_slot_id_click_cnt')
200 | t5=get_type_features(t5,['inner_slot_id'],'click',"mean",'inner_slot_id_click_radio')
201 | t51=t5[['inner_slot_id','inner_slot_id_click_num','inner_slot_id_click_cnt','inner_slot_id_click_radio']]
202 | t51.drop_duplicates(inplace=True)
203 |
204 | t6=features3[['inner_slot_id','nnt','click']]
205 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"sum",'inner_slot_id_nnt_click_num')
206 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"count",'inner_slot_id_nnt_click_cnt')
207 | t6=get_type_features(t6,['inner_slot_id','nnt'],'click',"mean",'inner_slot_id_nnt_click_radio')
208 | t61=t6[['inner_slot_id','nnt','inner_slot_id_nnt_click_num','inner_slot_id_nnt_click_cnt','inner_slot_id_nnt_click_radio']]
209 | t61.drop_duplicates(inplace=True)
210 |
211 | dataset3=dataset3.merge(t11,on=['adid'],how='left')
212 | dataset3=dataset3.merge(t21,on=['app_id'],how='left')
213 | dataset3=dataset3.merge(t31,on=['app_id','adid'],how='left')
214 | dataset3=dataset3.merge(t41,on=['orderid'],how='left')
215 | dataset3=dataset3.merge(t51,on=['inner_slot_id'],how='left')
216 | dataset3=dataset3.merge(t61,on=['inner_slot_id','nnt'],how='left')
217 | #dataset3=dataset3.merge(inner_slot_id_one_hot,on=['inner_slot_id'],how='left')
218 | #dataset3['creative_shape']=dataset3['creative_height']*dataset3['creative_width']
219 |
220 | dataset3.to_csv('features/online_test_features.csv',index=None)
--------------------------------------------------------------------------------
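
The three feature blocks in _1_extract_features.py differ only in the source window (features1/2/3) and the target frame (dataset1/2/3). A loop sketch that would produce the same statistics (column names are simplified here; an assumption about refactoring, not the original code):

```python
# Minimal sketch, reusing the script's frames (features1, dataset1, ...).
key_groups = [['adid'], ['app_id'], ['app_id', 'adid'],
              ['orderid'], ['inner_slot_id'], ['inner_slot_id', 'nnt']]

def build_click_stats(window, target):
    # one groupby per key set: conversion count / click count / conversion rate
    for keys in key_groups:
        prefix = '_'.join(keys)
        stats = (window.groupby(keys)['click']
                       .agg(['sum', 'count', 'mean'])
                       .rename(columns={'sum': prefix + '_click_num',
                                        'count': prefix + '_click_cnt',
                                        'mean': prefix + '_click_radio'})
                       .reset_index())
        target = target.merge(stats, on=keys, how='left')
    return target

# dataset1 = build_click_stats(features1, dataset1)  # likewise for 2 and 3
```
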
/讯飞CTR预测/川哥版/_2_train.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import xgboost as xgb
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.preprocessing import MinMaxScaler
6 | import datetime
7 |
8 | # training
9 | dataset1 = pd.read_csv('features/feature1.csv')
10 | #dataset1.click.replace(-1,0,inplace=True)
11 | dataset2 = pd.read_csv('features/feature2.csv')
12 | #dataset2.click.replace(-1,0,inplace=True)
13 | dataset3 = pd.read_csv('features/online_test_features.csv')
14 |
15 | dataset1.drop_duplicates(inplace=True)
16 | dataset2.drop_duplicates(inplace=True)
17 | dataset3.drop_duplicates(inplace=True)
18 |
19 | dataset1= dataset1.replace(np.nan,0)
20 | dataset2= dataset2.replace(np.nan,0)
21 | dataset3= dataset3.replace(np.nan,0)
22 |
23 | dataset12 = pd.concat([dataset1,dataset2],axis=0)
24 |
25 | dataset1_y = dataset1.click
26 | dataset1_x = dataset1.drop(['instance_id','click','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1) # 'day_gap_before','day_gap_after' cause overfitting, 0.77
27 | dataset2_y = dataset2.click
28 | dataset2_x = dataset2.drop(['instance_id','click','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1)
29 | dataset12_y = dataset12.click
30 | dataset12_x = dataset12.drop(['instance_id','click','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1)
31 | dataset3_preds = dataset3[['instance_id']]
32 | dataset3_x = dataset3.drop(['instance_id','time','time_string','day','user_tags','make','model','advert_industry_inner','advert_name','f_channel','inner_slot_id','osv','os_name'],axis=1)
33 |
34 | print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape)
35 |
36 | dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y)
37 | dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y)
38 | dataset12 = xgb.DMatrix(dataset12_x,label=dataset12_y)
39 | dataset3 = xgb.DMatrix(dataset3_x)
40 |
41 | params={'booster':'gbtree',
42 | 'objective': 'binary:logistic',
43 | 'eval_metric':'logloss',
44 | 'gamma':0.1,
45 | 'min_child_weight':1.1,
46 | 'max_depth':2,
47 | 'lambda':10,
48 | 'subsample':0.7,
49 | 'colsample_bytree':0.7,
50 | 'colsample_bylevel':0.7,
51 | 'eta': 0.2,
52 | 'tree_method':'exact',
53 | 'seed':0,
54 | 'nthread':12
55 | }
56 |
57 | #train on dataset1, evaluate on dataset2
58 | #watchlist = [(dataset1,'train'),(dataset2,'val')]
59 | #model = xgb.train(params,dataset1,num_boost_round=800,evals=watchlist,early_stopping_rounds=300)
60 |
61 | watchlist = [(dataset12,'train')]
62 | model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist)
63 |
64 | #predict test set
65 | dataset3_preds['predicted_score'] = model.predict(dataset3)
66 | #dataset3_preds.click = MinMaxScaler().fit_transform(dataset3_preds.click.reshape(-1, 1))
67 | #dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
68 | dataset3_preds.to_csv("./result/xgb_preds.csv",index=None)
69 | print(dataset3_preds.describe())
70 |
71 | #save feature score
72 | feature_score = model.get_fscore()
73 | feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)
74 | fs = []
75 | for (key,value) in feature_score:
76 | fs.append("{0},{1}\n".format(key,value))
77 |
78 | with open('xgb_feature_score.csv','w') as f:
79 | f.writelines("feature,score\n")
80 | f.writelines(fs)
--------------------------------------------------------------------------------
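
The final model in _2_train.py runs 3500 rounds watched only on its own training set. A sketch of the commented-out validation path, reusing the script's params, dataset1 and dataset2 DMatrix objects, to let early stopping pick the round count before the full retrain on dataset12:

```python
import xgboost as xgb

# Minimal sketch: validate on the day-6 window (dataset2) with early stopping.
watchlist = [(dataset1, 'train'), (dataset2, 'val')]
model = xgb.train(params, dataset1, num_boost_round=3500,
                  evals=watchlist, early_stopping_rounds=300)
print('best iteration:', model.best_iteration)  # use this round count for the retrain
```
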
/讯飞CTR预测/川哥版/utils.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import pandas as pd
3 |
4 | # aggregate statistics: click count, conversion count, conversion rate
5 | def get_type_features(df,columns,value,operation,rename):
6 |     if operation=="count":# click count
7 |         add=pd.DataFrame(df.groupby(columns)[value].count()).reset_index()
8 |     elif operation=="sum":# conversion count
9 |         add=pd.DataFrame(df.groupby(columns)[value].sum()).reset_index()
10 |     elif operation=="mean":# conversion rate
11 | add=pd.DataFrame(df.groupby(columns)[value].mean()).reset_index()
12 | add.columns=columns+[rename]
13 | df=df.merge(add,on=columns,how='left')
14 | return df
15 |
16 |
--------------------------------------------------------------------------------
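
A minimal usage sketch for get_type_features on a toy frame (toy data, not the competition logs):

```python
import pandas as pd
from utils import get_type_features

toy = pd.DataFrame({'adid': [1, 1, 2], 'click': [0, 1, 1]})
toy = get_type_features(toy, ['adid'], 'click', 'count', 'adid_click_cnt')
toy = get_type_features(toy, ['adid'], 'click', 'sum', 'adid_click_num')
toy = get_type_features(toy, ['adid'], 'click', 'mean', 'adid_click_radio')
print(toy)  # adid 1 -> cnt 2, num 1, radio 0.5; adid 2 -> cnt 1, num 1, radio 1.0
```
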
/讯飞CTR预测/鱼神大佬/kdxf_baseline.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from sklearn.feature_selection import chi2, SelectPercentile
3 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder
4 | from sklearn.model_selection import StratifiedKFold
5 | from sklearn.feature_extraction.text import CountVectorizer
6 | from scipy import sparse
7 | import lightgbm as lgb
8 | import warnings
9 | import time
10 | import pandas as pd
11 | import numpy as np
12 | import os
13 |
14 | path = '/Users/inf/PycharmProject/kaggle/kdxf/data'
15 |
16 | warnings.filterwarnings("ignore")
17 |
18 | train = pd.read_table(path + '/train.txt')
19 | test = pd.read_table(path + '/test.txt')
20 | data = pd.concat([train, test], axis=0, ignore_index=True)
21 |
22 | data = data.fillna(-1)
23 |
24 | data['day'] = data['time'].apply(lambda x: int(time.strftime("%d", time.localtime(x))))
25 | data['hour'] = data['time'].apply(lambda x: int(time.strftime("%H", time.localtime(x))))
26 | data['label'] = data.click.astype(int)
27 | del data['click']
28 |
29 | bool_feature = ['creative_is_jump', 'creative_is_download', 'creative_is_js', 'creative_is_voicead',
30 | 'creative_has_deeplink', 'app_paid']
31 | for i in bool_feature:
32 | data[i] = data[i].astype(int)
33 |
34 | data['advert_industry_inner_1'] = data['advert_industry_inner'].apply(lambda x: x.split('_')[0])
35 |
36 | ad_cate_feature = ['adid', 'advert_id', 'orderid', 'advert_industry_inner_1', 'advert_industry_inner', 'advert_name',
37 | 'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf', 'creative_has_deeplink',
38 | 'creative_is_jump', 'creative_is_download']
39 |
40 | media_cate_feature = ['app_cate_id', 'f_channel', 'app_id', 'inner_slot_id']
41 |
42 | content_cate_feature = ['city', 'carrier', 'province', 'nnt', 'devtype', 'osv', 'os', 'make', 'model']
43 |
44 | origin_cate_list = ad_cate_feature + media_cate_feature + content_cate_feature
45 |
46 | for i in origin_cate_list:
47 | data[i] = data[i].map(dict(zip(data[i].unique(), range(0, data[i].nunique()))))
48 |
49 | cate_feature = origin_cate_list
50 |
51 | num_feature = ['creative_width', 'creative_height', 'hour']
52 |
53 | feature = cate_feature + num_feature
54 | print(len(feature), feature)
55 |
56 | predict = data[data.label == -1]
57 | predict_result = predict[['instance_id']]
58 | predict_result['predicted_score'] = 0
59 | predict_x = predict.drop('label', axis=1)
60 |
61 | train_x = data[data.label != -1]
62 | train_y = data[data.label != -1].label.values
63 |
64 | # load cached features by default; if you add categorical features, change True to False to regenerate
65 | if os.path.exists(path + '/feature/base_train_csr.npz') and True:
66 | print('load_csr---------')
67 | base_train_csr = sparse.load_npz(path + '/feature/base_train_csr.npz').tocsr().astype('bool')
68 | base_predict_csr = sparse.load_npz(path + '/feature/base_predict_csr.npz').tocsr().astype('bool')
69 | else:
70 | base_train_csr = sparse.csr_matrix((len(train), 0))
71 | base_predict_csr = sparse.csr_matrix((len(predict_x), 0))
72 |
73 | enc = OneHotEncoder()
74 | for feature in cate_feature:
75 | enc.fit(data[feature].values.reshape(-1, 1))
76 | base_train_csr = sparse.hstack((base_train_csr, enc.transform(train_x[feature].values.reshape(-1, 1))), 'csr',
77 | 'bool')
78 | base_predict_csr = sparse.hstack((base_predict_csr, enc.transform(predict[feature].values.reshape(-1, 1))),
79 | 'csr',
80 | 'bool')
81 | print('one-hot prepared !')
82 |
83 | cv = CountVectorizer(min_df=20)
84 | for feature in ['user_tags']:
85 | data[feature] = data[feature].astype(str)
86 | cv.fit(data[feature])
87 | base_train_csr = sparse.hstack((base_train_csr, cv.transform(train_x[feature].astype(str))), 'csr', 'bool')
88 | base_predict_csr = sparse.hstack((base_predict_csr, cv.transform(predict_x[feature].astype(str))), 'csr',
89 | 'bool')
90 | print('cv prepared !')
91 |
92 | sparse.save_npz(path + '/feature/base_train_csr.npz', base_train_csr)
93 | sparse.save_npz(path + '/feature/base_predict_csr.npz', base_predict_csr)
94 |
95 | train_csr = sparse.hstack(
96 | (sparse.csr_matrix(train_x[num_feature]), base_train_csr), 'csr').astype(
97 | 'float32')
98 | predict_csr = sparse.hstack(
99 | (sparse.csr_matrix(predict_x[num_feature]), base_predict_csr), 'csr').astype('float32')
100 | print(train_csr.shape)
101 | feature_select = SelectPercentile(chi2, percentile=95)
102 | feature_select.fit(train_csr, train_y)
103 | train_csr = feature_select.transform(train_csr)
104 | predict_csr = feature_select.transform(predict_csr)
105 | print('feature select')
106 | print(train_csr.shape)
107 |
108 | lgb_model = lgb.LGBMClassifier(
109 | boosting_type='gbdt', num_leaves=32, reg_alpha=0, reg_lambda=0.1,
110 | max_depth=-1, n_estimators=5000, objective='binary',
111 | subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
112 | learning_rate=0.05, random_state=2018, n_jobs=-1
113 | )
114 |
115 | skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
116 | best_score = []
117 | for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)):
118 | lgb_model.fit(train_csr[train_index], train_y[train_index],
119 | eval_set=[(train_csr[train_index], train_y[train_index]),
120 | (train_csr[test_index], train_y[test_index])], early_stopping_rounds=100)
121 | best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
122 | print(best_score)
123 | test_pred = lgb_model.predict_proba(predict_csr, num_iteration=lgb_model.best_iteration_)[:, 1]
124 | print('test mean:', test_pred.mean())
125 | predict_result['predicted_score'] = predict_result['predicted_score'] + test_pred
126 | print(np.mean(best_score))
127 | predict_result['predicted_score'] = predict_result['predicted_score'] / 5
128 | mean = predict_result['predicted_score'].mean()
129 | print('mean:', mean)
130 | now = datetime.datetime.now()
131 | now = now.strftime('%m-%d-%H-%M')
132 | predict_result[['instance_id', 'predicted_score']].to_csv(path + "/submission/lgb_baseline_%s.csv" % now, index=False)
--------------------------------------------------------------------------------
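
The dict(zip(data[i].unique(), range(0, data[i].nunique()))) idiom in kdxf_baseline.py is a hand-rolled label encoding; pd.factorize is a one-call equivalent (sketch on toy values):

```python
import pandas as pd

# Minimal sketch: factorize assigns consecutive integers in order of first appearance.
s = pd.Series(['HUAWEI', 'Xiaomi', 'HUAWEI', 'OPPO'])
codes, uniques = pd.factorize(s)
print(codes.tolist())  # [0, 1, 0, 2]
print(list(uniques))   # ['HUAWEI', 'Xiaomi', 'OPPO']
```
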
/阿里天池o2o新人赛/wepe_o2o.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "from datetime import date"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "name": "stderr",
23 | "output_type": "stream",
24 | "text": [
25 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2698: DtypeWarning: Columns (0,1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
26 | " interactivity=interactivity, compiler=compiler, result=result)\n",
27 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2698: DtypeWarning: Columns (0,1,2) have mixed types. Specify dtype option on import or set low_memory=False.\n",
28 | " interactivity=interactivity, compiler=compiler, result=result)\n"
29 | ]
30 | }
31 | ],
32 | "source": [
33 | "#1754884 record,1053282 with coupon_id,9738 coupon. date_received:20160101~20160615,date:20160101~20160630, 539438 users, 8415 merchants\n",
34 | "off_train = pd.read_csv('data/ccf_offline_stage1_train.csv',header=None)\n",
35 | "off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']\n",
36 | "#2050 coupon_id. date_received:20160701~20160731, 76309 users(76307 in trainset, 35965 in online_trainset), 1559 merchants(1558 in trainset)\n",
37 | "off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv',header=None)\n",
38 | "off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']\n",
39 | "#11429826 record(872357 with coupon_id),762858 user(267448 in off_train)\n",
40 | "on_train = pd.read_csv('data/ccf_online_stage1_train.csv',header=None)\n",
41 | "on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 4,
47 | "metadata": {
48 | "collapsed": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "dataset3 = off_test\n",
53 | "feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))]\n",
54 | "dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')]\n",
55 | "feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]\n",
56 | "dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')]\n",
57 | "feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 15,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | " user_id this_month_user_receive_all_coupon_count\n",
70 | "0 1000020 1\n",
71 | "1 1000026 1\n",
72 | "2 1000452 1\n",
73 | "3 1000510 1\n",
74 | "4 100057 1\n"
75 | ]
76 | },
77 | {
78 | "name": "stderr",
79 | "output_type": "stream",
80 | "text": [
81 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
82 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
83 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
84 | "\n",
85 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
86 | " \n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "t = dataset3[['user_id']]\n",
92 | "t['this_month_user_receive_all_coupon_count'] = 1\n",
93 | "t = t.groupby('user_id').agg('sum').reset_index()\n",
94 | "print(t[:5])"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 17,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | " user_id coupon_id this_month_user_receive_same_coupon_count\n",
107 | "0 1000020 13602 1\n",
108 | "1 1000026 13602 1\n",
109 | "2 1000452 9983 1\n",
110 | "3 1000510 10418 1\n",
111 | "4 100057 2601 1\n"
112 | ]
113 | },
114 | {
115 | "name": "stderr",
116 | "output_type": "stream",
117 | "text": [
118 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
119 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
120 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
121 | "\n",
122 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
123 | " \n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "t1 = dataset3[['user_id','coupon_id']]\n",
129 | "t1['this_month_user_receive_same_coupon_count'] = 1\n",
130 | "#按照user_id和coupon_id进行分组\n",
131 | "#统计每个用户,使用不同优惠券的次数\n",
132 | "t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()\n",
133 | "print(t1[:5])"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 29,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "name": "stderr",
143 | "output_type": "stream",
144 | "text": [
145 | "c:\\users\\xpc\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\pandas\\core\\generic.py:2999: SettingWithCopyWarning: \n",
146 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
147 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
148 | "\n",
149 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
150 | " self[name] = value\n"
151 | ]
152 | },
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | " user_id coupon_id date_received\n",
158 | "0 1000020 13602 20160731\n",
159 | "1 1000026 13602 20160729\n",
160 | "2 1000452 9983 20160727\n",
161 | "3 1000510 10418 20160701\n",
162 | "4 100057 2601 20160708\n",
163 | "5 1000651 13602 20160728\n",
164 | "6 1000884 10438 20160714\n",
165 | "7 1000907 1904 20160703\n",
166 | "8 1000936 4203 20160710\n",
167 | "9 1000986 12429 20160701\n",
168 | "10 1001023 13602 20160723\n",
169 | "11 1001176 13181 20160731\n",
170 | "12 1001176 361 20160709\n",
171 | "13 1001176 3992 20160731\n",
172 | "14 100122 12735 20160714\n",
173 | "15 100122 13602 20160727\n",
174 | "16 1001240 10418 20160706\n",
175 | "17 1001240 13602 20160711\n",
176 | "18 1001240 2978 20160706\n",
177 | "19 1001257 11799 20160711\n",
178 | "20 1001302 13602 20160726\n",
179 | "21 1001466 9983 20160717\n",
180 | "22 100150 13602 20160725\n",
181 | "23 1001505 13602 20160710\n",
182 | "24 1001505 4283 20160702\n",
183 | "25 1001525 10418 20160713\n",
184 | "26 1001729 10418 20160702\n",
185 | "27 1001729 10438 20160723\n",
186 | "28 1001729 13602 20160719\n",
187 | "29 1001729 2978 20160723\n",
188 | "... ... ... ...\n",
189 | "105929 99721 10438 20160707\n",
190 | "105930 997367 13602 20160724\n",
191 | "105931 997367 8059 20160715\n",
192 | "105932 997367 9983 20160714\n",
193 | "105933 997426 13602 20160710\n",
194 | "105934 997688 13602 20160705\n",
195 | "105935 997751 13602 20160729\n",
196 | "105936 997802 3443 20160724:20160707\n",
197 | "105937 997802 6465 20160721\n",
198 | "105938 997802 7459 20160724\n",
199 | "105939 997846 613 20160729\n",
200 | "105940 997992 13602 20160707\n",
201 | "105941 998381 13602 20160728\n",
202 | "105942 998639 2978 20160729\n",
203 | "105943 998686 768 20160704\n",
204 | "105944 998717 13602 20160702\n",
205 | "105945 998717 9983 20160703\n",
206 | "105946 998773 4185 20160701\n",
207 | "105947 998807 10418 20160702\n",
208 | "105948 998945 13602 20160727\n",
209 | "105949 999137 9983 20160728\n",
210 | "105950 999350 13602 20160710\n",
211 | "105951 999659 13191 20160705\n",
212 | "105952 999659 7517 20160705\n",
213 | "105953 999781 12027 20160703\n",
214 | "105954 999781 1904 20160703\n",
215 | "105955 999842 9983 20160703\n",
216 | "105956 999931 13320 20160731\n",
217 | "105957 99996 13602 20160723\n",
218 | "105958 User_id Coupon_id Date_received\n",
219 | "\n",
220 | "[105959 rows x 3 columns]\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "t2 = dataset3[['user_id','coupon_id','date_received']]\n",
226 | "t2.date_received = t2.date_received.astype('str')\n",
227 | "# 按照user_id','coupon_id排序后,提出来date_received,进行agg运算\n",
228 | "# agg运算:用冒号连接起来\n",
229 | "t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()\n",
230 | "print(t2)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 30,
236 | "metadata": {
237 | "collapsed": true
238 | },
239 | "outputs": [],
240 | "source": [
241 | "#apply会返回每个优惠券的使用次数\n",
242 | "t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))\n",
243 | "#筛出使用次数大于1次的数据\n",
244 | "t2 = t2[t2.receive_number>1]\n",
245 | "#对max_date_received赋值为最近一次的使用时间\n",
246 | "t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))\n",
247 | "#对min_date_received赋值为最早一次的使用时间\n",
248 | "t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(d) for d in s.split(':')]))\n",
249 | "# 重新定义t2为以下项目\n",
250 | "t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 33,
256 | "metadata": {
257 | "collapsed": true
258 | },
259 | "outputs": [],
260 | "source": [
261 | "t3 = dataset3[['user_id','coupon_id','date_received']]\n",
262 | "#merge,将两个数据集合并\n",
263 | "#将t2和t3在['user_id','coupon_id']上进行左帧合并,即根据t3合并t2的user_id','coupon_id\n",
264 | "#t2[['user_id','coupon_id','max_date_received','min_date_received']]\n",
265 | "#t3[['user_id','coupon_id','date_received']]\n",
266 | "#因此合并方式为:找到每个用户每张优惠券的消费时间和对应券的max_date_received与min_date_received\n",
267 | "t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')\n",
268 | "#t3的this_month_user_receive_same_coupon_lastone项目设置为:此用户消费本张优惠券与最近一次消费本张优惠券的间隔\n",
269 | "t3 = t3.apply(pd.to_numeric, args=('coerce',))\n",
270 | "t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received\n",
271 | "#此用户消费本张优惠券与第一次消费本张优惠券的间隔\n",
272 | "t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received\n",
273 | "def is_firstlastone(x):\n",
274 | " if x==0:\n",
275 | " return 1\n",
276 | " elif x>0:\n",
277 | " return 0\n",
278 | " else:\n",
279 | " return -1 #those only receive once\n",
280 | " \n",
281 | "t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)\n",
282 | "t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)\n",
283 | "t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 34,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "name": "stdout",
293 | "output_type": "stream",
294 | "text": [
295 | " user_id coupon_id date_received \\\n",
296 | "0 NaN NaN NaN \n",
297 | "1 4129537.0 9983.0 20160712.0 \n",
298 | "2 6949378.0 3429.0 20160706.0 \n",
299 | "3 2166529.0 6928.0 20160727.0 \n",
300 | "4 2166529.0 1808.0 20160727.0 \n",
301 | "5 6172162.0 6500.0 20160708.0 \n",
302 | "6 4005121.0 9983.0 20160706.0 \n",
303 | "7 4347394.0 9983.0 20160716.0 \n",
304 | "8 3094273.0 13602.0 20160727.0 \n",
305 | "9 5139970.0 9983.0 20160729.0 \n",
306 | "10 3237121.0 13602.0 20160703.0 \n",
307 | "11 6224386.0 9983.0 20160716.0 \n",
308 | "12 6488578.0 13602.0 20160712.0 \n",
309 | "13 4164865.0 9983.0 20160703.0 \n",
310 | "14 4164865.0 8059.0 20160706.0 \n",
311 | "15 5468674.0 9983.0 20160713.0 \n",
312 | "16 6258178.0 9144.0 20160706.0 \n",
313 | "17 3659521.0 7341.0 20160727.0 \n",
314 | "18 3659521.0 13181.0 20160717.0 \n",
315 | "19 3659521.0 13602.0 20160718.0 \n",
316 | "20 7333378.0 13602.0 20160704.0 \n",
317 | "21 7333378.0 785.0 20160727.0 \n",
318 | "22 4454914.0 2978.0 20160711.0 \n",
319 | "23 6817282.0 8375.0 20160724.0 \n",
320 | "24 3149569.0 10418.0 20160721.0 \n",
321 | "25 6301186.0 1715.0 20160718.0 \n",
322 | "26 6301186.0 4203.0 20160708.0 \n",
323 | "27 2891521.0 13602.0 20160724.0 \n",
324 | "28 3422977.0 13602.0 20160727.0 \n",
325 | "29 4771330.0 13602.0 20160726.0 \n",
326 | "... ... ... ... \n",
327 | "113611 4194809.0 11799.0 20160717.0 \n",
328 | "113612 4194809.0 13602.0 20160713.0 \n",
329 | "113613 6062585.0 10438.0 20160719.0 \n",
330 | "113614 6062585.0 13602.0 20160719.0 \n",
331 | "113615 6074873.0 878.0 20160724.0 \n",
332 | "113616 6342137.0 13602.0 20160728.0 \n",
333 | "113617 6342137.0 8059.0 20160704.0 \n",
334 | "113618 6342137.0 9822.0 20160724.0 \n",
335 | "113619 6342137.0 3429.0 20160718.0 \n",
336 | "113620 4317689.0 5933.0 20160727.0 \n",
337 | "113621 5110265.0 13602.0 20160711.0 \n",
338 | "113622 5110265.0 9983.0 20160711.0 \n",
339 | "113623 6422009.0 5874.0 20160710.0 \n",
340 | "113624 4851197.0 9983.0 20160713.0 \n",
341 | "113625 4894205.0 2978.0 20160711.0 \n",
342 | "113626 7253501.0 13602.0 20160714.0 \n",
343 | "113627 6485501.0 13602.0 20160720.0 \n",
344 | "113628 4918781.0 10438.0 20160723.0 \n",
345 | "113629 6497789.0 9983.0 20160716.0 \n",
346 | "113630 7047677.0 2601.0 20160708.0 \n",
347 | "113631 6786557.0 9983.0 20160718.0 \n",
348 | "113632 6801917.0 13602.0 20160719.0 \n",
349 | "113633 7066109.0 9144.0 20160705.0 \n",
350 | "113634 4451837.0 13602.0 20160723.0 \n",
351 | "113635 5828093.0 2978.0 20160716.0 \n",
352 | "113636 5828093.0 10418.0 20160716.0 \n",
353 | "113637 6626813.0 7595.0 20160707.0 \n",
354 | "113638 6626813.0 7590.0 20160712.0 \n",
355 | "113639 4547069.0 13602.0 20160717.0 \n",
356 | "113640 6675965.0 613.0 20160728.0 \n",
357 | "\n",
358 | " this_month_user_receive_same_coupon_lastone \\\n",
359 | "0 -1 \n",
360 | "1 -1 \n",
361 | "2 -1 \n",
362 | "3 -1 \n",
363 | "4 -1 \n",
364 | "5 -1 \n",
365 | "6 -1 \n",
366 | "7 -1 \n",
367 | "8 -1 \n",
368 | "9 -1 \n",
369 | "10 -1 \n",
370 | "11 -1 \n",
371 | "12 -1 \n",
372 | "13 -1 \n",
373 | "14 -1 \n",
374 | "15 -1 \n",
375 | "16 -1 \n",
376 | "17 -1 \n",
377 | "18 -1 \n",
378 | "19 -1 \n",
379 | "20 -1 \n",
380 | "21 -1 \n",
381 | "22 -1 \n",
382 | "23 -1 \n",
383 | "24 -1 \n",
384 | "25 -1 \n",
385 | "26 -1 \n",
386 | "27 -1 \n",
387 | "28 -1 \n",
388 | "29 -1 \n",
389 | "... ... \n",
390 | "113611 -1 \n",
391 | "113612 -1 \n",
392 | "113613 -1 \n",
393 | "113614 -1 \n",
394 | "113615 -1 \n",
395 | "113616 -1 \n",
396 | "113617 -1 \n",
397 | "113618 -1 \n",
398 | "113619 -1 \n",
399 | "113620 -1 \n",
400 | "113621 -1 \n",
401 | "113622 -1 \n",
402 | "113623 -1 \n",
403 | "113624 -1 \n",
404 | "113625 -1 \n",
405 | "113626 -1 \n",
406 | "113627 -1 \n",
407 | "113628 -1 \n",
408 | "113629 -1 \n",
409 | "113630 -1 \n",
410 | "113631 -1 \n",
411 | "113632 -1 \n",
412 | "113633 -1 \n",
413 | "113634 -1 \n",
414 | "113635 -1 \n",
415 | "113636 -1 \n",
416 | "113637 -1 \n",
417 | "113638 -1 \n",
418 | "113639 -1 \n",
419 | "113640 -1 \n",
420 | "\n",
421 | " this_month_user_receive_same_coupon_firstone \n",
422 | "0 -1 \n",
423 | "1 -1 \n",
424 | "2 -1 \n",
425 | "3 -1 \n",
426 | "4 -1 \n",
427 | "5 -1 \n",
428 | "6 -1 \n",
429 | "7 -1 \n",
430 | "8 -1 \n",
431 | "9 -1 \n",
432 | "10 -1 \n",
433 | "11 -1 \n",
434 | "12 -1 \n",
435 | "13 -1 \n",
436 | "14 -1 \n",
437 | "15 -1 \n",
438 | "16 -1 \n",
439 | "17 -1 \n",
440 | "18 -1 \n",
441 | "19 -1 \n",
442 | "20 -1 \n",
443 | "21 -1 \n",
444 | "22 -1 \n",
445 | "23 -1 \n",
446 | "24 -1 \n",
447 | "25 -1 \n",
448 | "26 -1 \n",
449 | "27 -1 \n",
450 | "28 -1 \n",
451 | "29 -1 \n",
452 | "... ... \n",
453 | "113611 -1 \n",
454 | "113612 -1 \n",
455 | "113613 -1 \n",
456 | "113614 -1 \n",
457 | "113615 -1 \n",
458 | "113616 -1 \n",
459 | "113617 -1 \n",
460 | "113618 -1 \n",
461 | "113619 -1 \n",
462 | "113620 -1 \n",
463 | "113621 -1 \n",
464 | "113622 -1 \n",
465 | "113623 -1 \n",
466 | "113624 -1 \n",
467 | "113625 -1 \n",
468 | "113626 -1 \n",
469 | "113627 -1 \n",
470 | "113628 -1 \n",
471 | "113629 -1 \n",
472 | "113630 -1 \n",
473 | "113631 -1 \n",
474 | "113632 -1 \n",
475 | "113633 -1 \n",
476 | "113634 -1 \n",
477 | "113635 -1 \n",
478 | "113636 -1 \n",
479 | "113637 -1 \n",
480 | "113638 -1 \n",
481 | "113639 -1 \n",
482 | "113640 -1 \n",
483 | "\n",
484 | "[113641 rows x 5 columns]\n"
485 | ]
486 | }
487 | ],
488 | "source": [
489 | "print(t3)"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {
496 | "collapsed": true
497 | },
498 | "outputs": [],
499 | "source": []
500 | }
501 | ],
502 | "metadata": {
503 | "kernelspec": {
504 | "display_name": "Python 3",
505 | "language": "python",
506 | "name": "python3"
507 | },
508 | "language_info": {
509 | "codemirror_mode": {
510 | "name": "ipython",
511 | "version": 3
512 | },
513 | "file_extension": ".py",
514 | "mimetype": "text/x-python",
515 | "name": "python",
516 | "nbconvert_exporter": "python",
517 | "pygments_lexer": "ipython3",
518 | "version": "3.6.1"
519 | }
520 | },
521 | "nbformat": 4,
522 | "nbformat_minor": 2
523 | }
524 |
--------------------------------------------------------------------------------
/阿里天池o2o新人赛/xgb.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import xgboost as xgb
3 | from sklearn.preprocessing import MinMaxScaler
4 |
5 | dataset1 = pd.read_csv('data/dataset1.csv')
6 | dataset1.label.replace(-1,0,inplace=True)
7 | dataset2 = pd.read_csv('data/dataset2.csv')
8 | dataset2.label.replace(-1,0,inplace=True)
9 | dataset3 = pd.read_csv('data/dataset3.csv')
10 |
11 | dataset1.drop_duplicates(inplace=True)
12 | dataset2.drop_duplicates(inplace=True)
13 | dataset3.drop_duplicates(inplace=True)
14 |
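   | # stack the two labeled feature windows vertically to form the full training set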
15 | dataset12 = pd.concat([dataset1,dataset2],axis=0)
16 |
17 | dataset1_y = dataset1.label
18 | dataset1_x = dataset1.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1) # 'day_gap_before' and 'day_gap_after' cause overfitting (0.77)
19 | dataset2_y = dataset2.label
20 | dataset2_x = dataset2.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
21 | dataset12_y = dataset12.label
22 | dataset12_x = dataset12.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
23 | dataset3_preds = dataset3[['user_id','coupon_id','date_received']].copy()  # copy so the prediction column can be assigned safely
24 | dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)
25 |
26 | print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape)
27 |
28 | dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y)
29 | dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y)
30 | dataset12 = xgb.DMatrix(dataset12_x,label=dataset12_y)
31 | dataset3 = xgb.DMatrix(dataset3_x)
32 |
33 | params={'booster':'gbtree',
34 | 'objective': 'rank:pairwise',
35 | 'eval_metric':'auc',
36 | 'gamma':0.1,
37 | 'min_child_weight':1.1,
38 | 'max_depth':5,
39 | 'lambda':10,
40 | 'subsample':0.7,
41 | 'colsample_bytree':0.7,
42 | 'colsample_bylevel':0.7,
43 | 'eta': 0.01,
44 | 'tree_method':'exact',
45 | 'seed':0,
46 | 'nthread':12
47 | }
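   | # rank:pairwise optimizes the pairwise ordering of positives over negatives,
   | # which lines up with the coupon-wise AUC used to score this competition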
48 |
49 | #train on dataset1, evaluate on dataset2
50 | #watchlist = [(dataset1,'train'),(dataset2,'val')]
51 | #model = xgb.train(params,dataset1,num_boost_round=3000,evals=watchlist,early_stopping_rounds=300)
52 |
53 | watchlist = [(dataset12,'train')]
54 | model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist)
55 |
56 | #predict test set
57 | dataset3_preds['label'] = model.predict(dataset3)
58 | dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.values.reshape(-1, 1)).ravel()  # MinMaxScaler expects a 2D array
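   | # min-max scaling maps the raw ranking scores into [0,1]; only relative order matters for AUC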
59 | dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
60 | dataset3_preds.to_csv("xgb_preds.csv",index=None,header=None)
61 | print(dataset3_preds.describe())
62 |
63 | #save feature score
64 | feature_score = model.get_fscore()
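   | # get_fscore returns {feature: number of splits using it}; sort by that count, descending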
65 | feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)
66 | fs = []
67 | for (key,value) in feature_score:
68 | fs.append("{0},{1}\n".format(key,value))
69 |
70 | with open('xgb_feature_score.csv','w') as f:
71 | f.writelines("feature,score\n")
72 | f.writelines(fs)
73 |
74 |
--------------------------------------------------------------------------------