├── .gitignore ├── README.md ├── digit-recognizer ├── readme.md └── src │ ├── KNN.ipynb │ ├── Tensorflow_NN.ipynb │ ├── ensemble_stacking.py │ ├── feature_engineer.py │ ├── keras_NN.ipynb │ ├── logistregression_NN.ipynb │ ├── model_knn.py │ ├── model_svm.py │ ├── scikit-learn-knn.ipynb │ ├── tensorflow_model_softmax.py │ ├── utils.py │ └── xgboost.ipynb ├── house-predict └── src │ ├── EDA.ipynb │ └── house_predict_start.ipynb ├── requrements.txt └── titanic ├── README.md ├── example └── titanic.py ├── images └── submit.png └── src ├── 2-1-Logistic_predict_Titanic.ipynb ├── EDA.ipynb ├── __init__.py ├── ensemble.py ├── ensemble_starking.py ├── ensemble_util.py ├── ensemble_xgb_rf.py ├── feature_engineer.py ├── feature_predict_age.py ├── gridssearch_xgboost.ipynb ├── load_data.py ├── model_dt.py ├── model_lr.py ├── model_selection.ipynb ├── model_xgb.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | input/ 3 | output/ 4 | __pycache__/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 利用Kaggle平台,快速入门机器学习 2 | 3 | ## 前言 4 | 5 | ## 1. 准备工作 6 | 7 | ### 1.1 硬件准备 8 | 9 | ### 1.2 机器学习环境搭建 10 | 11 | ## 2. Titanic 12 | 13 | - 2.1 [利用Logistic算法预测titanic存活率](titanic/src/2-1-Logistic_predict_Titanic.ipynb) 14 | - 2.2 [模型评估与交叉验证]() 15 | - 2.3 [模型融合]() 16 | 17 | ## 3. House prediction 18 | 19 | ## 4. Digit recognizer 20 | 21 | ## 附录 22 | -------------------------------------------------------------------------------- /digit-recognizer/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wsg011/kaggle-start/0dd948ae407edcc68f790f310d6b7c64e4e8a328/digit-recognizer/readme.md -------------------------------------------------------------------------------- /digit-recognizer/src/KNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 利用KNN算法识别手写数字" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# -*- coding: utf-8 -*-\n", 19 | "\n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import operator\n", 24 | "import time\n", 25 | "plt.rcParams['font.sans-serif']=['simhei'] #用于正常显示中文\n", 26 | "plt.rcParams['axes.unicode_minus']=False #用于正常显示负号" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "testDataSet = pd.read_csv('data/test.csv')\n", 38 | "trainDataSet = pd.read_csv('data/train.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "trainLabel = trainDataSet['label']\n", 50 | "trainData = trainDataSet.iloc[:,1:785]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "28000" 64 | ] 65 | }, 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "m_t,n_t = 
np.shape(testDataSet)\n", 73 | "m_tr,n_tr = np.shape(trainData)\n", 74 | "\n", 75 | "testDataMat = np.multiply(testDataSet != np.zeros((m_t,n_t)),np.ones((m_t,1)))\n", 76 | "trainDataMat = np.multiply(trainData != np.zeros((m_tr,n_tr)),np.ones((m_tr,1))) \n", 77 | "np.shape(testDataMat)[0]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "def classify(inX, dataSet, labels, k): \n", 89 | " dataSetSize = dataSet.shape[0] \n", 90 | " diffMat = np.tile(inX, (dataSetSize,1)) - dataSet \n", 91 | " sqDiffMat = diffMat**2 \n", 92 | " sqDistances = sqDiffMat.sum(axis=1) \n", 93 | " distances = sqDistances**0.5 \n", 94 | " sortedDistIndicies = distances.argsort() \n", 95 | " classCount={} \n", 96 | " for i in range(k): \n", 97 | " voteIlabel = labels[sortedDistIndicies[i]] \n", 98 | " classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 \n", 99 | " sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) \n", 100 | " return sortedClassCount[0][0]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "This is 1testdata,classdify is:2\n", 115 | "This is 2testdata,classdify is:0\n", 116 | "This is 3testdata,classdify is:9\n", 117 | "This is 4testdata,classdify is:9\n", 118 | "This is 5testdata,classdify is:3\n", 119 | "This is 6testdata,classdify is:7\n", 120 | "This is 7testdata,classdify is:0\n", 121 | "This is 8testdata,classdify is:3\n", 122 | "This is 9testdata,classdify is:0\n", 123 | "This is 10testdata,classdify is:3\n", 124 | "This is 11testdata,classdify is:5\n", 125 | "This is 12testdata,classdify is:7\n", 126 | "This is 13testdata,classdify is:4\n", 127 | "This is 14testdata,classdify is:0\n", 128 | "This is 15testdata,classdify is:4\n", 129 | "This is 16testdata,classdify is:3\n", 130 | "This is 17testdata,classdify is:3\n", 131 | "This is 18testdata,classdify is:1\n", 132 | "This is 19testdata,classdify is:9\n", 133 | "This is 20testdata,classdify is:0\n", 134 | "This is 21testdata,classdify is:9\n", 135 | "This is 22testdata,classdify is:1\n", 136 | "This is 23testdata,classdify is:1\n", 137 | "This is 24testdata,classdify is:5\n", 138 | "This is 25testdata,classdify is:7\n", 139 | "This is 26testdata,classdify is:4\n", 140 | "This is 27testdata,classdify is:2\n", 141 | "This is 28testdata,classdify is:7\n", 142 | "This is 29testdata,classdify is:9\n", 143 | "This is 30testdata,classdify is:7\n", 144 | "This is 31testdata,classdify is:7\n", 145 | "This is 32testdata,classdify is:5\n", 146 | "This is 33testdata,classdify is:4\n", 147 | "This is 34testdata,classdify is:2\n", 148 | "This is 35testdata,classdify is:6\n", 149 | "This is 36testdata,classdify is:2\n", 150 | "This is 37testdata,classdify is:5\n", 151 | "This is 38testdata,classdify is:5\n", 152 | "This is 39testdata,classdify is:1\n", 153 | "This is 40testdata,classdify is:6\n", 154 | "This is 41testdata,classdify is:7\n", 155 | "This is 42testdata,classdify is:7\n", 156 | "This is 43testdata,classdify is:4\n", 157 | "This is 44testdata,classdify is:9\n", 158 | "This is 45testdata,classdify is:5\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "Label = []\n", 164 | "ImageId = []\n", 165 | "np.shape(testDataMat)[0]\n", 166 | "i=1\n", 167 | "time1 = time.time()\n", 168 | "for i in 
range(np.shape(testDataMat)[0]):\n", 169 | " classdifys = classify(testDataMat.iloc[i],trainDataMat,trainLabel,k=5)\n", 170 | " ImageId.append(i+1)\n", 171 | " Label.append(classdifys)\n", 172 | " print \"This is \" + str(i+1) +\"testdata,classdify is:\" + str(classdifys)\n", 173 | "time2 = time.time()\n", 174 | "train_time = time2-time1\n", 175 | "data = pd.DataFrame(Label,ImageId)\n", 176 | "data.to_csv('data/classdify.csv',encoding='utf-8')" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "train_time" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "" 199 | ] 200 | } 201 | ], 202 | "metadata": { 203 | "anaconda-cloud": {}, 204 | "kernelspec": { 205 | "display_name": "Python 2", 206 | "language": "python", 207 | "name": "python2" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 2.0 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython2", 219 | "version": "2.7.13" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 0 224 | } -------------------------------------------------------------------------------- /digit-recognizer/src/ensemble_stacking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wsg011/kaggle-start/0dd948ae407edcc68f790f310d6b7c64e4e8a328/digit-recognizer/src/ensemble_stacking.py -------------------------------------------------------------------------------- /digit-recognizer/src/feature_engineer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.preprocessing import OneHotEncoder 4 | from utils import load_data 5 | 6 | 7 | def feature_engineer(): 8 | train, test = load_data() 9 | 10 | print("feature transform...") 11 | x_train = train.drop(['label'], axis=1) 12 | x_train = x_train.applymap(lambda x: x/255.0) 13 | 14 | label = train['label'].values 15 | 16 | #encode = OneHotEncoder() 17 | #y_train = encode.fit_transform(train['label'].reshape(-1, 1)).toarray() 18 | 19 | test = test.applymap(lambda x: x/255.0) 20 | 21 | return x_train, label, test -------------------------------------------------------------------------------- /digit-recognizer/src/keras_NN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import tensorflow as tf\n", 14 | "from keras.models import Sequential\n", 15 | "from keras.layers import Dense, Dropout, Flatten\n", 16 | "from keras.layers import Conv2D, MaxPooling2D\n", 17 | "from keras.optimizers import RMSprop, Adam\n", 18 | "from sklearn.preprocessing import OneHotEncoder\n", 19 | "from sklearn.model_selection import train_test_split" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "def load_data():\n", 31 | " train = pd.read_csv('../input/train.csv')\n", 32 | " test = 
pd.read_csv('../input/test.csv')\n", 33 | " print(\"train shape:{}\\ntest shape:{}\".format(train.shape, test.shape))\n", 34 | " \n", 35 | " train_df = train.drop(['label'], axis=1)\n", 36 | " label = pd.get_dummies(train['label'])\n", 37 | " #train_df = train_df/255.0\n", 38 | " #test = test/255.0\n", 39 | " train_df = train_df.applymap(lambda x: x/255).astype(np.float32)\n", 40 | " test = test.applymap(lambda x: x/255).astype(np.float32)\n", 41 | " \n", 42 | " # Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)\n", 43 | " train_df = train_df.values.reshape(-1,28,28,1)\n", 44 | " test = test.values.reshape(-1,28,28,1)\n", 45 | " \n", 46 | " x_train, x_valid, y_train, y_valid = train_test_split(train_df, label, test_size=0.2, random_state=2017)\n", 47 | " print(\"x_train shape:\"+str(x_train.shape))\n", 48 | " print(\"x_valid shape:\"+str(x_valid.shape))\n", 49 | " \n", 50 | " return x_train, y_train, x_valid, y_valid, test" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "train shape:(42000, 785)\n", 65 | "test shape:(28000, 784)\n", 66 | "x_train shape:(33600, 28, 28, 1)\n", 67 | "x_valid shape:(8400, 28, 28, 1)\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "x_train, y_train, x_valid, y_valid, x_test = load_data()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 20, 78 | "metadata": { 79 | "collapsed": false, 80 | "scrolled": true 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Train on 33600 samples, validate on 8400 samples\n", 88 | "Epoch 1/50\n", 89 | "33600/33600 [==============================] - 13s 396us/step - loss: 0.8678 - acc: 0.7172 - val_loss: 0.1705 - val_acc: 0.9485\n", 90 | "Epoch 2/50\n", 91 | "33600/33600 [==============================] - 12s 368us/step - loss: 0.2728 - acc: 0.9214 - val_loss: 0.1032 - val_acc: 0.9687\n", 92 | "Epoch 3/50\n", 93 | "33600/33600 [==============================] - 12s 360us/step - loss: 0.1954 - acc: 0.9455 - val_loss: 0.0822 - val_acc: 0.9745\n", 94 | "Epoch 4/50\n", 95 | "33600/33600 [==============================] - 13s 380us/step - loss: 0.1519 - acc: 0.9584 - val_loss: 0.0683 - val_acc: 0.9783\n", 96 | "Epoch 5/50\n", 97 | "33600/33600 [==============================] - 12s 369us/step - loss: 0.1265 - acc: 0.9654 - val_loss: 0.0590 - val_acc: 0.9825\n", 98 | "Epoch 6/50\n", 99 | "33600/33600 [==============================] - 13s 382us/step - loss: 0.1095 - acc: 0.9691 - val_loss: 0.0586 - val_acc: 0.9823\n", 100 | "Epoch 7/50\n", 101 | "33600/33600 [==============================] - 12s 367us/step - loss: 0.0934 - acc: 0.9745 - val_loss: 0.0466 - val_acc: 0.9857\n", 102 | "Epoch 8/50\n", 103 | "33600/33600 [==============================] - 13s 384us/step - loss: 0.0832 - acc: 0.9772 - val_loss: 0.0481 - val_acc: 0.9870\n", 104 | "Epoch 9/50\n", 105 | "33600/33600 [==============================] - 12s 371us/step - loss: 0.0765 - acc: 0.9780 - val_loss: 0.0469 - val_acc: 0.9871\n", 106 | "Epoch 10/50\n", 107 | "33600/33600 [==============================] - 12s 367us/step - loss: 0.0693 - acc: 0.9816 - val_loss: 0.0387 - val_acc: 0.9893\n", 108 | "Epoch 11/50\n", 109 | "33600/33600 [==============================] - 12s 367us/step - loss: 0.0644 - acc: 0.9823 - val_loss: 0.0409 - val_acc: 0.9881\n", 110 | "Epoch 12/50\n", 111 | "33600/33600 
[==============================] - 12s 370us/step - loss: 0.0561 - acc: 0.9847 - val_loss: 0.0422 - val_acc: 0.9895\n", 112 | "Epoch 13/50\n", 113 | "33600/33600 [==============================] - 12s 367us/step - loss: 0.0537 - acc: 0.9851 - val_loss: 0.0394 - val_acc: 0.9894\n", 114 | "Epoch 14/50\n", 115 | "33600/33600 [==============================] - 12s 369us/step - loss: 0.0474 - acc: 0.9864 - val_loss: 0.0392 - val_acc: 0.9902\n", 116 | "Epoch 15/50\n", 117 | "33600/33600 [==============================] - 12s 368us/step - loss: 0.0437 - acc: 0.9887 - val_loss: 0.0372 - val_acc: 0.9912\n", 118 | "Epoch 16/50\n", 119 | "33600/33600 [==============================] - 12s 368us/step - loss: 0.0433 - acc: 0.9874 - val_loss: 0.0386 - val_acc: 0.9905\n", 120 | "Epoch 17/50\n", 121 | "33600/33600 [==============================] - 12s 370us/step - loss: 0.0387 - acc: 0.9893 - val_loss: 0.0377 - val_acc: 0.9904\n", 122 | "Epoch 18/50\n", 123 | "33600/33600 [==============================] - 12s 366us/step - loss: 0.0343 - acc: 0.9907 - val_loss: 0.0392 - val_acc: 0.9901\n", 124 | "Epoch 19/50\n", 125 | "33600/33600 [==============================] - 12s 367us/step - loss: 0.0339 - acc: 0.9905 - val_loss: 0.0330 - val_acc: 0.9923\n", 126 | "Epoch 20/50\n", 127 | "33600/33600 [==============================] - 12s 366us/step - loss: 0.0323 - acc: 0.9913 - val_loss: 0.0385 - val_acc: 0.9914\n", 128 | "Epoch 21/50\n", 129 | "33600/33600 [==============================] - 12s 365us/step - loss: 0.0300 - acc: 0.9918 - val_loss: 0.0314 - val_acc: 0.9918\n", 130 | "Epoch 22/50\n", 131 | "33600/33600 [==============================] - 12s 366us/step - loss: 0.0299 - acc: 0.9913 - val_loss: 0.0346 - val_acc: 0.9919\n", 132 | "Epoch 23/50\n", 133 | "33600/33600 [==============================] - 12s 366us/step - loss: 0.0267 - acc: 0.9924 - val_loss: 0.0363 - val_acc: 0.9918\n", 134 | "Epoch 24/50\n", 135 | "33600/33600 [==============================] - 12s 366us/step - loss: 0.0245 - acc: 0.9935 - val_loss: 0.0350 - val_acc: 0.9923\n", 136 | "Epoch 25/50\n", 137 | "33600/33600 [==============================] - 12s 366us/step - loss: 0.0239 - acc: 0.9932 - val_loss: 0.0379 - val_acc: 0.9906\n", 138 | "Epoch 26/50\n", 139 | "33600/33600 [==============================] - 12s 365us/step - loss: 0.0209 - acc: 0.9937 - val_loss: 0.0386 - val_acc: 0.9915\n", 140 | "Epoch 27/50\n", 141 | "33600/33600 [==============================] - 12s 372us/step - loss: 0.0211 - acc: 0.9941 - val_loss: 0.0352 - val_acc: 0.9921\n", 142 | "Epoch 28/50\n", 143 | "33600/33600 [==============================] - 12s 361us/step - loss: 0.0211 - acc: 0.9939 - val_loss: 0.0345 - val_acc: 0.9914\n", 144 | "Epoch 29/50\n", 145 | "33600/33600 [==============================] - 12s 361us/step - loss: 0.0195 - acc: 0.9948 - val_loss: 0.0407 - val_acc: 0.9920\n", 146 | "Epoch 30/50\n", 147 | "33600/33600 [==============================] - 12s 354us/step - loss: 0.0182 - acc: 0.9947 - val_loss: 0.0363 - val_acc: 0.9929\n", 148 | "Epoch 31/50\n", 149 | "33600/33600 [==============================] - 12s 354us/step - loss: 0.0175 - acc: 0.9951 - val_loss: 0.0362 - val_acc: 0.9927\n", 150 | "Epoch 32/50\n", 151 | "33600/33600 [==============================] - 12s 353us/step - loss: 0.0183 - acc: 0.9951 - val_loss: 0.0358 - val_acc: 0.9931\n", 152 | "Epoch 33/50\n", 153 | "33600/33600 [==============================] - 12s 354us/step - loss: 0.0170 - acc: 0.9951 - val_loss: 0.0399 - val_acc: 0.9918\n", 154 | "Epoch 34/50\n", 155 | 
"33600/33600 [==============================] - 12s 352us/step - loss: 0.0140 - acc: 0.9960 - val_loss: 0.0393 - val_acc: 0.9914\n", 156 | "Epoch 35/50\n", 157 | "33600/33600 [==============================] - 12s 359us/step - loss: 0.0159 - acc: 0.9951 - val_loss: 0.0401 - val_acc: 0.9926\n", 158 | "Epoch 36/50\n", 159 | "33600/33600 [==============================] - 12s 352us/step - loss: 0.0153 - acc: 0.9950 - val_loss: 0.0463 - val_acc: 0.9910\n", 160 | "Epoch 37/50\n", 161 | "33600/33600 [==============================] - 12s 351us/step - loss: 0.0138 - acc: 0.9960 - val_loss: 0.0486 - val_acc: 0.9923\n", 162 | "Epoch 38/50\n", 163 | "33600/33600 [==============================] - 12s 351us/step - loss: 0.0126 - acc: 0.9967 - val_loss: 0.0431 - val_acc: 0.9918\n", 164 | "Epoch 39/50\n", 165 | "33600/33600 [==============================] - 12s 349us/step - loss: 0.0112 - acc: 0.9972 - val_loss: 0.0359 - val_acc: 0.9931\n", 166 | "Epoch 40/50\n", 167 | "33600/33600 [==============================] - 12s 352us/step - loss: 0.0128 - acc: 0.9967 - val_loss: 0.0386 - val_acc: 0.9931\n", 168 | "Epoch 41/50\n", 169 | "33600/33600 [==============================] - 12s 351us/step - loss: 0.0097 - acc: 0.9970 - val_loss: 0.0387 - val_acc: 0.9931\n", 170 | "Epoch 42/50\n", 171 | "33600/33600 [==============================] - 12s 349us/step - loss: 0.0122 - acc: 0.9965 - val_loss: 0.0373 - val_acc: 0.9924\n", 172 | "Epoch 43/50\n", 173 | "33600/33600 [==============================] - 12s 354us/step - loss: 0.0112 - acc: 0.9970 - val_loss: 0.0365 - val_acc: 0.9924\n", 174 | "Epoch 44/50\n", 175 | "33600/33600 [==============================] - 12s 351us/step - loss: 0.0112 - acc: 0.9969 - val_loss: 0.0355 - val_acc: 0.9926\n", 176 | "Epoch 45/50\n", 177 | "33600/33600 [==============================] - 12s 348us/step - loss: 0.0090 - acc: 0.9976 - val_loss: 0.0382 - val_acc: 0.9930\n", 178 | "Epoch 46/50\n", 179 | "33600/33600 [==============================] - 12s 350us/step - loss: 0.0105 - acc: 0.9971 - val_loss: 0.0458 - val_acc: 0.9921\n", 180 | "Epoch 47/50\n", 181 | "33600/33600 [==============================] - 12s 349us/step - loss: 0.0107 - acc: 0.9968 - val_loss: 0.0403 - val_acc: 0.9927\n", 182 | "Epoch 48/50\n", 183 | "33600/33600 [==============================] - 12s 349us/step - loss: 0.0097 - acc: 0.9972 - val_loss: 0.0413 - val_acc: 0.9921\n", 184 | "Epoch 49/50\n", 185 | "33600/33600 [==============================] - 12s 351us/step - loss: 0.0088 - acc: 0.9970 - val_loss: 0.0411 - val_acc: 0.9921\n", 186 | "Epoch 50/50\n", 187 | "33600/33600 [==============================] - 12s 351us/step - loss: 0.0078 - acc: 0.9978 - val_loss: 0.0416 - val_acc: 0.9932\n" 188 | ] 189 | }, 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "" 194 | ] 195 | }, 196 | "execution_count": 20, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "# 牛逼的Sequential类可以让我们灵活地插入不同的神经网络层\n", 203 | "model = Sequential()\n", 204 | "\n", 205 | "# 加上一个2D卷积层, 32个输出(也就是卷积通道),激活函数选用relu,\n", 206 | "# 卷积核的窗口选用3*3像素窗口\n", 207 | "model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))\n", 208 | "model.add(Conv2D(32, (3, 3), activation='relu'))\n", 209 | "\n", 210 | "# 池化层是2*2像素的\n", 211 | "model.add(MaxPooling2D(pool_size=(2, 2)))\n", 212 | "# 对于池化层的输出,采用0.35概率的Dropout\n", 213 | "model.add(Dropout(0.35))\n", 214 | "\n", 215 | "model.add(Conv2D(64, (3, 3), activation='relu'))\n", 216 | "model.add(Conv2D(128, (3, 3), activation='relu'))\n", 217 | 
"# 池化层是2*2像素的\n", 218 | "model.add(MaxPooling2D(pool_size=(2, 2)))\n", 219 | "# 对于池化层的输出,采用0.35概率的Dropout\n", 220 | "model.add(Dropout(0.35))\n", 221 | "\n", 222 | "# 展平所有像素,比如[28*28] -> [784]\n", 223 | "model.add(Flatten())\n", 224 | "model.add(Dense(512, activation='relu'))\n", 225 | "# 对所有像素使用全连接层,输出为128,激活函数选用relu\n", 226 | "model.add(Dense(256, activation='relu'))\n", 227 | "# 对所有像素使用全连接层,输出为64,激活函数选用relu\n", 228 | "model.add(Dense(64, activation='relu'))\n", 229 | "\n", 230 | "# 对输入采用0.5概率的Dropout\n", 231 | "model.add(Dropout(0.5))\n", 232 | "# 对刚才Dropout的输出采用softmax激活函数,得到最后结果0-9\n", 233 | "model.add(Dense(10, activation='softmax'))\n", 234 | "\n", 235 | "# Define the optimizer\n", 236 | "optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)\n", 237 | "model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])\n", 238 | "\n", 239 | "model.fit(x_train, y_train, epochs=50, batch_size=64, verbose=1, validation_data=(x_valid, y_valid))\n", 240 | "\n", 241 | "# loss: 0.0293 - acc: 0.9915 - val_loss: 0.0323 - val_acc: 0.9925\n", 242 | "# LB -> 99.14%" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 21, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "from datetime import datetime\n", 254 | "\n", 255 | "# predict results\n", 256 | "results = model.predict(x_test)\n", 257 | "\n", 258 | "# select the indix with the maximum probability\n", 259 | "results = np.argmax(results,axis = 1)\n", 260 | "\n", 261 | "submit_df= pd.read_csv('../input/sample_submission.csv')\n", 262 | "submit_df.Label = results\n", 263 | "filename = \"../sub/{}.scv\".format(datetime.now().strftime('%Y%m%d_%H_%M'))\n", 264 | "submit_df.to_csv(filename,index=None,encoding='utf-8')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "" 276 | ] 277 | } 278 | ], 279 | "metadata": { 280 | "anaconda-cloud": {}, 281 | "kernelspec": { 282 | "display_name": "Python [conda root]", 283 | "language": "python", 284 | "name": "conda-root-py" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3.0 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.5.2" 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 0 301 | } -------------------------------------------------------------------------------- /digit-recognizer/src/model_knn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import operator 7 | import time 8 | 9 | plt.rcParams['font.sans-serif']=['simhei'] #用于正常显示中文 10 | plt.rcParams['axes.unicode_minus']=False #用于正常显示负号 11 | 12 | testDataSet = pd.read_csv('../input/test.csv') 13 | trainDataSet = pd.read_csv('../input/train.csv') 14 | trainLabel = trainDataSet['label'] 15 | trainData = trainDataSet.iloc[:, 1:785] 16 | m_t,n_t = np.shape(testDataSet) 17 | m_tr,n_tr = np.shape(trainData) 18 | 19 | testDataMat = np.multiply(testDataSet != np.zeros((m_t, n_t)), np.ones((m_t, 1))) 20 | trainDataMat = np.multiply(trainData != np.zeros((m_tr, n_tr)), np.ones((m_tr, 1))) 21 | np.shape(testDataMat)[0] 22 | 23 | 24 | def classify(inX, dataSet, 
labels, k): 25 | dataSetSize = dataSet.shape[0] 26 | diffMat = np.tile(inX, (dataSetSize,1)) - dataSet 27 | sqDiffMat = diffMat**2 28 | sqDistances = sqDiffMat.sum(axis=1) 29 | distances = sqDistances**0.5 30 | sortedDistIndicies = distances.argsort() 31 | classCount={} 32 | for i in range(k): 33 | voteIlabel = labels[sortedDistIndicies[i]] 34 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 35 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 36 | return sortedClassCount[0][0] 37 | 38 | Label = [] 39 | ImageId = [] 40 | np.shape(testDataMat)[0] 41 | i=1 42 | time1 = time.time() 43 | for i in range(np.shape(testDataMat)[0]): 44 | classdifys = classify(testDataMat.iloc[i],trainDataMat,trainLabel,k=10) 45 | ImageId.append(i+1) 46 | Label.append(classdifys) 47 | #print "This is " + str(i+1) +"testdata,classdify is:" + str(classdifys) 48 | time2 = time.time() 49 | train_time = time2-time1 50 | data = pd.DataFrame(Label,ImageId) 51 | data.to_csv('../sub/knn.csv',encoding='utf-8') -------------------------------------------------------------------------------- /digit-recognizer/src/model_svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import time 4 | 5 | from sklearn.model_selection import train_test_split, cross_val_score 6 | from sklearn.svm import SVC 7 | 8 | from feature_engineer import feature_engineer 9 | from utils import submission 10 | 11 | 12 | # 13 | # Load data 14 | # 15 | train, label, test = feature_engineer() 16 | 17 | x_train, x_valid, y_train, y_valid = train_test_split(train, label, test_size=0.2, random_state=0) 18 | print("train shape:{}, test shape:{}".format(x_train.shape, x_valid.shape)) 19 | 20 | svc = SVC() 21 | 22 | # 23 | # cv 24 | # 25 | #start_time = time.time() 26 | #print(cross_val_score(svc, train, label, cv=5, n_jobs=-1)) 27 | # score:[ 0.94122546 0.93538022 0.93808787 0.93902584 0.94104335] 28 | #print("CV time:{}".format(time.time()-start_time)) 29 | 30 | start_time = time.time() 31 | svc.fit(train, label) 32 | print("fit time:{}".format(time.time()-start_time)) 33 | pre = svc.predict(test) 34 | submission(pre) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /digit-recognizer/src/scikit-learn-knn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# -*- coding: utf-8 -*-\n", 12 | "\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "from sklearn.neighbors import KNeighborsClassifier\n", 16 | "from sklearn.ensemble import RandomForestClassifier" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "def load_data():\n", 28 | " test = pd.read_csv('../input/test.csv')\n", 29 | " train_df = pd.read_csv('../input/train.csv')\n", 30 | " \n", 31 | " label = train_df['label']\n", 32 | " train = train_df.drop(['label'], axis=1)\n", 33 | " #归一化处理\n", 34 | " test = test/255\n", 35 | " train = train/255\n", 36 | " print(\"train shape:\" + str(train.shape))\n", 37 | " return train, label, test" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | 
"source": [ 48 | "train, label, test = load_data()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "neighbors = KNeighborsClassifier()\n", 60 | "neighbors.fit(trainData,label)\n", 61 | "pre = neighbors.predict(testData)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "array([2, 0, 9, ..., 3, 9, 2], dtype=int64)" 75 | ] 76 | }, 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "pre" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "dataframe = pd.DataFrame(pre)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 8, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "dataframe.to_csv('data/data.csv',encoding='utf-8')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 119 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 120 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 121 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 122 | " n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n", 123 | " verbose=0, warm_start=False)" 124 | ] 125 | }, 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "rf = RandomForestClassifier()\n", 133 | "rf.fit(trainData,label)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 49, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "predict_data = rf.predict(testData)\n", 145 | "predict_data = predict_data.tolist()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 51, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "rf_dataframe = pd.DataFrame(predict_data)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 55, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "indexs = range(2,28001)\n", 168 | "rf_dataframe.to_csv('data/randonf.csv',index=False,encoding='utf-8')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "anaconda-cloud": {}, 185 | "kernelspec": { 186 | "display_name": "Python [conda root]", 187 | "language": "python", 188 | "name": "conda-root-py" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3.0 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.5.2" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 0 205 | } 
-------------------------------------------------------------------------------- /digit-recognizer/src/tensorflow_model_softmax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | def load_data(): 8 | """ 9 | Load train and validation data set. 10 | parameter: 11 | no 12 | 13 | return: 14 | x_train y_train 15 | x_valid y_valid 16 | test 17 | """ 18 | train = pd.read_csv('../input/train.csv') 19 | test = pd.read_csv('../input/test.csv') 20 | 21 | train_df = train.drop(['label'], axis=1) 22 | label = pd.get_dummies(train['label']) 23 | train_df = train_df.applymap(lambda x: x / 255) 24 | test = test.applymap(lambda x: x / 255) 25 | 26 | x_train, x_valid, y_train, y_valid = train_test_split(train_df, label, test_size=0.2, random_state=2017) 27 | print("x_train shape:" + str(x_train.shape)) 28 | print("x_valid shape:" + str(x_valid.shape)) 29 | 30 | return x_train, y_train, x_valid, y_valid, test 31 | 32 | x_train, y_train, x_valid, y_valid, x_test = load_data() 33 | 34 | # Define tensorflow softmax model 35 | sess =tf.InteractiveSession() 36 | 37 | x = tf.placeholder("float", shape=[None, 784]) 38 | y = tf.placeholder("float", shape=[None, 10]) 39 | w = tf.Variable(tf.zeros([784, 10])) 40 | b = tf.Variable(tf.zeros([10])) 41 | yhat = tf.nn.softmax(tf.matmul(x, w) + b) 42 | 43 | # cost function 44 | cross_entropy = -tf.reduce_sum(y*tf.log(yhat)) 45 | # gradient descent optimizer, learn rate 0.01 46 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) 47 | # Evaluation model with validation data 48 | correct_prediction = tf.equal(tf.argmax(yhat, 1), tf.argmax(y, 1)) 49 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 50 | # init variables 51 | sess.run(tf.global_variables_initializer()) 52 | 53 | train_size = x_train.shape[0] 54 | batch_size = 64 55 | 56 | for i in range(1000): 57 | start = i * batch_size % train_size 58 | end = (i + 1) * batch_size % train_size 59 | # print(start, end) 60 | if start > end: 61 | start = 0 62 | 63 | batch_x = x_train[start:end] 64 | batch_y = y_train[start:end] 65 | 66 | if i % 100 == 0: 67 | print("train-loss ======== > {}".format(cross_entropy.eval(feed_dict={x: batch_x, y: batch_y}))) 68 | print("train-accuracy ==== > {}".format(accuracy.eval({x: x_train, y: y_train}))) 69 | sess.run(train_step, feed_dict={x: batch_x, y: batch_y}) 70 | 71 | print("validation accuracy score is {}".format(accuracy.eval({x: x_valid, y: y_valid}))) 72 | 73 | # Prediction and submission 74 | y_pre = sess.run(tf.nn.softmax(tf.matmul(x, w) + b), feed_dict={x:x_test}) 75 | pre_label = np.argmax(y_pre, axis=1) 76 | 77 | 78 | submit_df = pd.read_csv('../input/sample_submission.csv') 79 | submit_df.Label = pre_label 80 | submit_df.to_csv('tensorflow_lr.csv', index=None, encoding='utf-8') 81 | -------------------------------------------------------------------------------- /digit-recognizer/src/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from datetime import datetime 4 | 5 | 6 | def load_data(): 7 | print("load data...") 8 | train = pd.read_csv('../input/train.csv') 9 | test = pd.read_csv('../input/test.csv') 10 | 11 | return train, test 12 | 13 | 14 | def submission(pre): 15 | sample = pd.read_csv('../input/sample_submission.csv') 16 | 17 | sample['Label'] = pre 18 | 19 | 
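    # Build a timestamped file name (YYYYMMDD_HH_MM.csv) so repeated runs do not overwrite earlier submissions.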
submit_file = '../sub/{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M')) 20 | sample.to_csv(submit_file, index=False) -------------------------------------------------------------------------------- /digit-recognizer/src/xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "/home/shunguo/SDE/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 15 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import xgboost as xgb\n", 23 | "from sklearn.cross_validation import train_test_split" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## load data and cleaning data" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "train_df = pd.read_csv('data/train.csv')\n", 42 | "test_df = pd.read_csv('data/test.csv')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## make xgboost DMatrix" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 9, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "train, valid = train_test_split(train_df, test_size = 0.2, random_state=1)\n", 61 | "\n", 62 | "y_train = train.label\n", 63 | "x_train = train.drop(['label'], axis=1)/255\n", 64 | "\n", 65 | "y_valid = valid.label\n", 66 | "x_valid = valid.drop(['label'], axis=1)/255\n", 67 | "\n", 68 | "xgb_train = xgb.DMatrix(x_train, label=y_train)\n", 69 | "xgb_valid = xgb.DMatrix(x_valid, label=y_valid)\n", 70 | "xgb_test = xgb.DMatrix(test_df)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 10, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "(33600, 784) (8400, 784)\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "print x_train.shape, x_valid.shape" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## xgb params" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 13, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "[0]\ttrain-merror:0.146637\tvalid-merror:0.173452\n", 111 | "Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.\n", 112 | "\n", 113 | "Will train until valid-merror hasn't improved in 100 rounds.\n", 114 | "[10]\ttrain-merror:0.029762\tvalid-merror:0.059048\n", 115 | "[20]\ttrain-merror:0.009911\tvalid-merror:0.044048\n", 116 | "[30]\ttrain-merror:0.002619\tvalid-merror:0.036548\n", 117 | "[40]\ttrain-merror:0.000506\tvalid-merror:0.033095\n", 118 | "[50]\ttrain-merror:3e-05\tvalid-merror:0.030476\n", 119 | 
"[60]\ttrain-merror:0\tvalid-merror:0.029286\n", 120 | "[70]\ttrain-merror:0\tvalid-merror:0.02869\n", 121 | "[80]\ttrain-merror:0\tvalid-merror:0.027619\n", 122 | "[90]\ttrain-merror:0\tvalid-merror:0.027262\n", 123 | "[100]\ttrain-merror:0\tvalid-merror:0.026786\n", 124 | "[110]\ttrain-merror:0\tvalid-merror:0.026667\n", 125 | "[120]\ttrain-merror:0\tvalid-merror:0.026429\n", 126 | "[130]\ttrain-merror:0\tvalid-merror:0.025952\n", 127 | "[140]\ttrain-merror:0\tvalid-merror:0.025952\n", 128 | "[150]\ttrain-merror:0\tvalid-merror:0.025595\n", 129 | "[160]\ttrain-merror:0\tvalid-merror:0.025833\n", 130 | "[170]\ttrain-merror:0\tvalid-merror:0.025714\n", 131 | "[180]\ttrain-merror:0\tvalid-merror:0.025476\n", 132 | "[190]\ttrain-merror:0\tvalid-merror:0.025357\n", 133 | "[200]\ttrain-merror:0\tvalid-merror:0.024762\n", 134 | "[210]\ttrain-merror:0\tvalid-merror:0.024762\n", 135 | "[220]\ttrain-merror:0\tvalid-merror:0.024643\n", 136 | "[230]\ttrain-merror:0\tvalid-merror:0.024643\n", 137 | "[240]\ttrain-merror:0\tvalid-merror:0.024405\n", 138 | "[250]\ttrain-merror:0\tvalid-merror:0.024405\n", 139 | "[260]\ttrain-merror:0\tvalid-merror:0.024643\n", 140 | "[270]\ttrain-merror:0\tvalid-merror:0.024167\n", 141 | "[280]\ttrain-merror:0\tvalid-merror:0.024167\n", 142 | "[290]\ttrain-merror:0\tvalid-merror:0.024286\n", 143 | "[300]\ttrain-merror:0\tvalid-merror:0.024405\n", 144 | "[310]\ttrain-merror:0\tvalid-merror:0.024286\n", 145 | "[320]\ttrain-merror:0\tvalid-merror:0.024048\n", 146 | "[330]\ttrain-merror:0\tvalid-merror:0.023929\n", 147 | "[340]\ttrain-merror:0\tvalid-merror:0.024048\n", 148 | "[350]\ttrain-merror:0\tvalid-merror:0.024048\n", 149 | "[360]\ttrain-merror:0\tvalid-merror:0.023929\n", 150 | "[370]\ttrain-merror:0\tvalid-merror:0.024048\n", 151 | "[380]\ttrain-merror:0\tvalid-merror:0.024286\n", 152 | "[390]\ttrain-merror:0\tvalid-merror:0.024167\n", 153 | "[400]\ttrain-merror:0\tvalid-merror:0.024048\n", 154 | "[410]\ttrain-merror:0\tvalid-merror:0.024048\n", 155 | "[420]\ttrain-merror:0\tvalid-merror:0.023929\n", 156 | "[430]\ttrain-merror:0\tvalid-merror:0.023929\n", 157 | "[440]\ttrain-merror:0\tvalid-merror:0.02381\n", 158 | "[450]\ttrain-merror:0\tvalid-merror:0.02381\n", 159 | "[460]\ttrain-merror:0\tvalid-merror:0.024048\n", 160 | "Stopping. Best iteration:\n", 161 | "[362]\ttrain-merror:0\tvalid-merror:0.02381\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "params={\n", 168 | " 'booster':'gbtree',\n", 169 | " 'objective': 'multi:softmax', #多分类的问题\n", 170 | " 'num_class':10, # 类别数,与 multisoftmax 并用\n", 171 | " \n", 172 | " 'eta': 0.3, # 如同学习率 scikit-learn:eta –> learning_rate \n", 173 | " 'max_depth':6, # 构建树的深度,越大越容易过拟合\n", 174 | " 'colsample_bytree':0.3, # 生成树时进行的列采样\n", 175 | " 'seed':400,\n", 176 | " \n", 177 | " 'silent':0 ,#设置成1则没有运行信息输出,最好是设置为0.\n", 178 | " 'nthread':4,# cpu 线程数\n", 179 | "}\n", 180 | "\n", 181 | "watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]\n", 182 | "clf = xgb.train(params, xgb_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 14, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/html": [ 195 | "
\n", 196 | "\n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | "
ImageIdLabel
012
120
239
349
453
\n", 232 | "
" 233 | ], 234 | "text/plain": [ 235 | " ImageId Label\n", 236 | "0 1 2\n", 237 | "1 2 0\n", 238 | "2 3 9\n", 239 | "3 4 9\n", 240 | "4 5 3" 241 | ] 242 | }, 243 | "execution_count": 14, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "pre_test = clf.predict(xgb_test)\n", 250 | "submit_df = pd.read_csv('data/sample_submission.csv')\n", 251 | "submit_df.Label = pre_test.astype(int)\n", 252 | "submit_df.to_csv('data/xgboost.csv',index=None,encoding='utf-8')\n", 253 | "submit_df.head()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 36, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "" 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "anaconda-cloud": {}, 270 | "kernelspec": { 271 | "display_name": "Python 2", 272 | "language": "python", 273 | "name": "python2" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 2.0 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython2", 285 | "version": "2.7.13" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 0 290 | } -------------------------------------------------------------------------------- /house-predict/src/EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | <<<<<<< HEAD 7 | "metadata": { 8 | "collapsed": true 9 | }, 10 | ======= 11 | "metadata": {}, 12 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 13 | "outputs": [], 14 | "source": [ 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "import matplotlib.pyplot as plt" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Load Data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | <<<<<<< HEAD 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | ======= 35 | "metadata": {}, 36 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 37 | "outputs": [], 38 | "source": [ 39 | "train = pd.read_csv('../input/train.csv')\n", 40 | "test = pd.read_csv('../input/test.csv')\n", 41 | "sample = pd.read_csv('../input/sample_submission.csv')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### 1.Data describe" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | <<<<<<< HEAD 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | ======= 59 | "metadata": {}, 60 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", 216 | "

5 rows × 81 columns

\n", 217 | "
" 218 | <<<<<<< HEAD 219 | ], 220 | "text/plain": [ 221 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 222 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 223 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 224 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", 225 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", 226 | "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", 227 | "\n", 228 | " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n", 229 | "0 Lvl AllPub ... 0 NaN NaN NaN 0 \n", 230 | "1 Lvl AllPub ... 0 NaN NaN NaN 0 \n", 231 | "2 Lvl AllPub ... 0 NaN NaN NaN 0 \n", 232 | "3 Lvl AllPub ... 0 NaN NaN NaN 0 \n", 233 | "4 Lvl AllPub ... 0 NaN NaN NaN 0 \n", 234 | "\n", 235 | " MoSold YrSold SaleType SaleCondition SalePrice \n", 236 | "0 2 2008 WD Normal 208500 \n", 237 | "1 5 2007 WD Normal 181500 \n", 238 | "2 9 2008 WD Normal 223500 \n", 239 | "3 2 2006 WD Abnorml 140000 \n", 240 | "4 12 2008 WD Normal 250000 \n", 241 | "\n", 242 | "[5 rows x 81 columns]" 243 | ] 244 | }, 245 | "execution_count": 3, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | ======= 249 | ] 250 | }, 251 | "output_type": "execute_result", 252 | "metadata": {} 253 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 254 | } 255 | ], 256 | "source": [ 257 | "train.head()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 4, 263 | <<<<<<< HEAD 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | ======= 268 | "metadata": {}, 269 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/html": [ 274 | "
\n", 275 | "\n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...ScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleCondition
0146120RH80.011622PaveNaNRegLvlAllPub...1200NaNMnPrvNaN062010WDNormal
1146220RL81.014267PaveNaNIR1LvlAllPub...00NaNNaNGar21250062010WDNormal
2146360RL74.013830PaveNaNIR1LvlAllPub...00NaNMnPrvNaN032010WDNormal
3146460RL78.09978PaveNaNIR1LvlAllPub...00NaNNaNNaN062010WDNormal
41465120RL43.05005PaveNaNIR1HLSAllPub...1440NaNNaNNaN012010WDNormal
\n", 425 | "

5 rows × 80 columns

\n", 426 | "
" 427 | <<<<<<< HEAD 428 | ], 429 | "text/plain": [ 430 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 431 | "0 1461 20 RH 80.0 11622 Pave NaN Reg \n", 432 | "1 1462 20 RL 81.0 14267 Pave NaN IR1 \n", 433 | "2 1463 60 RL 74.0 13830 Pave NaN IR1 \n", 434 | "3 1464 60 RL 78.0 9978 Pave NaN IR1 \n", 435 | "4 1465 120 RL 43.0 5005 Pave NaN IR1 \n", 436 | "\n", 437 | " LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence \\\n", 438 | "0 Lvl AllPub ... 120 0 NaN MnPrv \n", 439 | "1 Lvl AllPub ... 0 0 NaN NaN \n", 440 | "2 Lvl AllPub ... 0 0 NaN MnPrv \n", 441 | "3 Lvl AllPub ... 0 0 NaN NaN \n", 442 | "4 HLS AllPub ... 144 0 NaN NaN \n", 443 | "\n", 444 | " MiscFeature MiscVal MoSold YrSold SaleType SaleCondition \n", 445 | "0 NaN 0 6 2010 WD Normal \n", 446 | "1 Gar2 12500 6 2010 WD Normal \n", 447 | "2 NaN 0 3 2010 WD Normal \n", 448 | "3 NaN 0 6 2010 WD Normal \n", 449 | "4 NaN 0 1 2010 WD Normal \n", 450 | "\n", 451 | "[5 rows x 80 columns]" 452 | ] 453 | }, 454 | "execution_count": 4, 455 | "metadata": {}, 456 | "output_type": "execute_result" 457 | ======= 458 | ] 459 | }, 460 | "output_type": "execute_result", 461 | "metadata": {} 462 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 463 | } 464 | ], 465 | "source": [ 466 | "test.head()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 5, 472 | <<<<<<< HEAD 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | ======= 477 | "metadata": {}, 478 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/html": [ 483 | "
\n", 484 | "\n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | "
IdSalePrice
01461169277.052498
11462187758.393989
21463183583.683570
31464179317.477511
41465150730.079977
\n", 520 | "
" 521 | <<<<<<< HEAD 522 | ], 523 | "text/plain": [ 524 | " Id SalePrice\n", 525 | "0 1461 169277.052498\n", 526 | "1 1462 187758.393989\n", 527 | "2 1463 183583.683570\n", 528 | "3 1464 179317.477511\n", 529 | "4 1465 150730.079977" 530 | ] 531 | }, 532 | "execution_count": 5, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | ======= 536 | ] 537 | }, 538 | "output_type": "execute_result", 539 | "metadata": {} 540 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 541 | } 542 | ], 543 | "source": [ 544 | "sample.head()" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 6, 550 | <<<<<<< HEAD 551 | "metadata": { 552 | "collapsed": false 553 | }, 554 | ======= 555 | "metadata": {}, 556 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 557 | "outputs": [ 558 | { 559 | "name": "stdout", 560 | "output_type": "stream", 561 | "text": [ 562 | "\n", 563 | "RangeIndex: 1460 entries, 0 to 1459\n", 564 | "Data columns (total 81 columns):\n", 565 | "Id 1460 non-null int64\n", 566 | "MSSubClass 1460 non-null int64\n", 567 | "MSZoning 1460 non-null object\n", 568 | "LotFrontage 1201 non-null float64\n", 569 | "LotArea 1460 non-null int64\n", 570 | "Street 1460 non-null object\n", 571 | "Alley 91 non-null object\n", 572 | "LotShape 1460 non-null object\n", 573 | "LandContour 1460 non-null object\n", 574 | "Utilities 1460 non-null object\n", 575 | "LotConfig 1460 non-null object\n", 576 | "LandSlope 1460 non-null object\n", 577 | "Neighborhood 1460 non-null object\n", 578 | "Condition1 1460 non-null object\n", 579 | "Condition2 1460 non-null object\n", 580 | "BldgType 1460 non-null object\n", 581 | "HouseStyle 1460 non-null object\n", 582 | "OverallQual 1460 non-null int64\n", 583 | "OverallCond 1460 non-null int64\n", 584 | "YearBuilt 1460 non-null int64\n", 585 | "YearRemodAdd 1460 non-null int64\n", 586 | "RoofStyle 1460 non-null object\n", 587 | "RoofMatl 1460 non-null object\n", 588 | "Exterior1st 1460 non-null object\n", 589 | "Exterior2nd 1460 non-null object\n", 590 | "MasVnrType 1452 non-null object\n", 591 | "MasVnrArea 1452 non-null float64\n", 592 | "ExterQual 1460 non-null object\n", 593 | "ExterCond 1460 non-null object\n", 594 | "Foundation 1460 non-null object\n", 595 | "BsmtQual 1423 non-null object\n", 596 | "BsmtCond 1423 non-null object\n", 597 | "BsmtExposure 1422 non-null object\n", 598 | "BsmtFinType1 1423 non-null object\n", 599 | "BsmtFinSF1 1460 non-null int64\n", 600 | "BsmtFinType2 1422 non-null object\n", 601 | "BsmtFinSF2 1460 non-null int64\n", 602 | "BsmtUnfSF 1460 non-null int64\n", 603 | "TotalBsmtSF 1460 non-null int64\n", 604 | "Heating 1460 non-null object\n", 605 | "HeatingQC 1460 non-null object\n", 606 | "CentralAir 1460 non-null object\n", 607 | "Electrical 1459 non-null object\n", 608 | "1stFlrSF 1460 non-null int64\n", 609 | "2ndFlrSF 1460 non-null int64\n", 610 | "LowQualFinSF 1460 non-null int64\n", 611 | "GrLivArea 1460 non-null int64\n", 612 | "BsmtFullBath 1460 non-null int64\n", 613 | "BsmtHalfBath 1460 non-null int64\n", 614 | "FullBath 1460 non-null int64\n", 615 | "HalfBath 1460 non-null int64\n", 616 | "BedroomAbvGr 1460 non-null int64\n", 617 | "KitchenAbvGr 1460 non-null int64\n", 618 | "KitchenQual 1460 non-null object\n", 619 | "TotRmsAbvGrd 1460 non-null int64\n", 620 | "Functional 1460 non-null object\n", 621 | "Fireplaces 1460 non-null int64\n", 622 | "FireplaceQu 770 non-null object\n", 623 | "GarageType 1379 non-null object\n", 624 | "GarageYrBlt 1379 non-null float64\n", 625 | "GarageFinish 1379 
non-null object\n", 626 | "GarageCars 1460 non-null int64\n", 627 | "GarageArea 1460 non-null int64\n", 628 | "GarageQual 1379 non-null object\n", 629 | "GarageCond 1379 non-null object\n", 630 | "PavedDrive 1460 non-null object\n", 631 | "WoodDeckSF 1460 non-null int64\n", 632 | "OpenPorchSF 1460 non-null int64\n", 633 | "EnclosedPorch 1460 non-null int64\n", 634 | "3SsnPorch 1460 non-null int64\n", 635 | "ScreenPorch 1460 non-null int64\n", 636 | "PoolArea 1460 non-null int64\n", 637 | "PoolQC 7 non-null object\n", 638 | "Fence 281 non-null object\n", 639 | "MiscFeature 54 non-null object\n", 640 | "MiscVal 1460 non-null int64\n", 641 | "MoSold 1460 non-null int64\n", 642 | "YrSold 1460 non-null int64\n", 643 | "SaleType 1460 non-null object\n", 644 | "SaleCondition 1460 non-null object\n", 645 | "SalePrice 1460 non-null int64\n", 646 | "dtypes: float64(3), int64(35), object(43)\n", 647 | "memory usage: 924.0+ KB\n" 648 | ] 649 | } 650 | ], 651 | "source": [ 652 | "train.info()" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 7, 658 | <<<<<<< HEAD 659 | "metadata": { 660 | "collapsed": false 661 | }, 662 | ======= 663 | "metadata": {}, 664 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "\n", 671 | "RangeIndex: 1459 entries, 0 to 1458\n", 672 | "Data columns (total 80 columns):\n", 673 | "Id 1459 non-null int64\n", 674 | "MSSubClass 1459 non-null int64\n", 675 | "MSZoning 1455 non-null object\n", 676 | "LotFrontage 1232 non-null float64\n", 677 | "LotArea 1459 non-null int64\n", 678 | "Street 1459 non-null object\n", 679 | "Alley 107 non-null object\n", 680 | "LotShape 1459 non-null object\n", 681 | "LandContour 1459 non-null object\n", 682 | "Utilities 1457 non-null object\n", 683 | "LotConfig 1459 non-null object\n", 684 | "LandSlope 1459 non-null object\n", 685 | "Neighborhood 1459 non-null object\n", 686 | "Condition1 1459 non-null object\n", 687 | "Condition2 1459 non-null object\n", 688 | "BldgType 1459 non-null object\n", 689 | "HouseStyle 1459 non-null object\n", 690 | "OverallQual 1459 non-null int64\n", 691 | "OverallCond 1459 non-null int64\n", 692 | "YearBuilt 1459 non-null int64\n", 693 | "YearRemodAdd 1459 non-null int64\n", 694 | "RoofStyle 1459 non-null object\n", 695 | "RoofMatl 1459 non-null object\n", 696 | "Exterior1st 1458 non-null object\n", 697 | "Exterior2nd 1458 non-null object\n", 698 | "MasVnrType 1443 non-null object\n", 699 | "MasVnrArea 1444 non-null float64\n", 700 | "ExterQual 1459 non-null object\n", 701 | "ExterCond 1459 non-null object\n", 702 | "Foundation 1459 non-null object\n", 703 | "BsmtQual 1415 non-null object\n", 704 | "BsmtCond 1414 non-null object\n", 705 | "BsmtExposure 1415 non-null object\n", 706 | "BsmtFinType1 1417 non-null object\n", 707 | "BsmtFinSF1 1458 non-null float64\n", 708 | "BsmtFinType2 1417 non-null object\n", 709 | "BsmtFinSF2 1458 non-null float64\n", 710 | "BsmtUnfSF 1458 non-null float64\n", 711 | "TotalBsmtSF 1458 non-null float64\n", 712 | "Heating 1459 non-null object\n", 713 | "HeatingQC 1459 non-null object\n", 714 | "CentralAir 1459 non-null object\n", 715 | "Electrical 1459 non-null object\n", 716 | "1stFlrSF 1459 non-null int64\n", 717 | "2ndFlrSF 1459 non-null int64\n", 718 | "LowQualFinSF 1459 non-null int64\n", 719 | "GrLivArea 1459 non-null int64\n", 720 | "BsmtFullBath 1457 non-null float64\n", 721 | "BsmtHalfBath 1457 non-null float64\n", 722 | "FullBath 1459 non-null 
int64\n", 723 | "HalfBath 1459 non-null int64\n", 724 | "BedroomAbvGr 1459 non-null int64\n", 725 | "KitchenAbvGr 1459 non-null int64\n", 726 | "KitchenQual 1458 non-null object\n", 727 | "TotRmsAbvGrd 1459 non-null int64\n", 728 | "Functional 1457 non-null object\n", 729 | "Fireplaces 1459 non-null int64\n", 730 | "FireplaceQu 729 non-null object\n", 731 | "GarageType 1383 non-null object\n", 732 | "GarageYrBlt 1381 non-null float64\n", 733 | "GarageFinish 1381 non-null object\n", 734 | "GarageCars 1458 non-null float64\n", 735 | "GarageArea 1458 non-null float64\n", 736 | "GarageQual 1381 non-null object\n", 737 | "GarageCond 1381 non-null object\n", 738 | "PavedDrive 1459 non-null object\n", 739 | "WoodDeckSF 1459 non-null int64\n", 740 | "OpenPorchSF 1459 non-null int64\n", 741 | "EnclosedPorch 1459 non-null int64\n", 742 | "3SsnPorch 1459 non-null int64\n", 743 | "ScreenPorch 1459 non-null int64\n", 744 | "PoolArea 1459 non-null int64\n", 745 | "PoolQC 3 non-null object\n", 746 | "Fence 290 non-null object\n", 747 | "MiscFeature 51 non-null object\n", 748 | "MiscVal 1459 non-null int64\n", 749 | "MoSold 1459 non-null int64\n", 750 | "YrSold 1459 non-null int64\n", 751 | "SaleType 1458 non-null object\n", 752 | "SaleCondition 1459 non-null object\n", 753 | "dtypes: float64(11), int64(26), object(43)\n", 754 | "memory usage: 911.9+ KB\n" 755 | ] 756 | } 757 | ], 758 | "source": [ 759 | "test.info()" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | <<<<<<< HEAD 766 | "metadata": { 767 | "collapsed": true 768 | }, 769 | "outputs": [], 770 | "source": [] 771 | ======= 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "" 776 | ] 777 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 778 | } 779 | ], 780 | "metadata": { 781 | "kernelspec": { 782 | "display_name": "Python 2", 783 | "language": "python", 784 | "name": "python2" 785 | }, 786 | "language_info": { 787 | "codemirror_mode": { 788 | "name": "ipython", 789 | <<<<<<< HEAD 790 | "version": 2 791 | ======= 792 | "version": 2.0 793 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 794 | }, 795 | "file_extension": ".py", 796 | "mimetype": "text/x-python", 797 | "name": "python", 798 | "nbconvert_exporter": "python", 799 | "pygments_lexer": "ipython2", 800 | "version": "2.7.13" 801 | } 802 | }, 803 | "nbformat": 4, 804 | "nbformat_minor": 2 805 | <<<<<<< HEAD 806 | } 807 | ======= 808 | } 809 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 810 | -------------------------------------------------------------------------------- /house-predict/src/house_predict_start.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# House Predict Start" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 40, 13 | <<<<<<< HEAD 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | ======= 18 | "metadata": {}, 19 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 20 | "outputs": [], 21 | "source": [ 22 | "#\n", 23 | "# import libariry\n", 24 | "#\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "from sklearn.metrics import r2_score\n", 29 | "from sklearn.model_selection import cross_val_score\n", 30 | "from sklearn.model_selection import GridSearchCV" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## 1.Load data and data preprocess" 38 | ] 39 | }, 40 | { 41 | "cell_type": 
"code", 42 | "execution_count": 2, 43 | <<<<<<< HEAD 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | ======= 48 | "metadata": {}, 49 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 50 | "outputs": [], 51 | "source": [ 52 | "def load_data():\n", 53 | " train = pd.read_csv('../input/train.csv')\n", 54 | " test = pd.read_csv('../input/test.csv')\n", 55 | " sample = pd.read_csv('../input/sample_submission.csv')\n", 56 | " \n", 57 | " return train, test, sample" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "train shape:(1460, 81)\n", 65 | "\n", 66 | "test shape:(1459, 80)\n", 67 | "\n", 68 | "sample shape:(1459, 2)\n", 69 | "\n", 70 | "train dtypes: float64(3), int64(35), object(43)\n", 71 | "\n", 72 | "test dtypes: float64(11), int64(26), object(43)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | <<<<<<< HEAD 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | ======= 83 | "metadata": {}, 84 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "((1460, 81), (1459, 80), (1459, 2))\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "train, test, sample = load_data()\n", 96 | "\n", 97 | "def clearn_data(data):\n", 98 | " for c in data.columns:\n", 99 | " data[c] = data[c].fillna(-1)\n", 100 | " if data[c].dtypes == 'float64':\n", 101 | " data[c] = data[c].astype(np.float32)\n", 102 | " return data\n", 103 | "\n", 104 | "train = clearn_data(train)\n", 105 | "test = clearn_data(test) \n", 106 | "\n", 107 | "print(train.shape, test.shape, sample.shape)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## 2.Feature Engineering" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 4, 120 | <<<<<<< HEAD 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | ======= 125 | "metadata": {}, 126 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# feature select\n", 138 | "features = []\n", 139 | "for c in train.columns:\n", 140 | " if train[c].dtypes == 'int':\n", 141 | " features.append(c)\n", 142 | "\n", 143 | "print(features)\n", 144 | "\n", 145 | "x_train = train[features]\n", 146 | "x_train = x_train.drop(['Id', 'SalePrice'], axis=1)\n", 147 | "y_train = train['SalePrice']\n", 148 | "\n", 149 | "x_test = test[x_train.columns]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## 3.Model Ensemble" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "**RandomForestRegressor**" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 94, 169 | <<<<<<< HEAD 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | ======= 174 | "metadata": {}, 175 | >>>>>>> 
a725bb7d5fd9ef617e5516df313da7c12c54d1bb 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "(, {}, 0.8413066185753254)\n", 182 | "RF model score:0.858018201381\n", 183 | "\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "from sklearn.ensemble import RandomForestRegressor\n", 194 | "\n", 195 | "# rf model\n", 196 | "rf_params = {}\n", 197 | "rf_params['n_estimators'] = 20 \n", 198 | "rf_params['max_depth'] = 17 # 2 -> 0.66\n", 199 | "#rf_params['max_features'] = 'sqrt' # 1-> 0.399\n", 200 | "#rf_params['min_samples_split'] = 10\n", 201 | "#rf_params['min_samples_leaf'] = 20\n", 202 | "#rf_params['max_leaf_nodes'] = 18\n", 203 | "rf_params['oob_score'] = True\n", 204 | "rf_params['n_jobs'] = -1\n", 205 | "\n", 206 | "rf_param = {\n", 207 | " #min_samples_leaf': range(5, 20, 5),\n", 208 | " #'min_samples_split': range(10, 50, 10)\n", 209 | "}\n", 210 | "\n", 211 | "rf_model = RandomForestRegressor(**rf_params)\n", 212 | "\n", 213 | "#\n", 214 | "# Grid Search\n", 215 | "#\n", 216 | "grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param, cv=5)\n", 217 | "grid_search.fit(x_train, y_train)\n", 218 | "print(grid_search.best_params_, grid_search.best_score_)\n", 219 | "\n", 220 | "rf_model_score = cross_val_score(rf_model, x_train, y_train, cv=10).mean()\n", 221 | "print(\"RF model score:{}\".format(rf_model_score))\n", 222 | "print(rf_model.score)\n", 223 | "\n", 224 | "rf_model.fit(x_train, y_train)\n", 225 | "rf_pre = rf_model.predict(x_test)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "**XGBRegresser**" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 109, 238 | <<<<<<< HEAD 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | ======= 243 | "metadata": {}, 244 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "({}, 0.88223089268776989)\n", 251 | "XGB model score:0.882230892688\n", 252 | "\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "from xgboost import XGBRegressor\n", 262 | "\n", 263 | "# xgb params\n", 264 | "xgb_params = {}\n", 265 | "xgb_params['n_estimators'] = 400\n", 266 | "xgb_params['max_depth'] = 4\n", 267 | "xgb_params['learning_rate'] = 0.07\n", 268 | "xgb_params['min_child_weight'] = 3\n", 269 | "\n", 270 | "xgb_param = {\n", 271 | " #'n_estimators': range(50, 1000, 50),\n", 272 | " #'min_child_weight': range(1, 10, 1)\n", 273 | "}\n", 274 | "\n", 275 | "xgb_model = XGBRegressor(**xgb_params)\n", 276 | "\n", 277 | "#\n", 278 | "# Grid Search\n", 279 | "#\n", 280 | "grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param, cv=5)\n", 281 | "grid_search.fit(x_train, y_train)\n", 282 | "print(grid_search.best_params_, grid_search.best_score_)\n", 283 | "\n", 284 | "#\n", 285 | "# cross_val_score\n", 286 | "#\n", 287 | "xgb_model_score = cross_val_score(xgb_model, x_train, y_train, cv=5).mean()\n", 288 | "print(\"XGB model score:{}\".format(xgb_model_score))\n", 289 | "print(xgb_model.score)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "## Submit" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 88, 302 | <<<<<<< HEAD 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | ======= 307 | "metadata": {}, 308 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 309 | "outputs": [], 310 | "source": [ 311 | "from 
datetime import datetime\n", 312 | "sample['SalePrice'] = rf_pre\n", 313 | "\n", 314 | "submit_file = '../sub/{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M'))\n", 315 | "sample.to_csv(submit_file, index=False, float_format='%.4f')" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | <<<<<<< HEAD 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [] 327 | ======= 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "" 332 | ] 333 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 334 | } 335 | ], 336 | "metadata": { 337 | "kernelspec": { 338 | "display_name": "Python 2", 339 | "language": "python", 340 | "name": "python2" 341 | }, 342 | "language_info": { 343 | "codemirror_mode": { 344 | "name": "ipython", 345 | <<<<<<< HEAD 346 | "version": 2 347 | ======= 348 | "version": 2.0 349 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython2", 356 | "version": "2.7.13" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | <<<<<<< HEAD 362 | } 363 | ======= 364 | } 365 | >>>>>>> a725bb7d5fd9ef617e5516df313da7c12c54d1bb 366 | -------------------------------------------------------------------------------- /requrements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | opencv-python==4.1.0.25 3 | pandas==0.23.0 4 | scikit-learn==0.19.1 5 | tensorflow-gpu==1.13.1 -------------------------------------------------------------------------------- /titanic/README.md: -------------------------------------------------------------------------------- 1 | # Titanic幸存者预测竞赛 2 | 3 | 泰坦尼克号预测竞赛是Kaggle平台的入门级竞赛,通过这个竞赛我们可以熟悉Kaggle竞赛的完整流程。 4 | 5 | 教程正在不断完善的过程,只需要关注我提到的文件就可以了,以前的很多代码可能有问题,请见谅。 6 | 7 | ## 如何开始Kaggle竞赛? 8 | 9 | - 1.访问Kaggle网站:https://www.kaggle.com 10 | - 2.注册Kaggle账号。这里很多同学反映注册不了,收不到验证码,这个是qiang的问题,请自行找梯子。 11 | - 3.参加Titanic竞赛。 12 | 13 | ## 机器学习环境? 14 | 15 | 我目前的机器学习环境是基于python3.6平台,可以下载anaconda,具体的安装方法请自行百度。 16 | 17 | - Python3.6 18 | - 常用库:Numpy,Scipy,Matplotlib,sciket-learn 19 | - 开发工具:推荐jupyter,或者pycharm、vscode 20 | - 电脑配置? 入门级别的比赛对机器的性能要求并不高,一般笔记本应该都没有问题。 21 | 22 | ## Start 23 | 24 | ### 准备工作 25 | 26 | 1.创建input、sub文件夹 27 | 28 | 在kaggle-start/titanic下创建新的文件夹input、sub,input存放数据,sub存放结果。 29 | 30 | 2.下载训练数据 31 | 32 | Kaggle的竞赛一般的数据文件有: 33 | 34 | - train.csv //训练数据 35 | - test.csv // 测试数据 36 | - submission.csv //提交示例 37 | 38 | 39 | ## 待续。。。 40 | 41 | 42 | -------------------------------------------------------------------------------- /titanic/example/titanic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn import tree 6 | 7 | if __name__ == "__main__": 8 | print "load data..." 9 | train = pd.read_csv("../data/train.csv") 10 | test = pd.read_csv("../data/test.csv") 11 | print train.info(), test.info() 12 | 13 | print "clean data..." 14 | # Age 15 | train["Age"] = train["Age"].fillna(train["Age"].median()) 16 | test["Age"] = test["Age"].fillna(test["Age"].median()) 17 | # Fare 18 | train["Sex"] = train["Sex"].apply(lambda x: 1 if x == "male" else 0) 19 | test["Sex"] = test["Sex"].apply(lambda x: 1 if x == "male" else 0) 20 | 21 | # Select Features 22 | feature = ["Age", "Sex"] 23 | 24 | print "Fix model..." 
25 | # Model of DecisionTree 26 | dt = tree.DecisionTreeClassifier() 27 | dt = dt.fit(train[feature], train["Survived"]) 28 | 29 | print "Predict in test data set..." 30 | predict_data = dt.predict(test[feature]) 31 | 32 | submission = pd.DataFrame({ 33 | "PassengerId": test["PassengerId"], 34 | "Survived": predict_data 35 | }) 36 | print "write submit file:decision_tree.csv" 37 | submission.to_csv('../data/decision_tree.csv', index=False) 38 | 39 | -------------------------------------------------------------------------------- /titanic/images/submit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wsg011/kaggle-start/0dd948ae407edcc68f790f310d6b7c64e4e8a328/titanic/images/submit.png -------------------------------------------------------------------------------- /titanic/src/2-1-Logistic_predict_Titanic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 利用Logistic算法预测Titanic存活率\n", 8 | "\n", 9 | "\n", 10 | "> “泰坦尼克号”沉没的时候,船长下了一个命令:“让女人和小孩先走!”\n", 11 | "\n", 12 | "我用Titanic竞赛的例子来介绍完整的参加一次kaggle竞赛的基本流程,这篇文章可以希望帮助你解决这几个问题:\n", 13 | "\n", 14 | "- 为什么要参加Kaggle竞赛?\n", 15 | " - 通过kaggle竞赛,入门机器学习。\n", 16 | " - 跟踪机器学习的发展方向\n", 17 | " - 。。。。 \n", 18 | "- 我是小白,我要开始参加哪个比赛?\n", 19 | " - 当然是Titanic了,不然我写这文章干嘛。\n", 20 | "- 参加Kaggle比赛的基本流程是什么?\n", 21 | " - 了解竞赛的基本信息。\n", 22 | " - Data Explore Analysis(DEA):下载数据,探索数据\n", 23 | " - Data Clearning:清洗数据\n", 24 | " - Feature engineering:特征工程\n", 25 | " - Model turning:模型训练和模型评估\n", 26 | " - 终极杀器:Model ensemble,模型融合\n", 27 | "\n", 28 | "本篇文章的目的是帮助大家入门Kaggle平台,所以不会有很高的分数,主要以介绍基本流程为主,以完成Titanic竞赛的baseline为目标。\n", 29 | "\n", 30 | "下一篇文章才会教大家如何在baseline的基础上去提高竞赛的成绩。\n", 31 | "\n", 32 | "具体的操作还有问题的请看看我之前的微信公众号文章。\n", 33 | "\n", 34 | "- Kaggle竞赛入门:(一)机器学习环境搭建 https://mp.weixin.qq.com/s?__biz=MzAxMTU3NTkzOQ==&mid=2662346159&idx=1&sn=8552a2228e7ef5defc95129c7a2d3245&chksm=80fa7a43b78df3552d29dab8d6c17b99d50eba2be7dabf7a54b257a4dc75ae9f22058d7f3a3f#rd\n", 35 | "- kaggle平台入门(二)Titanic初试身手 https://mp.weixin.qq.com/s?__biz=MzAxMTU3NTkzOQ==&mid=2662346192&idx=1&sn=d662c08f7d356030dd5a3ab67b8a82d4&chksm=80fa7a3cb78df32a5b6e5c120624f1dec3c752568decce55169b61939a25d1209393f01ae3f8#rd" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### 导入需要用的包\n", 43 | "\n", 44 | "Python数据分析中,最为常用的包是Numpy,pandas,matplotlib,sciket-learn.\n", 45 | "\n", 46 | "Numpy主要是用于矩阵运算。\n", 47 | "\n", 48 | "Pandas用于数据处理和数据分析。\n", 49 | "\n", 50 | "matplotlib是Python下的最强大的数据可视化工具。\n", 51 | "\n", 52 | "sciket-learn是目前Python机器学习中运用最多的库,他包含了我们在机器学习中常用的模型。本文我引入了tree下的决策树模型。" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import numpy as np\n", 62 | "import pandas as pd\n", 63 | "import matplotlib.pyplot as plt\n", 64 | "from sklearn.tree import DecisionTreeClassifier" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### 数据探索\n", 72 | "\n", 73 | "开始一个竞赛之前,我们必须花时间来了解一下我们的比赛数据。\n", 74 | "\n", 75 | "首先,Kaggle的大部分比赛提供的文件都是csv格式的,我们用pandas.read_csv()这个方法来读取数据。" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 2, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "train = pd.read_csv('../input/train.csv')\n", 85 | "test = pd.read_csv('../input/test.csv')\n", 86 | "submission = 
pd.read_csv('../input/gender_submission.csv')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "看看我们的数据都有什么?\n", 94 | "\n", 95 | "pandas提供了几个方法,一个是pandas.head(),用于查看数据的前几行数据。\n", 96 | "\n", 97 | "我们调用这个方法后,就可以看到我们数据的前5行。\n", 98 | "\n", 99 | "我解释一下比较重要的数据:\n", 100 | "- PassengerId # 乘客id\n", 101 | "- Survived # 是否获救,1存活,0未存活\n", 102 | "- Pclass # 船舱的等级,分为1,2,3个不同的舱位\n", 103 | "- Name # 乘客姓名\n", 104 | "- Sex # 性别,注意妇女先走\n", 105 | "- Age # 年龄,注意小孩先走\n", 106 | "- SibSp # \n", 107 | "- Parch\n", 108 | "- Ticket\n", 109 | "- Fare # 船票价格 \n", 110 | "- Cabin\n", 111 | "- Embarked \n", 112 | "\n", 113 | "其实这些描述在比赛的说明里都有的,只是需要花点时间认真读一下。" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/html": [ 124 | "
\n", 125 | "\n", 138 | "\n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | "
   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S
 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C
 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S
 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S
 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S
" 235 | ], 236 | "text/plain": [ 237 | " PassengerId Survived Pclass \\\n", 238 | "0 1 0 3 \n", 239 | "1 2 1 1 \n", 240 | "2 3 1 3 \n", 241 | "3 4 1 1 \n", 242 | "4 5 0 3 \n", 243 | "\n", 244 | " Name Sex Age SibSp \\\n", 245 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 246 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 247 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 248 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 249 | "4 Allen, Mr. William Henry male 35.0 0 \n", 250 | "\n", 251 | " Parch Ticket Fare Cabin Embarked \n", 252 | "0 0 A/5 21171 7.2500 NaN S \n", 253 | "1 0 PC 17599 71.2833 C85 C \n", 254 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 255 | "3 0 113803 53.1000 C123 S \n", 256 | "4 0 373450 8.0500 NaN S " 257 | ] 258 | }, 259 | "execution_count": 4, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "train.head()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 12, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/html": [ 276 | "
\n", 277 | "\n", 290 | "\n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | "
   | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q
 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S
 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q
 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S
 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S
" 381 | ], 382 | "text/plain": [ 383 | " PassengerId Pclass Name Sex \\\n", 384 | "0 892 3 Kelly, Mr. James male \n", 385 | "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", 386 | "2 894 2 Myles, Mr. Thomas Francis male \n", 387 | "3 895 3 Wirz, Mr. Albert male \n", 388 | "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", 389 | "\n", 390 | " Age SibSp Parch Ticket Fare Cabin Embarked \n", 391 | "0 34.5 0 0 330911 7.8292 NaN Q \n", 392 | "1 47.0 1 0 363272 7.0000 NaN S \n", 393 | "2 62.0 0 0 240276 9.6875 NaN Q \n", 394 | "3 27.0 0 0 315154 8.6625 NaN S \n", 395 | "4 22.0 1 1 3101298 12.2875 NaN S " 396 | ] 397 | }, 398 | "execution_count": 12, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "test.head()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "另外一个比较重要的方法就是.describe(),这个方法会快速帮我们计算一些特征的统计指标。" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 5, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/html": [ 422 | "
\n", 423 | "\n", 436 | "\n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | "
      | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000
mean  | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208
std   | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429
min   | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000
25%   | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400
50%   | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200
75%   | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000
max   | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200
" 533 | ], 534 | "text/plain": [ 535 | " PassengerId Survived Pclass Age SibSp \\\n", 536 | "count 891.000000 891.000000 891.000000 714.000000 891.000000 \n", 537 | "mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n", 538 | "std 257.353842 0.486592 0.836071 14.526497 1.102743 \n", 539 | "min 1.000000 0.000000 1.000000 0.420000 0.000000 \n", 540 | "25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n", 541 | "50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n", 542 | "75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n", 543 | "max 891.000000 1.000000 3.000000 80.000000 8.000000 \n", 544 | "\n", 545 | " Parch Fare \n", 546 | "count 891.000000 891.000000 \n", 547 | "mean 0.381594 32.204208 \n", 548 | "std 0.806057 49.693429 \n", 549 | "min 0.000000 0.000000 \n", 550 | "25% 0.000000 7.910400 \n", 551 | "50% 0.000000 14.454200 \n", 552 | "75% 0.000000 31.000000 \n", 553 | "max 6.000000 512.329200 " 554 | ] 555 | }, 556 | "execution_count": 5, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "train.describe()" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 13, 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/html": [ 573 | "
\n", 574 | "\n", 587 | "\n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | "
      | PassengerId | Pclass | Age | SibSp | Parch | Fare
count | 418.000000 | 418.000000 | 418.000000 | 418.000000 | 418.000000 | 417.000000
mean  | 1100.500000 | 2.265550 | 29.599282 | 0.447368 | 0.392344 | 35.627188
std   | 120.810458 | 0.841838 | 12.703770 | 0.896760 | 0.981429 | 55.907576
min   | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000
25%   | 996.250000 | 1.000000 | 23.000000 | 0.000000 | 0.000000 | 7.895800
50%   | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200
75%   | 1204.750000 | 3.000000 | 35.750000 | 1.000000 | 0.000000 | 31.500000
max   | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200
" 675 | ], 676 | "text/plain": [ 677 | " PassengerId Pclass Age SibSp Parch Fare\n", 678 | "count 418.000000 418.000000 418.000000 418.000000 418.000000 417.000000\n", 679 | "mean 1100.500000 2.265550 29.599282 0.447368 0.392344 35.627188\n", 680 | "std 120.810458 0.841838 12.703770 0.896760 0.981429 55.907576\n", 681 | "min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n", 682 | "25% 996.250000 1.000000 23.000000 0.000000 0.000000 7.895800\n", 683 | "50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n", 684 | "75% 1204.750000 3.000000 35.750000 1.000000 0.000000 31.500000\n", 685 | "max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200" 686 | ] 687 | }, 688 | "execution_count": 13, 689 | "metadata": {}, 690 | "output_type": "execute_result" 691 | } 692 | ], 693 | "source": [ 694 | "test.describe()" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "Titanic就是要求我们根据提供的训练数据,预测test数据中的乘客是否幸存。\n", 702 | "\n", 703 | "其实在一个kaggle竞赛中,我们是需要花很多时间来对竞赛的具体任务深入的分析。这些内容希望教给大家,花一点时间,认真的看看Tantinic这个竞赛的描述。" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### 数据清洗\n", 711 | "\n", 712 | "数据清洗就是需要我们在数据探索的基础上,对数据进行清洗。比如:异常值处理,缺失值处理,数据归一化等。\n", 713 | "\n", 714 | "Titanic这个比赛,比较重要的几个特征是年龄、性别,因为船长说了女人和小孩先走,所以女性和年龄小的幸存的概率更大。\n", 715 | "\n", 716 | "为了让大家快速上手,我不做过多的特征处理,主要是采用数值型特征,包括:Pclass,Age,SibSp,Parch,Fare。" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 3, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "# Age 的count这里只有714行,而其他的列都有891行,说明Age有缺失值,我们用中位数来填充。\n", 726 | "train[\"Age\"] = train[\"Age\"].fillna(train[\"Age\"].median())\n", 727 | "# 对test进行同样的操作\n", 728 | "test[\"Age\"] = test[\"Age\"].fillna(test[\"Age\"].median())\n", 729 | "\n", 730 | "# test的fare也有一个缺失值,同样的处理办法\n", 731 | "train[\"Fare\"] = train[\"Fare\"].fillna(train[\"Fare\"].median())\n", 732 | "test[\"Fare\"] = test[\"Fare\"].fillna(test[\"Fare\"].median())" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "### 特征工程\n", 740 | "\n", 741 | "特征工程是机器学习中最重要的一个环节之一,好的特征工程可以显著我们的模型性能。" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 4, 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [ 750 | "# 我选择最简单的几个特征\n", 751 | "features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']\n", 752 | "\n", 753 | "# train,test里对应的Pclass,Age,SibSp,Parch,Fare作为训练数据和测试数据。\n", 754 | "x_train = train[features]\n", 755 | "x_test = test[features]\n", 756 | "# train的Survived字段为我们的训练标记。\n", 757 | "y_train = train['Survived']" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "看一下我们的训练数据和测试数据。" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": 5, 770 | "metadata": {}, 771 | "outputs": [ 772 | { 773 | "data": { 774 | "text/html": [ 775 | "
\n", 776 | "\n", 789 | "\n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | "
   | Pclass | Age | SibSp | Parch | Fare
 0 | 3 | 22.0 | 1 | 0 | 7.2500
 1 | 1 | 38.0 | 1 | 0 | 71.2833
 2 | 3 | 26.0 | 0 | 0 | 7.9250
 3 | 1 | 35.0 | 1 | 0 | 53.1000
 4 | 3 | 35.0 | 0 | 0 | 8.0500
" 844 | ], 845 | "text/plain": [ 846 | " Pclass Age SibSp Parch Fare\n", 847 | "0 3 22.0 1 0 7.2500\n", 848 | "1 1 38.0 1 0 71.2833\n", 849 | "2 3 26.0 0 0 7.9250\n", 850 | "3 1 35.0 1 0 53.1000\n", 851 | "4 3 35.0 0 0 8.0500" 852 | ] 853 | }, 854 | "execution_count": 5, 855 | "metadata": {}, 856 | "output_type": "execute_result" 857 | } 858 | ], 859 | "source": [ 860 | "x_train.head()" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 6, 866 | "metadata": {}, 867 | "outputs": [ 868 | { 869 | "data": { 870 | "text/html": [ 871 | "
\n", 872 | "\n", 885 | "\n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | "
   | Pclass | Age | SibSp | Parch | Fare
 0 | 3 | 34.5 | 0 | 0 | 7.8292
 1 | 3 | 47.0 | 1 | 0 | 7.0000
 2 | 2 | 62.0 | 0 | 0 | 9.6875
 3 | 3 | 27.0 | 0 | 0 | 8.6625
 4 | 3 | 22.0 | 1 | 1 | 12.2875
" 940 | ], 941 | "text/plain": [ 942 | " Pclass Age SibSp Parch Fare\n", 943 | "0 3 34.5 0 0 7.8292\n", 944 | "1 3 47.0 1 0 7.0000\n", 945 | "2 2 62.0 0 0 9.6875\n", 946 | "3 3 27.0 0 0 8.6625\n", 947 | "4 3 22.0 1 1 12.2875" 948 | ] 949 | }, 950 | "execution_count": 6, 951 | "metadata": {}, 952 | "output_type": "execute_result" 953 | } 954 | ], 955 | "source": [ 956 | "x_test.head()" 957 | ] 958 | }, 959 | { 960 | "cell_type": "markdown", 961 | "metadata": {}, 962 | "source": [ 963 | "### 模型训练\n", 964 | "\n", 965 | "准备好训练数据之后,就需要用模型进行训练。\n", 966 | "\n", 967 | "Titanic是要求我们判断乘客是否幸存,就是简单的二分类问题,将数据分为两类(0、1)。\n", 968 | "\n", 969 | "对于二分类问题,我们可以采用比较简单的决策树模型的处理。" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": 7, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [ 978 | "# 创建决策树model\n", 979 | "model_dt = DecisionTreeClassifier()\n", 980 | "# 利用训练数据训练模型,x_train是训练数据,y_trian是训练标记\n", 981 | "model_dt.fit(x_train, y_train)\n", 982 | "# 利用训练好的模型在测试集上进行预测\n", 983 | "y_prediction = model_dt.predict(x_test)" 984 | ] 985 | }, 986 | { 987 | "cell_type": "markdown", 988 | "metadata": {}, 989 | "source": [ 990 | "看一下我们的预测结果。" 991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": 8, 996 | "metadata": {}, 997 | "outputs": [ 998 | { 999 | "data": { 1000 | "text/plain": [ 1001 | "array([0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0],\n", 1002 | " dtype=int64)" 1003 | ] 1004 | }, 1005 | "execution_count": 8, 1006 | "metadata": {}, 1007 | "output_type": "execute_result" 1008 | } 1009 | ], 1010 | "source": [ 1011 | "# 查看前20个预测结果\n", 1012 | "y_prediction[:20]" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "markdown", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "### 模型评估\n", 1020 | "\n", 1021 | "我们的模型训练好了,但是现在你知道模型的性能怎么样吗?\n", 1022 | "\n", 1023 | "这里就需要我们队模型进行评估。在真实的应用场景中,我们需要将数据分为3部份,比例为8:1:1,8为train,1为valid,1为test。\n", 1024 | "\n", 1025 | "这部分的内容我以后再完善。\n", 1026 | "\n", 1027 | "Kaggle比赛中,我们有一部分测试数据是放在Kaggle上的,Kaggle利用这部分数据对我们的模型进行评估,我们在Kaggle上的分数就是这么来的。" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "### 提交结果\n", 1035 | "\n", 1036 | "将我们的预测结果保存到model-dt-submission.csv文件中,并提交。" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 9, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [ 1045 | "submission['Survived'] = y_prediction\n", 1046 | "submission.to_csv('../output/submission.csv',index=False,encoding=\"utf-8\")" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "markdown", 1051 | "metadata": {}, 1052 | "source": [ 1053 | "提交结果。\n", 1054 | "\n", 1055 | "\n", 1056 | "\n", 1057 | "\n", 1058 | "成绩非常不理想,问题出在哪里呢?留下来给大家思考。\n", 1059 | "\n", 1060 | "本篇文章,我用了Titanic的数据集,写了一个baseline,下一篇文章,我会从机器学习的每个环节入手,手把手教大家把成绩提高到Top10%。" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "markdown", 1065 | "metadata": {}, 1066 | "source": [ 1067 | "## 参考资料\n", 1068 | "\n", 1069 | "- " 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": null, 1075 | "metadata": {}, 1076 | "outputs": [], 1077 | "source": [] 1078 | } 1079 | ], 1080 | "metadata": { 1081 | "anaconda-cloud": {}, 1082 | "kernelspec": { 1083 | "display_name": "Python 3", 1084 | "language": "python", 1085 | "name": "python3" 1086 | }, 1087 | "language_info": { 1088 | "codemirror_mode": { 1089 | "name": "ipython", 1090 | "version": 3 1091 | }, 1092 | "file_extension": ".py", 1093 | "mimetype": "text/x-python", 1094 | "name": "python", 1095 | "nbconvert_exporter": 
"python", 1096 | "pygments_lexer": "ipython3", 1097 | "version": "3.6.5" 1098 | } 1099 | }, 1100 | "nbformat": 4, 1101 | "nbformat_minor": 1 1102 | } 1103 | -------------------------------------------------------------------------------- /titanic/src/EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Titanic dataset explore\n", 8 | "\n", 9 | "- 1.Data preprocess\n", 10 | " - 1.1 load data\n", 11 | " - 1.2 data explore" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## 1.Data preprocess\n", 34 | "\n", 35 | "### 1.1 load data\n", 36 | "\n", 37 | "Load dataset, train data have 891 columns, test have 418.\n", 38 | "\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 8, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "\n", 53 | "RangeIndex: 891 entries, 0 to 890\n", 54 | "Data columns (total 12 columns):\n", 55 | "PassengerId 891 non-null int64\n", 56 | "Survived 891 non-null int64\n", 57 | "Pclass 891 non-null int64\n", 58 | "Name 891 non-null object\n", 59 | "Sex 891 non-null object\n", 60 | "Age 714 non-null float64\n", 61 | "SibSp 891 non-null int64\n", 62 | "Parch 891 non-null int64\n", 63 | "Ticket 891 non-null object\n", 64 | "Fare 891 non-null float64\n", 65 | "Cabin 204 non-null object\n", 66 | "Embarked 889 non-null object\n", 67 | "dtypes: float64(2), int64(5), object(5)\n", 68 | "memory usage: 83.6+ KB\n", 69 | "\n", 70 | "RangeIndex: 418 entries, 0 to 417\n", 71 | "Data columns (total 11 columns):\n", 72 | "PassengerId 418 non-null int64\n", 73 | "Pclass 418 non-null int64\n", 74 | "Name 418 non-null object\n", 75 | "Sex 418 non-null object\n", 76 | "Age 332 non-null float64\n", 77 | "SibSp 418 non-null int64\n", 78 | "Parch 418 non-null int64\n", 79 | "Ticket 418 non-null object\n", 80 | "Fare 417 non-null float64\n", 81 | "Cabin 91 non-null object\n", 82 | "Embarked 418 non-null object\n", 83 | "dtypes: float64(2), int64(4), object(5)\n", 84 | "memory usage: 36.0+ KB\n" 85 | ] 86 | }, 87 | { 88 | "data": { 89 | "text/html": [ 90 | "
\n", 91 | "\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S
 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C
 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S
" 171 | ], 172 | "text/plain": [ 173 | " PassengerId Survived Pclass \\\n", 174 | "0 1 0 3 \n", 175 | "1 2 1 1 \n", 176 | "2 3 1 3 \n", 177 | "\n", 178 | " Name Sex Age SibSp \\\n", 179 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 180 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 181 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 182 | "\n", 183 | " Parch Ticket Fare Cabin Embarked \n", 184 | "0 0 A/5 21171 7.2500 NaN S \n", 185 | "1 0 PC 17599 71.2833 C85 C \n", 186 | "2 0 STON/O2. 3101282 7.9250 NaN S " 187 | ] 188 | }, 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "train = pd.read_csv(\"../input/train.csv\")\n", 196 | "test = pd.read_csv(\"../input/test.csv\")\n", 197 | "\n", 198 | "train.info()\n", 199 | "test.info()\n", 200 | "train.head(3)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### 1.2 Data explore" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "**Survived**\n", 215 | "\n", 216 | "0 is unsurvived, 1 is survived." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 11, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcoAAAEVCAYAAABg9KUmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X1cVHXe//H3MAOa3Jh2oVt5k5qoaSDmJZliadnoJd5A\nQMIurZdm6eZtyoqoUNmqpOFqN9ta+bgKb4h0VEzNXW/Sy2zR6MabwohrNTVMSjPAAIH5/eHD+YnC\ncTTHGfX1/Itzzvec+Zwz58x7vl8OB5PdbrcLAADUysvdBQAA4MkISgAADBCUAAAYICgBADBAUAIA\nYICgBADAAEEJAIABgtJDvPjiixo8eLAGDx6sTp06yWq1OqbLyso0ePBg/fLLLyouLtYTTzzxm15r\n2rRp2rlz51Wq/Lc5t19Xw9NPPy2bzXbR/AULFmj16tWG61ZVVWn06NGyWq1asmTJVanHWR999JEW\nLFggSdq8ebNefPHFq/4aK1as0KhRo+pcfjXfh99i7969Gjdu3FXbXmhoqI4cOXLR/F9//VWTJk1S\n//79ZbVatWnTplrX96Rr5Uo4c+4768MPP1RCQsJV2dZ1xw6P07t3b/uePXtqXXb48GF7586dr3FF\n14ennnrKvnLlyita9+jRo/ZOnTrZKysrr3JVl7Zw4UL7888/75Jtnzx50j5jxgx7SEiI/amnnnLJ\na3iyzp072w8fPnzR/LS0NPv06dPtdvvZ975Hjx72wsLCa13edWXDhg32P/zhD+4uwy3oUV4n2rVr\npxMnTmjq1KmOHmZVVZVWrFihmJgYDRkyRL1799ayZcskSTabTaNHj9YzzzyjiIgIRUZG6ptvvpEk\nJSQk6MMPP5Qkbd26VYMHD9bAgQP1+OOPKy8v76LX/sc//qHIyEhFRUUpJiZGu3fvvmg7F0536tRJ\n48ePl9Vq1Ztvvqmnn37a0a6goEDh4eGqqqpy7NfQoUNrbGvevHmaO3euJOn9999XVFSUhgwZomHD\nhqmgoECS9MMPP+i///u/NWDAAI0cOVJFRUW1HrukpCS9/fbbkqR7771Xr7zyioYOHao+ffrof/7n\nf1RSUqInn3xSlZWVioqK0nfffadPP/1UsbGxGjhwoKKiorR9+3bHcY2Pj1dkZKQSEhJks9k0atQo\nDRs2TH379tUTTzyhjRs3KiEhQeHh4Vq8eLEk6fTp0/rzn/+s2NhYWa1WRUVF6f/+7//05ZdfKjMz\nU+vXr9f8+fNls9kcx+rYsWMaNWqUBg4cqIiICL311luSpCNHjuiRRx7RzJkzFR0drb59+2r9+vW1\n7vuGDRvUpEkT/fnPf651+Tnn3oeioiINHz5ckZGRioyM1F//+tda2y9cuNBxbEaMGKHjx4/X2M6F\n283JydGgQYM0dOhQDRo0SM8++6zjPZGk5cuXa8KECcrJyVFERISKi4vVpUuXGu9pbGystm3bpoqK\nCs2aNUuRkZEaNGiQkpKSVFJSIkn69NNPNXjwYA0ZMkQzZsxQdXV1rfVv2rRJMTExkqQ77rhDPXv2\n1IYNGy5qd+6crqysVGpqqmOfx40bp9LS0jrb1zZd27knqc5jfv65cOF0UlKSRo0apQEDBmjWrFl1\nHqtz5/57771X5zVYUFCg4cOHKyoqSoMHD9aKFSsc7RYsWKBHHnlE0dHR+uc//1nrsbwZEJTXmdmz\nZ6t+/fpas2aNysrK9P7772vRokVavXq15s+f7wgXSdq9e7dmzJihDz74QF26dKnxwSRJP/74oxIT\nEzVnzhytXbtWI0aM0Lx58y56zZdeekmpqamy2WwaP368cnJyLlnnmTNn1Lt3b23cuFFxcXHKzc11\nXMg2m01RUVEym82O9jExMVq1apWks8Og2dnZiomJ0a5du7R69WotXbpUq1ev1pNPPqmxY8dKkl54\n4QWFhIRo3bp1mj59uv79739fsq6Kigo1atRImZmZWrhwoV5++WV5e3tr0aJFjuPq7++vcePGadq0\naVq7dq3S0tKUmJiow4cPS5K+/fZbZWRkKCMjQ9LZD+fZs2dr48aN+umnn7Ru3Tq98847evPNN/XX\nv/5V1dXV2r59uwICApSVlaWNGzeqU6dOWrp0qUJCQjR06FD913/9lyZOnFij1smTJyssLExr167V\n8uXLlZ2
 229 | "text/plain": [ 230 | "" 231 | ] 232 | }, 233 | "metadata": {}, 234 | "output_type": "display_data" 235 | } 236 | ], 237 | "source": [ 238 | "fig = plt.figure()\n", 239 | "fig.set(alpha=0.2)\n", 240 | "fig.set_size_inches(16, 9)\n", 241 | "\n", 242 | "plt.subplot2grid((2,2),(0,0))\n", 243 | "train[\"Survived\"].value_counts().plot(kind='bar')\n", 244 | "plt.title('Titanic survived information,1 is survived,0 is unsurvived')\n", 245 | "plt.xlabel('survived')\n", 246 | "plt.ylabel('numbers')\n", 247 | "\n", 248 | "plt.show()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "**Sex**" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 30, 261 | "metadata": { 262 | "collapsed": false 263 | }, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "" 269 | ] 270 | }, 271 | "metadata": {}, 272 | "output_type": "display_data" 273 | }, 274 | { 275 | "data": { 276 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeQAAAFfCAYAAACfo79PAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHDxJREFUeJzt3X9clHW+9/H3wDioM0OSkrYZJgam7eHW8kC7JoV1pO1o\ntR2W1vHQlrYlWS32w18l2Ob6I4uyH2b1oF9oh6Vo3brr1L2R53CnPWij0o0T6qJyr/mL0LaZkQaQ\n6/5jH1HuKqDOcH0dXs+/5JqZaz4XD4fXXNfMXOOwLMsSAACwVYzdAwAAAIIMAIARCDIAAAYgyAAA\nGIAgAwBgAIIMAIABnN250tNPP6333ntPra2tmjp1qtLT0zVv3jw5HA6lpKSoqKhIMTExKi8vV1lZ\nmZxOp/Lz85WVlRXp+QEAiAqOrj6HXF1dreeff16rVq1Sc3OznnvuOdXW1urGG29URkaGCgsLNWHC\nBI0ZM0bTp09XRUWFQqGQfD6fKioq5HK5jrnuxkZ/2DcIPSchob8OHjxk9xhAr8Nj79SVmOg95mVd\nHrJ+//33lZqaqlmzZmnmzJm69NJLVVtbq/T0dElSZmamNm7cqM2bN2vs2LFyuVzyer1KSkpSXV1d\n+LYCxnE6Y+0eAeiVeOxFpy4PWR88eFC7d+/W6tWrtWvXLuXn58uyLDkcDkmS2+2W3+9XIBCQ1/td\n+d1utwKBQOQmBwAginQZ5AEDBig5OVkul0vJycmKi4vT3r17Oy4PBoOKj4+Xx+NRMBg8Yvn3A300\nCQn9eaZ3iuvs8AuAyOGxF326DPKFF16ol156STfeeKP279+v5uZm/ehHP1J1dbUyMjJUVVWliy66\nSGlpaXr00UcVCoXU0tKi+vp6paamdrpuXgM5tSUmenkfAGADHnunrs6eSHUZ5KysLP3xj39UTk6O\nLMtSYWGhhg4dqoULF6q4uFjJycnKzs5WbGys8vLy5PP5ZFmWZs+erbi4uLBuCAAA0arLd1lHEs/w\nTm08SwfswWPv1HVS77IGAACRR5ABADBAt87UZafpy94L6/qemzexW9crLX1BH330oQ4fbpPD4dCs\nWQU677xRJ3SfK1c+rOuum6YhQ4ac0O2Liubr6qv/TRdcMO6Ebg8AMJ/xQbbDjh3btWFDlZ56qkQO\nh0Pbtm3R4sWL9OKL/3FC6/vVr+4K63wAgOjDIeuj8Hg82rdvr9588/dqbNyvlJSRevbZF3XbbTer\noWGnJGnduldVUvK09uzZreuvv0633Xaz1q59UdOm/e3d6JJUXLxc//3f6ztuN2NGnvbs2S1JWr/+\nXT366EMKBAK67745uv32W3T77beovv7PkqSKinLdeKNPd999h3bt2mXL7wEA0HMI8lEkJp6hZcuK\ntXnzJt1yy43y+f5NGzf+32Ne/8CBJj3yyJOaNu0XGjEiRZs2faKWlhZ9/HGNxo+f0HG9yZOv1ttv\nvylJeuutN3TVVdfopZee04UXpuvxx5/WnDn36qGHlurAgSa98kqZnn76BS1bVqy2ttaIbzMAwF4c\nsj6KXbv+IrfbrQULiiRJdXX/o7vvvkMDBw7quM73Pyx25pk/UJ8+fSRJU6Zco//8z/+tpqYmXXxx\nppzO737F//IvV2jWrJs0Zco1CgaDSk4+V9u3/1kff/yRKiv/jyTJ7/9aX3yxS8OHJ3d8MceoUedH\nepMBADYjyEdRX79Nv//977R8ebH69Omjs89OksfjVXz8aWpq+lLDhp2jrVvrNGhQoiTJ4fjuQMO4\ncel66qnH1NjYqLvumnvEej0ej0aOHKXHHivWlVdOkSQNG3aOJk0arUmTrtDBgwf0xhvrNHRoknbs\n2K5Q6Bs5nX20desWTZr0k577BQCnuFnvzbF7BJyEJyc+aPcItiDIR3HJJRO1c+cO3XTT9erfv5/a\n2y3deuuv1KePUw8/vEyDBw/piPHfczgcuvTSy/TRRx/qrLOG/sPlU6Zco7vuukPz5xdKkq6/frqW\nLXtAr7/+mg4dCmr69JuVkJCgf//3X2jmzOkaMCBB/fr1i+j2AgDsx5m6cMI4WxBMxR7yqS2a95A5\nUxcAAIYjyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGMD4zyGH++ML4X47/QcfbFBZ2RpZlqVvvvlG\nOTnXndRJPN566w3Fx8fr4osvOaHbr1v3qpqamjRjxi0nPAMAoOcZH2TTrVixRC++WCav16tDh4L6\nxS98+ud/zlBCwukntL5vz+AFAOhdCPJRvPXWG2po2Kn8/NsVCoU0bVqOhgw5UykpI7V9e70OHQro\ngQeWa8iQM+X1evXKK/+hSy+9TMOHJ2vt2lfkcrlUUvK0Bg4cqGuuyVFDw06tWLFETzzxjPLycnX2\n2cPUp49Tu3bt0uLFy3XmmT/Q+vXvatOmT+X1ejVw4ED95S//T+eem6qf/GSympq+1D33FOi559Zo\n9eontGnTJ2pvb9d1103TxImXa9OmT7Vy5UPyeuMVGxur88//od2/QgDAceI15OMwatT5Wrlyl
caN\ny9Af/vCOJKm4+Al98803uv/+e3X11VeotPR5dXbys+bmZt1wwwzdf//So37707cmT/7bl1RI0jvv\nvKV//dcp+uCDDdqz5ws99VSJHntstV566Tn5/X49/PBSLVr0G61cuUo/+MEPIvgbAABECkHu0ndx\nTU0dKUkaPHiwWlpC+vrrr7V3717deusdevHFMpWUlKq6+gNt2HDkVzX+faCTks6R9Ldvf/qv/6rU\nl182dnz707eGD0/W4cOHtXfvHlVW/kGTJl2p7dv/rC1b6nTbbTfrrrtuV1tbm/bu3a0DBw4oKWmY\nJOmf/ul/ReKXAACIMIJ8FC6XS01NX0qStmyp61jucDiOuF5ra4uKiubrwIEmSdLAgYM0cOBAuVwu\nuVxxamr62/KtW+uOuN236znatz993+TJV2vVqsd0zjnD5fV6NWzYORo7dpyeeOIZPfbYak2ceLnO\nOmuoEhMTtXPnDknS55//T5h+CwCAnsRryEeRkfFjrVtXofz8GRo5cpTcbvdRrzdw4CAVFNytOXNm\nKzY2Vu3th/XjH09QevpFGjr0bBUWztcnn9Ro5MhRx7yvv//2p+/LyrpcK1c+pGXLiiVJ48dn6pNP\nanTrrTepufmQMjOz1L+/W/fcs0CLFxfJ7Xarf//+8nqPffJyAICZ+LYnnDC+7Qmm4tueTm182xMA\nALANQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBk\nAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwADO7lzppz/9qTwejyRp6NChmjlzpubN\nmyeHw6GUlBQVFRUpJiZG5eXlKisrk9PpVH5+vrKysiI6PAAA0aLLIIdCIVmWpdLS0o5lM2fOVEFB\ngTIyMlRYWKjKykqNGTNGpaWlqqioUCgUks/n0/jx4+VyuSK6AQAARIMug1xXV6fm5mZNnz5dbW1t\nuvPOO1VbW6v09HRJUmZmpjZs2KCYmBiNHTtWLpdLLpdLSUlJqqurU1paWsQ3AgCAU12XQe7bt69m\nzJihn/3sZ9q5c6d++ctfyrIsORwOSZLb7Zbf71cgEJDX6+24ndvtViAQ6HTdCQn95XTGnuQmwE6J\nid6urwQAx6G3/l3pMsjDhw/XsGHD5HA4NHz4cA0YMEC1tbUdlweDQcXHx8vj8SgYDB6x/PuBPpqD\nBw+dxOiwW2KiV42NfrvHABBlovnvSmdPNrp8l/Wrr76qZcuWSZL27dunQCCg8ePHq7q6WpJUVVWl\ncePGKS0tTTU1NQqFQvL7/aqvr1dqamqYNgEAgOjW5R5yTk6O5s+fr6lTp8rhcGjJkiVKSEjQwoUL\nVVxcrOTkZGVnZys2NlZ5eXny+XyyLEuzZ89WXFxcT2wDAACnPIdlWZZddx7NhyV6Aw5Zw1Sz3ptj\n9wg4CU9OfNDuESLmpA5ZAwCAyCPIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEI\nMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAA\nggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAY\ngCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAA\nBuhWkJuamnTJJZeovr5eDQ0Nmjp1qnw+n4qKitTe3i5JKi8v17XXXqvc3FytX78+okMDABBtugxy\na2urCgsL1bdvX0nS0qVLVVBQoJdfflmWZamyslKNjY0qLS1VWVmZSkpKVFxcrJaWlogPDwBAtOgy\nyMuXL9fPf/5znXHGGZKk2tpapaenS5IyMzO1ceNGbd68WWPHjpXL5ZLX61VSUpLq6uoiOzkAAFHE\n2dmFr732mk4//XRNmDBBzzzzjCTJsiw5HA5Jktvtlt/vVyAQkNfr7bid2+1WIBDo8s4TEvrL6Yw9\nmflhs8REb9dXAoDj0Fv/rnQa5IqKCjkcDn3wwQf6/PPPNXfuXB04cKDj8mAwqPj4eHk8HgWDwSOW\nfz/Qx3Lw4KGTGB12S0z0qrHRb/cYAKJMNP9d6ezJRqeHrNeuXas1a9aotLRUo0aN0vLly5WZmanq\n6mpJUlVVlcaNG6e0tDTV1NQoFArJ7/ervr5eqamp4d0KAACiWKd7yEczd+5cLVy4UMXFxUpOTlZ2\ndrZiY2OVl5cnn88ny7I0e/ZsxcXFRWJeAACiksOyLMuuO4/mwxK9AYesYapZ782xewSchCcnPmj3\nCBFzwoesAQBAzyDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAY\ngCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAA\nBiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwA\ngAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYwGn3ANFs1ntz7B4BJ+jJiQ/aPQKAXoY9\nZAAADECQAQAwQJeHrA8fPqz77rtPO3bskMPh0P3336+4uDjNmzdPDodDKSkpKioqUkxMjMrLy1VW\nVian06n8/HxlZWX1xDYAAHDK6zLI69evlySVlZWpurpajzzyiCzLUkFBgTIyMlRYWKjKykqNGTNG\npaWlqqioUCgUks/n0/jx4+VyuSK+EQAAnOq6DPLll1+uSy+9VJK0e/duxcfHa+PGjUpPT5ckZWZm\nasOGDYqJidHYsWPlcrnkcrmUlJSkuro6paWlRXQDAACIBt16DdnpdGru3Ll64IEHNGXKFFmWJYfD\nIUlyu93y+/0KBALyer0dt3G73QoEApGZGgCAKNPtjz0tX75cd999t3JzcxUKhTqWB4NBxcfHy+Px\nKBgMHrH8+4E+moSE/nI6Y09gbCCyEhM7/78LIHJ66+OvyyCvW7dO+/bt0y233KJ+/frJ4XDohz/8\noaqrq5WRkaGqqipddNFFSktL06OPPqpQKKSWlhbV19crNTW103UfPHgobBsChFNjo9/uEYBeK5of\nf5092egyyJMmTdL8+fM1bdo0tbW1acGCBRoxYoQWLlyo4uJiJScnKzs7W7GxscrLy5PP55NlWZo9\ne7bi4uLCuiEAAESrLoPcv39/rVy58h+Wr1mz5h+W5ebmKjc3NzyTAQDQi3BiEAAADECQAQAwAEEG\nAMAABBkAAAMQZAAADECQAQAwAEEGAMAABBkAAAMQZAAADECQAQAwAEEGAMAABBkAAAMQZAAADECQ\nAQAwAEEGAMAABBkAAAMQZAAADECQAQAwAEEGAMAA
BBkAAAMQZAAADECQAQAwAEEGAMAABBkAAAMQ\nZAAADECQAQAwAEEGAMAABBkAAAMQZAAADOC0e4Bo1vzhFXaPgBM10e4BAPQ27CEDAGAAggwAgAEI\nMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYoNMvl2ht\nbdWCBQv0xRdfqKWlRfn5+Tr33HM1b948ORwOpaSkqKioSDExMSovL1dZWZmcTqfy8/OVlZXVU9sA\nAMApr9Mgv/766xowYIBWrFihr776Stdcc43OO+88FRQUKCMjQ4WFhaqsrNSYMWNUWlqqiooKhUIh\n+Xw+jR8/Xi6Xq6e2AwCAU1qnQb7iiiuUnZ0tSbIsS7GxsaqtrVV6erokKTMzUxs2bFBMTIzGjh0r\nl8sll8ulpKQk1dXVKS0tLfJbAABAFOg0yG63W5IUCAR0xx13qKCgQMuXL5fD4ei43O/3KxAIyOv1\nHnG7QCDQ5Z0nJPSX0xl7MvMDEZGY6O36SgAiorc+/joNsiTt2bNHs2bNks/n05QpU7RixYqOy4LB\noOLj4+XxeBQMBo9Y/v1AH8vBg4dOcGwgshob/XaPAPRa0fz46+zJRqfvsv7yyy81ffp03XPPPcrJ\nyZEkjR49WtXV1ZKkqqoqjRs3TmlpaaqpqVEoFJLf71d9fb1SU1PDuAkAAES3TveQV69era+//lqr\nVq3SqlWrJEn33nuvFi9erOLiYiUnJys7O1uxsbHKy8uTz+eTZVmaPXu24uLiemQDAACIBg7Lsiy7\n7jyaD0tI0vRl79k9Ak7Qc/Mm2j0CTsKs9+bYPQJOwpMTH7R7hIg54UPWAACgZxBkAAAMQJABADAA\nQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAM\nQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAA\nAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYA\nwAAEGQAAAxBkAAAMQJABADAAQQYAwABOuwcAgHBr/vAKu0fAyZho9wD26NYe8qZNm5SXlydJamho\n0NSpU+Xz+VRUVKT29nZJUnl5ua699lrl5uZq/fr1kZsYAIAo1GWQn332Wd13330KhUKSpKVLl6qg\noEAvv/yyLMtSZWWlGhsbVVpaqrKyMpWUlKi4uFgtLS0RHx4AgGjRZZCTkpL0+OOPd/xcW1ur9PR0\nSVJmZqY2btyozZs3a+zYsXK5XPJ6vUpKSlJdXV3kpgYAIMp0+Rpydna2du3a1fGzZVlyOBySJLfb\nLb/fr0AgIK/X23Edt9utQCDQ5Z0nJPSX0xl7InMDEZWY6O36SgAiorc+/o77TV0xMd/tVAeDQcXH\nx8vj8SgYDB6x/PuBPpaDBw8d790DPaKx0W/3CECvFc2Pv86ebBz3x55Gjx6t6upqSVJVVZXGjRun\ntLQ01dTUKBQKye/3q76+XqmpqSc+MQAAvcxx7yHPnTtXCxcuVHFxsZKTk5Wdna3Y2Fjl5eXJ5/PJ\nsizNnj1bcXFxkZgXAICo1K0gDx06VOXl5ZKk4cOHa82aNf9wndzcXOXm5oZ3OgAAegnO1AUAgAEI\nMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAA\nggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAY\ngCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAA\nBiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAGc4VxZe3u7Fi1apC1btsjlcmnx\n4sUaNmxYOO8CAICoFNY95HfffVctLS367W9/q7vuukvLli0L5+oBAIhaYQ1yTU2NJkyYIEkaM2aM\nPvvss3CuHgCAqBXWIAcCAXk8no6fY2Nj1dbWFs67AAAgKoX1NWSPx6NgMNjxc3t7u5zOY99FYqI3\nnHdvnDcevtruEYBeicceTkVh3UO+4IILVFVVJUn69NNPlZqaGs7VAwAQtRyWZVnhWtm377LeunWr\nLMvSkiVLNGLEiHCtHgCAqBXWIAMAgBPDiUEAADAAQQYAwAAEGQAAAxBkAAAMQJABADBAWE8MgugW\nCAT07LPPav/+/crKytLIkSP58hCgB+3cuVMNDQ0aOXKkBg8eLIfDYfdICCP2kNFtCxYs0Nlnn62G\nhgYNGjRI9957r90jAb3GmjVrVFRUpEceeURvv/22HnjgAbtHQpgRZHTbV199pZycHDmdTl1wwQVq\nb2+3eySg13jzzTf1/PPPy+v16oYbbtCmTZvsHglhRpBxXOrr6yVJe/fuVWxsrM3TAL2HZVlyOBwd\nh6ldLpfNEyHcOFMXum3Lli0qLCxUfX29kpOTVVRUpPPPP9/usYBeYc2aNXrrrbe0e/dupaSk6KKL\nLtKMGTPsHgthRJAB4BRRX1+vrVu3avjw4TrvvPPsHgdhRpDRpYsvvviYl73//vs9OAnQ+zz88MPH\nfDf1nXfe2cPTIJL42BO6RHQB+yQnJ9s9AnoIe8jotk8//VSvvfaaWltbJUn79+9XSUmJzVMBvUNb\nW5v+9Kc/qa2tTZZlaf/+/Zo8ebLdYyGM2ENGty1atEg33XST3nnnHaWmpqqlpcXukYBe47bbblNr\na6v279+vw4cP64wzziDIUYaPPaHbEhISNHnyZHk8Ht1+++3at2+f3SMBvcbBgwdVUlKitLQ0vfba\nawqFQnaPhDAjyOi2mJgYbdu2Tc3Nzdq+fbv++te/2j0S0Gv07dtXktTc3Nzxb0QXXkNGt23btk3b\ntm3T4MGD9Zvf/EZXXXWVbrjhBrvHAnqFtWvX6quvvlKfPn1UWVmpfv366YUXXrB7LIQRryGj21JS\nUnTmmWcqFArpmWee4cT2QA8aMmSI3n//fbW2tqpv376cKS8KsYeMbpszZ45qamoUHx/fcRq/3/3u\nd3aPBfQK2dnZ+vWvf63TTjutYxknB4ku7CGj23bs2KHKykq7xwB6pZSUFGVkZNg9BiKIIKPb0tLS\ntH37dk5UANjgsssu03XXXXfE42/p0qU2ToRwI8joNo/Ho5ycHPXv379jGWfxAnpGaWmpbrrpJnm9\nXrtHQYQQZHRbdXW1PvzwQzmd/LcBetqgQYN05ZVX2j0GIoi/rOi2c845R01NTRo8eLDdowC9Tt++\nfTVjxgyNHj264xMOfLlEdCHI6LaPP/5YEydO1IABAzr+IHDIGugZWVlZdo+ACONjTwAAGIA9ZHTb\ntm3bVFR
UpK+//lpXXXWVUlJSeNYOAGHCuazRbYsXL9bSpUuVkJCgnJwcPf7443aPBABRgyDjuAwb\nNkwOh0Onn3663G633eMAQNQgyOiS3++XJJ122mkqKytTc3Oz3nzzTcXHx9s8GQBED4KMLt18882S\nJLfbrS+++EIJCQn67LPPtGTJEpsnA4Dowbus0aW8vDwdOnRIDQ0NGjFiRMdyh8OhsrIyGycDgOhB\nkNGlw4cPa9++fVq0aJGKioqOuOyss86yaSoAiC4EGQAAA/AaMgAABiDIAAAYgCADAGAAggwAgAEI\nMgAABvj/Svl2S70i14oAAAAASUVORK5CYII=\n", 277 | "text/plain": [ 278 | "" 279 | ] 280 | }, 281 | "metadata": {}, 282 | "output_type": "display_data" 283 | } 284 | ], 285 | "source": [ 286 | "fig = plt.figure()\n", 287 | "fig.set(alpha=0.2)\n", 288 | "fig.set_size_inches(16, 6)\n", 289 | "\n", 290 | "survived = train['Sex'][train[\"Survived\"] == 1].value_counts()\n", 291 | "unsurvived = train['Sex'][train[\"Survived\"] == 0].value_counts()\n", 292 | "df=pd.DataFrame({'Survived':survived, 'unSurvived':unsurvived})\n", 293 | "\n", 294 | "df.plot.bar(stacked=True)\n", 295 | "\n", 296 | "plt.show()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "**Pclass**" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 34, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "" 317 | ] 318 | }, 319 | "metadata": {}, 320 | "output_type": "display_data" 321 | }, 322 | { 323 | "data": { 324 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeQAAAFGCAYAAAC7euwcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGBhJREFUeJzt3X90VPWd//HXJJOJMpnUNKaKYiiRhGrdLOlmw55So2GV\nWFZWdGmUYSMrUBVQSIrySyB4pEKKREVFxEN3bdBNo/SL+i27fjVmN98ST3qaKp6i0RqBLYhuDFhm\nBpgk5u4fXSNUSEKYMO8Zno9z+CN37sz93PA5PHMvN/e6HMdxBAAAoioh2gMAAAAEGQAAEwgyAAAG\nEGQAAAwgyAAAGECQAQAwwB3Njbe1BaK5+ZiRljZEBw8ejvYwEEeYU4g05lT/ZGT4TvoaR8gxwO1O\njPYQEGeYU4g05tTpI8gAABhAkAEAMIAgAwBgAEEGAMCAfl1lfeONNyolJUWSNGzYMN15551atGiR\nXC6XsrOzVVFRoYSEBNXW1qqmpkZut1uzZs1SUVHRoA4eAIB40WeQw+GwHMdRdXV1z7I777xTZWVl\nGjNmjJYvX666ujqNHj1a1dXV2rJli8LhsPx+v8aOHSuPxzOoOwAAQDzoM8gtLS06cuSIpk+frq6u\nLv3oRz/Szp07VVBQIEkqLCzU9u3blZCQoLy8PHk8Hnk8HmVmZqqlpUW5ubmDvhMAAMS6PoN8zjnn\naMaMGfrBD36g3bt364c//KEcx5HL5ZIkeb1eBQIBBYNB+Xxf/sKz1+tVMBjs9bPT0ob0+btrE+e/\n2J/96LeX197Qr/U2btyoxsZGdXV1yeVyaeHChbriiisGtM0f//jHuu2223TRRRcN6P3l5eW65ZZb\nNGbMmAG9HziR3m5QAAwEc+r09BnkESNGaPjw4XK5XBoxYoTOO+887dy5s+f1UCik1NRUpaSkKBQK\nHbf82ECfSDTu6tKfu4Pt2vWhXnnlVT355Ca5XC79/vfvacGCRXrmmX8d0DZvv31uv7d9Mp99dpg7\nmyFiMjJ8zCdEFHOqf07rTl0vvPCCVq9eLUn65JNPFAwGNXbsWDU1NUmSGhoalJ+fr9zcXDU3Nysc\nDisQCKi1tVU5OTkR2oUzKyUlRZ988rF++csX1db238rOHqWnn35Gd911u/bs2S1J2rr1BW3a9JT2\n7/9It956s+6663Y9++wzmjp1shzHkSRVVVXqP/+zvud9M2aUav/+jyRJ9fWv6ZFHHlIwGNTSpQt0\n99136O6771Br6weSpC1banXbbX7dc89c7dmzJyrfBwDAmdNnkCdPnqxAIKApU6aovLxcDz74oO67\n7z499thjuvnmm9XZ2ani4mJlZGSotLRUfr9f06ZNU3l5uZKTk8/EPkRcRsY3tHp1ld5+e4fuuOM2\n+f3/oMbG/3/S9Q8caNfDDz+hqVOn6dJLs7Vjx5vq6OjQb3/brLFjr+xZ7/rrb9C///svJUnbtr2s\nv//7SfrZz36qv/qrAj322FNasOA+PfTQKh040K7nn6/RU0/9i1avrlJnZ+eg7zMAILr6PGXt8Xi0\ndu3aryzfvHnzV5aVlJSopKQkMiOLor17/yCv16slSyokSS0t7+iee+YqPf38nnX+9yBYkjR06EVK\nSkqSJE2cOEn/9m//V+3t7fre9wrldn/5Lb722us0Z85MTZw4SaFQSFlZI/Xhhx/ot7/9jerq/p8k\nKRA4pH379mrEiKyeK9S5MA4A4l9Un/ZkVWvr7/Xii/9HlZVVSkpK0iWXZColxafU1K+pvf1TDR/+\nTb3/fovOPz9DkuRyfXmiIT+/QE8+uU5tbW2aP3/hcZ+bkpKiUaMu07p1VZowYaIkafjwb2r8+Ms1\nfvx1OnjwgF5+eauGDcvUrl0fKhw+Krc7Se+++64KC685c98AAKbNeX1BtIcQE54Y95NoD+GUEOQT\nuOqqcdq9e5dmzrxVQ4acq+5uR7Nnz1NSkltr167WBRdc2BPjP+dyuXT11X+r3/zm17r44mFfeX3i\nxEmaP3+uFi9eLkm69dbpWr36Ab300i90+HBI06ffrrS0NP3jP07TnXdO13nnpencc88d1P0FAESf\ny3GOPfl6ZnFFXv9w9SIijTkV2zhC7h+LR8g8DxkAAOMIMgAABhBkAAAMIMgAABhAkAEAMIAgAwBg\ngPnfQ4705f2Rv
gz+jTe2q6ZmsxzH0dGjRzV58s0aP/77A/68bdteVmpqqr73vasG9P6tW19Qe3u7\nZsy4Y8BjAACceeaDbN2aNQ/qmWdq5PP5dPhwSNOm+fXXfz1GaWlfH9DnfXEHLwDA2YUgn8C2bS9r\nz57dmjXrboXDYU2dOlkXXjhU2dmj9OGHrTp8OKgHHqjUhRcOlc/n0/PP/6uuvvpvNWJElp599nl5\nPB5t2vSU0tPTNWnSZO3Zs1tr1jyoxx/fqNLSEl1yyXAlJbm1d+9erVxZqaFDL1J9/WvaseMt+Xw+\npaen6w9/+C+NHJmj73//erW1tWn69Jn66U83a8OGx7Vjx5vq7u7WzTdP1bhx12jHjrf06KMPyedL\nVWJior797YE9txkAED38H/IpuOyyb+vRR9crP3+MXn31FUlSVdXjOnr0qO6//z7dcMN1qq7+Z/V2\n87MjR47on/5phu6/f9UJn/70heuv/9NDKiTpxRdf1N/93US98cZ27d+/T08+uUnr1m3Qz372UwUC\nAa1du0orVvxYjz66XhdddNEgfgcAAIOFIPfpy7jm5IySJF1wwQXq6Ajr0KFD+vjjjzV79lw980yN\nNm2qVlPTG9q+/fhHNf55oDMzvynpT09/+o//qNOnn7b1PP3pCyNGZOnzzz/Xxx/v17Zt2zR+/AR9\n+OEHeu+9Ft111+2aP/9udXV16eOPP9KBAweUmTlckvQXf/GXg/FNAAAMMoJ8Ah6PR+3tn0qS3nuv\npWe5y+U6br3Ozg5VVCzWgQPtkqT09POVnp4uj8cjjydZ7e1/Wv7++y3Hve+LzznR05+Odf31N2j9\n+nUaOXKkfD6fhg//pvLy8vX44xu1bt0GjRt3jS6+eJgyMjK0e/cuSdK7774Toe8CAOBM4v+QT2DM\nmO9q69YtmjVrhkaNukxer/eE66Wnn6+ysnu0YEG5EhMT1d39ub773StVUPA3GjbsEi1fvlhvvtms\nUaMuO+m2/vzpT8cqKrpGjz76kDZs2CBJGju2UG++2azZs2fqyJHDKiws0pAhXt177xKtXFkhr9er\nIUOGyOc7+c3LAQA28bSnGMCTeRBpzKnYxtOe+oenPQEAgFNGkAEAMIAgAwBgAEEGAMAAggwAgAEE\nGQAAAwgyAAAGEGQAAAwgyAAAGECQAQAwgCADAGAAQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDI\nAAAYQJABADCAIAMAYABBBgDAAIIMAIABBBkAAAMIMgAABhBkAAAMIMgAABhAkAEAMIAgAwBgAEEG\nAMAAggwAgAEEGQAAAwgyAAAGEGQAAAwgyAAAGNCvILe3t+uqq65Sa2ur9uzZoylTpsjv96uiokLd\n3d2SpNraWt10000qKSlRfX39oA4aAIB402eQOzs7tXz5cp1zzjmSpFWrVqmsrEzPPfecHMdRXV2d\n2traVF1drZqaGm3atElVVVXq6OgY9MEDABAv+gxyZWWlbrnlFn3jG9+QJO3cuVMFBQWSpMLCQjU2\nNurtt99WXl6ePB6PfD6fMjMz1dLSMrgjBwAgjrh7e/EXv/iFvv71r+vKK6/Uxo0bJUmO48jlckmS\nvF6vAoGAgsGgfD5fz/u8Xq+CwWCfG09LGyK3O/F0xn/WyMjw9b0ScAqYU4h3sTbHew3yli1b5HK5\n9MYbb+jdd9/VwoULdeDAgZ7XQ6GQUlNTlZKSolAodNzyYwN9MgcPHj6NoZ89MjJ8amsLRHsYiCPM\nKZwNLM7x3n5I6PWU9bPPPqvNmzerurpal112mSorK1VYWKimpiZJUkNDg/Lz85Wbm6vm5maFw2EF\nAgG1trYqJycnsnsBAEAc6/UI+UQWLlyoZcuWqaqqSllZWSouLlZiYqJKS0vl9/vlOI7Ky8uVnJw8\nGOMFACAuuRzHcaK1cYunEyzi9CIijTkV2+a8viDaQ4gJT4z7SbSH8BUDPmUNAADODIIMAIABBBkA\nAAMIMgAABhBkAAAMIMgAABhAkAEAMIAgAwBgAEEGAMAAggwAgAEEGQAAAwgyAAAGEGQAAAwgyAAA\nGECQAQAwgCADAGAAQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDIAAAYQJABADCAIAMAYABBBgDA\nAIIMAIABBBkAAAMIMgAABhBkAAAMIMgAABhAkAEAMIAgAwBgAEEGAMAAggwAgAEEGQAAAwgyAAAG\nEGQAAAwgyAAAGECQAQAwgCADAGAAQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDIAAAYQJABADCA\nIAMAYABBBgDAAIIMAIAB7r5W+Pzzz7V06VLt2rVLLpdL999/v5KTk7Vo0SK5XC5lZ2eroqJCCQkJ\nqq2tVU1Njdxut2bNmqWioqIzsQ8AAMS8PoNcX18vSaqpqVFTU5MefvhhOY6jsrIyjRkzRsuXL1dd\nXZ1Gjx6t6upqbdmyReFwWH6/X2PHjpXH4xn0nQAAINb1GeRrrrlGV199tSTpo48+UmpqqhobG1VQ\nUCBJKiws1Pbt25WQkKC8vDx5PB55PB5lZmaqpaVFubm5g7oDAADEgz6DLElut1sLFy7Uq6++qnXr\n1mn79u1yuVySJK/Xq0AgoGAwKJ/P1/Mer9erYDDY6+empQ2R2514GsM/e2Rk+PpeCTgFzCnEu1ib\n4/0KsiRVVlbqnnvuUUlJicLhcM/yUCik1NRUpaSkKBQKHbf82ECfyMGDhwcw5LNPRoZPbW2BaA8D\ncYQ5hbOBxTne2w8JfV5lvXXrVj311FOSpHPPPVcul0tXXHGFmpqaJEkNDQ3Kz89Xbm6umpubFQ6H\nFQgE1NraqpycnAjtAgAA8a3PI+Tx48dr8eLFmjp1qrq6urRkyRJdeumlWrZsmaqqqpSVlaXi4mIl\nJiaqtLRUfr9fjuOovLxcycnJZ2IfAACIeS7HcZxobdzi6QSLOL2ISGNOxbY5ry+I9hBiwhPjfhLt\nIXzFaZ2yBgAAg48gAwBgAEEGAMAAggwAgAEEGQAAAwgyAAAGEGQAAAwgyAAAGECQAQAwgCADAGAA\nQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDIAAAYQJABADCAIAMAYABBBgDAAIIMAIABBBkAAAMI\nMgAABhBkAAAMIMgAABhAkAEAMIAgAwBgAEEGAMAAggwAgAEEGQAAAwgyAAAGEGQAAAwgyAAAGECQ\nAQAwgCADAGAAQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDIAAAYQJABADCAIAMAYABBBgDAAIIM\nAIAB7mgPwJo5ry+I9hBixhPjfhLtIQBA3OAIGQAAAwgyAAAGEGQAAAwgyAAAGECQAQAwoNerrDs7\nO7VkyRLt27dPHR0dmjVrlkaOHKlFixbJ5XIpOztbFRUVSkhIUG1trWpqauR2uzVr1iwVFRWdqX0A\nACDm9Rrkl156Seedd57WrFmjzz77TJMmTdK3vvUtlZWVacyYMVq+fLnq6u
o0evRoVVdXa8uWLQqH\nw/L7/Ro7dqw8Hs+Z2g8AAGJar0G+7rrrVFxcLElyHEeJiYnauXOnCgoKJEmFhYXavn27EhISlJeX\nJ4/HI4/Ho8zMTLW0tCg3N3fw9wAAgDjQa5C9Xq8kKRgMau7cuSorK1NlZaVcLlfP64FAQMFgUD6f\n77j3BYPBPjeeljZEbnfi6YwfUZSR4et7JZjF3x/iXazN8T7v1LV//37NmTNHfr9fEydO1Jo1a3pe\nC4VCSk1NVUpKikKh0HHLjw30yRw8eHiAw4YFbW2BaA8BA5SR4ePvD3HP4hzv7YeEXq+y/vTTTzV9\n+nTde++9mjx5siTp8ssvV1NTkySpoaFB+fn5ys3NVXNzs8LhsAKBgFpbW5WTkxPBXQAAIL71eoS8\nYcMGHTp0SOvXr9f69eslSffdd59WrlypqqoqZWVlqbi4WImJiSotLZXf75fjOCovL1dycvIZ2QEA\nAOKBy3EcJ1obt3g6gYdL9B8Pl4hdnLKObfw71T8W/40a8ClrAABwZhBkAAAMIMgAABhAkAEAMKDP\n30MGcHq4AKd/LF6AA5xJHCEDAGAAQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDIAAAYQJABADCA\nIAMAYABBBgDAAIIMAIABBBkAAAMIMgAABhBkAAAMIMgAABhAkAEAMIAgAwBgAEEGAMAAggwAgAEE\nGQAAAwgyAAAGEGQAAAwgyAAAGECQAQAwgCADAGAAQQYAwACCDACAAe5oD8CaI7++LtpDiB3joj0A\nAIgfHCEDAGAAQQYAwACCDACAAQQZAAADCDIAAAYQZAAADCDIAAAYQJABADCAIAMAYAB36gKAGMMd\nBfspxu4myBEyAAAGEGQAAAwgyAAAGECQAQAwgCADAGAAQQYAwACCDACAAf0K8o4dO1RaWipJ2rNn\nj6ZMmSK/36+Kigp1d3dLkmpra3XTTTeppKRE9fX1gzdiAADiUJ9Bfvrpp7V06VKFw2FJ0qpVq1RW\nVqbnnntOjuOorq5ObW1tqq6uVk1NjTZt2qSqqip1dHQM+uABAIgXfQY5MzNTjz32WM/XO3fuVEFB\ngSSpsLBQjY2Nevvtt5WXlyePxyOfz6fMzEy1tLQM3qgBAIgzfd46s7i4WHv37u352nEcuVwuSZLX\n61UgEFAwGJTP5+tZx+v1KhgM9rnxtLQhcrsTBzJuGJCR4et7JaCfmE+ItFibU6d8L+uEhC8PqkOh\nkFJTU5WSkqJQKHTc8mMDfTIHDx4+1c3DkLa2QLSHgDjCfEKkWZxTvf2QcMpXWV9++eVqamqSJDU0\nNCg/P1+5ublqbm5WOBxWIBBQa2urcnJyBj5iAADOMqd8hLxw4UItW7ZMVVVVysrKUnFxsRITE1Va\nWiq/3y/HcVReXq7k5OTBGC8AAHGpX0EeNmyYamtrJUkjRozQ5s2bv7JOSUmJSkpKIjs6AADOEtwY\nBAAAAwgyAAAGEGQAAAw45Yu6AJyaI7++LtpDiA3joj0AILo4QgYAwACCDACAAQQZAAADCDIAAAYQ\nZAAADCDIAAAYQJABADCAIAMAYABBBgDAAIIMAIABBBkAAAMIMgAABhBkAAAMIMgAABhAkAEAMIAg\nAwBgAEEGAMAAggwAgAEEGQAAAwgyAAAGEGQAAAwgyAAAGECQAQAwgCADAGAAQQYAwACCDACAAQQZ\nAAADCDIAAAYQZAAADCDIAAAYQJABADCAIAMAYABBBgDAAIIMAIABBBkAAAMIMgAABhBkAAAMIMgA\nABhAkAEAMIAgAwBgAEEGAMAAggwAgAEEGQAAAwgyAAAGEGQAAAwgyAAAGOCO5Id1d3drxYoVeu+9\n9+TxeLRy5UoNHz48kpsAACAuRfQI+bXXXlNHR4d+/vOfa/78+Vq9enUkPx4AgLgV0SA3Nzfryiuv\nlCSNHj1av/vd7yL58QAAxK2InrIOBoNKSUnp+ToxMVFdXV1yu0+8mYwMXyQ3HxEvr70h2kNAnGFO\nIdKYU/EpokfIKSkpCoVCPV93d3efNMYAAOBLEQ3yd77zHTU0NEiS3nrrLeXk5ETy4wEAiFsux3Gc\nSH3YF1dZv//++3IcRw8++KAuvfTSSH08AABxK6JBBgAAA8ONQQAAMIAgAwBgAEEGAMAAggycpTo6\nOqI9BMSJo0ePMp8igCADce71119XUVGRrr32Wm3btq1n+cyZM6M4KsSyDz74QLNnz9bixYvV2Nio\nCRMmaMKECaqvr4/20GIad+0A4tyGDRu0detWdXd3a968eQqHw7rxxhvFL1hgoCoqKjRv3jzt27dP\nc+fO1SuvvKLk5GTNnDlTRUVF0R5ezCLIBpWWlqqzs/O4ZY7jyOVyqaamJkqjQqxKSkrS1772NUnS\n+vXrNW3aNA0dOlQulyvKI0Os6u7uVkFBgSSpqalJ6enpksSdGU8Tv4ds0I4dO7R06VI98cQTSkxM\nPO61iy++OEqjQqxasGCB0tLSNG/ePA0ZMkT79+/XjBkzdOjQIf3qV7+K9vAQg5YsWSKXy6UHHnhA\nCQl/+p/PjRs36p133tEjjzwS5dHFrsQVK1asiPYgcLwLL7xQhw8fVldXl0aPHq3U1NSeP8CpKioq\nUnt7u7Kzs5WUlCSfz6fi4mL98Y9/VGFhYbSHhxj0xWnpY+/EuHfvXt1xxx1KSkqK1rBiHkfIAAAY\nwFXWAAAYQJABADCAIAMAYABBBgDAAIIMAIAB/wPT1TC87FDQsgAAAABJRU5ErkJggg==\n", 325 | "text/plain": [ 326 | "" 327 | ] 328 | }, 329 | "metadata": {}, 330 | "output_type": "display_data" 331 | } 332 | ], 333 | "source": [ 334 | "survived = train['Pclass'][train[\"Survived\"] == 1].value_counts()\n", 335 | "unsurvived = train['Pclass'][train[\"Survived\"] == 0].value_counts()\n", 336 | "\n", 337 | "fig = plt.figure()\n", 338 | "fig.set(alpha=0.2)\n", 339 | "fig.set_size_inches(16, 9)\n", 340 | "#female.plot(kind=\"bar\", stacked=True)\n", 341 | "df=pd.DataFrame({'Survived':survived, 'unSurvived':unsurvived})\n", 342 | "df.plot(kind=\"bar\", stacked=True)\n", 343 | "\n", 344 | "plt.show()" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 32, 350 | "metadata": { 351 | 
"collapsed": false 352 | }, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "" 358 | ] 359 | }, 360 | "metadata": {}, 361 | "output_type": "display_data" 362 | }, 363 | { 364 | "data": { 365 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeQAAAFICAYAAACBcI1sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGqFJREFUeJzt3X98VPWd7/H3ZIYJMDO5pDEVaQwSJQh2c4myCZQaCatE\nXVmpUpRhY1doKwG1iVB+KYk/uBJAA6j8UIt3NYohmi7q1t22BtzU4COtFOFh1kAb+f3LGOKDmSFM\nEnPuH64RLCYhmTDfyX09/8v8OPM5mQOvOWcyc2yWZVkCAABhFRXuAQAAAEEGAMAIBBkAAAMQZAAA\nDECQAQAwAEEGAMAAjnA+eF2dL5wPHzFiY/uroeFUuMdAL8I2hVBjm+qc+HjPt17HHnIEcDjs4R4B\nvQzbFEKNbar7CDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBggLB+MQgAwDzTC7eE\ndHkvLBjfqdsVF/+rPvjgj/riixbZbDbNnp2rK68c3qXHXL36Sd1xxzQNHDiwS/cvKFioW2+9XVdf\nPapL9+8KggwACLu9ez9RZWWF1q3bIJvNpr/8ZbeWLHlYL774apeW94tfzAnpfBcCh6wBAGHndrt1\n/Pgx/eY3b6iu7lMNHTpMzz//ou699+fav3+fJGnz5te1YcOzOnr0iO666w7de+/P9corL2ratMmy\nLEuSVFS0TP/1X1vb7jdjRraOHj0iSdq69R2tWvWE/H6/Hnponu677x7dd989qq39qySprKxUd9/t\n1dy59+vQoUMX/HdAkAEAYRcf/10VFhZp166duueeu+X13q5t2/7wrbc/caJeK1eu0bRpP9Hllw/V\nzp071NTUpD//ebvGjr227Xa33HKr/vM/fyNJevvtt/RP/zRJL730gq65Jk1PP/2s5s17UE88sVQn\nTtTrtddK9Oyz/6rCwiK1tDT3+Dp/E4esAQBhd+jQQblcLi1aVCBJqqn5b82de7/i4i5qu83/7ARL\nki65ZJD69OkjSZo4cZL+4z/+XfX19frhDzPkcHydthtuuFGzZ/9UEydOUiAQUFLSFfrkk7/qz3/+\nQOXlv5Mk+XwndfjwIQ0ZkiSn0ylJGj78qp5e5b9BkL9h9pZ54R4hYqwZvzzcIwDoJWpr/6I33vg3\nLVtWpD59+ujSSxPldnsUE/O/VF//mQYPvkx79tToooviJUk229cHeEeNStO6dU+prq5Oc+bMP2u5\nbrdbw4YN11NPFenmmydKkgYPvkwTJozQhAk3qqHhhN56a7MSEhK1d+8nCgZPy+Hooz17dmvChJsu\n3C9ABBkAYIDrrhuvffv26qc/vUv9+/dTa6ulWbN+oT59HHryyUJdfPHAthh/k81m07hx/6APPvij\nvve9hL+5fuLESZoz534tXJgvSbrrrukqLHxMb775a506FdD06T9XbGys/vmff6KZM6drwIBY9evX\nr0fX91xslnXmQYALy8TzIbOH3HnsIUeu+HiPkf/+ELnYpjqH8yEDAGA4ggwAgAEIMgAABiDIAAAY\ngCADAGAAggwAgAH4HDIA4Cyh/vhnqD8i+f77lSopeVmWZen06dOaPPmObn2Jx9tvv6WYmBj98IfX\nden+mze/rvr6es2YcU+XZ5AIMgAgwqxY8bhefLFEHo9Hp04F9JOfePX3f5+u2NjvdGl5X32DV7gR\nZABA2L399lvav3+fcnLuUzAY1LRpkzVw4CUaOnSYPvmkVqdO+fXYY8s0cOAl8ng8eu21VzVu3D9o\nyJAkvfLKa3I6ndqw4VnFxcVp0qTJ2r9/n1aseFzPPPOcsrOn6NJLB6tPH4cOHTqkJUuW6ZJLBmnr\n1ne0c+eH8ng8iouL08GDB3TFFcm66aZbVF//mX75y1y98MLLWr/+Ge3cuUOtra26445pGj/+eu3c\n+aFWr35CHk+M7Ha7rrrq+93+HfAeMgDAWMOHX6XVq9dq1Kh0/f73v5UkFRU9o9OnT+uRRx7Urbfe\nqOLi/6v2vnSysbFR//IvM/TII0vPefanr9xyy5cnqZCk3/72bf3jP07U++9X6ujRw1q3boOeemq9\nXnrpBfl8Pj355FI9/PD/0erVazVo0KCQrCtBBgAY5uu4JicPkyRdfPHFamoK6uTJkzp27Jhmzbpf\nL75Yog0bilVV9b4qK88+VeM3A52YeJmkL8/+9O675frss7q2sz99ZciQJH3xxRc6duyoyst/rwkT\nbtYnn/xVu3fX6N57f645c+5TS0uLjh07ohMnTigxcbAk6e/+7n+HZK0JMgAg7JxOp+rrP5Mk7d5d\n03a5zWY763bNzU0qKFioEyfqJUlxcRcpLi5OTqdTTme06uu/vHzPnpqz7vfVcs519qcz3XLLrVq7\n9ilddtkQeTweDR58mVJTR+mZZ57TU0+t1/jx1+t730tQfHy89u3bK0n6+OP/DsnvgPeQAQBhl57+\nA23eXKacnBkaNmy4XC7XOW8XF3eRcnPnat68PNntdrW2fqEf/OBapaWNVkLCpcrPX6gdO7Zr2LDh\n3/pY3zz705kyM6/X6tVPqLCwSJI0dmyGduzYrlmzfqrGxlPKyMhU//4u/fKXi7RkSYFcLpf69+8v\nj+fbTxrRWZ0629Ozzz6rLVu2qLm5WVOnTlVaWpoWLFggm82moUOHqqCgQFFRUSotLVVJSYkcDody\ncnKUmZnZ7nJNPDMIZ3vqPM72FLk4Mw9CjW2qc7p1tqeqqirt2LFDr776qoqLi3Xs2DEtXbpUubm5\n2rhxoyzLUnl5uerq6lRcXKySkhJt2LBBRUVFampqCumKAADQW3UY5Pfee0/JycmaPXu2Zs6cqXHj\nxqm6ulppaWmSpIyMDG3btk27du1SamqqnE6nPB6PEhMTVVNT08HSAQCA1In3kBsaGnTkyBGtX79e\nhw4dUk5OjizLanuD3OVyyefzye/3n3UM3eVyye/399zkAAD0Ih0GecCAAUpKSpLT6VRSUpKio6N1\n7NixtusDgYBiYmLkdrsVCATOuryjN7ljY/vL4bB3Y3yEU3vvhcB8PH8INbap7ukwyNdcc41eeukl\n3X333fr000/V2NioMWPGqKqqSunp6aqoqNDo0aOVkpKiVatWKRgMqqmpSbW1tUpOTm532Q0Np0K2\nIrjw+AOOyMUf4CDU2KY6p70XLR0GOTMzU3/60580efJkWZal/Px8JSQkaPHixSoqKlJSUpKysrJk\nt9uVnZ0tr9cry7KUl5en6OjokK4IAAC9Vac+9tRTTHw1xceeO
o+PPUUu9mYQamxTndOtjz0BAICe\nR5ABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAA\nAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYA\nwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJABADAAQQYAwAAEGQAAAxBkAAAMQJAB\nADAAQQYAwAAEGQAAAxBkAAAM4OjMjX70ox/J7XZLkhISEjRz5kwtWLBANptNQ4cOVUFBgaKiolRa\nWqqSkhI5HA7l5OQoMzOzR4cHAKC36DDIwWBQlmWpuLi47bKZM2cqNzdX6enpys/PV3l5uUaOHKni\n4mKVlZUpGAzK6/Vq7NixcjqdPboCAAD0Bh0GuaamRo2NjZo+fbpaWlr0wAMPqLq6WmlpaZKkjIwM\nVVZWKioqSqmpqXI6nXI6nUpMTFRNTY1SUlJ6fCUAAIh0HQa5b9++mjFjhn784x9r3759+tnPfibL\nsmSz2SRJLpdLPp9Pfr9fHo+n7X4ul0t+v7/dZcfG9pfDYe/mKiBc4uM9Hd8IxuL5Q6ixTXVPh0Ee\nMmSIBg8eLJvNpiFDhmjAgAGqrq5uuz4QCCgmJkZut1uBQOCsy88M9Lk0NJzqxugIt7o6X7hHQBfF\nx3t4/hBSbFOd096Llg7/yvr1119XYWGhJOn48ePy+/0aO3asqqqqJEkVFRUaNWqUUlJStH37dgWD\nQfl8PtXW1io5OTlEqwAAQO/W4R7y5MmTtXDhQk2dOlU2m02PP/64YmNjtXjxYhUVFSkpKUlZWVmy\n2+3Kzs6W1+uVZVnKy8tTdHT0hVgHAAAins2yLCtcD27i4Y3ZW+aFe4SIsWb88nCPgC7i8CJCjW2q\nc7p1yBoAAPQ8ggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEI\nMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAA\nggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAY\ngCADAGAAggwAgAEIMgAABiDIAAAYoFNBrq+v13XXXafa2lrt379fU6dOldfrVUFBgVpbWyVJpaWl\nuu222zRlyhRt3bq1R4cGAKC36TDIzc3Nys/PV9++fSVJS5cuVW5urjZu3CjLslReXq66ujoVFxer\npKREGzZsUFFRkZqamnp8eAAAeosOg7xs2TLdeeed+u53vytJqq6uVlpamiQpIyND27Zt065du5Sa\nmiqn0ymPx6PExETV1NT07OQAAPQijvau/PWvf63vfOc7uvbaa/Xcc89JkizLks1mkyS5XC75fD75\n/X55PJ62+7lcLvn9/g4fPDa2vxwOe3fmRxjFx3s6vhGMxfOHUGOb6p52g1xWViabzab3339fH3/8\nsebPn68TJ060XR8IBBQTEyO3261AIHDW5WcG+ts0NJzqxugIt7o6X7hHQBfFx3t4/hBSbFOd096L\nlnYPWb/yyit6+eWXVVxcrOHDh2vZsmXKyMhQVVWVJKmiokKjRo1SSkqKtm/frmAwKJ/Pp9raWiUn\nJ4d2LQAA6MXa3UM+l/nz52vx4sUqKipSUlKSsrKyZLfblZ2dLa/XK8uylJeXp+jo6J6YFwCAXslm\nWZYVrgc38fDG7C3zwj1CxFgzfnm4R0AXcXgRocY21TldPmQNAAAuDIIMAIABCDIAAAYgyAAAGIAg\nAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYg\nyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIAB\nCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGMDR0Q2+\n+OILPfTQQ9q7d69sNpseeeQRRUdHa8GCBbLZbBo6dKgKCgoUFRWl0tJSlZSUyOFwKCcnR5mZmRdi\nHQAAiHgdBnnr1q2SpJKSElVVVWnlypWyLEu5ublKT09Xfn6+ysvLNXLkSBUXF6usrEzBYFBer1dj\nx46V0+ns8ZUAACDSdRjk66+/XuPGjZMkHTlyRDExMdq2bZvS0tIkSRkZGaqsrFRUVJRSU1PldDrl\ndDqVmJiompoapaSk9OgKAADQG3TqPWSHw6H58+frscce08SJE2VZlmw2myTJ5XLJ5/PJ7/fL4/G0\n3cflcsnv9/fM1AAA9DId7iF/ZdmyZZo7d66mTJmiYDDYdnkgEFBMTIzcbrcCgcBZl58Z6HOJje0v\nh8PehbFhgvj49p9fmI3nD6HGNtU9HQZ58+bNOn78uO655x7169dPNptN3//+91VVVaX09HRVVFRo\n9OjRSklJ0apVqxQMBtXU1KTa2lolJye3u+yGhlMhWxFceHV1vnCPgC6Kj/fw/CGk2KY6p70XLR0G\necKECVq4cKGmTZumlpYWLVq0SJdffrkWL16soqIiJSUlKSsrS3a7XdnZ2fJ6vbIsS3l5eYqOjg7p\nigAA0FvZLMuywvXgJr6amr1lXrhHiBhrxi8P9wjoIvZmEGpsU53T3h4yXwwCAIABCDIAAAYgyAAA\nGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIA\nAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIM\nAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAEe4BzBN4x9vDPcIkWN8uAcA\ngN6DPWQAAAxAkAEAMABBBgDAAO2+h9zc3KxFixbp8OHDampqUk5Ojq644gotWLBANptNQ4cOVUFB\ngaKiolRaWqqSkhI5HA7l5OQoMzPzQq0DAAARr90gv/nmmxowYIBWrFihzz//XJMmTdKVV16p3Nxc\npaenKz8/X+Xl5Ro5cqSKi4tVVlamYDAor9ersWPHyul0Xqj1AAAgorUb5BtvvFFZWVmSJMuyZLfb\nVV1drbS0NElSRkaGKisrFRUVpdTUVDmdTjmdTiUmJqqmpkYpKSk9vwYAAPQC7b6H7HK55Ha75ff7\ndf/99ys3N1eWZclms7Vd7/P55Pf75fF4zrqf3+/v2ckBAOhFOvwc8tGjRzV79mx5vV5NnDhRK1as\naLsuEAgoJiZGbrdbgUDgrMvPDPS3iY3tL4fD3sXREW7x8R0/xzAXzx9CjW2qe9oN8meffabp06cr\nPz9fY8aMkSSNGDFC
VVVVSk9PV0VFhUaPHq2UlBStWrVKwWBQTU1Nqq2tVXJycocP3tBwKjRrgbCo\nq/OFewR0UXy8h+cPIcU21TntvWhpN8jr16/XyZMntXbtWq1du1aS9OCDD2rJkiUqKipSUlKSsrKy\nZLfblZ2dLa/XK8uylJeXp+jo6NCuBQAAvZjNsiwrXA9u4qup6YVbwj1CxHhhAd+dGanYm0GosU11\nTnt7yHwxCAAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAYgCADAGAAggwAgAEIMgAABiDIAAAY\noMOzPQHontlb5oV7hIiwZvzycI8AhBV7yAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIM\nAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAg\nAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGKBTQd65c6eys7MlSfv379fU\nqVPl9XpVUFCg1tZWSVJpaaluu+02TZkyRVu3bu25iQEA6IU6DPLzzz+vhx56SMFgUJK0dOlS5ebm\nauPGjbIsS+Xl5aqrq1NxcbFKSkq0YcMGFRUVqampqceHBwCgt+gwyImJiXr66afbfq6urlZaWpok\nKSMjQ9u2bdOuXbuUmpoqp9Mpj8ejxMRE1dTU9NzUAAD0Mh0GOSsrSw6Ho+1ny7Jks9kkSS6XSz6f\nT36/Xx6Pp+02LpdLfr+/B8YFAKB3cnR8k7NFRX3d8EAgoJiYGLndbgUCgbMuPzPQ3yY2tr8cDvv5\njgBDxMd3/BwDncX2FPl4DrvnvIM8YsQIVVVVKT09XRUVFRo9erRSUlK0atUqBYNBNTU1qba2VsnJ\nyR0uq6HhVJeGhhnq6nzhHgG9CNtTZIuP9/AcdkJ7L1rOO8jz58/X4sWLVVRUpKSkJGVlZclutys7\nO1ter1eWZSkvL0/R0dHdGhoAgP+f2CzLssL14Ca+mppeuCXcI0SMFxaMD/cIEYFtqnPYniIbe8id\n094eMl8MAgCAAQgyAAAGIMgAABiAIAMAYACCDACAAQgyAAAGOO/PIQMAwmv2lnnhHiEirBm/PNwj\nnBf2kAEAMABBBgDAAAQZAAADEGQAAAxAkAEAMABBBgDAAAQZAAADEGQAAAxAkAEAMABBBgDAAAQZ\nAAADEGQAAAxAkAEAMABBBgDAAAQZAAADEGQAAAxAkAEAMABBBgDAAAQZAAADEGQAAAzgCPcAAIDz\n0/jHG8M9QmQYH+4Bzg97yAAAGIAgAwBgAIIMAIABCDIAAAYgyAAAGIAgAwBgAIIMAIABCDIAAAYg\nyAAAGCCk39TV2tqqhx9+WLt375bT6dSSJUs0ePDgUD4EAAC9Ukj3kN955x01NTVp06ZNmjNnjgoL\nC0O5eAAAeq2QBnn79u269tprJUkjR47URx99FMrFAwDQa4X0kLXf75fb7W772W63q6WlRQ7HuR8m\nPt4TyocPibeevDXcI6CXYZtCqLFN9U4h3UN2u90KBAJtP7e2tn5rjAEAwNdCGuSrr75aFRUVkqQP\nP/xQycnJoVw8AAC9ls2yLCtUC/vqr6z37Nkjy7L0+OOP6/LLLw/V4gEA6LVCGmQAANA1fDEIAAAG\nIMgAABiAIAMAYACCbLiTJ0/K7/eHewxEuE2bNqmlpUWS9MEHH+jVV18N80ToDZqamnT48GGdPn1a\n0pf/XzU2NoZ5qshFkA1TXV2tSZMmqbm5Wb/73e+UlZWl22+/XVu2bAn3aIhQTz/9tCorK9Xc3CxJ\nGjhwoCorK7VmzZowT4ZI1dzcrEcffVQ33XSTHnjgAU2YMEH5+flaunSpDh48GO7xIhZBNszy5ctV\nWFioPn36aNWqVfrVr36lsrIyPffcc+EeDRGqoqJCq1evVr9+/SRJCQkJWrlyJS/y0GVr1qxRXFyc\nysvLtWnTJr377rtqaWlRfX093z/RDXyNlmFaW1t15ZVX6vjx42psbNRVV10lSYqK4rUTuqZ///6y\n2WxnXdanTx+5XK4wTYRIV1VVddbbHlFRUTp+/LgaGhrCOFXk4395w3z1VaN/+MMfNGbMGElfHh46\n8ytJgfPRt2/fvzmMePDgwb+JNNBZ59pBWLlypfr27RuGaXoP9pANM2bMGN155506duyY1q1bpwMH\nDujRRx/VzTffHO7REKHmzp2rWbNmacyYMbr00kt15MgRvffee1q2bFm4R0OE6tu3rw4cOKDExMS2\nyz7//PO2t0XQNXxTl4Fqa2vldrt18cUX68CBA9q9e7duuOGGcI+FCObz+VReXq5PP/1UgwYN0rhx\n4846MxtwPj766CPNmzdPU6ZMUUJCgg4ePKjXX39dK1as0IgRI8I9XsQiyACA83b8+HG98cYbOnTo\nkAYNGqRJkyZp4MCB4R4rohFkAAAMwB91AQBgAIIMAIABCDIAAAYgyAAAGIAgAwBggP8HJN9BaJBD\nkl0AAAAASUVORK5CYII=\n", 366 | "text/plain": [ 367 | "" 368 | ] 369 | }, 370 | "metadata": {}, 371 | "output_type": "display_data" 372 | } 373 | ], 374 | "source": [ 375 | "survived = train['Embarked'][train[\"Survived\"] == 1].value_counts()\n", 376 | "unsurvived = train['Embarked'][train[\"Survived\"] == 0].value_counts()\n", 377 | "\n", 378 | "fig = plt.figure()\n", 379 | "fig.set(alpha=0.2)\n", 380 | "fig.set_size_inches(16, 9)\n", 381 | "#female.plot(kind=\"bar\", stacked=True)\n", 382 | "df=pd.DataFrame({'Survived':survived, 'unSurvived':unsurvived})\n", 383 | "df.plot(kind=\"bar\", stacked=True)\n", 384 | "\n", 385 | "plt.show()" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "" 397 | ] 398 | } 399 | ], 400 | "metadata": { 401 | "anaconda-cloud": {}, 402 | "kernelspec": { 403 | "display_name": "Python [default]", 404 | "language": "python", 
405 | "name": "python2" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 2.0 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython2", 417 | "version": "2.7.12" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 0 422 | } -------------------------------------------------------------------------------- /titanic/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wsg011/kaggle-start/0dd948ae407edcc68f790f310d6b7c64e4e8a328/titanic/src/__init__.py -------------------------------------------------------------------------------- /titanic/src/ensemble.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | # import models 6 | import xgboost as xgb 7 | from sklearn.svm import SVC 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import AdaBoostClassifier 12 | from sklearn.ensemble import RandomForestClassifier 13 | from sklearn.ensemble import GradientBoostingClassifier 14 | from xgboost import XGBClassifier 15 | 16 | from sklearn.model_selection import KFold 17 | from sklearn.metrics import accuracy_score 18 | from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, learning_curve 19 | 20 | 21 | class Ensemble(object): 22 | def __init__(self, base_models, sec_model, n_folds=5): 23 | self.n_folds = n_folds 24 | self.base_models = base_models 25 | self.clf = sec_model 26 | 27 | def fit_predict(self, X, y, T): 28 | X = np.array(X) 29 | y = np.array(y) 30 | T = np.array(T) 31 | 32 | folds = KFold(n_splits=self.n_folds, random_state=0) 33 | en_train = np.zeros((X.shape[0], len(self.base_models))) 34 | en_test = np.zeros((T.shape[0], len(self.base_models))) 35 | 36 | for i, clf in enumerate(self.base_models): 37 | en_test_i = np.zeros((T.shape[0], self.n_folds)) 38 | print("fit model:%s " % str(i+1)) 39 | for j, (train_idx, test_idx) in enumerate(folds.split(X, y)): 40 | X_train = X[train_idx] 41 | y_train = y[train_idx] 42 | X_holdout = X[test_idx] 43 | y_holdout = y[test_idx] 44 | clf.fit(X_train, y_train) 45 | y_pre = clf.predict(X_holdout) 46 | en_train[test_idx, i] = y_pre 47 | en_test_i[:, j] = clf.predict(T)[:] 48 | score = accuracy_score(en_train[:, i], y) 49 | print("model %s scoring: %s" % (i, score)) 50 | en_test[:, i] = en_test_i.mean(1) 51 | self.clf.fit(en_train, y) 52 | pre = self.clf.predict(en_test) 53 | return pre 54 | 55 | def predict(self, x): 56 | return self.clf.predict(x) 57 | 58 | def score(self, x, y): 59 | s = accuracy_score(y, self.predict(x)) 60 | return s 61 | 62 | if __name__ == "__main__": 63 | print("load data...") 64 | train = pd.read_csv("../data/feature_train.csv") 65 | test = pd.read_csv("../data/feature_test.csv") 66 | features = ["Pclass", "Age", "sex", "child", "fimalysize", "Fare", "embark", "cabin", "name"] 67 | x_train, y_train = train[features], train['Survived'] 68 | x_test = test[features] 69 | 70 | lr = LogisticRegression() 71 | svc = SVC() 72 | dt = DecisionTreeClassifier() 73 | et = ExtraTreesClassifier() 74 | ada = AdaBoostClassifier() 75 | rf = RandomForestClassifier(n_estimators=140, max_depth=4, 
min_samples_leaf=4, min_samples_split=6) 76 | GBDT = GradientBoostingClassifier() 77 | xgb_GBDT = XGBClassifier(objective='binary:logistic', learning_rate=0.1, n_estimators=40, max_depth=6) 78 | 79 | clfs = [lr, et, ada, rf, GBDT, xgb_GBDT] 80 | ensemble = Ensemble(clfs) 81 | sec_train, sec_test = ensemble.fit_predict(x_train, y_train, x_test) 82 | y_pred = clf.predict(X_holdout)[:] 83 | print ("Fit Model %d fold %d: %s" % (i, j, accuracy_score(y_holdout, y_pred))) 84 | clf = lr 85 | clf.fit(sec_train, y_train) 86 | # 87 | #score = 0 88 | #for i in range(0, 10): 89 | # num_test = 0.2 90 | # X_train, X_cv, Y_train, Y_cv = train_test_split(x_train, y_train, test_size=num_test) 91 | # ensemble.fit(X_train, Y_train) 92 | # # Y_test = bag.predict(X_test) 93 | # acc_xgb = round(ensemble.score(X_cv, Y_cv) * 100, 2) 94 | # score += acc_xgb 95 | #print(score / 10) # 0.8786 96 | pre = clf.predict(sec_test) 97 | 98 | predict_dataframe = pd.DataFrame({ 99 | "PassengerId": test["PassengerId"], 100 | "Survived": pre.astype(int) 101 | }) 102 | predict_dataframe.to_csv('../data/ensemble.csv', index=False, encoding="utf-8") 103 | 104 | -------------------------------------------------------------------------------- /titanic/src/ensemble_starking.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import pandas as pd 4 | from datetime import datetime 5 | 6 | from xgboost import XGBClassifier 7 | from lightgbm import LGBMClassifier 8 | from sklearn.tree import DecisionTreeClassifier 9 | from sklearn.svm import SVC 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.ensemble import GradientBoostingClassifier 12 | 13 | from sklearn.linear_model import LogisticRegression 14 | 15 | from ensemble import Ensemble 16 | from feature_engineer import feature_engineer 17 | from utils import submission 18 | 19 | 20 | # 21 | # load data 22 | # 23 | x_train, y_train, x_test = feature_engineer() 24 | 25 | # 26 | # Set model 27 | # 28 | svc = SVC() 29 | rf = RandomForestClassifier() 30 | 31 | from ensemble_util import Ensemble 32 | 33 | 34 | def submit(pre): 35 | submission = pd.DataFrame({ 36 | "PassengerId": test["PassengerId"], 37 | "Survived": pre 38 | }) 39 | print("write submit file: *.csv") 40 | submit_file = '../sub/{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M')) 41 | submission.to_csv(submit_file, encoding="utf-8", index=False) 42 | 43 | if __name__ == '__main__': 44 | train = pd.read_csv('../data/feature_train.csv') 45 | test = pd.read_csv('../data/feature_test.csv') 46 | 47 | base_models = [svc, rf, xgb, lgb, rf, gbm] 48 | 49 | stack = Ensemble(n_splits=8, 50 | stacker=DecisionTreeClassifier(), 51 | base_models=base_models) 52 | # Stacker score: 0.8239 LB: 0.779 53 | 54 | clfs = [DecisionTreeClassifier(), 55 | XGBClassifier(n_estimators=100, max_depth=4, min_child_weight=2), 56 | RandomForestClassifier(n_estimators=140, max_depth=4, min_samples_split=6, min_samples_leaf=4, n_jobs=4), 57 | GradientBoostingClassifier(n_estimators=140, max_depth=4, min_samples_split=6, min_samples_leaf=4)] 58 | 59 | ensemble = Ensemble(clfs, rf) 60 | prediction = ensemble.fit_predict(x_train, y_train, x_test) 61 | submit(prediction) 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /titanic/src/ensemble_util.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | # !/usr/bin/python 3 | 4 | import numpy as np 5 | 
import pandas as pd 6 | 7 | from sklearn.model_selection import KFold 8 | from sklearn.metrics import accuracy_score 9 | 10 | 11 | class Ensemble(object): 12 | def __init__(self, base_models, sec_model, n_folds=5): 13 | self.n_folds = n_folds 14 | self.base_models = base_models 15 | self.clf = sec_model 16 | 17 | def fit_predict(self, X, y, T): 18 | X = np.array(X) 19 | y = np.array(y) 20 | T = np.array(T) 21 | 22 | folds = KFold(n_splits=self.n_folds, random_state=0) 23 | en_train = np.zeros((X.shape[0], len(self.base_models))) 24 | en_test = np.zeros((T.shape[0], len(self.base_models))) 25 | 26 | for i, clf in enumerate(self.base_models): 27 | en_test_i = np.zeros((T.shape[0], self.n_folds)) 28 | print("fit model:%s " % str(i+1)) 29 | for j, (train_idx, test_idx) in enumerate(folds.split(X, y)): 30 | X_train = X[train_idx] 31 | y_train = y[train_idx] 32 | X_holdout = X[test_idx] 33 | y_holdout = y[test_idx] 34 | clf.fit(X_train, y_train) 35 | y_pre = clf.predict(X_holdout) 36 | en_train[test_idx, i] = y_pre 37 | en_test_i[:, j] = clf.predict(T)[:] 38 | score = accuracy_score(en_train[:, i], y) 39 | print("model %s scoring: %s" % (i, score)) 40 | en_test[:, i] = en_test_i.mean(1) 41 | self.clf.fit(en_train, y) 42 | pre = self.clf.predict(en_test) 43 | return pre 44 | 45 | def predict(self, x): 46 | return self.clf.predict(x) 47 | 48 | def score(self, x, y): 49 | s = accuracy_score(y, self.predict(x)) 50 | return s -------------------------------------------------------------------------------- /titanic/src/ensemble_xgb_rf.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.ensemble import BaggingClassifier, RandomForestClassifier 6 | from sklearn.model_selection import cross_val_score 7 | from sklearn.tree import DecisionTreeClassifier 8 | 9 | from xgboost import XGBClassifier 10 | 11 | 12 | def stacking(alg, x_train, y_train, test): 13 | alg.fit(x_train, y_train) 14 | oof_y_train = alg.predict(x_train) 15 | oof_y_subminssion = alg.predict(test) 16 | 17 | return oof_y_train, oof_y_subminssion 18 | 19 | 20 | def submit(alg, test_data): 21 | predict_data = alg.predict(test_data[features]) 22 | submission = pd.DataFrame({ 23 | "PassengerId": test_data["PassengerId"], 24 | "Survived": predict_data 25 | }) 26 | submission.to_csv('../sub/bagging.csv', index=False) 27 | 28 | 29 | if __name__ == '__main__': 30 | train = pd.read_csv('../data/feature_train.csv') 31 | test = pd.read_csv('../data/feature_test.csv') 32 | print(train.head()) 33 | 34 | features = ["Pclass", "Fare", "Age", "SibSp", "child", "Parch", "sex", "fimalysize", 35 | "embark", "name", 'cabin'] 36 | x_train = train[features] 37 | y_train = train["Survived"] 38 | print(x_train.info()) 39 | 40 | rf = RandomForestClassifier( 41 | n_estimators=140, max_depth=4, min_samples_split=6, min_samples_leaf=4, n_jobs=4) 42 | rf_scores = cross_val_score(rf, x_train, y_train, cv=3) 43 | 44 | dt = DecisionTreeClassifier() 45 | dt_scores = cross_val_score(dt, x_train, y_train) 46 | dt.fit(x_train, y_train) 47 | 48 | xgb = XGBClassifier(n_estimators=140, max_depth=4, min_child_weight=6) 49 | xgb_scores = cross_val_score(xgb, x_train, y_train) 50 | xgb.fit(x_train, y_train) 51 | 52 | bagging_clf = BaggingClassifier(xgb, max_samples=0.9, max_features=1.0, bootstrap=True, 53 | bootstrap_features=False, n_jobs=4) 54 | bagging_scores = cross_val_score(bagging_clf, x_train, y_train, cv=3) 55 | bagging_clf.fit(x_train, y_train) 
56 | 57 | print("rf scores:", rf_scores.mean()) 58 | print("dt scores:", dt_scores.mean()) 59 | print("xgb scores:", xgb_scores.mean()) 60 | print("bagging scores:", bagging_scores.mean()) 61 | 62 | submit(xgb, test) 63 | -------------------------------------------------------------------------------- /titanic/src/feature_engineer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from utils import load_data 6 | 7 | 8 | def feature_engineer(df, features=None): 9 | """ 特种工程 10 | 特征工程包括以下的工作: 11 | 1.缺失值填充 12 | 2.OneHot编码 13 | params: 14 | df: 输入数据 15 | features: 输出的特征列表 16 | return: 17 | feature_df: 输出特征 18 | columns:['age', 'fare', 'sex', 'child', 'older', 'fimalysize', 'embarked', 19 | 'embark', 'name', 'cabin'] 20 | 21 | """ 22 | feature_df = pd.DataFrame() 23 | 24 | # Age 25 | feature_df["age"] = df["Age"].fillna(df["Age"].median()) 26 | 27 | # Fare 28 | feature_df["fare"] = df["Fare"].fillna(df["Fare"].median()) 29 | 30 | # sex 31 | feature_df["sex"] = df["Sex"].apply(lambda x: 1 if x == "male" else 0) 32 | 33 | # child 34 | feature_df["child"] = df["Age"].apply(lambda x: 1 if x < 16 else 0) 35 | 36 | # older 37 | feature_df["older"] = df["Age"].apply(lambda x: 1 if x > 45 else 0) 38 | 39 | # familysize 40 | feature_df["SibSp"] = df["SibSp"] 41 | feature_df["Parch"] = df["Parch"] 42 | feature_df["fimalysize"] = df["SibSp"] + df["Parch"] + 1 43 | 44 | # embark 45 | feature_df["embark"] = df["Embarked"].fillna("S") 46 | def getEmbark(Embarked): 47 | if Embarked == "S": 48 | return 1 49 | elif Embarked == "C": 50 | return 2 51 | else: 52 | return 3 53 | feature_df["embark"] = feature_df["embark"].apply(getEmbark) 54 | 55 | # name 56 | def getName(name): 57 | if "Mr" in str(name): 58 | return 1 59 | elif "Mrs" in str(name): 60 | return 2 61 | else: 62 | return 0 63 | feature_df["name"] = df["Name"].apply(getName) 64 | 65 | # cabin 66 | feature_df["cabin"] = df["Cabin"].fillna("N") 67 | def getCabin(cabin): 68 | if cabin == "N": 69 | return 0 70 | else: 71 | return 1 72 | feature_df["cabin"] = feature_df["cabin"].apply(getCabin) 73 | 74 | return feature_df 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /titanic/src/feature_predict_age.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import mean_absolute_error 8 | 9 | 10 | def load_data(): 11 | df = pd.read_csv('../input/train.csv') 12 | 13 | train = df[df["Age"].notnull()] 14 | test = df[df["Age"].isnull()] 15 | print(train.shape, test.shape) 16 | 17 | x_train = train.drop(["PassengerId", "Age"], axis=1) 18 | y_train = train["Age"].values 19 | print(x_train.columns) 20 | 21 | return x_train, y_train, test 22 | 23 | 24 | def feature_engineer(): 25 | train, label, test = load_data() 26 | 27 | # Fare 28 | train["Fare"] = train["Fare"].fillna(train["Fare"].median()) 29 | test["Fare"] = test["Fare"].fillna(test["Fare"].median()) 30 | 31 | # Cabin 32 | train["Cabin"] = train["Cabin"].fillna("N") 33 | test["Cabin"] = test["Cabin"].fillna("N") 34 | 35 | # Embarked 36 | train["Embarked"] = train["Embarked"].fillna("S") 37 | test["Embarked"] = test["Embarked"].fillna("S") 38 | 39 | # embark 40 | def getEmbark(Embarked): 41 | if Embarked == "S": 42 | 
return 1 43 | elif Embarked == "C": 44 | return 2 45 | else: 46 | return 3 47 | 48 | train["A-embark"] = train["Embarked"].apply(getEmbark) 49 | test["A-embark"] = test["Embarked"].apply(getEmbark) 50 | 51 | # name 52 | def getName(name): 53 | if "Mr" in str(name): 54 | return 1 55 | elif "Mrs" in str(name): 56 | return 2 57 | else: 58 | return 0 59 | 60 | train["A-name"] = train["Name"].apply(getName) 61 | test["A-name"] = test["Name"].apply(getName) 62 | 63 | feature = ["Survived", "Pclass", "SibSp", "Parch", "Fare", "A-embark", "A-name"] 64 | 65 | return train[feature], label, test[feature] 66 | 67 | 68 | def stand_linear_regression(x, y): 69 | xMat = np.mat(x) 70 | yMat = np.mat(y).T 71 | 72 | xTx = xMat.T * xMat 73 | 74 | if np.linalg.det(xTx) == 0.0: 75 | print("This matrix ins singular, cannot to inverse") 76 | return 77 | ws = xTx.I * xMat.T * yMat 78 | return ws 79 | 80 | 81 | def predict_age(x_train, y_train, x_valid, y_valid): 82 | ws = stand_linear_regression(x_train, y_train) 83 | x_valid_Mat = np.mat(x_valid) 84 | print(ws) 85 | y_pre = x_valid_Mat * ws 86 | print y_pre 87 | # 12.4654083464 88 | # A-e -> 11.6206955336 89 | # A-n -> 10.0781662156 90 | print(mean_absolute_error(y_valid, y_pre)) 91 | 92 | 93 | if __name__ == "__main__": 94 | train, label, test = feature_engineer() 95 | 96 | x_train, x_valid, y_train, y_valid = train_test_split(train, label, test_size=0.3, random_state=2017) 97 | 98 | predict_age(x_train, y_train, x_valid, y_valid) -------------------------------------------------------------------------------- /titanic/src/gridssearch_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XGBoost调参技巧(二)Titanic实战预测进入9%" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Titanic是Kaggle竞赛里的入门比赛之一,要求参赛者根据乘客的属性来预测是否幸存,是典型的二分类(Binary Classifier)问题。解决二分类问题的算法有很多:决策树、随机森林、GBM,而XGBoost是GBM的优化实现。因此本文以Titanic幸存者预测竞赛为例,介绍XGBoost的调参技巧。\n", 15 | "\n", 16 | "## 一、读取数据,清洗数据" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### 1.读取数据" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "/home/shunguo/SDE/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 36 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 37 | "/home/shunguo/SDE/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. 
This module will be removed in 0.20.\n", 38 | " DeprecationWarning)\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "#coding:utf-8\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "from xgboost import XGBClassifier\n", 47 | "from sklearn.cross_validation import KFold\n", 48 | "from sklearn.grid_search import GridSearchCV\n", 49 | "from sklearn.metrics import accuracy_score\n", 50 | "\n", 51 | "#read data\n", 52 | "train = pd.read_csv(\"../data/train.csv\")\n", 53 | "test = pd.read_csv(\"../data/test.csv\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "导入需要用到的包,注意我导入的是xgboost下的XGBClassifier包,可以结合sciket-learn下的grid_search来对参数进行暴力猜解。\n", 61 | "\n", 62 | "### 2.清洗数据" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "def clean_data(titanic):#填充空数据 和 把string数据转成integer表示\n", 72 | " titanic[\"Age\"] = titanic[\"Age\"].fillna(titanic[\"Age\"].median())\n", 73 | " # child\n", 74 | " titanic[\"child\"] = titanic[\"Age\"].apply(lambda x: 1 if x < 15 else 0)\n", 75 | "\n", 76 | " # sex\n", 77 | " titanic[\"sex\"] = titanic[\"Sex\"].apply(lambda x: 1 if x == \"male\" else 0)\n", 78 | "\n", 79 | " titanic[\"Embarked\"] = titanic[\"Embarked\"].fillna(\"S\")\n", 80 | " # embark\n", 81 | " def getEmbark(Embarked):\n", 82 | " if Embarked == \"S\":\n", 83 | " return 1\n", 84 | " elif Embarked == \"C\":\n", 85 | " return 2\n", 86 | " else:\n", 87 | " return 3\n", 88 | " titanic[\"embark\"] = titanic[\"Embarked\"].apply(getEmbark)\n", 89 | " \n", 90 | " # familysize\n", 91 | " titanic[\"fimalysize\"] = titanic[\"SibSp\"] + titanic[\"Parch\"] + 1\n", 92 | "\n", 93 | " # cabin\n", 94 | " def getCabin(cabin):\n", 95 | " if cabin == \"N\":\n", 96 | " return 0\n", 97 | " else:\n", 98 | " return 1\n", 99 | " titanic[\"cabin\"] = titanic[\"Cabin\"].apply(getCabin)\n", 100 | " \n", 101 | " # name\n", 102 | " def getName(name):\n", 103 | " if \"Mr\" in str(name):\n", 104 | " return 1\n", 105 | " elif \"Mrs\" in str(name):\n", 106 | " return 2\n", 107 | " else:\n", 108 | " return 0\n", 109 | " titanic[\"name\"] = titanic[\"Name\"].apply(getName)\n", 110 | "\n", 111 | " titanic[\"Fare\"] = titanic[\"Fare\"].fillna(titanic[\"Fare\"].median())\n", 112 | "\n", 113 | " return titanic\n", 114 | "# 对数据进行清洗\n", 115 | "train_data = clean_data(train)\n", 116 | "test_data = clean_data(test)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## 二、特征工程\n", 124 | "\n", 125 | "Kaggle竞赛的三个核心步骤:**特征工程、调参、模型融合**。俗话说:**数据和特征决定机器学习的上限,而算法只是用来逼近这个上限**,所以特征工程是机器学习能否成功的关键。我们在每个比赛中需要花大量时间来反复完成这个工作。" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 3, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "features = [\"Pclass\", \"sex\", \"child\", \"fimalysize\", \"Fare\", \"embark\", \"cabin\"]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## 三、模型选择" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### 1.构造模型" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 9, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# 简单初始化xgb的分类器就可以\n", 162 | "clf =XGBClassifier(learning_rate=0.1, max_depth=6, silent=True, objective='binary:logistic')" 163 | ] 164 | }, 165 | { 166 | "cell_type": 
"markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### 2.交叉验证kfold\n", 170 | "利用skean提供的grid_search来进行交叉验证选择参数" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 14, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "([mean: 0.81818, std: 0.01692, params: {'n_estimators': 40, 'max_depth': 3},\n", 182 | " mean: 0.81369, std: 0.02371, params: {'n_estimators': 50, 'max_depth': 3},\n", 183 | " mean: 0.81257, std: 0.02241, params: {'n_estimators': 60, 'max_depth': 3},\n", 184 | " mean: 0.81706, std: 0.02833, params: {'n_estimators': 70, 'max_depth': 3},\n", 185 | " mean: 0.81706, std: 0.02833, params: {'n_estimators': 80, 'max_depth': 3},\n", 186 | " mean: 0.81818, std: 0.02990, params: {'n_estimators': 90, 'max_depth': 3},\n", 187 | " mean: 0.83053, std: 0.03091, params: {'n_estimators': 40, 'max_depth': 5},\n", 188 | " mean: 0.82604, std: 0.03252, params: {'n_estimators': 50, 'max_depth': 5},\n", 189 | " mean: 0.82492, std: 0.03402, params: {'n_estimators': 60, 'max_depth': 5},\n", 190 | " mean: 0.82716, std: 0.03314, params: {'n_estimators': 70, 'max_depth': 5},\n", 191 | " mean: 0.82941, std: 0.03553, params: {'n_estimators': 80, 'max_depth': 5},\n", 192 | " mean: 0.82492, std: 0.03666, params: {'n_estimators': 90, 'max_depth': 5},\n", 193 | " mean: 0.82828, std: 0.03595, params: {'n_estimators': 40, 'max_depth': 7},\n", 194 | " mean: 0.82941, std: 0.03264, params: {'n_estimators': 50, 'max_depth': 7},\n", 195 | " mean: 0.82828, std: 0.03356, params: {'n_estimators': 60, 'max_depth': 7},\n", 196 | " mean: 0.82941, std: 0.03438, params: {'n_estimators': 70, 'max_depth': 7},\n", 197 | " mean: 0.82941, std: 0.03615, params: {'n_estimators': 80, 'max_depth': 7},\n", 198 | " mean: 0.82828, std: 0.03680, params: {'n_estimators': 90, 'max_depth': 7},\n", 199 | " mean: 0.82941, std: 0.03438, params: {'n_estimators': 40, 'max_depth': 9},\n", 200 | " mean: 0.82941, std: 0.03256, params: {'n_estimators': 50, 'max_depth': 9},\n", 201 | " mean: 0.82941, std: 0.03488, params: {'n_estimators': 60, 'max_depth': 9},\n", 202 | " mean: 0.82941, std: 0.03625, params: {'n_estimators': 70, 'max_depth': 9},\n", 203 | " mean: 0.82941, std: 0.03399, params: {'n_estimators': 80, 'max_depth': 9},\n", 204 | " mean: 0.83165, std: 0.03457, params: {'n_estimators': 90, 'max_depth': 9}],\n", 205 | " {'max_depth': 9, 'n_estimators': 90},\n", 206 | " 0.8316498316498316)" 207 | ] 208 | }, 209 | "execution_count": 14, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "params = {\n", 216 | " 'max_depth': range(3, 11, 2),\n", 217 | " 'n_estimators': range(40, 100, 10)\n", 218 | "}\n", 219 | "grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=5)\n", 220 | "grid_search.fit(train_data[features], train_data['Survived'])\n", 221 | "grid_search.grid_scores_, grid_Search.best_params_, grid_Search.best_score_" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 15, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "pre = grid_search.predict(test[features])\n", 231 | "predict_dataframe = pd.DataFrame({\n", 232 | " \"PassengerId\": test[\"PassengerId\"],\n", 233 | " \"Survived\": pre\n", 234 | "})\n", 235 | "predict_dataframe.to_csv('../data/xgboost-gridsearch.csv',index=False,encoding=\"utf-8\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | 
"source": [] 246 | } 247 | ], 248 | "metadata": { 249 | "anaconda-cloud": {}, 250 | "kernelspec": { 251 | "display_name": "Python 3", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.6.4" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 1 270 | } 271 | -------------------------------------------------------------------------------- /titanic/src/load_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | 4 | 5 | def load_data(path="../input/"): 6 | train = pd.read_csv(path+'train.csv') 7 | test = pd.read_csv(path+'test.csv') 8 | submission = pd.read_csv(path+'gender_submission.csv') 9 | 10 | return train, test, submission -------------------------------------------------------------------------------- /titanic/src/model_dt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn import tree 6 | 7 | 8 | def submit(pre): 9 | submission = pd.DataFrame({ 10 | "PassengerId": test["PassengerId"], 11 | "Survived": predict_data 12 | }) 13 | print "write submit file:decisiontree.csv" 14 | submission.to_csv('../../sub/decisiontree.csv', index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | print "load data..." 19 | train = pd.read_csv("../../data/feature_train.csv") 20 | test = pd.read_csv("../../data/feature_test.csv") 21 | print train.info(), test.info() 22 | 23 | # Select Features 24 | predictors = ["Pclass", "sex", "Age", "Fare", "embark"] 25 | print "feature select:", predictors 26 | 27 | print "Fix model..." 28 | # Model of DecisionTree 29 | dt = tree.DecisionTreeClassifier() 30 | dt = dt.fit(train[predictors], train["Survived"]) 31 | 32 | print "Predict in test data set..." 
33 | predict_data = dt.predict(test[predictors]) 34 | submit(predict_data) 35 | 36 | -------------------------------------------------------------------------------- /titanic/src/model_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn import cross_validation 7 | 8 | train = pd.read_csv("data/train.csv", dtype={"Age": np.float64},) 9 | test = pd.read_csv("data/test.csv", dtype={"Age": np.float64},) 10 | 11 | 12 | def harmonize_data(titanic): 13 | # Fill missing values and convert string columns to integer codes 14 | titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median()) 15 | 16 | titanic.loc[titanic["Sex"] == "male", "Sex"] = 0 17 | titanic.loc[titanic["Sex"] == "female", "Sex"] = 1 18 | 19 | titanic["Embarked"] = titanic["Embarked"].fillna("S") 20 | 21 | titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0 22 | titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1 23 | titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2 24 | 25 | titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median()) 26 | 27 | return titanic 28 | 29 | train_data = harmonize_data(train) 30 | test_data = harmonize_data(test) 31 | 32 | predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"] 33 | 34 | lr = LogisticRegression(random_state=1) 35 | scores = cross_validation.cross_val_score( 36 | lr, 37 | train_data[predictors], 38 | train_data["Survived"], 39 | cv=10 40 | ) 41 | 42 | print scores.mean() 43 | -------------------------------------------------------------------------------- /titanic/src/model_selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XGBoost Tuning Tips (2): Reaching the Top 9% on Titanic" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Titanic is one of Kaggle's introductory competitions: given each passenger's attributes, participants predict whether that passenger survived, a typical binary classification problem. Many algorithms can solve binary classification: decision trees, random forests, GBM; XGBoost is an optimized implementation of GBM. This article therefore uses the Titanic survival competition as an example to walk through XGBoost tuning.\n", 15 | "\n", 16 | "## 1. Load and Clean the Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### 1. Load the data" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "#coding:utf-8\n", 35 | "import numpy as np\n", 36 | "import pandas as pd\n", 37 | "from xgboost import XGBClassifier\n", 38 | "from sklearn.cross_validation import KFold\n", 39 | "from sklearn.grid_search import GridSearchCV\n", 40 | "from sklearn.metrics import accuracy_score\n", 41 | "\n", 42 | "#read data\n", 43 | "train = pd.read_csv(\"../input/train.csv\")\n", 44 | "test = pd.read_csv(\"../input/test.csv\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "Import the required packages. Note that I import XGBClassifier from xgboost, which can be combined with grid_search from scikit-learn to brute-force search the parameters.\n", 52 | "\n", 53 | "### 2. Clean the data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "def clean_data(titanic):  # fill missing values and convert string columns to integer codes\n", 65 | " titanic[\"Age\"] = titanic[\"Age\"].fillna(titanic[\"Age\"].median())\n", 66 | " \n", 67 | " titanic[\"Fare\"] = titanic[\"Fare\"].fillna(titanic[\"Fare\"].median())\n", 68 | " \n", 69 | " # child\n", 70 | 
" titanic[\"child\"] = titanic[\"Age\"].apply(lambda x: 1 if x < 15 else 0)\n", 71 | "\n", 72 | " # sex\n", 73 | " titanic[\"sex\"] = titanic[\"Sex\"].apply(lambda x: 1 if x == \"male\" else 0)\n", 74 | "\n", 75 | " titanic[\"Embarked\"] = titanic[\"Embarked\"].fillna(\"S\")\n", 76 | " # embark\n", 77 | " def getEmbark(Embarked):\n", 78 | " if Embarked == \"S\":\n", 79 | " return 1\n", 80 | " elif Embarked == \"C\":\n", 81 | " return 2\n", 82 | " else:\n", 83 | " return 3\n", 84 | " titanic[\"embark\"] = titanic[\"Embarked\"].apply(getEmbark)\n", 85 | " \n", 86 | " # familysize\n", 87 | " titanic[\"fimalysize\"] = titanic[\"SibSp\"] + titanic[\"Parch\"] + 1\n", 88 | "\n", 89 | " # cabin\n", 90 | " def getCabin(cabin):\n", 91 | " if cabin == \"N\":\n", 92 | " return 0\n", 93 | " else:\n", 94 | " return 1\n", 95 | " titanic[\"cabin\"] = titanic[\"Cabin\"].apply(getCabin)\n", 96 | " \n", 97 | " # name\n", 98 | " def getName(name):\n", 99 | " if \"Mr\" in str(name):\n", 100 | " return 1\n", 101 | " elif \"Mrs\" in str(name):\n", 102 | " return 2\n", 103 | " else:\n", 104 | " return 0\n", 105 | " titanic[\"name\"] = titanic[\"Name\"].apply(getName)\n", 106 | "\n", 107 | " \n", 108 | "\n", 109 | " return titanic\n", 110 | "# 对数据进行清洗\n", 111 | "train_data = clean_data(train)\n", 112 | "test_data = clean_data(test)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## 二、特征工程\n", 120 | "\n", 121 | "Kaggle竞赛的三个核心步骤:**特征工程、调参、模型融合**。俗话说:**数据和特征决定机器学习的上限,而算法只是用来逼近这个上限**,所以特征工程是机器学习能否成功的关键。我们在每个比赛中需要花大量时间来反复完成这个工作。" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 4, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "features = [\"Pclass\", \"sex\", \"child\", \"fimalysize\", \"Fare\", \"embark\", \"cabin\"]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## 三、模型选择" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "### 1.构造模型" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 5, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "# 简单初始化xgb的分类器就可以\n", 158 | "clf =XGBClassifier(learning_rate=0.1, max_depth=6, silent=True, objective='binary:logistic')" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "### 2.交叉验证kfold\n", 166 | "利用skean提供的grid_search来进行交叉验证选择参数" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 8, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "([mean: 0.81818, std: 0.01692, params: {'n_estimators': 40, 'max_depth': 3},\n", 180 | " mean: 0.81369, std: 0.02371, params: {'n_estimators': 50, 'max_depth': 3},\n", 181 | " mean: 0.81257, std: 0.02241, params: {'n_estimators': 60, 'max_depth': 3},\n", 182 | " mean: 0.81706, std: 0.02833, params: {'n_estimators': 70, 'max_depth': 3},\n", 183 | " mean: 0.81706, std: 0.02833, params: {'n_estimators': 80, 'max_depth': 3},\n", 184 | " mean: 0.81818, std: 0.02990, params: {'n_estimators': 90, 'max_depth': 3},\n", 185 | " mean: 0.83053, std: 0.03091, params: {'n_estimators': 40, 'max_depth': 5},\n", 186 | " mean: 0.82604, std: 0.03252, params: {'n_estimators': 50, 'max_depth': 5},\n", 187 | " mean: 0.82492, std: 0.03402, params: {'n_estimators': 60, 'max_depth': 5},\n", 188 | " mean: 
0.82716, std: 0.03314, params: {'n_estimators': 70, 'max_depth': 5},\n", 189 | " mean: 0.82941, std: 0.03553, params: {'n_estimators': 80, 'max_depth': 5},\n", 190 | " mean: 0.82492, std: 0.03666, params: {'n_estimators': 90, 'max_depth': 5},\n", 191 | " mean: 0.82828, std: 0.03595, params: {'n_estimators': 40, 'max_depth': 7},\n", 192 | " mean: 0.82941, std: 0.03264, params: {'n_estimators': 50, 'max_depth': 7},\n", 193 | " mean: 0.82828, std: 0.03356, params: {'n_estimators': 60, 'max_depth': 7},\n", 194 | " mean: 0.82941, std: 0.03438, params: {'n_estimators': 70, 'max_depth': 7},\n", 195 | " mean: 0.82941, std: 0.03615, params: {'n_estimators': 80, 'max_depth': 7},\n", 196 | " mean: 0.82828, std: 0.03680, params: {'n_estimators': 90, 'max_depth': 7},\n", 197 | " mean: 0.82941, std: 0.03438, params: {'n_estimators': 40, 'max_depth': 9},\n", 198 | " mean: 0.82941, std: 0.03256, params: {'n_estimators': 50, 'max_depth': 9},\n", 199 | " mean: 0.82941, std: 0.03488, params: {'n_estimators': 60, 'max_depth': 9},\n", 200 | " mean: 0.82941, std: 0.03625, params: {'n_estimators': 70, 'max_depth': 9},\n", 201 | " mean: 0.82941, std: 0.03399, params: {'n_estimators': 80, 'max_depth': 9},\n", 202 | " mean: 0.83165, std: 0.03457, params: {'n_estimators': 90, 'max_depth': 9}],\n", 203 | " {'max_depth': 9, 'n_estimators': 90},\n", 204 | " 0.8316498316498316)" 205 | ] 206 | }, 207 | "execution_count": 8, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "params = {\n", 214 | " 'max_depth': range(3, 11, 2),\n", 215 | " 'n_estimators': range(40, 100, 10)\n", 216 | "}\n", 217 | "grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=5)\n", 218 | "grid_search.fit(train_data[features], train_data['Survived'])\n", 219 | "grid_search.grid_scores_, grid_search.best_params_, grid_search.best_score_" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 9, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "pre = grid_search.predict(test[features])\n", 231 | "predict_dataframe = pd.DataFrame({\n", 232 | " \"PassengerId\": test[\"PassengerId\"],\n", 233 | " \"Survived\": pre\n", 234 | "})\n", 235 | "predict_dataframe.to_csv('../sub/xgboost-gridsearch.csv',index=False,encoding=\"utf-8\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [] 246 | } 247 | ], 248 | "metadata": { 249 | "anaconda-cloud": {}, 250 | "kernelspec": { 251 | "display_name": "Python [default]", 252 | "language": "python", 253 | "name": "python2" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 2 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython2", 265 | "version": "2.7.12" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 0 270 | } 271 | -------------------------------------------------------------------------------- /titanic/src/model_xgb.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*-- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from xgboost import XGBClassifier 6 | from sklearn.grid_search import GridSearchCV 7 | 8 | 9 | print "load data..." 
10 | train = pd.read_csv("data/train.csv") 11 | test = pd.read_csv("data/test.csv") 12 | 13 | print "train data set:", train.shape 14 | print "test data set:", test.shape 15 | 16 | print "clean data..." 17 | 18 | def clean_data(titanic): 19 | titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median()) 20 | 21 | titanic.loc[titanic["Sex"] == "male", "Sex"] = 0 22 | titanic.loc[titanic["Sex"] == "female", "Sex"] = 1 23 | 24 | titanic.loc[titanic["Embarked"] == 'S', "Embarked"] = 1 25 | titanic.loc[titanic["Embarked"] == 'C', "Embarked"] = 2 26 | titanic.loc[titanic["Embarked"] == 'Q', "Embarked"] = 3 27 | titanic.loc[titanic["Embarked"].isnull(), "Embarked"] = 1 28 | 29 | titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median()) 30 | 31 | return titanic 32 | 33 | train_data = clean_data(train) 34 | test_data = clean_data(test) 35 | 36 | # Engineer Features 37 | print "Engineer Feature..." 38 | predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"] 39 | 40 | # Model of XGBoost 41 | print "fit model..." 42 | xgb = XGBClassifier( 43 | objective='binary:logistic', 44 | n_estimators=100, 45 | learning_rate=0.1, 46 | max_depth=5, 47 | min_child_weight=1, 48 | nthread=4, 49 | seed=100, 50 | ) 51 | params = { 52 | 'max_depth': range(3, 11, 2) 53 | } 54 | grid_search = GridSearchCV(estimator=xgb, param_grid=params, cv=5) 55 | grid_search.fit(train_data[predictors], train_data['Survived']) 56 | print grid_search.grid_scores_, grid_search.best_params_, grid_search.best_score_ 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /titanic/src/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.ensemble import RandomForestClassifier 3 | 4 | from load_data import load_data 5 | from feature_engineer import feature_engineer 6 | 7 | 8 | if __name__ == "__main__": 9 | train_df, test_df, submission = load_data() 10 | 11 | x_train = feature_engineer(train_df) 12 | y_train = train_df["Survived"] 13 | x_test = feature_engineer(test_df) 14 | 15 | rf = RandomForestClassifier( 16 | n_estimators=500, 17 | min_samples_split=12, 18 | min_samples_leaf=1, 19 | oob_score=True, 20 | random_state=2019 21 | ) 22 | 23 | # train 24 | rf.fit(x_train, y_train) 25 | y_pre = rf.predict(x_test) 26 | 27 | submission["Survived"] = y_pre 28 | submission.to_csv("../output/submission.csv", index=False, encoding="utf-8") -------------------------------------------------------------------------------- /titanic/src/utils.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | # !/usr/bin/python 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from datetime import datetime 7 | 8 | 9 | # 10 | # load data 11 | # 12 | def load_data(): 13 | train = pd.read_csv('../input/train.csv') 14 | test = pd.read_csv('../input/test.csv') 15 | submission = pd.read_csv('../input/submission.csv') 16 | 17 | return train, test, submission 18 | 19 | # 20 | # submit file 21 | # 22 | def submission(pre): 23 | test = pd.read_csv('../input/test.csv') 24 | submission = pd.DataFrame({ 25 | "PassengerId": test["PassengerId"], 26 | "Survived": pre 27 | }) 28 | 29 | submit_file = '../sub/{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M')) 30 | print("write submit file:{}".format(submit_file)) 31 | submission.to_csv(submit_file, encoding="utf-8", index=False) --------------------------------------------------------------------------------
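Note: the grid-search notebooks above and model_xgb.py rely on `sklearn.grid_search.GridSearchCV` and its `grid_scores_` attribute, which were removed in scikit-learn 0.20. The snippet below is a minimal sketch (not part of the original repository) of how the same Titanic grid search could be written against the current `sklearn.model_selection` API; the input path and the simplified feature preparation are assumptions standing in for the notebook's `clean_data()`.

```python
# Sketch only: the Titanic XGBoost grid search rewritten for scikit-learn >= 0.20,
# where sklearn.grid_search and grid_scores_ no longer exist.
# The CSV path and the simplified feature preparation below are assumptions.
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

train = pd.read_csv("../input/train.csv")

# Minimal stand-in for the notebook's clean_data(): build only the columns used below.
train["sex"] = (train["Sex"] == "male").astype(int)
train["embark"] = train["Embarked"].fillna("S").map({"S": 1, "C": 2, "Q": 3})
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
features = ["Pclass", "sex", "Fare", "embark"]

clf = XGBClassifier(learning_rate=0.1, objective="binary:logistic")
params = {
    "max_depth": list(range(3, 11, 2)),
    "n_estimators": list(range(40, 100, 10)),
}

grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=5)
grid_search.fit(train[features], train["Survived"])

# cv_results_ replaces the removed grid_scores_ attribute.
print(grid_search.best_params_, grid_search.best_score_)
report = pd.DataFrame(grid_search.cv_results_)[["params", "mean_test_score", "std_test_score"]]
print(report)
```

`cv_results_` is a dict of per-candidate arrays, so loading it into a DataFrame is the closest equivalent to the mean/std listing that `grid_scores_` used to return.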