├── GbmRStarter.R
├── NaiveBayes-Python.ipynb
├── README.md
├── Rank_1_Roman
    ├── Instructions
    ├── first model.ipynb
    ├── mean_evaluation.py
    └── start.ipynb
├── Rank_2_Sergazy
    ├── Instructions
    ├── best.py
    ├── layer2.py
    ├── lstm.py
    ├── readability.py
    ├── syllables_en.py
    ├── utils.py
    └── word2vecUtils.py
├── XGBoost_Python_TextFeats.ipynb
└── xgboost_starter.R


/GbmRStarter.R:
--------------------------------------------------------------------------------
 1 | # load data and libraries
 2 | 
 3 | library(data.table)
 4 | library(lubridate)
 5 | library(stringr)
 6 | 
 7 | train <- fread("train.csv")
 8 | test <- fread("test.csv")
 9 | 
10 | # data dimension
11 | 
12 | sprintf("There are %s rows and %s columns in train data ",nrow(train),ncol(train))
13 | sprintf("There are %s rows and %s columns in test data ",nrow(test),ncol(test))
14 | 
15 | # convert unix time format 
16 | 
17 | unix_feats <- c('deadline','state_changed_at','created_at','launched_at')
18 | train[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats]
19 | test[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats]
20 | 
21 | # create simple features
22 | 
23 | len_feats <- c('name_len','desc_len','keywords_len')
24 | count_feats <- c('name_count','desc_count','keywords_count')
25 | cols <- c('name','desc','keywords')
26 | 
27 | train[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols]
28 | train[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols]
29 | 
30 | test[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols]
31 | test[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols]
32 | 
33 | # encode features
34 | 
35 | train[,disable_communication := as.integer(as.factor(disable_communication))-1]
36 | train[,country := as.integer(as.factor(country))-1]
37 | 
38 | test[,disable_communication := as.integer(as.factor(disable_communication))-1]
39 | test[,country := as.integer(as.factor(country))-1]
40 | 
41 | 
42 | # cols to use in modeling
43 | cols_to_use <- c('final_status'
44 |                  ,'name_len'
45 |                  ,'desc_len'
46 |                  ,'keywords_len'
47 |                  ,'name_count'
48 |                  ,'desc_count'
49 |                  ,'keywords_count')
50 | 
51 | 
52 | # GBM
53 | library(gbm)
54 | set.seed(1)
55 | 
56 | X_train <- copy(train)
57 | X_train[,final_status := as.factor(final_status)]
58 | 
59 | clf_model <- gbm(final_status ~ .
60 |                  ,data = train[,cols_to_use,with=F]
61 |                  ,n.trees = 500
62 |                  ,interaction.depth = 5
63 |                  ,shrinkage = 0.3
64 |                  ,train.fraction = 0.6
65 |                  ,verbose = T)
66 | 
67 | 
68 | # check variable importance
69 | summary(clf_model, n.trees = 125)
70 | 
71 | # make predictions
72 | clf_pred <- predict(clf_model, newdata = test, n.trees = 232,type = 'response')
73 | clf_pred <- ifelse(clf_pred > 0.6,1,0)
74 | 
75 | # write file
76 | subst <- data.table(project_id = test$project_id, final_status = clf_pred)
77 | fwrite(subst, "gbm_starter.csv") #0.65754
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/NaiveBayes-Python.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### This script is based on simple features derived from text variables. "
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 167,
 13 |    "metadata": {
 14 |     "collapsed": true,
 15 |     "deletable": true,
 16 |     "editable": true
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "from sklearn.naive_bayes import GaussianNB"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "metadata": {
 28 |     "deletable": true,
 29 |     "editable": true
 30 |    },
 31 |    "source": [
 32 |     "### load data"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 168,
 38 |    "metadata": {
 39 |     "collapsed": true,
 40 |     "deletable": true,
 41 |     "editable": true
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "train = pd.read_csv(\"train.csv\")\n",
 46 |     "test = pd.read_csv(\"test.csv\")"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 169,
 52 |    "metadata": {
 53 |     "collapsed": false,
 54 |     "deletable": true,
 55 |     "editable": true
 56 |    },
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "The train data has 108129 rows and 14 columns\n",
 63 |       "The test data has 63465 rows and 12 columns\n"
 64 |      ]
 65 |     }
 66 |    ],
 67 |    "source": [
 68 |     "print ('The train data has {} rows and {} columns'.format(train.shape[0],train.shape[1]))\n",
 69 |     "print ('The test data has {} rows and {} columns'.format(test.shape[0],test.shape[1]))"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {
 75 |     "deletable": true,
 76 |     "editable": true
 77 |    },
 78 |    "source": [
 79 |     "### convert time to unix format"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 170,
 85 |    "metadata": {
 86 |     "collapsed": true,
 87 |     "deletable": true,
 88 |     "editable": true
 89 |    },
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "import time\n",
 93 |     "\n",
 94 |     "unix_cols = ['deadline','state_changed_at','launched_at','created_at']\n",
 95 |     "\n",
 96 |     "for x in unix_cols:\n",
 97 |     "    train[x] = train[x].apply(lambda k: time.ctime(k))\n",
 98 |     "    test[x] = test[x].apply(lambda k: time.ctime(k))"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {
104 |     "deletable": true,
105 |     "editable": true
106 |    },
107 |    "source": [
108 |     "### create simple features"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 171,
114 |    "metadata": {
115 |     "collapsed": false,
116 |     "deletable": true,
117 |     "editable": true
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "cols_to_use = ['name','desc']\n",
122 |     "len_feats = ['name_len','desc_len']\n",
123 |     "count_feats = ['name_count','desc_count']\n",
124 |     "\n",
125 |     "for i in np.arange(2):\n",
126 |     "    train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)\n",
127 |     "    train[count_feats[i]] = train[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 172,
133 |    "metadata": {
134 |     "collapsed": true,
135 |     "deletable": true,
136 |     "editable": true
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "train['keywords_len'] = train['keywords'].apply(str).apply(len)\n",
141 |     "train['keywords_count'] = train['keywords'].apply(str).apply(lambda x: len(x.split('-')))"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 173,
147 |    "metadata": {
148 |     "collapsed": false
149 |    },
150 |    "outputs": [],
151 |    "source": [
152 |     "for i in np.arange(2):\n",
153 |     "    test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)\n",
154 |     "    test[count_feats[i]] = test[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))\n",
155 |     "    \n",
156 |     "test['keywords_len'] = test['keywords'].apply(str).apply(len)\n",
157 |     "test['keywords_count'] = test['keywords'].apply(str).apply(lambda x: len(x.split('-')))"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {
163 |     "deletable": true,
164 |     "editable": true
165 |    },
166 |    "source": [
167 |     "### encoding features"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 174,
173 |    "metadata": {
174 |     "collapsed": false,
175 |     "deletable": true,
176 |     "editable": true
177 |    },
178 |    "outputs": [],
179 |    "source": [
180 |     "from sklearn.preprocessing import LabelEncoder\n",
181 |     "\n",
182 |     "feat = ['disable_communication','country']\n",
183 |     "for x in feat:\n",
184 |     "    le = LabelEncoder()\n",
185 |     "    le.fit(list(train[x].values) + list(test[x].values))\n",
186 |     "    train[x] = le.transform(list(train[x]))\n",
187 |     "    test[x] = le.transform(list(test[x].values))\n",
188 |     "    "
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "### model training"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 175,
201 |    "metadata": {
202 |     "collapsed": false
203 |    },
204 |    "outputs": [],
205 |    "source": [
206 |     "cols_to_use = ['name_len'\n",
207 |     "                 ,'desc_len'\n",
208 |     "                 ,'keywords_len'\n",
209 |     "                 ,'name_count'\n",
210 |     "                 ,'desc_count'\n",
211 |     "                 ,'keywords_count']\n",
212 |     "\n",
213 |     "target = train['final_status']\n",
214 |     "\n",
215 |     "# data for modeling\n",
216 |     "k_train = train[cols_to_use]\n",
217 |     "k_test = test[cols_to_use]"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "### naive bayes"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 176,
230 |    "metadata": {
231 |     "collapsed": true
232 |    },
233 |    "outputs": [],
234 |    "source": [
235 |     "gnb = GaussianNB()\n",
236 |     "nvb_pred = gnb.fit(k_train, target).predict(k_test)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "markdown",
241 |    "metadata": {},
242 |    "source": [
243 |     "### write the file"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 177,
249 |    "metadata": {
250 |     "collapsed": true
251 |    },
252 |    "outputs": [],
253 |    "source": [
254 |     "nBsub = pd.DataFrame({'project_id':test['project_id'],'final_status':nvb_pred})\n",
255 |     "nBsub = nBsub[['project_id','final_status']]\n",
256 |     "nBsub.to_csv(\"nBstarter.csv\",index = False) #0.6526"
257 |    ]
258 |   }
259 |  ],
260 |  "metadata": {
261 |   "kernelspec": {
262 |    "display_name": "Python 2",
263 |    "language": "python",
264 |    "name": "python2"
265 |   },
266 |   "language_info": {
267 |    "codemirror_mode": {
268 |     "name": "ipython",
269 |     "version": 2
270 |    },
271 |    "file_extension": ".py",
272 |    "mimetype": "text/x-python",
273 |    "name": "python",
274 |    "nbconvert_exporter": "python",
275 |    "pygments_lexer": "ipython2",
276 |    "version": "2.7.13"
277 |   }
278 |  },
279 |  "nbformat": 4,
280 |  "nbformat_minor": 2
281 | }
282 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Machine Learning Challenge #2
 2 | 
 3 | This repository contains R and Python scripts to help ML aspirants and enthusiasts get a nice head-start and learn something new from  this 15 days Machine Learning Challenge.
 4 | It is an online competition, hence people from all over the world can participate.
 5 | 
 6 | **Problem:** Predict if a project will get successfully funded or not.  
 7 | 
 8 | **Prize:** $700, $500
 9 | 
10 | **Duration:** 15th June 2017 to 30th June 2017
11 | 
12 | **Link:** https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-2/machine-learning/funding-successful-projects/
13 | 


--------------------------------------------------------------------------------
/Rank_1_Roman/Instructions:
--------------------------------------------------------------------------------
1 | To reproduce the final submission file::
2 | 
3 | 1. Run first_model.py
4 | 2. start.py
5 | 
6 | Note: Keep mean_evaluation.py in the same directory as the files above.
7 | 


--------------------------------------------------------------------------------
/Rank_1_Roman/first model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [
 10 |     {
 11 |      "name": "stderr",
 12 |      "output_type": "stream",
 13 |      "text": [
 14 |       "/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
 15 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
 16 |      ]
 17 |     }
 18 |    ],
 19 |    "source": [
 20 |     "import xgboost as xgb\n",
 21 |     "import numpy as np\n",
 22 |     "import pandas as pd\n",
 23 |     "import os\n",
 24 |     "from sklearn.preprocessing import LabelEncoder\n",
 25 |     "from mean_evaluation import roman_mean\n",
 26 |     "import datetime\n",
 27 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 28 |     "import re\n",
 29 |     "from scipy.sparse import hstack\n",
 30 |     "from sklearn.linear_model import LogisticRegression, SGDClassifier\n",
 31 |     "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
 32 |     "from sklearn.model_selection import cross_val_predict\n",
 33 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 34 |     "from sklearn.svm import SVC, LinearSVC\n",
 35 |     "from scipy.spatial import distance"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "metadata": {
 42 |     "collapsed": false
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "train = pd.read_csv('train.csv')\n",
 47 |     "test = pd.read_csv('test.csv')\n",
 48 |     "\n",
 49 |     "final_status = train.final_status\n",
 50 |     "projest_id = train.project_id\n",
 51 |     "backers_count = train.backers_count\n",
 52 |     "\n",
 53 |     "ltr = len(train)\n",
 54 |     "train.drop(['final_status', 'backers_count'], axis = 1, inplace = True)\n",
 55 |     "\n",
 56 |     "data = pd.concat([train, test], axis = 0)\n",
 57 |     "data.index = range(len(data))"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 6,
 63 |    "metadata": {
 64 |     "collapsed": true
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "roman_model = roman_mean(directory = 'path', \n",
 69 |     "                         n_folds_gen = 10, \n",
 70 |     "                         n_folds_sub = 5, \n",
 71 |     "                         seed = 322, \n",
 72 |     "                         sub_seed = 228, \n",
 73 |     "                         ltr = ltr, \n",
 74 |     "                         data = data, \n",
 75 |     "                         target = final_status)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 7,
 81 |    "metadata": {
 82 |     "collapsed": true
 83 |    },
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "int_project_id = []\n",
 87 |     "for x in data.project_id.tolist():\n",
 88 |     "    int_project_id += [int(x[4:])]\n",
 89 |     "data['int_project_id'] = int_project_id"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 8,
 95 |    "metadata": {
 96 |     "collapsed": true
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "int_disable_communication = []\n",
101 |     "for x in data.disable_communication.tolist():\n",
102 |     "    if x == False:\n",
103 |     "        int_disable_communication += [0]\n",
104 |     "    else:\n",
105 |     "        int_disable_communication += [1]\n",
106 |     "data['disable_communication_int'] = int_disable_communication"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 10,
112 |    "metadata": {
113 |     "collapsed": true
114 |    },
115 |    "outputs": [],
116 |    "source": [
117 |     "data['deadline-created_at'] = data.deadline - data.created_at\n",
118 |     "data['launched_at-created_at'] = data.deadline - data.created_at\n",
119 |     "data['state_changed_at-created_at'] = data.deadline - data.created_at\n",
120 |     "data['state_changed_at-deadline'] = data.state_changed_at - data.deadline\n",
121 |     "data['deadline-launched_at'] = data.deadline - data.launched_at\n",
122 |     "data['state_changed_at-launched_at'] = data.state_changed_at - data.launched_at"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 11,
128 |    "metadata": {
129 |     "collapsed": true
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "data['len_name'] = [len(str(x)) for x in data.name.tolist()]\n",
134 |     "data['len_desc'] = [len(str(x)) for x in data.desc.tolist()]\n",
135 |     "data['len_keywords'] = [len(str(x)) for x in data.keywords.tolist()]\n",
136 |     "data['numb_keywords'] = [len(str(x).split('-')) for x in data.keywords.tolist()]"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 13,
142 |    "metadata": {
143 |     "collapsed": true
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "len_cov = []\n",
148 |     "for x in data.desc.tolist():\n",
149 |     "    tokens = re.findall('\\\"', str(x))\n",
150 |     "    len_cov += [len(tokens)]\n",
151 |     "data['len_cov'] = len_cov\n",
152 |     "data['bad_znak'] =  data['len_cov'] / data['len_desc']"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 14,
158 |    "metadata": {
159 |     "collapsed": true
160 |    },
161 |    "outputs": [],
162 |    "source": [
163 |     "keywords = [re.sub('-', ' ', str(x)) for x in data.keywords.tolist()]\n",
164 |     "vectorizer = TfidfVectorizer(max_features = 3500, stop_words = 'english')\n",
165 |     "keywords_vect = vectorizer.fit_transform(keywords)"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 17,
171 |    "metadata": {
172 |     "collapsed": true
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "names = [str(x) for x in data.name.tolist()]\n",
177 |     "vectorizer = TfidfVectorizer(max_features = 3500, stop_words = 'english')\n",
178 |     "names_vect = vectorizer.fit_transform(names)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": 18,
184 |    "metadata": {
185 |     "collapsed": true
186 |    },
187 |    "outputs": [],
188 |    "source": [
189 |     "desc = [str(x) for x in data.desc.tolist()]\n",
190 |     "vectorizer = TfidfVectorizer(max_features = 3500, stop_words = 'english')\n",
191 |     "desc_vect = vectorizer.fit_transform(desc)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 19,
197 |    "metadata": {
198 |     "collapsed": true
199 |    },
200 |    "outputs": [],
201 |    "source": [
202 |     "del vectorizer"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 20,
208 |    "metadata": {
209 |     "collapsed": true
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "sp_data = hstack([keywords_vect, names_vect, desc_vect]).tocsr()"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 22,
219 |    "metadata": {
220 |     "collapsed": true
221 |    },
222 |    "outputs": [],
223 |    "source": [
224 |     "time_feat = ['deadline', 'created_at', 'launched_at',  'state_changed_at']\n",
225 |     "for time in time_feat:\n",
226 |     "    weekday = []\n",
227 |     "    hour = []\n",
228 |     "    day = []\n",
229 |     "    for x in data.loc[:, time].tolist():\n",
230 |     "        weekday += [datetime.datetime.fromtimestamp(x).weekday()]\n",
231 |     "        hour += [datetime.datetime.fromtimestamp(x).hour]\n",
232 |     "        day += [datetime.datetime.fromtimestamp(x).day]\n",
233 |     "    data[time + '_' + 'weekday'] = weekday\n",
234 |     "    data[time + '_' + 'hour'] = hour\n",
235 |     "    data[time + '_' + 'day'] = day"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 24,
241 |    "metadata": {
242 |     "collapsed": false
243 |    },
244 |    "outputs": [
245 |     {
246 |      "name": "stdout",
247 |      "output_type": "stream",
248 |      "text": [
249 |       "deadline_hour_weekday\n",
250 |       "created_at_hour_weekday\n",
251 |       "launched_at_hour_weekday\n",
252 |       "state_changed_at_hour_weekday\n"
253 |      ]
254 |     }
255 |    ],
256 |    "source": [
257 |     "for time in time_feat:\n",
258 |     "    print(time + '_' + 'hour_weekday')\n",
259 |     "    data[time + '_' + 'hour_weekday'] = data[time + '_' + 'hour'].astype(str) + '_' + data[time + '_' + 'weekday'].astype(str)\n",
260 |     "    data[time + '_' + 'hour_country'] = data[time + '_' + 'hour'].astype(str) + '_' + data['country'].astype(str)\n",
261 |     "    data[time + '_' + 'weekday_country'] = data[time + '_' + 'weekday'].astype(str) + '_' + data['country'].astype(str)\n",
262 |     "    data[time + '_' + 'day_country'] = data[time + '_' + 'day'].astype(str) + '_' + data['country'].astype(str)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 25,
268 |    "metadata": {
269 |     "collapsed": true
270 |    },
271 |    "outputs": [],
272 |    "source": [
273 |     "normal_goal = []\n",
274 |     "for x, y in zip(data.currency.tolist(), data.goal.tolist()):\n",
275 |     "    if x == 'USD':\n",
276 |     "        normal_goal += [y]\n",
277 |     "    if x == 'GBP':\n",
278 |     "        normal_goal += [1.5 * y]\n",
279 |     "    if x == 'EUR':\n",
280 |     "        normal_goal += [1.2 * y]\n",
281 |     "    if x == 'CAD':\n",
282 |     "        normal_goal += [0.85 * y]\n",
283 |     "    if x == 'AUD':\n",
284 |     "        normal_goal += [0.85 * y]\n",
285 |     "    if x == 'SEK':\n",
286 |     "        normal_goal += [0.14 * y]\n",
287 |     "    if x == 'NZD':\n",
288 |     "        normal_goal += [0.70 * y]\n",
289 |     "    if x == 'DKK':\n",
290 |     "        normal_goal += [0.17 * y]\n",
291 |     "    if x == 'NOK':\n",
292 |     "        normal_goal += [0.15 * y]\n",
293 |     "    if x == 'CHF':\n",
294 |     "        normal_goal += [y]\n",
295 |     "    if x == 'MXN':\n",
296 |     "        normal_goal += [0.07 * y]\n",
297 |     "    if x == 'SGD':\n",
298 |     "        normal_goal += [0.73 * y]\n",
299 |     "    if x == 'HKD':\n",
300 |     "        normal_goal += [0.13 * y]\n",
301 |     "\n",
302 |     "data['normal_goal'] = normal_goal\n",
303 |     "data['deadline-created_at_normal_goal'] = data.loc[:, 'deadline-created_at'] / data.normal_goal\n",
304 |     "data['launched_at-created_at_normal_goal'] = data.loc[:, 'launched_at-created_at'] / data.normal_goal\n",
305 |     "data['state_changed_at-created_at_normal_goal'] = data.loc[:, 'state_changed_at-created_at'] / data.normal_goal\n",
306 |     "data['state_changed_at-deadline_normal_goal'] = data.loc[:, 'state_changed_at-deadline'] / data.normal_goal\n",
307 |     "data['deadline-launched_at_normal_goal'] = data.loc[:, 'deadline-launched_at'] / data.normal_goal\n",
308 |     "data['state_changed_at-launched_at_normal_goal'] = data.loc[:, 'state_changed_at-launched_at'] / data.normal_goal"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": 27,
314 |    "metadata": {
315 |     "collapsed": false
316 |    },
317 |    "outputs": [
318 |     {
319 |      "name": "stdout",
320 |      "output_type": "stream",
321 |      "text": [
322 |       "country\n",
323 |       "currency\n",
324 |       "deadline_hour_weekday\n",
325 |       "created_at_hour_weekday\n",
326 |       "launched_at_hour_weekday\n",
327 |       "state_changed_at_hour_weekday\n"
328 |      ]
329 |     }
330 |    ],
331 |    "source": [
332 |     "roman_model.cols_mean(['country', 'currency', 'deadline_hour_weekday',\n",
333 |     "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday'])"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 28,
339 |    "metadata": {
340 |     "collapsed": false,
341 |     "scrolled": true
342 |    },
343 |    "outputs": [
344 |     {
345 |      "name": "stdout",
346 |      "output_type": "stream",
347 |      "text": [
348 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\project_id\n",
349 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\name\n",
350 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\desc\n",
351 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\goal\n",
352 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\keywords\n",
353 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\disable_communication\n",
354 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\country\n",
355 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\currency\n",
356 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline\n",
357 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at\n",
358 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at\n",
359 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at\n",
360 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\int_project_id\n",
361 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\disable_communication_int\n",
362 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-created_at\n",
363 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at-created_at\n",
364 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-created_at\n",
365 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-deadline\n",
366 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-launched_at\n",
367 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-launched_at\n",
368 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_name\n",
369 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_desc\n",
370 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_keywords\n",
371 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\numb_keywords\n",
372 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_cov\n",
373 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\bad_znak\n",
374 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_weekday\n",
375 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_hour\n",
376 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_day\n",
377 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_weekday\n",
378 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_hour\n",
379 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_day\n",
380 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_weekday\n",
381 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_hour\n",
382 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_day\n",
383 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_weekday\n",
384 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_hour\n",
385 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_day\n",
386 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_hour_weekday\n",
387 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_hour_country\n",
388 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_weekday_country\n",
389 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_day_country\n",
390 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_hour_weekday\n",
391 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_hour_country\n",
392 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_weekday_country\n",
393 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_day_country\n",
394 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_hour_weekday\n",
395 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_hour_country\n",
396 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_weekday_country\n",
397 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_day_country\n",
398 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_hour_weekday\n",
399 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_hour_country\n",
400 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_weekday_country\n",
401 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_day_country\n",
402 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\normal_goal\n",
403 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-created_at_normal_goal\n",
404 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at-created_at_normal_goal\n",
405 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-created_at_normal_goal\n",
406 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-deadline_normal_goal\n",
407 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-launched_at_normal_goal\n",
408 |       "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-launched_at_normal_goal\n"
409 |      ]
410 |     }
411 |    ],
412 |    "source": [
413 |     "roman_model.save_in_file(data)"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": 29,
419 |    "metadata": {
420 |     "collapsed": true
421 |    },
422 |    "outputs": [],
423 |    "source": [
424 |     "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc',\n",
425 |     "               'max_depth':8, 'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 5}"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 30,
431 |    "metadata": {
432 |     "collapsed": true
433 |    },
434 |    "outputs": [],
435 |    "source": [
436 |     "feature_list = ['country_mean', \n",
437 |     "                'currency_mean', \n",
438 |     "                'disable_communication_int', \n",
439 |     "                'normal_goal',\n",
440 |     "                'deadline-created_at', \n",
441 |     "                'launched_at-created_at', \n",
442 |     "                'state_changed_at-created_at', \n",
443 |     "                'deadline-launched_at',\n",
444 |     "                'state_changed_at-deadline',\n",
445 |     "                'state_changed_at-launched_at',\n",
446 |     "                'deadline-created_at_normal_goal', \n",
447 |     "                'launched_at-created_at_normal_goal', \n",
448 |     "                'state_changed_at-created_at_normal_goal', \n",
449 |     "                'deadline-launched_at_normal_goal',\n",
450 |     "                'state_changed_at-deadline_normal_goal',\n",
451 |     "                'state_changed_at-launched_at_normal_goal', \n",
452 |     "                'len_name', \n",
453 |     "                'len_desc', \n",
454 |     "                'len_keywords',  \n",
455 |     "                'created_at_hour', 'created_at_weekday', 'created_at_day', \n",
456 |     "                'deadline_hour', 'deadline_weekday', 'deadline_day', \n",
457 |     "                'launched_at_hour', 'launched_at_weekday', 'launched_at_day', \n",
458 |     "                'state_changed_at_hour', 'state_changed_at_weekday', 'state_changed_at_day', \n",
459 |     "        'canceled', 'deadline_hour_weekday_mean',\n",
460 |     "'created_at_hour_weekday_mean', \n",
461 |     "'launched_at_hour_weekday_mean',\n",
462 |     "'state_changed_at_hour_weekday_mean']"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": 32,
468 |    "metadata": {
469 |     "collapsed": false
470 |    },
471 |    "outputs": [
472 |     {
473 |      "name": "stdout",
474 |      "output_type": "stream",
475 |      "text": [
476 |       "Calculate 1/10\n",
477 |       "1/36\n"
478 |      ]
479 |     },
480 |     {
481 |      "ename": "FileNotFoundError",
482 |      "evalue": "File b'/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\\\features\\\\country_mean\\\\country_mean.csv' does not exist",
483 |      "output_type": "error",
484 |      "traceback": [
485 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
486 |       "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
487 |       "\u001b[0;32m<ipython-input-32-82d5d2cc0636>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mroman_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredictSparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdic_par\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;31m# roman_model.predict(dic_par, stack_feat, 5000, True, False)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
488 |       "\u001b[0;32m/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah/mean_evaluation.py\u001b[0m in \u001b[0;36mpredictSparse\u001b[0;34m(self, dic_par, sparse, feature_list, num_round, save, fscore, score)\u001b[0m\n\u001b[1;32m    283\u001b[0m                     \u001b[0mfeature_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_feat_directory\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'\\\\'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    284\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 285\u001b[0;31m                     \u001b[0mfeature_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_feat_directory\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'\\\\'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfeature\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    286\u001b[0m                 \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_col\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    287\u001b[0m             \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msparse\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtocsr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
489 |       "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m    644\u001b[0m                     skip_blank_lines=skip_blank_lines)\n\u001b[1;32m    645\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 646\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    648\u001b[0m     \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
490 |       "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    387\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    388\u001b[0m     \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 389\u001b[0;31m     \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    390\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    391\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mchunksize\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
491 |       "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    728\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    729\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 730\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    731\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    732\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
492 |       "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m    921\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    922\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 923\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    924\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    925\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
493 |       "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m   1388\u001b[0m         \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1390\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1391\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1392\u001b[0m         \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
494 |       "\u001b[0;32mpandas/parser.pyx\u001b[0m in \u001b[0;36mpandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)\u001b[0;34m()\u001b[0m\n",
495 |       "\u001b[0;32mpandas/parser.pyx\u001b[0m in \u001b[0;36mpandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)\u001b[0;34m()\u001b[0m\n",
496 |       "\u001b[0;31mFileNotFoundError\u001b[0m: File b'/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\\\features\\\\country_mean\\\\country_mean.csv' does not exist"
497 |      ]
498 |     }
499 |    ],
500 |    "source": [
501 |     "roman_model.predictSparse(dic_par, sp_data, feature_list, 5000, True, False)\n",
502 |     "# roman_model.predict(dic_par, stack_feat, 5000, True, False)"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": null,
508 |    "metadata": {
509 |     "collapsed": true
510 |    },
511 |    "outputs": [],
512 |    "source": []
513 |   }
514 |  ],
515 |  "metadata": {
516 |   "anaconda-cloud": {},
517 |   "kernelspec": {
518 |    "display_name": "Python 3",
519 |    "language": "python",
520 |    "name": "python3"
521 |   },
522 |   "language_info": {
523 |    "codemirror_mode": {
524 |     "name": "ipython",
525 |     "version": 3
526 |    },
527 |    "file_extension": ".py",
528 |    "mimetype": "text/x-python",
529 |    "name": "python",
530 |    "nbconvert_exporter": "python",
531 |    "pygments_lexer": "ipython3",
532 |    "version": "3.5.2"
533 |   }
534 |  },
535 |  "nbformat": 4,
536 |  "nbformat_minor": 1
537 | }
538 | 


--------------------------------------------------------------------------------
/Rank_1_Roman/mean_evaluation.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # In[17]:
  5 | 
  6 | import pandas as pd
  7 | import numpy as np
  8 | import xgboost as xgb
  9 | import pickle
 10 | import re
 11 | from sklearn.model_selection import StratifiedKFold
 12 | from sklearn.feature_extraction import DictVectorizer
 13 | from sklearn.model_selection import cross_val_predict 
 14 | from sklearn.metrics import roc_auc_score
 15 | from sklearn.linear_model import LogisticRegression
 16 | from sklearn.ensemble import RandomForestClassifier
 17 | from sklearn.preprocessing import OneHotEncoder
 18 | from scipy.sparse import hstack
 19 | import os
 20 | from xgboost import XGBClassifier
 21 | from sklearn.metrics import accuracy_score
 22 | 
 23 | 
 24 | 
 25 | # In[18]:
 26 | 
 27 | class roman_mean:
 28 |     def __init__(self, directory, data, target, n_folds_gen, n_folds_sub, seed, sub_seed, ltr,
 29 |                  extra_train = None, extra_target = None):
 30 |         self.directory = directory
 31 |         self.n_folds_gen = n_folds_gen
 32 |         self.n_folds_sub = n_folds_sub
 33 |         self.seed = seed
 34 |         self.sub_seed = sub_seed
 35 |         self.ltr = ltr
 36 |         self.data = data
 37 |         self.target = target
 38 |         self.extra_train = extra_train
 39 |         self.extra_target = extra_target
 40 |     
 41 |     def save_in_file(self, data):
 42 |         for x in data.columns.values:
 43 |             directory = self.directory + '\\features\\' + x
 44 |             print(directory)
 45 |             if not os.path.exists(directory):
 46 |                 os.makedirs(directory)
 47 |             else:
 48 |                 print(x + ' already save.')
 49 |                 continue
 50 |             data.loc[:, x].to_csv(directory + '\\' + x + '.csv', index = None, header = True)
 51 |     
 52 |     #mean_eval + mean_start + cols_mean Computation mean values by target with double cross_validation
 53 |     def mean_eval(self, pred, alpha, train_fold, test_fold, target, col_name):
 54 |         if type(self.extra_train) == type(None):
 55 |             cur_train = train_fold
 56 |             cur_target = target
 57 |         else:
 58 |             cur_train = pd.concat([self.extra_train.loc[:, col_name], train_fold], axis = 0)
 59 |             cur_train.index = range(len(cur_train))
 60 |             cur_target = pd.concat([self.extra_target, target], axis = 0)
 61 |             cur_target.index = range(len(cur_target))
 62 |         grouped = cur_target.groupby(cur_train)
 63 |         grouped_mean = grouped.mean().to_dict()
 64 |         grouped_count = grouped.count().to_dict()
 65 |         glob_mean = cur_target.mean()
 66 |         pred[list(test_fold.index)] = [(grouped_mean[x] * grouped_count[x] + glob_mean * alpha) / (grouped_count[x] + alpha) 
 67 |                                  if x in grouped_mean else glob_mean for x in test_fold]
 68 |   
 69 |     def mean_start(self, col):
 70 |         kf_gen = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True)
 71 |         kf_sub = StratifiedKFold(n_splits=self.n_folds_sub, random_state=self.sub_seed, shuffle=True)
 72 |         alpha = 20
 73 |         directory = self.directory + '\\features\\' + col.name + '_mean'
 74 |         if not os.path.exists(directory):
 75 |             os.makedirs(directory)
 76 |         else:
 77 |             print(col.name + ' already exist.')
 78 |             return
 79 |         for i, (train_index, test_index) in enumerate(kf_gen.split(col[:self.ltr], self.target)):
 80 |             pred = pd.Series([-1] * len(col))
 81 |             sub_col = col[train_index]
 82 |             sub_target = self.target[train_index]
 83 |             for j, (sub_train_index, sub_test_index) in enumerate(kf_sub.split(sub_col, sub_target)):
 84 |                 self.mean_eval(pred, alpha,
 85 |                           sub_col.iloc[sub_train_index], 
 86 |                           sub_col.iloc[sub_test_index], 
 87 |                           sub_target.iloc[sub_train_index], col.name)
 88 |             self.mean_eval(pred, alpha,
 89 |                       col[train_index], 
 90 |                       col[test_index], 
 91 |                       self.target[train_index], col.name)
 92 |             self.mean_eval(pred, alpha, 
 93 |                       col[train_index], 
 94 |                       col[self.ltr:], 
 95 |                       self.target[train_index], col.name)
 96 |             pred.name = col.name + '_mean'
 97 |             pred.to_csv(self.directory + '\\features\\' + col.name + '_mean' + '\\' + str(i)
 98 |                         + '.csv', index = None, header = True)
 99 |     
100 |     def cols_mean(self, cols):
101 |         for col in cols:
102 |             print(col)
103 |             self.mean_start(self.data.loc[:, col])
104 | 
105 |     #Computation factor machine with double cross_validation
106 |     
107 |     #Computation logistic regression with double cross_validaton
108 |     def cols_LR(self, feature_list):
109 |         
110 |         kf_gen = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True)
111 |         kf_sub = StratifiedKFold(n_splits=self.n_folds_sub, random_state=self.sub_seed, shuffle=True)
112 |         for i, (train_index, test_index) in enumerate(kf_gen.split(self.data[:self.ltr], self.target)):
113 |             print(i)
114 |             sp_data = pd.DataFrame()
115 |             features_directory = self.directory + '\\features'
116 |             col_i = 1
117 |             for feature in feature_list:
118 |                 print(str(col_i) + '/' + str(len(feature_list)))
119 |                 col_i += 1
120 |                 cur_feat_directory = features_directory + '\\' + feature
121 |                 if len(os.listdir(cur_feat_directory)) > 1:
122 |                     feature_col = pd.read_csv(cur_feat_directory + '\\' + str(i) + '.csv')
123 |                 else:
124 |                     feature_col = pd.read_csv(cur_feat_directory + '\\' + feature + '.csv')
125 |                 sp_data = pd.concat([sp_data, feature_col], axis = 1)
126 |                 del feature_col
127 |             pred = pd.Series([-1] * len(self.data))
128 |             clf = LogisticRegression(C = 20, n_jobs = -1)
129 |             pred[train_index] = cross_val_predict(clf, sp_data.loc[train_index, :], self.target[train_index], cv = kf_sub, 
130 |                                                   method = 'predict_proba', n_jobs = -1)[:, 1]
131 |             print('OK')
132 |             clf.fit(sp_data.loc[train_index, :], self.target[train_index])
133 |             pred[test_index] = clf.predict_proba(sp_data.loc[test_index, :])[:, 1]
134 |             print(roc_auc_score(self.target[test_index], pred[test_index]))
135 |             pred[self.ltr:] = clf.predict_proba(sp_data.loc[self.ltr:, :])[:, 1]
136 |             pred.name = 'LR_true'
137 |             directory = self.directory + '\\features\\LR_true2\\' 
138 |             if not os.path.exists(directory):
139 |                     os.makedirs(directory)
140 |             pred.to_csv(directory + str(i) + '.csv', index = None, header = True)
141 | 
142 |     #Computation xgboost predict with double cross_validation         
143 |     def cols_XGB(self, feature_list, dic_par_list, num_round):
144 |         features_directory = self.directory + '//features'
145 | 
146 |         kf_gen = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True)
147 |         kf_sub = StratifiedKFold(n_splits=self.n_folds_sub, random_state=self.sub_seed, shuffle=True)
148 |         for i, (train_index, test_index) in enumerate(kf_gen.split(self.data[:self.ltr], self.target)):
149 |             print('Calculate ' + str(i + 1) + '/' + str(self.n_folds_gen))
150 |             data = pd.DataFrame()
151 |             col_i = 1
152 |             for feature in feature_list:
153 |                 print(str(col_i) + '/' + str(len(feature_list)))
154 |                 col_i += 1
155 |                 cur_feat_directory = features_directory + '//' + feature
156 |                 if len(os.listdir(cur_feat_directory)) > 1:
157 |                     feature_col = pd.read_csv(cur_feat_directory + '//' + str(i) + '.csv')
158 |                 else:
159 |                     feature_col = pd.read_csv(cur_feat_directory + '//' + feature + '.csv')
160 |                 data = pd.concat([data, feature_col], axis = 1)
161 |                 #print(feature_col.columns)
162 |             del feature_col
163 |             print(i)
164 |             for k, dic_par in enumerate(dic_par_list):
165 |                 pred = pd.Series([-1] * len(self.data))
166 |                 for j, (sub_train_index, sub_test_index) in enumerate(kf_sub.split(self.data.loc[train_index, :], self.target[train_index])):
167 |                     print(i, k, j)
168 |                     xgall = xgb.DMatrix(data.loc[train_index[sub_train_index], :], self.target[train_index[sub_train_index]])
169 |                     xgeval = xgb.DMatrix(data.loc[train_index[sub_test_index], :], self.target[train_index[sub_test_index]])
170 |                     bst = xgb.train(dic_par, xgall, maximize=True, early_stopping_rounds=20,
171 |                                 num_boost_round=num_round, evals=[(xgall, 'train'), (xgeval, 'test')], verbose_eval=False)            
172 |                     pred[train_index[sub_test_index]] = bst.predict(xgeval)
173 |                     del xgall, xgeval, bst
174 |                 xgall = xgb.DMatrix(data.loc[train_index, :], self.target[train_index])
175 |                 xg_cvtest = xgb.DMatrix(data.loc[test_index, :], self.target[test_index])
176 |                 xg_test = xgb.DMatrix(data.loc[self.ltr:, :])
177 |                 bst = xgb.train(dic_par, xgall, maximize=True, early_stopping_rounds=20,
178 |                 num_boost_round=num_round, evals=[(xgall, 'train'), (xg_cvtest, 'test')], verbose_eval=False) 
179 |                 pred[test_index] = bst.predict(xg_cvtest)
180 |                 pred[self.ltr:] = bst.predict(xg_test)
181 |                 print(bst.best_score)
182 |                 print(roc_auc_score(self.target[test_index], pred[test_index]))
183 |                 name = 'XGB' + str(k + 3)
184 |                 pred.name = name
185 |                 directory = self.directory + '//features//' + name + '//' 
186 |                 if not os.path.exists(directory):
187 |                         os.makedirs(directory)
188 |                 pred.to_csv(directory + str(i) + '.csv', index = None, header = True)
189 |                 del xgall, xg_cvtest, xg_test, bst, pred
190 | 
191 |     #Computation lightGBM with double cross_validaton
192 |                 
193 |     #Computation SVD recommends with double cross_validation            
194 |     #Computation xgboost predict    
195 |     def predict(self, dic_par, feature_list, num_round, save = False, fscore = False):
196 |         pred = pd.Series([-1] * len(self.data))
197 |         kf = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True)
198 |         kf_split = kf.split(self.data.loc[:self.ltr-1, :], self.target)
199 |         features_directory = self.directory + '\\features'
200 |         pred_test = pd.DataFrame()
201 |         score_list = []
202 |         tree_limit = []
203 |         pred_directory = self.directory + '\\models'
204 |         model_number = 100    
205 |         model_directory = pred_directory + '\\model_' + str(model_number)
206 |         if not os.path.exists(model_directory):
207 |             os.makedirs(model_directory)
208 |         for i, (train_index, test_index) in enumerate(kf_split):
209 |             print('Calculate ' + str(i + 1) + '/' + str(self.n_folds_gen))
210 |             data = pd.DataFrame()
211 |             col_i = 1
212 |             for feature in feature_list:
213 |                 print(str(col_i) + '/' + str(len(feature_list)))
214 |                 col_i += 1
215 |                 cur_feat_directory = features_directory + '\\' + feature
216 |                 if len(os.listdir(cur_feat_directory)) > 1:
217 |                     feature_col = pd.read_csv(cur_feat_directory + '\\' + str(i) + '.csv')
218 |                 else:
219 |                     feature_col = pd.read_csv(cur_feat_directory + '\\' + feature + '.csv')
220 |                 data = pd.concat([data, feature_col], axis = 1)
221 |             xgall = xgb.DMatrix(data.loc[train_index, :], self.target[train_index])
222 |             xgeval = xgb.DMatrix(data.loc[test_index, :], self.target[test_index])
223 |             bst = xgb.train(dic_par, xgall, maximize=False, early_stopping_rounds=30,
224 |                             num_boost_round=num_round, evals=[(xgall, 'train'), (xgeval, 'test')], verbose_eval=50)
225 |             del xgall, xgeval
226 |             xg_cvtest = xgb.DMatrix(data.loc[test_index, :])
227 |             xg_test = xgb.DMatrix(data.loc[self.ltr:, :])
228 |             del data
229 |             pred[test_index] = bst.predict(xg_cvtest, ntree_limit=bst.best_ntree_limit)
230 |             score_list += [bst.best_score]
231 |             if fscore == True:
232 |                 return bst.get_fscore()
233 |             print(bst.best_score)
234 |             print(bst.best_ntree_limit)
235 |             tree_limit += [bst.best_ntree_limit]
236 |             cur_pred = pd.DataFrame(bst.predict(xg_test, ntree_limit=bst.best_ntree_limit))
237 |             pred_test = pd.concat([pred_test, cur_pred], axis = 1)
238 |             pred.to_csv(model_directory + '\\predict' + str(i) + '.csv')
239 |             pred_test.to_csv(model_directory + '\\pred_test' + str(i) + '.csv')
240 |         pred[self.ltr:] = np.array(pred_test.mean(axis = 1))
241 |         del xg_cvtest, xg_test, bst
242 |         if save == True:
243 |             pred_directory = self.directory + '\\models'
244 |             if not os.path.exists(pred_directory):
245 |                 os.makedirs(pred_directory)
246 |             model_number = len(os.listdir(pred_directory)) + 1
247 |             model_directory = pred_directory + '\\model_' + str(model_number)
248 |             if not os.path.exists(model_directory):
249 |                 os.makedirs(model_directory)
250 |             f = open(model_directory + '\\info.txt', 'w')
251 |             f.write('Model_' + str(model_number) + ' info:\n')
252 |             for i, (x, y) in enumerate(zip(score_list, tree_limit)):
253 |                 f.write('Fold ' + str(i + 1) + ': Score: ' + str(x) + ' Tree_number: ' + str(y) + '\n')
254 |             f.write('Model score:' + str(1 - np.mean(score_list)))
255 |             f.close()
256 |             pred.to_csv(model_directory + '\\predict.csv')
257 |             del pred
258 |      
259 |     def predictSparse(self, dic_par, sparse, feature_list, num_round, save = False, fscore = False, score = False):
260 |         pred = pd.Series([-1] * len(self.data))
261 |         kf = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True)
262 |         kf_split = kf.split(self.data.loc[:self.ltr-1, :], self.target)
263 |         features_directory = self.directory + '\\features'
264 |         pred_test = pd.DataFrame()
265 |         score_list = []
266 |         tree_limit = []
267 |         pred_directory = self.directory + '\\models'
268 |         model_number = 100    
269 |         model_directory = pred_directory + '\\model_' + str(model_number)
270 |         if not os.path.exists(model_directory):
271 |             os.makedirs(model_directory)
272 |         for i, (train_index, test_index) in enumerate(kf_split):
273 |             if score == False:
274 |                 print('Calculate ' + str(i + 1) + '/' + str(self.n_folds_gen))
275 |             data = pd.DataFrame()
276 |             col_i = 1
277 |             for feature in feature_list:
278 |                 if score == False:
279 |                     print(str(col_i) + '/' + str(len(feature_list)))
280 |                 col_i += 1
281 |                 cur_feat_directory = features_directory + '\\' + feature
282 |                 if len(os.listdir(cur_feat_directory)) > 1:
283 |                     feature_col = pd.read_csv(cur_feat_directory + '\\' + str(i) + '.csv')
284 |                 else:
285 |                     feature_col = pd.read_csv(cur_feat_directory + '\\' + feature + '.csv')
286 |                 data = pd.concat([data, feature_col], axis = 1)
287 |             data = hstack([data, sparse]).tocsr()
288 |             xgall = xgb.DMatrix(data[train_index], self.target[train_index])
289 |             xgeval = xgb.DMatrix(data[test_index], self.target[test_index])
290 |             bst = xgb.train(dic_par, xgall, maximize=False, early_stopping_rounds=50,
291 |                             num_boost_round=num_round, evals=[(xgall, 'train'), (xgeval, 'test')], verbose_eval=20)
292 |             del xgall, xgeval
293 |             xg_cvtest = xgb.DMatrix(data[test_index])
294 |             xg_test = xgb.DMatrix(data[self.ltr:])
295 |             del data
296 |             pred[test_index] = bst.predict(xg_cvtest, ntree_limit=bst.best_ntree_limit)
297 |             score_list += [bst.best_score]
298 |             if fscore == True:
299 |                 return bst.get_fscore()
300 |             print(bst.best_score)
301 |             print(bst.best_ntree_limit)
302 |             tree_limit += [bst.best_ntree_limit]
303 |             cur_pred = pd.DataFrame(bst.predict(xg_test, ntree_limit=bst.best_ntree_limit))
304 |             pred_test = pd.concat([pred_test, cur_pred], axis = 1)
305 |             pred.to_csv(model_directory + '\\predict' + str(i) + '.csv')
306 |             pred_test.to_csv(model_directory + '\\pred_test' + str(i) + '.csv')
307 |         pred[self.ltr:] = np.array(pred_test.mean(axis = 1))
308 |         if score == True:
309 |             return accuracy_score(self.target, round(pred[:self.ltr]).astype(int))
310 |         del xg_cvtest, xg_test, bst
311 |         if save == True:
312 |             pred_directory = self.directory + '\\models'
313 |             if not os.path.exists(pred_directory):
314 |                 os.makedirs(pred_directory)
315 |             model_number = len(os.listdir(pred_directory)) + 1
316 |             model_directory = pred_directory + '\\model_' + str(model_number)
317 |             if not os.path.exists(model_directory):
318 |                 os.makedirs(model_directory)
319 |             f = open(model_directory + '\\info.txt', 'w')
320 |             f.write('Model_' + str(model_number) + ' info:\n')
321 |             for i, (x, y) in enumerate(zip(score_list, tree_limit)):
322 |                 f.write('Fold ' + str(i + 1) + ': Score: ' + str(x) + ' Tree_number: ' + str(y) + '\n')
323 |             f.write('Model score:' + str(1 - np.mean(score_list)))
324 |             f.close()
325 |             pred.to_csv(model_directory + '\\predict.csv')
326 |             del pred
327 | 
328 | 


--------------------------------------------------------------------------------
/Rank_1_Roman/start.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [
 10 |     {
 11 |      "name": "stderr",
 12 |      "output_type": "stream",
 13 |      "text": [
 14 |       "/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
 15 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
 16 |      ]
 17 |     }
 18 |    ],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "import xgboost as xgb\n",
 23 |     "import datetime\n",
 24 |     "import re\n",
 25 |     "from sklearn.svm import LinearSVC\n",
 26 |     "from sklearn.metrics import accuracy_score"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "CREATE DATA"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 2,
 39 |    "metadata": {
 40 |     "collapsed": true
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "train = pd.read_csv('train.csv')\n",
 45 |     "test = pd.read_csv('test.csv')\n",
 46 |     "\n",
 47 |     "final_status = train.final_status\n",
 48 |     "projest_id = train.project_id\n",
 49 |     "backers_count = train.backers_count\n",
 50 |     "\n",
 51 |     "ltr = len(train)\n",
 52 |     "train.drop(['final_status', 'backers_count'], axis = 1, inplace = True)\n",
 53 |     "\n",
 54 |     "data = pd.concat([train, test], axis = 0)\n",
 55 |     "data.index = range(len(data))"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 3,
 61 |    "metadata": {
 62 |     "collapsed": true
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "int_disable_communication = []\n",
 67 |     "for x in data.disable_communication.tolist():\n",
 68 |     "    if x == False:\n",
 69 |     "        int_disable_communication += [0]\n",
 70 |     "    else:\n",
 71 |     "        int_disable_communication += [1]\n",
 72 |     "data['disable_communication'] = int_disable_communication"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "metadata": {
 79 |     "collapsed": true
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "data['deadline-created_at'] = data.deadline - data.created_at\n",
 84 |     "data['launched_at-created_at'] = data.deadline - data.created_at\n",
 85 |     "data['state_changed_at-created_at'] = data.deadline - data.created_at\n",
 86 |     "data['state_changed_at-deadline'] = data.state_changed_at - data.deadline\n",
 87 |     "data['deadline-launched_at'] = data.deadline - data.launched_at\n",
 88 |     "data['state_changed_at-launched_at'] = data.state_changed_at - data.launched_at"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 5,
 94 |    "metadata": {
 95 |     "collapsed": true
 96 |    },
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "data['len_name'] = [len(str(x)) for x in data.name.tolist()]\n",
100 |     "data['len_desc'] = [len(str(x)) for x in data.desc.tolist()]\n",
101 |     "data['len_keywords'] = [len(str(x)) for x in data.keywords.tolist()]\n",
102 |     "data['numb_keywords'] = [len(str(x).split('-')) for x in data.keywords.tolist()]"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 6,
108 |    "metadata": {
109 |     "collapsed": true
110 |    },
111 |    "outputs": [],
112 |    "source": [
113 |     "len_cov = []\n",
114 |     "for x in data.desc.tolist():\n",
115 |     "    tokens = re.findall('\\\"', str(x))\n",
116 |     "    len_cov += [len(tokens)]\n",
117 |     "data['len_cov'] = len_cov\n",
118 |     "data['bad_znak'] =  data['len_cov'] / data['len_desc']"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 7,
124 |    "metadata": {
125 |     "collapsed": true
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "normal_goal = []\n",
130 |     "for x, y in zip(data.currency.tolist(), data.goal.tolist()):\n",
131 |     "    if x == 'USD':\n",
132 |     "        normal_goal += [y]\n",
133 |     "    if x == 'GBP':\n",
134 |     "        normal_goal += [1.5 * y]\n",
135 |     "    if x == 'EUR':\n",
136 |     "        normal_goal += [1.2 * y]\n",
137 |     "    if x == 'CAD':\n",
138 |     "        normal_goal += [0.85 * y]\n",
139 |     "    if x == 'AUD':\n",
140 |     "        normal_goal += [0.85 * y]\n",
141 |     "    if x == 'SEK':\n",
142 |     "        normal_goal += [0.14 * y]\n",
143 |     "    if x == 'NZD':\n",
144 |     "        normal_goal += [0.70 * y]\n",
145 |     "    if x == 'DKK':\n",
146 |     "        normal_goal += [0.17 * y]\n",
147 |     "    if x == 'NOK':\n",
148 |     "        normal_goal += [0.15 * y]\n",
149 |     "    if x == 'CHF':\n",
150 |     "        normal_goal += [y]\n",
151 |     "    if x == 'MXN':\n",
152 |     "        normal_goal += [0.07 * y]\n",
153 |     "    if x == 'SGD':\n",
154 |     "        normal_goal += [0.73 * y]\n",
155 |     "    if x == 'HKD':\n",
156 |     "        normal_goal += [0.13 * y]\n",
157 |     "\n",
158 |     "data['normal_goal'] = normal_goal\n",
159 |     "data['deadline-created_at_normal_goal'] = data.loc[:, 'deadline-created_at'] / data.normal_goal\n",
160 |     "data['launched_at-created_at_normal_goal'] = data.loc[:, 'launched_at-created_at'] / data.normal_goal\n",
161 |     "data['state_changed_at-created_at_normal_goal'] = data.loc[:, 'state_changed_at-created_at'] / data.normal_goal\n",
162 |     "data['state_changed_at-deadline_normal_goal'] = data.loc[:, 'state_changed_at-deadline'] / data.normal_goal\n",
163 |     "data['deadline-launched_at_normal_goal'] = data.loc[:, 'deadline-launched_at'] / data.normal_goal\n",
164 |     "data['state_changed_at-launched_at_normal_goal'] = data.loc[:, 'state_changed_at-launched_at'] / data.normal_goal"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 8,
170 |    "metadata": {
171 |     "collapsed": false
172 |    },
173 |    "outputs": [
174 |     {
175 |      "name": "stdout",
176 |      "output_type": "stream",
177 |      "text": [
178 |       "deadline_hour_weekday\n",
179 |       "created_at_hour_weekday\n",
180 |       "launched_at_hour_weekday\n",
181 |       "state_changed_at_hour_weekday\n"
182 |      ]
183 |     }
184 |    ],
185 |    "source": [
186 |     "time_feat = ['deadline', 'created_at', 'launched_at',  'state_changed_at']\n",
187 |     "for time in time_feat:\n",
188 |     "    weekday = []\n",
189 |     "    hour = []\n",
190 |     "    day = []\n",
191 |     "    for x in data.loc[:, time].tolist():\n",
192 |     "        weekday += [datetime.datetime.fromtimestamp(x).weekday()]\n",
193 |     "        hour += [datetime.datetime.fromtimestamp(x).hour]\n",
194 |     "        day += [datetime.datetime.fromtimestamp(x).day]\n",
195 |     "    data[time + '_' + 'weekday'] = weekday\n",
196 |     "    data[time + '_' + 'hour'] = hour\n",
197 |     "    data[time + '_' + 'day'] = day\n",
198 |     "    \n",
199 |     "for time in time_feat:\n",
200 |     "    print(time + '_' + 'hour_weekday')\n",
201 |     "    data[time + '_' + 'hour_weekday'] = data[time + '_' + 'hour'].astype(str) + '_' + data[time + '_' + 'weekday'].astype(str)\n",
202 |     "    data[time + '_' + 'hour_country'] = data[time + '_' + 'hour'].astype(str) + '_' + data['country'].astype(str)\n",
203 |     "    data[time + '_' + 'weekday_country'] = data[time + '_' + 'weekday'].astype(str) + '_' + data['country'].astype(str)\n",
204 |     "    data[time + '_' + 'day_country'] = data[time + '_' + 'day'].astype(str) + '_' + data['country'].astype(str)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 9,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": [
215 |     "canceled = []\n",
216 |     "for x in data.name.tolist():\n",
217 |     "    if len(re.findall('Canceled', str(x))) > 0:\n",
218 |     "        canceled += [1]\n",
219 |     "    else:\n",
220 |     "        canceled += [0]\n",
221 |     "data['canceled'] = canceled"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": 10,
227 |    "metadata": {
228 |     "collapsed": true
229 |    },
230 |    "outputs": [],
231 |    "source": [
232 |     "for x in ['deadline_hour_weekday',\n",
233 |     "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday']:\n",
234 |     "    for y in ['country', 'currency']:\n",
235 |     "        data[x + y] = (data[x] + data[y]).astype('category').cat.codes\n",
236 |     "\n",
237 |     "for x in ['deadline_hour_weekday',\n",
238 |     "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday']:\n",
239 |     "    for y in ['country']:\n",
240 |     "        for z in ['currency']:\n",
241 |     "            data[x + y] = (data[x] + data[y] + data[z]).astype('category').cat.codes\n",
242 |     "            \n",
243 |     "for x in ['country', 'currency', 'deadline_hour_weekday',\n",
244 |     "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday']:\n",
245 |     "    data[x] = data[x].astype('category').cat.codes"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 11,
251 |    "metadata": {
252 |     "collapsed": true
253 |    },
254 |    "outputs": [],
255 |    "source": [
256 |     "for x in ['deadline_hour_country', 'deadline_weekday_country', 'deadline_day_country', 'created_at_hour_country',\n",
257 |     "'created_at_weekday_country', 'created_at_day_country', 'launched_at_hour_country', 'launched_at_weekday_country',\n",
258 |     "'launched_at_day_country', 'state_changed_at_hour_country', 'state_changed_at_weekday_country', 'state_changed_at_day_country']:\n",
259 |     "    data[x] = data[x].astype('category').cat.codes"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 12,
265 |    "metadata": {
266 |     "collapsed": true
267 |    },
268 |    "outputs": [],
269 |    "source": [
270 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
271 |     "\n",
272 |     "name = data.name.fillna('None').tolist()\n",
273 |     "tfidf = TfidfVectorizer(max_features = 500, stop_words = 'english', ngram_range = (1, 2))\n",
274 |     "name_vect = tfidf.fit_transform(name)\n",
275 |     "\n",
276 |     "desc = data.desc.fillna('None').tolist()\n",
277 |     "tfidf = TfidfVectorizer(max_features = 2000, stop_words = 'english', ngram_range = (1, 4))\n",
278 |     "desc_vect = tfidf.fit_transform(desc)\n",
279 |     "\n",
280 |     "keywords = data.keywords.tolist()\n",
281 |     "tfidf = TfidfVectorizer(max_features = 1000, stop_words = 'english', ngram_range = (1, 3))\n",
282 |     "keywords_vect = tfidf.fit_transform(keywords)"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 13,
288 |    "metadata": {
289 |     "collapsed": true
290 |    },
291 |    "outputs": [],
292 |    "source": [
293 |     "from scipy.sparse import hstack\n",
294 |     "\n",
295 |     "sp_data = hstack([keywords_vect, name_vect, desc_vect]).tocsr()\n",
296 |     "del tfidf, keywords_vect, name_vect, desc_vect"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "markdown",
301 |    "metadata": {},
302 |    "source": [
303 |     "LEARN MODEL"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 14,
309 |    "metadata": {
310 |     "collapsed": false
311 |    },
312 |    "outputs": [
313 |     {
314 |      "name": "stdout",
315 |      "output_type": "stream",
316 |      "text": [
317 |       "1\n",
318 |       "1\n"
319 |      ]
320 |     }
321 |    ],
322 |    "source": [
323 |     "name = data.name.fillna('None').tolist()\n",
324 |     "tfidf = TfidfVectorizer(max_features = 2000, ngram_range = (1, 6), analyzer = 'char')\n",
325 |     "name_char = tfidf.fit_transform(name)\n",
326 |     "\n",
327 |     "print(1)\n",
328 |     "\n",
329 |     "desc = data.desc.fillna('None').tolist()\n",
330 |     "tfidf = TfidfVectorizer(max_features = 6000, ngram_range = (1, 6), analyzer = 'char')\n",
331 |     "desc_char = tfidf.fit_transform(desc)\n",
332 |     "print(1)\n",
333 |     "keywords = data.keywords.tolist()\n",
334 |     "tfidf = TfidfVectorizer(max_features = 3000, ngram_range = (1, 6), analyzer = 'char')\n",
335 |     "keywords_char = tfidf.fit_transform(keywords)"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": 15,
341 |    "metadata": {
342 |     "collapsed": true
343 |    },
344 |    "outputs": [],
345 |    "source": [
346 |     "sp_data = hstack([sp_data, name_char, desc_char, keywords_char]).tocsr()\n",
347 |     "del tfidf, name_char, desc_char, keywords_char"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": 16,
353 |    "metadata": {
354 |     "collapsed": true
355 |    },
356 |    "outputs": [],
357 |    "source": [
358 |     "pred = pd.DataFrame()\n",
359 |     "pred['svc'] = [-1] * (len(test))\n",
360 |     "clf = LinearSVC()\n",
361 |     "clf.fit(sp_data[:ltr], final_status)\n",
362 |     "pred['svc'] = clf.predict(sp_data[ltr:])"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 17,
368 |    "metadata": {
369 |     "collapsed": true
370 |    },
371 |    "outputs": [],
372 |    "source": [
373 |     "from sklearn.linear_model import LogisticRegression\n",
374 |     "clf = LogisticRegression(C = 2)\n",
375 |     "clf.fit(sp_data[:ltr], final_status)\n",
376 |     "pred['logreg'] = clf.predict_proba(sp_data[ltr:])[:, -1]"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 18,
382 |    "metadata": {
383 |     "collapsed": true
384 |    },
385 |    "outputs": [],
386 |    "source": [
387 |     "data = np.array(data.drop(['project_id', 'name', 'desc', 'keywords'], axis = 1))\n",
388 |     "sp_data1 = hstack([sp_data, data]).tocsr()"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 19,
394 |    "metadata": {
395 |     "collapsed": true
396 |    },
397 |    "outputs": [],
398 |    "source": [
399 |     "del data, sp_data"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": 20,
405 |    "metadata": {
406 |     "collapsed": true
407 |    },
408 |    "outputs": [],
409 |    "source": [
410 |     "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'error',\n",
411 |     "               'max_depth':5, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 5}\n",
412 |     "dtest = xgb.DMatrix(sp_data1[ltr:, 14500:])\n",
413 |     "dtrain = xgb.DMatrix(sp_data1[:ltr, 14500:], label=final_status)\n",
414 |     "\n",
415 |     "bst = xgb.train(dic_par, dtrain, 222)\n",
416 |     "pred['xgb1'] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": 21,
422 |    "metadata": {
423 |     "collapsed": true
424 |    },
425 |    "outputs": [],
426 |    "source": [
427 |     "del dtrain, dtest, bst"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": 22,
433 |    "metadata": {
434 |     "collapsed": true
435 |    },
436 |    "outputs": [],
437 |    "source": [
438 |     "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'error',\n",
439 |     "               'max_depth':8, 'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 5}\n",
440 |     "dtest = xgb.DMatrix(sp_data1[ltr:])\n",
441 |     "dtrain = xgb.DMatrix(sp_data1[:ltr], label=final_status)\n",
442 |     "\n",
443 |     "bst = xgb.train(dic_par, dtrain, 416)\n",
444 |     "pred['xgb2'] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "code",
449 |    "execution_count": 23,
450 |    "metadata": {
451 |     "collapsed": true
452 |    },
453 |    "outputs": [],
454 |    "source": [
455 |     "del dtrain, dtest, bst"
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": 24,
461 |    "metadata": {
462 |     "collapsed": true
463 |    },
464 |    "outputs": [],
465 |    "source": [
466 |     "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'error',\n",
467 |     "               'max_depth':7, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 6}\n",
468 |     "dtest = xgb.DMatrix(sp_data1[ltr:])\n",
469 |     "dtrain = xgb.DMatrix(sp_data1[:ltr], label=final_status)\n",
470 |     "\n",
471 |     "bst = xgb.train(dic_par, dtrain, 228)\n",
472 |     "pred['xgb3'] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": 25,
478 |    "metadata": {
479 |     "collapsed": true
480 |    },
481 |    "outputs": [],
482 |    "source": [
483 |     "del dtrain, dtest, bst"
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": 26,
489 |    "metadata": {
490 |     "collapsed": true
491 |    },
492 |    "outputs": [],
493 |    "source": [
494 |     "import lightgbm as lgb"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "code",
499 |    "execution_count": 27,
500 |    "metadata": {
501 |     "collapsed": true
502 |    },
503 |    "outputs": [],
504 |    "source": [
505 |     "params = {\n",
506 |     "    'objective': 'binary',\n",
507 |     "    'metric': 'binary_error',\n",
508 |     "    'num_leaves': 80,\n",
509 |     "    'learning_rate': 0.1,\n",
510 |     "    'feature_fraction': 0.9,\n",
511 |     "    'bagging_fraction': 0.8,\n",
512 |     "    'bagging_freq': 2\n",
513 |     "}\n",
514 |     "lgb_train = lgb.Dataset(sp_data1[:ltr], np.array(final_status))\n",
515 |     "gbm = lgb.train(params,\n",
516 |     "                lgb_train,\n",
517 |     "                num_boost_round=148)\n",
518 |     "pred['lgb1'] = gbm.predict(sp_data1[ltr:])"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "code",
523 |    "execution_count": 30,
524 |    "metadata": {
525 |     "collapsed": false
526 |    },
527 |    "outputs": [],
528 |    "source": [
529 |     "del lgb_train"
530 |    ]
531 |   },
532 |   {
533 |    "cell_type": "code",
534 |    "execution_count": 31,
535 |    "metadata": {
536 |     "collapsed": true
537 |    },
538 |    "outputs": [],
539 |    "source": [
540 |     "params = {\n",
541 |     "    'objective': 'binary',\n",
542 |     "    'metric': 'binary_error',\n",
543 |     "    'num_leaves': 50,\n",
544 |     "    'learning_rate': 0.1,\n",
545 |     "    'feature_fraction': 0.8,\n",
546 |     "    'bagging_fraction': 0.7,\n",
547 |     "}\n",
548 |     "lgb_train = lgb.Dataset(sp_data1[:ltr, 14500:], np.array(final_status))\n",
549 |     "gbm = lgb.train(params,\n",
550 |     "                lgb_train,\n",
551 |     "                num_boost_round=124)\n",
552 |     "pred['lgb2'] = gbm.predict(sp_data1[ltr:, 14500:])"
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": 34,
558 |    "metadata": {
559 |     "collapsed": false
560 |    },
561 |    "outputs": [],
562 |    "source": [
563 |     "del lgb_train"
564 |    ]
565 |   },
566 |   {
567 |    "cell_type": "markdown",
568 |    "metadata": {},
569 |    "source": [
570 |     "READ PREDICT FROM ANOTHER FILE"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "code",
575 |    "execution_count": 35,
576 |    "metadata": {
577 |     "collapsed": false
578 |    },
579 |    "outputs": [],
580 |    "source": [
581 |     "pred_new = pd.read_csv('predict.csv', header = None)\n",
582 |     "pred['xgb_old'] = pred_new[1][ltr:].tolist()"
583 |    ]
584 |   },
585 |   {
586 |    "cell_type": "markdown",
587 |    "metadata": {},
588 |    "source": [
589 |     "ADD BAGGING(SPLIT BY TIME INTERVAL, COEF FROM LINEAR REGRESSION WITH NOISE)"
590 |    ]
591 |   },
592 |   {
593 |    "cell_type": "code",
594 |    "execution_count": 36,
595 |    "metadata": {
596 |     "collapsed": false
597 |    },
598 |    "outputs": [],
599 |    "source": [
600 |     "final_pred = pd.Series([-100] * len(test))\n",
601 |     "col = ['logreg', 'xgb1', 'xgb2', 'xgb3', 'lgb1', 'lgb2', 'xgb_old']\n",
602 |     "new_col = [x + '1' for x in col]\n",
603 |     "pred[new_col] = pred[col].round()"
604 |    ]
605 |   },
606 |   {
607 |    "cell_type": "code",
608 |    "execution_count": 37,
609 |    "metadata": {
610 |     "collapsed": false
611 |    },
612 |    "outputs": [],
613 |    "source": [
614 |     "date = pd.to_datetime(test.created_at,unit='s')"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "code",
619 |    "execution_count": 38,
620 |    "metadata": {
621 |     "collapsed": false
622 |    },
623 |    "outputs": [],
624 |    "source": [
625 |     "y2017 = date[date.dt.year == 2017].index\n",
626 |     "coef = [-0.027163179271743373, 0.50829964473673006, 0.21379022476789045, 0.21797444121259385, \n",
627 |     "        0.10735714061372345, 0.17883463215622081, -0.11532700181862125, 0.0054078524855090682, \n",
628 |     "        0.019044340523711512, 0.020904671380734853, 0.016799278763598158, -0.035555538613519899, \n",
629 |     "        .070046648507590167, 0.015530628742049274, 0.041652250420829068]\n",
630 |     "final_pred[y2017] = (np.array(pred.loc[y2017, :]) * coef).sum(axis = 1).round()"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": 39,
636 |    "metadata": {
637 |     "collapsed": true
638 |    },
639 |    "outputs": [],
640 |    "source": [
641 |     "y2010 = date[date.dt.year.isin([2010,2011,2012,2013,2014])].index\n",
642 |     "\n",
643 |     "coef = [-0.027163179271743373, 0.50829964473673006, 0.21379022476789045, 0.21797444121259385, 0.10735714061372345, \n",
644 |     "        0.17883463215622081, -0.11532700181862125, 0.0054078524855090682, 0.019044340523711512, 0.020904671380734853, \n",
645 |     "        0.016799278763598158, -0.035555538613519899, 0.070046648507590167, 0.015530628742049274, 0.041652250420829068]\n",
646 |     "final_pred[y2010] = (np.array(pred.loc[y2010, :]) * coef).sum(axis = 1).round()"
647 |    ]
648 |   },
649 |   {
650 |    "cell_type": "code",
651 |    "execution_count": 40,
652 |    "metadata": {
653 |     "collapsed": true
654 |    },
655 |    "outputs": [],
656 |    "source": [
657 |     "coef_list2015 = [[[-0.41383995384225819, -0.16307572346026833, -0.10896588031522875, 0.35471079322753063, 0.22390883115227922, -0.32863241243857799, 0.097549160081059405, 1.1372073765804023, 0.87153515112747171, -0.080863813424291636, 0.28487078798716486, -0.55400069424988796, -0.17659008257259162, 0.35160714276697574, -0.21936988430087989], -0.08629609882731698], [[-0.39286094196229099, 0.34510484198718872, 0.89787931598668014, -1.486627118144938, 1.245325850060452, 1.1053038953569081, -1.4454198657086255, 0.55721922803318291, 0.27085791705579459, 0.064604935954636344, 0.13771723388179469, -0.11899338973184959, -0.064443006982082052, 0.14364967167706721, -0.27374821149664608], 0.0036556793890552552], [[0.083346325658424161, -0.12831268413013974, 1.1959007032760487, -0.5233849336825912, 0.39900940712026656, 0.42745138413344769, -2.1143414710142512, 1.4761439331118933, 0.083346325658423981, -0.10015748315347639, -0.47897883986230105, 0.40147546444362403, 0.31978463221249886, 0.46676606100165352, -0.43921808570249288], 0.10734424730150077], [[-0.025017998270211974, 0.8149649690076205, 0.22134553385093006, -0.27829229874696421, 0.2163670138984588, 1.4214632555094264, 0.020124861567298286, -0.66832723045793496, -0.11756296346591055, 0.024269691750705785, 0.20962183305716242, -0.39990573074287339, -0.17063736111252792, 0.052916305919370354, 0.27456099230977582], -0.087124834756392822], [[-0.17857188656760381, -0.50018596299048845, 1.4635581312988146, -0.0504486329542157, 1.4202366056430444, 0.13200100751257016, -1.3792525358813492, -0.0043358756294259448, 0.61417616538769393, -0.091765531130930111, -0.1849941363492198, -0.28635895012223983, 0.08344268469493421, 0.029755702251762495, -0.12095137688527768], 0.059323755834263248], [[0.23266570483925128, 0.59183162350733021, 0.85228046871709484, -0.18994063134104572, -0.13074213291852804, -0.57422222869450446, -0.55415140450869049, 1.4479664887719843, -0.50511446356786971, -0.3152090047679722, -0.12621744624435693, 0.053571357532884112, 0.57767217434333518, 0.26510233281407025, -0.37662132192200815], 0.013015069835713933], [[0.079112999667759101, 0.53610646520732519, -1.155397987642315, 0.2565622200003016, 0.14382146435348217, 1.3073772692753969, 0.87854749768335161, -0.42129780803337913, -0.24246377511520789, -0.055366480326769274, -0.14171743379467075, 0.26797543629685394, 0.19325962796902985, 0.062853835080842801, -0.31020409520730441], -0.001021638893230814], [[-0.14140528990693546, -0.4157381903396174, 0.32643815251828778, 0.47829470762966453, 0.14540838047690763, -0.070953783338113974, 0.07259713275983698, 1.0273916737378574, 0.2105072342428041, -0.15848873991231688, 0.23483160420629368, -0.37459121068207463, 0.03737705965952931, 0.092195302291443681, -0.085222845716571793], 0.0078031063428684044], [[0.10550662841276728, 0.72612646807507764, 0.88080140234336735, -0.058116892446551854, -0.30759765699777342, 0.85370792127323414, -0.73696074165507164, 0.1901825008658854, -0.18077110440369148, -0.1083241428815005, 0.10296198995045747, 0.13142052324342346, -0.1441361965581997, 0.15228816196374362, -0.16334161610384135], -0.042526473797645292], [[0.033740498382682026, 0.080843506890849542, -0.083625278536957698, 0.24728436962909617, -0.55848674776303997, 0.83791202376532703, 0.048375547816753614, 1.1468314736516987, -0.069594498991358028, -0.055720452596246917, 0.037641457795474897, 0.043932420232981995, -0.049062634778156131, -0.012202354713624297, -0.21266052896534438], 0.0022399725905200008], [[-0.17409663626947336, 0.33843178141729097, -0.53125937805643997, 0.79958304675601433, 0.04106395228403914, 0.034285833381269976, 0.70360379309129073, 0.45037001128304066, -0.009205923702995028, -0.031796753471828643, 0.033143113104870336, -0.20158329715391332, 0.32201721875719203, -0.26813488809618247, -0.08392575310327377], -0.013782307499392965], [[-0.30701584284152922, 0.5942492683928583, 0.23239298742646675, 0.51924826923963019, -0.018419637046819481, 0.45063965311071841, 0.71939432972730277, -0.99474793277483242, 0.43454889983497075, -0.01691951305694267, -0.12875288169134289, -0.2424468901124931, -0.051384055344316615, -0.026109141686985748, 0.33732870822515598], -0.022383224751678088], [[0.15177273114974479, -0.026347320130969451, 0.25714275716769786, -0.60081133386779295, 1.5031631570715014, 0.43161693644893617, -0.29902155663521324, 0.098163961202380037, -0.15912121243201116, -0.19138392116446523, 0.13537943701624849, -0.1437814082267192, -0.0049560636108855194, 0.13559807851976091, 0.049246074378967775], 0.022938205951003654], [[0.083373645090151383, 0.44047294051683972, 0.74861778201841345, -0.20226722175100323, 0.87339331088680272, -0.27091310474858565, -0.6773592374808135, 0.28950885701085288, -0.11428941375895169, 0.064335136098343737, 0.085030349125940985, -0.10794608697245556, 0.056561839146722304, 0.24180383330031219, -0.11578746024682307], -0.046245635922416151], [[-0.050371576006364556, 0.6726724372394467, 0.25161075195261418, 0.67810570052226105, 0.59750443557138855, -0.4095965095982248, 0.087341165718058555, -0.01399886708101386, -0.1129046724188246, 0.10705832882972, -0.08347928741234778, 0.036823106059101052, -0.046602563791332902, -0.060134854954618455, 0.053886915926962181], -0.070925883612583063], [[0.077757796330699722, 0.29439865068850385, 0.43827025281740206, 0.35369973922058889, -0.078012553798705031, 0.075846743133362399, -0.45084175214799821, 0.77945829571817871, -0.031010811043684208, 0.0021613621474618205, 0.0078662533977770233, 0.10998001384693995, -0.14270865670379468, 0.061212808939159924, -0.17481214510521037], -0.0082442323680364527], [[-0.0064521578344324771, 0.41694436929706225, 0.20744985991777531, 0.35062836067520409, 0.29385875391622807, 0.16496024305653861, 0.064932604791711881, 0.25991103584696063, -0.1497593824171288, -0.18249832759070406, -0.14947770489950465, -0.10636999044390935, 0.1030929747886534, -0.0052910789341637787, 0.0053631808788744129], -0.050704163539114], [[0.22618710579684795, 0.31056171954317158, 0.21148323865181329, 0.56238236792633811, -0.38802270579054898, 0.50123862939552399, 0.013060952495983269, 0.42142316833453486, -0.14627618413647422, -0.00074587493405625649, -0.080276015934133693, -0.020160317076800549, -0.065571331961821161, 0.11343891541036422, -0.12186886136797415], -0.067370582777799337], [[-0.021166784463613177, 0.37550565405307484, 0.40151444164484273, 0.55020638162551849, -0.40732832938350261, 0.52349114862659396, 0.092195903521491318, 0.00076687090595081589, 0.056342743605521772, 0.072530193047288166, -0.025441056107622151, 0.11105505003182653, -0.15479516729304266, -0.039872341320651872, 0.018317039855248352], -0.068029487047780002], [[-0.038502206012433142, 0.1846914167264177, -0.012346366389744123, -0.32675224793022695, 0.32883359264341405, 0.80578704244427946, 0.20223654563194438, 0.2867340017678921, 0.050752578137505111, 0.073907401772084463, 0.049807673019682697, -0.055565730740980895, -0.19496891199125929, -0.028915831048797663, 0.062628046019031713], -0.048029081326567191], [[0.10091406807450043, 0.29974702524870739, 0.4129055100414003, 0.081063474592990709, 0.73313546834260235, -0.050258705754476329, -0.076959817628007077, 0.15423294085164146, -0.063825304014272444, -0.016442470293993372, -0.058599320299935864, -0.05472523946420732, 0.026006936119348345, -0.025964162544909519, -0.055132724117975374], -0.057617515387948792], [[-0.13412571009324611, 0.28867598295950708, 0.83654410646535038, 0.28744844076126608, 0.096612475021383093, 0.22575683170007652, -0.38651929812922864, 0.22053437132158993, 0.091632146031291467, -0.092885990626032311, -0.085645502553226185, 0.064797226467597363, 0.071972380240118095, 0.0043700370634170427, -0.013152856788972489], -0.053802826760923406], [[0.11128812962073018, 0.090281010573965148, 0.4713430340572165, 0.49463928492923137, -0.85945069178762168, 1.1062059214649451, -0.11803519286306985, 0.27940357132598365, -0.010427237053803481, 0.06616255841965657, 0.047939009754045747, 0.13430250169613434, -0.26490673303777479, -0.095745368665667463, -0.058540872952473344], -0.035069013726363951], [[0.022176366776143132, 0.26783831291371463, 0.28277588311753432, 0.47846058106773559, 0.16097025556464878, 0.2571532474206788, 0.012421942353549095, 0.070547552636659627, 0.0038074992477577507, 0.060850205792590858, 0.011059711916478789, -0.17663880153962472, -0.08402572264726911, -0.048057976864883084, 0.10959690375543049], -0.073200989594718324], [[-0.090304449224284669, 0.49505311698570725, 0.9053338838928614, 0.40652720285926353, -0.11490412096443998, -0.098850876691273903, -0.39369186142375756, 0.34103526425569303, 0.038250938244487998, -0.029405985065134366, -0.18598476424640928, 0.08084270069570032, 0.090540519485523802, 0.057132674845881665, 0.052513088783564199], -0.07959706965928609], [[0.034032949283043942, 0.43840900310101066, 0.42569503775973472, 0.028574249007546057, 0.37555142622684001, 0.35767653562159568, -0.11049481867598984, 0.2268268093364002, -0.10829224779425944, -0.056374962049798139, -0.056207087776516756, -0.13788653698481018, 0.11076960903459684, -0.0010955468465459473, -0.011783861552982333], -0.056547139659597401], [[-0.083736262982500836, 0.51790857341129104, 0.22985557359544206, 0.036319451048168069, 0.08837876334638084, 0.70331749885554329, 0.13589645438844405, 0.20851642148244737, 0.022701999775491616, 0.083971680331921317, -0.079447240778194567, -0.099434288371511095, -0.14378732684723694, -0.070311448520760128, 0.036857677991006033], -0.10836956431151584], [[-0.068557504813032605, 0.26378895686980636, -0.26641784069181468, 0.040062426713872612, 0.56711754869939868, 0.42277416390708517, 0.2490748737220656, 0.30443647171716692, 0.062113553145254408, 0.040751176294925917, 0.056732236572030637, -0.074881428530962302, 0.003889022629293859, -0.053635216449648804, -0.056706126189836753], -0.056701181353114083], [[-0.17546570273489157, 0.43384915148572484, 0.16508675957300636, 0.44127927336104517, -0.88372309184143083, 0.98728170988045361, 0.28839230234366586, 0.27573818316821119, 0.2000919673725336, 0.03301671253297378, 0.14169158656984532, -0.0041617381686023802, -0.27982885612923369, -0.10830269104552115, -0.05674728609894264], -0.060562733003362679], [[-0.07666996966260306, 0.47714699207326589, 0.22605680907628603, 0.31502624020937509, -0.36475049392442677, 0.60351163741844072, -0.019577217727722446, 0.19943273776652326, 0.043846737957801002, -0.017958288969666725, 0.13689204825568435, 0.061463101783768737, -0.19770044948964921, 0.10110510955148963, -0.012943943165161009], -0.05128653661657806], [[-0.16132807588827527, 0.41774138045343401, -0.015151873430680387, 0.25997315921009484, 0.5763079399248735, -0.1113946094014111, 0.20336140106543807, 0.21908978843202909, 0.14287374729900668, -0.068921910922539964, -0.15961689075722965, -0.020865388903675952, 0.064943390319233324, -0.020705354823424116, 0.10982466710622138], -0.066536479411353544], [[0.033420578920728417, 0.36261397183334343, 0.11522342338398239, 0.26811771127358552, 0.45928653716345469, 0.088297155469125865, 0.046621023088578961, 0.16860717733653874, -0.10814273405689259, 0.09233421397035077, 0.061115125945500309, -0.10779632497544339, -0.011971386582525445, -0.060966834403295866, 0.013015925716524157], -0.041185756159896558], [[-0.2001934238267373, 0.35756821822539775, 0.65629248341856783, 0.7222437647466009, -0.45124717573004536, 0.16166894454599889, -0.55679306764127956, 0.74961179323168303, 0.11106026591460433, -0.044603850777884424, -0.069731305019352152, 0.016434566356183844, -0.14644456070896128, 0.10611304915772002, 0.037700570463224614], -0.071258519696936196], [[-0.10295951257957371, 0.26497789321265153, 0.17735696503865289, 0.25004078842064581, -0.1647187228055734, 0.65231499049870001, 0.43516256847086809, 0.11431377802962625, 0.10800730125931264, -0.14992026834471867, -0.12102503699504252, 0.12357349660658856, -0.082886271233495457, -0.067460966622815949, 0.036317888288777633], -0.070039765613889027], [[-0.096907389037331454, 0.17050439167661569, 0.64205061346879988, 0.76313628034326009, -0.0044909688331003783, 0.30761665709503117, -0.31100357813475199, 0.025406843425650694, 0.14796448023372932, -0.083493968100144422, -0.17118020356075014, -0.024403338970001798, -0.092876865018402421, 0.071073147526174274, 0.047936643137305279], -0.038680520288416842], [[0.079948716058980601, 0.31351124735614011, 0.55810534252532806, 0.61721371456929985, -0.20591359843991136, 0.14379263217675714, -0.092754694036302099, 0.45850832554198534, -0.089644213069306686, -0.1481613738985047, -0.0027943169767884468, -0.06712279432969534, -0.035892366242658402, 0.07226570483180475, -0.10956944991592577], -0.06771292192303735], [[0.14208770949351235, 0.32634980833629473, 0.0057023557206580394, 0.34005409286447319, 0.023877414588673748, 0.13129096485728847, 0.51437288924768865, 0.23756857474314735, 0.0077691359060347506, 0.036860099004501906, -0.033258458545080499, -0.032523937334426434, 0.0010050016670497541, -0.077843564598096826, -0.10308283687025624], -0.071094499113193876], [[-0.077394610941101172, 0.30918759781497629, 0.16011474994892291, 0.15240543266999998, 0.30830256344156948, 0.61035118576443181, 0.096442516757869634, 0.14390346875654592, 0.033232887524917154, -0.16539139459578883, 0.0058115506692817398, -0.16757131562514824, 0.042536117455280131, 0.039348880899364724, 0.030092896671985392], -0.060883113904824093], [[-0.0070828350649748077, 0.40781712459741987, 0.034236102462881002, -0.01338658192344374, 0.22754362109380527, 0.38596225272624901, 0.52043963194730458, 0.24760818062480192, -0.076569855880416871, -0.11286547598304614, 0.086397646579394582, -0.015818525068393652, -0.13414010910793983, -0.055122509422700028, 0.060905993687274673], -0.071882279302774355], [[-0.02783460471464733, 0.32386775286017178, 0.40398860140344511, 0.24814403493961668, -0.0025605358775599857, 0.70320912952820713, -0.073975073500936711, -0.12049477833890768, 0.012364886108188731, -0.066584009851180681, 0.093800535507852445, -0.041253359718894045, -0.11733492720617113, 0.071131323927756285, 0.030340345313332573], -0.023072640349124429], [[0.054224530017478262, 0.35199437661707283, -0.085541348942181281, -0.12135721626161355, 0.49833798692878006, 0.55726615408398217, 0.28427948322998875, 0.41481215479310918, -0.0099423104735685275, 0.10074679708786802, -0.20725673275882073, 0.023772809400098571, 0.012800216072735904, -0.079181836826978824, -0.15337319851858133], -0.082871891843055989], [[-0.0042761698649839332, 0.43608710290235025, 0.18529779382451289, 0.084334868886577957, 0.30706194694592254, 0.68899336942951139, 0.10435512625136395, 0.0044062011365280651, -0.10907245389055792, -0.11987942604225243, 0.06016292047339275, -0.075457948154415255, -0.087176235287321269, 0.17738944525485981, 0.027081393019452407], -0.064576609537433494], [[-0.12606536049257239, 0.62386988399598142, -0.023261447248895589, -0.17343459301746444, -0.053332308429939107, 0.4354331749164882, 0.16103332381176294, 0.5337953514815662, -0.0063621913100341126, 0.00037921639833465859, 0.13656400106184058, 0.12478707533979005, -0.1434102084799333, 0.16504635366330389, -0.12793340951786364], -0.038997739327800818], [[0.023625106997053526, 0.21465053115274352, -0.10239683891827478, 0.55091010709219912, -0.19591912739509182, 0.58174125012288447, 0.14048587981746832, 0.35939100493920428, 0.0036477217904137549, 0.062895547806887275, -0.025414936857275305, -0.16024746218096914, -0.05692126463352748, 0.095222291902597428, -0.0013976777143698271], -0.021205154288806594], [[-0.1415733161297775, 0.19282706324156584, -0.15317008495995021, 0.41760498989039097, 0.22934896395910748, 0.099855835777710789, 0.86143296168583372, 0.079825134266028178, 0.076888549405725759, 0.029142433900836595, -0.15039955470091237, 0.031885383748681551, -0.036849541966962523, -0.0090335151587409568, 0.093197750343661123], -0.075400121761051153], [[-0.019456985872773423, 0.34492550602029814, -0.085580857667536245, 1.1710411536252971, -0.29660686639492939, -0.39241176722993321, 0.054434520898320238, 0.80443734093500463, 0.051728582172760912, 0.041088729944448121, -0.057246015153181506, -0.098767569674687195, 0.036465417287937285, 0.11034756929537173, -0.11395338218577326], -0.085219239227286858], [[-0.14231106663132562, 0.44184632243492411, 0.47752304900707943, 0.16236003198198581, -0.18651407493988231, 0.33350107876263191, -0.11721668576444982, 0.39118530497284021, 0.12129555555447177, 0.086273883873666884, 0.17795776035727312, -0.3715642737177734, -0.045789414899917474, 0.07615615287210209, 0.14238442427090875], -0.056763246101817377], [[0.062778659062940265, 0.34929379382974429, 0.53605437288068436, 0.13244501373484097, 0.13203014126186671, 0.49908883811679128, -0.56382750092432421, 0.37578910719263359, -0.079515245962285139, 0.02533911740854948, 0.030263265775452797, 0.16348607963670136, -0.13187494888973528, 0.060474841975876825, -0.10904394959216032], -0.031395599343956082]]"
658 |    ]
659 |   },
660 |   {
661 |    "cell_type": "code",
662 |    "execution_count": 41,
663 |    "metadata": {
664 |     "collapsed": true
665 |    },
666 |    "outputs": [],
667 |    "source": [
668 |     "for x in range(1, 13):\n",
669 |     "    for i, y in enumerate([[0, 7],[7,15],[15,22],[22, 35]]):\n",
670 |     "        ind = date[(date.dt.year == 2015)&(date.dt.month == x)&(date.dt.day > y[0])&(date.dt.day <= y[1])].index\n",
671 |     "        final_pred[ind] = [0 if x < 0.5 else 1 for x in (np.array(pred.loc[ind, :]) * coef_list2015[(x-1)*4 + i][0]).sum(axis = 1) + coef_list2015[(x-1)*4 + i][1]]"
672 |    ]
673 |   },
674 |   {
675 |    "cell_type": "code",
676 |    "execution_count": 42,
677 |    "metadata": {
678 |     "collapsed": true
679 |    },
680 |    "outputs": [],
681 |    "source": [
682 |     "coef_list2016 = [[[0.083160931053686798, 0.50123045434246438, -0.18436015016358764, 0.68130701919442604, 0.43200682038007177, -0.13604295964040092, 0.86730048043856223, -0.054315340181438576, -0.11990878642468367, -0.050063800461746855, -0.16781948020678739, -0.017739169634362117, -0.053654669444286818, -0.065720715845076993, 0.060483040613759156], -0.1249840501022485], [[0.063187436646098161, 0.30373368200674061, 1.0352977426357544, 0.3126191497028955, -0.077914433527196542, 0.87640946035320866, -0.63982168138475803, -0.14496076399427266, -0.060049558383010515, -0.20683417920784564, -0.0047388938169548589, -0.16135590218796603, -0.15628840179855302, 0.16740171378123109, 0.14767828336246941], -0.027583666045878918], [[-0.030980294648838998, 0.43922046683261201, -0.058868357706043262, 0.18424321650682049, 0.86106773791521762, -0.32550525072686465, 0.11896158762571502, 0.66969483521636475, -0.095674781196959491, 0.032006394265927218, -0.14233265851020718, -0.10707930219848585, 0.031629126969294985, -0.019324452143396537, -0.036687987683005269], -0.079626080992849846], [[-0.13954682033127497, 0.33378976286213791, 0.039806033607806893, 0.66123324614724044, -0.22516298826827527, 0.24567958234692971, 0.10308423516841406, 0.37442856856287487, 0.17653572475170914, 0.14059440547621241, -0.27656781828210741, 0.030288533414691385, 0.058761813190410017, -0.054413963372691443, -0.0020022353426871753], -0.036679859962110484], [[0.037129369093817581, 0.5964270728739044, -0.0081063460338467175, -0.22902343615441942, 0.025090731265902444, 0.99604128550763149, 0.16865515878609616, 0.13849912555419008, -0.16600225119406706, 0.033048310728461927, 0.13042055350669213, -0.0032510487880389738, -0.30229216986185653, -0.022642924492079664, -0.01660969447249272], -0.031204436876102359], [[0.0082779509397315557, 0.27521533089258582, 0.271282282448851, 0.24317783066043919, 0.062882102756682101, 0.4521431499010618, -0.19229296943676083, 0.5437477146870735, 0.058585547353780951, -0.0084156629510135639, -0.055255998117297006, -0.034297359496214271, -0.13039999678299585, 0.037031877430364679, -0.02121886377040233], -0.04764990752098508], [[-0.052013705254242218, 0.51331125256924759, -0.15988689871387099, 0.19022155200745466, -0.092351222909538594, 0.46369523140293517, 0.34804940954857166, 0.43290636769210378, -0.059085881660886547, 0.087312789233614685, -0.10203119330789079, 0.067741651875626863, 0.006542044979097883, -0.079964872107691432, -0.025968044414554387], -0.062214553397731853], [[0.0035154323757134588, 0.38454543946760283, -0.1697376761093464, 0.4045272761907584, 0.73035008984184091, 0.21273459085457591, 0.25397547633361278, -0.03324424351378491, -0.099615443747611124, 0.25953490739354618, -0.12172402518309783, -0.15005876063023216, -0.16200660359045868, -0.040408726661696287, 0.1357592809482786], -0.041462351736194336], [[-0.055766997867973386, 0.4767145176607413, -0.52928267087002179, -0.28042202462113103, 0.54888396859182875, 0.50465770289027989, 0.41277133713636116, 0.53375278268615411, -0.033405733150843719, 0.11803405634009667, 0.063341052706401113, -0.11591676800731615, 0.015009234471315414, -0.023084194758126175, -0.06309272421879808], -0.053198787799453862], [[-0.11921808654820741, 0.55304896032398199, 0.33545462577906782, 0.24652645735943704, 0.044552579593745895, 0.21973740219305932, -0.076825940759151876, 0.40570419920086587, 0.059279502970141673, -0.017626646813853342, -0.10219100754909555, 0.030149974189498718, 0.020593515861067502, 0.06185404788751081, -0.02542982596750254], -0.035417073816853384], [[-0.049542828528010417, 0.49810052204019534, 0.29169250395142649, 0.14836672215851654, -0.46431739526436122, 0.67813958273219033, 0.18972221177940243, 0.3559326649863791, -0.041996124544671193, -0.11896253867737042, 0.1487842103646303, -0.075842527222067302, -0.18728400629354813, 0.075909596907412891, -0.0019456230359987114], -0.018278768600546447], [[-0.10262823373263355, 0.65191373057167312, -0.28743238537415644, 0.3620766146781168, 0.38958765020601949, 0.017297579199162694, 0.20795136345319695, 0.19773175650810898, 0.076873259695250731, 0.16377980648044524, -0.083889894106966401, 0.0058007973684544578, -0.04766633765452899, 0.026744298136452127, -0.030190417104037574], -0.012864659958101266], [[-0.044112450776202093, 0.3664592386316341, 0.59732017641677249, 0.22072828001157588, 0.54930686941293161, -0.21040086570602037, -0.320286820585927, 0.16497839913200574, 0.06654955973086793, 0.0042008847447502218, -0.082302011852698892, -0.072833137017210337, 0.11822863790833205, 0.040761949445283441, -0.0063984967606624255], 0.041235476248580705], [[0.078435481397622769, 0.37244994152640953, 0.58707362776076422, 0.37490143249356256, -0.25283669292671007, 0.60866130292090204, -0.1970030047798168, -0.021329057359658829, 0.018989108798100673, -0.11011685335915292, -0.060082501704998348, -0.037443957435357278, -0.15762130581769557, 0.11426169779263162, 0.07190937929381036], -0.021064875515049541], [[0.0034718108656329578, 0.57915256686658578, 0.29528623742650506, -0.050229880493790402, 0.48137810862273306, 0.25318696244880917, -0.15889571006608449, 0.16483440770352081, -0.059730792487845241, 0.093544024981333074, -0.12247159270970034, 0.075771507207590461, -0.096402151710678813, -0.022497849716211576, 0.036354367030831419], 0.005256366888516828], [[-0.023820576680877423, 0.78225853472656826, 0.011634343713431342, 0.019918416317100285, 0.557084254430666, 0.27156835264849705, 0.14575719682470084, -0.088897624975371042, -0.063732953886952048, -0.039590585757928537, -0.052638053599495027, -0.1870774941584803, -0.040966366635034501, 0.098049128640642746, 0.088011347176140398], -0.021608780483270751], [[-0.061349462068088743, 0.69183746903156507, 0.52172715479450382, 0.48187658768522434, -0.099606104627660508, -0.052401242419038541, 0.024573034101930763, 0.23738518382344564, -0.032962004746863116, -0.12605462236670967, -0.036538133880784029, -0.14126295053073362, 0.14844289442390313, 0.008468265486780327, -0.027186130473260628], -0.033833884151762861], [[-0.11431828195368751, 0.78393055757029084, -0.063003189365440804, 0.18182597873019035, -0.33994031661063867, 0.7460911636953973, 0.32971674305189375, -0.04141630449778666, 0.0062870280295277969, 0.073761911986879036, -0.073507477054227055, 0.085813879520226566, 0.10732331935558642, -0.023845436016895677, -0.11301867089636497], -0.0054055816321251227], [[-0.054753180065529136, 0.72907257976196072, 0.51853520325796476, -0.17125328017481417, 0.35257103677379797, 0.30815676910938594, -0.46331790288173313, 0.079081445127745398, -0.020775639058208617, -0.065812688743678599, -0.015409164790581326, -0.094174344553187739, 0.0008859535771960636, 0.18820843876930005, 0.10425469702505108], 0.056803781410861676], [[-0.042467685625824328, 0.82658628356417985, -0.092932697163839367, 0.022071698051833677, 0.61328725281874952, -0.3824315690142488, 0.3787196213319835, 0.18888077541879378, -0.056751968974047287, -0.11727758155060736, 0.027543187445467854, 0.045457019178591185, -0.047819813632006034, 0.020668298110510047, 0.041497997852390389], 0.024869402799663931], [[-0.064757052637594945, 0.61254843023201899, 0.20086584118803563, 0.21615529630668404, -0.24447282050211117, 0.55194291330325529, -0.009869904195014656, -0.22518268192949908, 0.033520839040034556, -0.075252534891125988, -0.09619332773586535, -0.16578170784589868, 0.12373269435445577, 0.051440396724725945, 0.13820583675161502], 0.07605182599347754], [[-0.095896018600558228, 0.69788715734204487, 0.21436145719045058, 0.83706643929778035, 0.47507517220924905, -0.2908287955029823, 0.18132404698344207, -0.6709551186955669, 0.029008135445491634, -0.020478235431542147, -0.14937658392775088, -0.048159972223858927, 0.047785031619034357, 0.015409964290556966, 0.19223101919119823], 0.046328812007278453], [[-0.14139965440116536, 1.1099453120571932, 0.90215970671662171, -0.22271025596956984, -0.018733690889584986, 0.27713791517741265, -0.015171447692745841, -0.44736962141944825, -0.13945812177327943, -0.16770232269774765, -0.18555121833590268, 0.3340515092066168, -0.019131701690440761, 0.04779454714153375, 0.17032034003113261], -0.0011406461477660446], [[-0.15814004966474193, 0.76510545034432464, 0.55809107865746277, 0.055086482390308111, 1.6874616017811952, -0.98119567509265881, -0.22255726211598903, -0.42610664307362767, 0.048872138661632203, 0.086992973690962261, 0.22112821840946606, -0.31818104317672624, 0.032161073706369003, -0.016595663155686613, 0.17860319823197202], 0.061364613680657654], [[-0.085409697642654284, 0.87040181141592043, 1.1053281722579182, 0.13828450653163327, -1.2316974212995426, -0.0061250907444628522, 0.11930851679229618, -0.29960478093678317, 0.0077587108370841262, -0.29725813009402691, 0.25100538067247935, 0.15072089763164931, 0.14909301157086774, -0.018689890304061096, 0.39481458468156305], 0.05877411487918438], [[-0.29984373974635936, 0.71934734076942719, 1.2392092995547836, 0.41184431977829539, 0.13052097731042822, 0.012981282398934396, -0.77084668605305651, -0.23923303071710911, 0.24613616288030205, 0.0071792330569403484, -0.16906316486569095, 0.18095148458834975, -0.12071531335216135, 0.18887601960560274, 0.051885666612439096], 0.069759828760088682], [[-0.15449120581946524, 0.64951964246975047, 0.16969084588187366, 0.70820451021752118, 0.68696831944577208, -0.47619883509347344, -0.29090557003820378, 0.07620809485410926, 0.19548669409941749, -0.059913958975032836, -0.11821757647728848, -0.19305873749916524, -0.13176045531817471, 0.15929252439240482, 0.056679345616207244], 0.11912041223397452], [[0.006180404871175979, 0.80464282037114809, -0.33033917021567527, -0.14848881979217166, -1.235242142750355, 1.3095685396707577, 0.6957242895856508, -0.02989187842375074, -0.10674055798263112, -0.037619586416044015, 0.011956177131142276, 0.050958002969719307, -0.0086747554764927681, -0.11553937525330471, 0.32522194317886177], 0.029034802249806424], [[-0.1657612836095966, 0.94073016547909261, 0.51058101741405593, 0.030484186580697514, -0.47335063751875994, 0.57147407316464671, 0.37793039521913463, -0.47295122425549962, 0.010816584966879941, 0.097829543387180029, -0.21309623583345366, 0.074870851146988271, -0.058132337538235268, -0.24332889978101008, 0.42768616534437087], 0.038796184224211938], [[0.025647555447089326, 0.72666569404040504, 1.6463192615309852, 0.15034166562043572, -0.067317069313437283, 0.46532622741236068, -0.86103506347136061, -0.75741516656446195, -0.035803335172834738, -0.22717670542117901, 0.071836839974666111, -0.084342850750207754, -0.054927106726417874, 0.20650080761349554, 0.20007689716806881], 0.093074451237209366], [[-0.001959021207822406, 0.83239308122272682, 0.37957376389069414, 0.18619450498571441, 0.56580540139778224, -0.33440574530327416, -0.22863859727686675, -0.29546311597557634, -0.075875965058126549, -0.13685162690406641, 0.08330235469297663, -0.15623644870280184, 0.03650658244675975, 0.16184825625987359, 0.33483923528636689], 0.051801286141569425], [[-0.15561910248092847, 0.9655331160147993, 0.19024631202086678, -0.16389065866071933, 0.49217663484548924, 0.59772856889259007, -0.66386712615445909, 0.085032757491798894, -0.062744636941233445, 0.067025521253281539, 0.069696799220337891, 0.047764491738702577, -0.20580601665107331, 0.090321642139469582, 0.020765245239603963], 0.017690729166233488], [[-0.096453655024564017, 0.58878378008258592, 0.91393894691688504, 0.34938317289662602, 0.039152078643877972, 0.039398004355004235, -1.0468286783459875, 0.46362537171815743, 0.02312687381619816, 0.086659296247407092, -0.093363837264509641, 0.17609181669056617, -0.064349293969083843, 0.19115432937995736, -0.060071639048318637], 0.092949401936266773], [[0.018536111824581453, 0.70288133456920732, 0.1572960624779724, 0.94921016378166667, 0.65848222506481491, -0.53704710893855245, -0.29893180864523983, -0.40740458137528102, -0.14933227609016425, -0.011001519678496552, -0.031467342092052764, 0.016334637904423882, -0.011023627493020494, 0.046868387035175152, 0.077048979071970614], 0.096965255896032387], [[-0.25259118431472216, 0.8233415869510412, 0.4199431033875719, 1.1768474473476098, -1.2356704052210177, 0.14276615677208457, -0.071037668353966799, 0.41782871470689475, 0.15248698131333682, -0.24720613484926324, -0.42648857055296946, 0.36562680947974546, -0.085172934683006352, 0.29547776416200666, -0.06802360170871613], 0.052760631572301764], [[-0.072576660742084367, 0.81651417087679246, -0.3211034844306131, 0.30928733494137484, -0.09012779788680797, -0.50078664788211746, 0.9512547206547276, -0.0032189580080161928, -0.10700584083014469, 0.032073855929995732, -0.010406086197703862, 0.069831232786959546, 0.34587216310976254, 0.093766240427648195, -0.17325996366570007], 0.096218432324126135], [[-0.28432041279606995, 0.63988723059029973, 1.0836922161217861, 0.98661564214995512, -0.12433327393925117, -0.46263556657516319, -0.065707352399842003, -0.31079427858663244, 0.23194998789726029, -0.33864363672233078, 0.31520027787319682, -0.21545976552161242, -0.040905762230344422, -0.00748873833591579, -0.21221297709348941], 0.088021580093375407], [[-0.16089707161042341, 0.8965822415267819, 0.4103863205396423, 0.59468823344320354, 0.04254317994671461, 0.28553571860940874, 0.21152992243231228, -0.68788481353454334, 0.064970151509591206, -0.22785316880554574, -0.051201941851013566, -0.32032223026961659, 0.15010684940229679, -0.063439537422950276, 0.22006490290322889], 0.025322682321992973], [[0.11182075799149772, 0.65411818318708925, 1.3129187937616515, 0.85288827805273559, -1.6246409006550993, 0.74593025778158539, -0.38871261440224841, -0.087892290411351701, -0.045656229767157573, 0.045928349904733434, -0.16611837605758883, 0.2315789223043605, -0.34715191537467827, 0.054190747068566192, 0.18518850753887883], 0.05267236980336254], [[-0.13344781089068675, 0.63515225114238016, 0.77325119481004267, -0.23308661481801735, 0.2008554964948821, -0.095572234935413447, -0.76559358529034227, 1.1408187820540743, 0.011940503111879601, -0.012057379165902993, 0.066549237185619392, 0.13054067356138055, -0.15333355924898628, 0.024779068153564121, -0.28546668205502557], 0.09847076037014274], [[-0.21037641281310179, 0.9288051581200526, -0.66322254241331191, 0.24471045837055735, -0.19194499568561604, 1.1597561102737344, -0.003107297478991633, 0.26577554130079833, -0.012955549233710062, 0.57645607258796061, -0.27205638140095012, 0.098248048134944366, -0.41718998126023216, -0.080263742634118618, -0.20794407164281067], 0.033673328415926296], [[-0.14994541548372922, 0.98141226935245029, -0.27950378846317891, -0.32498007101383392, -0.35353773357469331, 0.5436907607315189, 0.67536406628105428, 0.11623154500823252, -0.00067651238963972737, 0.13859895364541563, 0.34966577728333448, 0.13896685765843331, -0.39184623610185354, -0.17269902859927255, 0.00040508572409858612], 0.028379878546068105], [[-0.14590691431003053, 1.0464832814251701, 1.0481929470578977, -0.48657408078753095, 0.62350112672748947, 0.93308813773062682, -0.34721584768203095, -1.4288201702593981, -0.095042162810413611, -0.069649525643022514, 0.18041281864798286, 0.0086063826300428747, -0.21864748321611804, 0.12624215537856975, 0.43201176983660594], -0.020934269528173011], [[0.07207680342415454, 0.77697734766041227, 0.3067820573261868, 0.022557231517119136, 0.82240771305419913, -0.19652115180943242, 0.07599237305462464, -0.36763840644908552, -0.013013836366206788, -0.26191350689882448, -0.45585536331995213, -0.11287556111335956, 0.38283161849607267, 0.025787016595057471, 0.036475336578099038], -0.005342340330847406], [[-0.017361177160982864, 1.042984596048806, -0.22705026056619793, -0.10520380866992814, 0.37986986873844969, 0.0035414727513486122, 0.93228250277674085, -0.56238540287508332, -0.23254838672858535, 0.11408832920138845, 0.41925891479783384, -0.055394342682305586, -0.41360226254919463, 0.082108661979596098, 0.21278205018642582], -0.0055409687694392695], [[-0.023808340221479832, 0.84635535835645104, 0.27922958125037833, 0.936967869657783, -0.40313567111024923, 0.84330653662802091, -0.42275541949917705, -0.24602082764493866, -0.076037549032303448, 0.18946227321254394, -0.63949378370955645, 0.0076838455058896304, 0.2848136733716235, -0.32201195158638829, 0.05865693087106294], -0.0040226517553044738], [[-0.084129629439401846, 0.38795226024982232, 1.1803963363130143, 1.9883352651032504, -1.9821675141870865, 0.22611224235732835, -0.69729948680873344, -0.075019732689127927, 0.23349343981311343, 0.26662333349037559, -0.48588083112512215, 0.41253348601745166, 0.22478142391797751, -0.24390460187141016, -0.18026241639775309], 0.081950757990531559], [[-0.047550062368741451, 0.88350187140437808, 0.56199307124951803, -0.096428489165694131, 0.90391694368564324, -1.4389011253249979, 0.39303569862021248, 0.69111563392488917, -0.025263595517992787, -0.049169933732915005, 0.16752516730843969, -0.30151511955072863, 0.23789111228216764, -0.48602221325105943, 0.23101391574902952], 0.00063119675804385045]]"
683 |    ]
684 |   },
685 |   {
686 |    "cell_type": "code",
687 |    "execution_count": 43,
688 |    "metadata": {
689 |     "collapsed": true
690 |    },
691 |    "outputs": [],
692 |    "source": [
693 |     "for x in range(1, 13):\n",
694 |     "    for i, y in enumerate([[0, 7],[7,15],[15,22],[22, 35]]):\n",
695 |     "        ind = date[(date.dt.year == 2016)&(date.dt.month == x)&(date.dt.day > y[0])&(date.dt.day <= y[1])].index\n",
696 |     "        final_pred[ind] = [0 if x < 0.5 else 1 for x in (np.array(pred.loc[ind, :]) * coef_list2016[(x-1)*4 + i][0]).sum(axis = 1) + coef_list2016[(x-1)*4 + i][1]]"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": 44,
702 |    "metadata": {
703 |     "collapsed": true
704 |    },
705 |    "outputs": [],
706 |    "source": [
707 |     "coef_list2017 = [[[-0.092260279685306562, 0.47993713095700025, 1.3627279414391347, 0.19273462880948466, -0.43111286718320307, 0.3093180618980409, -0.63330613216205289, -0.06358364220948258, 0.19092647149070691, -0.22550467829007492, -0.0096726092674919162, 0.032662864635206401, -0.31034612227919123, 0.36000601525052756, 0.072208252512741211], 0.1033118679714411], [[0.026020136865209237, 0.71385675075166954, 0.19154883685548907, 0.34416777217129813, -0.95574217469666189, 0.14653896310740583, 0.54406576756292635, 0.50815570748139871, -0.11041845651374511, 0.15500194489430491, -0.30283088541950576, -0.11729819887503543, 0.11573965958550625, -0.037944605033069845, 0.23614815740095207], 0.032610559808463446], [[0.058485304818391169, 1.0464212109432975, 0.19961840811742557, 0.30101063937092576, 0.90901659787261513, 0.10254210837610717, 0.50387515072918676, -1.3332534333399786, -0.29452778524554846, -0.037687102120534721, -0.1671236775444731, -0.059676630015028098, -0.16696044545586608, 0.078660354501412511, 0.30556879269033232], 0.022689027579702148], [[-0.20008981539808449, 0.68803052136755405, 0.7171076414175982, -0.089068886422192503, 0.45442419258068401, 0.65089805716290039, -1.289926063724629, 0.31339860971881217, 0.16465953479286172, -0.060096924040990818, 0.26641700319021283, -0.10774731392237336, -0.26224116097107819, 0.1765668954172569, 0.013030401338264719], 0.044476192980860224]]"
708 |    ]
709 |   },
710 |   {
711 |    "cell_type": "code",
712 |    "execution_count": 45,
713 |    "metadata": {
714 |     "collapsed": true
715 |    },
716 |    "outputs": [],
717 |    "source": [
718 |     "for i, y in enumerate([[0, 7],[7,15],[15,22],[22, 35]]):\n",
719 |     "        ind = date[(date.dt.year == 2017)&(date.dt.month == 1)&(date.dt.day > y[0])&(date.dt.day <= y[1])].index\n",
720 |     "        final_pred[ind] = [0 if x < 0.5 else 1 for x in (np.array(pred.loc[ind, :]) * coef_list2017[i][0]).sum(axis = 1) + coef_list2017[i][1]]"
721 |    ]
722 |   },
723 |   {
724 |    "cell_type": "code",
725 |    "execution_count": 47,
726 |    "metadata": {
727 |     "collapsed": true
728 |    },
729 |    "outputs": [],
730 |    "source": [
731 |     "ans = pd.DataFrame()\n",
732 |     "ans['project_id'] = test.project_id\n",
733 |     "ans['final_status'] = final_pred.astype('int8')\n",
734 |     "#ans.to_csv('predict_final_roman.csv', index = None)"
735 |    ]
736 |   },
737 |   {
738 |    "cell_type": "code",
739 |    "execution_count": 49,
740 |    "metadata": {
741 |     "collapsed": true
742 |    },
743 |    "outputs": [],
744 |    "source": [
745 |     "offline = pd.read_csv('../../../offline_testcase.csv')"
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "code",
750 |    "execution_count": 50,
751 |    "metadata": {
752 |     "collapsed": false
753 |    },
754 |    "outputs": [],
755 |    "source": [
756 |     "from sklearn.metrics import accuracy_score"
757 |    ]
758 |   },
759 |   {
760 |    "cell_type": "code",
761 |    "execution_count": 51,
762 |    "metadata": {
763 |     "collapsed": false
764 |    },
765 |    "outputs": [
766 |     {
767 |      "data": {
768 |       "text/plain": [
769 |        "0.76434255101236903"
770 |       ]
771 |      },
772 |      "execution_count": 51,
773 |      "metadata": {},
774 |      "output_type": "execute_result"
775 |     }
776 |    ],
777 |    "source": [
778 |     "accuracy_score(offline.final_status, ans.final_status)"
779 |    ]
780 |   }
781 |  ],
782 |  "metadata": {
783 |   "anaconda-cloud": {},
784 |   "kernelspec": {
785 |    "display_name": "Python 3",
786 |    "language": "python",
787 |    "name": "python3"
788 |   },
789 |   "language_info": {
790 |    "codemirror_mode": {
791 |     "name": "ipython",
792 |     "version": 3
793 |    },
794 |    "file_extension": ".py",
795 |    "mimetype": "text/x-python",
796 |    "name": "python",
797 |    "nbconvert_exporter": "python",
798 |    "pygments_lexer": "ipython3",
799 |    "version": "3.5.2"
800 |   }
801 |  },
802 |  "nbformat": 4,
803 |  "nbformat_minor": 2
804 | }
805 | 


--------------------------------------------------------------------------------
/Rank_2_Sergazy/Instructions:
--------------------------------------------------------------------------------
1 | To reproduce the score, run the files in following sequence:
2 | 1. Run best.py
3 | 2. Run lstm.py
4 | 3. Run layer2.py
5 | 
6 | Note: Make sure you load functions from rest of the scripts.
7 | 
8 | 


--------------------------------------------------------------------------------
/Rank_2_Sergazy/best.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import enchant
  4 | from sklearn.metrics import mean_squared_error
  5 | from sklearn.feature_extraction.text import CountVectorizer
  6 | from sklearn.feature_extraction.text import TfidfVectorizer
  7 | from sklearn.linear_model import LogisticRegression
  8 | from word2vecUtils import utils
  9 | from xgboost import XGBClassifier
 10 | from sklearn.preprocessing import LabelEncoder
 11 | import time
 12 | import lightgbm as lgb
 13 | from sklearn.ensemble import RandomForestClassifier
 14 | # from keras.preprocessing.text import Tokenizer
 15 | # from keras.preprocessing.sequence import pad_sequences
 16 | import re
 17 | import readability
 18 | from nltk.sentiment.vader import SentimentIntensityAnalyzer
 19 | 
 20 | train = pd.read_csv('train.csv')
 21 | test = pd.read_csv('test.csv')
 22 | 
 23 | # trainp = np.loadtxt('trainp.csv', delimiter=',')
 24 | # testp = np.loadtxt('testp.csv', delimiter=',')
 25 | # trainp = pd.DataFrame(trainp)
 26 | # testp = pd.DataFrame(testp)
 27 | # train = pd.concat([train,trainp],axis = 1)
 28 | # test = pd.concat([test,testp],axis = 1)
 29 | 
 30 | train['created_atX'] = train['created_at']/max(train['created_at'])
 31 | test['created_atX'] = test['created_at']/max(test['created_at'])
 32 | train['deadlineX'] = train['deadline']/max(train['deadline'])
 33 | test['deadlineX'] = test['deadline']/max(test['deadline'])
 34 | 
 35 | y_train = train.final_status
 36 | X_train = train.drop(['backers_count', 'final_status'], 1)
 37 | X_test = test
 38 | 
 39 | X = pd.concat([X_train, X_test])
 40 | 
 41 | X = X.set_index(np.arange(len(X)))
 42 | 
 43 | 
 44 | def computeRead(text):
 45 |     rd = readability.Readability(text)
 46 |     score = rd.FleschKincaidGradeLevel()
 47 |     return int(score)
 48 | def ARIscore(text):
 49 |     rd = readability.Readability(text)
 50 |     score = rd.ARI()
 51 |     return float(score)
 52 | def LIXscore(text):
 53 |     rd = readability.Readability(text)
 54 |     score = rd.LIX()
 55 |     return float(score)
 56 | 
 57 | 
 58 | X['readscore'] = X['desc'].apply(lambda d: computeRead(str(d)))
 59 | X['ariscore'] = X['desc'].apply(lambda d: ARIscore(str(d)))
 60 | X['lixscore'] = X['desc'].apply(lambda d: LIXscore(str(d)))
 61 | 
 62 | X['readscoreX'] = X['name'].apply(lambda d: computeRead(str(d)))
 63 | X['ariscoreX'] = X['name'].apply(lambda d: ARIscore(str(d)))
 64 | X['lixscoreX'] = X['name'].apply(lambda d: LIXscore(str(d)))
 65 | 
 66 | X['coeff'] = np.zeros(len(X))
 67 | X.coeff.ix[X.currency == 'USD'] = 1
 68 | X.coeff.ix[X.currency == 'GBP'] = 0.78
 69 | X.coeff.ix[X.currency == 'EUR'] = 0.89
 70 | X.coeff.ix[X.currency == 'CAD'] = 1.32
 71 | X.coeff.ix[X.currency == 'AUD'] = 1.31
 72 | X.coeff.ix[X.currency == 'SEK'] = 8.71
 73 | X.coeff.ix[X.currency == 'NZD'] = 1.38
 74 | X.coeff.ix[X.currency == 'DKK'] = 6.63
 75 | X.coeff.ix[X.currency == 'NOK'] = 8.42
 76 | X.coeff.ix[X.currency == 'CHF'] = 0.97
 77 | X.coeff.ix[X.currency == 'MXN'] = 17.95
 78 | X.coeff.ix[X.currency == 'SGD'] = 1.38
 79 | X.coeff.ix[X.currency == 'HKD'] = 7.8
 80 | 
 81 | X['dollars'] = X['goal'] / X['coeff']
 82 | 
 83 | X = pd.get_dummies(X, columns=['country'])
 84 | 
 85 | 
 86 | 
 87 | le = LabelEncoder()
 88 | le.fit(X.disable_communication)
 89 | X.disable_communication = le.transform(X.disable_communication)
 90 | 
 91 | le = LabelEncoder()
 92 | le.fit(X.currency)
 93 | X.currency = le.transform(X.currency)
 94 | 
 95 | 
 96 | def year(date):
 97 |     return int(time.strftime("%Y", time.localtime(date)))
 98 | 
 99 | 
100 | def month(date):
101 |     return int(time.strftime("%m", time.localtime(date)))
102 | 
103 | 
104 | X['created_month'] = np.zeros(len(X))
105 | X['deadline_month'] = np.zeros(len(X))
106 | X['launched_month'] = np.zeros(len(X))
107 | X['state_changed_month'] = np.zeros(len(X))
108 | 
109 | X['created_month'] = X['created_at'].apply(month)
110 | X['deadline_month'] = X['deadline'].apply(month)
111 | X['launched_month'] = X['launched_at'].apply(month)
112 | X['state_changed_month'] = X['state_changed_at'].apply(month)
113 | 
114 | d = enchant.Dict("en_US")
115 | X['valideng'] = X['desc'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', str(x))))
116 | X['valideng'] = X['valideng'].apply(lambda x: sum(1 for c in str(x).split(' ') if len(c) < 4 or d.check(c)))
117 | analyzer = SentimentIntensityAnalyzer()
118 | def compoundScore(text):
119 |     res = analyzer.polarity_scores(text)
120 |     return float(res['compound'])
121 | def negSent(text):
122 |     res = analyzer.polarity_scores(text)
123 |     return float(res['neg'])
124 | def posSent(text):
125 |     res = analyzer.polarity_scores(text)
126 |     return float(res['pos'])
127 | def neuSent(text):
128 |     res = analyzer.polarity_scores(text)
129 |     return float(res['neu'])
130 | X['compoundScore'] = X['desc'].apply(lambda d: compoundScore(str(d)))
131 | X['negSent'] = X['desc'].apply(lambda d: negSent(str(d)))
132 | X['posSent'] = X['desc'].apply(lambda d: posSent(str(d)))
133 | X['neuSent'] = X['desc'].apply(lambda d: neuSent(str(d)))
134 | 
135 | X['compoundScoreX'] = X['name'].apply(lambda d: compoundScore(str(d)))
136 | X['negSentX'] = X['name'].apply(lambda d: negSent(str(d)))
137 | X['posSentX'] = X['name'].apply(lambda d: posSent(str(d)))
138 | X['neuSentX'] = X['name'].apply(lambda d: neuSent(str(d)))
139 | 
140 | cols_to_use = ['name', 'desc']
141 | len_feats = ['name_len', 'desc_len']
142 | count_feats = ['name_count', 'desc_count']
143 | 
144 | for i in np.arange(2):
145 |     X[len_feats[i]] = X[cols_to_use[i]].apply(str).apply(len)
146 |     X[count_feats[i]] = X[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))
147 | 
148 | X['keywords_len'] = X['keywords'].apply(str).apply(len)
149 | X['keywords_count'] = X['keywords'].apply(str).apply(lambda x: len(x.split('-')))
150 | 
151 | X['dots'] = X['desc'].apply(str).apply(lambda x: x.count('.'))
152 | X['comma'] = X['desc'].apply(str).apply(lambda x: x.count(','))
153 | X['kav'] = X['desc'].apply(str).apply(lambda x: x.count('\"'))
154 | X['vopros'] = X['desc'].apply(str).apply(lambda x: x.count('?'))
155 | X['voskl'] = X['desc'].apply(str).apply(lambda x: x.count('!'))
156 | X['smiles'] = X['desc'].apply(str).apply(lambda x: x.count(":)"))
157 | X['Iocc'] = X['desc'].apply(str).apply(lambda x: x.count('I') + x.count('i'))
158 | X['kkstid'] = X['project_id'].apply(str).apply(lambda x: int(x.replace('kkst', '')))
159 | 
160 | X['digitsenc'] = X['desc'].apply(str).apply(
161 |     lambda x: x.count('0') + x.count('1') + x.count('2') + x.count('3') + x.count('4') + x.count('5') + x.count(
162 |         '6') + x.count('7') + x.count('8') + x.count('9'))
163 | 
164 | X['kkstidlen'] = X['project_id'].apply(str).apply(len)
165 | X['potentiality'] = (X['deadline'] - X['created_at']) * X['dollars']
166 | X['hardness'] = X['dollars'] / (X['deadline'] - X['created_at'])
167 | X['freshness'] = X['deadline'] / X['state_changed_at']
168 | X['editingTime'] = X['created_at'] / X['launched_at']
169 | X['diversity'] = X['keywords_len'] / X['name_len']
170 | X['diversity2'] = X['desc_count'] / X['keywords_count']
171 | X['upper'] = X['desc'].apply(str).apply(lambda x: sum(1 for c in x if c.isupper()))
172 | 
173 | X['editingDuration'] = np.log(X['launched_at'] - X['created_at'])
174 | X['loggoal'] = np.log(X['dollars'])
175 | X['durationX'] = np.log(X['deadline'] - X['launched_at'])
176 | 
177 | #from datetime import datetime
178 | #X['satornot'] = np.zeros(len(X))
179 | #X['dow'] = X['deadline'].apply(lambda x: datetime.fromtimestamp(x/1000).strftime("%A"))
180 | #X.satornot.ix[X.dow == 'Saturday'] = 1
181 | #X.satornot.ix[X.dow != 'Saturday'] = 0
182 | #X = X.drop(['dow'], 1)
183 | 
184 | #X['durationToChange'] = X['state_changed_at'] - X['deadline']
185 | import datetime
186 | daydict = {}
187 | for index, row in X.iterrows():
188 |     if datetime.datetime.fromtimestamp(int(row['deadline'])).strftime('%Y-%m-%d') in daydict:
189 |         daydict[datetime.datetime.fromtimestamp(int(row['deadline'])).strftime('%Y-%m-%d')] += 1
190 |     else:
191 |         daydict[datetime.datetime.fromtimestamp(int(row['deadline'])).strftime('%Y-%m-%d')] = 0
192 | 
193 | X['zagr'] = X['deadline'].apply(lambda x: daydict[datetime.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d')])
194 | # time.strftime("%Y", time.localtime(X.deadline))
195 | # time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(epoch))
196 | 
197 | 
198 | # clean_desc= []
199 | # for index,row in X.iterrows():
200 | #     clean_desc.append(" ".join(utils.review_to_wordlist(str(row['desc']) + "" + str(row['name']) + " " + str(row['keywords']), False)))
201 | #
202 | # vectorizer2  =  TfidfVectorizer(min_df=3,  max_features=300,
203 | #         strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
204 | #         ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
205 | #         stop_words = None )
206 | #
207 | # data_features = vectorizer2.fit_transform(clean_desc)
208 | # np.asarray(data_features)
209 | # data_features = data_features.astype(np.float64)
210 | # features_df = pd.DataFrame(data_features.todense(), columns=vectorizer2.get_feature_names())
211 | # X = pd.concat([X, features_df], axis=1)
212 | 
213 | clean_desc = []
214 | for index, row in X.iterrows():
215 |     clean_desc.append(" ".join(
216 |         utils.review_to_wordlist(str(row['desc']) + "" + str(row['name']) + " " + str(row['keywords']), False)))
217 | 
218 | vectorizer = CountVectorizer(analyzer="word",
219 |                              tokenizer=None,
220 |                              preprocessor = None,
221 |                              stop_words = None,
222 |                              max_features = 3300)
223 | 
224 | data_features = vectorizer.fit_transform(clean_desc)
225 | np.asarray(data_features)
226 | data_features = data_features.astype(np.float32)
227 | features_df = pd.DataFrame(data_features.todense(), columns=vectorizer.get_feature_names())
228 | X = pd.concat([X, features_df], axis=1)
229 | 
230 | # from sklearn.cluster import KMeans
231 | # kmeans = KMeans(n_clusters=15, random_state=0).fit(features_df)
232 | # labels = kmeans.labels_
233 | # kmeans = []
234 | # X['categoryX'] = labels
235 | #
236 | # X = pd.get_dummies(X, columns=['categoryX'])
237 | 
238 | 
239 | #
240 | # def tokenizerKeras(data):
241 | #     data = data[['desc']]
242 | #
243 | #     data['desc'] = data['desc'].apply(lambda x: str(x).lower())
244 | #     data['desc'] = data['desc'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
245 | #
246 | #     for idx, row in data.iterrows():
247 | #         row[0] = row[0].replace('rt', ' ')
248 | #
249 | #     max_features = 1500
250 | #     tokenizer = Tokenizer(nb_words=max_features, split=' ')
251 | #     tokenizer.fit_on_texts(data['desc'].values)
252 | #     X = tokenizer.texts_to_sequences(data['desc'].values)
253 | #     X = pad_sequences(X)
254 | #     return X
255 | # features_df = pd.DataFrame(tokenizerKeras(X))
256 | # X = pd.concat([X, features_df], axis=1)
257 | 
258 | X = X.drop(['project_id', 'name', 'desc', 'keywords'], 1)
259 | # colnames = list(X.columns.values)
260 | # todrop = []
261 | # for col in colnames:
262 | #     try:
263 | #         cur = col.astype(int)
264 | #         todrop.append(col)
265 | #     except:
266 | #         continue
267 | # X.drop(todrop, 1)
268 | 
269 | # cols = X.columns
270 | # for dup in X.columns:
271 | #     cols[X.columns.get_loc(dup)] = [dup + '.' + str(d_idx) if d_idx != 0 else dup for d_idx in
272 | #                                     range(X.columns.get_loc(dup).sum())]
273 | # X.columns = cols
274 | X_train = X.ix[:len(X_train) - 1]
275 | X_test = X.ix[len(X_train):]
276 | print("started training")
277 | 
278 | gbm = lgb.LGBMClassifier(n_estimators=2900, max_depth=3, subsample=0.7, colsample_bytree= 0.7)
279 | gbm = gbm.fit(X_train, y_train)
280 | Y = gbm.predict_proba(X)
281 | np.savetxt('lgb',Y,delimiter = ',', fmt = '%0.6f')
282 | 
283 | 
284 | 


--------------------------------------------------------------------------------
/Rank_2_Sergazy/layer2.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import lightgbm as lgb
 4 | X1 = pd.read_csv('lgb', delimiter = ',', header = None)
 5 | X2 = pd.read_csv('lstm', delimiter = ',', header = None)
 6 | train = pd.read_csv('train.csv')
 7 | test = pd.read_csv('test.csv')
 8 | y_train = train.final_status
 9 | X = pd.concat([X1,X2], 1)
10 | X_train = X.ix[:len(train) - 1]
11 | X_test = X.ix[len(train):]
12 | gbm  = lgb.LGBMClassifier()
13 | gbm.fit(X_train,y_train)
14 | y_pred = gbm.predict_proba(X_test)
15 | y_result = []
16 | magic = 0.64
17 | for i in range(0, 63465):
18 |     if y_pred[i][0] > magic:
19 |         y_result.append(0)
20 |     else:
21 |         y_result.append(1)
22 | for index, row in test.iterrows():
23 |     if str(row['name']).count("Canceled") + str(row['name']).count("Suspended") > 0 or row['deadline'] > row['state_changed_at'] or row['disable_communication'] == True:
24 |         y_result[index] = 0
25 | sub = pd.read_csv('samplesubmission.csv')
26 | sub.final_status = y_result
27 | sub.to_csv('sub2.csv', index=0)


--------------------------------------------------------------------------------
/Rank_2_Sergazy/lstm.py:
--------------------------------------------------------------------------------
 1 | import numpy as np  # linear algebra
 2 | import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
 3 | from sklearn.feature_extraction.text import CountVectorizer
 4 | from keras.preprocessing.text import Tokenizer
 5 | from keras.preprocessing.sequence import pad_sequences
 6 | from keras.models import Sequential
 7 | from keras.layers import Dense, Embedding, LSTM, GRU, Activation
 8 | from sklearn.model_selection import train_test_split
 9 | from keras.utils.np_utils import to_categorical
10 | import tensorflow as tf
11 | import re
12 | import nltk
13 | train = pd.read_csv('train.csv')
14 | test = pd.read_csv('test.csv')
15 | 
16 | 
17 | y_train = train.final_status
18 | train = train[['goal','desc', 'name', 'keywords']]
19 | test = test[['goal','desc', 'name', 'keywords']]
20 | # Keeping only the neccessary columns
21 | data = pd.concat([train, test])
22 | data = data.set_index(np.arange(len(data)))
23 | data['desc'] = data['desc'] + data['name'] + data['keywords']
24 | 
25 | data['desc'] = data['desc'].apply(lambda x: str(x).lower())
26 | data['desc'] = data['desc'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
27 | 
28 | 
29 | max_features = 6000
30 | tokenizer = Tokenizer(nb_words=max_features, split=' ')
31 | tokenizer.fit_on_texts(data['desc'].values)
32 | X = tokenizer.texts_to_sequences(data['desc'].values)
33 | X = pad_sequences(X)
34 | embed_dim = 256
35 | lstm_out = 512
36 | model = Sequential()
37 | model.add(Embedding(max_features, embed_dim, input_length=X.shape[1], dropout=0.2))
38 | model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2, return_sequences=True))
39 | model.add(GRU(lstm_out, activation='relu'))
40 | model.add(Dense(lstm_out, input_dim=lstm_out, activation='tanh'))
41 | model.add(Dense(lstm_out, input_dim=lstm_out, activation='relu'))
42 | model.add(Dense(lstm_out, input_dim=lstm_out, activation='sigmoid'))
43 | model.add(Dense(2, activation='softmax'))
44 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
45 | print(model.summary())
46 | 
47 | y_train = pd.get_dummies(y_train).values
48 | x_train = X[:len(train)]
49 | x_test = X[len(train):]
50 | 
51 | batch_size = 64
52 | model.fit(x_train, y_train, nb_epoch=2, batch_size=batch_size, verbose=2)
53 | Y = model.predict_proba(X)
54 | 
55 | np.savetxt('lstm', Y, delimiter=',', fmt = '%0.6f')


--------------------------------------------------------------------------------
/Rank_2_Sergazy/readability.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import math
 4 | 
 5 | from utils import get_char_count
 6 | from utils import get_words
 7 | from utils import get_sentences
 8 | from utils import count_syllables
 9 | from utils import count_complex_words
10 | 
11 | 
12 | class Readability:
13 |     analyzedVars = {}
14 | 
15 |     def __init__(self, text):
16 |         self.analyze_text(text)
17 | 
18 |     def analyze_text(self, text):
19 |         words = get_words(text)
20 |         char_count = get_char_count(words)
21 |         word_count = len(words)
22 |         sentence_count = len(get_sentences(text))
23 |         syllable_count = count_syllables(words)
24 |         complexwords_count = count_complex_words(text)
25 |         avg_words_p_sentence = word_count/sentence_count
26 |         
27 |         self.analyzedVars = {
28 |             'words': words,
29 |             'char_cnt': float(char_count),
30 |             'word_cnt': float(word_count),
31 |             'sentence_cnt': float(sentence_count),
32 |             'syllable_cnt': float(syllable_count),
33 |             'complex_word_cnt': float(complexwords_count),
34 |             'avg_words_p_sentence': float(avg_words_p_sentence)
35 |         }
36 | 
37 |     def ARI(self):
38 |         score = 0.0 
39 |         if self.analyzedVars['word_cnt'] > 0.0:
40 |             score = 4.71 * (self.analyzedVars['char_cnt'] / self.analyzedVars['word_cnt']) + 0.5 * (self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt']) - 21.43
41 |         return score
42 |         
43 |     def FleschReadingEase(self):
44 |         score = 0.0 
45 |         if self.analyzedVars['word_cnt'] > 0.0:
46 |             score = 206.835 - (1.015 * (self.analyzedVars['avg_words_p_sentence'])) - (84.6 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']))
47 |         return round(score, 4)
48 |         
49 |     def FleschKincaidGradeLevel(self):
50 |         score = 0.0 
51 |         if self.analyzedVars['word_cnt'] > 0.0:
52 |             score = 0.39 * (self.analyzedVars['avg_words_p_sentence']) + 11.8 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']) - 15.59
53 |         return round(score, 4)
54 |         
55 |     def GunningFogIndex(self):
56 |         score = 0.0 
57 |         if self.analyzedVars['word_cnt'] > 0.0:
58 |             score = 0.4 * ((self.analyzedVars['avg_words_p_sentence']) + (100 * (self.analyzedVars['complex_word_cnt']/self.analyzedVars['word_cnt'])))
59 |         return round(score, 4)
60 | 
61 |     def SMOGIndex(self):
62 |         score = 0.0 
63 |         if self.analyzedVars['word_cnt'] > 0.0:
64 |             score = (math.sqrt(self.analyzedVars['complex_word_cnt']*(30/self.analyzedVars['sentence_cnt'])) + 3)
65 |         return score
66 | 
67 |     def ColemanLiauIndex(self):
68 |         score = 0.0 
69 |         if self.analyzedVars['word_cnt'] > 0.0:
70 |             score = (5.89*(self.analyzedVars['char_cnt']/self.analyzedVars['word_cnt']))-(30*(self.analyzedVars['sentence_cnt']/self.analyzedVars['word_cnt']))-15.8
71 |         return round(score, 4)
72 | 
73 |     def LIX(self):
74 |         longwords = 0.0
75 |         score = 0.0 
76 |         if self.analyzedVars['word_cnt'] > 0.0:
77 |             for word in self.analyzedVars['words']:
78 |                 if len(word) >= 7:
79 |                     longwords += 1.0
80 |             score = self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt'] + float(100 * longwords) / self.analyzedVars['word_cnt']
81 |         return score
82 | 
83 |     def RIX(self):
84 |         longwords = 0.0
85 |         score = 0.0 
86 |         if self.analyzedVars['word_cnt'] > 0.0:
87 |             for word in self.analyzedVars['words']:
88 |                 if len(word) >= 7:
89 |                     longwords += 1.0
90 |             score = longwords / self.analyzedVars['sentence_cnt']
91 |         return score
92 |         
93 | 
94 | if __name__ == "__main__":
95 |     text = """We are close to wrapping up our 10 week Rails Course. This week we will cover a handful of topics commonly encountered in Rails projects. We then wrap up with part 2 of our Reddit on Rails exercise!  By now you should be hard at work on your personal projects. The students in the course just presented in front of the class with some live demos and a brief intro to to the problems their app were solving. Maybe set aside some time this week to show someone your progress, block off 5 minutes and describe what goal you are working towards, the current state of the project (is it almost done, just getting started, needs UI, etc.), and then show them a quick demo of the app. Explain what type of feedback you are looking for (conceptual, design, usability, etc.) and see what they have to say.  As we are wrapping up the course you need to be focused on learning as much as you can, but also making sure you have the tools to succeed after the class is over."""
96 | 
97 |     rd = Readability(text)
98 | 
99 | 


--------------------------------------------------------------------------------
/Rank_2_Sergazy/syllables_en.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Fallback syllable counter
  3 | 
  4 | This is based on the algorithm in Greg Fast's perl module
  5 | Lingua::EN::Syllable.
  6 | """
  7 | 
  8 | import string, re, os
  9 | 
 10 | specialSyllables_en = """tottered 2
 11 | chummed 1
 12 | peeped 1
 13 | moustaches 2
 14 | shamefully 3
 15 | messieurs 2
 16 | satiated 4
 17 | sailmaker 4
 18 | sheered 1
 19 | disinterred 3
 20 | propitiatory 6
 21 | bepatched 2
 22 | particularized 5
 23 | caressed 2
 24 | trespassed 2
 25 | sepulchre 3
 26 | flapped 1
 27 | hemispheres 3
 28 | pencilled 2
 29 | motioned 2
 30 | poleman 2
 31 | slandered 2
 32 | sombre 2
 33 | etc 4
 34 | sidespring 2
 35 | mimes 1
 36 | effaces 2
 37 | mr 2
 38 | mrs 2
 39 | ms 1
 40 | dr 2
 41 | st 1
 42 | sr 2
 43 | jr 2
 44 | truckle 2
 45 | foamed 1
 46 | fringed 2
 47 | clattered 2
 48 | capered 2
 49 | mangroves 2
 50 | suavely 2
 51 | reclined 2
 52 | brutes 1
 53 | effaced 2
 54 | quivered 2
 55 | h'm 1
 56 | veriest 3
 57 | sententiously 4
 58 | deafened 2
 59 | manoeuvred 3
 60 | unstained 2
 61 | gaped 1
 62 | stammered 2
 63 | shivered 2
 64 | discoloured 3
 65 | gravesend 2
 66 | 60 2
 67 | lb 1
 68 | unexpressed 3
 69 | greyish 2
 70 | unostentatious 5
 71 | """
 72 | 
 73 | fallback_cache = {}
 74 | 
 75 | fallback_subsyl = ["cial", "tia", "cius", "cious", "gui", "ion", "iou",
 76 |                    "sia$", ".ely$"]
 77 | 
 78 | fallback_addsyl = ["ia", "riet", "dien", "iu", "io", "ii",
 79 |                    "[aeiouy]bl$", "mbl$",
 80 |                    "[aeiou]{3}",
 81 |                    "^mc", "ism$",
 82 |                    "(.)(?!\\1)([aeiouy])\\2l$",
 83 |                    "[^l]llien",
 84 |                    "^coad.", "^coag.", "^coal.", "^coax.",
 85 |                    "(.)(?!\\1)[gq]ua(.)(?!\\2)[aeiou]",
 86 |                    "dnt$"]
 87 | 
 88 | 
 89 | # Compile our regular expressions
 90 | for i in range(len(fallback_subsyl)):
 91 |     fallback_subsyl[i] = re.compile(fallback_subsyl[i])
 92 | for i in range(len(fallback_addsyl)):
 93 |     fallback_addsyl[i] = re.compile(fallback_addsyl[i])
 94 | 
 95 | def _normalize_word(word):
 96 |     return word.strip().lower()
 97 | 
 98 | # Read our syllable override file and stash that info in the cache
 99 | for line in specialSyllables_en.splitlines():
100 |     line = line.strip()
101 |     if line:
102 |         toks = line.split()
103 |         assert len(toks) == 2
104 |         fallback_cache[_normalize_word(toks[0])] = int(toks[1])
105 | 
106 | def count(word):
107 |     word = _normalize_word(word)
108 |     if not word:
109 |         return 0
110 | 
111 |     # Check for a cached syllable count
112 |     count = fallback_cache.get(word, -1)
113 |     if count > 0:
114 |         return count
115 | 
116 |     # Remove final silent 'e'
117 |     if word[-1] == "e":
118 |         word = word[:-1]
119 | 
120 |     # Count vowel groups
121 |     count = 0
122 |     prev_was_vowel = 0
123 |     for c in word:
124 |         is_vowel = c in ("a", "e", "i", "o", "u", "y")
125 |         if is_vowel and not prev_was_vowel:
126 |             count += 1
127 |         prev_was_vowel = is_vowel
128 | 
129 |     # Add & subtract syllables
130 |     for r in fallback_addsyl:
131 |         if r.search(word):
132 |             count += 1
133 |     for r in fallback_subsyl:
134 |         if r.search(word):
135 |             count -= 1
136 | 
137 |     # Cache the syllable count
138 |     fallback_cache[word] = count
139 | 
140 |     return count
141 | 
142 | 


--------------------------------------------------------------------------------
/Rank_2_Sergazy/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | utility functions for breaking down a given block of text
 3 | into it's component syntactic parts.
 4 | """
 5 | 
 6 | import nltk
 7 | 
 8 | from nltk.tokenize import RegexpTokenizer
 9 | import syllables_en
10 | 
11 | TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+')
12 | SPECIAL_CHARS = ['.', ',', '!', '?']
13 | 
14 | def get_char_count(words):
15 |     characters = 0
16 |     for word in words:
17 |         characters += len(word)
18 |     return characters
19 |     
20 | 
21 | def get_words(text=''):
22 |     words = []
23 |     words = TOKENIZER.tokenize(text)
24 |     filtered_words = []
25 |     for word in words:
26 |         if word in SPECIAL_CHARS or word == " ":
27 |             pass
28 |         else:
29 |             new_word = word.replace(",","").replace(".","")
30 |             new_word = new_word.replace("!","").replace("?","")
31 |             filtered_words.append(new_word)
32 |     return filtered_words
33 | 
34 | def get_sentences(text=''):
35 |     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
36 |     sentences = tokenizer.tokenize(text)
37 |     return sentences
38 | 
39 | def count_syllables(words):
40 |     syllableCount = 0
41 |     for word in words:
42 |         syllableCount += syllables_en.count(word)
43 |     return syllableCount
44 | 
45 | #This method must be enhanced. At the moment it only
46 | #considers the number of syllables in a word.
47 | #This often results in that too many complex words are detected.
48 | def count_complex_words(text=''):
49 |     words = get_words(text)
50 |     sentences = get_sentences(text)
51 |     complex_words = 0
52 |     found = False
53 |     cur_word = []
54 |     
55 |     for word in words:          
56 |         cur_word.append(word)
57 |         if count_syllables(cur_word)>= 3:
58 |             
59 |             #Checking proper nouns. If a word starts with a capital letter
60 |             #and is NOT at the beginning of a sentence we don't add it
61 |             #as a complex word.
62 |             if not(word[0].isupper()):
63 |                 complex_words += 1
64 |             else:
65 |                 for sentence in sentences:
66 |                     if str(sentence).startswith(word):
67 |                         found = True
68 |                         break
69 |                 if found: 
70 |                     complex_words += 1
71 |                     found = False
72 |                 
73 |         cur_word.remove(word)
74 |     return complex_words
75 | 
76 | 


--------------------------------------------------------------------------------
/Rank_2_Sergazy/word2vecUtils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import re
  4 | import nltk
  5 | 
  6 | import pandas as pd
  7 | import numpy as np
  8 | 
  9 | from bs4 import BeautifulSoup
 10 | from nltk.corpus import stopwords
 11 | from nltk.stem import SnowballStemmer
 12 | 
 13 | class utils(object):
 14 |     """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""
 15 | 
 16 |     @staticmethod
 17 |     def review_to_wordlist( review, remove_stopwords=False ):
 18 | 
 19 |         #
 20 |         # 2. Remove non-letters
 21 |         review_text = re.sub("[^a-zA-Z]"," ", review)
 22 |         #
 23 |         # 3. Convert words to lower case and split them
 24 |         words = review_text.lower().split()
 25 |         #
 26 |         # 4. Optionally remove stop words (false by default)
 27 |         if remove_stopwords:
 28 |             stops = set(stopwords.words("english"))
 29 |             words = [w for w in words if not w in stops]
 30 |         #
 31 |         # 5. Return a list of words
 32 |         return(words)
 33 | 
 34 |     # Define a function to split a review into parsed sentences
 35 |     @staticmethod
 36 |     def review_to_sentences( review, tokenizer, remove_stopwords=False ):
 37 |         # Function to split a review into parsed sentences. Returns a
 38 |         # list of sentences, where each sentence is a list of words
 39 |         #
 40 |         # 1. Use the NLTK tokenizer to split the paragraph into sentences
 41 |         raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
 42 |         #
 43 |         # 2. Loop over each sentence
 44 |         sentences = []
 45 |         for raw_sentence in raw_sentences:
 46 |             # If a sentence is empty, skip it
 47 |             if len(raw_sentence) > 0:
 48 |                 # Otherwise, call review_to_wordlist to get a list of words
 49 |                 sentences.append( utils.review_to_wordlist( raw_sentence, \
 50 |                   remove_stopwords ))
 51 |         #
 52 |         # Return the list of sentences (each sentence is a list of words,
 53 |         # so this returns a list of lists
 54 |         return sentences
 55 | 
 56 | 
 57 |     @staticmethod
 58 |     def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
 59 |         # Clean the text, with the option to remove stopwords and to stem words.
 60 |         
 61 |         # Convert words to lower case and split them
 62 |         text = text.lower().split()
 63 | 
 64 |         # Optionally, remove stop words
 65 |         if remove_stopwords:
 66 |             stops = set(stopwords.words("english"))
 67 |             text = [w for w in text if not w in stops]
 68 |         
 69 |         text = " ".join(text)
 70 | 
 71 |         # Clean the text
 72 |         text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
 73 |         text = re.sub(r"what's", "what is ", text)
 74 |         text = re.sub(r"\'s", " ", text)
 75 |         text = re.sub(r"\'ve", " have ", text)
 76 |         text = re.sub(r"can't", "cannot ", text)
 77 |         text = re.sub(r"n't", " not ", text)
 78 |         text = re.sub(r"i'm", "i am ", text)
 79 |         text = re.sub(r"\'re", " are ", text)
 80 |         text = re.sub(r"\'d", " would ", text)
 81 |         text = re.sub(r"\'ll", " will ", text)
 82 |         text = re.sub(r",", " ", text)
 83 |         text = re.sub(r"\.", " ", text)
 84 |         text = re.sub(r"!", " ! ", text)
 85 |         text = re.sub(r"\/", " ", text)
 86 |         text = re.sub(r"\^", " ^ ", text)
 87 |         text = re.sub(r"\+", " + ", text)
 88 |         text = re.sub(r"\-", " - ", text)
 89 |         text = re.sub(r"\=", " = ", text)
 90 |         text = re.sub(r"'", " ", text)
 91 |         text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
 92 |         text = re.sub(r":", " : ", text)
 93 |         text = re.sub(r" e g ", " eg ", text)
 94 |         text = re.sub(r" b g ", " bg ", text)
 95 |         text = re.sub(r" u s ", " american ", text)
 96 |         text = re.sub(r"\0s", "0", text)
 97 |         text = re.sub(r" 9 11 ", "911", text)
 98 |         text = re.sub(r"e - mail", "email", text)
 99 |         text = re.sub(r"j k", "jk", text)
100 |         text = re.sub(r"\s{2,}", " ", text)
101 |         
102 |         # Optionally, shorten words to their stems
103 |         if stem_words:
104 |             text = text.split()
105 |             stemmer = SnowballStemmer('english')
106 |             stemmed_words = [stemmer.stem(word) for word in text]
107 |             text = " ".join(stemmed_words)
108 |         
109 |         # Return a list of words
110 |         return(text)
111 | 
112 | 


--------------------------------------------------------------------------------
/XGBoost_Python_TextFeats.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Introduction"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "This notebook is created and shared to help people learn and understand the process of solving a problem which involves text variables. Apart from creating new variables, you'll learn to extract ~650 text (count) features and use them in training a xgboost model. This script scores ~0.70 on public leaderboard.\n",
 15 |     "For any questions, feel free to raise issues."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {
 22 |     "collapsed": true
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# load libraries\n",
 27 |     "\n",
 28 |     "import pandas as pd\n",
 29 |     "import numpy as np\n",
 30 |     "import re\n",
 31 |     "import datetime\n",
 32 |     "from nltk.corpus import stopwords\n",
 33 |     "from sklearn.preprocessing import LabelEncoder\n",
 34 |     "from nltk.stem.snowball import SnowballStemmer\n",
 35 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
 36 |     "import xgboost as xgb\n",
 37 |     "\n",
 38 |     "pd.set_option('display.max_colwidth',100)"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 177,
 44 |    "metadata": {
 45 |     "collapsed": true
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "#load data\n",
 50 |     "train = pd.read_csv('train.csv')\n",
 51 |     "test = pd.read_csv('test.csv')"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 98,
 57 |    "metadata": {
 58 |     "collapsed": false
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# convert unix time format\n",
 63 |     "unix_cols = ['deadline','state_changed_at','launched_at','created_at']\n",
 64 |     "\n",
 65 |     "for x in unix_cols:\n",
 66 |     "    train[x] = train[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))\n",
 67 |     "    test[x] = test[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))\n"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "### Some features"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 99,
 80 |    "metadata": {
 81 |     "collapsed": true
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "cols_to_use = ['name','desc']\n",
 86 |     "len_feats = ['name_len','desc_len']\n",
 87 |     "count_feats = ['name_count','desc_count']\n",
 88 |     "\n",
 89 |     "for i in np.arange(2):\n",
 90 |     "    train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)\n",
 91 |     "    test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 100,
 97 |    "metadata": {
 98 |     "collapsed": true
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "train['name_count'] = train['name'].str.split().str.len()\n",
103 |     "train['desc_count'] = train['desc'].str.split().str.len()\n",
104 |     "\n",
105 |     "test['name_count'] = test['name'].str.split().str.len()\n",
106 |     "test['desc_count'] = test['desc'].str.split().str.len()"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 101,
112 |    "metadata": {
113 |     "collapsed": true
114 |    },
115 |    "outputs": [],
116 |    "source": [
117 |     "train['keywords_len'] = train['keywords'].str.len()\n",
118 |     "train['keywords_count'] = train['keywords'].str.split('-').str.len()\n",
119 |     "\n",
120 |     "test['keywords_len'] = test['keywords'].str.len()\n",
121 |     "test['keywords_count'] = test['keywords'].str.split('-').str.len()"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "### Some more features"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 102,
134 |    "metadata": {
135 |     "collapsed": true
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "# converting string variables to datetime\n",
140 |     "unix_cols = ['deadline','state_changed_at','launched_at','created_at']\n",
141 |     "\n",
142 |     "for x in unix_cols:\n",
143 |     "    train[x] = train[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))\n",
144 |     "    test[x] = test[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 103,
150 |    "metadata": {
151 |     "collapsed": true
152 |    },
153 |    "outputs": [],
154 |    "source": [
155 |     "# there should be simpler way - might take longer\n",
156 |     "# creating list with time difference between 1) launched_at and created_at 2) deadline and launched_at\n",
157 |     "\n",
158 |     "time1 = []\n",
159 |     "time3 = []\n",
160 |     "for i in np.arange(train.shape[0]):\n",
161 |     "    time1.append(np.round((train.loc[i, 'launched_at'] - train.loc[i, 'created_at']).total_seconds()).astype(int))\n",
162 |     "    time3.append(np.round((train.loc[i, 'deadline'] - train.loc[i, 'launched_at']).total_seconds()).astype(int))"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 104,
168 |    "metadata": {
169 |     "collapsed": true
170 |    },
171 |    "outputs": [],
172 |    "source": [
173 |     "train['time1'] = np.log(time1)\n",
174 |     "train['time3'] = np.log(time3)"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 105,
180 |    "metadata": {
181 |     "collapsed": true
182 |    },
183 |    "outputs": [],
184 |    "source": [
185 |     "# for test data\n",
186 |     "time5 = []\n",
187 |     "time6 = []\n",
188 |     "for i in np.arange(test.shape[0]):\n",
189 |     "    time5.append(np.round((test.loc[i, 'launched_at'] - test.loc[i, 'created_at']).total_seconds()).astype(int))\n",
190 |     "    time6.append(np.round((test.loc[i, 'deadline'] - test.loc[i, 'launched_at']).total_seconds()).astype(int))"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": 106,
196 |    "metadata": {
197 |     "collapsed": true
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "test['time1'] = np.log(time5)\n",
202 |     "test['time3'] = np.log(time6)"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 107,
208 |    "metadata": {
209 |     "collapsed": true
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "feat = ['disable_communication','country']\n",
214 |     "\n",
215 |     "for x in feat:\n",
216 |     "    le = LabelEncoder()\n",
217 |     "    le.fit(list(train[x].values) + list(test[x].values))\n",
218 |     "    train[x] = le.transform(list(train[x]))\n",
219 |     "    test[x] = le.transform(list(test[x]))"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 109,
225 |    "metadata": {
226 |     "collapsed": true
227 |    },
228 |    "outputs": [],
229 |    "source": [
230 |     "train['goal'] = np.log1p(train['goal'])\n",
231 |     "test['goal'] = np.log1p(test['goal'])"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "### Text Cleaning"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 110,
244 |    "metadata": {
245 |     "collapsed": true
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "# creating a full list of descriptions from train and etst\n",
250 |     "kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 111,
256 |    "metadata": {
257 |     "collapsed": true
258 |    },
259 |    "outputs": [],
260 |    "source": [
261 |     "# this function cleans punctuations, digits and irregular tabs. Then converts the sentences to lower\n",
262 |     "def desc_clean(word):\n",
263 |     "    p1 = re.sub(pattern='(\\W+)|(\\d+)|(\\s+)',repl=' ',string=word)\n",
264 |     "    p1 = p1.lower()\n",
265 |     "    return p1\n",
266 |     "\n",
267 |     "kickdesc = kickdesc.map(desc_clean)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 113,
273 |    "metadata": {
274 |     "collapsed": false
275 |    },
276 |    "outputs": [],
277 |    "source": [
278 |     "stop = set(stopwords.words('english'))\n",
279 |     "kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]\n",
280 |     "\n",
281 |     "stemmer = SnowballStemmer(language='english')\n",
282 |     "kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]\n",
283 |     "\n",
284 |     "kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]\n",
285 |     "\n",
286 |     "kickdesc = [' '.join(x) for x in kickdesc]"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "markdown",
291 |    "metadata": {},
292 |    "source": [
293 |     "### Creating Count Features"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": 147,
299 |    "metadata": {
300 |     "collapsed": true
301 |    },
302 |    "outputs": [],
303 |    "source": [
304 |     "# Due to memory error, limited the number of features to 650\n",
305 |     "cv = CountVectorizer(max_features=650)"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 148,
311 |    "metadata": {
312 |     "collapsed": false
313 |    },
314 |    "outputs": [],
315 |    "source": [
316 |     "alldesc = cv.fit_transform(kickdesc).todense()"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 150,
322 |    "metadata": {
323 |     "collapsed": true
324 |    },
325 |    "outputs": [],
326 |    "source": [
327 |     "#create a data frame\n",
328 |     "combine = pd.DataFrame(alldesc)\n",
329 |     "combine.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 157,
335 |    "metadata": {
336 |     "collapsed": true
337 |    },
338 |    "outputs": [],
339 |    "source": [
340 |     "#split the text features\n",
341 |     "\n",
342 |     "train_text = combine[:train.shape[0]]\n",
343 |     "test_text = combine[train.shape[0]:]\n",
344 |     "\n",
345 |     "test_text.reset_index(drop=True,inplace=True)"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "markdown",
350 |    "metadata": {},
351 |    "source": [
352 |     "### Finalizing train and test data before merging"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": 162,
358 |    "metadata": {
359 |     "collapsed": true
360 |    },
361 |    "outputs": [],
362 |    "source": [
363 |     "cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','time1','time3','goal']"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 198,
369 |    "metadata": {
370 |     "collapsed": true
371 |    },
372 |    "outputs": [],
373 |    "source": [
374 |     "target = train['final_status']"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": 168,
380 |    "metadata": {
381 |     "collapsed": true
382 |    },
383 |    "outputs": [],
384 |    "source": [
385 |     "train = train.loc[:,cols_to_use]\n",
386 |     "test = test.loc[:,cols_to_use]"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 174,
392 |    "metadata": {
393 |     "collapsed": false
394 |    },
395 |    "outputs": [],
396 |    "source": [
397 |     "X_train = pd.concat([train, train_text],axis=1)\n",
398 |     "X_test = pd.concat([test, test_text],axis=1)"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 175,
404 |    "metadata": {
405 |     "collapsed": false
406 |    },
407 |    "outputs": [
408 |     {
409 |      "name": "stdout",
410 |      "output_type": "stream",
411 |      "text": [
412 |       "(108129, 659)\n",
413 |       "(63465, 659)\n"
414 |      ]
415 |     }
416 |    ],
417 |    "source": [
418 |     "print X_train.shape\n",
419 |     "print X_test.shape"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "markdown",
424 |    "metadata": {},
425 |    "source": [
426 |     "### Model Training"
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "code",
431 |    "execution_count": 180,
432 |    "metadata": {
433 |     "collapsed": true
434 |    },
435 |    "outputs": [],
436 |    "source": [
437 |     "dtrain = xgb.DMatrix(data=X_train, label = target)\n",
438 |     "dtest = xgb.DMatrix(data=X_test)"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": 185,
444 |    "metadata": {
445 |     "collapsed": true
446 |    },
447 |    "outputs": [],
448 |    "source": [
449 |     "params = {\n",
450 |     "    'objective':'binary:logistic',\n",
451 |     "    'eval_metric':'error',\n",
452 |     "    'eta':0.025,\n",
453 |     "    'max_depth':6,\n",
454 |     "    'subsample':0.7,\n",
455 |     "    'colsample_bytree':0.7,\n",
456 |     "    'min_child_weight':5\n",
457 |     "    \n",
458 |     "}"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "code",
463 |    "execution_count": 186,
464 |    "metadata": {
465 |     "collapsed": false
466 |    },
467 |    "outputs": [
468 |     {
469 |      "name": "stdout",
470 |      "output_type": "stream",
471 |      "text": [
472 |       "[0]\ttrain-error:0.312506+0.000818869\ttest-error:0.315126+0.00200614\n",
473 |       "[10]\ttrain-error:0.309036+0.00111051\ttest-error:0.312694+0.00309961\n",
474 |       "[20]\ttrain-error:0.308571+0.00121385\ttest-error:0.311917+0.00244751\n",
475 |       "[30]\ttrain-error:0.307829+0.00111239\ttest-error:0.311871+0.00226141\n",
476 |       "[40]\ttrain-error:0.306412+0.000131922\ttest-error:0.310502+0.00297732\n",
477 |       "[50]\ttrain-error:0.3044+0.00051706\ttest-error:0.308846+0.00239344\n",
478 |       "[60]\ttrain-error:0.302541+0.000442086\ttest-error:0.307773+0.00231223\n",
479 |       "[70]\ttrain-error:0.301077+0.000528266\ttest-error:0.306812+0.00204724\n",
480 |       "[80]\ttrain-error:0.29982+0.000589525\ttest-error:0.305988+0.00195492\n",
481 |       "[90]\ttrain-error:0.298257+0.000508031\ttest-error:0.305064+0.00191186\n",
482 |       "[100]\ttrain-error:0.297228+0.000279615\ttest-error:0.303723+0.00173837\n",
483 |       "[110]\ttrain-error:0.296349+0.000327545\ttest-error:0.303223+0.00172593\n",
484 |       "[120]\ttrain-error:0.295457+0.00028638\ttest-error:0.302289+0.00191928\n",
485 |       "[130]\ttrain-error:0.294583+0.000378843\ttest-error:0.301623+0.00154688\n",
486 |       "[140]\ttrain-error:0.293783+0.000438995\ttest-error:0.300948+0.00175078\n",
487 |       "[150]\ttrain-error:0.292814+0.000369126\ttest-error:0.300467+0.00155916\n",
488 |       "[160]\ttrain-error:0.292169+0.000444825\ttest-error:0.299968+0.00178122\n",
489 |       "[170]\ttrain-error:0.291249+0.00034968\ttest-error:0.299459+0.00213267\n",
490 |       "[180]\ttrain-error:0.290553+0.000387982\ttest-error:0.298793+0.00213854\n",
491 |       "[190]\ttrain-error:0.28976+0.00040561\ttest-error:0.298321+0.00204807\n",
492 |       "[200]\ttrain-error:0.289114+0.000467604\ttest-error:0.297748+0.00208411\n",
493 |       "[210]\ttrain-error:0.288467+0.000442353\ttest-error:0.297507+0.00212083\n",
494 |       "[220]\ttrain-error:0.287843+0.000490146\ttest-error:0.297008+0.00234605\n",
495 |       "[230]\ttrain-error:0.287285+0.000393046\ttest-error:0.296879+0.00223639\n",
496 |       "[240]\ttrain-error:0.286751+0.000357893\ttest-error:0.296574+0.00248519\n",
497 |       "[250]\ttrain-error:0.286134+0.000314877\ttest-error:0.296009+0.00236468\n",
498 |       "[260]\ttrain-error:0.285519+0.000468298\ttest-error:0.295843+0.00219033\n",
499 |       "[270]\ttrain-error:0.284932+0.000440225\ttest-error:0.295658+0.00224829\n",
500 |       "[280]\ttrain-error:0.28452+0.000440191\ttest-error:0.295399+0.0023164\n",
501 |       "[290]\ttrain-error:0.283933+0.000573286\ttest-error:0.295436+0.00240771\n",
502 |       "[300]\ttrain-error:0.28351+0.000553644\ttest-error:0.295094+0.00255344\n",
503 |       "[310]\ttrain-error:0.283059+0.00042736\ttest-error:0.294622+0.00241165\n",
504 |       "[320]\ttrain-error:0.282467+0.000447172\ttest-error:0.294243+0.00222645\n",
505 |       "[330]\ttrain-error:0.281928+0.000534553\ttest-error:0.294298+0.0023867\n",
506 |       "[340]\ttrain-error:0.281459+0.000575282\ttest-error:0.293965+0.0023361\n",
507 |       "[350]\ttrain-error:0.28105+0.000749895\ttest-error:0.293947+0.00236084\n",
508 |       "[360]\ttrain-error:0.280581+0.000750061\ttest-error:0.293669+0.00230567\n",
509 |       "[370]\ttrain-error:0.280201+0.000686446\ttest-error:0.293632+0.00227019\n",
510 |       "[380]\ttrain-error:0.279804+0.000762174\ttest-error:0.293457+0.00198204\n",
511 |       "[390]\ttrain-error:0.279177+0.000770605\ttest-error:0.293401+0.00211599\n",
512 |       "[400]\ttrain-error:0.278974+0.00074798\ttest-error:0.293438+0.00208914\n",
513 |       "[410]\ttrain-error:0.278409+0.000716594\ttest-error:0.293207+0.00205472\n",
514 |       "[420]\ttrain-error:0.278042+0.000809643\ttest-error:0.293078+0.00229264\n",
515 |       "[430]\ttrain-error:0.27773+0.000680744\ttest-error:0.293078+0.00194402\n",
516 |       "[440]\ttrain-error:0.277392+0.00069521\ttest-error:0.292957+0.00195425\n",
517 |       "[450]\ttrain-error:0.276805+0.000554099\ttest-error:0.292754+0.00202523\n",
518 |       "[460]\ttrain-error:0.276335+0.000462337\ttest-error:0.292356+0.00195339\n",
519 |       "[470]\ttrain-error:0.276046+0.000483488\ttest-error:0.292171+0.00215688\n",
520 |       "[480]\ttrain-error:0.275612+0.000481538\ttest-error:0.292153+0.00231926\n",
521 |       "[490]\ttrain-error:0.275316+0.000540829\ttest-error:0.29206+0.0023093\n",
522 |       "[500]\ttrain-error:0.274876+0.000555857\ttest-error:0.291893+0.00216299\n",
523 |       "[510]\ttrain-error:0.274601+0.000543743\ttest-error:0.291727+0.0022489\n",
524 |       "[520]\ttrain-error:0.274345+0.000556289\ttest-error:0.291672+0.00212988\n",
525 |       "[530]\ttrain-error:0.273884+0.000640932\ttest-error:0.291671+0.002083\n",
526 |       "[540]\ttrain-error:0.273445+0.000572263\ttest-error:0.291431+0.00215843\n",
527 |       "[550]\ttrain-error:0.27307+0.000643974\ttest-error:0.291533+0.00208399\n",
528 |       "[560]\ttrain-error:0.272839+0.000715068\ttest-error:0.291367+0.0021741\n",
529 |       "[570]\ttrain-error:0.272474+0.000693709\ttest-error:0.291145+0.00218187\n",
530 |       "[580]\ttrain-error:0.272116+0.000735978\ttest-error:0.291061+0.00239614\n",
531 |       "[590]\ttrain-error:0.27172+0.000671488\ttest-error:0.291052+0.00220047\n",
532 |       "[600]\ttrain-error:0.271392+0.000581353\ttest-error:0.291061+0.00205433\n",
533 |       "[610]\ttrain-error:0.270997+0.000704158\ttest-error:0.291034+0.00215672\n",
534 |       "[620]\ttrain-error:0.27073+0.00065256\ttest-error:0.290978+0.00208651\n",
535 |       "[630]\ttrain-error:0.270305+0.00058566\ttest-error:0.290876+0.00236142\n",
536 |       "[640]\ttrain-error:0.269984+0.000583791\ttest-error:0.290756+0.00241029\n",
537 |       "[650]\ttrain-error:0.269609+0.000637878\ttest-error:0.290543+0.00218322\n",
538 |       "[660]\ttrain-error:0.269343+0.000666656\ttest-error:0.290432+0.00199139\n",
539 |       "[670]\ttrain-error:0.268943+0.000610928\ttest-error:0.290303+0.00202394\n",
540 |       "[680]\ttrain-error:0.268562+0.000486423\ttest-error:0.290025+0.00214464\n",
541 |       "[690]\ttrain-error:0.268263+0.000557561\ttest-error:0.290072+0.0020426\n",
542 |       "[700]\ttrain-error:0.267801+0.000551476\ttest-error:0.289942+0.00195377\n",
543 |       "[710]\ttrain-error:0.267494+0.000522724\ttest-error:0.289942+0.00203917\n",
544 |       "[720]\ttrain-error:0.267221+0.000533431\ttest-error:0.290229+0.00199579\n",
545 |       "[730]\ttrain-error:0.266888+0.000576349\ttest-error:0.289979+0.00215134\n",
546 |       "[740]\ttrain-error:0.266578+0.000550324\ttest-error:0.289794+0.00203336\n",
547 |       "[750]\ttrain-error:0.266263+0.000577213\ttest-error:0.289877+0.00193757\n",
548 |       "[760]\ttrain-error:0.266023+0.000512068\ttest-error:0.289794+0.00204126\n",
549 |       "[770]\ttrain-error:0.265692+0.000416448\ttest-error:0.289618+0.00217298\n",
550 |       "[780]\ttrain-error:0.26532+0.000458632\ttest-error:0.289387+0.0019424\n",
551 |       "[790]\ttrain-error:0.26507+0.000483665\ttest-error:0.289119+0.00195537\n",
552 |       "[800]\ttrain-error:0.26483+0.000291451\ttest-error:0.289064+0.00178512\n",
553 |       "[810]\ttrain-error:0.264453+0.00026856\ttest-error:0.288814+0.00180906\n",
554 |       "[820]\ttrain-error:0.26431+0.000318559\ttest-error:0.288823+0.00181794\n",
555 |       "[830]\ttrain-error:0.264018+0.000304194\ttest-error:0.288851+0.00188464\n",
556 |       "[840]\ttrain-error:0.263632+0.000370743\ttest-error:0.288694+0.00201019\n",
557 |       "[850]\ttrain-error:0.263352+0.000371047\ttest-error:0.288518+0.00189282\n",
558 |       "[860]\ttrain-error:0.262953+0.000410568\ttest-error:0.288536+0.00195185\n",
559 |       "[870]\ttrain-error:0.26261+0.000301595\ttest-error:0.288472+0.00210928\n",
560 |       "[880]\ttrain-error:0.262349+0.000334279\ttest-error:0.288361+0.00204547\n",
561 |       "[890]\ttrain-error:0.262137+0.000331987\ttest-error:0.288176+0.0019154\n",
562 |       "[900]\ttrain-error:0.261792+0.00027822\ttest-error:0.28825+0.00200458\n",
563 |       "[910]\ttrain-error:0.261489+0.000354748\ttest-error:0.287972+0.00209926\n",
564 |       "[920]\ttrain-error:0.261239+0.000327636\ttest-error:0.28825+0.00186055\n",
565 |       "[930]\ttrain-error:0.260909+0.000239154\ttest-error:0.287963+0.00191734\n",
566 |       "[940]\ttrain-error:0.260596+0.000299306\ttest-error:0.287954+0.00166505\n",
567 |       "[950]\ttrain-error:0.260319+0.000246223\ttest-error:0.287797+0.00174056\n",
568 |       "[960]\ttrain-error:0.260134+0.000339131\ttest-error:0.287639+0.00181021\n",
569 |       "[970]\ttrain-error:0.25991+0.000288789\ttest-error:0.287648+0.00182037\n",
570 |       "[980]\ttrain-error:0.259715+0.000294464\ttest-error:0.287695+0.00192808\n",
571 |       "[990]\ttrain-error:0.259422+0.000406691\ttest-error:0.287565+0.0019653\n"
572 |      ]
573 |     }
574 |    ],
575 |    "source": [
576 |     "# You can probably get better accuracy with rounds > 1000. \n",
577 |     "bst = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=40,nfold=5L,verbose_eval=10)"
578 |    ]
579 |   },
580 |   {
581 |    "cell_type": "code",
582 |    "execution_count": 187,
583 |    "metadata": {
584 |     "collapsed": true
585 |    },
586 |    "outputs": [],
587 |    "source": [
588 |     "bst_train = xgb.train(params, dtrain, num_boost_round=1000)"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 188,
594 |    "metadata": {
595 |     "collapsed": true
596 |    },
597 |    "outputs": [],
598 |    "source": [
599 |     "p_test = bst_train.predict(dtest)"
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 189,
605 |    "metadata": {
606 |     "collapsed": true
607 |    },
608 |    "outputs": [],
609 |    "source": [
610 |     "sub = pd.DataFrame()\n",
611 |     "sub['project_id'] = test['project_id']\n",
612 |     "sub['final_status'] = p_test"
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "code",
617 |    "execution_count": 194,
618 |    "metadata": {
619 |     "collapsed": false
620 |    },
621 |    "outputs": [],
622 |    "source": [
623 |     "sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]"
624 |    ]
625 |   },
626 |   {
627 |    "cell_type": "code",
628 |    "execution_count": 196,
629 |    "metadata": {
630 |     "collapsed": true
631 |    },
632 |    "outputs": [],
633 |    "source": [
634 |     "sub.to_csv(\"xgb_with_python_feats.csv\",index=False) #0.70"
635 |    ]
636 |   },
637 |   {
638 |    "cell_type": "code",
639 |    "execution_count": null,
640 |    "metadata": {
641 |     "collapsed": true
642 |    },
643 |    "outputs": [],
644 |    "source": []
645 |   }
646 |  ],
647 |  "metadata": {
648 |   "kernelspec": {
649 |    "display_name": "Python 2",
650 |    "language": "python",
651 |    "name": "python2"
652 |   },
653 |   "language_info": {
654 |    "codemirror_mode": {
655 |     "name": "ipython",
656 |     "version": 2
657 |    },
658 |    "file_extension": ".py",
659 |    "mimetype": "text/x-python",
660 |    "name": "python",
661 |    "nbconvert_exporter": "python",
662 |    "pygments_lexer": "ipython2",
663 |    "version": "2.7.13"
664 |   }
665 |  },
666 |  "nbformat": 4,
667 |  "nbformat_minor": 2
668 | }
669 | 


--------------------------------------------------------------------------------
/xgboost_starter.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # This script will help you learn how to build a xgboost models on features extracted using 
  3 | # Text Mining methods. This script scores ~0.70 on public leaderboard.
  4 | 
  5 | 
  6 | # Load Libraries ----------------------------------------------------------
  7 | 
  8 | library(data.table)
  9 | library(stringr)
 10 | library(text2vec)
 11 | 
 12 | train <- fread("train.csv")
 13 | test <- fread("test.csv")
 14 | 
 15 | 
 16 | # Convert Unix Time Format ------------------------------------------------
 17 | 
 18 | unix_feats <- c('deadline','state_changed_at','created_at','launched_at')
 19 | train[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats]
 20 | test[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats]
 21 | 
 22 | 
 23 | # Create Features ---------------------------------------------------------
 24 | 
 25 | len_feats <- c('name_len','desc_len','keywords_len')
 26 | count_feats <- c('name_count','desc_count','keywords_count')
 27 | cols <- c('name','desc','keywords')
 28 | 
 29 | train[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols]
 30 | train[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols]
 31 | 
 32 | test[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols]
 33 | test[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols]
 34 | 
 35 | 
 36 | # Some More Features ------------------------------------------------------
 37 | 
 38 | train[,time1 := as.numeric(difftime(launched_at, created_at))]
 39 | train[,time3 := as.numeric(difftime(deadline, launched_at))]
 40 | 
 41 | train[,time1 := log(time1)]
 42 | train[,time3 := log(time3)]
 43 | 
 44 | test[,time1 := as.numeric(difftime(launched_at, created_at))]
 45 | test[,time3 := as.numeric(difftime(deadline, launched_at))]
 46 | 
 47 | test[,time1 := log(time1)]
 48 | test[,time3 := log(time3)]
 49 | 
 50 | 
 51 | 
 52 | # Encoding Variables ------------------------------------------------------
 53 | 
 54 | train[,disable_communication := as.integer(as.factor(disable_communication))-1]
 55 | test[,disable_communication := as.integer(as.factor(disable_communication))-1]
 56 | 
 57 | countryall <- data.table(country = append(train$country, test$country))
 58 | countryall[,country := as.integer(as.factor(country))-1]
 59 | 
 60 | country_train <- countryall[1:nrow(train)]
 61 | country_test <- countryall[(nrow(train)+1):nrow(countryall)]
 62 | 
 63 | train[,country := NULL][,country := country_train$country]
 64 | test[,country := NULL][, country := country_test$country]
 65 | 
 66 | train[,goal := log1p(goal)]
 67 | test[,goal := log1p(goal)]
 68 | 
 69 | rm(country_test,country_train,countryall)
 70 | gc()
 71 | 
 72 | 
 73 | 
 74 | # Creating Features from 'Keywords' Variable ------------------------------
 75 | 
 76 | # We could have use a R package to perform the following text mining steps.
 77 | # Rather we'll follow a manual cleaning process which will help you learn using regular expressions as well
 78 | 
 79 | #creating a data frame by combining keywords from both data sets
 80 | fullkey <- rbind(train[,.(project_id,keywords)], test[,.(project_id, keywords)])
 81 | 
 82 | 
 83 | 
 84 | # Text Cleaning -----------------------------------------------------------
 85 | 
 86 | fullkey[,keywords := lapply(keywords, function(x) str_split(string = x, pattern = "-"))]
 87 | 
 88 | # function to remove stop words
 89 | remov_stop <- function(x){
 90 |   
 91 |   t <- unlist(x)
 92 |   t <- setdiff(t, tidytext::stop_words$word)
 93 |   return (t)
 94 |   
 95 | }
 96 | 
 97 | fullkey[,keywords := lapply(keywords, function(x) remov_stop(x))]
 98 | fullkey[,keywords := lapply(keywords, function(x) str_replace_all(x, "[[:digit:]]",""))]
 99 | fullkey[,keywords := lapply(keywords, function(x) SnowballC::wordStem(x))]
100 | fullkey[, keywords := lapply(keywords, function(x) x[nchar(x) > 2])]
101 | 
102 | 
103 | # creating count corpus
104 | 
105 | vec_train <- itoken(fullkey$keywords,tokenizer = word_tokenizer,ids = fullkey$project_id)
106 | vocab = create_vocabulary(vec_train)
107 | vocab
108 | 
109 | pruned_vocab <- prune_vocabulary(vocab,term_count_min = 150) # words occuring 150 or more times
110 | pruned_vocab
111 | 
112 | vocab1 <- vocab_vectorizer(pruned_vocab)
113 | dtm_text <- create_dtm(vec_train,vocab1)
114 | dim(dtm_text)
115 | 
116 | dtm_text1 <- as.data.table(as.matrix(dtm_text))
117 | 
118 | dtm_train <- dtm_text1[1:108129]
119 | dtm_test <- dtm_text1[108130:171594]
120 | 
121 | 
122 | # Adding text features in train and test data -----------------------------
123 | 
124 | X_train <- copy(train)
125 | X_test <- copy(test)
126 | 
127 | cols_to_use <- c('name_len'
128 |                  ,'desc_len'
129 |                  ,'keywords_len'
130 |                  ,'name_count'
131 |                  ,'desc_count'
132 |                  ,'keywords_count'
133 |                  ,'time1'
134 |                  ,'time3'
135 |                  ,'goal')
136 | 
137 | X_train <- cbind(X_train[,cols_to_use,with=F], dtm_train)
138 | X_test <- cbind(X_test[,cols_to_use,with=F], dtm_test)
139 | 
140 | X_train <- cbind(X_train, train_isnum$is_number)
141 | X_test <- cbind(X_train, test_isnum$is_number)
142 | 
143 | 
144 | # Model Training ----------------------------------------------------------
145 | 
146 | library(xgboost)
147 | 
148 | dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = as.numeric(train$final_status))
149 | dtest <- xgb.DMatrix(data = as.matrix(X_test))
150 | 
151 | params <- list(
152 |   
153 |   objective = "binary:logistic",
154 |   eta = 0.025,
155 |   max_depth = 6,
156 |   subsample = 0.7,
157 |   colsample_bytree = 0.7,
158 |   min_child_weight = 5
159 |   
160 | )
161 | 
162 | big_cv <- xgb.cv(params = params
163 |                  ,data = dtrain
164 |                  ,nrounds = 1000
165 |                  ,nfold = 5L
166 |                  ,metrics = 'error'
167 |                  ,stratified = T
168 |                  ,print_every_n = 10
169 |                  ,early_stopping_rounds = 40)
170 | 
171 | iter <- big_cv$best_iteration
172 | 
173 | big_train <- xgb.train(params = params
174 |                        ,data = dtrain
175 |                        ,nrounds = iter)
176 | 
177 | imp <- xgb.importance(model = big_train, feature_names = colnames(dtrain))
178 | xgb.plot.importance(imp,top_n = 20)
179 | 
180 | big_pred <- predict(big_train, dtest)
181 | big_pred <- ifelse(big_pred > 0.5,1,0)
182 | 
183 | sub <- data.table(project_id = test$project_id, final_status = big_pred)
184 | fwrite(sub, "xgb_with_feats.csv") #0.70
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 


--------------------------------------------------------------------------------