├── GbmRStarter.R ├── NaiveBayes-Python.ipynb ├── README.md ├── Rank_1_Roman ├── Instructions ├── first model.ipynb ├── mean_evaluation.py └── start.ipynb ├── Rank_2_Sergazy ├── Instructions ├── best.py ├── layer2.py ├── lstm.py ├── readability.py ├── syllables_en.py ├── utils.py └── word2vecUtils.py ├── XGBoost_Python_TextFeats.ipynb └── xgboost_starter.R /GbmRStarter.R: -------------------------------------------------------------------------------- 1 | # load data and libraries 2 | 3 | library(data.table) 4 | library(lubridate) 5 | library(stringr) 6 | 7 | train <- fread("train.csv") 8 | test <- fread("test.csv") 9 | 10 | # data dimension 11 | 12 | sprintf("There are %s rows and %s columns in train data ",nrow(train),ncol(train)) 13 | sprintf("There are %s rows and %s columns in test data ",nrow(test),ncol(test)) 14 | 15 | # convert unix time format 16 | 17 | unix_feats <- c('deadline','state_changed_at','created_at','launched_at') 18 | train[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats] 19 | test[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats] 20 | 21 | # create simple features 22 | 23 | len_feats <- c('name_len','desc_len','keywords_len') 24 | count_feats <- c('name_count','desc_count','keywords_count') 25 | cols <- c('name','desc','keywords') 26 | 27 | train[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols] 28 | train[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols] 29 | 30 | test[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols] 31 | test[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols] 32 | 33 | # encode features 34 | 35 | train[,disable_communication := as.integer(as.factor(disable_communication))-1] 36 | train[,country := as.integer(as.factor(country))-1] 37 | 38 | test[,disable_communication := as.integer(as.factor(disable_communication))-1] 39 | test[,country := as.integer(as.factor(country))-1] 40 | 41 | 42 | # cols to use in modeling 43 | cols_to_use <- c('final_status' 44 | ,'name_len' 45 | ,'desc_len' 46 | ,'keywords_len' 47 | ,'name_count' 48 | ,'desc_count' 49 | ,'keywords_count') 50 | 51 | 52 | # GBM 53 | library(gbm) 54 | set.seed(1) 55 | 56 | X_train <- copy(train) 57 | X_train[,final_status := as.factor(final_status)] 58 | 59 | clf_model <- gbm(final_status ~ . 60 | ,data = train[,cols_to_use,with=F] 61 | ,n.trees = 500 62 | ,interaction.depth = 5 63 | ,shrinkage = 0.3 64 | ,train.fraction = 0.6 65 | ,verbose = T) 66 | 67 | 68 | # check variable importance 69 | summary(clf_model, n.trees = 125) 70 | 71 | # make predictions 72 | clf_pred <- predict(clf_model, newdata = test, n.trees = 232,type = 'response') 73 | clf_pred <- ifelse(clf_pred > 0.6,1,0) 74 | 75 | # write file 76 | subst <- data.table(project_id = test$project_id, final_status = clf_pred) 77 | fwrite(subst, "gbm_starter.csv") #0.65754 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /NaiveBayes-Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### This script is based on simple features derived from text variables. 
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 167, 13 | "metadata": { 14 | "collapsed": true, 15 | "deletable": true, 16 | "editable": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "from sklearn.naive_bayes import GaussianNB" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "deletable": true, 29 | "editable": true 30 | }, 31 | "source": [ 32 | "### load data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 168, 38 | "metadata": { 39 | "collapsed": true, 40 | "deletable": true, 41 | "editable": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "train = pd.read_csv(\"train.csv\")\n", 46 | "test = pd.read_csv(\"test.csv\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 169, 52 | "metadata": { 53 | "collapsed": false, 54 | "deletable": true, 55 | "editable": true 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "The train data has 108129 rows and 14 columns\n", 63 | "The test data has 63465 rows and 12 columns\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "print ('The train data has {} rows and {} columns'.format(train.shape[0],train.shape[1]))\n", 69 | "print ('The test data has {} rows and {} columns'.format(test.shape[0],test.shape[1]))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "deletable": true, 76 | "editable": true 77 | }, 78 | "source": [ 79 | "### convert time to unix format" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 170, 85 | "metadata": { 86 | "collapsed": true, 87 | "deletable": true, 88 | "editable": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import time\n", 93 | "\n", 94 | "unix_cols = ['deadline','state_changed_at','launched_at','created_at']\n", 95 | "\n", 96 | "for x in unix_cols:\n", 97 | " train[x] = train[x].apply(lambda k: time.ctime(k))\n", 98 | " test[x] = test[x].apply(lambda k: time.ctime(k))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "deletable": true, 105 | "editable": true 106 | }, 107 | "source": [ 108 | "### create simple features" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 171, 114 | "metadata": { 115 | "collapsed": false, 116 | "deletable": true, 117 | "editable": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "cols_to_use = ['name','desc']\n", 122 | "len_feats = ['name_len','desc_len']\n", 123 | "count_feats = ['name_count','desc_count']\n", 124 | "\n", 125 | "for i in np.arange(2):\n", 126 | " train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)\n", 127 | " train[count_feats[i]] = train[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 172, 133 | "metadata": { 134 | "collapsed": true, 135 | "deletable": true, 136 | "editable": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "train['keywords_len'] = train['keywords'].apply(str).apply(len)\n", 141 | "train['keywords_count'] = train['keywords'].apply(str).apply(lambda x: len(x.split('-')))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 173, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "for i in np.arange(2):\n", 153 | " test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)\n", 154 | " test[count_feats[i]] = 
test[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))\n", 155 | " \n", 156 | "test['keywords_len'] = test['keywords'].apply(str).apply(len)\n", 157 | "test['keywords_count'] = test['keywords'].apply(str).apply(lambda x: len(x.split('-')))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": { 163 | "deletable": true, 164 | "editable": true 165 | }, 166 | "source": [ 167 | "### encoding features" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 174, 173 | "metadata": { 174 | "collapsed": false, 175 | "deletable": true, 176 | "editable": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "from sklearn.preprocessing import LabelEncoder\n", 181 | "\n", 182 | "feat = ['disable_communication','country']\n", 183 | "for x in feat:\n", 184 | " le = LabelEncoder()\n", 185 | " le.fit(list(train[x].values) + list(test[x].values))\n", 186 | " train[x] = le.transform(list(train[x]))\n", 187 | " test[x] = le.transform(list(test[x].values))\n", 188 | " " 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### model training" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 175, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "cols_to_use = ['name_len'\n", 207 | " ,'desc_len'\n", 208 | " ,'keywords_len'\n", 209 | " ,'name_count'\n", 210 | " ,'desc_count'\n", 211 | " ,'keywords_count']\n", 212 | "\n", 213 | "target = train['final_status']\n", 214 | "\n", 215 | "# data for modeling\n", 216 | "k_train = train[cols_to_use]\n", 217 | "k_test = test[cols_to_use]" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "### naive bayes" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 176, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "gnb = GaussianNB()\n", 236 | "nvb_pred = gnb.fit(k_train, target).predict(k_test)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "### write the file" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 177, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "nBsub = pd.DataFrame({'project_id':test['project_id'],'final_status':nvb_pred})\n", 255 | "nBsub = nBsub[['project_id','final_status']]\n", 256 | "nBsub.to_csv(\"nBstarter.csv\",index = False) #0.6526" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 2", 263 | "language": "python", 264 | "name": "python2" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 2 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython2", 276 | "version": "2.7.13" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 2 281 | } 282 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Challenge #2 2 | 3 | This repository contains R and Python scripts to help ML aspirants and enthusiasts get a nice head-start and learn something new from this 15 days Machine Learning Challenge. 
4 | It is an online competition, hence people from all over the world can participate. 5 | 6 | **Problem:** Predict if a project will get successfully funded or not. 7 | 8 | **Prize:** $700, $500 9 | 10 | **Duration:** 15th June 2017 to 30th June 2017 11 | 12 | **Link:** https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-2/machine-learning/funding-successful-projects/ 13 | -------------------------------------------------------------------------------- /Rank_1_Roman/Instructions: -------------------------------------------------------------------------------- 1 | To reproduce the final submission file:: 2 | 3 | 1. Run first_model.py 4 | 2. start.py 5 | 6 | Note: Keep mean_evaluation.py in the same directory as the files above. 7 | -------------------------------------------------------------------------------- /Rank_1_Roman/first model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 15 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import xgboost as xgb\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import os\n", 24 | "from sklearn.preprocessing import LabelEncoder\n", 25 | "from mean_evaluation import roman_mean\n", 26 | "import datetime\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 28 | "import re\n", 29 | "from scipy.sparse import hstack\n", 30 | "from sklearn.linear_model import LogisticRegression, SGDClassifier\n", 31 | "from sklearn.model_selection import StratifiedKFold, train_test_split\n", 32 | "from sklearn.model_selection import cross_val_predict\n", 33 | "from sklearn.neighbors import KNeighborsClassifier\n", 34 | "from sklearn.svm import SVC, LinearSVC\n", 35 | "from scipy.spatial import distance" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "train = pd.read_csv('train.csv')\n", 47 | "test = pd.read_csv('test.csv')\n", 48 | "\n", 49 | "final_status = train.final_status\n", 50 | "projest_id = train.project_id\n", 51 | "backers_count = train.backers_count\n", 52 | "\n", 53 | "ltr = len(train)\n", 54 | "train.drop(['final_status', 'backers_count'], axis = 1, inplace = True)\n", 55 | "\n", 56 | "data = pd.concat([train, test], axis = 0)\n", 57 | "data.index = range(len(data))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "roman_model = roman_mean(directory = 'path', \n", 69 | " n_folds_gen = 10, \n", 70 | " n_folds_sub = 5, \n", 71 | " seed = 322, \n", 72 | " sub_seed = 228, \n", 73 | " ltr = ltr, \n", 74 | " data = data, \n", 75 | " target = final_status)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | 
"collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "int_project_id = []\n", 87 | "for x in data.project_id.tolist():\n", 88 | " int_project_id += [int(x[4:])]\n", 89 | "data['int_project_id'] = int_project_id" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "int_disable_communication = []\n", 101 | "for x in data.disable_communication.tolist():\n", 102 | " if x == False:\n", 103 | " int_disable_communication += [0]\n", 104 | " else:\n", 105 | " int_disable_communication += [1]\n", 106 | "data['disable_communication_int'] = int_disable_communication" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 10, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "data['deadline-created_at'] = data.deadline - data.created_at\n", 118 | "data['launched_at-created_at'] = data.deadline - data.created_at\n", 119 | "data['state_changed_at-created_at'] = data.deadline - data.created_at\n", 120 | "data['state_changed_at-deadline'] = data.state_changed_at - data.deadline\n", 121 | "data['deadline-launched_at'] = data.deadline - data.launched_at\n", 122 | "data['state_changed_at-launched_at'] = data.state_changed_at - data.launched_at" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 11, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "data['len_name'] = [len(str(x)) for x in data.name.tolist()]\n", 134 | "data['len_desc'] = [len(str(x)) for x in data.desc.tolist()]\n", 135 | "data['len_keywords'] = [len(str(x)) for x in data.keywords.tolist()]\n", 136 | "data['numb_keywords'] = [len(str(x).split('-')) for x in data.keywords.tolist()]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 13, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "len_cov = []\n", 148 | "for x in data.desc.tolist():\n", 149 | " tokens = re.findall('\\\"', str(x))\n", 150 | " len_cov += [len(tokens)]\n", 151 | "data['len_cov'] = len_cov\n", 152 | "data['bad_znak'] = data['len_cov'] / data['len_desc']" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 14, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "keywords = [re.sub('-', ' ', str(x)) for x in data.keywords.tolist()]\n", 164 | "vectorizer = TfidfVectorizer(max_features = 3500, stop_words = 'english')\n", 165 | "keywords_vect = vectorizer.fit_transform(keywords)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 17, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "names = [str(x) for x in data.name.tolist()]\n", 177 | "vectorizer = TfidfVectorizer(max_features = 3500, stop_words = 'english')\n", 178 | "names_vect = vectorizer.fit_transform(names)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 18, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "desc = [str(x) for x in data.desc.tolist()]\n", 190 | "vectorizer = TfidfVectorizer(max_features = 3500, stop_words = 'english')\n", 191 | "desc_vect = vectorizer.fit_transform(desc)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 19, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 
202 | "del vectorizer" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 20, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "sp_data = hstack([keywords_vect, names_vect, desc_vect]).tocsr()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 22, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "time_feat = ['deadline', 'created_at', 'launched_at', 'state_changed_at']\n", 225 | "for time in time_feat:\n", 226 | " weekday = []\n", 227 | " hour = []\n", 228 | " day = []\n", 229 | " for x in data.loc[:, time].tolist():\n", 230 | " weekday += [datetime.datetime.fromtimestamp(x).weekday()]\n", 231 | " hour += [datetime.datetime.fromtimestamp(x).hour]\n", 232 | " day += [datetime.datetime.fromtimestamp(x).day]\n", 233 | " data[time + '_' + 'weekday'] = weekday\n", 234 | " data[time + '_' + 'hour'] = hour\n", 235 | " data[time + '_' + 'day'] = day" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 24, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "deadline_hour_weekday\n", 250 | "created_at_hour_weekday\n", 251 | "launched_at_hour_weekday\n", 252 | "state_changed_at_hour_weekday\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "for time in time_feat:\n", 258 | " print(time + '_' + 'hour_weekday')\n", 259 | " data[time + '_' + 'hour_weekday'] = data[time + '_' + 'hour'].astype(str) + '_' + data[time + '_' + 'weekday'].astype(str)\n", 260 | " data[time + '_' + 'hour_country'] = data[time + '_' + 'hour'].astype(str) + '_' + data['country'].astype(str)\n", 261 | " data[time + '_' + 'weekday_country'] = data[time + '_' + 'weekday'].astype(str) + '_' + data['country'].astype(str)\n", 262 | " data[time + '_' + 'day_country'] = data[time + '_' + 'day'].astype(str) + '_' + data['country'].astype(str)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 25, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "normal_goal = []\n", 274 | "for x, y in zip(data.currency.tolist(), data.goal.tolist()):\n", 275 | " if x == 'USD':\n", 276 | " normal_goal += [y]\n", 277 | " if x == 'GBP':\n", 278 | " normal_goal += [1.5 * y]\n", 279 | " if x == 'EUR':\n", 280 | " normal_goal += [1.2 * y]\n", 281 | " if x == 'CAD':\n", 282 | " normal_goal += [0.85 * y]\n", 283 | " if x == 'AUD':\n", 284 | " normal_goal += [0.85 * y]\n", 285 | " if x == 'SEK':\n", 286 | " normal_goal += [0.14 * y]\n", 287 | " if x == 'NZD':\n", 288 | " normal_goal += [0.70 * y]\n", 289 | " if x == 'DKK':\n", 290 | " normal_goal += [0.17 * y]\n", 291 | " if x == 'NOK':\n", 292 | " normal_goal += [0.15 * y]\n", 293 | " if x == 'CHF':\n", 294 | " normal_goal += [y]\n", 295 | " if x == 'MXN':\n", 296 | " normal_goal += [0.07 * y]\n", 297 | " if x == 'SGD':\n", 298 | " normal_goal += [0.73 * y]\n", 299 | " if x == 'HKD':\n", 300 | " normal_goal += [0.13 * y]\n", 301 | "\n", 302 | "data['normal_goal'] = normal_goal\n", 303 | "data['deadline-created_at_normal_goal'] = data.loc[:, 'deadline-created_at'] / data.normal_goal\n", 304 | "data['launched_at-created_at_normal_goal'] = data.loc[:, 'launched_at-created_at'] / data.normal_goal\n", 305 | "data['state_changed_at-created_at_normal_goal'] = data.loc[:, 'state_changed_at-created_at'] / data.normal_goal\n", 306 | 
"data['state_changed_at-deadline_normal_goal'] = data.loc[:, 'state_changed_at-deadline'] / data.normal_goal\n", 307 | "data['deadline-launched_at_normal_goal'] = data.loc[:, 'deadline-launched_at'] / data.normal_goal\n", 308 | "data['state_changed_at-launched_at_normal_goal'] = data.loc[:, 'state_changed_at-launched_at'] / data.normal_goal" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 27, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "country\n", 323 | "currency\n", 324 | "deadline_hour_weekday\n", 325 | "created_at_hour_weekday\n", 326 | "launched_at_hour_weekday\n", 327 | "state_changed_at_hour_weekday\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "roman_model.cols_mean(['country', 'currency', 'deadline_hour_weekday',\n", 333 | "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday'])" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 28, 339 | "metadata": { 340 | "collapsed": false, 341 | "scrolled": true 342 | }, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\project_id\n", 349 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\name\n", 350 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\desc\n", 351 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\goal\n", 352 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\keywords\n", 353 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\disable_communication\n", 354 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\country\n", 355 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\currency\n", 356 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline\n", 357 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at\n", 358 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at\n", 359 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at\n", 360 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\int_project_id\n", 361 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 
Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\disable_communication_int\n", 362 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-created_at\n", 363 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at-created_at\n", 364 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-created_at\n", 365 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-deadline\n", 366 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-launched_at\n", 367 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-launched_at\n", 368 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_name\n", 369 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_desc\n", 370 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_keywords\n", 371 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\numb_keywords\n", 372 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\len_cov\n", 373 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\bad_znak\n", 374 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_weekday\n", 375 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_hour\n", 376 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_day\n", 377 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_weekday\n", 378 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_hour\n", 379 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_day\n", 380 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_weekday\n", 381 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 
Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_hour\n", 382 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_day\n", 383 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_weekday\n", 384 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_hour\n", 385 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_day\n", 386 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_hour_weekday\n", 387 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_hour_country\n", 388 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_weekday_country\n", 389 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline_day_country\n", 390 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_hour_weekday\n", 391 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_hour_country\n", 392 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_weekday_country\n", 393 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\created_at_day_country\n", 394 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_hour_weekday\n", 395 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_hour_country\n", 396 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_weekday_country\n", 397 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at_day_country\n", 398 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_hour_weekday\n", 399 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_hour_country\n", 400 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 
Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_weekday_country\n", 401 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at_day_country\n", 402 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\normal_goal\n", 403 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-created_at_normal_goal\n", 404 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\launched_at-created_at_normal_goal\n", 405 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-created_at_normal_goal\n", 406 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-deadline_normal_goal\n", 407 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\deadline-launched_at_normal_goal\n", 408 | "/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\state_changed_at-launched_at_normal_goal\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "roman_model.save_in_file(data)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 29, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc',\n", 425 | " 'max_depth':8, 'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 5}" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 30, 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "feature_list = ['country_mean', \n", 437 | " 'currency_mean', \n", 438 | " 'disable_communication_int', \n", 439 | " 'normal_goal',\n", 440 | " 'deadline-created_at', \n", 441 | " 'launched_at-created_at', \n", 442 | " 'state_changed_at-created_at', \n", 443 | " 'deadline-launched_at',\n", 444 | " 'state_changed_at-deadline',\n", 445 | " 'state_changed_at-launched_at',\n", 446 | " 'deadline-created_at_normal_goal', \n", 447 | " 'launched_at-created_at_normal_goal', \n", 448 | " 'state_changed_at-created_at_normal_goal', \n", 449 | " 'deadline-launched_at_normal_goal',\n", 450 | " 'state_changed_at-deadline_normal_goal',\n", 451 | " 'state_changed_at-launched_at_normal_goal', \n", 452 | " 'len_name', \n", 453 | " 'len_desc', \n", 454 | " 'len_keywords', \n", 455 | " 'created_at_hour', 'created_at_weekday', 'created_at_day', \n", 456 | " 'deadline_hour', 'deadline_weekday', 'deadline_day', \n", 457 | " 'launched_at_hour', 'launched_at_weekday', 'launched_at_day', \n", 458 | " 'state_changed_at_hour', 'state_changed_at_weekday', 'state_changed_at_day', \n", 459 | " 'canceled', 'deadline_hour_weekday_mean',\n", 460 | "'created_at_hour_weekday_mean', \n", 461 | "'launched_at_hour_weekday_mean',\n", 462 | "'state_changed_at_hour_weekday_mean']" 463 | ] 464 | }, 
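Note on the *_mean entries in feature_list above: these are the smoothed, out-of-fold target-mean encodings produced by roman_model.cols_mean (implemented in mean_evaluation.py further down). For each category the encoding is (category_mean * category_count + global_mean * alpha) / (category_count + alpha) with alpha = 20, computed only from fold-wise training rows so a row never sees its own target value. The snippet below is a minimal, single-level sketch of that idea for one column; the function name smoothed_target_mean and its simplified interface are illustrative only and are not part of the original pipeline, which additionally nests a second cross-validation level and writes each fold's encoding to disk.

import pandas as pd
from sklearn.model_selection import StratifiedKFold

def smoothed_target_mean(col, target, n_splits=5, alpha=20, seed=322):
    # Out-of-fold smoothed target-mean encoding for a single categorical Series.
    # Mirrors the formula used in roman_mean.mean_eval; this helper is only an
    # illustration and is not called anywhere in the original code.
    encoded = pd.Series(-1.0, index=col.index)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, test_idx in skf.split(col, target):
        tr_col, tr_target = col.iloc[train_idx], target.iloc[train_idx]
        grouped = tr_target.groupby(tr_col)
        cat_mean, cat_count = grouped.mean(), grouped.count()
        glob_mean = tr_target.mean()
        encoded.iloc[test_idx] = [
            (cat_mean[c] * cat_count[c] + glob_mean * alpha) / (cat_count[c] + alpha)
            if c in cat_mean.index else glob_mean
            for c in col.iloc[test_idx]
        ]
    return encoded

# Hypothetical usage on the training rows only (the real pipeline also encodes
# the test rows from statistics of the full training set):
# country_mean = smoothed_target_mean(data.loc[:ltr - 1, 'country'], final_status)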
465 | { 466 | "cell_type": "code", 467 | "execution_count": 32, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "Calculate 1/10\n", 477 | "1/36\n" 478 | ] 479 | }, 480 | { 481 | "ename": "FileNotFoundError", 482 | "evalue": "File b'/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\\\features\\\\country_mean\\\\country_mean.csv' does not exist", 483 | "output_type": "error", 484 | "traceback": [ 485 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 486 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 487 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mroman_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredictSparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdic_par\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m# roman_model.predict(dic_par, stack_feat, 5000, True, False)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 488 | "\u001b[0;32m/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah/mean_evaluation.py\u001b[0m in \u001b[0;36mpredictSparse\u001b[0;34m(self, dic_par, sparse, feature_list, num_round, save, fscore, score)\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[0mfeature_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_feat_directory\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'\\\\'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 285\u001b[0;31m \u001b[0mfeature_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_feat_directory\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'\\\\'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfeature\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 286\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_col\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 287\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0msparse\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtocsr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 489 | "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 644\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 645\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 646\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 490 | "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 387\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 388\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 389\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 390\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mchunksize\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 491 | "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 729\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 730\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 731\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 492 | "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 921\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 922\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 923\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 924\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 925\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 493 | "\u001b[0;32m/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1388\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1390\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1391\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1392\u001b[0m \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 494 | "\u001b[0;32mpandas/parser.pyx\u001b[0m in \u001b[0;36mpandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)\u001b[0;34m()\u001b[0m\n", 495 | "\u001b[0;32mpandas/parser.pyx\u001b[0m in \u001b[0;36mpandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)\u001b[0;34m()\u001b[0m\n", 496 | "\u001b[0;31mFileNotFoundError\u001b[0m: File b'/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\\\features\\\\country_mean\\\\country_mean.csv' does not exist" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 
| "roman_model.predictSparse(dic_par, sp_data, feature_list, 5000, True, False)\n", 502 | "# roman_model.predict(dic_par, stack_feat, 5000, True, False)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": { 509 | "collapsed": true 510 | }, 511 | "outputs": [], 512 | "source": [] 513 | } 514 | ], 515 | "metadata": { 516 | "anaconda-cloud": {}, 517 | "kernelspec": { 518 | "display_name": "Python 3", 519 | "language": "python", 520 | "name": "python3" 521 | }, 522 | "language_info": { 523 | "codemirror_mode": { 524 | "name": "ipython", 525 | "version": 3 526 | }, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "nbconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": "3.5.2" 533 | } 534 | }, 535 | "nbformat": 4, 536 | "nbformat_minor": 1 537 | } 538 | -------------------------------------------------------------------------------- /Rank_1_Roman/mean_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[17]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import xgboost as xgb 9 | import pickle 10 | import re 11 | from sklearn.model_selection import StratifiedKFold 12 | from sklearn.feature_extraction import DictVectorizer 13 | from sklearn.model_selection import cross_val_predict 14 | from sklearn.metrics import roc_auc_score 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.ensemble import RandomForestClassifier 17 | from sklearn.preprocessing import OneHotEncoder 18 | from scipy.sparse import hstack 19 | import os 20 | from xgboost import XGBClassifier 21 | from sklearn.metrics import accuracy_score 22 | 23 | 24 | 25 | # In[18]: 26 | 27 | class roman_mean: 28 | def __init__(self, directory, data, target, n_folds_gen, n_folds_sub, seed, sub_seed, ltr, 29 | extra_train = None, extra_target = None): 30 | self.directory = directory 31 | self.n_folds_gen = n_folds_gen 32 | self.n_folds_sub = n_folds_sub 33 | self.seed = seed 34 | self.sub_seed = sub_seed 35 | self.ltr = ltr 36 | self.data = data 37 | self.target = target 38 | self.extra_train = extra_train 39 | self.extra_target = extra_target 40 | 41 | def save_in_file(self, data): 42 | for x in data.columns.values: 43 | directory = self.directory + '\\features\\' + x 44 | print(directory) 45 | if not os.path.exists(directory): 46 | os.makedirs(directory) 47 | else: 48 | print(x + ' already save.') 49 | continue 50 | data.loc[:, x].to_csv(directory + '\\' + x + '.csv', index = None, header = True) 51 | 52 | #mean_eval + mean_start + cols_mean Computation mean values by target with double cross_validation 53 | def mean_eval(self, pred, alpha, train_fold, test_fold, target, col_name): 54 | if type(self.extra_train) == type(None): 55 | cur_train = train_fold 56 | cur_target = target 57 | else: 58 | cur_train = pd.concat([self.extra_train.loc[:, col_name], train_fold], axis = 0) 59 | cur_train.index = range(len(cur_train)) 60 | cur_target = pd.concat([self.extra_target, target], axis = 0) 61 | cur_target.index = range(len(cur_target)) 62 | grouped = cur_target.groupby(cur_train) 63 | grouped_mean = grouped.mean().to_dict() 64 | grouped_count = grouped.count().to_dict() 65 | glob_mean = cur_target.mean() 66 | pred[list(test_fold.index)] = [(grouped_mean[x] * grouped_count[x] + glob_mean * alpha) / (grouped_count[x] + alpha) 67 | if x in grouped_mean else glob_mean for x in test_fold] 68 | 69 | def mean_start(self, 
col): 70 | kf_gen = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True) 71 | kf_sub = StratifiedKFold(n_splits=self.n_folds_sub, random_state=self.sub_seed, shuffle=True) 72 | alpha = 20 73 | directory = self.directory + '\\features\\' + col.name + '_mean' 74 | if not os.path.exists(directory): 75 | os.makedirs(directory) 76 | else: 77 | print(col.name + ' already exist.') 78 | return 79 | for i, (train_index, test_index) in enumerate(kf_gen.split(col[:self.ltr], self.target)): 80 | pred = pd.Series([-1] * len(col)) 81 | sub_col = col[train_index] 82 | sub_target = self.target[train_index] 83 | for j, (sub_train_index, sub_test_index) in enumerate(kf_sub.split(sub_col, sub_target)): 84 | self.mean_eval(pred, alpha, 85 | sub_col.iloc[sub_train_index], 86 | sub_col.iloc[sub_test_index], 87 | sub_target.iloc[sub_train_index], col.name) 88 | self.mean_eval(pred, alpha, 89 | col[train_index], 90 | col[test_index], 91 | self.target[train_index], col.name) 92 | self.mean_eval(pred, alpha, 93 | col[train_index], 94 | col[self.ltr:], 95 | self.target[train_index], col.name) 96 | pred.name = col.name + '_mean' 97 | pred.to_csv(self.directory + '\\features\\' + col.name + '_mean' + '\\' + str(i) 98 | + '.csv', index = None, header = True) 99 | 100 | def cols_mean(self, cols): 101 | for col in cols: 102 | print(col) 103 | self.mean_start(self.data.loc[:, col]) 104 | 105 | #Computation factor machine with double cross_validation 106 | 107 | #Computation logistic regression with double cross_validaton 108 | def cols_LR(self, feature_list): 109 | 110 | kf_gen = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True) 111 | kf_sub = StratifiedKFold(n_splits=self.n_folds_sub, random_state=self.sub_seed, shuffle=True) 112 | for i, (train_index, test_index) in enumerate(kf_gen.split(self.data[:self.ltr], self.target)): 113 | print(i) 114 | sp_data = pd.DataFrame() 115 | features_directory = self.directory + '\\features' 116 | col_i = 1 117 | for feature in feature_list: 118 | print(str(col_i) + '/' + str(len(feature_list))) 119 | col_i += 1 120 | cur_feat_directory = features_directory + '\\' + feature 121 | if len(os.listdir(cur_feat_directory)) > 1: 122 | feature_col = pd.read_csv(cur_feat_directory + '\\' + str(i) + '.csv') 123 | else: 124 | feature_col = pd.read_csv(cur_feat_directory + '\\' + feature + '.csv') 125 | sp_data = pd.concat([sp_data, feature_col], axis = 1) 126 | del feature_col 127 | pred = pd.Series([-1] * len(self.data)) 128 | clf = LogisticRegression(C = 20, n_jobs = -1) 129 | pred[train_index] = cross_val_predict(clf, sp_data.loc[train_index, :], self.target[train_index], cv = kf_sub, 130 | method = 'predict_proba', n_jobs = -1)[:, 1] 131 | print('OK') 132 | clf.fit(sp_data.loc[train_index, :], self.target[train_index]) 133 | pred[test_index] = clf.predict_proba(sp_data.loc[test_index, :])[:, 1] 134 | print(roc_auc_score(self.target[test_index], pred[test_index])) 135 | pred[self.ltr:] = clf.predict_proba(sp_data.loc[self.ltr:, :])[:, 1] 136 | pred.name = 'LR_true' 137 | directory = self.directory + '\\features\\LR_true2\\' 138 | if not os.path.exists(directory): 139 | os.makedirs(directory) 140 | pred.to_csv(directory + str(i) + '.csv', index = None, header = True) 141 | 142 | #Computation xgboost predict with double cross_validation 143 | def cols_XGB(self, feature_list, dic_par_list, num_round): 144 | features_directory = self.directory + '//features' 145 | 146 | kf_gen = StratifiedKFold(n_splits=self.n_folds_gen, 
random_state=self.seed, shuffle=True) 147 | kf_sub = StratifiedKFold(n_splits=self.n_folds_sub, random_state=self.sub_seed, shuffle=True) 148 | for i, (train_index, test_index) in enumerate(kf_gen.split(self.data[:self.ltr], self.target)): 149 | print('Calculate ' + str(i + 1) + '/' + str(self.n_folds_gen)) 150 | data = pd.DataFrame() 151 | col_i = 1 152 | for feature in feature_list: 153 | print(str(col_i) + '/' + str(len(feature_list))) 154 | col_i += 1 155 | cur_feat_directory = features_directory + '//' + feature 156 | if len(os.listdir(cur_feat_directory)) > 1: 157 | feature_col = pd.read_csv(cur_feat_directory + '//' + str(i) + '.csv') 158 | else: 159 | feature_col = pd.read_csv(cur_feat_directory + '//' + feature + '.csv') 160 | data = pd.concat([data, feature_col], axis = 1) 161 | #print(feature_col.columns) 162 | del feature_col 163 | print(i) 164 | for k, dic_par in enumerate(dic_par_list): 165 | pred = pd.Series([-1] * len(self.data)) 166 | for j, (sub_train_index, sub_test_index) in enumerate(kf_sub.split(self.data.loc[train_index, :], self.target[train_index])): 167 | print(i, k, j) 168 | xgall = xgb.DMatrix(data.loc[train_index[sub_train_index], :], self.target[train_index[sub_train_index]]) 169 | xgeval = xgb.DMatrix(data.loc[train_index[sub_test_index], :], self.target[train_index[sub_test_index]]) 170 | bst = xgb.train(dic_par, xgall, maximize=True, early_stopping_rounds=20, 171 | num_boost_round=num_round, evals=[(xgall, 'train'), (xgeval, 'test')], verbose_eval=False) 172 | pred[train_index[sub_test_index]] = bst.predict(xgeval) 173 | del xgall, xgeval, bst 174 | xgall = xgb.DMatrix(data.loc[train_index, :], self.target[train_index]) 175 | xg_cvtest = xgb.DMatrix(data.loc[test_index, :], self.target[test_index]) 176 | xg_test = xgb.DMatrix(data.loc[self.ltr:, :]) 177 | bst = xgb.train(dic_par, xgall, maximize=True, early_stopping_rounds=20, 178 | num_boost_round=num_round, evals=[(xgall, 'train'), (xg_cvtest, 'test')], verbose_eval=False) 179 | pred[test_index] = bst.predict(xg_cvtest) 180 | pred[self.ltr:] = bst.predict(xg_test) 181 | print(bst.best_score) 182 | print(roc_auc_score(self.target[test_index], pred[test_index])) 183 | name = 'XGB' + str(k + 3) 184 | pred.name = name 185 | directory = self.directory + '//features//' + name + '//' 186 | if not os.path.exists(directory): 187 | os.makedirs(directory) 188 | pred.to_csv(directory + str(i) + '.csv', index = None, header = True) 189 | del xgall, xg_cvtest, xg_test, bst, pred 190 | 191 | #Computation lightGBM with double cross_validaton 192 | 193 | #Computation SVD recommends with double cross_validation 194 | #Computation xgboost predict 195 | def predict(self, dic_par, feature_list, num_round, save = False, fscore = False): 196 | pred = pd.Series([-1] * len(self.data)) 197 | kf = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True) 198 | kf_split = kf.split(self.data.loc[:self.ltr-1, :], self.target) 199 | features_directory = self.directory + '\\features' 200 | pred_test = pd.DataFrame() 201 | score_list = [] 202 | tree_limit = [] 203 | pred_directory = self.directory + '\\models' 204 | model_number = 100 205 | model_directory = pred_directory + '\\model_' + str(model_number) 206 | if not os.path.exists(model_directory): 207 | os.makedirs(model_directory) 208 | for i, (train_index, test_index) in enumerate(kf_split): 209 | print('Calculate ' + str(i + 1) + '/' + str(self.n_folds_gen)) 210 | data = pd.DataFrame() 211 | col_i = 1 212 | for feature in feature_list: 213 | print(str(col_i) 
+ '/' + str(len(feature_list))) 214 | col_i += 1 215 | cur_feat_directory = features_directory + '\\' + feature 216 | if len(os.listdir(cur_feat_directory)) > 1: 217 | feature_col = pd.read_csv(cur_feat_directory + '\\' + str(i) + '.csv') 218 | else: 219 | feature_col = pd.read_csv(cur_feat_directory + '\\' + feature + '.csv') 220 | data = pd.concat([data, feature_col], axis = 1) 221 | xgall = xgb.DMatrix(data.loc[train_index, :], self.target[train_index]) 222 | xgeval = xgb.DMatrix(data.loc[test_index, :], self.target[test_index]) 223 | bst = xgb.train(dic_par, xgall, maximize=False, early_stopping_rounds=30, 224 | num_boost_round=num_round, evals=[(xgall, 'train'), (xgeval, 'test')], verbose_eval=50) 225 | del xgall, xgeval 226 | xg_cvtest = xgb.DMatrix(data.loc[test_index, :]) 227 | xg_test = xgb.DMatrix(data.loc[self.ltr:, :]) 228 | del data 229 | pred[test_index] = bst.predict(xg_cvtest, ntree_limit=bst.best_ntree_limit) 230 | score_list += [bst.best_score] 231 | if fscore == True: 232 | return bst.get_fscore() 233 | print(bst.best_score) 234 | print(bst.best_ntree_limit) 235 | tree_limit += [bst.best_ntree_limit] 236 | cur_pred = pd.DataFrame(bst.predict(xg_test, ntree_limit=bst.best_ntree_limit)) 237 | pred_test = pd.concat([pred_test, cur_pred], axis = 1) 238 | pred.to_csv(model_directory + '\\predict' + str(i) + '.csv') 239 | pred_test.to_csv(model_directory + '\\pred_test' + str(i) + '.csv') 240 | pred[self.ltr:] = np.array(pred_test.mean(axis = 1)) 241 | del xg_cvtest, xg_test, bst 242 | if save == True: 243 | pred_directory = self.directory + '\\models' 244 | if not os.path.exists(pred_directory): 245 | os.makedirs(pred_directory) 246 | model_number = len(os.listdir(pred_directory)) + 1 247 | model_directory = pred_directory + '\\model_' + str(model_number) 248 | if not os.path.exists(model_directory): 249 | os.makedirs(model_directory) 250 | f = open(model_directory + '\\info.txt', 'w') 251 | f.write('Model_' + str(model_number) + ' info:\n') 252 | for i, (x, y) in enumerate(zip(score_list, tree_limit)): 253 | f.write('Fold ' + str(i + 1) + ': Score: ' + str(x) + ' Tree_number: ' + str(y) + '\n') 254 | f.write('Model score:' + str(1 - np.mean(score_list))) 255 | f.close() 256 | pred.to_csv(model_directory + '\\predict.csv') 257 | del pred 258 | 259 | def predictSparse(self, dic_par, sparse, feature_list, num_round, save = False, fscore = False, score = False): 260 | pred = pd.Series([-1] * len(self.data)) 261 | kf = StratifiedKFold(n_splits=self.n_folds_gen, random_state=self.seed, shuffle=True) 262 | kf_split = kf.split(self.data.loc[:self.ltr-1, :], self.target) 263 | features_directory = self.directory + '\\features' 264 | pred_test = pd.DataFrame() 265 | score_list = [] 266 | tree_limit = [] 267 | pred_directory = self.directory + '\\models' 268 | model_number = 100 269 | model_directory = pred_directory + '\\model_' + str(model_number) 270 | if not os.path.exists(model_directory): 271 | os.makedirs(model_directory) 272 | for i, (train_index, test_index) in enumerate(kf_split): 273 | if score == False: 274 | print('Calculate ' + str(i + 1) + '/' + str(self.n_folds_gen)) 275 | data = pd.DataFrame() 276 | col_i = 1 277 | for feature in feature_list: 278 | if score == False: 279 | print(str(col_i) + '/' + str(len(feature_list))) 280 | col_i += 1 281 | cur_feat_directory = features_directory + '\\' + feature 282 | if len(os.listdir(cur_feat_directory)) > 1: 283 | feature_col = pd.read_csv(cur_feat_directory + '\\' + str(i) + '.csv') 284 | else: 285 | feature_col = 
pd.read_csv(cur_feat_directory + '\\' + feature + '.csv') 286 | data = pd.concat([data, feature_col], axis = 1) 287 | data = hstack([data, sparse]).tocsr() 288 | xgall = xgb.DMatrix(data[train_index], self.target[train_index]) 289 | xgeval = xgb.DMatrix(data[test_index], self.target[test_index]) 290 | bst = xgb.train(dic_par, xgall, maximize=False, early_stopping_rounds=50, 291 | num_boost_round=num_round, evals=[(xgall, 'train'), (xgeval, 'test')], verbose_eval=20) 292 | del xgall, xgeval 293 | xg_cvtest = xgb.DMatrix(data[test_index]) 294 | xg_test = xgb.DMatrix(data[self.ltr:]) 295 | del data 296 | pred[test_index] = bst.predict(xg_cvtest, ntree_limit=bst.best_ntree_limit) 297 | score_list += [bst.best_score] 298 | if fscore == True: 299 | return bst.get_fscore() 300 | print(bst.best_score) 301 | print(bst.best_ntree_limit) 302 | tree_limit += [bst.best_ntree_limit] 303 | cur_pred = pd.DataFrame(bst.predict(xg_test, ntree_limit=bst.best_ntree_limit)) 304 | pred_test = pd.concat([pred_test, cur_pred], axis = 1) 305 | pred.to_csv(model_directory + '\\predict' + str(i) + '.csv') 306 | pred_test.to_csv(model_directory + '\\pred_test' + str(i) + '.csv') 307 | pred[self.ltr:] = np.array(pred_test.mean(axis = 1)) 308 | if score == True: 309 | return accuracy_score(self.target, round(pred[:self.ltr]).astype(int)) 310 | del xg_cvtest, xg_test, bst 311 | if save == True: 312 | pred_directory = self.directory + '\\models' 313 | if not os.path.exists(pred_directory): 314 | os.makedirs(pred_directory) 315 | model_number = len(os.listdir(pred_directory)) + 1 316 | model_directory = pred_directory + '\\model_' + str(model_number) 317 | if not os.path.exists(model_directory): 318 | os.makedirs(model_directory) 319 | f = open(model_directory + '\\info.txt', 'w') 320 | f.write('Model_' + str(model_number) + ' info:\n') 321 | for i, (x, y) in enumerate(zip(score_list, tree_limit)): 322 | f.write('Fold ' + str(i + 1) + ': Score: ' + str(x) + ' Tree_number: ' + str(y) + '\n') 323 | f.write('Model score:' + str(1 - np.mean(score_list))) 324 | f.close() 325 | pred.to_csv(model_directory + '\\predict.csv') 326 | del pred 327 | 328 | -------------------------------------------------------------------------------- /Rank_1_Roman/start.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 15 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import xgboost as xgb\n", 23 | "import datetime\n", 24 | "import re\n", 25 | "from sklearn.svm import LinearSVC\n", 26 | "from sklearn.metrics import accuracy_score" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "CREATE DATA" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "train = pd.read_csv('train.csv')\n", 45 | "test = pd.read_csv('test.csv')\n", 46 | "\n", 47 | "final_status = train.final_status\n", 48 | "projest_id = train.project_id\n", 49 | "backers_count = train.backers_count\n", 50 | "\n", 51 | "ltr = len(train)\n", 52 | "train.drop(['final_status', 'backers_count'], axis = 1, inplace = True)\n", 53 | "\n", 54 | "data = pd.concat([train, test], axis = 0)\n", 55 | "data.index = range(len(data))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "int_disable_communication = []\n", 67 | "for x in data.disable_communication.tolist():\n", 68 | " if x == False:\n", 69 | " int_disable_communication += [0]\n", 70 | " else:\n", 71 | " int_disable_communication += [1]\n", 72 | "data['disable_communication'] = int_disable_communication" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "data['deadline-created_at'] = data.deadline - data.created_at\n", 84 | "data['launched_at-created_at'] = data.deadline - data.created_at\n", 85 | "data['state_changed_at-created_at'] = data.deadline - data.created_at\n", 86 | "data['state_changed_at-deadline'] = data.state_changed_at - data.deadline\n", 87 | "data['deadline-launched_at'] = data.deadline - data.launched_at\n", 88 | "data['state_changed_at-launched_at'] = data.state_changed_at - data.launched_at" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "data['len_name'] = [len(str(x)) for x in data.name.tolist()]\n", 100 | "data['len_desc'] = [len(str(x)) for x in data.desc.tolist()]\n", 101 | "data['len_keywords'] = [len(str(x)) for x in data.keywords.tolist()]\n", 102 | "data['numb_keywords'] = [len(str(x).split('-')) for x in data.keywords.tolist()]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "len_cov = []\n", 114 | "for x in data.desc.tolist():\n", 115 | " tokens = re.findall('\\\"', str(x))\n", 116 | " len_cov += [len(tokens)]\n", 117 | "data['len_cov'] = len_cov\n", 118 | "data['bad_znak'] = data['len_cov'] / data['len_desc']" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 7, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "normal_goal = []\n", 130 | "for x, y in zip(data.currency.tolist(), data.goal.tolist()):\n", 131 | " if x == 'USD':\n", 132 | " normal_goal += [y]\n", 133 | " if x == 'GBP':\n", 134 | " normal_goal += [1.5 * y]\n", 135 | " if x == 'EUR':\n", 136 | " normal_goal += [1.2 * y]\n", 137 | " if x 
== 'CAD':\n", 138 | " normal_goal += [0.85 * y]\n", 139 | " if x == 'AUD':\n", 140 | " normal_goal += [0.85 * y]\n", 141 | " if x == 'SEK':\n", 142 | " normal_goal += [0.14 * y]\n", 143 | " if x == 'NZD':\n", 144 | " normal_goal += [0.70 * y]\n", 145 | " if x == 'DKK':\n", 146 | " normal_goal += [0.17 * y]\n", 147 | " if x == 'NOK':\n", 148 | " normal_goal += [0.15 * y]\n", 149 | " if x == 'CHF':\n", 150 | " normal_goal += [y]\n", 151 | " if x == 'MXN':\n", 152 | " normal_goal += [0.07 * y]\n", 153 | " if x == 'SGD':\n", 154 | " normal_goal += [0.73 * y]\n", 155 | " if x == 'HKD':\n", 156 | " normal_goal += [0.13 * y]\n", 157 | "\n", 158 | "data['normal_goal'] = normal_goal\n", 159 | "data['deadline-created_at_normal_goal'] = data.loc[:, 'deadline-created_at'] / data.normal_goal\n", 160 | "data['launched_at-created_at_normal_goal'] = data.loc[:, 'launched_at-created_at'] / data.normal_goal\n", 161 | "data['state_changed_at-created_at_normal_goal'] = data.loc[:, 'state_changed_at-created_at'] / data.normal_goal\n", 162 | "data['state_changed_at-deadline_normal_goal'] = data.loc[:, 'state_changed_at-deadline'] / data.normal_goal\n", 163 | "data['deadline-launched_at_normal_goal'] = data.loc[:, 'deadline-launched_at'] / data.normal_goal\n", 164 | "data['state_changed_at-launched_at_normal_goal'] = data.loc[:, 'state_changed_at-launched_at'] / data.normal_goal" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 8, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "deadline_hour_weekday\n", 179 | "created_at_hour_weekday\n", 180 | "launched_at_hour_weekday\n", 181 | "state_changed_at_hour_weekday\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "time_feat = ['deadline', 'created_at', 'launched_at', 'state_changed_at']\n", 187 | "for time in time_feat:\n", 188 | " weekday = []\n", 189 | " hour = []\n", 190 | " day = []\n", 191 | " for x in data.loc[:, time].tolist():\n", 192 | " weekday += [datetime.datetime.fromtimestamp(x).weekday()]\n", 193 | " hour += [datetime.datetime.fromtimestamp(x).hour]\n", 194 | " day += [datetime.datetime.fromtimestamp(x).day]\n", 195 | " data[time + '_' + 'weekday'] = weekday\n", 196 | " data[time + '_' + 'hour'] = hour\n", 197 | " data[time + '_' + 'day'] = day\n", 198 | " \n", 199 | "for time in time_feat:\n", 200 | " print(time + '_' + 'hour_weekday')\n", 201 | " data[time + '_' + 'hour_weekday'] = data[time + '_' + 'hour'].astype(str) + '_' + data[time + '_' + 'weekday'].astype(str)\n", 202 | " data[time + '_' + 'hour_country'] = data[time + '_' + 'hour'].astype(str) + '_' + data['country'].astype(str)\n", 203 | " data[time + '_' + 'weekday_country'] = data[time + '_' + 'weekday'].astype(str) + '_' + data['country'].astype(str)\n", 204 | " data[time + '_' + 'day_country'] = data[time + '_' + 'day'].astype(str) + '_' + data['country'].astype(str)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "canceled = []\n", 216 | "for x in data.name.tolist():\n", 217 | " if len(re.findall('Canceled', str(x))) > 0:\n", 218 | " canceled += [1]\n", 219 | " else:\n", 220 | " canceled += [0]\n", 221 | "data['canceled'] = canceled" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "for x in 
['deadline_hour_weekday',\n", 233 | "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday']:\n", 234 | " for y in ['country', 'currency']:\n", 235 | " data[x + y] = (data[x] + data[y]).astype('category').cat.codes\n", 236 | "\n", 237 | "for x in ['deadline_hour_weekday',\n", 238 | "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday']:\n", 239 | " for y in ['country']:\n", 240 | " for z in ['currency']:\n", 241 | " data[x + y] = (data[x] + data[y] + data[z]).astype('category').cat.codes\n", 242 | " \n", 243 | "for x in ['country', 'currency', 'deadline_hour_weekday',\n", 244 | "'created_at_hour_weekday', 'launched_at_hour_weekday', 'state_changed_at_hour_weekday']:\n", 245 | " data[x] = data[x].astype('category').cat.codes" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "for x in ['deadline_hour_country', 'deadline_weekday_country', 'deadline_day_country', 'created_at_hour_country',\n", 257 | "'created_at_weekday_country', 'created_at_day_country', 'launched_at_hour_country', 'launched_at_weekday_country',\n", 258 | "'launched_at_day_country', 'state_changed_at_hour_country', 'state_changed_at_weekday_country', 'state_changed_at_day_country']:\n", 259 | " data[x] = data[x].astype('category').cat.codes" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 12, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 271 | "\n", 272 | "name = data.name.fillna('None').tolist()\n", 273 | "tfidf = TfidfVectorizer(max_features = 500, stop_words = 'english', ngram_range = (1, 2))\n", 274 | "name_vect = tfidf.fit_transform(name)\n", 275 | "\n", 276 | "desc = data.desc.fillna('None').tolist()\n", 277 | "tfidf = TfidfVectorizer(max_features = 2000, stop_words = 'english', ngram_range = (1, 4))\n", 278 | "desc_vect = tfidf.fit_transform(desc)\n", 279 | "\n", 280 | "keywords = data.keywords.tolist()\n", 281 | "tfidf = TfidfVectorizer(max_features = 1000, stop_words = 'english', ngram_range = (1, 3))\n", 282 | "keywords_vect = tfidf.fit_transform(keywords)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": { 289 | "collapsed": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "from scipy.sparse import hstack\n", 294 | "\n", 295 | "sp_data = hstack([keywords_vect, name_vect, desc_vect]).tocsr()\n", 296 | "del tfidf, keywords_vect, name_vect, desc_vect" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "LEARN MODEL" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 14, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "1\n", 318 | "1\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "name = data.name.fillna('None').tolist()\n", 324 | "tfidf = TfidfVectorizer(max_features = 2000, ngram_range = (1, 6), analyzer = 'char')\n", 325 | "name_char = tfidf.fit_transform(name)\n", 326 | "\n", 327 | "print(1)\n", 328 | "\n", 329 | "desc = data.desc.fillna('None').tolist()\n", 330 | "tfidf = TfidfVectorizer(max_features = 6000, ngram_range = (1, 6), analyzer = 'char')\n", 331 | "desc_char = tfidf.fit_transform(desc)\n", 332 | "print(1)\n", 333 | 
"keywords = data.keywords.tolist()\n", 334 | "tfidf = TfidfVectorizer(max_features = 3000, ngram_range = (1, 6), analyzer = 'char')\n", 335 | "keywords_char = tfidf.fit_transform(keywords)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 15, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "sp_data = hstack([sp_data, name_char, desc_char, keywords_char]).tocsr()\n", 347 | "del tfidf, name_char, desc_char, keywords_char" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 16, 353 | "metadata": { 354 | "collapsed": true 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "pred = pd.DataFrame()\n", 359 | "pred['svc'] = [-1] * (len(test))\n", 360 | "clf = LinearSVC()\n", 361 | "clf.fit(sp_data[:ltr], final_status)\n", 362 | "pred['svc'] = clf.predict(sp_data[ltr:])" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 17, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "from sklearn.linear_model import LogisticRegression\n", 374 | "clf = LogisticRegression(C = 2)\n", 375 | "clf.fit(sp_data[:ltr], final_status)\n", 376 | "pred['logreg'] = clf.predict_proba(sp_data[ltr:])[:, -1]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 18, 382 | "metadata": { 383 | "collapsed": true 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "data = np.array(data.drop(['project_id', 'name', 'desc', 'keywords'], axis = 1))\n", 388 | "sp_data1 = hstack([sp_data, data]).tocsr()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 19, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "del data, sp_data" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 20, 405 | "metadata": { 406 | "collapsed": true 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'error',\n", 411 | " 'max_depth':5, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 5}\n", 412 | "dtest = xgb.DMatrix(sp_data1[ltr:, 14500:])\n", 413 | "dtrain = xgb.DMatrix(sp_data1[:ltr, 14500:], label=final_status)\n", 414 | "\n", 415 | "bst = xgb.train(dic_par, dtrain, 222)\n", 416 | "pred['xgb1'] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 21, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "del dtrain, dtest, bst" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 22, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'error',\n", 439 | " 'max_depth':8, 'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 5}\n", 440 | "dtest = xgb.DMatrix(sp_data1[ltr:])\n", 441 | "dtrain = xgb.DMatrix(sp_data1[:ltr], label=final_status)\n", 442 | "\n", 443 | "bst = xgb.train(dic_par, dtrain, 416)\n", 444 | "pred['xgb2'] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 23, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "del dtrain, dtest, bst" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 24, 461 | "metadata": { 
462 | "collapsed": true 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "dic_par = {'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'error',\n", 467 | " 'max_depth':7, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 6}\n", 468 | "dtest = xgb.DMatrix(sp_data1[ltr:])\n", 469 | "dtrain = xgb.DMatrix(sp_data1[:ltr], label=final_status)\n", 470 | "\n", 471 | "bst = xgb.train(dic_par, dtrain, 228)\n", 472 | "pred['xgb3'] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 25, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "del dtrain, dtest, bst" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 26, 489 | "metadata": { 490 | "collapsed": true 491 | }, 492 | "outputs": [], 493 | "source": [ 494 | "import lightgbm as lgb" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 27, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "params = {\n", 506 | " 'objective': 'binary',\n", 507 | " 'metric': 'binary_error',\n", 508 | " 'num_leaves': 80,\n", 509 | " 'learning_rate': 0.1,\n", 510 | " 'feature_fraction': 0.9,\n", 511 | " 'bagging_fraction': 0.8,\n", 512 | " 'bagging_freq': 2\n", 513 | "}\n", 514 | "lgb_train = lgb.Dataset(sp_data1[:ltr], np.array(final_status))\n", 515 | "gbm = lgb.train(params,\n", 516 | " lgb_train,\n", 517 | " num_boost_round=148)\n", 518 | "pred['lgb1'] = gbm.predict(sp_data1[ltr:])" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 30, 524 | "metadata": { 525 | "collapsed": false 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "del lgb_train" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 31, 535 | "metadata": { 536 | "collapsed": true 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "params = {\n", 541 | " 'objective': 'binary',\n", 542 | " 'metric': 'binary_error',\n", 543 | " 'num_leaves': 50,\n", 544 | " 'learning_rate': 0.1,\n", 545 | " 'feature_fraction': 0.8,\n", 546 | " 'bagging_fraction': 0.7,\n", 547 | "}\n", 548 | "lgb_train = lgb.Dataset(sp_data1[:ltr, 14500:], np.array(final_status))\n", 549 | "gbm = lgb.train(params,\n", 550 | " lgb_train,\n", 551 | " num_boost_round=124)\n", 552 | "pred['lgb2'] = gbm.predict(sp_data1[ltr:, 14500:])" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 34, 558 | "metadata": { 559 | "collapsed": false 560 | }, 561 | "outputs": [], 562 | "source": [ 563 | "del lgb_train" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "READ PREDICT FROM ANOTHER FILE" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 35, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "pred_new = pd.read_csv('predict.csv', header = None)\n", 582 | "pred['xgb_old'] = pred_new[1][ltr:].tolist()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "ADD BAGGING(SPLIT BY TIME INTERVAL, COEF FROM LINEAR REGRESSION WITH NOISE)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 36, 595 | "metadata": { 596 | "collapsed": false 597 | }, 598 | "outputs": [], 599 | "source": [ 600 | "final_pred = pd.Series([-100] * len(test))\n", 601 | "col = ['logreg', 'xgb1', 'xgb2', 'xgb3', 'lgb1', 'lgb2', 'xgb_old']\n", 602 
| "new_col = [x + '1' for x in col]\n", 603 | "pred[new_col] = pred[col].round()" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 37, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "date = pd.to_datetime(test.created_at,unit='s')" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 38, 620 | "metadata": { 621 | "collapsed": false 622 | }, 623 | "outputs": [], 624 | "source": [ 625 | "y2017 = date[date.dt.year == 2017].index\n", 626 | "coef = [-0.027163179271743373, 0.50829964473673006, 0.21379022476789045, 0.21797444121259385, \n", 627 | " 0.10735714061372345, 0.17883463215622081, -0.11532700181862125, 0.0054078524855090682, \n", 628 | " 0.019044340523711512, 0.020904671380734853, 0.016799278763598158, -0.035555538613519899, \n", 629 | " .070046648507590167, 0.015530628742049274, 0.041652250420829068]\n", 630 | "final_pred[y2017] = (np.array(pred.loc[y2017, :]) * coef).sum(axis = 1).round()" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 39, 636 | "metadata": { 637 | "collapsed": true 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "y2010 = date[date.dt.year.isin([2010,2011,2012,2013,2014])].index\n", 642 | "\n", 643 | "coef = [-0.027163179271743373, 0.50829964473673006, 0.21379022476789045, 0.21797444121259385, 0.10735714061372345, \n", 644 | " 0.17883463215622081, -0.11532700181862125, 0.0054078524855090682, 0.019044340523711512, 0.020904671380734853, \n", 645 | " 0.016799278763598158, -0.035555538613519899, 0.070046648507590167, 0.015530628742049274, 0.041652250420829068]\n", 646 | "final_pred[y2010] = (np.array(pred.loc[y2010, :]) * coef).sum(axis = 1).round()" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 40, 652 | "metadata": { 653 | "collapsed": true 654 | }, 655 | "outputs": [], 656 | "source": [ 657 | "coef_list2015 = [[[-0.41383995384225819, -0.16307572346026833, -0.10896588031522875, 0.35471079322753063, 0.22390883115227922, -0.32863241243857799, 0.097549160081059405, 1.1372073765804023, 0.87153515112747171, -0.080863813424291636, 0.28487078798716486, -0.55400069424988796, -0.17659008257259162, 0.35160714276697574, -0.21936988430087989], -0.08629609882731698], [[-0.39286094196229099, 0.34510484198718872, 0.89787931598668014, -1.486627118144938, 1.245325850060452, 1.1053038953569081, -1.4454198657086255, 0.55721922803318291, 0.27085791705579459, 0.064604935954636344, 0.13771723388179469, -0.11899338973184959, -0.064443006982082052, 0.14364967167706721, -0.27374821149664608], 0.0036556793890552552], [[0.083346325658424161, -0.12831268413013974, 1.1959007032760487, -0.5233849336825912, 0.39900940712026656, 0.42745138413344769, -2.1143414710142512, 1.4761439331118933, 0.083346325658423981, -0.10015748315347639, -0.47897883986230105, 0.40147546444362403, 0.31978463221249886, 0.46676606100165352, -0.43921808570249288], 0.10734424730150077], [[-0.025017998270211974, 0.8149649690076205, 0.22134553385093006, -0.27829229874696421, 0.2163670138984588, 1.4214632555094264, 0.020124861567298286, -0.66832723045793496, -0.11756296346591055, 0.024269691750705785, 0.20962183305716242, -0.39990573074287339, -0.17063736111252792, 0.052916305919370354, 0.27456099230977582], -0.087124834756392822], [[-0.17857188656760381, -0.50018596299048845, 1.4635581312988146, -0.0504486329542157, 1.4202366056430444, 0.13200100751257016, -1.3792525358813492, -0.0043358756294259448, 0.61417616538769393, -0.091765531130930111, 
-0.1849941363492198, -0.28635895012223983, 0.08344268469493421, 0.029755702251762495, -0.12095137688527768], 0.059323755834263248], [[0.23266570483925128, 0.59183162350733021, 0.85228046871709484, -0.18994063134104572, -0.13074213291852804, -0.57422222869450446, -0.55415140450869049, 1.4479664887719843, -0.50511446356786971, -0.3152090047679722, -0.12621744624435693, 0.053571357532884112, 0.57767217434333518, 0.26510233281407025, -0.37662132192200815], 0.013015069835713933], [[0.079112999667759101, 0.53610646520732519, -1.155397987642315, 0.2565622200003016, 0.14382146435348217, 1.3073772692753969, 0.87854749768335161, -0.42129780803337913, -0.24246377511520789, -0.055366480326769274, -0.14171743379467075, 0.26797543629685394, 0.19325962796902985, 0.062853835080842801, -0.31020409520730441], -0.001021638893230814], [[-0.14140528990693546, -0.4157381903396174, 0.32643815251828778, 0.47829470762966453, 0.14540838047690763, -0.070953783338113974, 0.07259713275983698, 1.0273916737378574, 0.2105072342428041, -0.15848873991231688, 0.23483160420629368, -0.37459121068207463, 0.03737705965952931, 0.092195302291443681, -0.085222845716571793], 0.0078031063428684044], [[0.10550662841276728, 0.72612646807507764, 0.88080140234336735, -0.058116892446551854, -0.30759765699777342, 0.85370792127323414, -0.73696074165507164, 0.1901825008658854, -0.18077110440369148, -0.1083241428815005, 0.10296198995045747, 0.13142052324342346, -0.1441361965581997, 0.15228816196374362, -0.16334161610384135], -0.042526473797645292], [[0.033740498382682026, 0.080843506890849542, -0.083625278536957698, 0.24728436962909617, -0.55848674776303997, 0.83791202376532703, 0.048375547816753614, 1.1468314736516987, -0.069594498991358028, -0.055720452596246917, 0.037641457795474897, 0.043932420232981995, -0.049062634778156131, -0.012202354713624297, -0.21266052896534438], 0.0022399725905200008], [[-0.17409663626947336, 0.33843178141729097, -0.53125937805643997, 0.79958304675601433, 0.04106395228403914, 0.034285833381269976, 0.70360379309129073, 0.45037001128304066, -0.009205923702995028, -0.031796753471828643, 0.033143113104870336, -0.20158329715391332, 0.32201721875719203, -0.26813488809618247, -0.08392575310327377], -0.013782307499392965], [[-0.30701584284152922, 0.5942492683928583, 0.23239298742646675, 0.51924826923963019, -0.018419637046819481, 0.45063965311071841, 0.71939432972730277, -0.99474793277483242, 0.43454889983497075, -0.01691951305694267, -0.12875288169134289, -0.2424468901124931, -0.051384055344316615, -0.026109141686985748, 0.33732870822515598], -0.022383224751678088], [[0.15177273114974479, -0.026347320130969451, 0.25714275716769786, -0.60081133386779295, 1.5031631570715014, 0.43161693644893617, -0.29902155663521324, 0.098163961202380037, -0.15912121243201116, -0.19138392116446523, 0.13537943701624849, -0.1437814082267192, -0.0049560636108855194, 0.13559807851976091, 0.049246074378967775], 0.022938205951003654], [[0.083373645090151383, 0.44047294051683972, 0.74861778201841345, -0.20226722175100323, 0.87339331088680272, -0.27091310474858565, -0.6773592374808135, 0.28950885701085288, -0.11428941375895169, 0.064335136098343737, 0.085030349125940985, -0.10794608697245556, 0.056561839146722304, 0.24180383330031219, -0.11578746024682307], -0.046245635922416151], [[-0.050371576006364556, 0.6726724372394467, 0.25161075195261418, 0.67810570052226105, 0.59750443557138855, -0.4095965095982248, 0.087341165718058555, -0.01399886708101386, -0.1129046724188246, 0.10705832882972, -0.08347928741234778, 0.036823106059101052, 
-0.046602563791332902, -0.060134854954618455, 0.053886915926962181], -0.070925883612583063], [[0.077757796330699722, 0.29439865068850385, 0.43827025281740206, 0.35369973922058889, -0.078012553798705031, 0.075846743133362399, -0.45084175214799821, 0.77945829571817871, -0.031010811043684208, 0.0021613621474618205, 0.0078662533977770233, 0.10998001384693995, -0.14270865670379468, 0.061212808939159924, -0.17481214510521037], -0.0082442323680364527], [[-0.0064521578344324771, 0.41694436929706225, 0.20744985991777531, 0.35062836067520409, 0.29385875391622807, 0.16496024305653861, 0.064932604791711881, 0.25991103584696063, -0.1497593824171288, -0.18249832759070406, -0.14947770489950465, -0.10636999044390935, 0.1030929747886534, -0.0052910789341637787, 0.0053631808788744129], -0.050704163539114], [[0.22618710579684795, 0.31056171954317158, 0.21148323865181329, 0.56238236792633811, -0.38802270579054898, 0.50123862939552399, 0.013060952495983269, 0.42142316833453486, -0.14627618413647422, -0.00074587493405625649, -0.080276015934133693, -0.020160317076800549, -0.065571331961821161, 0.11343891541036422, -0.12186886136797415], -0.067370582777799337], [[-0.021166784463613177, 0.37550565405307484, 0.40151444164484273, 0.55020638162551849, -0.40732832938350261, 0.52349114862659396, 0.092195903521491318, 0.00076687090595081589, 0.056342743605521772, 0.072530193047288166, -0.025441056107622151, 0.11105505003182653, -0.15479516729304266, -0.039872341320651872, 0.018317039855248352], -0.068029487047780002], [[-0.038502206012433142, 0.1846914167264177, -0.012346366389744123, -0.32675224793022695, 0.32883359264341405, 0.80578704244427946, 0.20223654563194438, 0.2867340017678921, 0.050752578137505111, 0.073907401772084463, 0.049807673019682697, -0.055565730740980895, -0.19496891199125929, -0.028915831048797663, 0.062628046019031713], -0.048029081326567191], [[0.10091406807450043, 0.29974702524870739, 0.4129055100414003, 0.081063474592990709, 0.73313546834260235, -0.050258705754476329, -0.076959817628007077, 0.15423294085164146, -0.063825304014272444, -0.016442470293993372, -0.058599320299935864, -0.05472523946420732, 0.026006936119348345, -0.025964162544909519, -0.055132724117975374], -0.057617515387948792], [[-0.13412571009324611, 0.28867598295950708, 0.83654410646535038, 0.28744844076126608, 0.096612475021383093, 0.22575683170007652, -0.38651929812922864, 0.22053437132158993, 0.091632146031291467, -0.092885990626032311, -0.085645502553226185, 0.064797226467597363, 0.071972380240118095, 0.0043700370634170427, -0.013152856788972489], -0.053802826760923406], [[0.11128812962073018, 0.090281010573965148, 0.4713430340572165, 0.49463928492923137, -0.85945069178762168, 1.1062059214649451, -0.11803519286306985, 0.27940357132598365, -0.010427237053803481, 0.06616255841965657, 0.047939009754045747, 0.13430250169613434, -0.26490673303777479, -0.095745368665667463, -0.058540872952473344], -0.035069013726363951], [[0.022176366776143132, 0.26783831291371463, 0.28277588311753432, 0.47846058106773559, 0.16097025556464878, 0.2571532474206788, 0.012421942353549095, 0.070547552636659627, 0.0038074992477577507, 0.060850205792590858, 0.011059711916478789, -0.17663880153962472, -0.08402572264726911, -0.048057976864883084, 0.10959690375543049], -0.073200989594718324], [[-0.090304449224284669, 0.49505311698570725, 0.9053338838928614, 0.40652720285926353, -0.11490412096443998, -0.098850876691273903, -0.39369186142375756, 0.34103526425569303, 0.038250938244487998, -0.029405985065134366, -0.18598476424640928, 0.08084270069570032, 
0.090540519485523802, 0.057132674845881665, 0.052513088783564199], -0.07959706965928609], [[0.034032949283043942, 0.43840900310101066, 0.42569503775973472, 0.028574249007546057, 0.37555142622684001, 0.35767653562159568, -0.11049481867598984, 0.2268268093364002, -0.10829224779425944, -0.056374962049798139, -0.056207087776516756, -0.13788653698481018, 0.11076960903459684, -0.0010955468465459473, -0.011783861552982333], -0.056547139659597401], [[-0.083736262982500836, 0.51790857341129104, 0.22985557359544206, 0.036319451048168069, 0.08837876334638084, 0.70331749885554329, 0.13589645438844405, 0.20851642148244737, 0.022701999775491616, 0.083971680331921317, -0.079447240778194567, -0.099434288371511095, -0.14378732684723694, -0.070311448520760128, 0.036857677991006033], -0.10836956431151584], [[-0.068557504813032605, 0.26378895686980636, -0.26641784069181468, 0.040062426713872612, 0.56711754869939868, 0.42277416390708517, 0.2490748737220656, 0.30443647171716692, 0.062113553145254408, 0.040751176294925917, 0.056732236572030637, -0.074881428530962302, 0.003889022629293859, -0.053635216449648804, -0.056706126189836753], -0.056701181353114083], [[-0.17546570273489157, 0.43384915148572484, 0.16508675957300636, 0.44127927336104517, -0.88372309184143083, 0.98728170988045361, 0.28839230234366586, 0.27573818316821119, 0.2000919673725336, 0.03301671253297378, 0.14169158656984532, -0.0041617381686023802, -0.27982885612923369, -0.10830269104552115, -0.05674728609894264], -0.060562733003362679], [[-0.07666996966260306, 0.47714699207326589, 0.22605680907628603, 0.31502624020937509, -0.36475049392442677, 0.60351163741844072, -0.019577217727722446, 0.19943273776652326, 0.043846737957801002, -0.017958288969666725, 0.13689204825568435, 0.061463101783768737, -0.19770044948964921, 0.10110510955148963, -0.012943943165161009], -0.05128653661657806], [[-0.16132807588827527, 0.41774138045343401, -0.015151873430680387, 0.25997315921009484, 0.5763079399248735, -0.1113946094014111, 0.20336140106543807, 0.21908978843202909, 0.14287374729900668, -0.068921910922539964, -0.15961689075722965, -0.020865388903675952, 0.064943390319233324, -0.020705354823424116, 0.10982466710622138], -0.066536479411353544], [[0.033420578920728417, 0.36261397183334343, 0.11522342338398239, 0.26811771127358552, 0.45928653716345469, 0.088297155469125865, 0.046621023088578961, 0.16860717733653874, -0.10814273405689259, 0.09233421397035077, 0.061115125945500309, -0.10779632497544339, -0.011971386582525445, -0.060966834403295866, 0.013015925716524157], -0.041185756159896558], [[-0.2001934238267373, 0.35756821822539775, 0.65629248341856783, 0.7222437647466009, -0.45124717573004536, 0.16166894454599889, -0.55679306764127956, 0.74961179323168303, 0.11106026591460433, -0.044603850777884424, -0.069731305019352152, 0.016434566356183844, -0.14644456070896128, 0.10611304915772002, 0.037700570463224614], -0.071258519696936196], [[-0.10295951257957371, 0.26497789321265153, 0.17735696503865289, 0.25004078842064581, -0.1647187228055734, 0.65231499049870001, 0.43516256847086809, 0.11431377802962625, 0.10800730125931264, -0.14992026834471867, -0.12102503699504252, 0.12357349660658856, -0.082886271233495457, -0.067460966622815949, 0.036317888288777633], -0.070039765613889027], [[-0.096907389037331454, 0.17050439167661569, 0.64205061346879988, 0.76313628034326009, -0.0044909688331003783, 0.30761665709503117, -0.31100357813475199, 0.025406843425650694, 0.14796448023372932, -0.083493968100144422, -0.17118020356075014, -0.024403338970001798, -0.092876865018402421, 
0.071073147526174274, 0.047936643137305279], -0.038680520288416842], [[0.079948716058980601, 0.31351124735614011, 0.55810534252532806, 0.61721371456929985, -0.20591359843991136, 0.14379263217675714, -0.092754694036302099, 0.45850832554198534, -0.089644213069306686, -0.1481613738985047, -0.0027943169767884468, -0.06712279432969534, -0.035892366242658402, 0.07226570483180475, -0.10956944991592577], -0.06771292192303735], [[0.14208770949351235, 0.32634980833629473, 0.0057023557206580394, 0.34005409286447319, 0.023877414588673748, 0.13129096485728847, 0.51437288924768865, 0.23756857474314735, 0.0077691359060347506, 0.036860099004501906, -0.033258458545080499, -0.032523937334426434, 0.0010050016670497541, -0.077843564598096826, -0.10308283687025624], -0.071094499113193876], [[-0.077394610941101172, 0.30918759781497629, 0.16011474994892291, 0.15240543266999998, 0.30830256344156948, 0.61035118576443181, 0.096442516757869634, 0.14390346875654592, 0.033232887524917154, -0.16539139459578883, 0.0058115506692817398, -0.16757131562514824, 0.042536117455280131, 0.039348880899364724, 0.030092896671985392], -0.060883113904824093], [[-0.0070828350649748077, 0.40781712459741987, 0.034236102462881002, -0.01338658192344374, 0.22754362109380527, 0.38596225272624901, 0.52043963194730458, 0.24760818062480192, -0.076569855880416871, -0.11286547598304614, 0.086397646579394582, -0.015818525068393652, -0.13414010910793983, -0.055122509422700028, 0.060905993687274673], -0.071882279302774355], [[-0.02783460471464733, 0.32386775286017178, 0.40398860140344511, 0.24814403493961668, -0.0025605358775599857, 0.70320912952820713, -0.073975073500936711, -0.12049477833890768, 0.012364886108188731, -0.066584009851180681, 0.093800535507852445, -0.041253359718894045, -0.11733492720617113, 0.071131323927756285, 0.030340345313332573], -0.023072640349124429], [[0.054224530017478262, 0.35199437661707283, -0.085541348942181281, -0.12135721626161355, 0.49833798692878006, 0.55726615408398217, 0.28427948322998875, 0.41481215479310918, -0.0099423104735685275, 0.10074679708786802, -0.20725673275882073, 0.023772809400098571, 0.012800216072735904, -0.079181836826978824, -0.15337319851858133], -0.082871891843055989], [[-0.0042761698649839332, 0.43608710290235025, 0.18529779382451289, 0.084334868886577957, 0.30706194694592254, 0.68899336942951139, 0.10435512625136395, 0.0044062011365280651, -0.10907245389055792, -0.11987942604225243, 0.06016292047339275, -0.075457948154415255, -0.087176235287321269, 0.17738944525485981, 0.027081393019452407], -0.064576609537433494], [[-0.12606536049257239, 0.62386988399598142, -0.023261447248895589, -0.17343459301746444, -0.053332308429939107, 0.4354331749164882, 0.16103332381176294, 0.5337953514815662, -0.0063621913100341126, 0.00037921639833465859, 0.13656400106184058, 0.12478707533979005, -0.1434102084799333, 0.16504635366330389, -0.12793340951786364], -0.038997739327800818], [[0.023625106997053526, 0.21465053115274352, -0.10239683891827478, 0.55091010709219912, -0.19591912739509182, 0.58174125012288447, 0.14048587981746832, 0.35939100493920428, 0.0036477217904137549, 0.062895547806887275, -0.025414936857275305, -0.16024746218096914, -0.05692126463352748, 0.095222291902597428, -0.0013976777143698271], -0.021205154288806594], [[-0.1415733161297775, 0.19282706324156584, -0.15317008495995021, 0.41760498989039097, 0.22934896395910748, 0.099855835777710789, 0.86143296168583372, 0.079825134266028178, 0.076888549405725759, 0.029142433900836595, -0.15039955470091237, 0.031885383748681551, -0.036849541966962523, 
-0.0090335151587409568, 0.093197750343661123], -0.075400121761051153], [[-0.019456985872773423, 0.34492550602029814, -0.085580857667536245, 1.1710411536252971, -0.29660686639492939, -0.39241176722993321, 0.054434520898320238, 0.80443734093500463, 0.051728582172760912, 0.041088729944448121, -0.057246015153181506, -0.098767569674687195, 0.036465417287937285, 0.11034756929537173, -0.11395338218577326], -0.085219239227286858], [[-0.14231106663132562, 0.44184632243492411, 0.47752304900707943, 0.16236003198198581, -0.18651407493988231, 0.33350107876263191, -0.11721668576444982, 0.39118530497284021, 0.12129555555447177, 0.086273883873666884, 0.17795776035727312, -0.3715642737177734, -0.045789414899917474, 0.07615615287210209, 0.14238442427090875], -0.056763246101817377], [[0.062778659062940265, 0.34929379382974429, 0.53605437288068436, 0.13244501373484097, 0.13203014126186671, 0.49908883811679128, -0.56382750092432421, 0.37578910719263359, -0.079515245962285139, 0.02533911740854948, 0.030263265775452797, 0.16348607963670136, -0.13187494888973528, 0.060474841975876825, -0.10904394959216032], -0.031395599343956082]]" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 41, 663 | "metadata": { 664 | "collapsed": true 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "for x in range(1, 13):\n", 669 | " for i, y in enumerate([[0, 7],[7,15],[15,22],[22, 35]]):\n", 670 | " ind = date[(date.dt.year == 2015)&(date.dt.month == x)&(date.dt.day > y[0])&(date.dt.day <= y[1])].index\n", 671 | " final_pred[ind] = [0 if x < 0.5 else 1 for x in (np.array(pred.loc[ind, :]) * coef_list2015[(x-1)*4 + i][0]).sum(axis = 1) + coef_list2015[(x-1)*4 + i][1]]" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 42, 677 | "metadata": { 678 | "collapsed": true 679 | }, 680 | "outputs": [], 681 | "source": [ 682 | "coef_list2016 = [[[0.083160931053686798, 0.50123045434246438, -0.18436015016358764, 0.68130701919442604, 0.43200682038007177, -0.13604295964040092, 0.86730048043856223, -0.054315340181438576, -0.11990878642468367, -0.050063800461746855, -0.16781948020678739, -0.017739169634362117, -0.053654669444286818, -0.065720715845076993, 0.060483040613759156], -0.1249840501022485], [[0.063187436646098161, 0.30373368200674061, 1.0352977426357544, 0.3126191497028955, -0.077914433527196542, 0.87640946035320866, -0.63982168138475803, -0.14496076399427266, -0.060049558383010515, -0.20683417920784564, -0.0047388938169548589, -0.16135590218796603, -0.15628840179855302, 0.16740171378123109, 0.14767828336246941], -0.027583666045878918], [[-0.030980294648838998, 0.43922046683261201, -0.058868357706043262, 0.18424321650682049, 0.86106773791521762, -0.32550525072686465, 0.11896158762571502, 0.66969483521636475, -0.095674781196959491, 0.032006394265927218, -0.14233265851020718, -0.10707930219848585, 0.031629126969294985, -0.019324452143396537, -0.036687987683005269], -0.079626080992849846], [[-0.13954682033127497, 0.33378976286213791, 0.039806033607806893, 0.66123324614724044, -0.22516298826827527, 0.24567958234692971, 0.10308423516841406, 0.37442856856287487, 0.17653572475170914, 0.14059440547621241, -0.27656781828210741, 0.030288533414691385, 0.058761813190410017, -0.054413963372691443, -0.0020022353426871753], -0.036679859962110484], [[0.037129369093817581, 0.5964270728739044, -0.0081063460338467175, -0.22902343615441942, 0.025090731265902444, 0.99604128550763149, 0.16865515878609616, 0.13849912555419008, -0.16600225119406706, 0.033048310728461927, 0.13042055350669213, 
-0.0032510487880389738, -0.30229216986185653, -0.022642924492079664, -0.01660969447249272], -0.031204436876102359], [[0.0082779509397315557, 0.27521533089258582, 0.271282282448851, 0.24317783066043919, 0.062882102756682101, 0.4521431499010618, -0.19229296943676083, 0.5437477146870735, 0.058585547353780951, -0.0084156629510135639, -0.055255998117297006, -0.034297359496214271, -0.13039999678299585, 0.037031877430364679, -0.02121886377040233], -0.04764990752098508], [[-0.052013705254242218, 0.51331125256924759, -0.15988689871387099, 0.19022155200745466, -0.092351222909538594, 0.46369523140293517, 0.34804940954857166, 0.43290636769210378, -0.059085881660886547, 0.087312789233614685, -0.10203119330789079, 0.067741651875626863, 0.006542044979097883, -0.079964872107691432, -0.025968044414554387], -0.062214553397731853], [[0.0035154323757134588, 0.38454543946760283, -0.1697376761093464, 0.4045272761907584, 0.73035008984184091, 0.21273459085457591, 0.25397547633361278, -0.03324424351378491, -0.099615443747611124, 0.25953490739354618, -0.12172402518309783, -0.15005876063023216, -0.16200660359045868, -0.040408726661696287, 0.1357592809482786], -0.041462351736194336], [[-0.055766997867973386, 0.4767145176607413, -0.52928267087002179, -0.28042202462113103, 0.54888396859182875, 0.50465770289027989, 0.41277133713636116, 0.53375278268615411, -0.033405733150843719, 0.11803405634009667, 0.063341052706401113, -0.11591676800731615, 0.015009234471315414, -0.023084194758126175, -0.06309272421879808], -0.053198787799453862], [[-0.11921808654820741, 0.55304896032398199, 0.33545462577906782, 0.24652645735943704, 0.044552579593745895, 0.21973740219305932, -0.076825940759151876, 0.40570419920086587, 0.059279502970141673, -0.017626646813853342, -0.10219100754909555, 0.030149974189498718, 0.020593515861067502, 0.06185404788751081, -0.02542982596750254], -0.035417073816853384], [[-0.049542828528010417, 0.49810052204019534, 0.29169250395142649, 0.14836672215851654, -0.46431739526436122, 0.67813958273219033, 0.18972221177940243, 0.3559326649863791, -0.041996124544671193, -0.11896253867737042, 0.1487842103646303, -0.075842527222067302, -0.18728400629354813, 0.075909596907412891, -0.0019456230359987114], -0.018278768600546447], [[-0.10262823373263355, 0.65191373057167312, -0.28743238537415644, 0.3620766146781168, 0.38958765020601949, 0.017297579199162694, 0.20795136345319695, 0.19773175650810898, 0.076873259695250731, 0.16377980648044524, -0.083889894106966401, 0.0058007973684544578, -0.04766633765452899, 0.026744298136452127, -0.030190417104037574], -0.012864659958101266], [[-0.044112450776202093, 0.3664592386316341, 0.59732017641677249, 0.22072828001157588, 0.54930686941293161, -0.21040086570602037, -0.320286820585927, 0.16497839913200574, 0.06654955973086793, 0.0042008847447502218, -0.082302011852698892, -0.072833137017210337, 0.11822863790833205, 0.040761949445283441, -0.0063984967606624255], 0.041235476248580705], [[0.078435481397622769, 0.37244994152640953, 0.58707362776076422, 0.37490143249356256, -0.25283669292671007, 0.60866130292090204, -0.1970030047798168, -0.021329057359658829, 0.018989108798100673, -0.11011685335915292, -0.060082501704998348, -0.037443957435357278, -0.15762130581769557, 0.11426169779263162, 0.07190937929381036], -0.021064875515049541], [[0.0034718108656329578, 0.57915256686658578, 0.29528623742650506, -0.050229880493790402, 0.48137810862273306, 0.25318696244880917, -0.15889571006608449, 0.16483440770352081, -0.059730792487845241, 0.093544024981333074, -0.12247159270970034, 
0.075771507207590461, -0.096402151710678813, -0.022497849716211576, 0.036354367030831419], 0.005256366888516828], [[-0.023820576680877423, 0.78225853472656826, 0.011634343713431342, 0.019918416317100285, 0.557084254430666, 0.27156835264849705, 0.14575719682470084, -0.088897624975371042, -0.063732953886952048, -0.039590585757928537, -0.052638053599495027, -0.1870774941584803, -0.040966366635034501, 0.098049128640642746, 0.088011347176140398], -0.021608780483270751], [[-0.061349462068088743, 0.69183746903156507, 0.52172715479450382, 0.48187658768522434, -0.099606104627660508, -0.052401242419038541, 0.024573034101930763, 0.23738518382344564, -0.032962004746863116, -0.12605462236670967, -0.036538133880784029, -0.14126295053073362, 0.14844289442390313, 0.008468265486780327, -0.027186130473260628], -0.033833884151762861], [[-0.11431828195368751, 0.78393055757029084, -0.063003189365440804, 0.18182597873019035, -0.33994031661063867, 0.7460911636953973, 0.32971674305189375, -0.04141630449778666, 0.0062870280295277969, 0.073761911986879036, -0.073507477054227055, 0.085813879520226566, 0.10732331935558642, -0.023845436016895677, -0.11301867089636497], -0.0054055816321251227], [[-0.054753180065529136, 0.72907257976196072, 0.51853520325796476, -0.17125328017481417, 0.35257103677379797, 0.30815676910938594, -0.46331790288173313, 0.079081445127745398, -0.020775639058208617, -0.065812688743678599, -0.015409164790581326, -0.094174344553187739, 0.0008859535771960636, 0.18820843876930005, 0.10425469702505108], 0.056803781410861676], [[-0.042467685625824328, 0.82658628356417985, -0.092932697163839367, 0.022071698051833677, 0.61328725281874952, -0.3824315690142488, 0.3787196213319835, 0.18888077541879378, -0.056751968974047287, -0.11727758155060736, 0.027543187445467854, 0.045457019178591185, -0.047819813632006034, 0.020668298110510047, 0.041497997852390389], 0.024869402799663931], [[-0.064757052637594945, 0.61254843023201899, 0.20086584118803563, 0.21615529630668404, -0.24447282050211117, 0.55194291330325529, -0.009869904195014656, -0.22518268192949908, 0.033520839040034556, -0.075252534891125988, -0.09619332773586535, -0.16578170784589868, 0.12373269435445577, 0.051440396724725945, 0.13820583675161502], 0.07605182599347754], [[-0.095896018600558228, 0.69788715734204487, 0.21436145719045058, 0.83706643929778035, 0.47507517220924905, -0.2908287955029823, 0.18132404698344207, -0.6709551186955669, 0.029008135445491634, -0.020478235431542147, -0.14937658392775088, -0.048159972223858927, 0.047785031619034357, 0.015409964290556966, 0.19223101919119823], 0.046328812007278453], [[-0.14139965440116536, 1.1099453120571932, 0.90215970671662171, -0.22271025596956984, -0.018733690889584986, 0.27713791517741265, -0.015171447692745841, -0.44736962141944825, -0.13945812177327943, -0.16770232269774765, -0.18555121833590268, 0.3340515092066168, -0.019131701690440761, 0.04779454714153375, 0.17032034003113261], -0.0011406461477660446], [[-0.15814004966474193, 0.76510545034432464, 0.55809107865746277, 0.055086482390308111, 1.6874616017811952, -0.98119567509265881, -0.22255726211598903, -0.42610664307362767, 0.048872138661632203, 0.086992973690962261, 0.22112821840946606, -0.31818104317672624, 0.032161073706369003, -0.016595663155686613, 0.17860319823197202], 0.061364613680657654], [[-0.085409697642654284, 0.87040181141592043, 1.1053281722579182, 0.13828450653163327, -1.2316974212995426, -0.0061250907444628522, 0.11930851679229618, -0.29960478093678317, 0.0077587108370841262, -0.29725813009402691, 0.25100538067247935, 
0.15072089763164931, 0.14909301157086774, -0.018689890304061096, 0.39481458468156305], 0.05877411487918438], [[-0.29984373974635936, 0.71934734076942719, 1.2392092995547836, 0.41184431977829539, 0.13052097731042822, 0.012981282398934396, -0.77084668605305651, -0.23923303071710911, 0.24613616288030205, 0.0071792330569403484, -0.16906316486569095, 0.18095148458834975, -0.12071531335216135, 0.18887601960560274, 0.051885666612439096], 0.069759828760088682], [[-0.15449120581946524, 0.64951964246975047, 0.16969084588187366, 0.70820451021752118, 0.68696831944577208, -0.47619883509347344, -0.29090557003820378, 0.07620809485410926, 0.19548669409941749, -0.059913958975032836, -0.11821757647728848, -0.19305873749916524, -0.13176045531817471, 0.15929252439240482, 0.056679345616207244], 0.11912041223397452], [[0.006180404871175979, 0.80464282037114809, -0.33033917021567527, -0.14848881979217166, -1.235242142750355, 1.3095685396707577, 0.6957242895856508, -0.02989187842375074, -0.10674055798263112, -0.037619586416044015, 0.011956177131142276, 0.050958002969719307, -0.0086747554764927681, -0.11553937525330471, 0.32522194317886177], 0.029034802249806424], [[-0.1657612836095966, 0.94073016547909261, 0.51058101741405593, 0.030484186580697514, -0.47335063751875994, 0.57147407316464671, 0.37793039521913463, -0.47295122425549962, 0.010816584966879941, 0.097829543387180029, -0.21309623583345366, 0.074870851146988271, -0.058132337538235268, -0.24332889978101008, 0.42768616534437087], 0.038796184224211938], [[0.025647555447089326, 0.72666569404040504, 1.6463192615309852, 0.15034166562043572, -0.067317069313437283, 0.46532622741236068, -0.86103506347136061, -0.75741516656446195, -0.035803335172834738, -0.22717670542117901, 0.071836839974666111, -0.084342850750207754, -0.054927106726417874, 0.20650080761349554, 0.20007689716806881], 0.093074451237209366], [[-0.001959021207822406, 0.83239308122272682, 0.37957376389069414, 0.18619450498571441, 0.56580540139778224, -0.33440574530327416, -0.22863859727686675, -0.29546311597557634, -0.075875965058126549, -0.13685162690406641, 0.08330235469297663, -0.15623644870280184, 0.03650658244675975, 0.16184825625987359, 0.33483923528636689], 0.051801286141569425], [[-0.15561910248092847, 0.9655331160147993, 0.19024631202086678, -0.16389065866071933, 0.49217663484548924, 0.59772856889259007, -0.66386712615445909, 0.085032757491798894, -0.062744636941233445, 0.067025521253281539, 0.069696799220337891, 0.047764491738702577, -0.20580601665107331, 0.090321642139469582, 0.020765245239603963], 0.017690729166233488], [[-0.096453655024564017, 0.58878378008258592, 0.91393894691688504, 0.34938317289662602, 0.039152078643877972, 0.039398004355004235, -1.0468286783459875, 0.46362537171815743, 0.02312687381619816, 0.086659296247407092, -0.093363837264509641, 0.17609181669056617, -0.064349293969083843, 0.19115432937995736, -0.060071639048318637], 0.092949401936266773], [[0.018536111824581453, 0.70288133456920732, 0.1572960624779724, 0.94921016378166667, 0.65848222506481491, -0.53704710893855245, -0.29893180864523983, -0.40740458137528102, -0.14933227609016425, -0.011001519678496552, -0.031467342092052764, 0.016334637904423882, -0.011023627493020494, 0.046868387035175152, 0.077048979071970614], 0.096965255896032387], [[-0.25259118431472216, 0.8233415869510412, 0.4199431033875719, 1.1768474473476098, -1.2356704052210177, 0.14276615677208457, -0.071037668353966799, 0.41782871470689475, 0.15248698131333682, -0.24720613484926324, -0.42648857055296946, 0.36562680947974546, -0.085172934683006352, 
0.29547776416200666, -0.06802360170871613], 0.052760631572301764], [[-0.072576660742084367, 0.81651417087679246, -0.3211034844306131, 0.30928733494137484, -0.09012779788680797, -0.50078664788211746, 0.9512547206547276, -0.0032189580080161928, -0.10700584083014469, 0.032073855929995732, -0.010406086197703862, 0.069831232786959546, 0.34587216310976254, 0.093766240427648195, -0.17325996366570007], 0.096218432324126135], [[-0.28432041279606995, 0.63988723059029973, 1.0836922161217861, 0.98661564214995512, -0.12433327393925117, -0.46263556657516319, -0.065707352399842003, -0.31079427858663244, 0.23194998789726029, -0.33864363672233078, 0.31520027787319682, -0.21545976552161242, -0.040905762230344422, -0.00748873833591579, -0.21221297709348941], 0.088021580093375407], [[-0.16089707161042341, 0.8965822415267819, 0.4103863205396423, 0.59468823344320354, 0.04254317994671461, 0.28553571860940874, 0.21152992243231228, -0.68788481353454334, 0.064970151509591206, -0.22785316880554574, -0.051201941851013566, -0.32032223026961659, 0.15010684940229679, -0.063439537422950276, 0.22006490290322889], 0.025322682321992973], [[0.11182075799149772, 0.65411818318708925, 1.3129187937616515, 0.85288827805273559, -1.6246409006550993, 0.74593025778158539, -0.38871261440224841, -0.087892290411351701, -0.045656229767157573, 0.045928349904733434, -0.16611837605758883, 0.2315789223043605, -0.34715191537467827, 0.054190747068566192, 0.18518850753887883], 0.05267236980336254], [[-0.13344781089068675, 0.63515225114238016, 0.77325119481004267, -0.23308661481801735, 0.2008554964948821, -0.095572234935413447, -0.76559358529034227, 1.1408187820540743, 0.011940503111879601, -0.012057379165902993, 0.066549237185619392, 0.13054067356138055, -0.15333355924898628, 0.024779068153564121, -0.28546668205502557], 0.09847076037014274], [[-0.21037641281310179, 0.9288051581200526, -0.66322254241331191, 0.24471045837055735, -0.19194499568561604, 1.1597561102737344, -0.003107297478991633, 0.26577554130079833, -0.012955549233710062, 0.57645607258796061, -0.27205638140095012, 0.098248048134944366, -0.41718998126023216, -0.080263742634118618, -0.20794407164281067], 0.033673328415926296], [[-0.14994541548372922, 0.98141226935245029, -0.27950378846317891, -0.32498007101383392, -0.35353773357469331, 0.5436907607315189, 0.67536406628105428, 0.11623154500823252, -0.00067651238963972737, 0.13859895364541563, 0.34966577728333448, 0.13896685765843331, -0.39184623610185354, -0.17269902859927255, 0.00040508572409858612], 0.028379878546068105], [[-0.14590691431003053, 1.0464832814251701, 1.0481929470578977, -0.48657408078753095, 0.62350112672748947, 0.93308813773062682, -0.34721584768203095, -1.4288201702593981, -0.095042162810413611, -0.069649525643022514, 0.18041281864798286, 0.0086063826300428747, -0.21864748321611804, 0.12624215537856975, 0.43201176983660594], -0.020934269528173011], [[0.07207680342415454, 0.77697734766041227, 0.3067820573261868, 0.022557231517119136, 0.82240771305419913, -0.19652115180943242, 0.07599237305462464, -0.36763840644908552, -0.013013836366206788, -0.26191350689882448, -0.45585536331995213, -0.11287556111335956, 0.38283161849607267, 0.025787016595057471, 0.036475336578099038], -0.005342340330847406], [[-0.017361177160982864, 1.042984596048806, -0.22705026056619793, -0.10520380866992814, 0.37986986873844969, 0.0035414727513486122, 0.93228250277674085, -0.56238540287508332, -0.23254838672858535, 0.11408832920138845, 0.41925891479783384, -0.055394342682305586, -0.41360226254919463, 0.082108661979596098, 0.21278205018642582], 
-0.0055409687694392695], [[-0.023808340221479832, 0.84635535835645104, 0.27922958125037833, 0.936967869657783, -0.40313567111024923, 0.84330653662802091, -0.42275541949917705, -0.24602082764493866, -0.076037549032303448, 0.18946227321254394, -0.63949378370955645, 0.0076838455058896304, 0.2848136733716235, -0.32201195158638829, 0.05865693087106294], -0.0040226517553044738], [[-0.084129629439401846, 0.38795226024982232, 1.1803963363130143, 1.9883352651032504, -1.9821675141870865, 0.22611224235732835, -0.69729948680873344, -0.075019732689127927, 0.23349343981311343, 0.26662333349037559, -0.48588083112512215, 0.41253348601745166, 0.22478142391797751, -0.24390460187141016, -0.18026241639775309], 0.081950757990531559], [[-0.047550062368741451, 0.88350187140437808, 0.56199307124951803, -0.096428489165694131, 0.90391694368564324, -1.4389011253249979, 0.39303569862021248, 0.69111563392488917, -0.025263595517992787, -0.049169933732915005, 0.16752516730843969, -0.30151511955072863, 0.23789111228216764, -0.48602221325105943, 0.23101391574902952], 0.00063119675804385045]]" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 43, 688 | "metadata": { 689 | "collapsed": true 690 | }, 691 | "outputs": [], 692 | "source": [ 693 | "for x in range(1, 13):\n", 694 | " for i, y in enumerate([[0, 7],[7,15],[15,22],[22, 35]]):\n", 695 | " ind = date[(date.dt.year == 2016)&(date.dt.month == x)&(date.dt.day > y[0])&(date.dt.day <= y[1])].index\n", 696 | " final_pred[ind] = [0 if x < 0.5 else 1 for x in (np.array(pred.loc[ind, :]) * coef_list2016[(x-1)*4 + i][0]).sum(axis = 1) + coef_list2016[(x-1)*4 + i][1]]" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 44, 702 | "metadata": { 703 | "collapsed": true 704 | }, 705 | "outputs": [], 706 | "source": [ 707 | "coef_list2017 = [[[-0.092260279685306562, 0.47993713095700025, 1.3627279414391347, 0.19273462880948466, -0.43111286718320307, 0.3093180618980409, -0.63330613216205289, -0.06358364220948258, 0.19092647149070691, -0.22550467829007492, -0.0096726092674919162, 0.032662864635206401, -0.31034612227919123, 0.36000601525052756, 0.072208252512741211], 0.1033118679714411], [[0.026020136865209237, 0.71385675075166954, 0.19154883685548907, 0.34416777217129813, -0.95574217469666189, 0.14653896310740583, 0.54406576756292635, 0.50815570748139871, -0.11041845651374511, 0.15500194489430491, -0.30283088541950576, -0.11729819887503543, 0.11573965958550625, -0.037944605033069845, 0.23614815740095207], 0.032610559808463446], [[0.058485304818391169, 1.0464212109432975, 0.19961840811742557, 0.30101063937092576, 0.90901659787261513, 0.10254210837610717, 0.50387515072918676, -1.3332534333399786, -0.29452778524554846, -0.037687102120534721, -0.1671236775444731, -0.059676630015028098, -0.16696044545586608, 0.078660354501412511, 0.30556879269033232], 0.022689027579702148], [[-0.20008981539808449, 0.68803052136755405, 0.7171076414175982, -0.089068886422192503, 0.45442419258068401, 0.65089805716290039, -1.289926063724629, 0.31339860971881217, 0.16465953479286172, -0.060096924040990818, 0.26641700319021283, -0.10774731392237336, -0.26224116097107819, 0.1765668954172569, 0.013030401338264719], 0.044476192980860224]]" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 45, 713 | "metadata": { 714 | "collapsed": true 715 | }, 716 | "outputs": [], 717 | "source": [ 718 | "for i, y in enumerate([[0, 7],[7,15],[15,22],[22, 35]]):\n", 719 | " ind = date[(date.dt.year == 2017)&(date.dt.month == 1)&(date.dt.day > 
y[0])&(date.dt.day <= y[1])].index\n", 720 | " final_pred[ind] = [0 if x < 0.5 else 1 for x in (np.array(pred.loc[ind, :]) * coef_list2017[i][0]).sum(axis = 1) + coef_list2017[i][1]]" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 47, 726 | "metadata": { 727 | "collapsed": true 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "ans = pd.DataFrame()\n", 732 | "ans['project_id'] = test.project_id\n", 733 | "ans['final_status'] = final_pred.astype('int8')\n", 734 | "#ans.to_csv('predict_final_roman.csv', index = None)" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 49, 740 | "metadata": { 741 | "collapsed": true 742 | }, 743 | "outputs": [], 744 | "source": [ 745 | "offline = pd.read_csv('../../../offline_testcase.csv')" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 50, 751 | "metadata": { 752 | "collapsed": false 753 | }, 754 | "outputs": [], 755 | "source": [ 756 | "from sklearn.metrics import accuracy_score" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": 51, 762 | "metadata": { 763 | "collapsed": false 764 | }, 765 | "outputs": [ 766 | { 767 | "data": { 768 | "text/plain": [ 769 | "0.76434255101236903" 770 | ] 771 | }, 772 | "execution_count": 51, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "accuracy_score(offline.final_status, ans.final_status)" 779 | ] 780 | } 781 | ], 782 | "metadata": { 783 | "anaconda-cloud": {}, 784 | "kernelspec": { 785 | "display_name": "Python 3", 786 | "language": "python", 787 | "name": "python3" 788 | }, 789 | "language_info": { 790 | "codemirror_mode": { 791 | "name": "ipython", 792 | "version": 3 793 | }, 794 | "file_extension": ".py", 795 | "mimetype": "text/x-python", 796 | "name": "python", 797 | "nbconvert_exporter": "python", 798 | "pygments_lexer": "ipython3", 799 | "version": "3.5.2" 800 | } 801 | }, 802 | "nbformat": 4, 803 | "nbformat_minor": 2 804 | } 805 | -------------------------------------------------------------------------------- /Rank_2_Sergazy/Instructions: -------------------------------------------------------------------------------- 1 | To reproduce the score, run the files in following sequence: 2 | 1. Run best.py 3 | 2. Run lstm.py 4 | 3. Run layer2.py 5 | 6 | Note: Make sure you load functions from rest of the scripts. 
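For reference, a minimal run sketch (assuming a Python 3 environment with the packages imported at the top of each script, and train.csv, test.csv and samplesubmission.csv in the working directory):

    python best.py    # trains the LightGBM model on the engineered features; saves class probabilities to a file named 'lgb'
    python lstm.py    # trains the Keras LSTM/GRU text model; saves its probabilities to a file named 'lstm'
    python layer2.py  # stacks 'lgb' and 'lstm' with a second LightGBM model, applies the name/deadline/communication rules, writes sub2.csv

The helper modules (utils.py, syllables_en.py, readability.py, word2vecUtils.py) only need to sit in the same folder so the imports resolve.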
7 | 8 | -------------------------------------------------------------------------------- /Rank_2_Sergazy/best.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import enchant 4 | from sklearn.metrics import mean_squared_error 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from sklearn.linear_model import LogisticRegression 8 | from word2vecUtils import utils 9 | from xgboost import XGBClassifier 10 | from sklearn.preprocessing import LabelEncoder 11 | import time 12 | import lightgbm as lgb 13 | from sklearn.ensemble import RandomForestClassifier 14 | # from keras.preprocessing.text import Tokenizer 15 | # from keras.preprocessing.sequence import pad_sequences 16 | import re 17 | import readability 18 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 19 | 20 | train = pd.read_csv('train.csv') 21 | test = pd.read_csv('test.csv') 22 | 23 | # trainp = np.loadtxt('trainp.csv', delimiter=',') 24 | # testp = np.loadtxt('testp.csv', delimiter=',') 25 | # trainp = pd.DataFrame(trainp) 26 | # testp = pd.DataFrame(testp) 27 | # train = pd.concat([train,trainp],axis = 1) 28 | # test = pd.concat([test,testp],axis = 1) 29 | 30 | train['created_atX'] = train['created_at']/max(train['created_at']) 31 | test['created_atX'] = test['created_at']/max(test['created_at']) 32 | train['deadlineX'] = train['deadline']/max(train['deadline']) 33 | test['deadlineX'] = test['deadline']/max(test['deadline']) 34 | 35 | y_train = train.final_status 36 | X_train = train.drop(['backers_count', 'final_status'], 1) 37 | X_test = test 38 | 39 | X = pd.concat([X_train, X_test]) 40 | 41 | X = X.set_index(np.arange(len(X))) 42 | 43 | 44 | def computeRead(text): 45 | rd = readability.Readability(text) 46 | score = rd.FleschKincaidGradeLevel() 47 | return int(score) 48 | def ARIscore(text): 49 | rd = readability.Readability(text) 50 | score = rd.ARI() 51 | return float(score) 52 | def LIXscore(text): 53 | rd = readability.Readability(text) 54 | score = rd.LIX() 55 | return float(score) 56 | 57 | 58 | X['readscore'] = X['desc'].apply(lambda d: computeRead(str(d))) 59 | X['ariscore'] = X['desc'].apply(lambda d: ARIscore(str(d))) 60 | X['lixscore'] = X['desc'].apply(lambda d: LIXscore(str(d))) 61 | 62 | X['readscoreX'] = X['name'].apply(lambda d: computeRead(str(d))) 63 | X['ariscoreX'] = X['name'].apply(lambda d: ARIscore(str(d))) 64 | X['lixscoreX'] = X['name'].apply(lambda d: LIXscore(str(d))) 65 | 66 | X['coeff'] = np.zeros(len(X)) 67 | X.coeff.ix[X.currency == 'USD'] = 1 68 | X.coeff.ix[X.currency == 'GBP'] = 0.78 69 | X.coeff.ix[X.currency == 'EUR'] = 0.89 70 | X.coeff.ix[X.currency == 'CAD'] = 1.32 71 | X.coeff.ix[X.currency == 'AUD'] = 1.31 72 | X.coeff.ix[X.currency == 'SEK'] = 8.71 73 | X.coeff.ix[X.currency == 'NZD'] = 1.38 74 | X.coeff.ix[X.currency == 'DKK'] = 6.63 75 | X.coeff.ix[X.currency == 'NOK'] = 8.42 76 | X.coeff.ix[X.currency == 'CHF'] = 0.97 77 | X.coeff.ix[X.currency == 'MXN'] = 17.95 78 | X.coeff.ix[X.currency == 'SGD'] = 1.38 79 | X.coeff.ix[X.currency == 'HKD'] = 7.8 80 | 81 | X['dollars'] = X['goal'] / X['coeff'] 82 | 83 | X = pd.get_dummies(X, columns=['country']) 84 | 85 | 86 | 87 | le = LabelEncoder() 88 | le.fit(X.disable_communication) 89 | X.disable_communication = le.transform(X.disable_communication) 90 | 91 | le = LabelEncoder() 92 | le.fit(X.currency) 93 | X.currency = le.transform(X.currency) 94 | 95 | 96 | def 
year(date): 97 | return int(time.strftime("%Y", time.localtime(date))) 98 | 99 | 100 | def month(date): 101 | return int(time.strftime("%m", time.localtime(date))) 102 | 103 | 104 | X['created_month'] = np.zeros(len(X)) 105 | X['deadline_month'] = np.zeros(len(X)) 106 | X['launched_month'] = np.zeros(len(X)) 107 | X['state_changed_month'] = np.zeros(len(X)) 108 | 109 | X['created_month'] = X['created_at'].apply(month) 110 | X['deadline_month'] = X['deadline'].apply(month) 111 | X['launched_month'] = X['launched_at'].apply(month) 112 | X['state_changed_month'] = X['state_changed_at'].apply(month) 113 | 114 | d = enchant.Dict("en_US") 115 | X['valideng'] = X['desc'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', str(x)))) 116 | X['valideng'] = X['valideng'].apply(lambda x: sum(1 for c in str(x).split(' ') if len(c) < 4 or d.check(c))) 117 | analyzer = SentimentIntensityAnalyzer() 118 | def compoundScore(text): 119 | res = analyzer.polarity_scores(text) 120 | return float(res['compound']) 121 | def negSent(text): 122 | res = analyzer.polarity_scores(text) 123 | return float(res['neg']) 124 | def posSent(text): 125 | res = analyzer.polarity_scores(text) 126 | return float(res['pos']) 127 | def neuSent(text): 128 | res = analyzer.polarity_scores(text) 129 | return float(res['neu']) 130 | X['compoundScore'] = X['desc'].apply(lambda d: compoundScore(str(d))) 131 | X['negSent'] = X['desc'].apply(lambda d: negSent(str(d))) 132 | X['posSent'] = X['desc'].apply(lambda d: posSent(str(d))) 133 | X['neuSent'] = X['desc'].apply(lambda d: neuSent(str(d))) 134 | 135 | X['compoundScoreX'] = X['name'].apply(lambda d: compoundScore(str(d))) 136 | X['negSentX'] = X['name'].apply(lambda d: negSent(str(d))) 137 | X['posSentX'] = X['name'].apply(lambda d: posSent(str(d))) 138 | X['neuSentX'] = X['name'].apply(lambda d: neuSent(str(d))) 139 | 140 | cols_to_use = ['name', 'desc'] 141 | len_feats = ['name_len', 'desc_len'] 142 | count_feats = ['name_count', 'desc_count'] 143 | 144 | for i in np.arange(2): 145 | X[len_feats[i]] = X[cols_to_use[i]].apply(str).apply(len) 146 | X[count_feats[i]] = X[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' '))) 147 | 148 | X['keywords_len'] = X['keywords'].apply(str).apply(len) 149 | X['keywords_count'] = X['keywords'].apply(str).apply(lambda x: len(x.split('-'))) 150 | 151 | X['dots'] = X['desc'].apply(str).apply(lambda x: x.count('.')) 152 | X['comma'] = X['desc'].apply(str).apply(lambda x: x.count(',')) 153 | X['kav'] = X['desc'].apply(str).apply(lambda x: x.count('\"')) 154 | X['vopros'] = X['desc'].apply(str).apply(lambda x: x.count('?')) 155 | X['voskl'] = X['desc'].apply(str).apply(lambda x: x.count('!')) 156 | X['smiles'] = X['desc'].apply(str).apply(lambda x: x.count(":)")) 157 | X['Iocc'] = X['desc'].apply(str).apply(lambda x: x.count('I') + x.count('i')) 158 | X['kkstid'] = X['project_id'].apply(str).apply(lambda x: int(x.replace('kkst', ''))) 159 | 160 | X['digitsenc'] = X['desc'].apply(str).apply( 161 | lambda x: x.count('0') + x.count('1') + x.count('2') + x.count('3') + x.count('4') + x.count('5') + x.count( 162 | '6') + x.count('7') + x.count('8') + x.count('9')) 163 | 164 | X['kkstidlen'] = X['project_id'].apply(str).apply(len) 165 | X['potentiality'] = (X['deadline'] - X['created_at']) * X['dollars'] 166 | X['hardness'] = X['dollars'] / (X['deadline'] - X['created_at']) 167 | X['freshness'] = X['deadline'] / X['state_changed_at'] 168 | X['editingTime'] = X['created_at'] / X['launched_at'] 169 | X['diversity'] = X['keywords_len'] / X['name_len'] 170 | 
X['diversity2'] = X['desc_count'] / X['keywords_count'] 171 | X['upper'] = X['desc'].apply(str).apply(lambda x: sum(1 for c in x if c.isupper())) 172 | 173 | X['editingDuration'] = np.log(X['launched_at'] - X['created_at']) 174 | X['loggoal'] = np.log(X['dollars']) 175 | X['durationX'] = np.log(X['deadline'] - X['launched_at']) 176 | 177 | #from datetime import datetime 178 | #X['satornot'] = np.zeros(len(X)) 179 | #X['dow'] = X['deadline'].apply(lambda x: datetime.fromtimestamp(x/1000).strftime("%A")) 180 | #X.satornot.ix[X.dow == 'Saturday'] = 1 181 | #X.satornot.ix[X.dow != 'Saturday'] = 0 182 | #X = X.drop(['dow'], 1) 183 | 184 | #X['durationToChange'] = X['state_changed_at'] - X['deadline'] 185 | import datetime 186 | daydict = {} 187 | for index, row in X.iterrows(): 188 | if datetime.datetime.fromtimestamp(int(row['deadline'])).strftime('%Y-%m-%d') in daydict: 189 | daydict[datetime.datetime.fromtimestamp(int(row['deadline'])).strftime('%Y-%m-%d')] += 1 190 | else: 191 | daydict[datetime.datetime.fromtimestamp(int(row['deadline'])).strftime('%Y-%m-%d')] = 0 192 | 193 | X['zagr'] = X['deadline'].apply(lambda x: daydict[datetime.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d')]) 194 | # time.strftime("%Y", time.localtime(X.deadline)) 195 | # time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(epoch)) 196 | 197 | 198 | # clean_desc= [] 199 | # for index,row in X.iterrows(): 200 | # clean_desc.append(" ".join(utils.review_to_wordlist(str(row['desc']) + "" + str(row['name']) + " " + str(row['keywords']), False))) 201 | # 202 | # vectorizer2 = TfidfVectorizer(min_df=3, max_features=300, 203 | # strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', 204 | # ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1, 205 | # stop_words = None ) 206 | # 207 | # data_features = vectorizer2.fit_transform(clean_desc) 208 | # np.asarray(data_features) 209 | # data_features = data_features.astype(np.float64) 210 | # features_df = pd.DataFrame(data_features.todense(), columns=vectorizer2.get_feature_names()) 211 | # X = pd.concat([X, features_df], axis=1) 212 | 213 | clean_desc = [] 214 | for index, row in X.iterrows(): 215 | clean_desc.append(" ".join( 216 | utils.review_to_wordlist(str(row['desc']) + "" + str(row['name']) + " " + str(row['keywords']), False))) 217 | 218 | vectorizer = CountVectorizer(analyzer="word", 219 | tokenizer=None, 220 | preprocessor = None, 221 | stop_words = None, 222 | max_features = 3300) 223 | 224 | data_features = vectorizer.fit_transform(clean_desc) 225 | np.asarray(data_features) 226 | data_features = data_features.astype(np.float32) 227 | features_df = pd.DataFrame(data_features.todense(), columns=vectorizer.get_feature_names()) 228 | X = pd.concat([X, features_df], axis=1) 229 | 230 | # from sklearn.cluster import KMeans 231 | # kmeans = KMeans(n_clusters=15, random_state=0).fit(features_df) 232 | # labels = kmeans.labels_ 233 | # kmeans = [] 234 | # X['categoryX'] = labels 235 | # 236 | # X = pd.get_dummies(X, columns=['categoryX']) 237 | 238 | 239 | # 240 | # def tokenizerKeras(data): 241 | # data = data[['desc']] 242 | # 243 | # data['desc'] = data['desc'].apply(lambda x: str(x).lower()) 244 | # data['desc'] = data['desc'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x))) 245 | # 246 | # for idx, row in data.iterrows(): 247 | # row[0] = row[0].replace('rt', ' ') 248 | # 249 | # max_features = 1500 250 | # tokenizer = Tokenizer(nb_words=max_features, split=' ') 251 | # tokenizer.fit_on_texts(data['desc'].values) 252 | # X = 
tokenizer.texts_to_sequences(data['desc'].values) 253 | # X = pad_sequences(X) 254 | # return X 255 | # features_df = pd.DataFrame(tokenizerKeras(X)) 256 | # X = pd.concat([X, features_df], axis=1) 257 | 258 | X = X.drop(['project_id', 'name', 'desc', 'keywords'], 1) 259 | # colnames = list(X.columns.values) 260 | # todrop = [] 261 | # for col in colnames: 262 | # try: 263 | # cur = col.astype(int) 264 | # todrop.append(col) 265 | # except: 266 | # continue 267 | # X.drop(todrop, 1) 268 | 269 | # cols = X.columns 270 | # for dup in X.columns: 271 | # cols[X.columns.get_loc(dup)] = [dup + '.' + str(d_idx) if d_idx != 0 else dup for d_idx in 272 | # range(X.columns.get_loc(dup).sum())] 273 | # X.columns = cols 274 | X_train = X.ix[:len(X_train) - 1] 275 | X_test = X.ix[len(X_train):] 276 | print("started training") 277 | 278 | gbm = lgb.LGBMClassifier(n_estimators=2900, max_depth=3, subsample=0.7, colsample_bytree= 0.7) 279 | gbm = gbm.fit(X_train, y_train) 280 | Y = gbm.predict_proba(X) 281 | np.savetxt('lgb',Y,delimiter = ',', fmt = '%0.6f') 282 | 283 | 284 | -------------------------------------------------------------------------------- /Rank_2_Sergazy/layer2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import lightgbm as lgb 4 | X1 = pd.read_csv('lgb', delimiter = ',', header = None) 5 | X2 = pd.read_csv('lstm', delimiter = ',', header = None) 6 | train = pd.read_csv('train.csv') 7 | test = pd.read_csv('test.csv') 8 | y_train = train.final_status 9 | X = pd.concat([X1,X2], 1) 10 | X_train = X.ix[:len(train) - 1] 11 | X_test = X.ix[len(train):] 12 | gbm = lgb.LGBMClassifier() 13 | gbm.fit(X_train,y_train) 14 | y_pred = gbm.predict_proba(X_test) 15 | y_result = [] 16 | magic = 0.64 17 | for i in range(0, 63465): 18 | if y_pred[i][0] > magic: 19 | y_result.append(0) 20 | else: 21 | y_result.append(1) 22 | for index, row in test.iterrows(): 23 | if str(row['name']).count("Canceled") + str(row['name']).count("Suspended") > 0 or row['deadline'] > row['state_changed_at'] or row['disable_communication'] == True: 24 | y_result[index] = 0 25 | sub = pd.read_csv('samplesubmission.csv') 26 | sub.final_status = y_result 27 | sub.to_csv('sub2.csv', index=0) -------------------------------------------------------------------------------- /Rank_2_Sergazy/lstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np # linear algebra 2 | import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | from keras.preprocessing.text import Tokenizer 5 | from keras.preprocessing.sequence import pad_sequences 6 | from keras.models import Sequential 7 | from keras.layers import Dense, Embedding, LSTM, GRU, Activation 8 | from sklearn.model_selection import train_test_split 9 | from keras.utils.np_utils import to_categorical 10 | import tensorflow as tf 11 | import re 12 | import nltk 13 | train = pd.read_csv('train.csv') 14 | test = pd.read_csv('test.csv') 15 | 16 | 17 | y_train = train.final_status 18 | train = train[['goal','desc', 'name', 'keywords']] 19 | test = test[['goal','desc', 'name', 'keywords']] 20 | # Keeping only the neccessary columns 21 | data = pd.concat([train, test]) 22 | data = data.set_index(np.arange(len(data))) 23 | data['desc'] = data['desc'] + data['name'] + data['keywords'] 24 | 25 | data['desc'] = data['desc'].apply(lambda x: str(x).lower()) 26 | data['desc'] = data['desc'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x))) 27 | 28 | 29 | max_features = 6000 30 | tokenizer = Tokenizer(nb_words=max_features, split=' ') 31 | tokenizer.fit_on_texts(data['desc'].values) 32 | X = tokenizer.texts_to_sequences(data['desc'].values) 33 | X = pad_sequences(X) 34 | embed_dim = 256 35 | lstm_out = 512 36 | model = Sequential() 37 | model.add(Embedding(max_features, embed_dim, input_length=X.shape[1], dropout=0.2)) 38 | model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2, return_sequences=True)) 39 | model.add(GRU(lstm_out, activation='relu')) 40 | model.add(Dense(lstm_out, input_dim=lstm_out, activation='tanh')) 41 | model.add(Dense(lstm_out, input_dim=lstm_out, activation='relu')) 42 | model.add(Dense(lstm_out, input_dim=lstm_out, activation='sigmoid')) 43 | model.add(Dense(2, activation='softmax')) 44 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 45 | print(model.summary()) 46 | 47 | y_train = pd.get_dummies(y_train).values 48 | x_train = X[:len(train)] 49 | x_test = X[len(train):] 50 | 51 | batch_size = 64 52 | model.fit(x_train, y_train, nb_epoch=2, batch_size=batch_size, verbose=2) 53 | Y = model.predict_proba(X) 54 | 55 | np.savetxt('lstm', Y, delimiter=',', fmt = '%0.6f') -------------------------------------------------------------------------------- /Rank_2_Sergazy/readability.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import math 4 | 5 | from utils import get_char_count 6 | from utils import get_words 7 | from utils import get_sentences 8 | from utils import count_syllables 9 | from utils import count_complex_words 10 | 11 | 12 | class Readability: 13 | analyzedVars = {} 14 | 15 | def __init__(self, text): 16 | self.analyze_text(text) 17 | 18 | def analyze_text(self, text): 19 | words = get_words(text) 20 | char_count = get_char_count(words) 21 | word_count = len(words) 22 | sentence_count = len(get_sentences(text)) 23 | syllable_count = count_syllables(words) 24 | complexwords_count = count_complex_words(text) 25 | avg_words_p_sentence = word_count/sentence_count 26 | 27 | self.analyzedVars = { 28 | 'words': words, 29 | 'char_cnt': float(char_count), 30 | 'word_cnt': float(word_count), 31 | 'sentence_cnt': float(sentence_count), 32 | 'syllable_cnt': float(syllable_count), 33 | 'complex_word_cnt': float(complexwords_count), 34 | 'avg_words_p_sentence': float(avg_words_p_sentence) 35 | } 36 | 37 | def ARI(self): 38 | score = 0.0 39 | if self.analyzedVars['word_cnt'] > 
0.0: 40 | score = 4.71 * (self.analyzedVars['char_cnt'] / self.analyzedVars['word_cnt']) + 0.5 * (self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt']) - 21.43 41 | return score 42 | 43 | def FleschReadingEase(self): 44 | score = 0.0 45 | if self.analyzedVars['word_cnt'] > 0.0: 46 | score = 206.835 - (1.015 * (self.analyzedVars['avg_words_p_sentence'])) - (84.6 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt'])) 47 | return round(score, 4) 48 | 49 | def FleschKincaidGradeLevel(self): 50 | score = 0.0 51 | if self.analyzedVars['word_cnt'] > 0.0: 52 | score = 0.39 * (self.analyzedVars['avg_words_p_sentence']) + 11.8 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']) - 15.59 53 | return round(score, 4) 54 | 55 | def GunningFogIndex(self): 56 | score = 0.0 57 | if self.analyzedVars['word_cnt'] > 0.0: 58 | score = 0.4 * ((self.analyzedVars['avg_words_p_sentence']) + (100 * (self.analyzedVars['complex_word_cnt']/self.analyzedVars['word_cnt']))) 59 | return round(score, 4) 60 | 61 | def SMOGIndex(self): 62 | score = 0.0 63 | if self.analyzedVars['word_cnt'] > 0.0: 64 | score = (math.sqrt(self.analyzedVars['complex_word_cnt']*(30/self.analyzedVars['sentence_cnt'])) + 3) 65 | return score 66 | 67 | def ColemanLiauIndex(self): 68 | score = 0.0 69 | if self.analyzedVars['word_cnt'] > 0.0: 70 | score = (5.89*(self.analyzedVars['char_cnt']/self.analyzedVars['word_cnt']))-(30*(self.analyzedVars['sentence_cnt']/self.analyzedVars['word_cnt']))-15.8 71 | return round(score, 4) 72 | 73 | def LIX(self): 74 | longwords = 0.0 75 | score = 0.0 76 | if self.analyzedVars['word_cnt'] > 0.0: 77 | for word in self.analyzedVars['words']: 78 | if len(word) >= 7: 79 | longwords += 1.0 80 | score = self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt'] + float(100 * longwords) / self.analyzedVars['word_cnt'] 81 | return score 82 | 83 | def RIX(self): 84 | longwords = 0.0 85 | score = 0.0 86 | if self.analyzedVars['word_cnt'] > 0.0: 87 | for word in self.analyzedVars['words']: 88 | if len(word) >= 7: 89 | longwords += 1.0 90 | score = longwords / self.analyzedVars['sentence_cnt'] 91 | return score 92 | 93 | 94 | if __name__ == "__main__": 95 | text = """We are close to wrapping up our 10 week Rails Course. This week we will cover a handful of topics commonly encountered in Rails projects. We then wrap up with part 2 of our Reddit on Rails exercise! By now you should be hard at work on your personal projects. The students in the course just presented in front of the class with some live demos and a brief intro to to the problems their app were solving. Maybe set aside some time this week to show someone your progress, block off 5 minutes and describe what goal you are working towards, the current state of the project (is it almost done, just getting started, needs UI, etc.), and then show them a quick demo of the app. Explain what type of feedback you are looking for (conceptual, design, usability, etc.) and see what they have to say. As we are wrapping up the course you need to be focused on learning as much as you can, but also making sure you have the tools to succeed after the class is over.""" 96 | 97 | rd = Readability(text) 98 | 99 | -------------------------------------------------------------------------------- /Rank_2_Sergazy/syllables_en.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fallback syllable counter 3 | 4 | This is based on the algorithm in Greg Fast's perl module 5 | Lingua::EN::Syllable. 
6 | """ 7 | 8 | import string, re, os 9 | 10 | specialSyllables_en = """tottered 2 11 | chummed 1 12 | peeped 1 13 | moustaches 2 14 | shamefully 3 15 | messieurs 2 16 | satiated 4 17 | sailmaker 4 18 | sheered 1 19 | disinterred 3 20 | propitiatory 6 21 | bepatched 2 22 | particularized 5 23 | caressed 2 24 | trespassed 2 25 | sepulchre 3 26 | flapped 1 27 | hemispheres 3 28 | pencilled 2 29 | motioned 2 30 | poleman 2 31 | slandered 2 32 | sombre 2 33 | etc 4 34 | sidespring 2 35 | mimes 1 36 | effaces 2 37 | mr 2 38 | mrs 2 39 | ms 1 40 | dr 2 41 | st 1 42 | sr 2 43 | jr 2 44 | truckle 2 45 | foamed 1 46 | fringed 2 47 | clattered 2 48 | capered 2 49 | mangroves 2 50 | suavely 2 51 | reclined 2 52 | brutes 1 53 | effaced 2 54 | quivered 2 55 | h'm 1 56 | veriest 3 57 | sententiously 4 58 | deafened 2 59 | manoeuvred 3 60 | unstained 2 61 | gaped 1 62 | stammered 2 63 | shivered 2 64 | discoloured 3 65 | gravesend 2 66 | 60 2 67 | lb 1 68 | unexpressed 3 69 | greyish 2 70 | unostentatious 5 71 | """ 72 | 73 | fallback_cache = {} 74 | 75 | fallback_subsyl = ["cial", "tia", "cius", "cious", "gui", "ion", "iou", 76 | "sia$", ".ely$"] 77 | 78 | fallback_addsyl = ["ia", "riet", "dien", "iu", "io", "ii", 79 | "[aeiouy]bl$", "mbl$", 80 | "[aeiou]{3}", 81 | "^mc", "ism$", 82 | "(.)(?!\\1)([aeiouy])\\2l$", 83 | "[^l]llien", 84 | "^coad.", "^coag.", "^coal.", "^coax.", 85 | "(.)(?!\\1)[gq]ua(.)(?!\\2)[aeiou]", 86 | "dnt$"] 87 | 88 | 89 | # Compile our regular expressions 90 | for i in range(len(fallback_subsyl)): 91 | fallback_subsyl[i] = re.compile(fallback_subsyl[i]) 92 | for i in range(len(fallback_addsyl)): 93 | fallback_addsyl[i] = re.compile(fallback_addsyl[i]) 94 | 95 | def _normalize_word(word): 96 | return word.strip().lower() 97 | 98 | # Read our syllable override file and stash that info in the cache 99 | for line in specialSyllables_en.splitlines(): 100 | line = line.strip() 101 | if line: 102 | toks = line.split() 103 | assert len(toks) == 2 104 | fallback_cache[_normalize_word(toks[0])] = int(toks[1]) 105 | 106 | def count(word): 107 | word = _normalize_word(word) 108 | if not word: 109 | return 0 110 | 111 | # Check for a cached syllable count 112 | count = fallback_cache.get(word, -1) 113 | if count > 0: 114 | return count 115 | 116 | # Remove final silent 'e' 117 | if word[-1] == "e": 118 | word = word[:-1] 119 | 120 | # Count vowel groups 121 | count = 0 122 | prev_was_vowel = 0 123 | for c in word: 124 | is_vowel = c in ("a", "e", "i", "o", "u", "y") 125 | if is_vowel and not prev_was_vowel: 126 | count += 1 127 | prev_was_vowel = is_vowel 128 | 129 | # Add & subtract syllables 130 | for r in fallback_addsyl: 131 | if r.search(word): 132 | count += 1 133 | for r in fallback_subsyl: 134 | if r.search(word): 135 | count -= 1 136 | 137 | # Cache the syllable count 138 | fallback_cache[word] = count 139 | 140 | return count 141 | 142 | -------------------------------------------------------------------------------- /Rank_2_Sergazy/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | utility functions for breaking down a given block of text 3 | into it's component syntactic parts. 
4 | """ 5 | 6 | import nltk 7 | 8 | from nltk.tokenize import RegexpTokenizer 9 | import syllables_en 10 | 11 | TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+') 12 | SPECIAL_CHARS = ['.', ',', '!', '?'] 13 | 14 | def get_char_count(words): 15 | characters = 0 16 | for word in words: 17 | characters += len(word) 18 | return characters 19 | 20 | 21 | def get_words(text=''): 22 | words = [] 23 | words = TOKENIZER.tokenize(text) 24 | filtered_words = [] 25 | for word in words: 26 | if word in SPECIAL_CHARS or word == " ": 27 | pass 28 | else: 29 | new_word = word.replace(",","").replace(".","") 30 | new_word = new_word.replace("!","").replace("?","") 31 | filtered_words.append(new_word) 32 | return filtered_words 33 | 34 | def get_sentences(text=''): 35 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 36 | sentences = tokenizer.tokenize(text) 37 | return sentences 38 | 39 | def count_syllables(words): 40 | syllableCount = 0 41 | for word in words: 42 | syllableCount += syllables_en.count(word) 43 | return syllableCount 44 | 45 | #This method must be enhanced. At the moment it only 46 | #considers the number of syllables in a word. 47 | #This often results in that too many complex words are detected. 48 | def count_complex_words(text=''): 49 | words = get_words(text) 50 | sentences = get_sentences(text) 51 | complex_words = 0 52 | found = False 53 | cur_word = [] 54 | 55 | for word in words: 56 | cur_word.append(word) 57 | if count_syllables(cur_word)>= 3: 58 | 59 | #Checking proper nouns. If a word starts with a capital letter 60 | #and is NOT at the beginning of a sentence we don't add it 61 | #as a complex word. 62 | if not(word[0].isupper()): 63 | complex_words += 1 64 | else: 65 | for sentence in sentences: 66 | if str(sentence).startswith(word): 67 | found = True 68 | break 69 | if found: 70 | complex_words += 1 71 | found = False 72 | 73 | cur_word.remove(word) 74 | return complex_words 75 | 76 | -------------------------------------------------------------------------------- /Rank_2_Sergazy/word2vecUtils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import nltk 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from bs4 import BeautifulSoup 10 | from nltk.corpus import stopwords 11 | from nltk.stem import SnowballStemmer 12 | 13 | class utils(object): 14 | """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning""" 15 | 16 | @staticmethod 17 | def review_to_wordlist( review, remove_stopwords=False ): 18 | 19 | # 20 | # 2. Remove non-letters 21 | review_text = re.sub("[^a-zA-Z]"," ", review) 22 | # 23 | # 3. Convert words to lower case and split them 24 | words = review_text.lower().split() 25 | # 26 | # 4. Optionally remove stop words (false by default) 27 | if remove_stopwords: 28 | stops = set(stopwords.words("english")) 29 | words = [w for w in words if not w in stops] 30 | # 31 | # 5. Return a list of words 32 | return(words) 33 | 34 | # Define a function to split a review into parsed sentences 35 | @staticmethod 36 | def review_to_sentences( review, tokenizer, remove_stopwords=False ): 37 | # Function to split a review into parsed sentences. Returns a 38 | # list of sentences, where each sentence is a list of words 39 | # 40 | # 1. Use the NLTK tokenizer to split the paragraph into sentences 41 | raw_sentences = tokenizer.tokenize(review.decode('utf8').strip()) 42 | # 43 | # 2. 
Loop over each sentence 44 | sentences = [] 45 | for raw_sentence in raw_sentences: 46 | # If a sentence is empty, skip it 47 | if len(raw_sentence) > 0: 48 | # Otherwise, call review_to_wordlist to get a list of words 49 | sentences.append( utils.review_to_wordlist( raw_sentence, \ 50 | remove_stopwords )) 51 | # 52 | # Return the list of sentences (each sentence is a list of words, 53 | # so this returns a list of lists 54 | return sentences 55 | 56 | 57 | @staticmethod 58 | def text_to_wordlist(text, remove_stopwords=False, stem_words=False): 59 | # Clean the text, with the option to remove stopwords and to stem words. 60 | 61 | # Convert words to lower case and split them 62 | text = text.lower().split() 63 | 64 | # Optionally, remove stop words 65 | if remove_stopwords: 66 | stops = set(stopwords.words("english")) 67 | text = [w for w in text if not w in stops] 68 | 69 | text = " ".join(text) 70 | 71 | # Clean the text 72 | text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) 73 | text = re.sub(r"what's", "what is ", text) 74 | text = re.sub(r"\'s", " ", text) 75 | text = re.sub(r"\'ve", " have ", text) 76 | text = re.sub(r"can't", "cannot ", text) 77 | text = re.sub(r"n't", " not ", text) 78 | text = re.sub(r"i'm", "i am ", text) 79 | text = re.sub(r"\'re", " are ", text) 80 | text = re.sub(r"\'d", " would ", text) 81 | text = re.sub(r"\'ll", " will ", text) 82 | text = re.sub(r",", " ", text) 83 | text = re.sub(r"\.", " ", text) 84 | text = re.sub(r"!", " ! ", text) 85 | text = re.sub(r"\/", " ", text) 86 | text = re.sub(r"\^", " ^ ", text) 87 | text = re.sub(r"\+", " + ", text) 88 | text = re.sub(r"\-", " - ", text) 89 | text = re.sub(r"\=", " = ", text) 90 | text = re.sub(r"'", " ", text) 91 | text = re.sub(r"(\d+)(k)", r"\g<1>000", text) 92 | text = re.sub(r":", " : ", text) 93 | text = re.sub(r" e g ", " eg ", text) 94 | text = re.sub(r" b g ", " bg ", text) 95 | text = re.sub(r" u s ", " american ", text) 96 | text = re.sub(r"\0s", "0", text) 97 | text = re.sub(r" 9 11 ", "911", text) 98 | text = re.sub(r"e - mail", "email", text) 99 | text = re.sub(r"j k", "jk", text) 100 | text = re.sub(r"\s{2,}", " ", text) 101 | 102 | # Optionally, shorten words to their stems 103 | if stem_words: 104 | text = text.split() 105 | stemmer = SnowballStemmer('english') 106 | stemmed_words = [stemmer.stem(word) for word in text] 107 | text = " ".join(stemmed_words) 108 | 109 | # Return a list of words 110 | return(text) 111 | 112 | -------------------------------------------------------------------------------- /XGBoost_Python_TextFeats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is created and shared to help people learn and understand the process of solving a problem which involves text variables. Apart from creating new variables, you'll learn to extract ~650 text (count) features and use them in training a xgboost model. This script scores ~0.70 on public leaderboard.\n", 15 | "For any questions, feel free to raise issues." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# load libraries\n", 27 | "\n", 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "import re\n", 31 | "import datetime\n", 32 | "from nltk.corpus import stopwords\n", 33 | "from sklearn.preprocessing import LabelEncoder\n", 34 | "from nltk.stem.snowball import SnowballStemmer\n", 35 | "from sklearn.feature_extraction.text import CountVectorizer\n", 36 | "import xgboost as xgb\n", 37 | "\n", 38 | "pd.set_option('display.max_colwidth',100)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 177, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "#load data\n", 50 | "train = pd.read_csv('train.csv')\n", 51 | "test = pd.read_csv('test.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 98, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# convert unix time format\n", 63 | "unix_cols = ['deadline','state_changed_at','launched_at','created_at']\n", 64 | "\n", 65 | "for x in unix_cols:\n", 66 | " train[x] = train[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))\n", 67 | " test[x] = test[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Some features" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 99, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "cols_to_use = ['name','desc']\n", 86 | "len_feats = ['name_len','desc_len']\n", 87 | "count_feats = ['name_count','desc_count']\n", 88 | "\n", 89 | "for i in np.arange(2):\n", 90 | " train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)\n", 91 | " test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 100, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "train['name_count'] = train['name'].str.split().str.len()\n", 103 | "train['desc_count'] = train['desc'].str.split().str.len()\n", 104 | "\n", 105 | "test['name_count'] = test['name'].str.split().str.len()\n", 106 | "test['desc_count'] = test['desc'].str.split().str.len()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 101, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "train['keywords_len'] = train['keywords'].str.len()\n", 118 | "train['keywords_count'] = train['keywords'].str.split('-').str.len()\n", 119 | "\n", 120 | "test['keywords_len'] = test['keywords'].str.len()\n", 121 | "test['keywords_count'] = test['keywords'].str.split('-').str.len()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Some more features" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 102, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "# converting string variables to datetime\n", 140 | "unix_cols = ['deadline','state_changed_at','launched_at','created_at']\n", 141 | "\n", 142 | "for x in unix_cols:\n", 143 | " train[x] = train[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))\n", 144 | " 
test[x] = test[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 103, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "# there should be simpler way - might take longer\n", 156 | "# creating list with time difference between 1) launched_at and created_at 2) deadline and launched_at\n", 157 | "\n", 158 | "time1 = []\n", 159 | "time3 = []\n", 160 | "for i in np.arange(train.shape[0]):\n", 161 | " time1.append(np.round((train.loc[i, 'launched_at'] - train.loc[i, 'created_at']).total_seconds()).astype(int))\n", 162 | " time3.append(np.round((train.loc[i, 'deadline'] - train.loc[i, 'launched_at']).total_seconds()).astype(int))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 104, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "train['time1'] = np.log(time1)\n", 174 | "train['time3'] = np.log(time3)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 105, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "# for test data\n", 186 | "time5 = []\n", 187 | "time6 = []\n", 188 | "for i in np.arange(test.shape[0]):\n", 189 | " time5.append(np.round((test.loc[i, 'launched_at'] - test.loc[i, 'created_at']).total_seconds()).astype(int))\n", 190 | " time6.append(np.round((test.loc[i, 'deadline'] - test.loc[i, 'launched_at']).total_seconds()).astype(int))" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 106, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "test['time1'] = np.log(time5)\n", 202 | "test['time3'] = np.log(time6)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 107, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "feat = ['disable_communication','country']\n", 214 | "\n", 215 | "for x in feat:\n", 216 | " le = LabelEncoder()\n", 217 | " le.fit(list(train[x].values) + list(test[x].values))\n", 218 | " train[x] = le.transform(list(train[x]))\n", 219 | " test[x] = le.transform(list(test[x]))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 109, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "train['goal'] = np.log1p(train['goal'])\n", 231 | "test['goal'] = np.log1p(test['goal'])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Text Cleaning" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 110, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "# creating a full list of descriptions from train and etst\n", 250 | "kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 111, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "# this function cleans punctuations, digits and irregular tabs. 
Then converts the sentences to lower\n", 262 | "def desc_clean(word):\n", 263 | " p1 = re.sub(pattern='(\\W+)|(\\d+)|(\\s+)',repl=' ',string=word)\n", 264 | " p1 = p1.lower()\n", 265 | " return p1\n", 266 | "\n", 267 | "kickdesc = kickdesc.map(desc_clean)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 113, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "stop = set(stopwords.words('english'))\n", 279 | "kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]\n", 280 | "\n", 281 | "stemmer = SnowballStemmer(language='english')\n", 282 | "kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]\n", 283 | "\n", 284 | "kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]\n", 285 | "\n", 286 | "kickdesc = [' '.join(x) for x in kickdesc]" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Creating Count Features" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 147, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "# Due to memory error, limited the number of features to 650\n", 305 | "cv = CountVectorizer(max_features=650)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 148, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "alldesc = cv.fit_transform(kickdesc).todense()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 150, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "#create a data frame\n", 328 | "combine = pd.DataFrame(alldesc)\n", 329 | "combine.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 157, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "#split the text features\n", 341 | "\n", 342 | "train_text = combine[:train.shape[0]]\n", 343 | "test_text = combine[train.shape[0]:]\n", 344 | "\n", 345 | "test_text.reset_index(drop=True,inplace=True)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "### Finalizing train and test data before merging" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 162, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','time1','time3','goal']" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 198, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "target = train['final_status']" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 168, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "train = train.loc[:,cols_to_use]\n", 386 | "test = test.loc[:,cols_to_use]" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 174, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "X_train = pd.concat([train, train_text],axis=1)\n", 398 | "X_test = pd.concat([test, test_text],axis=1)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 175, 404 | "metadata": { 405 | "collapsed": false 
406 | }, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "(108129, 659)\n", 413 | "(63465, 659)\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "print X_train.shape\n", 419 | "print X_test.shape" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Model Training" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 180, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "dtrain = xgb.DMatrix(data=X_train, label = target)\n", 438 | "dtest = xgb.DMatrix(data=X_test)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 185, 444 | "metadata": { 445 | "collapsed": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "params = {\n", 450 | " 'objective':'binary:logistic',\n", 451 | " 'eval_metric':'error',\n", 452 | " 'eta':0.025,\n", 453 | " 'max_depth':6,\n", 454 | " 'subsample':0.7,\n", 455 | " 'colsample_bytree':0.7,\n", 456 | " 'min_child_weight':5\n", 457 | " \n", 458 | "}" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 186, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "[0]\ttrain-error:0.312506+0.000818869\ttest-error:0.315126+0.00200614\n", 473 | "[10]\ttrain-error:0.309036+0.00111051\ttest-error:0.312694+0.00309961\n", 474 | "[20]\ttrain-error:0.308571+0.00121385\ttest-error:0.311917+0.00244751\n", 475 | "[30]\ttrain-error:0.307829+0.00111239\ttest-error:0.311871+0.00226141\n", 476 | "[40]\ttrain-error:0.306412+0.000131922\ttest-error:0.310502+0.00297732\n", 477 | "[50]\ttrain-error:0.3044+0.00051706\ttest-error:0.308846+0.00239344\n", 478 | "[60]\ttrain-error:0.302541+0.000442086\ttest-error:0.307773+0.00231223\n", 479 | "[70]\ttrain-error:0.301077+0.000528266\ttest-error:0.306812+0.00204724\n", 480 | "[80]\ttrain-error:0.29982+0.000589525\ttest-error:0.305988+0.00195492\n", 481 | "[90]\ttrain-error:0.298257+0.000508031\ttest-error:0.305064+0.00191186\n", 482 | "[100]\ttrain-error:0.297228+0.000279615\ttest-error:0.303723+0.00173837\n", 483 | "[110]\ttrain-error:0.296349+0.000327545\ttest-error:0.303223+0.00172593\n", 484 | "[120]\ttrain-error:0.295457+0.00028638\ttest-error:0.302289+0.00191928\n", 485 | "[130]\ttrain-error:0.294583+0.000378843\ttest-error:0.301623+0.00154688\n", 486 | "[140]\ttrain-error:0.293783+0.000438995\ttest-error:0.300948+0.00175078\n", 487 | "[150]\ttrain-error:0.292814+0.000369126\ttest-error:0.300467+0.00155916\n", 488 | "[160]\ttrain-error:0.292169+0.000444825\ttest-error:0.299968+0.00178122\n", 489 | "[170]\ttrain-error:0.291249+0.00034968\ttest-error:0.299459+0.00213267\n", 490 | "[180]\ttrain-error:0.290553+0.000387982\ttest-error:0.298793+0.00213854\n", 491 | "[190]\ttrain-error:0.28976+0.00040561\ttest-error:0.298321+0.00204807\n", 492 | "[200]\ttrain-error:0.289114+0.000467604\ttest-error:0.297748+0.00208411\n", 493 | "[210]\ttrain-error:0.288467+0.000442353\ttest-error:0.297507+0.00212083\n", 494 | "[220]\ttrain-error:0.287843+0.000490146\ttest-error:0.297008+0.00234605\n", 495 | "[230]\ttrain-error:0.287285+0.000393046\ttest-error:0.296879+0.00223639\n", 496 | "[240]\ttrain-error:0.286751+0.000357893\ttest-error:0.296574+0.00248519\n", 497 | "[250]\ttrain-error:0.286134+0.000314877\ttest-error:0.296009+0.00236468\n", 498 | 
"[260]\ttrain-error:0.285519+0.000468298\ttest-error:0.295843+0.00219033\n", 499 | "[270]\ttrain-error:0.284932+0.000440225\ttest-error:0.295658+0.00224829\n", 500 | "[280]\ttrain-error:0.28452+0.000440191\ttest-error:0.295399+0.0023164\n", 501 | "[290]\ttrain-error:0.283933+0.000573286\ttest-error:0.295436+0.00240771\n", 502 | "[300]\ttrain-error:0.28351+0.000553644\ttest-error:0.295094+0.00255344\n", 503 | "[310]\ttrain-error:0.283059+0.00042736\ttest-error:0.294622+0.00241165\n", 504 | "[320]\ttrain-error:0.282467+0.000447172\ttest-error:0.294243+0.00222645\n", 505 | "[330]\ttrain-error:0.281928+0.000534553\ttest-error:0.294298+0.0023867\n", 506 | "[340]\ttrain-error:0.281459+0.000575282\ttest-error:0.293965+0.0023361\n", 507 | "[350]\ttrain-error:0.28105+0.000749895\ttest-error:0.293947+0.00236084\n", 508 | "[360]\ttrain-error:0.280581+0.000750061\ttest-error:0.293669+0.00230567\n", 509 | "[370]\ttrain-error:0.280201+0.000686446\ttest-error:0.293632+0.00227019\n", 510 | "[380]\ttrain-error:0.279804+0.000762174\ttest-error:0.293457+0.00198204\n", 511 | "[390]\ttrain-error:0.279177+0.000770605\ttest-error:0.293401+0.00211599\n", 512 | "[400]\ttrain-error:0.278974+0.00074798\ttest-error:0.293438+0.00208914\n", 513 | "[410]\ttrain-error:0.278409+0.000716594\ttest-error:0.293207+0.00205472\n", 514 | "[420]\ttrain-error:0.278042+0.000809643\ttest-error:0.293078+0.00229264\n", 515 | "[430]\ttrain-error:0.27773+0.000680744\ttest-error:0.293078+0.00194402\n", 516 | "[440]\ttrain-error:0.277392+0.00069521\ttest-error:0.292957+0.00195425\n", 517 | "[450]\ttrain-error:0.276805+0.000554099\ttest-error:0.292754+0.00202523\n", 518 | "[460]\ttrain-error:0.276335+0.000462337\ttest-error:0.292356+0.00195339\n", 519 | "[470]\ttrain-error:0.276046+0.000483488\ttest-error:0.292171+0.00215688\n", 520 | "[480]\ttrain-error:0.275612+0.000481538\ttest-error:0.292153+0.00231926\n", 521 | "[490]\ttrain-error:0.275316+0.000540829\ttest-error:0.29206+0.0023093\n", 522 | "[500]\ttrain-error:0.274876+0.000555857\ttest-error:0.291893+0.00216299\n", 523 | "[510]\ttrain-error:0.274601+0.000543743\ttest-error:0.291727+0.0022489\n", 524 | "[520]\ttrain-error:0.274345+0.000556289\ttest-error:0.291672+0.00212988\n", 525 | "[530]\ttrain-error:0.273884+0.000640932\ttest-error:0.291671+0.002083\n", 526 | "[540]\ttrain-error:0.273445+0.000572263\ttest-error:0.291431+0.00215843\n", 527 | "[550]\ttrain-error:0.27307+0.000643974\ttest-error:0.291533+0.00208399\n", 528 | "[560]\ttrain-error:0.272839+0.000715068\ttest-error:0.291367+0.0021741\n", 529 | "[570]\ttrain-error:0.272474+0.000693709\ttest-error:0.291145+0.00218187\n", 530 | "[580]\ttrain-error:0.272116+0.000735978\ttest-error:0.291061+0.00239614\n", 531 | "[590]\ttrain-error:0.27172+0.000671488\ttest-error:0.291052+0.00220047\n", 532 | "[600]\ttrain-error:0.271392+0.000581353\ttest-error:0.291061+0.00205433\n", 533 | "[610]\ttrain-error:0.270997+0.000704158\ttest-error:0.291034+0.00215672\n", 534 | "[620]\ttrain-error:0.27073+0.00065256\ttest-error:0.290978+0.00208651\n", 535 | "[630]\ttrain-error:0.270305+0.00058566\ttest-error:0.290876+0.00236142\n", 536 | "[640]\ttrain-error:0.269984+0.000583791\ttest-error:0.290756+0.00241029\n", 537 | "[650]\ttrain-error:0.269609+0.000637878\ttest-error:0.290543+0.00218322\n", 538 | "[660]\ttrain-error:0.269343+0.000666656\ttest-error:0.290432+0.00199139\n", 539 | "[670]\ttrain-error:0.268943+0.000610928\ttest-error:0.290303+0.00202394\n", 540 | "[680]\ttrain-error:0.268562+0.000486423\ttest-error:0.290025+0.00214464\n", 541 | 
"[690]\ttrain-error:0.268263+0.000557561\ttest-error:0.290072+0.0020426\n", 542 | "[700]\ttrain-error:0.267801+0.000551476\ttest-error:0.289942+0.00195377\n", 543 | "[710]\ttrain-error:0.267494+0.000522724\ttest-error:0.289942+0.00203917\n", 544 | "[720]\ttrain-error:0.267221+0.000533431\ttest-error:0.290229+0.00199579\n", 545 | "[730]\ttrain-error:0.266888+0.000576349\ttest-error:0.289979+0.00215134\n", 546 | "[740]\ttrain-error:0.266578+0.000550324\ttest-error:0.289794+0.00203336\n", 547 | "[750]\ttrain-error:0.266263+0.000577213\ttest-error:0.289877+0.00193757\n", 548 | "[760]\ttrain-error:0.266023+0.000512068\ttest-error:0.289794+0.00204126\n", 549 | "[770]\ttrain-error:0.265692+0.000416448\ttest-error:0.289618+0.00217298\n", 550 | "[780]\ttrain-error:0.26532+0.000458632\ttest-error:0.289387+0.0019424\n", 551 | "[790]\ttrain-error:0.26507+0.000483665\ttest-error:0.289119+0.00195537\n", 552 | "[800]\ttrain-error:0.26483+0.000291451\ttest-error:0.289064+0.00178512\n", 553 | "[810]\ttrain-error:0.264453+0.00026856\ttest-error:0.288814+0.00180906\n", 554 | "[820]\ttrain-error:0.26431+0.000318559\ttest-error:0.288823+0.00181794\n", 555 | "[830]\ttrain-error:0.264018+0.000304194\ttest-error:0.288851+0.00188464\n", 556 | "[840]\ttrain-error:0.263632+0.000370743\ttest-error:0.288694+0.00201019\n", 557 | "[850]\ttrain-error:0.263352+0.000371047\ttest-error:0.288518+0.00189282\n", 558 | "[860]\ttrain-error:0.262953+0.000410568\ttest-error:0.288536+0.00195185\n", 559 | "[870]\ttrain-error:0.26261+0.000301595\ttest-error:0.288472+0.00210928\n", 560 | "[880]\ttrain-error:0.262349+0.000334279\ttest-error:0.288361+0.00204547\n", 561 | "[890]\ttrain-error:0.262137+0.000331987\ttest-error:0.288176+0.0019154\n", 562 | "[900]\ttrain-error:0.261792+0.00027822\ttest-error:0.28825+0.00200458\n", 563 | "[910]\ttrain-error:0.261489+0.000354748\ttest-error:0.287972+0.00209926\n", 564 | "[920]\ttrain-error:0.261239+0.000327636\ttest-error:0.28825+0.00186055\n", 565 | "[930]\ttrain-error:0.260909+0.000239154\ttest-error:0.287963+0.00191734\n", 566 | "[940]\ttrain-error:0.260596+0.000299306\ttest-error:0.287954+0.00166505\n", 567 | "[950]\ttrain-error:0.260319+0.000246223\ttest-error:0.287797+0.00174056\n", 568 | "[960]\ttrain-error:0.260134+0.000339131\ttest-error:0.287639+0.00181021\n", 569 | "[970]\ttrain-error:0.25991+0.000288789\ttest-error:0.287648+0.00182037\n", 570 | "[980]\ttrain-error:0.259715+0.000294464\ttest-error:0.287695+0.00192808\n", 571 | "[990]\ttrain-error:0.259422+0.000406691\ttest-error:0.287565+0.0019653\n" 572 | ] 573 | } 574 | ], 575 | "source": [ 576 | "# You can probably get better accuracy with rounds > 1000. 
\n", 577 | "bst = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=40,nfold=5L,verbose_eval=10)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 187, 583 | "metadata": { 584 | "collapsed": true 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "bst_train = xgb.train(params, dtrain, num_boost_round=1000)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 188, 594 | "metadata": { 595 | "collapsed": true 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "p_test = bst_train.predict(dtest)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 189, 605 | "metadata": { 606 | "collapsed": true 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "sub = pd.DataFrame()\n", 611 | "sub['project_id'] = test['project_id']\n", 612 | "sub['final_status'] = p_test" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 194, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [], 622 | "source": [ 623 | "sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 196, 629 | "metadata": { 630 | "collapsed": true 631 | }, 632 | "outputs": [], 633 | "source": [ 634 | "sub.to_csv(\"xgb_with_python_feats.csv\",index=False) #0.70" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": null, 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "outputs": [], 644 | "source": [] 645 | } 646 | ], 647 | "metadata": { 648 | "kernelspec": { 649 | "display_name": "Python 2", 650 | "language": "python", 651 | "name": "python2" 652 | }, 653 | "language_info": { 654 | "codemirror_mode": { 655 | "name": "ipython", 656 | "version": 2 657 | }, 658 | "file_extension": ".py", 659 | "mimetype": "text/x-python", 660 | "name": "python", 661 | "nbconvert_exporter": "python", 662 | "pygments_lexer": "ipython2", 663 | "version": "2.7.13" 664 | } 665 | }, 666 | "nbformat": 4, 667 | "nbformat_minor": 2 668 | } 669 | -------------------------------------------------------------------------------- /xgboost_starter.R: -------------------------------------------------------------------------------- 1 | 2 | # This script will help you learn how to build a xgboost models on features extracted using 3 | # Text Mining methods. This script scores ~0.70 on public leaderboard. 
4 | 5 | 6 | # Load Libraries ---------------------------------------------------------- 7 | 8 | library(data.table) 9 | library(stringr) 10 | library(text2vec) 11 | 12 | train <- fread("train.csv") 13 | test <- fread("test.csv") 14 | 15 | 16 | # Convert Unix Time Format ------------------------------------------------ 17 | 18 | unix_feats <- c('deadline','state_changed_at','created_at','launched_at') 19 | train[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats] 20 | test[,c(unix_feats) := lapply(.SD, function(x) structure(x, class=c('POSIXct'))), .SDcols = unix_feats] 21 | 22 | 23 | # Create Features --------------------------------------------------------- 24 | 25 | len_feats <- c('name_len','desc_len','keywords_len') 26 | count_feats <- c('name_count','desc_count','keywords_count') 27 | cols <- c('name','desc','keywords') 28 | 29 | train[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols] 30 | train[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols] 31 | 32 | test[,c(len_feats) := lapply(.SD, function(x) str_count(x)), .SDcols = cols] 33 | test[,c(count_feats) := lapply(.SD, function(x) str_count(x,"\\w+")), .SDcols = cols] 34 | 35 | 36 | # Some More Features ------------------------------------------------------ 37 | 38 | train[,time1 := as.numeric(difftime(launched_at, created_at))] 39 | train[,time3 := as.numeric(difftime(deadline, launched_at))] 40 | 41 | train[,time1 := log(time1)] 42 | train[,time3 := log(time3)] 43 | 44 | test[,time1 := as.numeric(difftime(launched_at, created_at))] 45 | test[,time3 := as.numeric(difftime(deadline, launched_at))] 46 | 47 | test[,time1 := log(time1)] 48 | test[,time3 := log(time3)] 49 | 50 | 51 | 52 | # Encoding Variables ------------------------------------------------------ 53 | 54 | train[,disable_communication := as.integer(as.factor(disable_communication))-1] 55 | test[,disable_communication := as.integer(as.factor(disable_communication))-1] 56 | 57 | countryall <- data.table(country = append(train$country, test$country)) 58 | countryall[,country := as.integer(as.factor(country))-1] 59 | 60 | country_train <- countryall[1:nrow(train)] 61 | country_test <- countryall[(nrow(train)+1):nrow(countryall)] 62 | 63 | train[,country := NULL][,country := country_train$country] 64 | test[,country := NULL][, country := country_test$country] 65 | 66 | train[,goal := log1p(goal)] 67 | test[,goal := log1p(goal)] 68 | 69 | rm(country_test,country_train,countryall) 70 | gc() 71 | 72 | 73 | 74 | # Creating Features from 'Keywords' Variable ------------------------------ 75 | 76 | # We could have use a R package to perform the following text mining steps. 
77 | # Rather we'll follow a manual cleaning process which will help you learn using regular expressions as well
78 | 
79 | #creating a data frame by combining keywords from both data sets
80 | fullkey <- rbind(train[,.(project_id,keywords)], test[,.(project_id, keywords)])
81 | 
82 | 
83 | 
84 | # Text Cleaning -----------------------------------------------------------
85 | 
86 | fullkey[,keywords := lapply(keywords, function(x) str_split(string = x, pattern = "-"))]
87 | 
88 | # function to remove stop words
89 | remov_stop <- function(x){
90 | 
91 |   t <- unlist(x)
92 |   t <- setdiff(t, tidytext::stop_words$word)
93 |   return (t)
94 | 
95 | }
96 | 
97 | fullkey[,keywords := lapply(keywords, function(x) remov_stop(x))]
98 | fullkey[,keywords := lapply(keywords, function(x) str_replace_all(x, "[[:digit:]]",""))]
99 | fullkey[,keywords := lapply(keywords, function(x) SnowballC::wordStem(x))]
100 | fullkey[, keywords := lapply(keywords, function(x) x[nchar(x) > 2])]
101 | 
102 | 
103 | # creating count corpus
104 | 
105 | vec_train <- itoken(fullkey$keywords,tokenizer = word_tokenizer,ids = fullkey$project_id)
106 | vocab = create_vocabulary(vec_train)
107 | vocab
108 | 
109 | pruned_vocab <- prune_vocabulary(vocab,term_count_min = 150) # words occurring 150 or more times
110 | pruned_vocab
111 | 
112 | vocab1 <- vocab_vectorizer(pruned_vocab)
113 | dtm_text <- create_dtm(vec_train,vocab1)
114 | dim(dtm_text)
115 | 
116 | dtm_text1 <- as.data.table(as.matrix(dtm_text))
117 | 
118 | dtm_train <- dtm_text1[1:108129]
119 | dtm_test <- dtm_text1[108130:171594]
120 | 
121 | 
122 | # Adding text features in train and test data -----------------------------
123 | 
124 | X_train <- copy(train)
125 | X_test <- copy(test)
126 | 
127 | cols_to_use <- c('name_len'
128 |                  ,'desc_len'
129 |                  ,'keywords_len'
130 |                  ,'name_count'
131 |                  ,'desc_count'
132 |                  ,'keywords_count'
133 |                  ,'time1'
134 |                  ,'time3'
135 |                  ,'goal')
136 | 
137 | X_train <- cbind(X_train[,cols_to_use,with=F], dtm_train)
138 | X_test <- cbind(X_test[,cols_to_use,with=F], dtm_test)
139 | 
140 | 
141 | 
142 | 
143 | 
144 | # Model Training ----------------------------------------------------------
145 | 
146 | library(xgboost)
147 | 
148 | dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = as.numeric(train$final_status))
149 | dtest <- xgb.DMatrix(data = as.matrix(X_test))
150 | 
151 | params <- list(
152 | 
153 |   objective = "binary:logistic",
154 |   eta = 0.025,
155 |   max_depth = 6,
156 |   subsample = 0.7,
157 |   colsample_bytree = 0.7,
158 |   min_child_weight = 5
159 | 
160 | )
161 | 
162 | big_cv <- xgb.cv(params = params
163 |                  ,data = dtrain
164 |                  ,nrounds = 1000
165 |                  ,nfold = 5L
166 |                  ,metrics = 'error'
167 |                  ,stratified = T
168 |                  ,print_every_n = 10
169 |                  ,early_stopping_rounds = 40)
170 | 
171 | iter <- big_cv$best_iteration
172 | 
173 | big_train <- xgb.train(params = params
174 |                        ,data = dtrain
175 |                        ,nrounds = iter)
176 | 
177 | imp <- xgb.importance(model = big_train, feature_names = colnames(dtrain))
178 | xgb.plot.importance(imp,top_n = 20)
179 | 
180 | big_pred <- predict(big_train, dtest)
181 | big_pred <- ifelse(big_pred > 0.5,1,0)
182 | 
183 | sub <- data.table(project_id = test$project_id, final_status = big_pred)
184 | fwrite(sub, "xgb_with_feats.csv") #0.70
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
--------------------------------------------------------------------------------