├── .ipynb_checkpoints
│   ├── Basic Code-DM-checkpoint.ipynb
│   ├── Basic Code-checkpoint.ipynb
│   ├── Julia Code-checkpoint.ipynb
│   ├── Naive Bayes - CrossVal-checkpoint.ipynb
│   ├── Naive Bayes Code-checkpoint.ipynb
│   ├── Test_SFO_OAK_FileGeneration-checkpoint.ipynb
│   ├── Untitled0-checkpoint.ipynb
│   ├── Untitled1-checkpoint.ipynb
│   ├── Untitled2-checkpoint.ipynb
│   └── Untitled3-checkpoint.ipynb
├── EDA_and_NB_performance_charts.py
├── INFO290T_Final_Project_Presentation_vFINAL.pptx
├── Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx
├── Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf
├── Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx
├── Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf
├── NB_performance_charts.py
├── Old Python Code
│   ├── Basic Code-DM.ipynb
│   ├── Basic.py
│   ├── Dest.pkl
│   ├── Julia Code.ipynb
│   ├── NB.py
│   ├── Naive Bayes - CrossVal.ipynb
│   ├── Naive Bayes Code.ipynb
│   ├── Origin.pkl
│   ├── TailNum.pkl
│   ├── Test_SFO_OAK_FileGeneration.ipynb
│   ├── UniqueCarrier.pkl
│   ├── Untitled0.ipynb
│   ├── Untitled1.ipynb
│   ├── Untitled2.ipynb
│   ├── Untitled3.ipynb
│   ├── accuracy.pkl
│   ├── counter.py
│   ├── counter1.py
│   ├── data_reader_v2.py
│   ├── data_reader_v3.py
│   ├── data_reader_v4_ek.py
│   ├── date_iterator_plot.py
│   ├── logisticRegression.py
│   ├── matrix.pkl
│   ├── model_selector.py
│   ├── output.txt
│   ├── prec.pkl
│   ├── results.pkl
│   └── why.csv
├── README.md
├── data_reader_v4_ek.py
├── data_reader_v4_ek_rj_csv.py
├── date_graph2.py
├── date_iterator_plot2.py
├── logisticRegression.py
├── lr_app2.py
├── model_selector.py
└── naive bayes.py
/.ipynb_checkpoints/Julia Code-checkpoint.ipynb: --------------------------------------------------------------------------------
1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:da393243e5798034294abbf7f55af08e5d9a8eecbfb718fe2ddd80cd4a4d11b5" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import csv\n", 16 | "import pickle\n", 17 | "\n", 18 | "needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]\n", 19 | "years = [2008]\n", 20 | "\n", 21 | "def ComputeDayofYear(row):\n", 22 | " \"\"\"This function will return an integer to represent the day of the year given an integer\n", 23 | " representing month and an integer representing the day of the month. This number will\n", 24 | " correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned\n", 25 | " as 0.
Feb 29th will be returned as 59.\"\"\"\n", 26 | "\n", 27 | " if(row[0] == '1'):\n", 28 | " calc = 0 + int(row[1]) - 1\n", 29 | " row[1] = str(calc)\n", 30 | " elif(row[0] == '2'):\n", 31 | " calc = 31 + int(row[1]) - 1\n", 32 | " row[1] = str(calc)\n", 33 | " elif(row[0] == '3'):\n", 34 | " calc = 60 + int(row[1]) - 1\n", 35 | " row[1] = str(calc)\n", 36 | " elif(row[0] == '4'):\n", 37 | " calc = 91 + int(row[1]) - 1\n", 38 | " row[1] = str(calc)\n", 39 | " elif(row[0] == '5'):\n", 40 | " calc = 121 + int(row[1]) - 1\n", 41 | " row[1] = str(calc)\n", 42 | " elif(row[0] == '6'):\n", 43 | " calc = 152 + int(row[1]) - 1\n", 44 | " row[1] = str(calc)\n", 45 | " elif(row[0] == '7'):\n", 46 | " calc = 182 + int(row[1]) - 1\n", 47 | " row[1] = str(calc)\n", 48 | " elif(row[0] == '8'):\n", 49 | " calc = 213 + int(row[1]) - 1\n", 50 | " row[1] = str(calc)\n", 51 | " elif(row[0] == '9'):\n", 52 | " calc = 244 + int(row[1]) - 1\n", 53 | " row[1] = str(calc)\n", 54 | " elif(row[0] == '10'):\n", 55 | " calc = 274 + int(row[1]) - 1\n", 56 | " row[1] = str(calc)\n", 57 | " elif(row[0] == '11'):\n", 58 | " calc = 305 + int(row[1]) - 1\n", 59 | " row[1] = str(calc)\n", 60 | " elif(row[0] == '12'):\n", 61 | " calc = 335 + int(row[1]) - 1\n", 62 | " row[1] = str(calc)\n", 63 | " return row\n", 64 | "\n", 65 | "\n", 66 | "def DiscretizeDepTime(row):\n", 67 | " \"\"\"This function takes a scheduled departure time, classifies the departure time as:\n", 68 | " morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value\n", 69 | " is assumed to be an integer in 24-hour time format. These labels will correspond to\n", 70 | " variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.\n", 71 | " An error time is returned as morning.\"\"\"\n", 72 | "\n", 73 | " if(int(row[3]) <= 559):\n", 74 | " row[3] = '2'\n", 75 | " elif(int(row[3]) >= 600 and int(row[3]) <= 1259):\n", 76 | " row[3] = '0'\n", 77 | " elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):\n", 78 | " row[3] = '1'\n", 79 | " elif(int(row[3]) >= 1800):\n", 80 | " row[3] = '2'\n", 81 | " else:\n", 82 | " row[3] = '0'\n", 83 | " return row\n", 84 | "\n", 85 | "\n", 86 | "def AddDepVar(row):\n", 87 | " \"\"\"This function adds a classification label based on the length of the recorded\n", 88 | " Departure Delay in the data set. It assumes an input integer value of the delay in mins.\n", 89 | " By airline industry standards, flight delays are defined as departure delays greater than\n", 90 | " or equal to 15 minutes. For delayed flights, this variable will have value \"1\".\n", 91 | " For on time flights, it will have value \"0\". 
Default value will be set at \"0\".\"\"\"\n", 92 | "\n", 93 | " if(int(row[6]) >= 15):\n", 94 | " row[6] = '1'\n", 95 | " else:\n", 96 | " row[6] = '0'\n", 97 | " return row\n", 98 | "\n", 99 | "def SaveData(data, pickle_file_name):\n", 100 | " \"\"\"This function pickles each file.\"\"\"\n", 101 | "\n", 102 | " f = open(pickle_file_name, \"wb\")\n", 103 | " pickle.dump(data, f)\n", 104 | " f.close()\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "for i in years:\n", 109 | " data = []\n", 110 | " file_path='C:\\\\data\\\\airline\\\\'+str(i) + '.csv'\n", 111 | " pickle_file_name = 'data' + str(i)\n", 112 | " with open(file_path, 'r') as data_csv:\n", 113 | " csv_reader = csv.reader(data_csv, delimiter=',')\n", 114 | " for row in list(csv_reader):\n", 115 | " if row[21] == '0':\n", 116 | " content = list(row[col] for col in needed_cols)\n", 117 | " content2 = ComputeDayofYear(content)\n", 118 | " content3 = DiscretizeDepTime(content2)\n", 119 | " content4 = AddDepVar(content3)\n", 120 | " data.append(content4)\n", 121 | " SaveData(data, pickle_file_name)" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | } 127 | ], 128 | "metadata": {} 129 | } 130 | ] 131 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Naive Bayes Code-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:4dd7867e8934ba7980fd61f1cdbc7df7ff1cccafc3f287e3da0b94562583a3d7" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import *\n", 20 | "from sklearn.metrics import *\n", 21 | "import os\n", 22 | "import cPickle\n", 23 | "import sys\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from optparse import OptionParser\n", 27 | "from sklearn import metrics, preprocessing\n", 28 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 29 | "from sklearn.ensemble import AdaBoostClassifier\n", 30 | "from sklearn import cross_validation\n", 31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n", 32 | "from sklearn.svm import SVC # support vector machine classifier\n", 33 | "from sklearn.grid_search import GridSearchCV # hyperparameter grid search to find best model parameters\n", 34 | "from sklearn import preprocessing # preprocess string labels into numerics\n", 35 | "from sklearn import *\n", 36 | "from sklearn.metrics import precision_recall_fscore_support\n", 37 | "from sklearn.metrics import classification_report" 38 | ], 39 | "language": "python", 40 | "metadata": {}, 41 | "outputs": [], 42 | "prompt_number": 197 43 | }, 44 | { 45 | "cell_type": "code", 46 | "collapsed": false, 47 | "input": [ 48 | "# Setting up constants\n", 49 | "print \"Setting constants...\"\n", 50 | "\n", 51 | "TRAINING_LINE_NUMBER = 1000000\n", 52 | "YEARS = ['2008']\n", 53 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 54 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 55 | "# YEARS = ['2008']\n", 56 | "\n", 57 | "SKIP_FIRST_LINE = True # To skip the first line, as it's the header\n", 58 | "\n", 59 | "master = []\n", 60 | "print \"Reading into Pandas frame...\"\n", 61 | "try:\n", 62 | " for year in YEARS:\n",
63 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 64 | " print \"\\n\",path\n", 65 | " dfPart = pd.read_csv(\n", 66 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 67 | " u'Year', \n", 68 | " u'Month', \n", 69 | " u'DayofMonth', \n", 70 | " u'DayOfWeek', \n", 71 | " u'UniqueCarrier',\n", 72 | " u'DepTime', \n", 73 | " u'TailNum', \n", 74 | " u'Origin', \n", 75 | " u'Dest', \n", 76 | " u'DepDelay', \n", 77 | "# u'ArrDelay', \n", 78 | " u'Cancelled',\n", 79 | "# u'ArrTime',\n", 80 | "# u'ArrDelay',\n", 81 | "# u'Distance'\n", 82 | " ])\n", 83 | " print len(dfPart)\n", 84 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 85 | " print \"Removed cancelled flights, new length - \",len(dfPart)\n", 86 | " master.append(dfPart)\n", 87 | " print\n", 88 | "except Exception as e:\n", 89 | " print \"Supplemental Data Import failed\", e\n", 90 | "\n", 91 | "dfMaster = pd.concat(master, ignore_index=True)\n", 92 | "master=[]\n", 93 | "dfPart=[]\n", 94 | "\n", 95 | "print \"Total length - \", len(dfMaster)\n", 96 | "del dfMaster['Cancelled']\n", 97 | "\n", 98 | "dfMaster.fillna(0, inplace=True)\n", 99 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 100 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 101 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n", 102 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 103 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 104 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n", 105 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n", 106 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 107 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n", 108 | "\n", 109 | "df = dfMaster\n", 110 | "\n", 111 | "print \"Calculating classification label...\"\n", 112 | "df['label'] = 0\n", 113 | "df.label[df.DepDelay >= 15] = 1\n", 114 | "df.label[df.DepDelay < 15] = 0\n", 115 | "\n", 116 | "df['DepDelay'][df.DepDelay < 0]=0\n", 117 | "del df['DepDelay']\n", 118 | "# df['ArrDelay'][df.ArrDelay < 0]=0\n", 119 | "\n", 120 | "print \"Dataframe shape - \",df.shape\n", 121 | "print \"Columns -\", df.columns" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "output_type": "stream", 128 | "stream": "stdout", 129 | "text": [ 130 | "Setting constants...\n", 131 | "Reading into Pandas frame...\n", 132 | "\n", 133 | "C:\\data\\airline\\2008.csv\n", 134 | "1000000" 135 | ] 136 | }, 137 | { 138 | "output_type": "stream", 139 | "stream": "stdout", 140 | "text": [ 141 | "\n", 142 | "Removed cancelled flights, new length - " 143 | ] 144 | }, 145 | { 146 | "output_type": "stream", 147 | "stream": "stdout", 148 | "text": [ 149 | " 967867\n", 150 | "\n", 151 | "Total length - " 152 | ] 153 | }, 154 | { 155 | "output_type": "stream", 156 | "stream": "stdout", 157 | "text": [ 158 | " 967867\n", 159 | "Calculating classification label..." 
160 | ] 161 | }, 162 | { 163 | "output_type": "stream", 164 | "stream": "stdout", 165 | "text": [ 166 | "\n", 167 | "Dataframe shape - (967867, 10)\n", 168 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n" 169 | ] 170 | } 171 | ], 172 | "prompt_number": 198 173 | }, 174 | { 175 | "cell_type": "code", 176 | "collapsed": false, 177 | "input": [ 178 | "print \"Converting categorical data to numeric...\"\n", 179 | "for col in set(df.columns):\n", 180 | "# print col, train[col].dtype\n", 181 | " if df[col].dtype == np.dtype('object'):\n", 182 | " print \"Converting...\", col\n", 183 | " if col == 'TailNum':\n", 184 | " s = np.unique(df[col].values)\n", 185 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 186 | "# print TailNum\n", 187 | " if col == 'UniqueCarrier':\n", 188 | " s = np.unique(df[col].values)\n", 189 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 190 | "# print UniqueCarrier\n", 191 | " if col == 'Dest':\n", 192 | " s = np.unique(df[col].values)\n", 193 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 194 | "# print Dest\n", 195 | " if col == 'Origin':\n", 196 | " s = np.unique(df[col].values)\n", 197 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 198 | "# print Origin\n", 199 | "\n", 200 | "\n", 201 | "def getTailNum(inTailNum):\n", 202 | "# print \"In...\",type(inTailNum)\n", 203 | " out = []\n", 204 | " for x, y in inTailNum.iteritems():\n", 205 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 206 | " out.append(TailNum.get_value(y) + 1)\n", 207 | "# print \"final out\", out\n", 208 | " return out\n", 209 | "\n", 210 | "\n", 211 | "def getDest(inDest):\n", 212 | " out = []\n", 213 | " for x, y in inDest.iteritems():\n", 214 | " out.append(Dest.get_value(y) + 1)\n", 215 | " return out\n", 216 | "\n", 217 | "\n", 218 | "def getOrigin(inOrign):\n", 219 | " out = []\n", 220 | " for x, y in inOrign.iteritems():\n", 221 | " out.append(Origin.get_value(y) + 1)\n", 222 | " return out\n", 223 | "\n", 224 | "\n", 225 | "def getCarrier(inCarrier):\n", 226 | " out = []\n", 227 | " for x, y in inCarrier.iteritems():\n", 228 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 229 | " return out\n", 230 | "\n", 231 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 232 | "print \"TailNum completed.\"\n", 233 | "\n", 234 | "df['Dest'] = getDest(df['Dest'])\n", 235 | "print \"Dest completed.\"\n", 236 | "\n", 237 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 238 | "print \"UniqueCarrier completed.\"\n", 239 | "\n", 240 | "df['Origin'] = getOrigin(df['Origin'])\n", 241 | "print \"Origin completed.\"\n", 242 | "\n", 243 | "print \"Conversion to numeric completed.\"\n", 244 | "\n", 245 | "# print \"Pickling converted data...\"\n", 246 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")" 247 | ], 248 | "language": "python", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | "Converting categorical data to numeric...\n", 256 | "Converting... Origin\n", 257 | "Converting..." 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | " UniqueCarrier\n", 265 | "Converting..." 266 | ] 267 | }, 268 | { 269 | "output_type": "stream", 270 | "stream": "stdout", 271 | "text": [ 272 | " Dest\n", 273 | "Converting..." 
274 | ] 275 | }, 276 | { 277 | "output_type": "stream", 278 | "stream": "stdout", 279 | "text": [ 280 | " TailNum\n", 281 | "TailNum completed." 282 | ] 283 | }, 284 | { 285 | "output_type": "stream", 286 | "stream": "stdout", 287 | "text": [ 288 | "\n", 289 | "Dest completed." 290 | ] 291 | }, 292 | { 293 | "output_type": "stream", 294 | "stream": "stdout", 295 | "text": [ 296 | "\n", 297 | "UniqueCarrier completed." 298 | ] 299 | }, 300 | { 301 | "output_type": "stream", 302 | "stream": "stdout", 303 | "text": [ 304 | "\n", 305 | "Origin completed." 306 | ] 307 | }, 308 | { 309 | "output_type": "stream", 310 | "stream": "stdout", 311 | "text": [ 312 | "\n", 313 | "Conversion to numeric completed.\n" 314 | ] 315 | } 316 | ], 317 | "prompt_number": 199 318 | }, 319 | { 320 | "cell_type": "code", 321 | "collapsed": false, 322 | "input": [ 323 | "Origin['SFO'], Origin['OAK']" 324 | ], 325 | "language": "python", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "metadata": {}, 330 | "output_type": "pyout", 331 | "prompt_number": 200, 332 | "text": [ 333 | "(243, 192)" 334 | ] 335 | } 336 | ], 337 | "prompt_number": 200 338 | }, 339 | { 340 | "cell_type": "code", 341 | "collapsed": false, 342 | "input": [ 343 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n", 344 | "\n", 345 | "# add columns to your data frame\n", 346 | "\n", 347 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n", 348 | "\n", 349 | "# define training and test sets\n", 350 | "train = df[df['is_train'] == True]\n", 351 | "test = df[df['is_train'] == False]\n", 352 | "trainTargets = np.array(train['label']).astype(int)\n", 353 | "testTargets = np.array(test['label']).astype(int)\n", 354 | "features = df.columns[0:9]\n", 355 | "\n", 356 | "testSFO = test[test['Dest']==Origin['SFO']]\n", 357 | "print len(testSFO)\n", 358 | "\n", 359 | "testOAK = test[test['Dest']==Origin['OAK']]\n", 360 | "print len(testOAK)\n", 361 | "\n", 362 | "print \"Model fitting and prediction started...\"\n", 363 | "gnb = tree.DecisionTreeClassifier()\n", 364 | "\n", 365 | "# train model\n", 366 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n", 367 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n", 368 | "\n", 369 | "print \"Classification completed.\"" 370 | ], 371 | "language": "python", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "output_type": "stream", 376 | "stream": "stdout", 377 | "text": [ 378 | "Begin classification...75% training, 25% testing, randomly chosen\n", 379 | "887" 380 | ] 381 | }, 382 | { 383 | "output_type": "stream", 384 | "stream": "stdout", 385 | "text": [ 386 | "\n", 387 | "39\n", 388 | "Model fitting and prediction started...\n", 389 | "Classification completed." 
390 | ] 391 | }, 392 | { 393 | "output_type": "stream", 394 | "stream": "stdout", 395 | "text": [ 396 | "\n" 397 | ] 398 | } 399 | ], 400 | "prompt_number": 215 401 | }, 402 | { 403 | "cell_type": "code", 404 | "collapsed": false, 405 | "input": [ 406 | "features" 407 | ], 408 | "language": "python", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "metadata": {}, 413 | "output_type": "pyout", 414 | "prompt_number": 216, 415 | "text": [ 416 | "Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')" 417 | ] 418 | } 419 | ], 420 | "prompt_number": 216 421 | }, 422 | { 423 | "cell_type": "code", 424 | "collapsed": false, 425 | "input": [ 426 | "print \"Calculating metrics...\"\n", 427 | "# test['pred_label'] = y_gnb\n", 428 | "# test.head()\n", 429 | "acc = zip(test['label'], y_gnb)\n", 430 | "match_count = 0\n", 431 | "for i in acc:\n", 432 | " if i[0] == i[1]:\n", 433 | " match_count += 1\n", 434 | "print \"Matches - \", match_count\n", 435 | "print \"Total length - \", len(acc)\n", 436 | "print \"Accuracy:\", float(match_count) / len(acc)" 437 | ], 438 | "language": "python", 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "output_type": "stream", 443 | "stream": "stdout", 444 | "text": [ 445 | "Calculating metrics...\n", 446 | "Matches - " 447 | ] 448 | }, 449 | { 450 | "output_type": "stream", 451 | "stream": "stdout", 452 | "text": [ 453 | " 184048\n", 454 | "Total length - 242386\n", 455 | "Accuracy: 0.75931778238\n" 456 | ] 457 | } 458 | ], 459 | "prompt_number": 217 460 | }, 461 | { 462 | "cell_type": "code", 463 | "collapsed": false, 464 | "input": [ 465 | "print accuracy_score(test['label'],y_gnb)\n", 466 | "print metrics.confusion_matrix(test['label'],y_gnb)" 467 | ], 468 | "language": "python", 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "output_type": "stream", 473 | "stream": "stdout", 474 | "text": [ 475 | "0.75931778238\n", 476 | "[[157152 29405]\n", 477 | " [ 28933 26896]]" 478 | ] 479 | }, 480 | { 481 | "output_type": "stream", 482 | "stream": "stdout", 483 | "text": [ 484 | "\n" 485 | ] 486 | } 487 | ], 488 | "prompt_number": 218 489 | }, 490 | { 491 | "cell_type": "code", 492 | "collapsed": false, 493 | "input": [ 494 | "gnb.feature_importances_" 495 | ], 496 | "language": "python", 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "metadata": {}, 501 | "output_type": "pyout", 502 | "prompt_number": 219, 503 | "text": [ 504 | "array([ 0.
, 0.01151212, 0.0552584 , 0.03722765, 0.28496385,\n", 505 | " 0.07264084, 0.2130565 , 0.16164198, 0.16369866])" 506 | ] 507 | } 508 | ], 509 | "prompt_number": 219 510 | }, 511 | { 512 | "cell_type": "code", 513 | "collapsed": false, 514 | "input": [ 515 | "features" 516 | ], 517 | "language": "python", 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "metadata": {}, 522 | "output_type": "pyout", 523 | "prompt_number": 222, 524 | "text": [ 525 | "Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')" 526 | ] 527 | } 528 | ], 529 | "prompt_number": 222 530 | }, 531 | { 532 | "cell_type": "code", 533 | "collapsed": false, 534 | "input": [ 535 | "# average_precision_score(test['label'],y_gnb)\n", 536 | "precision_recall_fscore_support(test['label'],y_gnb,average='micro')" 537 | ], 538 | "language": "python", 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "metadata": {}, 543 | "output_type": "pyout", 544 | "prompt_number": 223, 545 | "text": [ 546 | "(0.47771798014244865, 0.48175679306453634, 0.47972888611433151, 55829)" 547 | ] 548 | } 549 | ], 550 | "prompt_number": 223 551 | }, 552 | { 553 | "cell_type": "code", 554 | "collapsed": false, 555 | "input": [ 556 | "# dfMaster['FlightDate'] =pd.to_datetime(dfMaster.Year*10000+dfMaster.Month*100+dfMaster.DayofMonth,format='%Y%m%d')" 557 | ], 558 | "language": "python", 559 | "metadata": {}, 560 | "outputs": [], 561 | "prompt_number": 206 562 | }, 563 | { 564 | "cell_type": "code", 565 | "collapsed": false, 566 | "input": [ 567 | "# dfAirport = dfMaster[['FlightDate','Origin']].groupby([dfMaster['FlightDate'],dfMaster['Origin']]).agg([len])\n", 568 | "# # dfAirport.to_clipboard()\n", 569 | "# dfAirport" 570 | ], 571 | "language": "python", 572 | "metadata": {}, 573 | "outputs": [], 574 | "prompt_number": 207 575 | }, 576 | { 577 | "cell_type": "code", 578 | "collapsed": false, 579 | "input": [ 580 | "print y_gnb[:10]\n", 581 | "print y_prob[:10]" 582 | ], 583 | "language": "python", 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "output_type": "stream", 588 | "stream": "stdout", 589 | "text": [ 590 | "[0 0 0 1 1 0 0 0 1 1]\n", 591 | "[[ 1. 0.]\n", 592 | " [ 1. 0.]\n", 593 | " [ 1. 0.]\n", 594 | " [ 0. 1.]\n", 595 | " [ 0. 1.]\n", 596 | " [ 1. 0.]\n", 597 | " [ 1. 0.]\n", 598 | " [ 1. 0.]\n", 599 | " [ 0. 1.]\n", 600 | " [ 0. 
1.]]\n" 601 | ] 602 | } 603 | ], 604 | "prompt_number": 224 605 | }, 606 | { 607 | "cell_type": "code", 608 | "collapsed": false, 609 | "input": [ 610 | "dfMaster[:100].to_csv(\"C:\\\\data\\\\airline\\\\SampleData.csv\")" 611 | ], 612 | "language": "python", 613 | "metadata": {}, 614 | "outputs": [], 615 | "prompt_number": 227 616 | } 617 | ], 618 | "metadata": {} 619 | } 620 | ] 621 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled0-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:44f7be7f1af03eb634d779b3e3fc1b7473ad8af24b380e9e53f9a15ad5274aaf" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import *\n", 20 | "from sklearn.metrics import *\n", 21 | "import os\n", 22 | "import cPickle\n", 23 | "import sys\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from optparse import OptionParser\n", 27 | "from sklearn import metrics, preprocessing\n", 28 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 29 | "from sklearn.ensemble import AdaBoostClassifier\n", 30 | "from sklearn import cross_validation\n", 31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n", 32 | "from sklearn.svm import SVC # support vector machine classifier\n", 33 | "# hyperparameter grid search to find best model parameters\n", 34 | "from sklearn.grid_search import GridSearchCV\n", 35 | "from sklearn import preprocessing # preprocess string labels into numerics\n", 36 | "from sklearn import *\n", 37 | "from sklearn.metrics import precision_recall_fscore_support\n", 38 | "from sklearn.metrics import classification_report\n", 39 | "\n", 40 | "\n", 41 | "# In[135]:\n", 42 | "\n", 43 | "# Setting up constants\n", 44 | "print \"Setting constants...\"\n", 45 | "\n", 46 | "TRAINING_LINE_NUMBER = 500000\n", 47 | "YEARS = ['2006', '2008', '2007']\n", 48 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 49 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 50 | "# YEARS = ['2008']\n", 51 | "\n", 52 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n", 53 | "\n", 54 | "master = []\n", 55 | "print \"Reading into Pandas frame...\"\n", 56 | "try:\n", 57 | " for year in YEARS:\n", 58 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 59 | " print \"\\n\", path\n", 60 | " dfPart = pd.read_csv(\n", 61 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 62 | " u'Year',\n", 63 | " u'Month',\n", 64 | " u'DayofMonth',\n", 65 | " u'DayOfWeek',\n", 66 | " u'UniqueCarrier',\n", 67 | " u'DepTime',\n", 68 | " u'TailNum',\n", 69 | " u'Origin',\n", 70 | " u'Dest',\n", 71 | " u'DepDelay',\n", 72 | " # u'ArrDelay',\n", 73 | " u'Cancelled',\n", 74 | " # u'ArrTime',\n", 75 | " # u'ArrDelay',\n", 76 | " # u'Distance'\n", 77 | " ])\n", 78 | " print len(dfPart)\n", 79 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 80 | " print \"Removed cancelled flights, new length - \", len(dfPart)\n", 81 | " master.append(dfPart)\n", 82 | " print\n", 83 | "except Exception as e:\n", 84 | " print \"Supplemental Data Import failed\", e\n", 85 | "\n", 86 | "dfMaster = pd.concat(master, 
ignore_index=True)\n", 87 | "master = []\n", 88 | "dfPart = []\n", 89 | "\n", 90 | "print \"Total length - \", len(dfMaster)\n", 91 | "del dfMaster['Cancelled']\n", 92 | "\n", 93 | "dfMaster.fillna(0, inplace=True)\n", 94 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 95 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 96 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n", 97 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 98 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 99 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n", 100 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n", 101 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 102 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n", 103 | "\n", 104 | "df = dfMaster\n", 105 | "\n", 106 | "print \"Calculating classification label...\"\n", 107 | "df['label'] = 0\n", 108 | "df.label[df.DepDelay >= 15] = 1\n", 109 | "df.label[df.DepDelay < 15] = 0\n", 110 | "\n", 111 | "# df['DepDelay'][df.DepDelay < 0] = 0\n", 112 | "del df['DepDelay']\n", 113 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n", 114 | "\n", 115 | "print \"Dataframe shape - \", df.shape\n", 116 | "print \"Columns -\", df.columns\n", 117 | "\n", 118 | "\n", 119 | "# In[136]:\n", 120 | "\n", 121 | "print \"Converting categorical data to numeric...\"\n", 122 | "for col in set(df.columns):\n", 123 | "# print col, train[col].dtype\n", 124 | " if df[col].dtype == np.dtype('object'):\n", 125 | " print \"Converting...\", col\n", 126 | " if col == 'TailNum':\n", 127 | " s = np.unique(df[col].values)\n", 128 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 129 | "# print TailNum\n", 130 | " if col == 'UniqueCarrier':\n", 131 | " s = np.unique(df[col].values)\n", 132 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 133 | "# print UniqueCarrier\n", 134 | " if col == 'Dest':\n", 135 | " s = np.unique(df[col].values)\n", 136 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 137 | "# print Dest\n", 138 | " if col == 'Origin':\n", 139 | " s = np.unique(df[col].values)\n", 140 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 141 | "# print Origin\n", 142 | "\n", 143 | "\n", 144 | "def getTailNum(inTailNum):\n", 145 | "# print \"In...\",type(inTailNum)\n", 146 | " out = []\n", 147 | " for x, y in inTailNum.iteritems():\n", 148 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 149 | " out.append(TailNum.get_value(y) + 1)\n", 150 | "# print \"final out\", out\n", 151 | " return out\n", 152 | "\n", 153 | "\n", 154 | "def getDest(inDest):\n", 155 | " out = []\n", 156 | " for x, y in inDest.iteritems():\n", 157 | " out.append(Dest.get_value(y) + 1)\n", 158 | " return out\n", 159 | "\n", 160 | "\n", 161 | "def getOrigin(inOrign):\n", 162 | " out = []\n", 163 | " for x, y in inOrign.iteritems():\n", 164 | " out.append(Origin.get_value(y) + 1)\n", 165 | " return out\n", 166 | "\n", 167 | "\n", 168 | "def getCarrier(inCarrier):\n", 169 | " out = []\n", 170 | " for x, y in inCarrier.iteritems():\n", 171 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 172 | " return out\n", 173 | "\n", 174 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 175 | "print \"TailNum completed.\"\n", 176 | "\n", 177 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 178 | "print \"UniqueCarrier completed.\"\n", 179 | "\n", 180 | "df['Dest'] = getDest(df['Dest'])\n", 181 | "print \"Dest completed.\"\n", 182 | "\n", 
183 | "df['Origin'] = getOrigin(df['Origin'])\n", 184 | "print \"Origin completed.\"\n", 185 | "\n", 186 | "print \"Conversion to numeric completed.\"\n", 187 | "\n", 188 | "# print \"Pickling converted data...\"\n", 189 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")\n" 190 | ], 191 | "language": "python", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "output_type": "stream", 196 | "stream": "stdout", 197 | "text": [ 198 | "Setting constants...\n", 199 | "Reading into Pandas frame...\n", 200 | "\n", 201 | "C:\\data\\airline\\2006.csv\n", 202 | "500000" 203 | ] 204 | }, 205 | { 206 | "output_type": "stream", 207 | "stream": "stdout", 208 | "text": [ 209 | "\n", 210 | "Removed cancelled flights, new length - " 211 | ] 212 | }, 213 | { 214 | "output_type": "stream", 215 | "stream": "stdout", 216 | "text": [ 217 | " 491158\n", 218 | "\n", 219 | "\n", 220 | "C:\\data\\airline\\2008.csv\n", 221 | "500000" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | "\n", 229 | "Removed cancelled flights, new length - " 230 | ] 231 | }, 232 | { 233 | "output_type": "stream", 234 | "stream": "stdout", 235 | "text": [ 236 | " 484708\n", 237 | "\n", 238 | "\n", 239 | "C:\\data\\airline\\2007.csv\n", 240 | "500000" 241 | ] 242 | }, 243 | { 244 | "output_type": "stream", 245 | "stream": "stdout", 246 | "text": [ 247 | "\n", 248 | "Removed cancelled flights, new length - " 249 | ] 250 | }, 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | " 487243\n", 256 | "\n", 257 | "Total length - " 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | " 1463109\n", 265 | "Calculating classification label..." 266 | ] 267 | }, 268 | { 269 | "output_type": "stream", 270 | "stream": "stdout", 271 | "text": [ 272 | "\n", 273 | "Dataframe shape - (1463109, 10)\n", 274 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n", 275 | "Converting categorical data to numeric...\n", 276 | "Converting..." 277 | ] 278 | }, 279 | { 280 | "output_type": "stream", 281 | "stream": "stdout", 282 | "text": [ 283 | " Origin\n", 284 | "Converting..." 285 | ] 286 | }, 287 | { 288 | "output_type": "stream", 289 | "stream": "stdout", 290 | "text": [ 291 | " UniqueCarrier\n", 292 | "Converting..." 293 | ] 294 | }, 295 | { 296 | "output_type": "stream", 297 | "stream": "stdout", 298 | "text": [ 299 | " Dest\n", 300 | "Converting..." 301 | ] 302 | }, 303 | { 304 | "output_type": "stream", 305 | "stream": "stdout", 306 | "text": [ 307 | " TailNum\n", 308 | "TailNum completed." 309 | ] 310 | }, 311 | { 312 | "output_type": "stream", 313 | "stream": "stdout", 314 | "text": [ 315 | "\n", 316 | "UniqueCarrier completed." 317 | ] 318 | }, 319 | { 320 | "output_type": "stream", 321 | "stream": "stdout", 322 | "text": [ 323 | "\n", 324 | "Dest completed." 325 | ] 326 | }, 327 | { 328 | "output_type": "stream", 329 | "stream": "stdout", 330 | "text": [ 331 | "\n", 332 | "Origin completed." 
333 | ] 334 | }, 335 | { 336 | "output_type": "stream", 337 | "stream": "stdout", 338 | "text": [ 339 | "\n", 340 | "Conversion to numeric completed.\n" 341 | ] 342 | } 343 | ], 344 | "prompt_number": 13 345 | }, 346 | { 347 | "cell_type": "code", 348 | "collapsed": false, 349 | "input": [ 350 | "\n", 351 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n", 352 | "\n", 353 | "# add columns to your data frame\n", 354 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n", 355 | "\n", 356 | "# define training and test sets\n", 357 | "train = df[df['is_train'] == True]\n", 358 | "test = df[df['is_train'] == False]\n", 359 | "trainTargets = np.array(train['label']).astype(int)\n", 360 | "testTargets = np.array(test['label']).astype(int)\n", 361 | "features = df.columns[0:9]\n", 362 | "print \"Features - \",features\n", 363 | "print \"Model fitting and prediction started...\"\n", 364 | "gnb = GaussianNB()\n", 365 | "\n", 366 | "# train model\n", 367 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n", 368 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n", 369 | "\n", 370 | "print \"Classification completed.\"" 371 | ], 372 | "language": "python", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "stream": "stdout", 378 | "text": [ 379 | "Begin classification...75% training, 25% testing, randomly chosen\n", 380 | "Features - " 381 | ] 382 | }, 383 | { 384 | "output_type": "stream", 385 | "stream": "stdout", 386 | "text": [ 387 | " Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')\n", 388 | "Model fitting and prediction started...\n", 389 | "Classification completed." 
390 | ] 391 | }, 392 | { 393 | "output_type": "stream", 394 | "stream": "stdout", 395 | "text": [ 396 | "\n", 397 | "Calculating metrics...\n", 398 | "Accuracy - 0.798698653544\n", 399 | "Confusion matrix\n", 400 | "[[291966 106]\n", 401 | " [ 73525 178]]" 402 | ] 403 | }, 404 | { 405 | "output_type": "stream", 406 | "stream": "stdout", 407 | "text": [ 408 | "\n", 409 | "Precision - " 410 | ] 411 | }, 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "0.62676056338\n", 417 | "Recall - " 418 | ] 419 | }, 420 | { 421 | "output_type": "stream", 422 | "stream": "stdout", 423 | "text": [ 424 | "0.00241509843561\n" 425 | ] 426 | } 427 | ], 428 | "prompt_number": 14 429 | }, 430 | { 431 | "cell_type": "code", 432 | "collapsed": false, 433 | "input": [ 434 | "print \"Calculating metrics...\"\n", 435 | "print \"Accuracy - \", accuracy_score(test['label'], y_gnb)\n", 436 | "print \"Confusion matrix\\n\", metrics.confusion_matrix(test['label'], y_gnb,labels=(0,1))\n", 437 | "print \"Precision - \", precision_score(test['label'], y_gnb)\n", 438 | "print \"Recall - \", recall_score(test['label'], y_gnb)\n" 439 | ], 440 | "language": "python", 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "output_type": "stream", 445 | "stream": "stdout", 446 | "text": [ 447 | "Calculating metrics...\n", 448 | "Accuracy - 0.798698653544\n", 449 | "Confusion matrix\n", 450 | "[[291966 106]\n", 451 | " [ 73525 178]]" 452 | ] 453 | }, 454 | { 455 | "output_type": "stream", 456 | "stream": "stdout", 457 | "text": [ 458 | "\n", 459 | "Precision - " 460 | ] 461 | }, 462 | { 463 | "output_type": "stream", 464 | "stream": "stdout", 465 | "text": [ 466 | "0.62676056338\n", 467 | "Recall - " 468 | ] 469 | }, 470 | { 471 | "output_type": "stream", 472 | "stream": "stdout", 473 | "text": [ 474 | "0.00241509843561\n" 475 | ] 476 | } 477 | ], 478 | "prompt_number": 25 479 | }, 480 | { 481 | "cell_type": "code", 482 | "collapsed": false, 483 | "input": [ 484 | "testSFO = test[test['Origin'] == Origin['SFO']]\n", 485 | "print len(testSFO)\n", 486 | "\n", 487 | "testOAK = test[test['Origin'] == Origin['OAK']]\n", 488 | "print len(testOAK)\n" 489 | ], 490 | "language": "python", 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "output_type": "stream", 495 | "stream": "stdout", 496 | "text": [ 497 | "3563\n", 498 | "40\n" 499 | ] 500 | } 501 | ], 502 | "prompt_number": 22 503 | }, 504 | { 505 | "cell_type": "code", 506 | "collapsed": false, 507 | "input": [ 508 | " np.random.randint(2000, size=10)\n", 509 | " " 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "metadata": {}, 516 | "output_type": "pyout", 517 | "prompt_number": 27, 518 | "text": [ 519 | "array([ 437, 1815, 742, 148, 1399, 1171, 205, 1480, 838, 1437])" 520 | ] 521 | } 522 | ], 523 | "prompt_number": 27 524 | } 525 | ], 526 | "metadata": {} 527 | } 528 | ] 529 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d3fd45c3529abf0b735e3b409e8980ec4b2e4e445277ba0cf2522e16729ae159" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import sys\n", 17 | "import csv\n", 18 | "import datetime\n",
"import matplotlib.pyplot as plt; plt.rcdefaults()\n", 20 | "\n", 21 | "TIME_DELTA = 3\n", 22 | "\n", 23 | "for arg in sys.argv:\n", 24 | "\tif(arg != 'date_graph.py'):\n", 25 | "\t\tstart_date = datetime.datetime.strptime(arg, '%m-%d-%y')\n", 26 | "\t\tstart_date = datetime.date(start_date.year, start_date.month, start_date.day)\n", 27 | "\n", 28 | "delta = datetime.timedelta(days=TIME_DELTA)\n", 29 | "begin = start_date - delta\n", 30 | "end = start_date + delta\n", 31 | "\n", 32 | "SFO_Hash = {}\n", 33 | "OAK_Hash = {}\n", 34 | "SFO_count = 0\n", 35 | "OAK_count = 0\n", 36 | "with open('_dfTest2008.csv', 'r') as data:\n", 37 | "\tcsv_reader = csv.reader(data, delimiter=',')\n", 38 | "\tfor row in csv_reader:\n", 39 | "\t\tif(row[0] != 'Year'):\n", 40 | "\t\t\tyear = int(row[0])\n", 41 | "\t\t\tmonth = int(row[1])\n", 42 | "\t\t\tdate = int(row[2])\n", 43 | "\t\t\tcurr_date = datetime.date(year, month, date)\n", 44 | "\t\t\tif(curr_date >= begin and curr_date <= end):\n", 45 | "\t\t\t\torigin = row[7]\n", 46 | "\t\t\t\tif(origin == '270'):\n", 47 | "\t\t\t\t\tlabel = int(row[10])\n", 48 | "\t\t\t\t\tSFO_count += 1\n", 49 | "\t\t\t\t\tif(curr_date not in SFO_Hash):\n", 50 | "\t\t\t\t\t\tSFO_Hash[curr_date] = [label]\n", 51 | "\t\t\t\t\telse:\n", 52 | "\t\t\t\t\t\tSFO_Hash[curr_date].append(label)\t\n", 53 | "\t\t\t\tif(origin == '215'):\n", 54 | "\t\t\t\t\tlabel = int(row[10])\n", 55 | "\t\t\t\t\tOAK_count += 1\n", 56 | "\t\t\t\t\tif(curr_date not in OAK_Hash):\n", 57 | "\t\t\t\t\t\tOAK_Hash[curr_date] = [label]\n", 58 | "\t\t\t\t\telse:\n", 59 | "\t\t\t\t\t\tOAK_Hash[curr_date].append(label)\n", 60 | "\n", 61 | "iterator = datetime.timedelta(days=1)\n", 62 | "day_values = []\n", 63 | "SFO_Delays = []\n", 64 | "SFO_On_Time = []\n", 65 | "SFO_Flights = []\n", 66 | "SFO_Pct = []\n", 67 | "OAK_Delays = []\n", 68 | "OAK_On_Time = []\n", 69 | "OAK_Flights = []\n", 70 | "OAK_Pct = []\n", 71 | "\n", 72 | "while begin <= end:\n", 73 | "\tif(begin not in SFO_Hash):\n", 74 | "\t\tSFO_Delays.append(0)\n", 75 | "\t\tSFO_On_Time.append(0)\n", 76 | "\t\tSFO_Pct.append(0.00)\n", 77 | "\telse:\n", 78 | "\t\tSFO_Flights = SFO_Hash[begin]\n", 79 | "\t\tdelays = sum(SFO_Flights)\n", 80 | "\t\tnum_flights = len(SFO_Flights)\n", 81 | "\t\tpct = float(delays) / (num_flights + delays)\n", 82 | "\t\tSFO_Delays.append(delays)\n", 83 | "\t\tSFO_On_Time.append(num_flights - delays)\n", 84 | "\t\tSFO_Pct.append(pct)\n", 85 | "\t\n", 86 | "\tif(begin not in OAK_Hash):\n", 87 | "\t\tOAK_Delays.append(0)\n", 88 | "\t\tOAK_On_Time.append(0)\n", 89 | "\t\tOAK_Pct.append(0.00)\n", 90 | "\telse:\n", 91 | "\t\tOAK_Flights = OAK_Hash[begin]\n", 92 | "\t\tdelays = sum(OAK_Flights)\n", 93 | "\t\tnum_flights = len(OAK_Flights)\n", 94 | "\t\tpct = float(delays) / (num_flights + delays)\n", 95 | "\t\tOAK_Delays.append(delays)\n", 96 | "\t\tOAK_On_Time.append(num_flights - delays)\n", 97 | "\t\tOAK_Pct.append(pct)\n", 98 | "\t\n", 99 | "\tday_values.append(begin)\n", 100 | "\tbegin += iterator\n", 101 | "\n", 102 | "print SFO_Pct\n", 103 | "print OAK_Pct\n", 104 | "\n", 105 | "plt.title('Probability of Flight Delays at SFO vs. 
OAK Given Specific Date and +/- 3 Days')\n", 106 | "\n", 107 | "ax1 = plt.subplot(211)\n", 108 | "#ax1.bar(day_values, SFO_Delays, bottom = SFO_On_Time, color = 'red')\n", 109 | "#ax1.bar(day_values, SFO_On_Time, color = 'blue')\n", 110 | "ax1.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 111 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n", 112 | "ax1.set_yticks([0, 200, 450])\n", 113 | "ax1.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at SFO')\n", 114 | "\n", 115 | "ax2 = plt.subplot(212)\n", 116 | "#ax2.bar(day_values, OAK_Delays, bottom = OAK_On_Time, color = 'red')\n", 117 | "#ax2.bar(day_values, OAK_On_Time, color = 'blue')\n", 118 | "ax2.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 119 | "ax2.text(start_date, 250, 'Test', fontsize=15)\n", 120 | "ax2.set_yticks([0, 200, 450])\n", 121 | "ax2.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at OAK')\n", 122 | "\n", 123 | "plt.show()" 124 | ], 125 | "language": "python", 126 | "metadata": {}, 127 | "outputs": [] 128 | } 129 | ], 130 | "metadata": {} 131 | } 132 | ] 133 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:43ffcf25a0f9f00fd6bd77f3e24dfb6e62c5a764e70ce742b71da7b69b36310f" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [] 9 | } -------------------------------------------------------------------------------- /EDA_and_NB_performance_charts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import os 5 | from IPython.core.display import HTML 6 | from bokeh.plotting import * 7 | 8 | 9 | # load data into pandas 10 | INPUT_FILE = "C:\\data\\airline\\_dfTest2008.csv" 11 | 12 | SKIP_FIRST_LINE = True 13 | 14 | master = [] 15 | print "Reading into Pandas frame..." 16 | try: 17 | dfPart = pd.read_csv(INPUT_FILE, skiprows=0, usecols=[ # nrows = 2000 18 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier', 19 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'label', u'pred_label' 20 | ]) 21 | print len(dfPart) 22 | master.append(dfPart) 23 | except Exception as e: 24 | print "Data import failed", e 25 | 26 | 27 | dfMaster = pd.concat(master, ignore_index=True) 28 | print "Total length: ", len(dfMaster) 29 | 30 | # change data types 31 | dfMaster['Year'] = dfMaster['Year'].astype('int') 32 | dfMaster['Month'] = dfMaster['Month'].astype('int') 33 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 34 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 35 | dfMaster['UniqueCarrier'] = dfMaster['UniqueCarrier'].astype('int') 36 | dfMaster['TailNum'] = dfMaster['TailNum'].astype('int') 37 | dfMaster['Origin'] = dfMaster['Origin'].astype('int') 38 | dfMaster['Dest'] = dfMaster['Dest'].astype('int') 39 | dfMaster['label'] = dfMaster['label'].astype('int') 40 | dfMaster['pred_label'] = dfMaster['pred_label'].astype('int') 41 | 42 | 43 | df = dfMaster 44 | print "Appending new variables..."
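# Aside -- a minimal sketch, not part of the original script: every accuracy
# rate computed below follows the same pattern,
# groupby(col).accurate.sum() / groupby(col).accurate.count(). Because
# `accurate` is a 0/1 flag (created just below), that ratio is simply the
# group mean, so a hypothetical helper could replace the repetition:
def accuracy_by(frame, col):
    """Share of correctly predicted rows per value of `col` (mean of a 0/1 flag)."""
    return pd.DataFrame(frame.groupby(col)['accurate'].mean(), columns=[col])
# e.g. accuracy_by(dfMaster, 'Month') would mirror df_month_acc below.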
45 | df['accurate'] = 0 46 | df.accurate[df.label == df.pred_label] = 1 47 | df.accurate[df.label <> df.pred_label] = 0 48 | 49 | 50 | df['dep_time'] = 0 51 | df.dep_time[df.DepTime.isin(xrange(700, 1301))] = 1 52 | df.dep_time[df.DepTime.isin(xrange(1300, 1801))] = 2 53 | df.dep_time[df.DepTime.isin(xrange(1800, 2401))] = 3 54 | df.dep_time[df.DepTime.isin(xrange(0, 701))] = 3 55 | 56 | # compute accuracy rates 57 | month_acc = dfMaster.groupby('Month').accurate.sum() / \ 58 | dfMaster.groupby('Month').accurate.count() 59 | df_month_acc = pd.DataFrame(month_acc, columns=[u'Month']) 60 | # print df_month_acc 61 | 62 | day_of_month_acc = dfMaster.groupby( 63 | 'DayofMonth').accurate.sum() / dfMaster.groupby('DayofMonth').accurate.count() 64 | df_day_of_month_acc = pd.DataFrame(day_of_month_acc, columns=[u'DayofMonth']) 65 | # print df_day_of_month_acc 66 | 67 | day_of_week_acc = dfMaster.groupby( 68 | 'DayOfWeek').accurate.sum() / dfMaster.groupby('DayOfWeek').accurate.count() 69 | df_day_of_week_acc = pd.DataFrame(day_of_week_acc, columns=[u'DayOfWeek']) 70 | # print df_day_of_week_acc 71 | 72 | unique_carrier_acc = dfMaster.groupby( 73 | 'UniqueCarrier').accurate.sum() / dfMaster.groupby('UniqueCarrier').accurate.count() 74 | df_unique_carrier_acc = pd.DataFrame( 75 | unique_carrier_acc, columns=[u'UniqueCarrier']) 76 | # print df_unique_carrier_acc 77 | 78 | tail_num_acc = dfMaster.groupby( 79 | 'TailNum').accurate.sum() / dfMaster.groupby('TailNum').accurate.count() 80 | df_tail_num_acc = pd.DataFrame(tail_num_acc, columns=[u'TailNum']) 81 | # print df_tail_num_acc 82 | 83 | origin_acc = dfMaster.groupby('Origin').accurate.sum() / \ 84 | dfMaster.groupby('Origin').accurate.count() 85 | df_origin_acc = pd.DataFrame(origin_acc, columns=[u'Origin']) 86 | # print df_origin_acc 87 | 88 | dest_acc = dfMaster.groupby('Dest').accurate.sum() / \ 89 | dfMaster.groupby('Dest').accurate.count() 90 | df_dest_acc = pd.DataFrame(dest_acc, columns=[u'Dest']) 91 | # print df_dest_acc 92 | 93 | dep_time_acc = dfMaster.groupby('dep_time').accurate.sum() / \ 94 | dfMaster.groupby('dep_time').accurate.count() 95 | df_dep_time_acc = pd.DataFrame(dep_time_acc, columns=[u'dep_time']) 96 | # print dep_time_acc 97 | 98 | 99 | # compute proportion of delays by each variable 100 | 101 | month_delays = dfMaster.groupby( 102 | 'Month').label.sum() / dfMaster.groupby('Month').label.count() 103 | df_month_delays = pd.DataFrame(month_delays, columns=[u'Month']) 104 | # print df_month_delays 105 | 106 | day_of_month_delays = dfMaster.groupby( 107 | 'DayofMonth').label.sum() / dfMaster.groupby('DayofMonth').label.count() 108 | df_day_of_month_delays = pd.DataFrame( 109 | day_of_month_delays, columns=[u'DayofMonth']) 110 | # print df_day_of_month_delays 111 | 112 | day_of_week_delays = dfMaster.groupby( 113 | 'DayOfWeek').label.sum() / dfMaster.groupby('DayOfWeek').label.count() 114 | df_day_of_week_delays = pd.DataFrame( 115 | day_of_week_delays, columns=[u'DayOfWeek']) 116 | # print df_day_of_week_delays 117 | 118 | unique_carrier_delays = dfMaster.groupby( 119 | 'UniqueCarrier').label.sum() / dfMaster.groupby('UniqueCarrier').label.count() 120 | df_unique_carrier_delays = pd.DataFrame( 121 | unique_carrier_delays, columns=[u'UniqueCarrier']) 122 | # print df_unique_carrier_delays 123 | 124 | tail_num_delays = dfMaster.groupby( 125 | 'TailNum').label.sum() / dfMaster.groupby('TailNum').label.count() 126 | df_tail_num_delays = pd.DataFrame(tail_num_delays, columns=[u'TailNum']) 127 | # print df_tail_num_delays 128 | 129 
| origin_delays = dfMaster.groupby( 130 | 'Origin').label.sum() / dfMaster.groupby('Origin').label.count() 131 | df_origin_delays = pd.DataFrame(origin_delays, columns=[u'Origin']) 132 | # print df_origin_delays 133 | 134 | dest_delays = dfMaster.groupby( 135 | 'Dest').label.sum() / dfMaster.groupby('Dest').label.count() 136 | df_dest_delays = pd.DataFrame(dest_delays, columns=[u'Dest']) 137 | # print df_dest_delays 138 | 139 | dep_time_delays = dfMaster.groupby( 140 | 'dep_time').label.sum() / dfMaster.groupby('dep_time').label.count() 141 | df_dep_time_delays = pd.DataFrame(dep_time_delays, columns=[u'dep_time']) 142 | # print df_dep_time_delays 143 | 144 | 145 | # bar charts to see where delays are more likely 146 | df_day_of_month_delays.plot(kind='bar', color='grey', stacked=True) 147 | 148 | # df_day_of_week_delays.plot(kind='bar', color='grey', stacked=True) 149 | 150 | # df_unique_carrier_delays.plot(kind='bar', color='grey', stacked=True) 151 | 152 | # df_tail_num_delays.plot(kind='bar', color='grey', stacked=True) 153 | 154 | # df_origin_delays.plot(kind='bar', color='grey', stacked=True) 155 | 156 | # df_dest_delays.plot(kind='bar', color='grey', stacked=True) 157 | 158 | # df_dep_time_delays.plot(kind='bar', color='grey', stacked=True) 159 | 160 | # df_month_delays.plot(kind='bar', color='grey', stacked=True) 161 | 162 | plt.show() 163 | 164 | 165 | # plot bar charts for accuracy measures 166 | # df_month_acc.plot(kind='bar', color='grey', background_fill="#EAEAF2") 167 | 168 | # df_day_of_month_acc.plot( 169 | # kind='bar', color='grey', background_fill="#EAEAF2") 170 | 171 | # df_day_of_week_acc.plot(kind='bar', color='grey') 172 | 173 | # df_unique_carrier_acc.plot(kind='bar', color='grey') 174 | 175 | # df_tail_num_acc.plot(kind='bar', color='grey') 176 | 177 | # df_origin_acc.plot(kind='bar', color='grey') 178 | 179 | # df_dest_acc.plot(kind='bar', color='grey') 180 | 181 | # df_dep_time_acc.plot(kind='bar', color='grey') 182 | 183 | plt.show() 184 | -------------------------------------------------------------------------------- /INFO290T_Final_Project_Presentation_vFINAL.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/INFO290T_Final_Project_Presentation_vFINAL.pptx -------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx -------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf -------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx 
-------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf -------------------------------------------------------------------------------- /NB_performance_charts.py: -------------------------------------------------------------------------------- 1 | # This code builds some exploratory graphs to see how prediction accuracy 2 | # of the Naive Bayes model varies by each variable used to build the model. 3 | 4 | 5 | # Importing various modules to build graphs 6 | from __future__ import division 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from pylab import figure, show 10 | from pandas import DataFrame, Series 11 | import pandas as pd 12 | import csv 13 | import os 14 | from bokeh.plotting import * 15 | import seaborn as sns 16 | from bokeh.objects import ColumnDataSource, Range1d 17 | from math import floor 18 | import bokeh as bokeh 19 | import seaborn as sns 20 | sns.set_context("talk") 21 | 22 | 23 | # load 2008 test data into pandas 24 | INPUT_FILE = "C:\\Users\\user\\Desktop\\INFO_290T\\Final Project\\Visualizations\\SFO_OAK_data\\_dfTest2008.csv" 25 | 26 | SKIP_FIRST_LINE = True 27 | 28 | master = [] 29 | print "Reading into Pandas frame..." 30 | try: 31 | dfPart = pd.read_csv(INPUT_FILE, skiprows=0, usecols=[ # nrows = 2000 32 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier', 33 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'label', u'pred_label' 34 | ]) 35 | print len(dfPart) 36 | master.append(dfPart) 37 | except Exception as e: 38 | print "Data import failed", e 39 | 40 | 41 | dfMaster = pd.concat(master, ignore_index=True) 42 | print "Total length: ", len(dfMaster) 43 | 44 | # change data types to integers 45 | dfMaster['Year'] = dfMaster['Year'].astype('int') 46 | dfMaster['Month'] = dfMaster['Month'].astype('int') 47 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 48 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 49 | dfMaster['UniqueCarrier'] = dfMaster['UniqueCarrier'].astype('int') 50 | dfMaster['TailNum'] = dfMaster['TailNum'].astype('int') 51 | dfMaster['Origin'] = dfMaster['Origin'].astype('int') 52 | dfMaster['Dest'] = dfMaster['Dest'].astype('int') 53 | dfMaster['label'] = dfMaster['label'].astype('int') 54 | dfMaster['pred_label'] = dfMaster['pred_label'].astype('int') 55 | 56 | 57 | df = dfMaster 58 | print "Appending new variables..."
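# Aside -- a minimal vectorized sketch, not part of the original script: the
# departure-time buckets assembled further below with chained
# .isin(xrange(...)) assignments reduce, after the overwrites, to
# 701-1299 -> 1 (morning), 1300-1799 -> 2 (afternoon), everything else -> 3
# (evening/night). np.select expresses that net effect in one step; the
# column name `dep_time_alt` is illustrative, so the original code is untouched.
morning = (df['DepTime'] > 700) & (df['DepTime'] < 1300)
afternoon = (df['DepTime'] >= 1300) & (df['DepTime'] < 1800)
df['dep_time_alt'] = np.select([morning, afternoon], [1, 2], default=3)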
59 | 60 | # create a binary variable that indicates accuracy of prediction 61 | # for each record 62 | df['accurate'] = 0 63 | df.accurate[df.label == df.pred_label] = 1 64 | df.accurate[df.label <> df.pred_label] = 0 65 | 66 | 67 | # discretize time of day variable and create a categorical variable 68 | # that captures morning (from 7 am to 1 pm), afternoon (1 pm to 6 pm), 69 | # and night (from 6 pm to 7 am) 70 | df['dep_time'] = 0 71 | df.dep_time[df.DepTime.isin(xrange(700, 1301))] = 1 72 | df.dep_time[df.DepTime.isin(xrange(1300, 1801))] = 2 73 | df.dep_time[df.DepTime.isin(xrange(1800, 2401))] = 3 74 | df.dep_time[df.DepTime.isin(xrange(0, 701))] = 3 75 | 76 | # compute accuracy rates for each variable 77 | month_acc = dfMaster.groupby('Month').accurate.sum() / \ 78 | dfMaster.groupby('Month').accurate.count() 79 | df_month_acc = pd.DataFrame(month_acc, columns=[u'Accuracy']) 80 | 81 | 82 | day_of_month_acc = dfMaster.groupby( 83 | 'DayofMonth').accurate.sum() / dfMaster.groupby('DayofMonth').accurate.count() 84 | df_day_of_month_acc = pd.DataFrame(day_of_month_acc, columns=[u'Accuracy']) 85 | 86 | day_of_week_acc = dfMaster.groupby( 87 | 'DayOfWeek').accurate.sum() / dfMaster.groupby('DayOfWeek').accurate.count() 88 | df_day_of_week_acc = pd.DataFrame(day_of_week_acc, columns=[u'Accuracy']) 89 | 90 | unique_carrier_acc = dfMaster.groupby( 91 | 'UniqueCarrier').accurate.sum() / dfMaster.groupby('UniqueCarrier').accurate.count() 92 | df_unique_carrier_acc = pd.DataFrame( 93 | unique_carrier_acc, columns=[u'Accuracy']) 94 | 95 | tail_num_acc = dfMaster.groupby( 96 | 'TailNum').accurate.sum() / dfMaster.groupby('TailNum').accurate.count() 97 | df_tail_num_acc = pd.DataFrame(tail_num_acc, columns=[u'Accuracy']) 98 | 99 | origin_acc = dfMaster.groupby('Origin').accurate.sum() / \ 100 | dfMaster.groupby('Origin').accurate.count() 101 | df_origin_acc = pd.DataFrame(origin_acc, columns=[u'Accuracy']) 102 | 103 | dest_acc = dfMaster.groupby('Dest').accurate.sum() / \ 104 | dfMaster.groupby('Dest').accurate.count() 105 | df_dest_acc = pd.DataFrame(dest_acc, columns=[u'Accuracy']) 106 | 107 | dep_time_acc = dfMaster.groupby('dep_time').accurate.sum() / \ 108 | dfMaster.groupby('dep_time').accurate.count() 109 | df_dep_time_acc = pd.DataFrame(dep_time_acc, columns=[u'Accuracy']) 110 | 111 | 112 | # compute proportion of delays by each variable 113 | month_delays = dfMaster.groupby( 114 | 'Month').label.sum() / dfMaster.groupby('Month').label.count() 115 | df_month_delays = pd.DataFrame(month_delays, columns=[u'Accuracy']) 116 | 117 | day_of_month_delays = dfMaster.groupby( 118 | 'DayofMonth').label.sum() / dfMaster.groupby('DayofMonth').label.count() 119 | df_day_of_month_delays = pd.DataFrame( 120 | day_of_month_delays, columns=[u'Accuracy']) 121 | 122 | day_of_week_delays = dfMaster.groupby( 123 | 'DayOfWeek').label.sum() / dfMaster.groupby('DayOfWeek').label.count() 124 | df_day_of_week_delays = pd.DataFrame( 125 | day_of_week_delays, columns=[u'Accuracy']) 126 | 127 | unique_carrier_delays = dfMaster.groupby( 128 | 'UniqueCarrier').label.sum() / dfMaster.groupby('UniqueCarrier').label.count() 129 | df_unique_carrier_delays = pd.DataFrame( 130 | unique_carrier_delays, columns=[u'Accuracy']) 131 | 132 | tail_num_delays = dfMaster.groupby( 133 | 'TailNum').label.sum() / dfMaster.groupby('TailNum').label.count() 134 | df_tail_num_delays = pd.DataFrame(tail_num_delays, columns=[u'Accuracy']) 135 | 136 | origin_delays = dfMaster.groupby( 137 | 'Origin').label.sum() / 
dfMaster.groupby('Origin').label.count() 138 | df_origin_delays = pd.DataFrame(origin_delays, columns=[u'DelayRate']) 139 | 140 | dest_delays = dfMaster.groupby( 141 | 'Dest').label.sum() / dfMaster.groupby('Dest').label.count() 142 | df_dest_delays = pd.DataFrame(dest_delays, columns=[u'DelayRate']) 143 | 144 | dep_time_delays = dfMaster.groupby( 145 | 'dep_time').label.sum() / dfMaster.groupby('dep_time').label.count() 146 | df_dep_time_delays = pd.DataFrame(dep_time_delays, columns=[u'DelayRate']) 147 | 148 | 149 | ############################################### BUILD GRAPHS ########################################### 150 | 151 | # build accuracy by day of month variable 152 | dfPlot = df_day_of_month_acc 153 | dfPlot.reset_index(inplace=True) 154 | dfPlot.columns 155 | plt.show() 156 | fig = plt.figure() 157 | fig.suptitle('Accuracy by Day of Month', fontsize=14, fontweight='bold') 158 | ax = fig.add_subplot(111) 159 | fig.subplots_adjust(top=0.95) 160 | ax.set_xlabel('Day of Month') 161 | ax.set_ylabel('Accuracy') 162 | ax.bar(dfPlot['DayofMonth'], dfPlot['Accuracy'], label="Label") 163 | plt.xticks(dfPlot['DayofMonth'], xrange(1, 32), rotation=45) 164 | plt.show() 165 | 166 | # build accuracy by month variable 167 | dfPlot = df_month_acc 168 | dfPlot.reset_index(inplace=True) 169 | dfPlot.columns 170 | plt.show() 171 | fig = plt.figure() 172 | fig.suptitle('Accuracy by Month', fontsize=14, fontweight='bold') 173 | ax = fig.add_subplot(111) 174 | fig.subplots_adjust(top=0.95) 175 | ax.set_xlabel('Month') 176 | ax.set_ylabel('Accuracy') 177 | ax.bar(dfPlot['Month'], dfPlot['Accuracy'], label="Label") 178 | plt.xticks(dfPlot['Month'], xrange(1, 13), rotation=45) 179 | plt.show() 180 | 181 | # build accuracy by day of week variable 182 | dfPlot = df_day_of_week_acc 183 | dfPlot.reset_index(inplace=True) 184 | dfPlot.columns 185 | plt.show() 186 | fig = plt.figure() 187 | fig.suptitle('Accuracy by day of week', fontsize=14, fontweight='bold') 188 | ax = fig.add_subplot(111) 189 | fig.subplots_adjust(top=0.95) 190 | ax.set_xlabel('Day of Week') 191 | ax.set_ylabel('Accuracy') 192 | ax.bar(dfPlot['DayOfWeek'], dfPlot['Accuracy'], label="Label") 193 | plt.xticks(dfPlot['DayOfWeek'], xrange(1, 8), rotation=45) 194 | plt.show() 195 | 196 | # build accuracy by unique carrier variable 197 | dfPlot = df_unique_carrier_acc 198 | dfPlot.reset_index(inplace=True) 199 | dfPlot.columns 200 | plt.show() 201 | fig = plt.figure() 202 | fig.suptitle('Accuracy by unique carrier', fontsize=14, fontweight='bold') 203 | ax = fig.add_subplot(111) 204 | fig.subplots_adjust(top=0.95) 205 | ax.set_xlabel('Unique carrier') 206 | ax.set_ylabel('Accuracy') 207 | ax.bar(dfPlot['UniqueCarrier'], dfPlot['Accuracy'], label="Label") 208 | plt.xticks(dfPlot['UniqueCarrier'], dfPlot['UniqueCarrier'], rotation=45) 209 | plt.show() 210 | 211 | # build accuracy by tail number variable 212 | dfPlot = df_tail_num_acc 213 | dfPlot.reset_index(inplace=True) 214 | dfPlot.columns 215 | plt.show() 216 | fig = plt.figure() 217 | fig.suptitle('Accuracy by tail number', fontsize=14, fontweight='bold') 218 | ax = fig.add_subplot(111) 219 | fig.subplots_adjust(top=0.95) 220 | ax.set_xlabel('Tail number') 221 | ax.set_ylabel('Accuracy') 222 | ax.bar(dfPlot['TailNum'], dfPlot['Accuracy'], label="Label") 223 | plt.xticks(dfPlot['TailNum'], dfPlot['TailNum'], rotation=45) 224 | plt.show() 225 | 
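The seven nearly identical bar-chart blocks in this file differ only in the summary frame, the grouping column, and the title, so they could be collapsed into a single helper. A minimal sketch (plot_accuracy_by is a hypothetical name, not something defined in this repo):

def plot_accuracy_by(summary, col, title):
    # summary: a one-column frame with an 'Accuracy' column, indexed by the grouping variable
    d = summary.reset_index()
    fig = plt.figure()
    fig.suptitle(title, fontsize=14, fontweight='bold')
    ax = fig.add_subplot(111)
    ax.set_xlabel(col)
    ax.set_ylabel('Accuracy')
    ax.bar(d[col], d['Accuracy'])
    plt.xticks(d[col], d[col], rotation=45)
    plt.show()

# e.g. plot_accuracy_by(df_month_acc, 'Month', 'Accuracy by Month')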
226 | # build accuracy by origin variable 227 | dfPlot = df_origin_acc 228 | dfPlot.reset_index(inplace=True) 229 | dfPlot.columns 230 | plt.show() 231 | fig = plt.figure() 232 | fig.suptitle('Accuracy by origin', fontsize=14, fontweight='bold') 233 | ax = fig.add_subplot(111) 234 | fig.subplots_adjust(top=0.95) 235 | ax.set_xlabel('Origin airport') 236 | ax.set_ylabel('Accuracy') 237 | ax.bar(dfPlot['Origin'], dfPlot['Accuracy'], label="Label") 238 | plt.xticks(dfPlot['Origin'], dfPlot['Origin'], rotation=45) 239 | plt.show() 240 | 241 | # build accuracy by destination variable 242 | dfPlot = df_dest_acc 243 | dfPlot.reset_index(inplace=True) 244 | dfPlot.columns 245 | plt.show() 246 | fig = plt.figure() 247 | fig.suptitle('Accuracy by destination', fontsize=14, fontweight='bold') 248 | ax = fig.add_subplot(111) 249 | fig.subplots_adjust(top=0.95) 250 | ax.set_xlabel('Destination airport') 251 | ax.set_ylabel('Accuracy') 252 | ax.bar(dfPlot['Dest'], dfPlot['Accuracy'], label="Label") 253 | plt.xticks(dfPlot['Dest'], dfPlot['Dest'], rotation=45) 254 | plt.show() 255 | 256 | # build accuracy by departure time variable 257 | dfPlot = df_dep_time_acc 258 | dfPlot.reset_index(inplace=True) 259 | dfPlot.columns 260 | plt.show() 261 | fig = plt.figure() 262 | fig.suptitle('Accuracy by departure time', fontsize=14, fontweight='bold') 263 | ax = fig.add_subplot(111) 264 | fig.subplots_adjust(top=0.95) 265 | ax.set_xlabel('Departure time') 266 | ax.set_ylabel('Accuracy') 267 | ax.bar(dfPlot['dep_time'], dfPlot['Accuracy'], label="Label") 268 | plt.xticks(dfPlot['dep_time'], dfPlot['dep_time'], rotation=45) 269 | plt.show() 270 | -------------------------------------------------------------------------------- /Old Python Code/Basic.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | # -*- coding: utf-8 -*- 4 | # 3.0 5 | 6 | # 7 | 8 | #!/usr/bin/env python 9 | 10 | """This file contains the code for the Data Mining Class. It uses the Airline dataset <>""" 11 | 12 | __author__ = "" 13 | __email__ = "" 14 | __status__ = "" 15 | 16 | # 17 | 18 | # Importing various modules 19 | 20 | import matplotlib.pyplot as plt 21 | import numpy as np 22 | from pylab import figure, show 23 | from pandas import DataFrame, Series 24 | import pandas as pd 25 | import csv 26 | import os 27 | import statsmodels.formula.api as smf 28 | import scipy.stats as stats 29 | import statsmodels.api as sm 30 | 31 | # 32 | 33 | # Setting global constants. 
Please initialize this before running the code 34 | 35 | TRAINING_LINE_NUMBER = 100000 # Number of lines to be read from the huge file, set to total file length while running for entire file 36 | INPUT_FILE_PATH="C:\\data\\airline\\" # Path of the folder where you have placed your files 37 | SKIP_FIRST_LINE = True # To skip the first line, as it's the header 38 | YEARS = ['2008'] # Add more years in this list and add the files in the INPUT_FILE_PATH 39 | 40 | # 41 | 42 | # Setting the dataframes for Airline, Plane and Carriers 43 | 44 | try: 45 | path = "C:\\data\\airline\\plane-data.csv" 46 | dfPlane = pd.read_csv(path) 47 | path = 'C:\\data\\airline\\airports.csv' 48 | dfAirport = pd.read_csv(path) 49 | path = 'C:\\data\\airline\\carriers.csv' 50 | dfCarrier = pd.read_csv(path) 51 | except Exception as e: 52 | print "Supplemental Data Import failed", e 53 | 54 | # 55 | 56 | # Reading the main file in a Pandas dataframe 57 | 58 | try: 59 | for year in YEARS: 60 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year)) 61 | dfMaster = pd.read_csv(path, nrows=TRAINING_LINE_NUMBER,skiprows=0) 62 | except Exception as e: 63 | print "Main data import failed", e 64 | dfMaster.head() 65 | 66 | # 67 | 68 | dfMaster.fillna(0,inplace=True) 69 | 70 | # 71 | 72 | # TODO: Do this for other dataframes as well 73 | 74 | # Convert all columns to respective datatypes 75 | 76 | dfMaster['Year'] = dfMaster['Year'].astype('int') 77 | dfMaster['Month'] = dfMaster['Month'].astype('int') 78 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 79 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int') 80 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 81 | dfMaster['CRSDepTime'] = dfMaster['CRSDepTime'].astype('int') 82 | dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int') 83 | dfMaster['CRSArrTime'] = dfMaster['CRSArrTime'].astype('int') 84 | dfMaster['FlightNum'] = dfMaster['FlightNum'].astype('int') 85 | dfMaster['ActualElapsedTime'] = dfMaster['ActualElapsedTime'].astype('int') 86 | dfMaster['CRSElapsedTime'] = dfMaster['CRSElapsedTime'].astype('int') 87 | dfMaster['AirTime'] = dfMaster['AirTime'].astype('int') 88 | dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int') 89 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int') 90 | dfMaster['Distance'] = dfMaster['Distance'].astype('int') 91 | dfMaster['TaxiIn'] = dfMaster['TaxiIn'].astype('int') 92 | dfMaster['TaxiOut'] = dfMaster['TaxiOut'].astype('int') 93 | dfMaster['Cancelled'] = dfMaster['Cancelled'].astype('int') 94 | dfMaster['Diverted'] = dfMaster['Diverted'].astype('int') 95 | print dfMaster.columns 96 | 97 | # 98 | 99 | # for col in dfMaster.columns: 100 | # print 'dfMaster[\'',col,'\'] = dfMaster[\'',col,'\'].astype(\'int\')' 101 | 102 | # 103 | 104 | results = sm.OLS.from_formula('DepDelay ~ ArrDelay', dfMaster).fit() 105 | print results.summary() 106 | 107 | # 108 | 109 | intercept, slope = results.params 110 | r2 = results.rsquared 111 | print slope, intercept, r2 112 | 113 | plt.plot(dfMaster['ArrDelay'], dfMaster['DepDelay'], 'bo') 114 | x = np.array([min(dfMaster['ArrDelay']), max(dfMaster['ArrDelay'])]) 115 | y = intercept + slope * x 116 | plt.plot(x, y, 'r-') 117 | plt.show() 118 | 119 | 120 | from statsmodels.stats.anova import anova_lm 121 | 122 | anova_lm(results) 123 | -------------------------------------------------------------------------------- /Old Python Code/Dest.pkl: -------------------------------------------------------------------------------- [binary pickle payload omitted: a pandas Series mapping 64 airport codes (ABQ, ALB, AMA, AUS, BDL, BHM, BNA, BOI, BUF, BUR, BWI, CLE, CMH, CRP, DAL, DEN, DTW, ELP, FLL, GEG, HOU, HRL, IAD, IND, ISP, JAN, JAX, LAS, LAX, LBB, LIT, MAF, MCI, MCO, MDW, MHT, MSY, OAK, OKC, OMA, ONT, ORF, PBI, PDX, PHL, PHX, PIT, PVD, RDU, RNO, RSW, SAN, SAT, SDF, SEA, SFO, SJC, SLC, SMF, SNA, STL, TPA, TUL, TUS) to the integer indices 0-63]
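The .pkl files in this folder are plain pickled pandas Series lookups. Reading one back is a two-liner with the standard library (a sketch, assuming the file sits in the working directory):

import pickle
with open('Dest.pkl', 'rb') as f:
    dest_lookup = pickle.load(f)   # Series: airport code -> integer id
print dest_lookup['OAK']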
-------------------------------------------------------------------------------- /Old Python Code/Julia Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:2119dd4eb940c5d56c1cf3c63fe41c2b7d02d5ac902ce8287eaa7c250c822c89" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import csv\n", 16 | "import pickle\n", 17 | "\n", 18 | "needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]\n", 19 | "years = [2008]\n", 20 | "\n", 21 | "def ComputeDayofYear(row):\n", 22 | " \"\"\"This function will return an integer to represent the day of the year given an integer\n", 23 | " representing month and an integer representing the day of the month. This number will\n", 24 | " correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned\n", 25 | " as 0. 
Feb 29th will be returned as 59.\"\"\"\n", 26 | "\n", 27 | " if(row[0] == '1'):\n", 28 | " calc = 0 + int(row[1]) - 1\n", 29 | " row[1] = str(calc)\n", 30 | " elif(row[0] == '2'):\n", 31 | " calc = 31 + int(row[1]) - 1\n", 32 | " row[1] = str(calc)\n", 33 | " elif(row[0] == '3'):\n", 34 | " calc = 60 + int(row[1]) - 1\n", 35 | " row[1] = str(calc)\n", 36 | " elif(row[0] == '4'):\n", 37 | " calc = 91 + int(row[1]) - 1\n", 38 | " row[1] = str(calc)\n", 39 | " elif(row[0] == '5'):\n", 40 | " calc = 121 + int(row[1]) - 1\n", 41 | " row[1] = str(calc)\n", 42 | " elif(row[0] == '6'):\n", 43 | " calc = 152 + int(row[1]) - 1\n", 44 | " row[1] = str(calc)\n", 45 | " elif(row[0] == '7'):\n", 46 | " calc = 182 + int(row[1]) - 1\n", 47 | " row[1] = str(calc)\n", 48 | " elif(row[0] == '8'):\n", 49 | " calc = 213 + int(row[1]) - 1\n", 50 | " row[1] = str(calc)\n", 51 | " elif(row[0] == '9'):\n", 52 | " calc = 244 + int(row[1]) - 1\n", 53 | " row[1] = str(calc)\n", 54 | " elif(row[0] == '10'):\n", 55 | " calc = 274 + int(row[1]) - 1\n", 56 | " row[1] = str(calc)\n", 57 | " elif(row[0] == '11'):\n", 58 | " calc = 305 + int(row[1]) - 1\n", 59 | " row[1] = str(calc)\n", 60 | " elif(row[0] == '12'):\n", 61 | " calc = 335 + int(row[1]) - 1\n", 62 | " row[1] = str(calc)\n", 63 | " return row\n", 64 | "\n", 65 | "\n", 66 | "def DiscretizeDepTime(row):\n", 67 | " \"\"\"This function takes a scheduled departure time, classifies the departure time as:\n", 68 | " morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value\n", 69 | " is assumed to be an integer in 24-hour time format. These labels will correspond to\n", 70 | " variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.\n", 71 | " An error time is returned as morning.\"\"\"\n", 72 | "\n", 73 | " if(int(row[3]) <= 559):\n", 74 | " row[3] = '2'\n", 75 | " elif(int(row[3]) >= 600 and int(row[3]) <= 1259):\n", 76 | " row[3] = '0'\n", 77 | " elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):\n", 78 | " row[3] = '1'\n", 79 | " elif(int(row[3]) >= 1800):\n", 80 | " row[3] = '2'\n", 81 | " else:\n", 82 | " row[3] = '0'\n", 83 | " return row\n", 84 | "\n", 85 | "\n", 86 | "def AddDepVar(row):\n", 87 | " \"\"\"This function adds a classification label based on the length of the recorded\n", 88 | " Departure Delay in the data set. It assumes an input integer value of the delay in mins.\n", 89 | " By airline industry standards, flight delays are defined as departure delays greater than\n", 90 | " or equal to 15 minutes. For delayed flights, this variable will have value \"1\".\n", 91 | " For on time flights, it will have value \"0\". 
Default value will be set at \"0\".\"\"\"\n", 92 | "\n", 93 | " if(int(row[6]) >= 15):\n", 94 | " row[6] = '1'\n", 95 | " else:\n", 96 | " row[6] = '0'\n", 97 | " return row\n", 98 | "\n", 99 | "def SaveData(data, pickle_file_name):\n", 100 | " \"\"\"This function pickles each file.\"\"\"\n", 101 | "\n", 102 | " f = open(pickle_file_name, \"wb\")\n", 103 | " pickle.dump(data, f)\n", 104 | " f.close()\n", 105 | "\n", 106 | "for i in years:\n", 107 | " data = []\n", 108 | " file_path='C:\\\\data\\\\airline\\\\'+str(i) + '.csv'\n", 109 | " pickle_file_name = 'data' + str(i)\n", 110 | " with open(file_path, 'r') as data_csv:\n", 111 | " csv_reader = csv.reader(data_csv, delimiter=',')\n", 112 | " for row in list(csv_reader):\n", 113 | " if row[21] == '0':\n", 114 | " content = list(row[i] for i in needed_cols)\n", 115 | " content2 = ComputeDayofYear(content)\n", 116 | " content3 = DiscretizeDepTime(content2)\n", 117 | " content4 = AddDepVar(content3)\n", 118 | " data.append(content4)\n", 119 | " SaveData(data, pickle_file_name)" 120 | ], 121 | "language": "python", 122 | "metadata": {}, 123 | "outputs": [] 124 | } 125 | ], 126 | "metadata": {} 127 | } 128 | ] 129 | } -------------------------------------------------------------------------------- /Old Python Code/NB.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import pandas as pd 4 | import sklearn 5 | from sklearn.naive_bayes import * 6 | from sklearn.metrics import * 7 | import os 8 | import cPickle 9 | 10 | # Setting up constants 11 | print "Setting constants..." 12 | 13 | TRAINING_LINE_NUMBER = 100 14 | YEARS = ['2008', '2007'] 15 | # INPUT_FILE_PATH = "/home/dmenghani/python/" # Unix path 16 | INPUT_FILE_PATH = "C:\\data\\airline\\" # Windows path 17 | # YEARS = ['2008'] 18 | 19 | SKIP_FIRST_LINE = True # To skip the first line, as it's the header 20 | 21 | master = [] 22 | print "Reading into Pandas frame..." 23 | try: 24 | for year in YEARS: 25 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year)) 26 | print path 27 | dfPart = pd.read_csv( 28 | path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[ 29 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier', 30 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'DepDelay', u'Cancelled' 31 | ]) 32 | dfPart = dfPart[dfPart['Cancelled'] == 0] 33 | print len(dfPart) 34 | master.append(dfPart) 35 | except Exception as e: 36 | print "Data import failed", e 37 | 38 | dfMaster = pd.concat(master, ignore_index=True) 39 | print "Total length - ", len(dfMaster) 40 | 41 | 42 | dfMaster.fillna(0, inplace=True) 43 | dfMaster['Year'] = dfMaster['Year'].astype('int') 44 | dfMaster['Month'] = dfMaster['Month'].astype('int') 45 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 46 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int') 47 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 48 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int') 49 | 50 | print "Length of pandas frame - ", len(dfMaster) 51 | print "Dataframe columns - ", dfMaster.columns 52 | 53 | df = dfMaster 54 | 55 | print "Calculating classification label..." 56 | df['label'] = 0 57 | df.label[df.DepDelay >= 15] = 1 58 | df.label[df.DepDelay < 15] = 0 59 | del df['DepDelay'] 60 | 61 | print "Converting categorical data to numeric..." 
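The manual np.unique/Series lookups that follow do what pandas' factorize already provides; a compact alternative is sketched below. Note that pd.factorize numbers categories by order of first appearance rather than alphabetically, so the resulting ids would differ from the pickled lookups:

# sketch: integer-code every categorical column in one pass
for col in ['UniqueCarrier', 'TailNum', 'Origin', 'Dest']:
    codes, uniques = pd.factorize(df[col])
    df[col] = codes + 1  # +1 mirrors this script's 1-based ids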
62 | for col in set(df.columns): 63 | # print col, train[col].dtype 64 | if df[col].dtype == np.dtype('object'): 65 | print "Converting...", col 66 | if col == 'TailNum': 67 | s = np.unique(df[col].values) 68 | TailNum = pd.Series([x[0] for x in enumerate(s)], index=s) 69 | # print TailNum 70 | if col == 'UniqueCarrier': 71 | s = np.unique(df[col].values) 72 | UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s) 73 | # print UniqueCarrier 74 | if col == 'Dest': 75 | s = np.unique(df[col].values) 76 | Dest = pd.Series([x[0] for x in enumerate(s)], index=s) 77 | # print Dest 78 | if col == 'Origin': 79 | s = np.unique(df[col].values) 80 | Origin = pd.Series([x[0] for x in enumerate(s)], index=s) 81 | # print Origin 82 | 83 | 84 | def getTailNum(inTailNum): 85 | # print "In...",type(inTailNum) 86 | out = [] 87 | for x, y in inTailNum.iteritems(): 88 | # print "x,y, out",x,y,TailNum.get_value(y) 89 | out.append(TailNum.get_value(y) + 1) 90 | # print "final out", out 91 | return out 92 | 93 | 94 | def getDest(inDest): 95 | out = [] 96 | for x, y in inDest.iteritems(): 97 | out.append(Dest.get_value(y) + 1) 98 | return out 99 | 100 | 101 | def getOrigin(inOrigin): 102 | out = [] 103 | for x, y in inOrigin.iteritems(): 104 | out.append(Origin.get_value(y) + 1) 105 | return out 106 | 107 | 108 | def getCarrier(inCarrier): 109 | out = [] 110 | for x, y in inCarrier.iteritems(): 111 | out.append(UniqueCarrier.get_value(y) + 1) 112 | return out 113 | 114 | df['TailNum'] = getTailNum(df['TailNum']) 115 | print "TailNum completed." 116 | 117 | df['Dest'] = getDest(df['Dest']) 118 | print "Dest completed." 119 | 120 | df['UniqueCarrier'] = getCarrier(df['UniqueCarrier']) 121 | print "UniqueCarrier completed." 122 | 123 | df['Origin'] = getOrigin(df['Origin']) 124 | print "Origin completed." 125 | 126 | print "Conversion to numeric completed." 127 | 128 | print "Pickling converted data..." 129 | df.to_pickle(INPUT_FILE_PATH + "df.pkl") 130 | 131 | print "Begin classification...75% training, 25% testing, randomly chosen" 132 | target_names = np.array(['Not Delayed', 'Delayed']) 133 | # add columns to your data frame 134 | df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75 135 | # define training and test sets 136 | train = df[df['is_train'] == True] 137 | test = df[df['is_train'] == False] 138 | trainTargets = np.array(train['label']).astype(int) 139 | testTargets = np.array(test['label']).astype(int) 140 | features = df.columns[0:9] 141 | print "Model fitting and prediction started..." 142 | gnb = MultinomialNB() 143 | # train model 144 | y_gnb = gnb.fit(train[features], trainTargets).predict(test[features]) 145 | print "Classification completed." 146 | print "Calculating metrics..." 
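The hand-rolled zip-and-count in the lines below is exactly what sklearn's accuracy_score computes, and sklearn.metrics is already star-imported at the top of this file, so the same numbers come from:

print "Accuracy:", accuracy_score(test['label'], y_gnb)
print confusion_matrix(test['label'], y_gnb)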
147 | test['pred_label'] = y_gnb 148 | test.head() 149 | acc = zip(test['label'], test['pred_label']) 150 | match_count = 0 151 | for i in acc: 152 | if i[0] == i[1]: 153 | match_count += 1 154 | print "Matches - ", match_count 155 | print "Total length - ", len(acc) 156 | print "Accuracy:", float(match_count) / len(acc) 157 | -------------------------------------------------------------------------------- /Old Python Code/Origin.pkl: -------------------------------------------------------------------------------- [binary pickle payload omitted: a pandas Series with the same payload as Dest.pkl above, mapping the same 64 airport codes to the integer indices 0-63]
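A caveat on NB.py above: it feeds arbitrary integer ids for carrier, tail number, and airports straight into MultinomialNB, which treats those ids as counts. One-hot encoding the categoricals first is the usual correction; a sketch under the old scikit-learn API of this era (OneHotEncoder lived in sklearn.preprocessing):

from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import BernoulliNB

enc = OneHotEncoder()                     # expands each id into indicator columns
Xtr = enc.fit_transform(train[features])
Xte = enc.transform(test[features])
pred = BernoulliNB().fit(Xtr, trainTargets).predict(Xte)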
-------------------------------------------------------------------------------- /Old Python Code/UniqueCarrier.pkl: -------------------------------------------------------------------------------- [binary pickle payload omitted: a pandas Series mapping the single carrier code WN to index 0]
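The notebooks below split train/test with a uniform-random mask (df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75). scikit-learn's helper from the same era does this in one call with a reproducible seed; a sketch:

from sklearn.cross_validation import train_test_split  # pre-0.18 module path

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df['label'], test_size=0.25, random_state=0)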
-------------------------------------------------------------------------------- /Old Python Code/Untitled0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:44f7be7f1af03eb634d779b3e3fc1b7473ad8af24b380e9e53f9a15ad5274aaf" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import *\n", 20 | "from sklearn.metrics import *\n", 21 | "import os\n", 22 | "import cPickle\n", 23 | "import sys\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from optparse import OptionParser\n", 27 | "from sklearn import metrics, preprocessing\n", 28 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 29 | "from sklearn.ensemble import AdaBoostClassifier\n", 30 | "from sklearn import cross_validation\n", 31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n", 32 | "from sklearn.svm import SVC # support vector machine classifier\n", 33 | "# hyperparameter grid search to find best model parameters\n", 34 | "from sklearn.grid_search import GridSearchCV\n", 35 | "from sklearn import preprocessing # preprocess string labels into numerics\n", 36 | "from sklearn import *\n", 37 | "from sklearn.metrics import precision_recall_fscore_support\n", 38 | "from sklearn.metrics import classification_report\n", 39 | "\n", 40 | "\n", 41 | "# In[135]:\n", 42 | "\n", 43 | "# Setting up constants\n", 44 | "print \"Setting constants...\"\n", 45 | "\n", 46 | "TRAINING_LINE_NUMBER = 500000\n", 47 | "YEARS = ['2006', '2008', '2007']\n", 48 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 49 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 50 | "# YEARS = ['2008']\n", 51 | "\n", 52 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n", 53 | "\n", 54 | "master = []\n", 55 | "print \"Reading into Pandas frame...\"\n", 56 | "try:\n", 57 | " for year in YEARS:\n", 58 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 59 | " print \"\\n\", path\n", 60 | " dfPart = pd.read_csv(\n", 61 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 62 | " u'Year',\n", 63 | " u'Month',\n", 64 | " u'DayofMonth',\n", 65 | " u'DayOfWeek',\n", 66 | " u'UniqueCarrier',\n", 67 | " u'DepTime',\n", 68 | " u'TailNum',\n", 69 | " u'Origin',\n", 70 | " u'Dest',\n", 71 | " u'DepDelay',\n", 72 | " # u'ArrDelay',\n", 73 | " u'Cancelled',\n", 74 | " # u'ArrTime',\n", 75 | " # u'ArrDelay',\n", 76 | " # u'Distance'\n", 77 | " ])\n", 78 | " print len(dfPart)\n", 79 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 80 | " print \"Removed cancelled flights, new length - \", len(dfPart)\n", 81 | " master.append(dfPart)\n", 82 | " print\n", 83 | "except Exception as e:\n", 84 | " print \"Supplemental Data Import failed\", e\n", 85 | "\n", 86 | "dfMaster = pd.concat(master, ignore_index=True)\n", 87 | "master = []\n", 88 | "dfPart = []\n", 89 | "\n", 90 | "print \"Total length - \", len(dfMaster)\n", 91 | "del dfMaster['Cancelled']\n", 92 | "\n", 93 | "dfMaster.fillna(0, inplace=True)\n", 94 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 95 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 96 | "dfMaster['DayofMonth'] = 
dfMaster['DayofMonth'].astype('int')\n", 97 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 98 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 99 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n", 100 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n", 101 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 102 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n", 103 | "\n", 104 | "df = dfMaster\n", 105 | "\n", 106 | "print \"Calculating classification label...\"\n", 107 | "df['label'] = 0\n", 108 | "df.label[df.DepDelay >= 15] = 1\n", 109 | "df.label[df.DepDelay < 15] = 0\n", 110 | "\n", 111 | "# df['DepDelay'][df.DepDelay < 0] = 0\n", 112 | "del df['DepDelay']\n", 113 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n", 114 | "\n", 115 | "print \"Dataframe shape - \", df.shape\n", 116 | "print \"Columns -\", df.columns\n", 117 | "\n", 118 | "\n", 119 | "# In[136]:\n", 120 | "\n", 121 | "print \"Converting categorical data to numeric...\"\n", 122 | "for col in set(df.columns):\n", 123 | "# print col, train[col].dtype\n", 124 | " if df[col].dtype == np.dtype('object'):\n", 125 | " print \"Converting...\", col\n", 126 | " if col == 'TailNum':\n", 127 | " s = np.unique(df[col].values)\n", 128 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 129 | "# print TailNum\n", 130 | " if col == 'UniqueCarrier':\n", 131 | " s = np.unique(df[col].values)\n", 132 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 133 | "# print UniqueCarrier\n", 134 | " if col == 'Dest':\n", 135 | " s = np.unique(df[col].values)\n", 136 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 137 | "# print Dest\n", 138 | " if col == 'Origin':\n", 139 | " s = np.unique(df[col].values)\n", 140 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 141 | "# print Origin\n", 142 | "\n", 143 | "\n", 144 | "def getTailNum(inTailNum):\n", 145 | "# print \"In...\",type(inTailNum)\n", 146 | " out = []\n", 147 | " for x, y in inTailNum.iteritems():\n", 148 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 149 | " out.append(TailNum.get_value(y) + 1)\n", 150 | "# print \"final out\", out\n", 151 | " return out\n", 152 | "\n", 153 | "\n", 154 | "def getDest(inDest):\n", 155 | " out = []\n", 156 | " for x, y in inDest.iteritems():\n", 157 | " out.append(Dest.get_value(y) + 1)\n", 158 | " return out\n", 159 | "\n", 160 | "\n", 161 | "def getOrigin(inOrign):\n", 162 | " out = []\n", 163 | " for x, y in inOrign.iteritems():\n", 164 | " out.append(Origin.get_value(y) + 1)\n", 165 | " return out\n", 166 | "\n", 167 | "\n", 168 | "def getCarrier(inCarrier):\n", 169 | " out = []\n", 170 | " for x, y in inCarrier.iteritems():\n", 171 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 172 | " return out\n", 173 | "\n", 174 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 175 | "print \"TailNum completed.\"\n", 176 | "\n", 177 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 178 | "print \"UniqueCarrier completed.\"\n", 179 | "\n", 180 | "df['Dest'] = getDest(df['Dest'])\n", 181 | "print \"Dest completed.\"\n", 182 | "\n", 183 | "df['Origin'] = getOrigin(df['Origin'])\n", 184 | "print \"Origin completed.\"\n", 185 | "\n", 186 | "print \"Conversion to numeric completed.\"\n", 187 | "\n", 188 | "# print \"Pickling converted data...\"\n", 189 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")\n" 190 | ], 191 | "language": "python", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | 
"output_type": "stream", 196 | "stream": "stdout", 197 | "text": [ 198 | "Setting constants...\n", 199 | "Reading into Pandas frame...\n", 200 | "\n", 201 | "C:\\data\\airline\\2006.csv\n", 202 | "500000" 203 | ] 204 | }, 205 | { 206 | "output_type": "stream", 207 | "stream": "stdout", 208 | "text": [ 209 | "\n", 210 | "Removed cancelled flights, new length - " 211 | ] 212 | }, 213 | { 214 | "output_type": "stream", 215 | "stream": "stdout", 216 | "text": [ 217 | " 491158\n", 218 | "\n", 219 | "\n", 220 | "C:\\data\\airline\\2008.csv\n", 221 | "500000" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | "\n", 229 | "Removed cancelled flights, new length - " 230 | ] 231 | }, 232 | { 233 | "output_type": "stream", 234 | "stream": "stdout", 235 | "text": [ 236 | " 484708\n", 237 | "\n", 238 | "\n", 239 | "C:\\data\\airline\\2007.csv\n", 240 | "500000" 241 | ] 242 | }, 243 | { 244 | "output_type": "stream", 245 | "stream": "stdout", 246 | "text": [ 247 | "\n", 248 | "Removed cancelled flights, new length - " 249 | ] 250 | }, 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | " 487243\n", 256 | "\n", 257 | "Total length - " 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | " 1463109\n", 265 | "Calculating classification label..." 266 | ] 267 | }, 268 | { 269 | "output_type": "stream", 270 | "stream": "stdout", 271 | "text": [ 272 | "\n", 273 | "Dataframe shape - (1463109, 10)\n", 274 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n", 275 | "Converting categorical data to numeric...\n", 276 | "Converting..." 277 | ] 278 | }, 279 | { 280 | "output_type": "stream", 281 | "stream": "stdout", 282 | "text": [ 283 | " Origin\n", 284 | "Converting..." 285 | ] 286 | }, 287 | { 288 | "output_type": "stream", 289 | "stream": "stdout", 290 | "text": [ 291 | " UniqueCarrier\n", 292 | "Converting..." 293 | ] 294 | }, 295 | { 296 | "output_type": "stream", 297 | "stream": "stdout", 298 | "text": [ 299 | " Dest\n", 300 | "Converting..." 301 | ] 302 | }, 303 | { 304 | "output_type": "stream", 305 | "stream": "stdout", 306 | "text": [ 307 | " TailNum\n", 308 | "TailNum completed." 309 | ] 310 | }, 311 | { 312 | "output_type": "stream", 313 | "stream": "stdout", 314 | "text": [ 315 | "\n", 316 | "UniqueCarrier completed." 317 | ] 318 | }, 319 | { 320 | "output_type": "stream", 321 | "stream": "stdout", 322 | "text": [ 323 | "\n", 324 | "Dest completed." 325 | ] 326 | }, 327 | { 328 | "output_type": "stream", 329 | "stream": "stdout", 330 | "text": [ 331 | "\n", 332 | "Origin completed." 
333 | ] 334 | }, 335 | { 336 | "output_type": "stream", 337 | "stream": "stdout", 338 | "text": [ 339 | "\n", 340 | "Conversion to numeric completed.\n" 341 | ] 342 | } 343 | ], 344 | "prompt_number": 13 345 | }, 346 | { 347 | "cell_type": "code", 348 | "collapsed": false, 349 | "input": [ 350 | "\n", 351 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n", 352 | "\n", 353 | "# add columns to your data frame\n", 354 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n", 355 | "\n", 356 | "# define training and test sets\n", 357 | "train = df[df['is_train'] == True]\n", 358 | "test = df[df['is_train'] == False]\n", 359 | "trainTargets = np.array(train['label']).astype(int)\n", 360 | "testTargets = np.array(test['label']).astype(int)\n", 361 | "features = df.columns[0:9]\n", 362 | "print \"Features - \",features\n", 363 | "print \"Model fitting and prediction started...\"\n", 364 | "gnb = GaussianNB()\n", 365 | "\n", 366 | "# train model\n", 367 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n", 368 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n", 369 | "\n", 370 | "print \"Classification completed.\"" 371 | ], 372 | "language": "python", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "stream": "stdout", 378 | "text": [ 379 | "Begin classification...75% training, 25% testing, randomly chosen\n", 380 | "Features - " 381 | ] 382 | }, 383 | { 384 | "output_type": "stream", 385 | "stream": "stdout", 386 | "text": [ 387 | " Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')\n", 388 | "Model fitting and prediction started...\n", 389 | "Classification completed." 
390 | ] 391 | }, 392 | { 393 | "output_type": "stream", 394 | "stream": "stdout", 395 | "text": [ 396 | "\n", 397 | "Calculating metrcs...\n", 398 | "Accuracy - 0.798698653544\n", 399 | "Confusion metrics\n", 400 | "[[291966 106]\n", 401 | " [ 73525 178]]" 402 | ] 403 | }, 404 | { 405 | "output_type": "stream", 406 | "stream": "stdout", 407 | "text": [ 408 | "\n", 409 | "Precision - " 410 | ] 411 | }, 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "0.62676056338\n", 417 | "Recall - " 418 | ] 419 | }, 420 | { 421 | "output_type": "stream", 422 | "stream": "stdout", 423 | "text": [ 424 | "0.00241509843561\n" 425 | ] 426 | } 427 | ], 428 | "prompt_number": 14 429 | }, 430 | { 431 | "cell_type": "code", 432 | "collapsed": false, 433 | "input": [ 434 | "print \"Calculating metrcs...\"\n", 435 | "print \"Accuracy - \", accuracy_score(test['label'], y_gnb)\n", 436 | "print \"Confusion metrics\\n\", metrics.confusion_matrix(test['label'], y_gnb,labels=(0,1))\n", 437 | "print \"Precision - \", precision_score(test['label'], y_gnb)\n", 438 | "print \"Recall - \", recall_score(test['label'], y_gnb)\n" 439 | ], 440 | "language": "python", 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "output_type": "stream", 445 | "stream": "stdout", 446 | "text": [ 447 | "Calculating metrcs...\n", 448 | "Accuracy - 0.798698653544\n", 449 | "Confusion metrics\n", 450 | "[[291966 106]\n", 451 | " [ 73525 178]]" 452 | ] 453 | }, 454 | { 455 | "output_type": "stream", 456 | "stream": "stdout", 457 | "text": [ 458 | "\n", 459 | "Precision - " 460 | ] 461 | }, 462 | { 463 | "output_type": "stream", 464 | "stream": "stdout", 465 | "text": [ 466 | "0.62676056338\n", 467 | "Recall - " 468 | ] 469 | }, 470 | { 471 | "output_type": "stream", 472 | "stream": "stdout", 473 | "text": [ 474 | "0.00241509843561\n" 475 | ] 476 | } 477 | ], 478 | "prompt_number": 25 479 | }, 480 | { 481 | "cell_type": "code", 482 | "collapsed": false, 483 | "input": [ 484 | "testSFO = test[test['Origin'] == Origin['SFO']]\n", 485 | "print len(testSFO)\n", 486 | "\n", 487 | "testOAK = test[test['Origin'] == Origin['OAK']]\n", 488 | "print len(testOAK)\n" 489 | ], 490 | "language": "python", 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "output_type": "stream", 495 | "stream": "stdout", 496 | "text": [ 497 | "3563\n", 498 | "40\n" 499 | ] 500 | } 501 | ], 502 | "prompt_number": 22 503 | }, 504 | { 505 | "cell_type": "code", 506 | "collapsed": false, 507 | "input": [ 508 | " np.random.randint(2000, size=10)\n", 509 | " " 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "metadata": {}, 516 | "output_type": "pyout", 517 | "prompt_number": 27, 518 | "text": [ 519 | "array([ 437, 1815, 742, 148, 1399, 1171, 205, 1480, 838, 1437])" 520 | ] 521 | } 522 | ], 523 | "prompt_number": 27 524 | } 525 | ], 526 | "metadata": {} 527 | } 528 | ] 529 | } -------------------------------------------------------------------------------- /Old Python Code/Untitled1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:42405ac43042e4a863e6490ca6e8de6e19a63251aec5c9df6ebb479db0a2da04" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import pickle\n", 17 | "import sklearn\n", 18 | "from sklearn.naive_bayes import *\n", 
19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from sklearn import *\n", 22 | "import os\n", 23 | "from sklearn.metrics import *\n", 24 | "from sklearn import metrics, preprocessing\n", 25 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 26 | "from sklearn.ensemble import AdaBoostClassifier\n", 27 | "\n", 28 | "\n", 29 | "def createPickle(data, filename):\n", 30 | " with open(filename, 'wb') as f:\n", 31 | " pickle.dump(data, f)\n", 32 | " print \"Pickled\", filename\n", 33 | "\n", 34 | "\n", 35 | "# Setting up constants\n", 36 | "print \"Setting constants...\"\n", 37 | "\n", 38 | "TRAINING_LINE_NUMBER = 10000\n", 39 | "# YEARS = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']\n", 40 | "# YEARS = ['2008', '2006', '2007']\n", 41 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 42 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 43 | "YEARS = ['2008']\n", 44 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n", 45 | "\n", 46 | "master = []\n", 47 | "print \"Reading into Pandas frame...\"\n", 48 | "try:\n", 49 | " for year in YEARS:\n", 50 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 51 | " print \"\\n\", path\n", 52 | " dfPart = pd.read_csv(\n", 53 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 54 | " u'Year',\n", 55 | " u'Month',\n", 56 | " u'DayofMonth',\n", 57 | " u'DayOfWeek',\n", 58 | " u'UniqueCarrier',\n", 59 | " u'DepTime',\n", 60 | " u'TailNum',\n", 61 | " u'Origin',\n", 62 | " u'Dest',\n", 63 | " u'DepDelay',\n", 64 | " # u'ArrDelay',\n", 65 | " u'Cancelled',\n", 66 | " # u'ArrTime',\n", 67 | " # u'ArrDelay',\n", 68 | " # u'Distance'\n", 69 | " ])\n", 70 | " print len(dfPart)\n", 71 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 72 | " # dfPart['Year'] = year\n", 73 | " # rows = np.random.choice(\n", 74 | " # np.random.permutation(dfPart.index.values), len(dfPart) // 1, replace=False)\n", 75 | " # print rows\n", 76 | " # sampled_dfPart = dfPart.ix[rows]\n", 77 | " sampled_dfPart = dfPart\n", 78 | " print \"Removed cancelled flights, new length - \", len(sampled_dfPart)\n", 79 | " master.append(sampled_dfPart)\n", 80 | " print\n", 81 | "except Exception as e:\n", 82 | " print \"Supplemental Data Import failed\", e\n", 83 | "\n", 84 | "dfMaster = pd.concat(master, ignore_index=True)\n", 85 | "master = []\n", 86 | "dfPart = []\n", 87 | "\n", 88 | "print \"Total length - \", len(dfMaster)\n", 89 | "del dfMaster['Cancelled']\n", 90 | "\n", 91 | "dfMaster.fillna(0, inplace=True)\n", 92 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 93 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 94 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n", 95 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 96 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 97 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 98 | "\n", 99 | "df = dfMaster\n", 100 | "\n", 101 | "print \"Calculating classification label...\"\n", 102 | "df['label'] = 0\n", 103 | "df.label[df.DepDelay >= 1] = 1\n", 104 | "df.label[df.DepDelay < 1] = 0\n", 105 | "print \"Actual delayed flights -\", np.sum(dfMaster['label']) / len(dfMaster['label'])\n", 106 | "\n", 107 | "# df['DepDelay'][df.DepDelay < 0] = 0\n", 108 | "del df['DepDelay']\n", 109 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n", 110 | "\n", 111 | "print \"Dataframe shape - \", df.shape\n", 112 | "print \"Columns -\", df.columns\n", 113 | "\n", 114 | "\n", 
115 | "# In[136]:\n", 116 | "\n", 117 | "print \"Converting categorical data to numeric...\"\n", 118 | "for col in set(df.columns):\n", 119 | "# print col, train[col].dtype\n", 120 | " if df[col].dtype == np.dtype('object'):\n", 121 | " print \"Converting...\", col\n", 122 | " if col == 'TailNum':\n", 123 | " s = np.unique(df[col].values)\n", 124 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 125 | "# print TailNum\n", 126 | " if col == 'UniqueCarrier':\n", 127 | " s = np.unique(df[col].values)\n", 128 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 129 | "# print UniqueCarrier\n", 130 | " if col == 'Dest':\n", 131 | " s = np.unique(df[col].values)\n", 132 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 133 | " # print Dest\n", 134 | " if col == 'Origin':\n", 135 | " s = np.unique(df[col].values)\n", 136 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 137 | " # print Origin\n", 138 | "\n", 139 | "# print \"sfo,\", Origin['SFO']\n", 140 | "# print \"oak,\", Origin['OAK']\n", 141 | "\n", 142 | "# createPickle(Dest, 'Dest_2008.pkl')\n", 143 | "# createPickle(Origin, 'Origin_2008.pkl')\n", 144 | "# createPickle(UniqueCarrier, 'UniqueCarrier_2008.pkl')\n", 145 | "# createPickle(TailNum, 'TailNum_2008.pkl')\n", 146 | "\n", 147 | "print \"Pickle completed.\"\n", 148 | "\n", 149 | "\n", 150 | "def getTailNum(inTailNum):\n", 151 | "# print \"In...\",type(inTailNum)\n", 152 | " out = []\n", 153 | " for x, y in inTailNum.iteritems():\n", 154 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 155 | " out.append(TailNum.get_value(y) + 1)\n", 156 | "# print \"final out\", out\n", 157 | " return out\n", 158 | "\n", 159 | "\n", 160 | "def getDest(inDest):\n", 161 | " out = []\n", 162 | " for x, y in inDest.iteritems():\n", 163 | " out.append(Dest.get_value(y) + 1)\n", 164 | " return out\n", 165 | "\n", 166 | "\n", 167 | "def getOrigin(inOrign):\n", 168 | " out = []\n", 169 | "# print inOrign\n", 170 | " for x, y in inOrign.iteritems():\n", 171 | " out.append(Origin.get_value(y) + 1)\n", 172 | " return out\n", 173 | "\n", 174 | "\n", 175 | "def getCarrier(inCarrier):\n", 176 | " out = []\n", 177 | " for x, y in inCarrier.iteritems():\n", 178 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 179 | " return out\n", 180 | "\n", 181 | "print \"Before conversion...\"\n", 182 | "print len(dfMaster[dfMaster['Origin'] == 'SFO'])\n", 183 | "print len(dfMaster[dfMaster['Origin'] == 'OAK'])\n", 184 | "# df[df['Origin'] == 'SFO']" 185 | ], 186 | "language": "python", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "output_type": "stream", 191 | "stream": "stdout", 192 | "text": [ 193 | "Setting constants...\n", 194 | "Reading into Pandas frame...\n", 195 | "\n", 196 | "C:\\data\\airline\\2008.csv\n", 197 | "10000\n", 198 | "Removed cancelled flights, new length - 9837\n", 199 | "\n", 200 | "Total length - 9837\n", 201 | "Calculating classification label...\n", 202 | "Actual delayed flights - 0.756429805835\n", 203 | "Dataframe shape - (9837, 10)\n", 204 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n", 205 | "Converting categorical data to numeric...\n", 206 | "Converting... Origin\n", 207 | "Converting... UniqueCarrier\n", 208 | "Converting... Dest\n", 209 | "Converting..." 
210 | ] 211 | }, 212 | { 213 | "output_type": "stream", 214 | "stream": "stdout", 215 | "text": [ 216 | " TailNum\n", 217 | "Pickle completed.\n", 218 | "Before conversion...\n", 219 | "64\n", 220 | "383\n" 221 | ] 222 | } 223 | ], 224 | "prompt_number": 65 225 | }, 226 | { 227 | "cell_type": "code", 228 | "collapsed": false, 229 | "input": [ 230 | "len(getOrigin(df['Origin']))\n", 231 | "Origin['SFO']+1" 232 | ], 233 | "language": "python", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "metadata": {}, 238 | "output_type": "pyout", 239 | "prompt_number": 69, 240 | "text": [ 241 | "56" 242 | ] 243 | } 244 | ], 245 | "prompt_number": 69 246 | }, 247 | { 248 | "cell_type": "code", 249 | "collapsed": false, 250 | "input": [ 251 | "\n", 252 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 253 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 254 | "df['Dest_new'] = getDest(df['Dest'])\n", 255 | "df['Origin_new'] =getOrigin(df['Origin'])\n", 256 | "\n", 257 | "print \"TailNum completed.\"\n", 258 | "print \"UniqueCarrier completed.\"\n", 259 | "print \"Dest completed.\"\n", 260 | "print \"Origin completed.\"\n", 261 | "\n", 262 | "print \"Conversion to numeric completed.\"\n", 263 | "\n", 264 | "print \"After conversion...\"\n", 265 | "# dfSFO = df[df['Origin'].isin([Origin['SFO']])]\n", 266 | "dfSFO = df[df['Origin']==56]\n", 267 | "print \"SFO len - \", len(dfSFO)\n", 268 | "# print Dest[np.unique(dfSFO['Dest'])]\n", 269 | "\n", 270 | "dfOAK = df[df['Origin'].isin([Origin['OAK']])]\n", 271 | "print \"OAK len - \", len(dfOAK)\n", 272 | "# print Dest[np.unique(dfOAK['Dest'])]\n", 273 | "# print Origin+1\n", 274 | "# print Dest+1\n", 275 | "# df[df['Origin'] == 'SFO']\n", 276 | "# df.to_csv(\"why.csv\")" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "stream": "stdout", 284 | "text": [ 285 | "TailNum completed.\n", 286 | "UniqueCarrier completed.\n", 287 | "Dest completed.\n", 288 | "Origin completed.\n", 289 | "Conversion to numeric completed.\n", 290 | "After conversion...\n", 291 | "SFO len - 0\n", 292 | "OAK len - 0\n" 293 | ] 294 | } 295 | ], 296 | "prompt_number": 67 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "\n", 303 | "# print \"Begin cross validation...\"\n", 304 | "\n", 305 | "# features = df.columns[0:9]\n", 306 | "# target_names = ['Not Delayed', 'Delayed']\n", 307 | "# accuracy = {}\n", 308 | "# results = {}\n", 309 | "# matrix = {}\n", 310 | "# prec = {}\n", 311 | "# recall = {}\n", 312 | "\n", 313 | "# for year in YEARS:\n", 314 | "# print \"Testing on - \", year\n", 315 | "# train = df[df['Year'] != int(year)]\n", 316 | "# test = df[df['Year'] == int(year)]\n", 317 | "# test = test[test['Origin'].isin([Origin['OAK'], Origin['SFO']])]\n", 318 | "# print len(train), len(test)\n", 319 | "# # rows = np.random.choice(np.random.permutation(\n", 320 | "# # test.index.values), len(test) // 1, replace=False)\n", 321 | "# # print rows\n", 322 | "# # sampled_test = test.ix[rows]\n", 323 | "# sampled_test = test\n", 324 | "# trainTargets = np.array(train['label']).astype(int)\n", 325 | "# testTargets = np.array(sampled_test['label']).astype(int)\n", 326 | "# print \"Train length - \", len(train), \"Test length - \", len(sampled_test)\n", 327 | "# # print train['Year']\n", 328 | "# # print test['Year']\n", 329 | "# print \"Model fitting and prediction started...\"\n", 330 | "# gnb = GaussianNB()\n", 331 | "# y_gnb = gnb.fit(train[features], 
trainTargets).predict(\n", 332 | "# sampled_test[features])\n", 333 | "# sampled_test['pred_label'] = y_gnb\n", 334 | "# # y_prob = gnb.fit(\n", 335 | "# # train[features], trainTargets).predict_proba(test[features])\n", 336 | "# # print y_prob\n", 337 | "# # test['pred_prob'] = y_prob[1][1]\n", 338 | "# print \"Classification completed.\"\n", 339 | "# createPickle(gnb, INPUT_FILE_PATH + \"classifier_\" + year + \".pkl\")\n", 340 | "# createPickle(y_gnb, INPUT_FILE_PATH + \"label_\" + year + \".pkl\")\n", 341 | "# sampled_test.to_csv(\n", 342 | "# INPUT_FILE_PATH + \"\\dfTest\" + year + \".csv\", index=False)\n", 343 | "\n", 344 | "# print \"\\nCalculating metrcs...\"\n", 345 | "# accuracy[int(year)] = accuracy_score(sampled_test['label'], y_gnb)\n", 346 | "# print \"Accuracy score - \", accuracy[int(year)]\n", 347 | "# prec[int(year)] = precision_score(\n", 348 | "# sampled_test['label'], y_gnb, average='micro')\n", 349 | "# print \"Precision Score - \", prec[int(year)]\n", 350 | "# recall[int(year)] = recall_score(\n", 351 | "# sampled_test['label'], y_gnb, average='micro')\n", 352 | "# print \"Recall Score - \", recall[int(year)]\n", 353 | "# print \"Confusion matrix\"\n", 354 | "# matrix[int(year)] = metrics.confusion_matrix(\n", 355 | "# sampled_test['label'], y_gnb)\n", 356 | "# print matrix[int(year)]\n", 357 | "# results[int(year)] = precision_recall_fscore_support(\n", 358 | "# sampled_test['label'], y_gnb, average='micro')\n", 359 | "# print \"Precision, recall, F-Score, Support - \", results[int(year)]\n", 360 | "# print \"Classification report\"\n", 361 | "# print classification_report(np.array(sampled_test['label']), y_gnb,\n", 362 | "# target_names=target_names)\n", 363 | "# print\n", 364 | "# train = []\n", 365 | "# test = []\n", 366 | "\n", 367 | "# print \"Accuracy\\n\", accuracy\n", 368 | "# print \"\\nPrecision\\n\", prec\n", 369 | "# print \"\\nRecall\\n\", recall\n", 370 | "# print \"\\nMetrics\\n\", results\n", 371 | "# print \"\\nMatrix\\n\", matrix\n", 372 | "\n", 373 | "# print \"\\nMean Cross validation Precision score\", np.mean(pd.Series(prec))\n", 374 | "# print \"\\nMean Cross validation Recall score\", np.mean(pd.Series(recall))\n", 375 | "# print \"\\nMean Cross validation Accuracy score\", np.mean(pd.Series(accuracy))\n", 376 | "\n", 377 | "# # print \"\\nPickling stuff...\"\n", 378 | "# # createPickle(accuracy, 'accuracy.pkl')\n", 379 | "# # createPickle(prec, 'prec.pkl')\n", 380 | "# # createPickle(results, 'results.pkl')\n", 381 | "# # createPickle(matrix, 'matrix.pkl')\n", 382 | "# # createPickle(Dest, 'Dest.pkl')\n", 383 | "# # createPickle(Origin, 'Origin.pkl')\n", 384 | "# # createPickle(UniqueCarrier, 'UniqueCarrier.pkl')\n", 385 | "# # createPickle(TailNum, 'TailNum.pkl')\n" 386 | ], 387 | "language": "python", 388 | "metadata": {}, 389 | "outputs": [], 390 | "prompt_number": 33 391 | } 392 | ], 393 | "metadata": {} 394 | } 395 | ] 396 | } -------------------------------------------------------------------------------- /Old Python Code/Untitled2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:1b2400b379e8920e0aa6061e92e9cd24c52cafda7a0349568949d9c59aa51ae9" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "%matplotlib inline\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import 
numpy as np\n", 19 | "from pylab import figure, show\n", 20 | "from pandas import DataFrame, Series\n", 21 | "import pandas as pd\n", 22 | "import csv\n", 23 | "import os\n", 24 | "import statsmodels.formula.api as smf\n", 25 | "import scipy.stats as stats\n", 26 | "import statsmodels.api as sm\n", 27 | "from IPython.core.display import HTML\n", 28 | "from bokeh.plotting import *\n", 29 | "import seaborn as sns\n", 30 | "from bokeh.objects import ColumnDataSource, Range1d\n", 31 | "from math import floor\n", 32 | "import bokeh as bokeh\n", 33 | "import sys\n", 34 | "import csv\n", 35 | "import datetime" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [], 40 | "prompt_number": 8 41 | }, 42 | { 43 | "cell_type": "code", 44 | "collapsed": false, 45 | "input": [ 46 | "\n", 47 | "TIME_DELTA = 3\n", 48 | "\n", 49 | "# for arg in sys.argv:\n", 50 | "# \tif(arg != 'date_graph.py'):\n", 51 | "# \t\tstart_date = datetime.datetime.strptime(arg, '%m-%d-%y')\n", 52 | "# \t\tstart_date = datetime.date(start_date.year, start_date.month, start_date.day)\n", 53 | "\n", 54 | "start_date = datetime.datetime.strptime('05-08-08', '%m-%d-%y')\n", 55 | "print start_date\n", 56 | "\n", 57 | "delta = datetime.timedelta(days=TIME_DELTA)\n", 58 | "begin = start_date - delta\n", 59 | "end = start_date + delta\n", 60 | "\n", 61 | "SFO_Hash = {}\n", 62 | "OAK_Hash = {}\n", 63 | "SFO_count = 0\n", 64 | "OAK_count = 0\n", 65 | "with open('C:\\\\data\\\\airline\\\\_dfTest2008.csv', 'r') as data:\n", 66 | "\tcsv_reader = csv.reader(data, delimiter=',')\n", 67 | "\tfor row in csv_reader:\n", 68 | "\t\tif(row[0] != 'Year'):\n", 69 | "\t\t\tyear = int(row[0])\n", 70 | "\t\t\tmonth = int(row[1])\n", 71 | "\t\t\tdate = int(row[2])\n", 72 | "\t\t\tcurr_date = datetime.datetime(year, month, date)\n", 73 | "\t\t\tif(curr_date >= begin and curr_date <= end):\n", 74 | "\t\t\t\torigin = row[7]\n", 75 | "\t\t\t\tif(origin == '270'):\n", 76 | "\t\t\t\t\tlabel = int(row[10])\n", 77 | "\t\t\t\t\tSFO_count += 1\n", 78 | "\t\t\t\t\tif(curr_date not in SFO_Hash):\n", 79 | "\t\t\t\t\t\tSFO_Hash[curr_date] = [label]\n", 80 | "\t\t\t\t\telse:\n", 81 | "\t\t\t\t\t\tSFO_Hash[curr_date].append(label)\t\n", 82 | "\t\t\t\tif(origin == '215'):\n", 83 | "\t\t\t\t\tlabel = int(row[10])\n", 84 | "\t\t\t\t\tOAK_count += 1\n", 85 | "\t\t\t\t\tif(curr_date not in OAK_Hash):\n", 86 | "\t\t\t\t\t\tOAK_Hash[curr_date] = [label]\n", 87 | "\t\t\t\t\telse:\n", 88 | "\t\t\t\t\t\tOAK_Hash[curr_date].append(label)\n", 89 | "\n", 90 | "iterator = datetime.timedelta(days=1)\n", 91 | "day_values = []\n", 92 | "SFO_Delays = []\n", 93 | "SFO_On_Time = []\n", 94 | "SFO_Flights = []\n", 95 | "SFO_Pct = []\n", 96 | "OAK_Delays = []\n", 97 | "OAK_On_Time = []\n", 98 | "OAK_Flights = []\n", 99 | "OAK_Pct = []\n", 100 | "\n", 101 | "while begin <= end:\n", 102 | "\tif(begin not in SFO_Hash):\n", 103 | "\t\tSFO_Delays.append(0)\n", 104 | "\t\tSFO_On_Time.append(0)\n", 105 | "\t\tSFO_Pct.append(0.00)\n", 106 | "\telse:\n", 107 | "\t\tSFO_Flights = SFO_Hash[begin]\n", 108 | "\t\tdelays = sum(SFO_Flights)\n", 109 | "\t\tnum_flights = len(SFO_Flights)\n", 110 | "\t\tpct = float(delays) / (num_flights + delays)\n", 111 | "\t\tSFO_Delays.append(delays)\n", 112 | "\t\tSFO_On_Time.append(num_flights - delays)\n", 113 | "\t\tSFO_Pct.append(pct)\n", 114 | "\t\n", 115 | "\tif(begin not in OAK_Hash):\n", 116 | "\t\tOAK_Delays.append(0)\n", 117 | "\t\tOAK_On_Time.append(0)\n", 118 | "\t\tOAK_Pct.append(0.00)\n", 119 | "\telse:\n", 120 | "\t\tOAK_Flights = 
OAK_Hash[begin]\n", 121 | "\t\tdelays = sum(OAK_Flights)\n", 122 | "\t\tnum_flights = len(OAK_Flights)\n", 123 | "\t\tpct = float(delays) / (num_flights + delays)\n", 124 | "\t\tOAK_Delays.append(delays)\n", 125 | "\t\tOAK_On_Time.append(num_flights - delays)\n", 126 | "\t\tOAK_Pct.append(pct)\n", 127 | "\t\n", 128 | "\tday_values.append(begin)\n", 129 | "\tbegin += iterator\n", 130 | "\n", 131 | "print SFO_Pct\n", 132 | "print OAK_Pct" 133 | ], 134 | "language": "python", 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "stream": "stdout", 140 | "text": [ 141 | "2008-05-08 00:00:00\n", 142 | "[0.22568093385214008, 0.23976608187134502, 0.2556390977443609, 0.2560747663551402, 0.263254113345521, 0.2478448275862069, 0.30275229357798167]" 143 | ] 144 | }, 145 | { 146 | "output_type": "stream", 147 | "stream": "stdout", 148 | "text": [ 149 | "\n", 150 | "[0.24793388429752067, 0.24680851063829787, 0.2697095435684647, 0.27058823529411763, 0.28185328185328185, 0.2613065326633166, 0.3004115226337449]\n" 151 | ] 152 | } 153 | ], 154 | "prompt_number": 4 155 | }, 156 | { 157 | "cell_type": "code", 158 | "collapsed": false, 159 | "input": [ 160 | "print \"Xastart_date" 161 | ], 162 | "language": "python", 163 | "metadata": {}, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "collapsed": false, 169 | "input": [ 170 | "plt.title('Probability of Flight Delays at SFO vs. OAK Given Specific Date and +/- 3 Days')\n", 171 | "\n", 172 | "ax1 = plt.subplot(211)\n", 173 | "#ax1.bar(day_values, SFO_Delays, bottom = SFO_On_Time, color = 'red')\n", 174 | "#ax1.bar(day_values, SFO_On_Time, color = 'blue')\n", 175 | "ax1.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 176 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n", 177 | "ax1.set_yticks([0, 200, 450])\n", 178 | "ax1.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at SFO')\n", 179 | "\n", 180 | "ax2 = plt.subplot(212)\n", 181 | "#ax2.bar(day_values, OAK_Delays, bottom = OAK_On_Time, color = 'red')\n", 182 | "#ax2.bar(day_values, OAK_On_Time, color = 'blue')\n", 183 | "ax2.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 184 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n", 185 | "ax2.set_yticks([0, 200, 450])\n", 186 | "ax2.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at OAK')\n", 187 | "plt.show()" 188 | ], 189 | "language": "python", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "metadata": {}, 194 | "output_type": "display_data", 195 | "text": [ 196 | "" 197 | ] 198 | } 199 | ], 200 | "prompt_number": 7 201 | } 202 | ], 203 | "metadata": {} 204 | } 205 | ] 206 | } -------------------------------------------------------------------------------- /Old Python Code/accuracy.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | I2008 3 | cnumpy.core.multiarray 4 | scalar 5 | p1 6 | (cnumpy 7 | dtype 8 | p2 9 | (S'f8' 10 | p3 11 | I0 12 | I1 13 | tp4 14 | Rp5 15 | (I3 16 | S'<' 17 | p6 18 | NNNI-1 19 | I-1 20 | I0 21 | tp7 22 | bS'\x00\x00\x00\x00\x00\x00\xf0?' 23 | p8 24 | tp9 25 | Rp10 26 | sI2001 27 | g1 28 | (g5 29 | S'\x00\x00\x00\x00\x00\x00\xf0?' 30 | p11 31 | tp12 32 | Rp13 33 | sI2007 34 | g1 35 | (g5 36 | S'\x00\x00\x00\x00\x00\x00\xf0?' 37 | p14 38 | tp15 39 | Rp16 40 | s. 
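The .pkl files in this folder (accuracy.pkl above, and matrix.pkl, prec.pkl and results.pkl below) are protocol-0 pickles of per-year metric dictionaries. A minimal sketch of reading one back, assuming the Python 2 used throughout this repo:

    import pickle

    # accuracy.pkl maps year -> accuracy score, e.g. {2001: 1.0, 2007: 1.0, 2008: 1.0}
    with open('accuracy.pkl', 'rb') as f:
        accuracy = pickle.load(f)
    for year in sorted(accuracy):
        print 'Year %d accuracy: %.4f' % (year, accuracy[year])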
-------------------------------------------------------------------------------- /Old Python Code/counter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | with open('C:\\Dropbox\\Naive Bayes\\Analysis1.csv', 'r') as data: 4 | csv_reader = csv.reader(data, delimiter=',') 5 | SFO_count = 0 6 | OAK_count = 0 7 | for row in csv_reader: 8 | origin = row[1] 9 | if(origin == '270'): 10 | SFO_count += int(row[3]) 11 | elif(origin == '215'): 12 | OAK_count += int(row[3]) 13 | else: 14 | continue 15 | 16 | print OAK_count 17 | print SFO_count 18 | -------------------------------------------------------------------------------- /Old Python Code/counter1.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | with open('C:\\Dropbox\\Naive Bayes\\_dfTest2008\\_dfTest2008.csv', 'r') as data: 4 | csv_reader = csv.reader(data, delimiter=',') 5 | SFO_count = 0 6 | OAK_count = 0 7 | for row in csv_reader: 8 | origin = row[7] 9 | if(origin == '270'): 10 | SFO_count += 1 11 | elif(origin == '215'): 12 | OAK_count += 1 13 | else: 14 | continue 15 | 16 | print OAK_count 17 | print SFO_count 18 | -------------------------------------------------------------------------------- /Old Python Code/data_reader_v2.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pickle 3 | 4 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 5 | years = [2008] 6 | 7 | def ComputeDayofYear(row): 8 | """This function will return an integer to represent the day of the year given an integer 9 | representing month and an integer representing the day of the month. This number will 10 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 11 | as 0. Feb 29th will be returned as 59.""" 12 | 13 | if(row[0] == '1'): 14 | calc = 0 + int(row[1]) - 1 15 | row[1] = str(calc) 16 | elif(row[0] == '2'): 17 | calc = 31 + int(row[1]) - 1 18 | row[1] = str(calc) 19 | elif(row[0] == '3'): 20 | calc = 60 + int(row[1]) - 1 21 | row[1] = str(calc) 22 | elif(row[0] == '4'): 23 | calc = 91 + int(row[1]) - 1 24 | row[1] = str(calc) 25 | elif(row[0] == '5'): 26 | calc = 121 + int(row[1]) - 1 27 | row[1] = str(calc) 28 | elif(row[0] == '6'): 29 | calc = 152 + int(row[1]) - 1 30 | row[1] = str(calc) 31 | elif(row[0] == '7'): 32 | calc = 182 + int(row[1]) - 1 33 | row[1] = str(calc) 34 | elif(row[0] == '8'): 35 | calc = 213 + int(row[1]) - 1 36 | row[1] = str(calc) 37 | elif(row[0] == '9'): 38 | calc = 244 + int(row[1]) - 1 39 | row[1] = str(calc) 40 | elif(row[0] == '10'): 41 | calc = 274 + int(row[1]) - 1 42 | row[1] = str(calc) 43 | elif(row[0] == '11'): 44 | calc = 305 + int(row[1]) - 1 45 | row[1] = str(calc) 46 | elif(row[0] == '12'): 47 | calc = 335 + int(row[1]) - 1 48 | row[1] = str(calc) 49 | return row 50 | 51 | 52 | def DiscretizeDepTime(row): 53 | """This function takes a scheduled departure time, classifies the departure time as: 54 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 55 | is assumed to be an integer in 24-hour time format. These labels will correspond to 56 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 
57 | An error time is returned as morning.""" 58 | 59 | if(int(row[3]) <= 559): 60 | row[3] = '2' 61 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 62 | row[3] = '0' 63 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 64 | row[3] = '1' 65 | elif(int(row[3]) >= 1800): 66 | row[3] = '2' 67 | else: 68 | row[3] = '0' 69 | return row 70 | 71 | 72 | def AddDepVar(row): 73 | """This function adds a classification label based on the length of the recorded 74 | Departure Delay in the data set. It assumes an input integer value of the delay in mins. 75 | By airline industry standards, flight delays are defined as departure delays greater than 76 | or equal to 15 minutes. For delayed flights, this variable will have value "1". 77 | For on time flights, it will have value "0". Default value will be set at "0".""" 78 | 79 | if(float(row[6]) >= 15):  # numeric comparison; the old string compare ordered '9' >= '15' 80 | row[6] = '1' 81 | else: 82 | row[6] = '0' 83 | return row 84 | 85 | def SaveData(data, pickle_file_name): 86 | """This function pickles each file.""" 87 | 88 | f = open(pickle_file_name, "wb")  # binary mode, required for pickle output 89 | pickle.dump(data, f) 90 | f.close() 91 | 92 | 93 | 94 | for i in years: 95 | data = [] 96 | file_path = 'C:\\data\\airline\\' + str(i) + '.csv'  # e.g. C:\data\airline\2008.csv 97 | pickle_file_name = 'data' + str(i) 98 | with open(file_path, 'r') as data_csv: 99 | csv_reader = csv.reader(data_csv, delimiter=',') 100 | for row in csv_reader:  # stream rows rather than loading the whole file into memory 101 | if row[21] == '0': 102 | content = [row[c] for c in needed_cols] 103 | content2 = ComputeDayofYear(content) 104 | content3 = DiscretizeDepTime(content2) 105 | content4 = AddDepVar(content3) 106 | data.append(content4) 107 | SaveData(data, pickle_file_name) 108 | -------------------------------------------------------------------------------- /Old Python Code/data_reader_v3.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pickle 3 | 4 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 5 | years = [2008] 6 | 7 | def ComputeDayofYear(row): 8 | """This function will return an integer to represent the day of the year given an integer 9 | representing month and an integer representing the day of the month. This number will 10 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 11 | as 0.
Feb 29th will be returned as 59.""" 12 | 13 | if(row[0] == '1'): 14 | calc = 0 + int(row[1]) - 1 15 | row[1] = str(calc) 16 | elif(row[0] == '2'): 17 | calc = 31 + int(row[1]) - 1 18 | row[1] = str(calc) 19 | elif(row[0] == '3'): 20 | calc = 60 + int(row[1]) - 1 21 | row[1] = str(calc) 22 | elif(row[0] == '4'): 23 | calc = 91 + int(row[1]) - 1 24 | row[1] = str(calc) 25 | elif(row[0] == '5'): 26 | calc = 121 + int(row[1]) - 1 27 | row[1] = str(calc) 28 | elif(row[0] == '6'): 29 | calc = 152 + int(row[1]) - 1 30 | row[1] = str(calc) 31 | elif(row[0] == '7'): 32 | calc = 182 + int(row[1]) - 1 33 | row[1] = str(calc) 34 | elif(row[0] == '8'): 35 | calc = 213 + int(row[1]) - 1 36 | row[1] = str(calc) 37 | elif(row[0] == '9'): 38 | calc = 244 + int(row[1]) - 1 39 | row[1] = str(calc) 40 | elif(row[0] == '10'): 41 | calc = 274 + int(row[1]) - 1 42 | row[1] = str(calc) 43 | elif(row[0] == '11'): 44 | calc = 305 + int(row[1]) - 1 45 | row[1] = str(calc) 46 | elif(row[0] == '12'): 47 | calc = 335 + int(row[1]) - 1 48 | row[1] = str(calc) 49 | return row 50 | 51 | 52 | def DiscretizeDepTime(row): 53 | """This function takes a scheduled departure time, classifies the departure time as: 54 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 55 | is assumed to be an integer in 24-hour time format. These labels will correspond to 56 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 57 | An error time is returned as morning.""" 58 | 59 | if(int(row[3]) <= 559): 60 | row[3] = '2' 61 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 62 | row[3] = '0' 63 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 64 | row[3] = '1' 65 | elif(int(row[3]) >= 1800): 66 | row[3] = '2' 67 | else: 68 | row[3] = '0' 69 | return row 70 | 71 | 72 | def AddDepVar(row): 73 | """This function adds a classification label based on the length of the recorded 74 | Departure Delay in the data set. It assumes an input integer value of the delay in mins. 75 | By airline industry standards, flight delays are defined as departure delays greater than 76 | or equal to 15 minutes. For delayed flights, this variable will have value "1". 77 | For on time flights, it will have value "0". 
Default value will be set at "0".""" 78 | 79 | if(row[6] >= '15'): 80 | row[6] = '1' 81 | else: 82 | row[6] = '0' 83 | return row 84 | 85 | def SaveData(data, pickle_file_name): 86 | """This function pickles each file.""" 87 | 88 | f = open (pickle_file_name, "w") 89 | pickle.dump(data, f) 90 | f.close() 91 | 92 | 93 | 94 | for i in years: 95 | data = [] 96 | file_path='C:\data\airline' + str(i) + '.csv' 97 | pickle_file_name = 'data' + str(i) 98 | with open(file_path, 'r') as data_csv: 99 | csv_reader = csv.reader(data_csv, delimiter=',') 100 | for row in list(csv_reader): 101 | if row[21] == '0': 102 | if (row[16] == 'SFO' or row[16] == 'OAK'): 103 | content = list(row[i] for i in needed_cols) 104 | content2 = ComputeDayofYear(content) 105 | content3 = DiscretizeDepTime(content2) 106 | content4 = AddDepVar(content3) 107 | data.append(content4) 108 | SaveData(data, pickle_file_name) 109 | 110 | 111 | -------------------------------------------------------------------------------- /Old Python Code/data_reader_v4_ek.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pickle 3 | import time 4 | import os 5 | from boto.s3.connection import S3Connection 6 | from boto.s3.key import Key 7 | 8 | 9 | timestr = time.strftime("%Y%m%d-%H%M%S") 10 | print timestr 11 | 12 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 13 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] 14 | j=0 15 | 16 | def ComputeDayofYear(row): 17 | """This function will return an integer to represent the day of the year given an integer 18 | representing month and an integer representing the day of the month. This number will 19 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 20 | as 0. Feb 29th will be returned as 59.""" 21 | 22 | if(row[0] == '1'): 23 | calc = 0 + int(row[1]) - 1 24 | row[1] = str(calc) 25 | elif(row[0] == '2'): 26 | calc = 31 + int(row[1]) - 1 27 | row[1] = str(calc) 28 | elif(row[0] == '3'): 29 | calc = 60 + int(row[1]) - 1 30 | row[1] = str(calc) 31 | elif(row[0] == '4'): 32 | calc = 91 + int(row[1]) - 1 33 | row[1] = str(calc) 34 | elif(row[0] == '5'): 35 | calc = 121 + int(row[1]) - 1 36 | row[1] = str(calc) 37 | elif(row[0] == '6'): 38 | calc = 152 + int(row[1]) - 1 39 | row[1] = str(calc) 40 | elif(row[0] == '7'): 41 | calc = 182 + int(row[1]) - 1 42 | row[1] = str(calc) 43 | elif(row[0] == '8'): 44 | calc = 213 + int(row[1]) - 1 45 | row[1] = str(calc) 46 | elif(row[0] == '9'): 47 | calc = 244 + int(row[1]) - 1 48 | row[1] = str(calc) 49 | elif(row[0] == '10'): 50 | calc = 274 + int(row[1]) - 1 51 | row[1] = str(calc) 52 | elif(row[0] == '11'): 53 | calc = 305 + int(row[1]) - 1 54 | row[1] = str(calc) 55 | elif(row[0] == '12'): 56 | calc = 335 + int(row[1]) - 1 57 | row[1] = str(calc) 58 | return row 59 | 60 | 61 | def DiscretizeDepTime(row): 62 | """This function takes a scheduled departure time, classifies the departure time as: 63 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 64 | is assumed to be an integer in 24-hour time format. These labels will correspond to 65 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 
66 | An error time is returned as morning.""" 67 | 68 | if(int(row[3]) <= 559): 69 | row[3] = '2' 70 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 71 | row[3] = '0' 72 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 73 | row[3] = '1' 74 | elif(int(row[3]) >= 1800): 75 | row[3] = '2' 76 | else: 77 | row[3] = '0' 78 | return row 79 | 80 | 81 | def AddDepVar(row): 82 | """This function adds a classification label based on the length of the recorded 83 | Departure Delay in the data set. It assumes an input integer value of the delay in mins. 84 | By airline industry standards, flight delays are defined as departure delays greater than 85 | or equal to 15 minutes. For delayed flights, this variable will have value "1". 86 | For on time flights, it will have value "0". Default value will be set at "0".""" 87 | 88 | if(row[6] >= '15'): 89 | row[6] = '1' 90 | else: 91 | row[6] = '0' 92 | return row 93 | 94 | def SaveData(data, pickle_file_name): 95 | """This function pickles each file.""" 96 | 97 | f = open (pickle_file_name, "wb") 98 | try: 99 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 100 | except Exception as e: 101 | print e 102 | f.close() 103 | 104 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 105 | bucket = conn.get_bucket('i290-aero') 106 | k = Key(bucket) 107 | k.key = pickle_file_name 108 | k.set_contents_from_filename(pickle_file_name) 109 | 110 | os.remove(pickle_file_name) 111 | 112 | 113 | for i in years: 114 | data = [] 115 | ''' 116 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 117 | bucket = conn.get_bucket('i290-aero') 118 | k = Key(bucket) 119 | k.key = 'data2001.csv' 120 | file_path = k.get_contents_as_string() 121 | ''' 122 | file_path='data' + str(i) + '.csv' 123 | pickle_file_name = timestr+'-data-' + str(i) 124 | with open(file_path, 'r') as data_csv: 125 | csv_reader = csv.reader(data_csv, delimiter=',') 126 | j = 0 127 | for row in csv_reader: 128 | if row[21] == '0': # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'): 129 | # if (row[16] == 'SFO' or row[16] == 'OAK'): 130 | content = [row[i] for i in needed_cols] 131 | content2 = ComputeDayofYear(content) 132 | content3 = DiscretizeDepTime(content2) 133 | content4 = AddDepVar(content3) 134 | data.append(content4) 135 | # print 'content4', content4 136 | # print 'data', data 137 | # fff = raw_input() 138 | j=j+1 139 | if j % 2000000 == 0: 140 | print j 141 | SaveData(data, pickle_file_name + '-' + str(j)) 142 | data = [] 143 | SaveData(data, pickle_file_name) 144 | 145 | 146 | -------------------------------------------------------------------------------- /Old Python Code/date_iterator_plot.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import csv 3 | import random 4 | import matplotlib.pyplot as plt; plt.rcdefaults() 5 | 6 | # Eunkwang data: SFO = 1; OAK = 2 7 | # Divya data: SFO = 136; OAK = 141 8 | 9 | # Need to change row indexes to make sure they match data from Eunkwang. 
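# A more compact way to build the {date: [label, ...]} hashes constructed below
# is collections.defaultdict. Illustrative sketch only -- the column indexes
# (0-2 = year/month/day, 8 = origin code, 9 = predicted label) match the data
# files this script reads; csv and datetime are imported at the top of the file.
from collections import defaultdict

def build_delay_hash(path, airport_codes):
    """Return {airport_code: {date: [label, ...]}} for the given origin codes."""
    hashes = dict((code, defaultdict(list)) for code in airport_codes)
    with open(path, 'r') as data:
        for row in csv.reader(data, delimiter=','):
            if row[8] in hashes:
                day = datetime.date(int(row[0]), int(row[1]), int(row[2]))
                hashes[row[8]][day].append(int(row[9]))
    return hashes
# e.g. build_delay_hash('DivyaSampleData.csv', ('136', '141'))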
10 | 11 | '''with open('EunkwangSampleData.csv', 'r') as data: 12 | csv_reader = csv.reader(data, delimiter=',') 13 | SFO_EJ_Hash = {} 14 | OAK_EJ_Hash = {} 15 | for row in csv_reader: 16 | origin = row[8] 17 | if(origin == '1'): 18 | year = int(row[0]) 19 | month = int(row[1]) 20 | date = int(row[2]) 21 | key = datetime.date(year, month, date) 22 | label = int(row[9]) 23 | if(key not in SFO_EJ_Hash): 24 | SFO_EJ_Hash[key] = [label] 25 | else: 26 | SFO_EJ_Hash[key].append(label) 27 | elif(origin == '2'): 28 | year = int(row[0]) 29 | month = int(row[1]) 30 | date = int(row[2]) 31 | key = datetime.date(year, month, date) 32 | label = int(row[9]) 33 | if(key not in OAK_EJ_Hash): 34 | OAK_EJ_Hash[key] = [label] 35 | else: 36 | OAK_EJ_Hash[key].append(label) 37 | else: 38 | continue''' 39 | 40 | with open('DivyaSampleData.csv', 'r') as data: 41 | csv_reader = csv.reader(data, delimiter=',') 42 | SFO_DM_Hash = {} 43 | OAK_DM_Hash = {} 44 | for row in csv_reader: 45 | origin = row[8] 46 | if(origin == '136'): 47 | year = int(row[0]) 48 | month = int(row[1]) 49 | date = int(row[2]) 50 | key = datetime.date(year, month, date) 51 | label = int(row[9]) 52 | if(key not in SFO_DM_Hash): 53 | SFO_DM_Hash[key] = [label] 54 | else: 55 | SFO_DM_Hash[key].append(label) 56 | elif(origin == '141'): 57 | year = int(row[0]) 58 | month = int(row[1]) 59 | date = int(row[2]) 60 | key = datetime.date(year, month, date) 61 | label = int(row[9]) 62 | if(key not in OAK_DM_Hash): 63 | OAK_DM_Hash[key] = [label] 64 | else: 65 | OAK_DM_Hash[key].append(label) 66 | else: 67 | continue 68 | 69 | start_date = datetime.date(2008, 1, 1) 70 | end_date = datetime.date(2008, 1,31) 71 | date_values = [] 72 | SFO_DM_Delays = [] 73 | SFO_DM_On_Time = [] 74 | OAK_DM_Delays = [] 75 | OAK_DM_On_Time = [] 76 | SFO_EJ_Delays = [] 77 | SFO_EJ_On_Time = [] 78 | OAK_EJ_Delays = [] 79 | OAK_EJ_On_Time = [] 80 | 81 | d = start_date 82 | delta = datetime.timedelta(days=1) 83 | while d <= end_date: 84 | '''if(d not in SFO_EJ_Hash): 85 | SFO_EJ_Values.append([0,0]) 86 | else: 87 | SFO_EJ_Flights = SFO_EJ_Hash[d] 88 | delays = sum(SFO_EJ_Flights) 89 | num_flights = len(SFO_EJ_Flights) 90 | SFO_EJ_Delays.append(delays) 91 | SFO_EJ_On_Time.append(num_flights - delays) 92 | 93 | if(d not in OAK_EJ_Hash): 94 | OAK_EJ_Values.append([0,0]) 95 | else: 96 | OAK_EJ_Flights = OAK_EJ_Hash[d] 97 | delays = sum(OAK_EJ_Flights) 98 | num_flights = len(OAK_EJ_Flights) 99 | OAK_EJ_Delays.append(delays) 100 | OAK_EJ_On_Time.append(num_flights - delays)''' 101 | 102 | if(d not in SFO_DM_Hash): 103 | SFO_DM_Values.append([0,0]) 104 | else: 105 | SFO_DM_Flights = SFO_DM_Hash[d] 106 | delays = sum(SFO_DM_Flights) 107 | num_flights = len(SFO_DM_Flights) 108 | SFO_DM_Delays.append(delays) 109 | SFO_DM_On_Time.append(num_flights - delays) 110 | 111 | if(d not in OAK_DM_Hash): 112 | OAK_DM_Values.append([0,0]) 113 | else: 114 | OAK_DM_Flights = OAK_DM_Hash[d] 115 | delays = sum(OAK_DM_Flights) 116 | num_flights = len(OAK_DM_Flights) 117 | OAK_DM_Delays.append(delays) 118 | OAK_DM_On_Time.append(num_flights - delays) 119 | 120 | date_values.append(d) 121 | d += delta 122 | 123 | plt.title('Probability of Flight Delays at SFO vs. 
OAK') 124 | 125 | ax1 = plt.subplot(211) 126 | ax1.bar(date_values, SFO_DM_Delays, bottom = SFO_DM_On_Time, color = 'green') 127 | ax1.bar(date_values, SFO_DM_On_Time, color = 'blue') 128 | ax1.set_xticklabels(['Jan 1 2008', '', '', '', '', '', '','','','','','','','','Jan 15 2008', '','','','','','','','','','','','','','','','Jan 31 2008']) 129 | #ax1.set_xticklabels(['Jan 2008','','','','','Jun 2008','','','','','','Dec 2008']) 130 | ax1.set_yticks([0, 50, 100]) 131 | ax1.set_title('On-Time Flights and Delayed Flights at SFO') 132 | 133 | ax2 = plt.subplot(212) 134 | ax2.bar(date_values, OAK_DM_Delays, bottom = OAK_DM_On_Time, color = 'red') 135 | ax2.bar(date_values, OAK_DM_On_Time, color = 'grey') 136 | ax2.set_xticklabels(['Jan 1 2008', '', '', '', '', '', '','','','','','','','','Jan 15 2008', '','','','','','','','','','','','','','','','Jan 31 2008']) 137 | #ax2.set_xticklabels(['Jan 2008','','','','','Jun 2008','','','','','','Dec 2008']) 138 | ax2.set_yticks([0, 50, 100]) 139 | ax2.set_title('On-Time Flights and Delayed Flights at OAK') 140 | 141 | plt.show() -------------------------------------------------------------------------------- /Old Python Code/logisticRegression.py: -------------------------------------------------------------------------------- 1 | # import matplotlib.pyplot as plt 2 | import numpy as np 3 | import random 4 | import pickle 5 | import sys 6 | import os 7 | from boto.s3.connection import S3Connection 8 | from boto.s3.key import Key 9 | 10 | pickle2001 = ['20140428-190051-data-2001', 11 | '20140428-190051-data-2001-2000000', 12 | '20140428-190051-data-2001-4000000'] 13 | pickle2002 = ['20140428-190051-data-2002', 14 | '20140428-190051-data-2002-2000000', 15 | '20140428-190051-data-2002-4000000'] 16 | pickle2003 = ['20140428-190051-data-2003', 17 | '20140428-190051-data-2003-2000000', 18 | '20140428-190051-data-2003-4000000', 19 | '20140428-190051-data-2003-6000000'] 20 | pickle2004 = ['20140428-190051-data-2004', 21 | '20140428-190051-data-2004-2000000', 22 | '20140428-190051-data-2004-4000000', 23 | '20140428-190051-data-2004-6000000'] 24 | pickle2005 = ['20140428-190051-data-2005', 25 | '20140428-190051-data-2005-2000000', 26 | '20140428-190051-data-2005-4000000', 27 | '20140428-190051-data-2005-6000000'] 28 | pickle2006 = ['20140428-190051-data-2006', 29 | '20140428-190051-data-2006-2000000', 30 | '20140428-190051-data-2006-4000000', 31 | '20140428-190051-data-2006-6000000'] 32 | pickle2007 = ['20140428-190051-data-2007', 33 | '20140428-190051-data-2007-2000000', 34 | '20140428-190051-data-2007-4000000', 35 | '20140428-190051-data-2007-6000000'] 36 | pickle2008 = ['20140428-190051-data-2008', 37 | '20140428-190051-data-2008-2000000', 38 | '20140428-190051-data-2008-4000000', 39 | '20140428-190051-data-2008-6000000'] 40 | 41 | 42 | def loadData(fileName): 43 | if os.path.exists(fileName) == False: 44 | print 'downloading', fileName, 'from s3' 45 | conn = S3Connection('key', 'val') 46 | bucket = conn.get_bucket('i290-aero') 47 | k = Key(bucket) 48 | k.key = fileName 49 | k.get_contents_to_filename(fileName) 50 | print 'downloaded', fileName, 'from s3' 51 | 52 | print 'now unpickle...' 53 | x = pickle.load(open(fileName, "rb")) 54 | x = np.array(x) 55 | print 'x.shape = ', x.shape, x[:, -1:].shape 56 | y = x[:, -1:].copy() # last col is y value (delay or not) 57 | x[:, -1:] = 1. 
58 | return x, y 59 | 60 | def gradientDescent(x, y, numIterations, dimension, theta): 61 | # theta = np.zeros(dimension)[np.newaxis].transpose() 62 | for i in range(1, numIterations): 63 | randIdx = random.randint(0, len(x) - 1) 64 | xTrans = x[randIdx][np.newaxis].transpose() 65 | # print theta.transpose(), xTrans 66 | u = 1 / (1 + np.exp(np.dot(theta.transpose() * (-1), xTrans))) 67 | loss = y[randIdx] - u 68 | gradient = np.dot(loss[0][0], xTrans) 69 | # update 70 | theta = theta + gradient / i 71 | return theta 72 | 73 | def graph(formula, x_range): 74 | x = np.array(x_range) 75 | y = eval(formula) 76 | plt.plot(x, y) 77 | 78 | 79 | # def getData(fileName): 80 | # f = open(fileName, 'r') 81 | # x = np.array([0,0,0]) 82 | # x0 = [] 83 | # x1 = [] 84 | # y = np.array([0]) 85 | # for line in f: 86 | # arr = line.strip().split(' ') 87 | # x = np.vstack((x, [float(arr[0]), float(arr[1]), 1.])) 88 | # y = np.vstack((y, [float(arr[2])])) 89 | # if arr[2] == '0': 90 | # x0.append((float(arr[0]), float(arr[1]))) 91 | # else: 92 | # x1.append((float(arr[0]), float(arr[1]))) 93 | 94 | # x = np.delete(x, 0, 0) 95 | # y = np.delete(y, 0, 0) 96 | # f.close() 97 | 98 | # return x, x0, x1, y 99 | 100 | 101 | 102 | 103 | def main(): 104 | # arg = sys.argv 105 | # if len(arg) < 2: 106 | # print 'USE: $ python logisticRegression.py [dataset_file]' 107 | # return 108 | # x, y = loadData(arg[1]) 109 | 110 | # x, x0, x1, y = getData('classification.dat') 111 | 112 | if os.path.exists('pickled_theta') == False: 113 | theta = None 114 | for elem in pickle2001 + pickle2002 + pickle2003 + pickle2004 + pickle2005 + pickle2006 + pickle2007: 115 | x, y = loadData(elem) 116 | if theta == None: 117 | theta = np.zeros(x.shape[1])[np.newaxis].transpose() 118 | print 'theta == None...... 
initialize..........', theta.shape 119 | theta = gradientDescent(x, y, 100000, x.shape[1], theta) 120 | print 'finished gradientDescent of ', elem 121 | print 'theta', theta 122 | 123 | f = open('pickled_theta', 'wb') 124 | pickle.dump(theta, f, protocol=pickle.HIGHEST_PROTOCOL) 125 | f.close() 126 | 127 | theta = pickle.load(open('pickled_theta', 'rb')) 128 | 129 | accu = 0 130 | length = 0 131 | for elem in pickle2008: 132 | if os.path.exists('dot-' + elem) == False or os.path.exists('y-' + elem) == False: 133 | x, y = loadData(elem) 134 | dotProduct = np.dot(x, theta) 135 | print '============= dot product =============' 136 | print dotProduct 137 | print '=============y =============' 138 | print y 139 | pickle.dump(dotProduct, open('dot-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) 140 | pickle.dump(y, open('y-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) 141 | else: 142 | dotProduct = pickle.load(open('dot-' + elem, 'rb')) 143 | y = pickle.load(open('y-' + elem, 'rb')) 144 | 145 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct] 146 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit] 147 | 148 | for i in range(len(prob)): 149 | if prob[i] == y[i]: 150 | accu += 1 151 | length += len(prob) 152 | print 'accuracy = ', accu * 100 / length 153 | 154 | # graph('(-1) * theta[2][0] / theta[1][0] - (theta[0][0] / theta[1][0]) * x', range(-3, 5)) 155 | print 'asdf' 156 | 157 | 158 | 159 | if __name__ == '__main__': 160 | main() -------------------------------------------------------------------------------- /Old Python Code/matrix.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | I2008 3 | cnumpy.core.multiarray 4 | _reconstruct 5 | p1 6 | (cnumpy 7 | ndarray 8 | p2 9 | (I0 10 | tp3 11 | S'b' 12 | p4 13 | tp5 14 | Rp6 15 | (I1 16 | (L2L 17 | L2L 18 | tp7 19 | cnumpy 20 | dtype 21 | p8 22 | (S'i4' 23 | p9 24 | I0 25 | I1 26 | tp10 27 | Rp11 28 | (I3 29 | S'<' 30 | p12 31 | NNNI-1 32 | I-1 33 | I0 34 | tp13 35 | bI00 36 | S'\xb9\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xe2\x03\x00\x00' 37 | p14 38 | tp15 39 | bsI2001 40 | g1 41 | (g2 42 | (I0 43 | tp16 44 | g4 45 | tp17 46 | Rp18 47 | (I1 48 | (L2L 49 | L2L 50 | tp19 51 | g11 52 | I00 53 | S'\x9a\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x04\x00\x00' 54 | p20 55 | tp21 56 | bsI2007 57 | g1 58 | (g2 59 | (I0 60 | tp22 61 | g4 62 | tp23 63 | Rp24 64 | (I1 65 | (L2L 66 | L2L 67 | tp25 68 | g11 69 | I00 70 | S'\xa9\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf2\x03\x00\x00' 71 | p26 72 | tp27 73 | bs. -------------------------------------------------------------------------------- /Old Python Code/model_selector.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt; plt.rcdefaults() 2 | 3 | # Divya and Eunkwang to provide [precision, recall, accuracy] for each of their 8 results. 4 | # This script will graph the models against each other and select the best model. 
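# Reference: the F1 score computed below is the harmonic mean of precision and
# recall, F1 = 2 * P * R / (P + R). Worked example with the assumed values in
# TEST_DATA: the fifth model has P = 0.8, R = 0.95, so
# F1 = 2 * 0.76 / 1.75 = 0.869 -- the highest of the five, which is why it is
# selected below.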
5 | 6 | TEST_DATA = [[0.4,0.6,0.8] , [0.5,0.3,0.69], [0.8, 0.2, 0.75], [0.3, 0.9, 0.72], [0.8, 0.95, 0.9]] 7 | 8 | def calc_f1_score(precision, recall): 9 | return (float(2 * (precision * recall) / (precision + recall))) 10 | 11 | precision_array = [] 12 | recall_array = [] 13 | best_f1 = 0.0 14 | index = best_index = 0 15 | 16 | for each in TEST_DATA: 17 | precision_array.append(each[0]) 18 | recall_array.append(each[1]) 19 | 20 | f1 = calc_f1_score(each[0], each[1]) 21 | #print f1 22 | if(f1 > best_f1): 23 | best_f1 = f1 24 | best_index = index 25 | index +=1 26 | 27 | print "The Best Model is: Model " + str(best_index) 28 | 29 | fig = plt.subplot(111) 30 | fig.scatter(precision_array, recall_array) 31 | fig.set_xlabel('Precision')  # x data is precision_array 32 | fig.set_ylabel('Recall')     # y data is recall_array 33 | 34 | plt.show() -------------------------------------------------------------------------------- /Old Python Code/output.txt: -------------------------------------------------------------------------------- 1 | harbinger:~/python$ python Pickle.py 2 | /home/dmenghani/python_lib/scikit_learn-0.14.1-py2.7-linux-x86_64.egg/sklearn/pls.py:7: DeprecationWarning: This module has been moved to cross_decomposition and will be removed in 0.16 3 | "removed in 0.16", DeprecationWarning) 4 | Setting constants... 5 | Reading into Pandas frame... 6 | 7 | /home/dmenghani/python/2001.csv 8 | Length of original dataset - 5967780 9 | Removing cancelled flights... 10 | Length after random sampling, taking {one - third} of the file - 1912194 11 | 12 | 13 | /home/dmenghani/python/2002.csv 14 | Length of original dataset - 5271359 15 | Removing cancelled flights... 16 | Length after random sampling, taking {one - third} of the file - 1735405 17 | 18 | 19 | /home/dmenghani/python/2003.csv 20 | Length of original dataset - 6488540 21 | Removing cancelled flights... 22 | Length after random sampling, taking {one - third} of the file - 2129023 23 | 24 | 25 | /home/dmenghani/python/2004.csv 26 | Length of original dataset - 7129270 27 | Removing cancelled flights... 28 | Length after random sampling, taking {one - third} of the file - 2333837 29 | 30 | 31 | /home/dmenghani/python/2005.csv 32 | Length of original dataset - 7140596 33 | Removing cancelled flights... 34 | Length after random sampling, taking {one - third} of the file - 2335622 35 | 36 | 37 | /home/dmenghani/python/2006.csv 38 | Length of original dataset - 7141922 39 | Removing cancelled flights... 40 | Length after random sampling, taking {one - third} of the file - 2339996 41 | 42 | 43 | /home/dmenghani/python/2007.csv 44 | Length of original dataset - 7453215 45 | Removing cancelled flights... 46 | Length after random sampling, taking {one - third} of the file - 2430822 47 | 48 | 49 | /home/dmenghani/python/2008.csv 50 | Length of original dataset - 7009728 51 | Removing cancelled flights... 52 | Length after random sampling, taking {one - third} of the file - 2290764 53 | 54 | Total length for all years - 17507663 55 | Calculating classification label... 56 | Dataframe shape - (17507663, 12) 57 | Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'DepDelay', u'Origin', u'Dest', u'Distance', u'label'], dtype='object') 58 | Converting categorical data to numeric... 59 | Converting... Origin 60 | Converting... UniqueCarrier 61 | Converting... Dest 62 | Converting...
TailNum 63 | Pickled origin_all.pkl 64 | Pickled tailnum_all.pkl 65 | Pickled dest_all.pkl 66 | Pickled carrier_all.pkl 67 | Conversion to discrete data completed. 68 | Pickled dataframe_all.pkl 69 | harbinger:~/python$ 70 | -------------------------------------------------------------------------------- /Old Python Code/prec.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | I2008 3 | cnumpy.core.multiarray 4 | scalar 5 | p1 6 | (cnumpy 7 | dtype 8 | p2 9 | (S'f8' 10 | p3 11 | I0 12 | I1 13 | tp4 14 | Rp5 15 | (I3 16 | S'<' 17 | p6 18 | NNNI-1 19 | I-1 20 | I0 21 | tp7 22 | bS'\x00\x00\x00\x00\x00\x00\xf0?' 23 | p8 24 | tp9 25 | Rp10 26 | sI2001 27 | g1 28 | (g5 29 | S'\x00\x00\x00\x00\x00\x00\xf0?' 30 | p11 31 | tp12 32 | Rp13 33 | sI2007 34 | g1 35 | (g5 36 | S'\x00\x00\x00\x00\x00\x00\xf0?' 37 | p14 38 | tp15 39 | Rp16 40 | s. -------------------------------------------------------------------------------- /Old Python Code/results.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | . -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Predicting Airline Delays - Fly from SFO or OAK? 2 | =================== 3 | 4 | Team 5 |
6 | Divya M 7 |
8 | Eunkwang J 9 |
10 | Ryan J 11 |
12 | Julia K 13 |
14 | 15 | Problem Statement 16 | Simplified version: "Given a destination and a date range, which is the better airport to fly out of - SFO or OAK?" 17 | We applied machine learning techniques to build a predictive model that helps flyers decide which airport to choose. The model was trained on data for all US domestic flights from 2001-08 and works for any origin airport, but we were particularly interested in SFO and OAK: popular wisdom holds that flying from OAK avoids delays, and we find that this is not always true. 18 |
19 | 20 | About the Data 21 | We work with the airline on-time performance data for individual years, available at http://stat-computing.org/dataexpo/2009/the-data.html.
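A minimal sketch of loading one year with pandas (column names follow the data expo's documented schema; adjust the path to wherever the CSVs are unpacked):

```python
import pandas as pd

cols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
        'UniqueCarrier', 'DepDelay', 'Origin', 'Dest', 'Cancelled']
df = pd.read_csv('2008.csv', usecols=cols)
df = df[df['Cancelled'] == 0]                      # drop cancelled flights
df['label'] = (df['DepDelay'] >= 15).astype(int)   # industry definition of "delayed"
```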

22 | 23 | Techniques 24 | Naive Bayes 25 | Logistic Regression 26 |
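Both are standard supervised classifiers; below is a self-contained sketch of the scikit-learn versions on toy stand-in features (the real pipeline first encodes the categorical columns to integers, as the data readers in this repo do):

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Toy stand-in rows: [day_of_week, dep_time_bucket, origin_id]; label 1 = delayed.
X = np.array([[1, 0, 270], [5, 2, 270], [3, 1, 215], [5, 2, 215], [2, 0, 215]])
y = np.array([0, 1, 0, 1, 0])

for clf in (GaussianNB(), LogisticRegression()):
    clf.fit(X, y)
    print clf.__class__.__name__, clf.predict(np.array([[5, 2, 270]]))
```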

27 | 28 | Python Libraries 29 | Pandas, Scikit, Matplotlib, Seaborn 30 | -------------------------------------------------------------------------------- /data_reader_v4_ek.py: -------------------------------------------------------------------------------- 1 | # 2 | # data_reader_v4_ek.py 3 | # author: eunkwang joo 4 | # description: This code prepares dataset for logistic regression algorithm, which is written by myself. 5 | # 6 | 7 | 8 | import csv 9 | import pickle 10 | import time 11 | import os 12 | from boto.s3.connection import S3Connection 13 | from boto.s3.key import Key 14 | 15 | 16 | timestr = time.strftime("%Y%m%d-%H%M%S") 17 | print timestr 18 | 19 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 20 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] 21 | j = 0 22 | 23 | # 24 | # function: ComputeDayofYear() 25 | # description: This function will return an integer to represent the day of the year given an integer 26 | # representing month and an integer representing the day of the month. This number will 27 | # correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 28 | # as 0. Feb 29th will be returned as 59. 29 | # input: row of csv file, a raw dataset 30 | # output: row of csv file, date of year value of which is encoded. 31 | # 32 | 33 | 34 | def ComputeDayofYear(row): 35 | if(row[0] == '1'): 36 | calc = 0 + int(row[1]) - 1 37 | row[1] = str(calc) 38 | elif(row[0] == '2'): 39 | calc = 31 + int(row[1]) - 1 40 | row[1] = str(calc) 41 | elif(row[0] == '3'): 42 | calc = 60 + int(row[1]) - 1 43 | row[1] = str(calc) 44 | elif(row[0] == '4'): 45 | calc = 91 + int(row[1]) - 1 46 | row[1] = str(calc) 47 | elif(row[0] == '5'): 48 | calc = 121 + int(row[1]) - 1 49 | row[1] = str(calc) 50 | elif(row[0] == '6'): 51 | calc = 152 + int(row[1]) - 1 52 | row[1] = str(calc) 53 | elif(row[0] == '7'): 54 | calc = 182 + int(row[1]) - 1 55 | row[1] = str(calc) 56 | elif(row[0] == '8'): 57 | calc = 213 + int(row[1]) - 1 58 | row[1] = str(calc) 59 | elif(row[0] == '9'): 60 | calc = 244 + int(row[1]) - 1 61 | row[1] = str(calc) 62 | elif(row[0] == '10'): 63 | calc = 274 + int(row[1]) - 1 64 | row[1] = str(calc) 65 | elif(row[0] == '11'): 66 | calc = 305 + int(row[1]) - 1 67 | row[1] = str(calc) 68 | elif(row[0] == '12'): 69 | calc = 335 + int(row[1]) - 1 70 | row[1] = str(calc) 71 | return row 72 | 73 | 74 | # 75 | # function: DiscretizeDepTime() 76 | # description: This function takes a scheduled departure time, classifies the departure time as: 77 | # morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 78 | # is assumed to be an integer in 24-hour time format. These labels will correspond to 79 | # variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 80 | # An error time is returned as morning. 81 | # input: row of csv file, a raw dataset 82 | # output: row of csv file, departure time value of which is encoded. 83 | # 84 | 85 | def DiscretizeDepTime(row): 86 | 87 | if(int(row[3]) <= 559): 88 | row[3] = '2' 89 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 90 | row[3] = '0' 91 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 92 | row[3] = '1' 93 | elif(int(row[3]) >= 1800): 94 | row[3] = '2' 95 | else: 96 | row[3] = '0' 97 | return row 98 | 99 | # 100 | # function: AddDepVar() 101 | # description: This function adds a classification label based on the length of the recorded 102 | # Departure Delay in the data set. It assumes an input integer value of the delay in mins. 
103 | # By airline industry standards, flight delays are defined as departure delays greater than 104 | # or equal to 15 minutes. For delayed flights, this variable will have value "1". 105 | # For on time flights, it will have value "0". Default value will be set at "0". 106 | # input: row of csv file, a raw dataset 107 | # output: row of csv file, delay value of which is encoded as binary. 108 | # 109 | 110 | 111 | def AddDepVar(row): 112 | 113 | if(row[6] >= '15'): 114 | row[6] = '1' 115 | else: 116 | row[6] = '0' 117 | return row 118 | 119 | # 120 | # function: SaveData() 121 | # description: This function pickles each file. Also, due to the lack of storage space on local server, it stores data to S3 server as well. 122 | # input: data= data structure which will be stored for future uses 123 | # pickle_file_name= file name to be used to store data 124 | # output: null 125 | # 126 | 127 | 128 | def SaveData(data, pickle_file_name): 129 | 130 | f = open(pickle_file_name, "wb") 131 | try: 132 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 133 | except Exception as e: 134 | print e 135 | f.close() 136 | 137 | conn = S3Connection( 138 | 'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 139 | bucket = conn.get_bucket('i290-aero') 140 | k = Key(bucket) 141 | k.key = pickle_file_name 142 | k.set_contents_from_filename(pickle_file_name) 143 | 144 | os.remove(pickle_file_name) 145 | 146 | 147 | # it reads raw datset of every year, encodes variables, drop unused 148 | # variables, and pickle trimmed dataset in file system. 149 | 150 | for i in years: 151 | data = [] 152 | ''' 153 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 154 | bucket = conn.get_bucket('i290-aero') 155 | k = Key(bucket) 156 | k.key = 'data2001.csv' 157 | file_path = k.get_contents_as_string() 158 | ''' 159 | file_path = 'data' + str(i) + '.csv' 160 | pickle_file_name = timestr + '-data-' + str(i) 161 | with open(file_path, 'r') as data_csv: 162 | csv_reader = csv.reader(data_csv, delimiter=',') 163 | j = 0 164 | for row in csv_reader: 165 | # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'): 166 | if row[21] == '0': 167 | # if (row[16] == 'SFO' or row[16] == 'OAK'): 168 | content = [row[i] for i in needed_cols] 169 | content2 = ComputeDayofYear(content) 170 | content3 = DiscretizeDepTime(content2) 171 | content4 = AddDepVar(content3) 172 | data.append(content4) 173 | # print 'content4', content4 174 | # print 'data', data 175 | # fff = raw_input() 176 | j = j + 1 177 | if j % 2000000 == 0: 178 | print j 179 | SaveData(data, pickle_file_name + '-' + str(j)) 180 | data = [] 181 | SaveData(data, pickle_file_name) 182 | -------------------------------------------------------------------------------- /data_reader_v4_ek_rj_csv.py: -------------------------------------------------------------------------------- 1 | # 2 | # data_reader_v4_ek_rj_csv.py 3 | # author: eunkwang joo 4 | # description: This code prepares dataset for logistic regression using python pandas. 5 | # 6 | 7 | import csv 8 | import pickle 9 | import time 10 | import os 11 | from boto.s3.connection import S3Connection 12 | from boto.s3.key import Key 13 | 14 | 15 | timestr = time.strftime("%Y%m%d-%H%M%S") 16 | print timestr 17 | 18 | # columns to extract from raw dataset. 
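# For reference, in the data expo schema these 0-based indexes are:
#   3 = DayOfWeek, 4 = DepTime, 8 = UniqueCarrier,
#   15 = DepDelay, 16 = Origin, 17 = Dest.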
19 | needed_cols = [3, 4, 8, 15, 16, 17] 20 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] 21 | 22 | j = 0 23 | 24 | # 25 | # function: ComputeDayofYear() 26 | # description: This function will return an integer to represent the day of the year given an integer 27 | # representing month and an integer representing the day of the month. This number will 28 | # correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 29 | # as 0. Feb 29th will be returned as 59. 30 | # input: row of csv file, a raw dataset 31 | # output: row of csv file, date of year value of which is encoded. 32 | # 33 | 34 | 35 | def ComputeDayofYear(row): 36 | 37 | if(row[0] == '1'): 38 | calc = 0 + int(row[1]) - 1 39 | row[1] = float(calc) 40 | elif(row[0] == '2'): 41 | calc = 31 + int(row[1]) - 1 42 | row[1] = float(calc) 43 | elif(row[0] == '3'): 44 | calc = 60 + int(row[1]) - 1 45 | row[1] = float(calc) 46 | elif(row[0] == '4'): 47 | calc = 91 + int(row[1]) - 1 48 | row[1] = float(calc) 49 | elif(row[0] == '5'): 50 | calc = 121 + int(row[1]) - 1 51 | row[1] = float(calc) 52 | elif(row[0] == '6'): 53 | calc = 152 + int(row[1]) - 1 54 | row[1] = float(calc) 55 | elif(row[0] == '7'): 56 | calc = 182 + int(row[1]) - 1 57 | row[1] = float(calc) 58 | elif(row[0] == '8'): 59 | calc = 213 + int(row[1]) - 1 60 | row[1] = float(calc) 61 | elif(row[0] == '9'): 62 | calc = 244 + int(row[1]) - 1 63 | row[1] = float(calc) 64 | elif(row[0] == '10'): 65 | calc = 274 + int(row[1]) - 1 66 | row[1] = float(calc) 67 | elif(row[0] == '11'): 68 | calc = 305 + int(row[1]) - 1 69 | row[1] = float(calc) 70 | elif(row[0] == '12'): 71 | calc = 335 + int(row[1]) - 1 72 | row[1] = float(calc) 73 | return row 74 | 75 | 76 | # 77 | # function: DiscretizeDepTime() 78 | # description: This function takes a scheduled departure time, classifies the departure time as: 79 | # morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 80 | # is assumed to be an integer in 24-hour time format. These labels will correspond to 81 | # variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 82 | # An error time is returned as morning. 83 | # input: row of csv file, a raw dataset 84 | # output: row of csv file, departure time value of which is encoded. 85 | # 86 | 87 | def DiscretizeDepTime(row): 88 | 89 | if(int(row[1]) <= 559): 90 | row[1] = 2. 91 | elif(int(row[1]) >= 600 and int(row[1]) <= 1259): 92 | row[1] = 0. 93 | elif(int(row[1]) >= 1300 and int(row[1]) <= 1759): 94 | row[1] = 1. 95 | elif(int(row[1]) >= 1800): 96 | row[1] = 2. 97 | else: 98 | row[1] = 0. 99 | return row 100 | 101 | 102 | # 103 | # function: AddDepVar() 104 | # description: This function adds a classification label based on the length of the recorded 105 | # Departure Delay in the data set. It assumes an input integer value of the delay in mins. 106 | # By airline industry standards, flight delays are defined as departure delays greater than 107 | # or equal to 15 minutes. For delayed flights, this variable will have value "1". 108 | # For on time flights, it will have value "0". Default value will be set at "0". 109 | # input: row of csv file, a raw dataset 110 | # output: row of csv file, delay value of which is encoded as binary. 111 | # 112 | 113 | def AddDepVar(row): 114 | 115 | if float(row[3]) >= float(15): 116 | row[3] = 1. 117 | else: 118 | row[3] = 0. 119 | return row 120 | 121 | # 122 | # function: SaveData() 123 | # description: This function pickles each file. 
Also, due to the lack of storage space on local server, it stores data to S3 server as well. 124 | # input: data= data structure which will be stored for future uses 125 | # pickle_file_name= file name to be used to store data 126 | # output: null 127 | # 128 | 129 | 130 | def SaveData(data, pickle_file_name): 131 | 132 | f = open(pickle_file_name, "wb") 133 | try: 134 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 135 | except Exception as e: 136 | print e 137 | f.close() 138 | 139 | conn = S3Connection( 140 | 'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 141 | bucket = conn.get_bucket('i290-aero') 142 | k = Key(bucket) 143 | k.key = pickle_file_name 144 | k.set_contents_from_filename(pickle_file_name) 145 | 146 | os.remove(pickle_file_name) 147 | 148 | 149 | hashs = ['airportHash.dic', 'tailHash.dic', 'carrierHash.dic'] 150 | 151 | 152 | # if os.path.exists(hashs[1]): 153 | # tailNumHash = pickle.load(open(hashs[1], "rb")) 154 | # else: 155 | # tailNumHash = {} 156 | # 157 | # function: createHash() 158 | # description: It creates dictionaries which matches an airport and a carrier to an integer. The dictionaries will be used to identify encoded airport and carrier. 159 | # input: null 160 | # output: null 161 | # 162 | def createHash(): 163 | airportHash = {} 164 | carrierHash = {} 165 | for i in years: 166 | file_path = '../Airport_Data/data' + str(i) + '.csv' 167 | with open(file_path, 'r') as data_csv: 168 | csv_reader = csv.reader(data_csv, delimiter=',') 169 | j = 0 170 | for row in csv_reader: 171 | if(row[17] not in airportHash): 172 | airportHash[row[17]] = len(airportHash) + 1 173 | if(row[8] not in carrierHash): 174 | carrierHash[row[8]] = len(carrierHash) + 1 175 | pickle.dump(airportHash, open('airportHash.dic', 'wb'), 176 | protocol=pickle.HIGHEST_PROTOCOL) 177 | pickle.dump(carrierHash, open('carrierHash.dic', 'wb'), 178 | protocol=pickle.HIGHEST_PROTOCOL) 179 | 180 | # createHash() 181 | 182 | airportHash = pickle.load(open(hashs[0], "rb")) 183 | carrierHash = pickle.load(open(hashs[2], "rb")) 184 | 185 | # it reads raw datset of every year, encodes variables, drop unused 186 | # variables, and pickle trimmed dataset in file system. 
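# Output format sketch: one all-numeric row per flight is written to
# trimmed2_<year>.csv under the header [dayOfWeek, depTime, carrier, dest,
# origin, delay]. An illustrative (not real) row -- 4.0, 0.0, 3.0, 12.0, 1.0, 0.0 --
# would be a Thursday morning flight on carrier id 3 to airport id 12 from
# airport id 1, departing on time. The integer ids come from the
# airportHash/carrierHash dictionaries loaded above.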
187 | 188 | for i in years: 189 | data = [] 190 | ''' 191 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 192 | bucket = conn.get_bucket('i290-aero') 193 | k = Key(bucket) 194 | k.key = 'data2001.csv' 195 | file_path = k.get_contents_as_string() 196 | ''' 197 | file_path = '../Airport_Data/data' + str(i) + '.csv' 198 | pickle_file_name = timestr + '-data-' + str(i) 199 | dropped = '' 200 | with open(file_path, 'r') as data_csv: 201 | csv_reader = csv.reader(data_csv, delimiter=',') 202 | j = 0 203 | with open('trimmed2_' + str(i) + '.csv', 'w') as output_csv: 204 | writer = csv.writer( 205 | output_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 206 | writer.writerow( 207 | ['dayOfWeek', 'depTime', 'carrier', 'dest', 'origin', 'delay']) 208 | for row in csv_reader: 209 | # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'): 210 | if row[21] == '0': 211 | # if (row[16] == 'SFO' or row[16] == 'OAK'): 212 | if (row[16] not in ['SFO', 'OAK']): 213 | dropped += row[16] + ' ' 214 | continue # airportHash[row[16]] = len(airportHash) + 1 215 | origin = airportHash[row[16]] 216 | 217 | if(row[17] not in airportHash): 218 | airportHash[row[17]] = len(airportHash) + 1 219 | dest = airportHash[row[17]] 220 | 221 | # if(row[10] not in tailNumHash): 222 | # tailNumHash[row[10]] = len(tailNumHash) + 1 223 | # tailNum = tailNumHash[row[10]] 224 | 225 | if(row[8] not in carrierHash): 226 | carrierHash[row[8]] = len(carrierHash) + 1 227 | carrier = carrierHash[row[8]] 228 | # print row[8], carrier, carrierHash 229 | # raw_input() 230 | 231 | content = [row[i] for i in needed_cols] 232 | # content2 = ComputeDayofYear(content) 233 | content3 = DiscretizeDepTime(content) 234 | content4 = AddDepVar(content3) 235 | content4[2] = carrier 236 | # content4[5] = tailNum 237 | content4[4] = origin 238 | content4[5] = dest 239 | for idx in range(len(content4)): 240 | content4[idx] = float(content4[idx]) 241 | temp = content4[3] 242 | content4[3] = content4[5] 243 | content4[5] = temp 244 | 245 | writer.writerow( 246 | [content4[0], content4[1], content4[2], content4[3], content4[4], content4[5]]) 247 | # print content4 248 | # data.append(content4) 249 | # print 'content4', content4 250 | # print 'data', data 251 | # fff = raw_input() 252 | # j=j+1 253 | # if j % 2000000 == 0: 254 | # print j 255 | # SaveData(data, pickle_file_name + '-' + str(j)) 256 | # data = [] 257 | # SaveData(data, pickle_file_name) 258 | # print dropped 259 | 260 | 261 | # hashs = ['airportHash.dic', 'tailHash.dic', 'carrierHash.dic'] 262 | # hashVals = [airportHash, tailNumHash, carrierHash] 263 | # for idx in range(len(hashs)): 264 | # f = open (hashs[idx], "wb") 265 | # try: 266 | # pickle.dump(hashVals[idx], f, protocol=pickle.HIGHEST_PROTOCOL) 267 | # except Exception as e: 268 | # print e 269 | # f.close() 270 | -------------------------------------------------------------------------------- /date_graph2.py: -------------------------------------------------------------------------------- 1 | # 2 | # date_graph2.py 3 | # Author: Ryan Jung 4 | # Description: This function takes a date and calculates the probability of delay at SFO 5 | # and at OAK for the date and the 6 days prior. It then graphs these probabilities as 6 | # side-by-side bars for each day. 7 | # Dependencies: Run the Naive Bayes classification code in Crossval_r.py file. Ensure that the file _dfTest2008.csv is in the 8 | # same folder. 
9 | # 10 | 11 | from __future__ import division 12 | import sys 13 | import csv 14 | import datetime 15 | import matplotlib.pyplot as plt 16 | plt.rcdefaults() 17 | import numpy as np 18 | 19 | # 20 | # These are the hard codes of the "look back" period (set at 6 days) and airport codes 21 | # from our Naive Bayes dictionary. 22 | # 23 | 24 | TIME_DELTA = 6 25 | SFO_AIRPORT_CODE = '270' 26 | OAK_AIRPORT_CODE = '215' 27 | JFK_AIRPORT_CODE = '160' 28 | ORD_AIRPORT_CODE = '225' 29 | ATL_AIRPORT_CODE = '25' 30 | LAX_AIRPORT_CODE = '168' 31 | LGA_AIRPORT_CODE = '174' 32 | DFW_AIRPORT_CODE = '85' 33 | 34 | # 35 | # Main Function 36 | # The function first takes an argument from the command line of the form: 37 | # python date_graph2.py m-d-yy 38 | # It then calculates the bounds of our query for probability of delay by day. 39 | # 40 | 41 | for arg in sys.argv: 42 | if(arg != 'date_graph2.py'): 43 | start_date = datetime.datetime.strptime(arg, '%m-%d-%y') 44 | start_date = datetime.date( 45 | start_date.year, start_date.month, start_date.day) 46 | 47 | delta = datetime.timedelta(days=TIME_DELTA) 48 | begin = start_date - delta 49 | end = start_date 50 | 51 | # 52 | # This block of code sets up a hash for each airport of the form {key: value} => {day: 53 | # [predict label,...]}. This is a list of the predicted labels for each flight on a 54 | # particular day from the origin airport to the destination airport. It iterates over 55 | # the days in our query range and constructs the hash. 56 | # 57 | 58 | SFO_Hash = {} 59 | OAK_Hash = {} 60 | with open('_dfTest2008.csv', 'r') as data: 61 | csv_reader = csv.reader(data, delimiter=',') 62 | for row in csv_reader: 63 | if(row[0] != 'Year'): 64 | year = int(row[0]) 65 | month = int(row[1]) 66 | date = int(row[2]) 67 | curr_date = datetime.date(year, month, date) 68 | if(curr_date >= begin and curr_date <= end): 69 | origin = row[7] 70 | dest = row[8] 71 | if(origin == SFO_AIRPORT_CODE and dest == LAX_AIRPORT_CODE): 72 | label = int(row[10]) 73 | if(curr_date not in SFO_Hash): 74 | SFO_Hash[curr_date] = [label] 75 | else: 76 | SFO_Hash[curr_date].append(label) 77 | if(origin == OAK_AIRPORT_CODE and dest == LAX_AIRPORT_CODE): 78 | label = int(row[10]) 79 | if(curr_date not in OAK_Hash): 80 | OAK_Hash[curr_date] = [label] 81 | else: 82 | OAK_Hash[curr_date].append(label) 83 | 84 | # 85 | # This block of code initializes values for day "steps" for our iterator later. 86 | # We also initialize lists which will have the number of delays, on-time flights, and 87 | # percentage of predicted delays for the days in our query. 88 | # 89 | 90 | iterator = datetime.timedelta(days=1) 91 | two_iterator = datetime.timedelta(days=2) 92 | three_iterator = datetime.timedelta(days=3) 93 | four_iterator = datetime.timedelta(days=4) 94 | five_iterator = datetime.timedelta(days=5) 95 | six_iterator = datetime.timedelta(days=6) 96 | 97 | day_values = [] 98 | SFO_Delays = [] 99 | SFO_On_Time = [] 100 | SFO_Flights = [] 101 | SFO_Pct = [] 102 | SFO_Comp = [] 103 | OAK_Delays = [] 104 | OAK_On_Time = [] 105 | OAK_Flights = [] 106 | OAK_Pct = [] 107 | OAK_Comp = [] 108 | 109 | # 110 | # We then loop through the query date range and populate the lists, counting number of 111 | # delayed flights, number of on-time flights, and percent of flights delayed. Each 112 | # list item corresponds to a date in our query range. 
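# Worked micro-example: if SFO_Hash[d] == [1, 0, 0, 1] for some day d, then
# delays = 2, num_flights = 4, and the fraction of flights predicted delayed
# on day d is 2 / 4 = 0.50.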
/date_iterator_plot2.py:
--------------------------------------------------------------------------------
1 | #
2 | # date_iterator_plot2.py
3 | # Author: Ryan Jung
4 | # Description: This script reads the predicted results from one of our models. It then
5 | # aggregates the probability of delay by week and graphs the probability of delay at
6 | # both airports (SFO and OAK). Lastly, it calculates the t-score of the difference in
7 | # means of both airports to help determine if the difference is statistically significant.
8 | #
9 |
10 | import datetime
11 | import csv
12 | import matplotlib.pyplot as plt
13 | plt.rcdefaults()
14 | import numpy
15 |
16 | # Hard code of airport codes in our dictionary that correspond to Naive
17 | # Bayes model
18 | SFO_AIRPORT_CODE = '270'
19 | OAK_AIRPORT_CODE = '215'
20 |
21 | #
22 | # Function: ComputeDayofYear(month, day)
23 | # Description: This function takes a month and day of month and outputs a number which
24 | # corresponds to the day of year. This will be a number between 0 and 365.
25 | # Input: Integer values for month and day
26 | # Output: Integer value for day of year
27 | #
28 |
29 |
30 | def ComputeDayofYear(month, day):
31 |     if(month == 1):
32 |         numDays = 0
33 |     if(month == 2):
34 |         numDays = 31
35 |     if(month == 3):
36 |         numDays = 60
37 |     if(month == 4):
38 |         numDays = 91
39 |     if(month == 5):
40 |         numDays = 121
41 |     if(month == 6):
42 |         numDays = 152
43 |     if(month == 7):
44 |         numDays = 182
45 |     if(month == 8):
46 |         numDays = 213
47 |     if(month == 9):
48 |         numDays = 244
49 |     if(month == 10):
50 |         numDays = 274
51 |     if(month == 11):
52 |         numDays = 305
53 |     if(month == 12):
54 |         numDays = 335
55 |
56 |     return (numDays + day - 1)
57 |
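# [Editor's note] The lookup table above assumes a leap year (February
# contributes 31 + 29 days, so March starts at offset 60), which matches the
# 2008 test data. The sanity check below is an editor's aside, not part of the
# original file: it verifies the table against Python's own day-of-year
# arithmetic.
import datetime

def _day_of_year(month, day, year=2008):
    # 0-based day of year; equivalent to ComputeDayofYear for leap years
    return datetime.date(year, month, day).timetuple().tm_yday - 1

assert _day_of_year(1, 1) == 0      # Jan 1
assert _day_of_year(2, 29) == 59    # Feb 29
assert _day_of_year(12, 31) == 365  # Dec 31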
58 | #
59 | # Main Function
60 | # This block of code reads from the output of the Naive Bayes model and creates a hash
61 | # for SFO and OAK that corresponds to {key: value} = {week #: [predicted label,...]}.
62 | # The idea here is to create a list of all flights that are scheduled to leave SFO or OAK
63 | # by week (52 weeks in the year). The list will be 1's and 0's based on our prediction of
64 | # whether the flight will be delayed (1) or not delayed (0).
65 | #
66 |
67 | with open('_dfTest2008.csv', 'r') as data:
68 |     csv_reader = csv.reader(data, delimiter=',')
69 |     SFO_DM_Hash = {}
70 |     OAK_DM_Hash = {}
71 |     for row in csv_reader:
72 |         origin = row[7]
73 |         if(origin == SFO_AIRPORT_CODE):
74 |             month = int(row[1])
75 |             date = int(row[2])
76 |             DayofYear = ComputeDayofYear(month, date)
77 |             key = DayofYear / 7
78 |             label = int(row[10])
79 |             if(key not in SFO_DM_Hash):
80 |                 SFO_DM_Hash[key] = [label]
81 |             else:
82 |                 SFO_DM_Hash[key].append(label)
83 |         elif(origin == OAK_AIRPORT_CODE):
84 |             month = int(row[1])
85 |             date = int(row[2])
86 |             DayofYear = ComputeDayofYear(month, date)
87 |             key = DayofYear / 7
88 |             label = int(row[10])
89 |             if(key not in OAK_DM_Hash):
90 |                 OAK_DM_Hash[key] = [label]
91 |             else:
92 |                 OAK_DM_Hash[key].append(label)
93 |         else:
94 |             continue
95 |
96 | #
97 | # This block of code separates out the value list of flights from the previous block of
98 | # code into a list of the number of delays and the number of on-time flights from SFO
99 | # and OAK by week. In other words, SFO_DM_Delays[14] will be the number of delayed
100 | # flights we predict at SFO in week 14. We create a 3rd list which is the percent of
101 | # flights that are delayed by week.
102 | #
103 |
104 | week_values = []
105 | SFO_DM_Delays = []
106 | SFO_DM_On_Time = []
107 | SFO_DM_Pct = []
108 | OAK_DM_Delays = []
109 | OAK_DM_On_Time = []
110 | OAK_DM_Pct = []
111 |
112 | d = 0
113 | while d <= 52:  # weeks 0-52; key 52 holds the partial week of Dec 30-31
114 |     if(d not in SFO_DM_Hash):
115 |         SFO_DM_Delays.append(0)
116 |         SFO_DM_On_Time.append(0)
117 |         SFO_DM_Pct.append(0.00)
118 |     else:
119 |         SFO_DM_Flights = SFO_DM_Hash[d]
120 |         delays = sum(SFO_DM_Flights)
121 |         num_flights = len(SFO_DM_Flights)
122 |         pct = float(delays) / num_flights  # num_flights already counts the delayed flights
123 |         SFO_DM_Delays.append(delays)
124 |         SFO_DM_On_Time.append(num_flights - delays)
125 |         SFO_DM_Pct.append(pct)
126 |
127 |     if(d not in OAK_DM_Hash):
128 |         OAK_DM_Delays.append(0)
129 |         OAK_DM_On_Time.append(0)
130 |         OAK_DM_Pct.append(0.00)
131 |     else:
132 |         OAK_DM_Flights = OAK_DM_Hash[d]
133 |         delays = sum(OAK_DM_Flights)
134 |         num_flights = len(OAK_DM_Flights)
135 |         pct = float(delays) / num_flights
136 |         OAK_DM_Delays.append(delays)
137 |         OAK_DM_On_Time.append(num_flights - delays)
138 |         OAK_DM_Pct.append(pct)
139 |
140 |     week_values.append(d)
141 |     d += 1
142 |
143 | #
144 | # This block of code calculates the mean and standard deviation of the percent of flights
145 | # that are predicted to be delayed. It uses these to calculate a t-score of the
146 | # difference in means which can be used to determine if the difference is statistically
147 | # significant.
148 | #
149 |
150 | SFO_mean = numpy.mean(SFO_DM_Pct)
151 | OAK_mean = numpy.mean(OAK_DM_Pct)  # same computation as SFO_mean, for consistency
152 | SFO_std = numpy.std(SFO_DM_Pct)
153 | OAK_std = numpy.std(OAK_DM_Pct)
154 | SFO_n = len(SFO_DM_Pct)
155 | OAK_n = len(OAK_DM_Pct)
156 | Diff = OAK_mean - SFO_mean
157 | std_err = (((SFO_std ** 2) / SFO_n) + ((OAK_std ** 2) / OAK_n)) ** 0.5
158 |
159 | print "Standard Error", std_err
160 | print "t = ", Diff / std_err
161 |
162 | #
163 | # Graphic visualization of the probability of delay by week at SFO and OAK. SFO will be
164 | # the green line and OAK will be the blue line in the graph. X-axis is the week of 2008
165 | # and y-axis is probability of delay.
166 | #
167 |
168 | ax1 = plt.subplot(111)
169 | p1 = ax1.plot(week_values, SFO_DM_Pct, color='green')
170 | p2 = ax1.plot(week_values, OAK_DM_Pct, color='blue')
171 | ax1.set_title('Proportion of flights delayed in SFO (green) vs. OAK (blue)')
172 | ax1.set_xticklabels(
173 |     ['Jan 2008', 'Mar 2008', 'May 2008', 'Jul 2008', 'Sep 2008', 'Nov 2008'])
174 | ax1.set_ylabel('Probability of Delay')
175 | ax1.legend((p1[0], p2[0]), ('SFO', 'OAK'), loc='upper center')
176 |
177 | plt.show()
178 |
--------------------------------------------------------------------------------
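[Editor's note] The hand-rolled statistic above is Welch's unequal-variance two-sample t: t = (mean_OAK - mean_SFO) / sqrt(s_SFO^2 / n_SFO + s_OAK^2 / n_OAK). If SciPy is available (an assumption; the script itself does not use it), the result can be cross-checked against the two weekly lists computed in the script, up to the population-versus-sample variance convention (numpy.std defaults to ddof=0):

from scipy import stats

# Welch's t-test on the two weekly delay-percentage series;
# equal_var=False matches the standard-error formula in the script.
t, p = stats.ttest_ind(OAK_DM_Pct, SFO_DM_Pct, equal_var=False)
print(t, p)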
/logisticRegression.py:
--------------------------------------------------------------------------------
1 | #
2 | # logisticRegression.py
3 | # author: eunkwang joo
4 | # description: Loads the pickled dataset in several fragments and fits a logistic regression by stochastic gradient descent. It then predicts delays on a held-out test year and reports accuracy, precision, and recall.
5 | #
6 |
7 | import numpy as np
8 | import random
9 | import pickle
10 | import sys
11 | import os
12 | from boto.s3.connection import S3Connection
13 | from boto.s3.key import Key
14 |
15 |
16 | # Trimmed datasets are stored in pickle format. Due to memory limits, each
17 | # year was pickled across several files.
18 |
19 | pickle2001 = ['20140428-190051-data-2001',
20 |               '20140428-190051-data-2001-2000000',
21 |               '20140428-190051-data-2001-4000000']
22 | pickle2002 = ['20140428-190051-data-2002',
23 |               '20140428-190051-data-2002-2000000',
24 |               '20140428-190051-data-2002-4000000']
25 | pickle2003 = ['20140428-190051-data-2003',
26 |               '20140428-190051-data-2003-2000000',
27 |               '20140428-190051-data-2003-4000000',
28 |               '20140428-190051-data-2003-6000000']
29 | pickle2004 = ['20140428-190051-data-2004',
30 |               '20140428-190051-data-2004-2000000',
31 |               '20140428-190051-data-2004-4000000',
32 |               '20140428-190051-data-2004-6000000']
33 | pickle2005 = ['20140428-190051-data-2005',
34 |               '20140428-190051-data-2005-2000000',
35 |               '20140428-190051-data-2005-4000000',
36 |               '20140428-190051-data-2005-6000000']
37 | pickle2006 = ['20140428-190051-data-2006',
38 |               '20140428-190051-data-2006-2000000',
39 |               '20140428-190051-data-2006-4000000',
40 |               '20140428-190051-data-2006-6000000']
41 | pickle2007 = ['20140428-190051-data-2007',
42 |               '20140428-190051-data-2007-2000000',
43 |               '20140428-190051-data-2007-4000000',
44 |               '20140428-190051-data-2007-6000000']
45 | pickle2008 = ['20140428-190051-data-2008',
46 |               '20140428-190051-data-2008-2000000',
47 |               '20140428-190051-data-2008-4000000',
48 |               '20140428-190051-data-2008-6000000']
49 |
50 | #
51 | # function: loadData()
52 | # description: Loads a dataset from a pickled file and separates the x variables (features) from the y value (delay).
53 | # input: fileName= name of a pickled file
54 | # output: x and y matrices to be used for logistic regression
55 | #
56 |
57 |
58 | def loadData(fileName):
59 |     if not os.path.exists(fileName):
60 |         print 'downloading', fileName, 'from s3'
61 |         conn = S3Connection(
62 |             'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7')
63 |         bucket = conn.get_bucket('i290-aero')
64 |         k = Key(bucket)
65 |         k.key = fileName
66 |         k.get_contents_to_filename(fileName)
67 |         print 'downloaded', fileName, 'from s3'
68 |
69 |     print 'now unpickle...'
70 |     x = pickle.load(open(fileName, "rb"))
71 |     x = np.array(x)
72 |     print 'x.shape = ', x.shape, x[:, -1:].shape
73 |     y = x[:, -1:].copy()  # last col is y value (delay or not)
74 |     x[:, -1:] = 1.  # reuse the label column as the intercept term
75 |     return x, y
76 |
77 |
78 | #
79 | # function: gradientDescent()
80 | # description: Runs logistic regression by stochastic gradient descent and estimates the coefficients.
81 | # input: x= features to be used for logistic regression
82 | #        y= ground truth value of delay
83 | #        numIterations= number of iterations to take for logistic regression
84 | #        dimension= dimension of x matrix
85 | #        theta= coefficient we try to find
86 | # output: theta= coefficient matrix we have found to predict delay
87 | #
88 |
89 | def gradientDescent(x, y, numIterations, dimension, theta):
90 |     # theta = np.zeros(dimension)[np.newaxis].transpose()
91 |     for i in range(1, numIterations):
92 |         randIdx = random.randint(0, len(x) - 1)
93 |         xTrans = x[randIdx][np.newaxis].transpose()
94 |         # print theta.transpose(), xTrans
95 |         u = 1 / (1 + np.exp(np.dot(theta.transpose() * (-1), xTrans)))  # sigmoid of theta'x
96 |         loss = y[randIdx] - u
97 |         gradient = np.dot(loss[0][0], xTrans)
98 |         # update with a decaying 1/i step size
99 |         theta = theta + gradient / i
100 |     return theta
101 |
102 |
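# [Editor's note] gradientDescent() above is stochastic gradient ascent on the
# logistic log-likelihood: theta <- theta + (y - sigmoid(theta'x)) * x / i.
# The snippet below is an editor's aside (synthetic, illustrative data; not
# part of the original file) that sanity-checks it on a separable problem.
# It is left commented out so the script's behavior is unchanged:
#
# np.random.seed(0)
# random.seed(0)
# n = 2000
# x_demo = np.hstack([np.random.randn(n, 1), np.ones((n, 1))])  # feature + intercept
# y_demo = (x_demo[:, :1] > 0).astype(float)  # 'delayed' iff the feature is positive
# theta_demo = gradientDescent(x_demo, y_demo, 100000, x_demo.shape[1],
#                              np.zeros((2, 1)))
# print theta_demo  # the first coefficient should come out clearly positive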
103 | def main():
104 |     # arg = sys.argv
105 |     # if len(arg) < 2:
106 |     #     print 'USE: $ python logisticRegression.py [dataset_file]'
107 |     #     return
108 |     # x, y = loadData(arg[1])
109 |
110 |     # x, x0, x1, y = getData('classification.dat')
111 |
112 |     # train theta on seven years of data (2001-2006 and 2008; 2007 is held out for testing)
113 |     if not os.path.exists('pickled_theta'):
114 |         theta = None
115 |         for elem in pickle2001 + pickle2002 + pickle2003 + pickle2004 + pickle2005 + pickle2006 + pickle2008:
116 |             x, y = loadData(elem)
117 |             if theta is None:  # 'is None' avoids NumPy's elementwise == comparison
118 |                 theta = np.zeros(x.shape[1])[np.newaxis].transpose()
119 |                 print 'theta is None...... initialize..........', theta.shape
120 |             theta = gradientDescent(x, y, 100000, x.shape[1], theta)
121 |             print 'finished gradientDescent of ', elem
122 |             print 'theta', theta
123 |
124 |         # pickle trained theta
125 |         f = open('pickled_theta', 'wb')
126 |         pickle.dump(theta, f, protocol=pickle.HIGHEST_PROTOCOL)
127 |         f.close()
128 |
129 |     # load pickled theta
130 |     theta = pickle.load(open('pickled_theta', 'rb'))
131 |
132 |     # predict with test dataset
133 |     accu = 0.
134 |     length = 0.
135 |     tp, tn, fp, fn = 0., 0., 0., 0.
136 |     for elem in pickle2007:
137 |         if not os.path.exists('dot-' + elem) or not os.path.exists('y-' + elem):
138 |             x, y = loadData(elem)
139 |             dotProduct = np.dot(x, theta)
140 |             print '============= dot product ============='
141 |             print dotProduct
142 |             print '============= y ============='
143 |             print y
144 |             pickle.dump(
145 |                 dotProduct, open('dot-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
146 |             pickle.dump(
147 |                 y, open('y-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
148 |         else:
149 |             dotProduct = pickle.load(open('dot-' + elem, 'rb'))
150 |             y = pickle.load(open('y-' + elem, 'rb'))
151 |
152 |         reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
153 |         prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
154 |
155 |         for i in range(len(prob)):
156 |             if prob[i] == 1 and y[i] == 1:
157 |                 accu += 1
158 |                 tp += 1
159 |             elif prob[i] == 1 and y[i] == 0:
160 |                 fp += 1
161 |             elif prob[i] == 0 and y[i] == 1:
162 |                 fn += 1
163 |             elif prob[i] == 0 and y[i] == 0:
164 |                 accu += 1
165 |                 tn += 1
166 |             else:
167 |                 raise Exception('unexpected prediction/label pair', prob[i], y[i])
168 |         length += len(prob)
169 |     # print accuracy, precision, and recall
170 |     print 'accuracy = ', accu * 100 / length, (tp + tn) / (tp + fp + fn + tn)
171 |     print 'precision = ', tp / (tp + fp)
172 |     print 'recall = ', tp / (tp + fn)
173 |
174 |     # graph('(-1) * theta[2][0] / theta[1][0] - (theta[0][0] / theta[1][0]) * x', range(-3, 5))
175 |
176 |
177 |
178 | if __name__ == '__main__':
179 |     main()
180 |
--------------------------------------------------------------------------------
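[Editor's note] The counter loop above implements the textbook definitions: accuracy = (tp + tn) / (tp + tn + fp + fn), precision = tp / (tp + fp), recall = tp / (tp + fn). For 0/1 label lists, scikit-learn (already a dependency of naive bayes.py below) computes the same quantities; a tiny self-contained check with made-up labels:

from sklearn.metrics import accuracy_score, precision_score, recall_score

y_true = [1, 0, 1, 1, 0]  # illustrative ground truth
y_pred = [1, 0, 0, 1, 1]  # illustrative predictions: tp=2, tn=1, fp=1, fn=1
print(accuracy_score(y_true, y_pred))   # 3/5 = 0.6
print(precision_score(y_true, y_pred))  # 2/3
print(recall_score(y_true, y_pred))     # 2/3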
/lr_app2.py:
--------------------------------------------------------------------------------
1 | #
2 | # lr_app2.py
3 | # author: eunkwang joo
4 | # description: Loads the trimmed datasets stored as CSV files and fits a logistic regression with pandas and statsmodels to estimate the coefficients. It then evaluates prediction accuracy on a held-out sample.
5 | #
6 |
7 | import pandas as pd
8 | import statsmodels.api as sm
9 | # import pylab as pl
10 | import numpy as np
11 | import sys
12 | import random
13 | import os
14 | import pickle
15 |
16 | # df = pd.read_csv('trimmed2_2001.csv')#sys.argv[1])
17 |
18 |
19 | #
20 | # function: reader()
21 | # description: Loads a CSV dataset into a pandas dataframe.
22 | # input: f= name of a csv file of dataset
23 | # output: d= dataframe loaded from csv dataset
24 | #
25 |
26 | def reader(f):
27 |     d = pd.read_csv(f, header=0)  # , axis=1)
28 |     # d.columns = range(d.shape[1])
29 |     return d
30 |
31 |
32 | #
33 | # function: shuffle()
34 | # description: Shuffles the data.
35 | # input: df= dataframe which holds data
36 | #        n= number of shuffles
37 | #        axis= axis along which to shuffle
38 | # output: df= shuffled dataframe
39 | #
40 |
41 | def shuffle(df, n=1, axis=0):
42 |     df = df.copy()
43 |     for _ in range(n):
44 |         df.apply(np.random.shuffle, axis=axis)
45 |     return df
46 |
47 |
48 | # search for csv files in the current directory
49 | for dirpath, dirnames, filenames in os.walk('.'):
50 |     break  # stop after the top level; the original bare 'pass' kept only the last directory visited
51 |
52 | filenames = [f for f in filenames if '.csv' in f]
53 | filenames.sort()
54 | print filenames
55 | # concatenate all csv files in one dataframe
56 | #[1532189 rows x 6 columns]
57 | df = pd.concat([reader(f) for f in filenames], keys=filenames)
58 |
59 | print df.head()
60 | print df.columns
61 |
62 | # dumm1 = pd.get_dummies(df['carrier'], prefix='carrier')
63 | # dumm2 = pd.get_dummies(df['dest'], prefix='dest')
64 | # dumm3 = pd.get_dummies(df['origin'], prefix='origin')
65 | # dumm4 = pd.get_dummies(df['tailNum'], prefix='tailNum')
66 |
67 | cols = ['delay', 'dayOfWeek', 'depTime']
68 |
69 | # data = df[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
70 | # data = df[cols].join(dumm1).join(dumm2).join(dumm3)
71 | # data['intercept'] = 1.0
72 | # print data.head() #[5 rows x 123 columns] including delay column
73 |
74 | # data_delay = data[data['delay'] == 1]
75 | # data_nodelay = data[data['delay'] == 0]
76 |
77 | # get delayed data only
78 | data_delay = df[df['delay'] == 1]
79 | rows = random.sample(data_delay.index, len(data_delay))  # samples every row, so data_delay_2 below is empty
80 | data_delay_1 = data_delay.ix[rows]
81 | data_delay_2 = data_delay.drop(rows)
82 |
83 | # get not delayed data only
84 | data_nodelay = df[df['delay'] == 0]
85 | rows = random.sample(data_nodelay.index, len(data_delay))
86 | data_nodelay = data_nodelay.ix[rows]
87 | # get sample dataset of 50% delayed and 50% not delayed data
88 | data_halfhalf = pd.concat([data_delay, data_nodelay])
89 |
90 | rows = random.sample(data_nodelay.index, len(data_delay) / 2)
91 | data_nodelay = data_nodelay.ix[rows]
92 | data_halfhalf_2 = pd.concat([data_delay_2, data_nodelay])  # leftover split; not used for training below
93 |
94 | # make dummy variables of carrier, dest, and origin
95 | dumm1 = pd.get_dummies(data_halfhalf['carrier'], prefix='carrier')
96 | dumm2 = pd.get_dummies(data_halfhalf['dest'], prefix='dest')
97 | dumm3 = pd.get_dummies(data_halfhalf['origin'], prefix='origin')
98 | data_halfhalf = data_halfhalf[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(
99 |     dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
100 | data_halfhalf['intercept'] = 1.0  # (552264, 117)
101 | # data_halfhalf = shuffle(data_halfhalf)
102 | # data_halfhalf.reindex(np.random.permutation(data_halfhalf.index))
103 | print 'delay = ', len(data_delay), len(data_nodelay), len(data_halfhalf)
104 |
105 |
106 | dumm1 = pd.get_dummies(data_halfhalf_2['carrier'], prefix='carrier')
107 | dumm2 = pd.get_dummies(data_halfhalf_2['dest'], prefix='dest')
108 | dumm3 = pd.get_dummies(data_halfhalf_2['origin'], prefix='origin')
109 | data_halfhalf_2 = data_halfhalf_2[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(
110 |     dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
111 | data_halfhalf_2['intercept'] = 1.0  # (552264, 117)
112 |
113 |
114 | # train dataset with logistic regression algorithm
115 | train_cols = data_halfhalf.columns[1:]
116 | logit = sm.Logit(data_halfhalf['delay'], data_halfhalf[train_cols])
117 | result = logit.fit(maxiter=1000)
118 |
119 | ff = open('halfhalf_sample_re3', 'w')
120 | ff.write(str(result.summary()))
121 | ff.close()
122 | print result.summary()
123 |
124 |
125 | # finally, we have theta - the fitted coefficients.
126 | a = np.array(result.params)
127 | pickle.dump(a, open('theta_half5', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
128 | theta = pickle.load(open('theta_half5', 'rb'))
129 |
130 |
131 | # hold out 10% of the balanced sample for testing (a single fold rather than full k-fold)
132 |
133 | '''
134 | df_test = pd.read_csv('trimmed2_2008.csv')
135 | dumm_test1 = pd.get_dummies(df_test['carrier'], prefix='carrier')
136 | dumm_test2 = pd.get_dummies(df_test['dest'], prefix='dest')
137 | dumm_test3 = pd.get_dummies(df_test['origin'], prefix='origin')
138 | data_test = df_test[cols].join(dumm_test1.ix[:, 'carrier_3.0':]).join(dumm_test2.ix[:, 'dest_6.0':]).join(dumm_test3.ix[:, 'origin_105.0':])
139 | data_test['intercept'] = 1.0
140 | data_test_cal = data_test.drop('delay', 1)
141 | dot = np.dot(data_test_cal, theta)
142 | '''
143 |
144 | rows = random.sample(data_halfhalf.index, len(data_halfhalf) / 10)
145 | df_10 = data_halfhalf.ix[rows]
146 | # df_90 = data_halfhalf.drop(rows)
147 | df_10_cal = df_10.drop('delay', 1)
148 | dotProduct = np.dot(df_10_cal, theta)  # m x 122 * 122 x 1
149 |
150 | # get inverse logit (sigmoid) of the linear predictor
151 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
152 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
153 |
154 | # predict with test dataset and measure accuracy, precision, and recall
155 | y = df_10['delay'].values  # use positional values; the sampled index is not 0..n-1
156 | tp, tn, fp, fn = 0., 0., 0., 0.
157 | for i in range(len(prob)):
158 |     if prob[i] == 1 and y[i] == 1:
159 |         tp += 1
160 |     elif prob[i] == 1 and y[i] == 0:
161 |         fp += 1
162 |     elif prob[i] == 0 and y[i] == 1:
163 |         fn += 1
164 |     elif prob[i] == 0 and y[i] == 0:
165 |         tn += 1
166 |     else:
167 |         raise Exception('unexpected prediction/label pair', prob[i], y[i])
168 |
169 | print 'accuracy = ', (tp + tn) / (tp + fp + fn + tn)
170 | print 'precision = ', tp / (tp + fp)
171 | print 'recall = ', tp / (tp + fn)
172 | print tp, tn, fp, fn
173 |
174 | # >>> print 'accuracy = ', (tp + tn) / (tp + fp + fn + tn)
175 | # accuracy = 0.60288632166
176 | # >>> print 'precision = ', tp / (tp + fp)
177 | # precision = 0.607973048849
178 | # >>> print 'recall = ', tp / (tp + fn)
179 | # recall = 0.586353790614
180 | # >>> print tp, tn, fp, fn
181 | # 16242.0 17053.0 10473.0 11458.0
182 |
183 |
184 | # measure ROC curve
185 |
186 | rlsort = reverseLogit[:]
187 | rlsort.sort()
188 | diff = min([j - i for i, j in zip(rlsort[:-1], rlsort[1:])])  # smallest gap between sorted scores (unused below; the sweep steps by 0.01 instead)
189 |
190 | p = len([e for e in y if e == 1])
191 | n = len([e for e in y if e == 0])
192 | j = rlsort[0]
193 | r = []
194 | while j <= rlsort[-1]:
195 |     prob = [1 if rev >= j else 0 for rev in reverseLogit]
196 |     p1 = [x for x in prob if x == 1]
197 |     # print p1
198 |     # raw_input()
199 |     tp, fp = 0., 0.
200 |     for i in range(len(prob)):
201 |         if prob[i] == 1 and y[i] == 1:
202 |             tp += 1
203 |         elif prob[i] == 1 and y[i] == 0:
204 |             fp += 1
205 |     r.append((fp / float(n), tp / float(p)))
206 |     # print j, tp, fp, p, n
207 |     j += 0.01
208 |
209 | # plot ROC curve
210 | import matplotlib as mpl
211 | mpl.use('Agg')
212 | import matplotlib.pyplot as plt
213 | from matplotlib.backends.backend_pdf import PdfPages
214 |
215 | pickle.dump(r, open('roc.list', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)  # persist the ROC points computed above, then plot them
216 | fig = plt.figure()
217 | plt.plot(*zip(*r), marker='o', color='r', ls='')
218 | pp = PdfPages('foo.pdf')
219 | pp.savefig(fig)
220 | pp.close()
221 |
--------------------------------------------------------------------------------
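[Editor's note] The modeling pattern in lr_app2.py — one-hot encode the categorical columns with pd.get_dummies, drop one level as the reference, add an explicit intercept, and fit sm.Logit — in a minimal end-to-end sketch on made-up data (all names below are illustrative, not project code):

import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(0)
toy = pd.DataFrame({
    'delay': np.random.randint(0, 2, 200),
    'depTime': np.random.randint(0, 3, 200),
    'carrier': np.random.choice(['AA', 'UA', 'WN'], 200),
})
dumm = pd.get_dummies(toy['carrier'], prefix='carrier')
X = toy[['depTime']].join(dumm.iloc[:, 1:])  # drop the first dummy level as the reference category
X['intercept'] = 1.0
result = sm.Logit(toy['delay'], X.astype(float)).fit(disp=0)
print(result.params)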
/model_selector.py:
--------------------------------------------------------------------------------
1 | #
2 | # model_selector.py
3 | # Author: Ryan Jung
4 | # Description: This script graphs the results of validation tests with precision on the
5 | # y-axis and recall on the x-axis.
6 | # Because we only used 8-fold validation for the Naive Bayes model, this script is only
7 | # used for the testing results of that validation.
8 | #
9 |
10 | import matplotlib.pyplot as plt
11 | plt.rcdefaults()
12 |
13 | # Hard code of testing results of form [precision, recall, accuracy, title]
14 | DM_TEST_DATA = [
15 |     [0.59, 0.61, 0.61, 'NB 2008'], [0.60, 0.61, 0.60, 'NB 2007'], [
16 |         0.60, 0.63, 0.62, 'NB 2006'], [0.62, 0.64, 0.64, 'NB 2005'],
17 |     [0.63, 0.66, 0.66, 'NB 2004'], [0.65, 0.70, 0.70, 'NB 2003'], [0.60, 0.65, 0.65, 'NB 2002'], [0.58, 0.62, 0.61, 'NB 2001']]
18 |
19 | #
20 | # Function: calc_f1_score(precision, recall, accuracy)
21 | # Description: This function calculates the F1 score = 2*(precision * recall) / (precision + recall)
22 | # Input: Floating point values of precision, recall, and accuracy (not used)
23 | # Output: Floating point F1 score
24 | #
25 |
26 |
27 | def calc_f1_score(precision, recall, accuracy):
28 |     return (float(2 * (precision * recall) / (precision + recall)))
29 |
30 | #
31 | # Main Function
32 | # Description: Creates array of precision and array of recall values. Uses best values to
33 | # track highest F1 score and title of test with best result.
34 | #
35 |
36 | precision_dm_array = []
37 | recall_dm_array = []
38 | dm_best_f1 = 0.0
39 | index = 0
40 | dm_best_title = 'None'
41 |
42 | for each in DM_TEST_DATA:
43 |     precision_dm_array.append(each[0])
44 |     recall_dm_array.append(each[1])
45 |
46 |     f1 = calc_f1_score(each[0], each[1], each[2])
47 |     if(f1 > dm_best_f1):
48 |         dm_best_f1 = f1
49 |         best_index = index
50 |         dm_best_title = each[3]
51 |     index += 1
52 |
53 | # prints title of Best performing model by F1 score
54 | # print "The Best Naive Bayes Model is: Model " + str(dm_best_title)
55 |
56 | # Scatter plot visualization of results with precision on y-axis and
57 | # recall on x-axis
58 | fig = plt.subplot(111)
59 | fig.scatter(recall_dm_array, precision_dm_array, color='blue')  # x=recall, y=precision, matching the axis labels below
60 | fig.set_xlabel('Recall')
61 | fig.set_ylabel('Precision')
62 |
63 | plt.show()
64 |
--------------------------------------------------------------------------------
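[Editor's note] Working the F1 formula through the hard-coded results: the NB 2003 run (precision 0.65, recall 0.70) gives F1 = 2(0.65)(0.70) / (0.65 + 0.70) = 0.91 / 1.35 ≈ 0.674, the highest of the eight entries, so dm_best_title ends up as 'NB 2003'. For comparison:

print(calc_f1_score(0.65, 0.70, 0.70))  # NB 2003 -> ~0.674, the best
print(calc_f1_score(0.63, 0.66, 0.66))  # NB 2004 -> ~0.645, the runner-up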
/naive bayes.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | #
4 | # Naive Bayes.py
5 | # Author: Divyakumar Menghani
6 | # Description: This code reads the dataset into pandas dataframes, builds a Naive Bayes classifier, and predicts labels for a subset of the data. It also calculates metrics such as precision/recall/accuracy and F-score after classification. The output is dumped into pickle files which are used later for visualization.
7 | #
8 |
9 | import pickle
10 | import sklearn
11 | from sklearn.naive_bayes import *
12 | import pandas as pd
13 | import numpy as np
14 | from sklearn import *
15 | import os
16 | from sklearn.metrics import *
17 | from sklearn import metrics, preprocessing
18 | from sklearn import svm, naive_bayes, neighbors, tree
19 |
20 | #
21 | # Function: createPickle()
22 | # Description: This function will create a pickle file.
23 | # Input: data structure that you want to pickle
24 | # Output: a pickle file for the data structure. The file is stored in the
25 | # same path the code is running from
26 | #
27 |
28 |
29 | def createPickle(data, filename):
30 |     with open(filename, 'wb') as f:
31 |         pickle.dump(data, f)
32 |     print "Pickled", filename
33 |
34 |
35 | # Global constants for this code
36 | print "Setting constants..."
37 |
38 | TRAINING_LINE_NUMBER = 8000000  # Number of lines to be read from input files
39 | # List of years for training and testing
40 | YEARS = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']
41 | INPUT_FILE_PATH = "/home/dmenghani/python/"  # Unix path
42 | # INPUT_FILE_PATH = "C:\\data\\airline\\"  # Windows path
43 | SKIP_FIRST_LINE = True  # To skip the first line, as it's the header
44 |
45 | # Creating the master data frame from all years.
46 | master = []
47 | print "Reading into Pandas frame..."
48 | try:
49 |     for year in YEARS:
50 |         path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))
51 |         print "\n", path
52 |         dfPart = pd.read_csv(
53 |             path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[
54 |                 u'Year',
55 |                 u'Month',
56 |                 u'DayofMonth',
57 |                 u'DayOfWeek',
58 |                 u'UniqueCarrier',
59 |                 u'DepTime',
60 |                 u'TailNum',
61 |                 u'Origin',
62 |                 u'Dest',
63 |                 u'DepDelay',
64 |                 # u'ArrDelay',
65 |                 u'Cancelled',
66 |                 # u'ArrTime',
67 |                 # u'ArrDelay',
68 |                 # u'Distance'
69 |             ])
70 |         print len(dfPart)
71 |         # Removing cancelled flights from each year
72 |         dfPart = dfPart[dfPart['Cancelled'] == 0]
73 |         rows = np.random.choice(
74 |             np.random.permutation(dfPart.index.values), len(dfPart) // 3, replace=False)  # 33% sampling of training data
75 |         print rows
76 |         sampled_dfPart = dfPart.ix[rows]
77 |         sampled_dfPart = dfPart  # note: this overrides the 33% sample; the full year is used
78 |         master.append(sampled_dfPart)
79 |         print
80 | except Exception as e:
81 |     print "Supplemental Data Import failed", e
82 |
83 | # Building the master frame by concatenating all years
84 | dfMaster = pd.concat(master, ignore_index=True)
85 | master = []
86 | dfPart = []
87 |
88 | print "Total length - ", len(dfMaster)
89 | del dfMaster['Cancelled']  # Column not needed
90 |
91 | dfMaster.fillna(0, inplace=True)
92 |
93 | # Converting to appropriate datatypes for numeric cols.
94 | dfMaster['Year'] = dfMaster['Year'].astype('int')
95 | dfMaster['Month'] = dfMaster['Month'].astype('int')
96 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
97 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')
98 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
99 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')
100 |
101 | df = dfMaster
102 |
103 | # Since we don't have a classification label in the data, we create
104 | # one. A threshold of 5 minutes was chosen.
105 | print "Calculating classification label..."
106 | df['label'] = 0
107 | df.label[df.DepDelay >= 5] = 1
108 | df.label[df.DepDelay < 5] = 0
109 | print "Actual delayed flights -", np.sum(dfMaster['label']) / len(dfMaster['label'])
110 |
111 | del df['DepDelay']
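# [Editor's note] The two indexed assignments above use pandas' chained-
# assignment idiom, which newer pandas versions warn about. The equivalent
# vectorized one-liner (an editor's aside, not a behavioral change):
#
#     df['label'] = (df['DepDelay'] >= 5).astype(int)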
112 |
113 | print "Dataframe shape - ", df.shape
114 | print "Columns -", df.columns
115 |
116 | # Converting categorical data to numeric for cols - TailNum,
117 | # UniqueCarrier, Dest, Origin
118 | print "Converting categorical data to numeric..."
119 | for col in set(df.columns):
120 |     if df[col].dtype == np.dtype('object'):
121 |         print "Converting...", col
122 |         if col == 'TailNum':
123 |             s = np.unique(df[col].values)
124 |             TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)
125 |         if col == 'UniqueCarrier':
126 |             s = np.unique(df[col].values)
127 |             UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)
128 |         if col == 'Dest':
129 |             s = np.unique(df[col].values)
130 |             Dest = pd.Series([x[0] for x in enumerate(s)], index=s)
131 |         if col == 'Origin':
132 |             s = np.unique(df[col].values)
133 |             Origin = pd.Series([x[0] for x in enumerate(s)], index=s)
134 |
135 | # Creating Pickle files for the list containing key-value pairs
136 | createPickle(Dest, 'Dest_2008.pkl')
137 | createPickle(Origin, 'Origin_2008.pkl')
138 | createPickle(UniqueCarrier, 'UniqueCarrier_2008.pkl')
139 | createPickle(TailNum, 'TailNum_2008.pkl')
140 | print "Pickle completed."
141 |
142 | #
143 | # Function: getTailNum()
144 | # Description: This function converts each input categorical value to its corresponding numeric key.
145 | # Input: categorical value you want to convert
146 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
147 | #
148 |
149 |
150 | def getTailNum(inTailNum):
151 |     out = []
152 |     for x, y in inTailNum.iteritems():
153 |         out.append(TailNum.get_value(y))
154 |     return out
155 |
156 | #
157 | # Function: getDest()
158 | # Description: This function converts each input categorical value to its corresponding numeric key.
159 | # Input: categorical value you want to convert
160 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
161 | #
162 |
163 |
164 | def getDest(inDest):
165 |     out = []
166 |     for x, y in inDest.iteritems():
167 |         out.append(Dest.get_value(y))
168 |     return out
169 |
170 | #
171 | # Function: getOrigin()
172 | # Description: This function converts each input categorical value to its corresponding numeric key.
173 | # Input: categorical value you want to convert
174 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
175 | #
176 |
177 |
178 | def getOrigin(inOrign):
179 |     out = []
180 |     for x, y in inOrign.iteritems():
181 |         out.append(Origin.get_value(y))
182 |     return out
183 |
184 | #
185 | # Function: getCarrier()
186 | # Description: This function converts each input categorical value to its corresponding numeric key.
187 | # Input: categorical value you want to convert
188 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
189 | #
190 |
191 |
192 | def getCarrier(inCarrier):
193 |     out = []
194 |     for x, y in inCarrier.iteritems():
195 |         out.append(UniqueCarrier.get_value(y))
196 |     return out
197 |
198 | # Converting TailNum
199 | df['TailNum'] = getTailNum(df['TailNum'])
200 | print "TailNum completed."
201 |
202 | # Converting UniqueCarrier
203 | df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])
204 | print "UniqueCarrier completed."
205 |
206 | # Converting Dest
207 | df['Dest'] = getDest(df['Dest'])
208 | print "Dest completed."
209 |
210 | # Converting Origin
211 | df['Origin'] = getOrigin(df['Origin'])
212 | print "Origin completed."
213 |
214 | print "Conversion to numeric completed."
215 |
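# [Editor's note] Each getX() helper above performs the same row-by-row Series
# lookup. Because TailNum/UniqueCarrier/Dest/Origin are pd.Series keyed by the
# raw strings, pandas' map() would collapse each conversion to a single line,
# e.g. (an equivalent sketch, not a behavioral change):
#
#     df['Dest'] = df['Dest'].map(Dest)
#     df['Origin'] = df['Origin'].map(Origin)
#     df['UniqueCarrier'] = df['UniqueCarrier'].map(UniqueCarrier)
#     df['TailNum'] = df['TailNum'].map(TailNum)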
216 | # Building classifier
217 | print "Begin cross validation..."
218 |
219 | # Choosing features for classifier
220 | features = df.columns[0:9]  # all columns except the label
221 |
222 | # Creating lists for storing results for cross validation.
223 | accuracy = {}
224 | results = {}
225 | matrix = {}
226 | prec = {}
227 | recall = {}
228 |
229 | for year in YEARS:
230 |     print "Testing on - ", year
231 |     train = df[df['Year'] != int(year)]  # Test on 1 year, train on the other 7 years
232 |     test = df[df['Year'] == int(year)]
233 |     # test = test[test['Origin'].isin([Origin['OAK'], Origin['SFO']])]
234 |     print len(train), len(test)
235 |     rows = np.random.choice(np.random.permutation(
236 |         test.index.values), len(test) // 2, replace=False)  # 50% sampling of test data to avoid memory errors
237 |     # print rows
238 |     sampled_test = test.ix[rows]
239 |     sampled_test = test  # note: this overrides the 50% sample; the full test year is used
240 |     # Putting the last column of Training data into a list
241 |     trainTargets = np.array(train['label']).astype(int)
242 |
243 |     # Putting the last column of Testing data into a list
244 |     testTargets = np.array(sampled_test['label']).astype(int)
245 |     print "Train length - ", len(train), "Test length - ", len(sampled_test)
246 |     print train['Year']
247 |     print test['Year']
248 |     print "Model fitting and prediction started..."
249 |     # Building the classifier and fitting the train data
250 |     gnb = GaussianNB()
251 |     y_gnb = gnb.fit(train[features], trainTargets).predict(
252 |         sampled_test[features])
253 |     # Storing results in a new column in the dataframe.
254 |     sampled_test['pred_label'] = y_gnb
255 |     print "Classification completed."
256 |     # Creating pickle files with the classifier and the results of classifier
257 |     createPickle(gnb, INPUT_FILE_PATH + "classifier_" + year + ".pkl")
258 |     createPickle(y_gnb, INPUT_FILE_PATH + "label_" + year + ".pkl")
259 |     sampled_test.to_csv(
260 |         os.path.join(INPUT_FILE_PATH, "_dfTest" + year + ".csv"), index=False)  # '_dfTest<year>.csv' is the name the plotting scripts expect
261 |     # Calculating metrics using sklearn metrics functions
262 |     print "\nCalculating metrics..."
263 |     accuracy[int(year)] = accuracy_score(sampled_test['label'], y_gnb)
264 |     print "Accuracy score - ", accuracy[int(year)]
265 |     prec[int(year)] = precision_score(
266 |         sampled_test['label'], y_gnb, average='micro')
267 |     print "Precision Score - ", prec[int(year)]
268 |     recall[int(year)] = recall_score(
269 |         sampled_test['label'], y_gnb, average='micro')
270 |     print "Recall Score - ", recall[int(year)]
271 |     print "Confusion matrix"
272 |     matrix[int(year)] = metrics.confusion_matrix(
273 |         sampled_test['label'], y_gnb)
274 |     print matrix[int(year)]
275 |     results[int(year)] = precision_recall_fscore_support(
276 |         sampled_test['label'], y_gnb, average='micro')
277 |     print "Precision, recall, F-Score, Support - ", results[int(year)]
278 |     print "Classification report"
279 |     print classification_report(np.array(sampled_test['label']), y_gnb,
280 |                                 target_names=['on time', 'delayed'])  # names for the 0/1 labels; 'target_names' was previously undefined
281 |     print
282 |     train = []
283 |     test = []
284 |
285 | print "Accuracy\n", accuracy
286 | print "\nPrecision\n", prec
287 | print "\nRecall\n", recall
288 | print "\nMetrics\n", results
289 | print "\nMatrix\n", matrix
290 |
291 | # Finding mean of metrics
292 | print "\nMean Cross validation Precision score", np.mean(pd.Series(prec))
293 | print "\nMean Cross validation Recall score", np.mean(pd.Series(recall))
294 | print "\nMean Cross validation Accuracy score", np.mean(pd.Series(accuracy))
295 |
296 | # Pickling results
297 | print "\nPickling stuff..."
298 | createPickle(accuracy, 'accuracy.pkl')
299 | createPickle(prec, 'prec.pkl')
300 | createPickle(results, 'results.pkl')
301 | createPickle(matrix, 'matrix.pkl')
302 |
--------------------------------------------------------------------------------