├── .ipynb_checkpoints
│   ├── Basic Code-DM-checkpoint.ipynb
│   ├── Basic Code-checkpoint.ipynb
│   ├── Julia Code-checkpoint.ipynb
│   ├── Naive Bayes - CrossVal-checkpoint.ipynb
│   ├── Naive Bayes Code-checkpoint.ipynb
│   ├── Test_SFO_OAK_FileGeneration-checkpoint.ipynb
│   ├── Untitled0-checkpoint.ipynb
│   ├── Untitled1-checkpoint.ipynb
│   ├── Untitled2-checkpoint.ipynb
│   └── Untitled3-checkpoint.ipynb
├── EDA_and_NB_performance_charts.py
├── INFO290T_Final_Project_Presentation_vFINAL.pptx
├── Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx
├── Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf
├── Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx
├── Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf
├── NB_performance_charts.py
├── Old Python Code
│   ├── Basic Code-DM.ipynb
│   ├── Basic.py
│   ├── Dest.pkl
│   ├── Julia Code.ipynb
│   ├── NB.py
│   ├── Naive Bayes - CrossVal.ipynb
│   ├── Naive Bayes Code.ipynb
│   ├── Origin.pkl
│   ├── TailNum.pkl
│   ├── Test_SFO_OAK_FileGeneration.ipynb
│   ├── UniqueCarrier.pkl
│   ├── Untitled0.ipynb
│   ├── Untitled1.ipynb
│   ├── Untitled2.ipynb
│   ├── Untitled3.ipynb
│   ├── accuracy.pkl
│   ├── counter.py
│   ├── counter1.py
│   ├── data_reader_v2.py
│   ├── data_reader_v3.py
│   ├── data_reader_v4_ek.py
│   ├── date_iterator_plot.py
│   ├── logisticRegression.py
│   ├── matrix.pkl
│   ├── model_selector.py
│   ├── output.txt
│   ├── prec.pkl
│   ├── results.pkl
│   └── why.csv
├── README.md
├── data_reader_v4_ek.py
├── data_reader_v4_ek_rj_csv.py
├── date_graph2.py
├── date_iterator_plot2.py
├── logisticRegression.py
├── lr_app2.py
├── model_selector.py
└── naive bayes.py
/.ipynb_checkpoints/Julia Code-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:da393243e5798034294abbf7f55af08e5d9a8eecbfb718fe2ddd80cd4a4d11b5"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "import csv\n",
16 | "import pickle\n",
17 | "\n",
18 | "needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]\n",
19 | "years = [2008]\n",
20 | "\n",
21 | "def ComputeDayofYear(row):\n",
22 | " \"\"\"This function will return an integer to represent the day of the year given an integer\n",
23 | " representing month and an integer representing the day of the month. This number will\n",
24 | " correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned\n",
25 | " as 0. Feb 29th will be returned as 59.\"\"\"\n",
26 | "\n",
27 | " if(row[0] == '1'):\n",
28 | " calc = 0 + int(row[1]) - 1\n",
29 | " row[1] = str(calc)\n",
30 | " elif(row[0] == '2'):\n",
31 | " calc = 31 + int(row[1]) - 1\n",
32 | " row[1] = str(calc)\n",
33 | " elif(row[0] == '3'):\n",
34 | " calc = 60 + int(row[1]) - 1\n",
35 | " row[1] = str(calc)\n",
36 | " elif(row[0] == '4'):\n",
37 | " calc = 91 + int(row[1]) - 1\n",
38 | " row[1] = str(calc)\n",
39 | " elif(row[0] == '5'):\n",
40 | " calc = 121 + int(row[1]) - 1\n",
41 | " row[1] = str(calc)\n",
42 | " elif(row[0] == '6'):\n",
43 | " calc = 152 + int(row[1]) - 1\n",
44 | " row[1] = str(calc)\n",
45 | " elif(row[0] == '7'):\n",
46 | " calc = 182 + int(row[1]) - 1\n",
47 | " row[1] = str(calc)\n",
48 | " elif(row[0] == '8'):\n",
49 | " calc = 213 + int(row[1]) - 1\n",
50 | " row[1] = str(calc)\n",
51 | " elif(row[0] == '9'):\n",
52 | " calc = 244 + int(row[1]) - 1\n",
53 | " row[1] = str(calc)\n",
54 | " elif(row[0] == '10'):\n",
55 | " calc = 274 + int(row[1]) - 1\n",
56 | " row[1] = str(calc)\n",
57 | " elif(row[0] == '11'):\n",
58 | " calc = 305 + int(row[1]) - 1\n",
59 | " row[1] = str(calc)\n",
60 | " elif(row[0] == '12'):\n",
61 | " calc = 335 + int(row[1]) - 1\n",
62 | " row[1] = str(calc)\n",
63 | " return row\n",
64 | "\n",
65 | "\n",
66 | "def DiscretizeDepTime(row):\n",
67 | " \"\"\"This function takes a scheduled departure time, classifies the departure time as:\n",
68 | " morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value\n",
69 | " is assumed to be an integer in 24-hour time format. These labels will correspond to\n",
70 | " variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.\n",
71 | " An error time is returned as morning.\"\"\"\n",
72 | "\n",
73 | " if(int(row[3]) <= 559):\n",
74 | " row[3] = '2'\n",
75 | " elif(int(row[3]) >= 600 and int(row[3]) <= 1259):\n",
76 | " row[3] = '0'\n",
77 | " elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):\n",
78 | " row[3] = '1'\n",
79 | " elif(int(row[3]) >= 1800):\n",
80 | " row[3] = '2'\n",
81 | " else:\n",
82 | " row[3] = '0'\n",
83 | " return row\n",
84 | "\n",
85 | "\n",
86 | "def AddDepVar(row):\n",
87 | " \"\"\"This function adds a classification label based on the length of the recorded\n",
88 | " Departure Delay in the data set. It assumes an input integer value of the delay in mins.\n",
89 | " By airline industry standards, flight delays are defined as departure delays greater than\n",
90 | " or equal to 15 minutes. For delayed flights, this variable will have value \"1\".\n",
91 | " For on time flights, it will have value \"0\". Default value will be set at \"0\".\"\"\"\n",
92 | "\n",
93 | " if(row[6] >= '15'):\n",
94 | " row[6] = '1'\n",
95 | " else:\n",
96 | " row[6] = '0'\n",
97 | " return row\n",
98 | "\n",
99 | "def SaveData(data, pickle_file_name):\n",
100 | " \"\"\"This function pickles each file.\"\"\"\n",
101 | "\n",
102 | " f = open (pickle_file_name, \"w\")\n",
103 | " pickle.dump(data, f)\n",
104 | " f.close()\n",
105 | "\n",
106 | "\n",
107 | "\n",
108 | "for i in years:\n",
109 | " data = []\n",
110 | " file_path='C:\\\\data\\\\airline\\\\'+str(i) + '.csv'\n",
111 | " pickle_file_name = 'data' + str(i)\n",
112 | " with open(file_path, 'r') as data_csv:\n",
113 | " csv_reader = csv.reader(data_csv, delimiter=',')\n",
114 | " for row in list(csv_reader):\n",
115 | " if row[21] == '0':\n",
116 | " content = list(row[i] for i in needed_cols)\n",
117 | " content2 = ComputeDayofYear(content)\n",
118 | " content3 = DiscretizeDepTime(content2)\n",
119 | " content4 = AddDepVar(content3)\n",
120 | " data.append(content4)\n",
121 | " SaveData(data, pickle_file_name)"
122 | ],
123 | "language": "python",
124 | "metadata": {},
125 | "outputs": []
126 | }
127 | ],
128 | "metadata": {}
129 | }
130 | ]
131 | }
--------------------------------------------------------------------------------
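Note: the month-offset chain in ComputeDayofYear above hard-codes leap-year cumulative day counts (March starting at day 60). A minimal sketch of an equivalent, assuming the same leap-year (2008) data and the same zero-based convention, lets the standard library do the calendar arithmetic; `compute_day_of_year` is a hypothetical helper, not part of the project code:

    import datetime

    def compute_day_of_year(month, day, year=2008):
        # Zero-based day of year: Jan 1 -> 0, Feb 29 -> 59 in a leap year.
        return datetime.date(year, month, day).timetuple().tm_yday - 1

    print(compute_day_of_year(1, 1))   # 0
    print(compute_day_of_year(2, 29))  # 59

It reproduces the notebook's offset table without the twelve-branch if/elif ladder.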
/.ipynb_checkpoints/Naive Bayes Code-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:4dd7867e8934ba7980fd61f1cdbc7df7ff1cccafc3f287e3da0b94562583a3d7"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "from __future__ import division\n",
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import sklearn\n",
19 | "from sklearn.naive_bayes import *\n",
20 | "from sklearn.metrics import *\n",
21 | "import os\n",
22 | "import cPickle\n",
23 | "import sys\n",
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "from optparse import OptionParser\n",
27 | "from sklearn import metrics, preprocessing\n",
28 | "from sklearn import svm, naive_bayes, neighbors, tree\n",
29 | "from sklearn.ensemble import AdaBoostClassifier\n",
30 | "from sklearn import cross_validation\n",
31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n",
32 | "from sklearn.svm import SVC # support vector machine classifier\n",
33 | "from sklearn.grid_search import GridSearchCV # hyperparameter grid search to find best model parameters\n",
34 | "from sklearn import preprocessing # preprocess string labels into numerics\n",
35 | "from sklearn import *\n",
36 | "from sklearn.metrics import precision_recall_fscore_support\n",
37 | "from sklearn.metrics import classification_report"
38 | ],
39 | "language": "python",
40 | "metadata": {},
41 | "outputs": [],
42 | "prompt_number": 197
43 | },
44 | {
45 | "cell_type": "code",
46 | "collapsed": false,
47 | "input": [
48 | "# Setting up constants\n",
49 | "print \"Setting constants...\"\n",
50 | "\n",
51 | "TRAINING_LINE_NUMBER = 1000000\n",
52 | "YEARS = ['2008']\n",
53 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n",
54 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n",
55 | "# YEARS = ['2008']\n",
56 | "\n",
57 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n",
58 | "\n",
59 | "master = []\n",
60 | "print \"Reading into Pandas frame...\"\n",
61 | "try:\n",
62 | " for year in YEARS:\n",
63 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n",
64 | " print \"\\n\",path\n",
65 | " dfPart = pd.read_csv(\n",
66 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n",
67 | " u'Year', \n",
68 | " u'Month', \n",
69 | " u'DayofMonth', \n",
70 | " u'DayOfWeek', \n",
71 | " u'UniqueCarrier',\n",
72 | " u'DepTime', \n",
73 | " u'TailNum', \n",
74 | " u'Origin', \n",
75 | " u'Dest', \n",
76 | " u'DepDelay', \n",
77 | "# u'ArrDelay', \n",
78 | " u'Cancelled',\n",
79 | "# u'ArrTime',\n",
80 | "# u'ArrDelay',\n",
81 | "# u'Distance'\n",
82 | " ])\n",
83 | " print len(dfPart)\n",
84 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n",
85 | " print \"Removed cancelled flights, new length - \",len(dfPart)\n",
86 | " master.append(dfPart)\n",
87 | " print\n",
88 | "except Exception as e:\n",
89 | " print \"Supplemental Data Import failed\", e\n",
90 | "\n",
91 | "dfMaster = pd.concat(master, ignore_index=True)\n",
92 | "master=[]\n",
93 | "dfPart=[]\n",
94 | "\n",
95 | "print \"Total length - \", len(dfMaster)\n",
96 | "del dfMaster['Cancelled']\n",
97 | "\n",
98 | "dfMaster.fillna(0, inplace=True)\n",
99 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n",
100 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n",
101 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n",
102 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n",
103 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n",
104 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n",
105 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n",
106 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n",
107 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n",
108 | "\n",
109 | "df = dfMaster\n",
110 | "\n",
111 | "print \"Calculating classification label...\"\n",
112 | "df['label'] = 0\n",
113 | "df.label[df.DepDelay >= 15] = 1\n",
114 | "df.label[df.DepDelay < 15] = 0\n",
115 | "\n",
116 | "df['DepDelay'][df.DepDelay < 0]=0\n",
117 | "del df['DepDelay']\n",
118 | "# df['ArrDelay'][df.ArrDelay < 0]=0\n",
119 | "\n",
120 | "print \"Dataframe shape - \",df.shape\n",
121 | "print \"Columns -\", df.columns"
122 | ],
123 | "language": "python",
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "output_type": "stream",
128 | "stream": "stdout",
129 | "text": [
130 | "Setting constants...\n",
131 | "Reading into Pandas frame...\n",
132 | "\n",
133 | "C:\\data\\airline\\2008.csv\n",
134 | "1000000"
135 | ]
136 | },
137 | {
138 | "output_type": "stream",
139 | "stream": "stdout",
140 | "text": [
141 | "\n",
142 | "Removed cancelled flights, new length - "
143 | ]
144 | },
145 | {
146 | "output_type": "stream",
147 | "stream": "stdout",
148 | "text": [
149 | " 967867\n",
150 | "\n",
151 | "Total length - "
152 | ]
153 | },
154 | {
155 | "output_type": "stream",
156 | "stream": "stdout",
157 | "text": [
158 | " 967867\n",
159 | "Calculating classification label..."
160 | ]
161 | },
162 | {
163 | "output_type": "stream",
164 | "stream": "stdout",
165 | "text": [
166 | "\n",
167 | "Dataframe shape - (967867, 10)\n",
168 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n"
169 | ]
170 | }
171 | ],
172 | "prompt_number": 198
173 | },
174 | {
175 | "cell_type": "code",
176 | "collapsed": false,
177 | "input": [
178 | "print \"Converting categorical data to numeric...\"\n",
179 | "for col in set(df.columns):\n",
180 | "# print col, train[col].dtype\n",
181 | " if df[col].dtype == np.dtype('object'):\n",
182 | " print \"Converting...\", col\n",
183 | " if col == 'TailNum':\n",
184 | " s = np.unique(df[col].values)\n",
185 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
186 | "# print TailNum\n",
187 | " if col == 'UniqueCarrier':\n",
188 | " s = np.unique(df[col].values)\n",
189 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
190 | "# print UniqueCarrier\n",
191 | " if col == 'Dest':\n",
192 | " s = np.unique(df[col].values)\n",
193 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
194 | "# print Dest\n",
195 | " if col == 'Origin':\n",
196 | " s = np.unique(df[col].values)\n",
197 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
198 | "# print Origin\n",
199 | "\n",
200 | "\n",
201 | "def getTailNum(inTailNum):\n",
202 | "# print \"In...\",type(inTailNum)\n",
203 | " out = []\n",
204 | " for x, y in inTailNum.iteritems():\n",
205 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n",
206 | " out.append(TailNum.get_value(y) + 1)\n",
207 | "# print \"final out\", out\n",
208 | " return out\n",
209 | "\n",
210 | "\n",
211 | "def getDest(inDest):\n",
212 | " out = []\n",
213 | " for x, y in inDest.iteritems():\n",
214 | " out.append(Dest.get_value(y) + 1)\n",
215 | " return out\n",
216 | "\n",
217 | "\n",
218 | "def getOrigin(inOrign):\n",
219 | " out = []\n",
220 | " for x, y in inOrign.iteritems():\n",
221 | " out.append(Origin.get_value(y) + 1)\n",
222 | " return out\n",
223 | "\n",
224 | "\n",
225 | "def getCarrier(inCarrier):\n",
226 | " out = []\n",
227 | " for x, y in inCarrier.iteritems():\n",
228 | " out.append(UniqueCarrier.get_value(y) + 1)\n",
229 | " return out\n",
230 | "\n",
231 | "df['TailNum'] = getTailNum(df['TailNum'])\n",
232 | "print \"TailNum completed.\"\n",
233 | "\n",
234 | "df['Dest'] = getDest(df['Dest'])\n",
235 | "print \"Dest completed.\"\n",
236 | "\n",
237 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n",
238 | "print \"UniqueCarrier completed.\"\n",
239 | "\n",
240 | "df['Origin'] = getOrigin(df['Origin'])\n",
241 | "print \"Origin completed.\"\n",
242 | "\n",
243 | "print \"Conversion to numeric completed.\"\n",
244 | "\n",
245 | "# print \"Pickling converted data...\"\n",
246 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")"
247 | ],
248 | "language": "python",
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "stream": "stdout",
254 | "text": [
255 | "Converting categorical data to numeric...\n",
256 | "Converting... Origin\n",
257 | "Converting..."
258 | ]
259 | },
260 | {
261 | "output_type": "stream",
262 | "stream": "stdout",
263 | "text": [
264 | " UniqueCarrier\n",
265 | "Converting..."
266 | ]
267 | },
268 | {
269 | "output_type": "stream",
270 | "stream": "stdout",
271 | "text": [
272 | " Dest\n",
273 | "Converting..."
274 | ]
275 | },
276 | {
277 | "output_type": "stream",
278 | "stream": "stdout",
279 | "text": [
280 | " TailNum\n",
281 | "TailNum completed."
282 | ]
283 | },
284 | {
285 | "output_type": "stream",
286 | "stream": "stdout",
287 | "text": [
288 | "\n",
289 | "Dest completed."
290 | ]
291 | },
292 | {
293 | "output_type": "stream",
294 | "stream": "stdout",
295 | "text": [
296 | "\n",
297 | "UniqueCarrier completed."
298 | ]
299 | },
300 | {
301 | "output_type": "stream",
302 | "stream": "stdout",
303 | "text": [
304 | "\n",
305 | "Origin completed."
306 | ]
307 | },
308 | {
309 | "output_type": "stream",
310 | "stream": "stdout",
311 | "text": [
312 | "\n",
313 | "Conversion to numeric completed.\n"
314 | ]
315 | }
316 | ],
317 | "prompt_number": 199
318 | },
319 | {
320 | "cell_type": "code",
321 | "collapsed": false,
322 | "input": [
323 | "Origin['SFO'], Origin['OAK']"
324 | ],
325 | "language": "python",
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "metadata": {},
330 | "output_type": "pyout",
331 | "prompt_number": 200,
332 | "text": [
333 | "(243, 192)"
334 | ]
335 | }
336 | ],
337 | "prompt_number": 200
338 | },
339 | {
340 | "cell_type": "code",
341 | "collapsed": false,
342 | "input": [
343 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n",
344 | "\n",
345 | "# add columns to your data frame\n",
346 | "\n",
347 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n",
348 | "\n",
349 | "# define training and test sets\n",
350 | "train = df[df['is_train'] == True]\n",
351 | "test = df[df['is_train'] == False]\n",
352 | "trainTargets = np.array(train['label']).astype(int)\n",
353 | "testTargets = np.array(test['label']).astype(int)\n",
354 | "features = df.columns[0:9]\n",
355 | "\n",
356 | "testSFO = test[test['Dest']==Origin['SFO']]\n",
357 | "print len(testSFO)\n",
358 | "\n",
359 | "testOAK = test[test['Dest']==Origin['OAK']]\n",
360 | "print len(testOAK)\n",
361 | "\n",
362 | "print \"Model fitting and prediction started...\"\n",
363 | "gnb = tree.DecisionTreeClassifier()\n",
364 | "\n",
365 | "# train model\n",
366 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n",
367 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n",
368 | "\n",
369 | "print \"Classification completed.\""
370 | ],
371 | "language": "python",
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "output_type": "stream",
376 | "stream": "stdout",
377 | "text": [
378 | "Begin classification...75% training, 25% testing, randomly chosen\n",
379 | "887"
380 | ]
381 | },
382 | {
383 | "output_type": "stream",
384 | "stream": "stdout",
385 | "text": [
386 | "\n",
387 | "39\n",
388 | "Model fitting and prediction started...\n",
389 | "Classification completed."
390 | ]
391 | },
392 | {
393 | "output_type": "stream",
394 | "stream": "stdout",
395 | "text": [
396 | "\n"
397 | ]
398 | }
399 | ],
400 | "prompt_number": 215
401 | },
402 | {
403 | "cell_type": "code",
404 | "collapsed": false,
405 | "input": [
406 | "features"
407 | ],
408 | "language": "python",
409 | "metadata": {},
410 | "outputs": [
411 | {
412 | "metadata": {},
413 | "output_type": "pyout",
414 | "prompt_number": 216,
415 | "text": [
416 | "Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')"
417 | ]
418 | }
419 | ],
420 | "prompt_number": 216
421 | },
422 | {
423 | "cell_type": "code",
424 | "collapsed": false,
425 | "input": [
426 | "print \"Calculating metrcs...\"\n",
427 | "# test['pred_label'] = y_gnb\n",
428 | "# test.head()\n",
429 | "acc = zip(test['label'], y_gnb)\n",
430 | "match_count = 0\n",
431 | "for i in acc:\n",
432 | " if i[0] == i[1]:\n",
433 | " match_count += 1\n",
434 | "print \"Matches - \", match_count\n",
435 | "print \"Total length - \", len(acc)\n",
436 | "print \"Accuracy:\", float(match_count) / len(acc)"
437 | ],
438 | "language": "python",
439 | "metadata": {},
440 | "outputs": [
441 | {
442 | "output_type": "stream",
443 | "stream": "stdout",
444 | "text": [
445 | "Calculating metrcs...\n",
446 | "Matches - "
447 | ]
448 | },
449 | {
450 | "output_type": "stream",
451 | "stream": "stdout",
452 | "text": [
453 | " 184048\n",
454 | "Total length - 242386\n",
455 | "Accuracy: 0.75931778238\n"
456 | ]
457 | }
458 | ],
459 | "prompt_number": 217
460 | },
461 | {
462 | "cell_type": "code",
463 | "collapsed": false,
464 | "input": [
465 | "print accuracy_score(test['label'],y_gnb)\n",
466 | "print metrics.confusion_matrix(test['label'],y_gnb)"
467 | ],
468 | "language": "python",
469 | "metadata": {},
470 | "outputs": [
471 | {
472 | "output_type": "stream",
473 | "stream": "stdout",
474 | "text": [
475 | "0.75931778238\n",
476 | "[[157152 29405]\n",
477 | " [ 28933 26896]]"
478 | ]
479 | },
480 | {
481 | "output_type": "stream",
482 | "stream": "stdout",
483 | "text": [
484 | "\n"
485 | ]
486 | }
487 | ],
488 | "prompt_number": 218
489 | },
490 | {
491 | "cell_type": "code",
492 | "collapsed": false,
493 | "input": [
494 | "gnb.feature_importances_"
495 | ],
496 | "language": "python",
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "metadata": {},
501 | "output_type": "pyout",
502 | "prompt_number": 219,
503 | "text": [
504 | "array([ 0. , 0.01151212, 0.0552584 , 0.03722765, 0.28496385,\n",
505 | " 0.07264084, 0.2130565 , 0.16164198, 0.16369866])"
506 | ]
507 | }
508 | ],
509 | "prompt_number": 219
510 | },
511 | {
512 | "cell_type": "code",
513 | "collapsed": false,
514 | "input": [
515 | "features"
516 | ],
517 | "language": "python",
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "metadata": {},
522 | "output_type": "pyout",
523 | "prompt_number": 222,
524 | "text": [
525 | "Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')"
526 | ]
527 | }
528 | ],
529 | "prompt_number": 222
530 | },
531 | {
532 | "cell_type": "code",
533 | "collapsed": false,
534 | "input": [
535 | "# average_precision_score(test['label'],y_gnb)\n",
536 | "precision_recall_fscore_support(test['label'],y_gnb,average='micro')"
537 | ],
538 | "language": "python",
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "metadata": {},
543 | "output_type": "pyout",
544 | "prompt_number": 223,
545 | "text": [
546 | "(0.47771798014244865, 0.48175679306453634, 0.47972888611433151, 55829)"
547 | ]
548 | }
549 | ],
550 | "prompt_number": 223
551 | },
552 | {
553 | "cell_type": "code",
554 | "collapsed": false,
555 | "input": [
556 | "# dfMaster['FlightDate'] =pd.to_datetime(dfMaster.Year*10000+dfMaster.Month*100+dfMaster.DayofMonth,format='%Y%m%d')"
557 | ],
558 | "language": "python",
559 | "metadata": {},
560 | "outputs": [],
561 | "prompt_number": 206
562 | },
563 | {
564 | "cell_type": "code",
565 | "collapsed": false,
566 | "input": [
567 | "# dfAirport = dfMaster[['FlightDate','Origin']].groupby([dfMaster['FlightDate'],dfMaster['Origin']]).agg([len])\n",
568 | "# # dfAirport.to_clipboard()\n",
569 | "# dfAirport"
570 | ],
571 | "language": "python",
572 | "metadata": {},
573 | "outputs": [],
574 | "prompt_number": 207
575 | },
576 | {
577 | "cell_type": "code",
578 | "collapsed": false,
579 | "input": [
580 | "print y_gnb[:10]\n",
581 | "print y_prob[:10]"
582 | ],
583 | "language": "python",
584 | "metadata": {},
585 | "outputs": [
586 | {
587 | "output_type": "stream",
588 | "stream": "stdout",
589 | "text": [
590 | "[0 0 0 1 1 0 0 0 1 1]\n",
591 | "[[ 1. 0.]\n",
592 | " [ 1. 0.]\n",
593 | " [ 1. 0.]\n",
594 | " [ 0. 1.]\n",
595 | " [ 0. 1.]\n",
596 | " [ 1. 0.]\n",
597 | " [ 1. 0.]\n",
598 | " [ 1. 0.]\n",
599 | " [ 0. 1.]\n",
600 | " [ 0. 1.]]\n"
601 | ]
602 | }
603 | ],
604 | "prompt_number": 224
605 | },
606 | {
607 | "cell_type": "code",
608 | "collapsed": false,
609 | "input": [
610 | "dfMaster[:100].to_csv(\"C:\\\\data\\\\airline\\\\SampleData.csv\")"
611 | ],
612 | "language": "python",
613 | "metadata": {},
614 | "outputs": [],
615 | "prompt_number": 227
616 | }
617 | ],
618 | "metadata": {}
619 | }
620 | ]
621 | }
--------------------------------------------------------------------------------
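Note: the categorical-to-numeric step in the notebook above builds one lookup Series per string column (TailNum, UniqueCarrier, Origin, Dest) and stores each value's position in sorted order plus one. A minimal sketch of that encoding on toy data, with a hypothetical `encode_column` helper standing in for the four get* functions:

    import numpy as np
    import pandas as pd

    def encode_column(values):
        # Map each distinct label to its rank in sorted order, stored 1-based.
        s = np.unique(values)                       # sorted unique labels
        lookup = pd.Series(range(len(s)), index=s)  # label -> 0-based position
        return values.map(lookup) + 1, lookup

    codes, origin_lookup = encode_column(pd.Series(['SFO', 'OAK', 'SFO', 'LAX']))
    print(codes.tolist())        # [3, 2, 3, 1]
    print(origin_lookup['SFO'])  # 2 -- the stored code is this value plus one

The plus-one offset is why later filters on the encoded columns need `lookup[name] + 1` rather than the raw lookup value.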
/.ipynb_checkpoints/Untitled0-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:44f7be7f1af03eb634d779b3e3fc1b7473ad8af24b380e9e53f9a15ad5274aaf"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "from __future__ import division\n",
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import sklearn\n",
19 | "from sklearn.naive_bayes import *\n",
20 | "from sklearn.metrics import *\n",
21 | "import os\n",
22 | "import cPickle\n",
23 | "import sys\n",
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "from optparse import OptionParser\n",
27 | "from sklearn import metrics, preprocessing\n",
28 | "from sklearn import svm, naive_bayes, neighbors, tree\n",
29 | "from sklearn.ensemble import AdaBoostClassifier\n",
30 | "from sklearn import cross_validation\n",
31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n",
32 | "from sklearn.svm import SVC # support vector machine classifier\n",
33 | "# hyperparameter grid search to find best model parameters\n",
34 | "from sklearn.grid_search import GridSearchCV\n",
35 | "from sklearn import preprocessing # preprocess string labels into numerics\n",
36 | "from sklearn import *\n",
37 | "from sklearn.metrics import precision_recall_fscore_support\n",
38 | "from sklearn.metrics import classification_report\n",
39 | "\n",
40 | "\n",
41 | "# In[135]:\n",
42 | "\n",
43 | "# Setting up constants\n",
44 | "print \"Setting constants...\"\n",
45 | "\n",
46 | "TRAINING_LINE_NUMBER = 500000\n",
47 | "YEARS = ['2006', '2008', '2007']\n",
48 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n",
49 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n",
50 | "# YEARS = ['2008']\n",
51 | "\n",
52 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n",
53 | "\n",
54 | "master = []\n",
55 | "print \"Reading into Pandas frame...\"\n",
56 | "try:\n",
57 | " for year in YEARS:\n",
58 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n",
59 | " print \"\\n\", path\n",
60 | " dfPart = pd.read_csv(\n",
61 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n",
62 | " u'Year',\n",
63 | " u'Month',\n",
64 | " u'DayofMonth',\n",
65 | " u'DayOfWeek',\n",
66 | " u'UniqueCarrier',\n",
67 | " u'DepTime',\n",
68 | " u'TailNum',\n",
69 | " u'Origin',\n",
70 | " u'Dest',\n",
71 | " u'DepDelay',\n",
72 | " # u'ArrDelay',\n",
73 | " u'Cancelled',\n",
74 | " # u'ArrTime',\n",
75 | " # u'ArrDelay',\n",
76 | " # u'Distance'\n",
77 | " ])\n",
78 | " print len(dfPart)\n",
79 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n",
80 | " print \"Removed cancelled flights, new length - \", len(dfPart)\n",
81 | " master.append(dfPart)\n",
82 | " print\n",
83 | "except Exception as e:\n",
84 | " print \"Supplemental Data Import failed\", e\n",
85 | "\n",
86 | "dfMaster = pd.concat(master, ignore_index=True)\n",
87 | "master = []\n",
88 | "dfPart = []\n",
89 | "\n",
90 | "print \"Total length - \", len(dfMaster)\n",
91 | "del dfMaster['Cancelled']\n",
92 | "\n",
93 | "dfMaster.fillna(0, inplace=True)\n",
94 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n",
95 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n",
96 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n",
97 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n",
98 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n",
99 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n",
100 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n",
101 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n",
102 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n",
103 | "\n",
104 | "df = dfMaster\n",
105 | "\n",
106 | "print \"Calculating classification label...\"\n",
107 | "df['label'] = 0\n",
108 | "df.label[df.DepDelay >= 15] = 1\n",
109 | "df.label[df.DepDelay < 15] = 0\n",
110 | "\n",
111 | "# df['DepDelay'][df.DepDelay < 0] = 0\n",
112 | "del df['DepDelay']\n",
113 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n",
114 | "\n",
115 | "print \"Dataframe shape - \", df.shape\n",
116 | "print \"Columns -\", df.columns\n",
117 | "\n",
118 | "\n",
119 | "# In[136]:\n",
120 | "\n",
121 | "print \"Converting categorical data to numeric...\"\n",
122 | "for col in set(df.columns):\n",
123 | "# print col, train[col].dtype\n",
124 | " if df[col].dtype == np.dtype('object'):\n",
125 | " print \"Converting...\", col\n",
126 | " if col == 'TailNum':\n",
127 | " s = np.unique(df[col].values)\n",
128 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
129 | "# print TailNum\n",
130 | " if col == 'UniqueCarrier':\n",
131 | " s = np.unique(df[col].values)\n",
132 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
133 | "# print UniqueCarrier\n",
134 | " if col == 'Dest':\n",
135 | " s = np.unique(df[col].values)\n",
136 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
137 | "# print Dest\n",
138 | " if col == 'Origin':\n",
139 | " s = np.unique(df[col].values)\n",
140 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
141 | "# print Origin\n",
142 | "\n",
143 | "\n",
144 | "def getTailNum(inTailNum):\n",
145 | "# print \"In...\",type(inTailNum)\n",
146 | " out = []\n",
147 | " for x, y in inTailNum.iteritems():\n",
148 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n",
149 | " out.append(TailNum.get_value(y) + 1)\n",
150 | "# print \"final out\", out\n",
151 | " return out\n",
152 | "\n",
153 | "\n",
154 | "def getDest(inDest):\n",
155 | " out = []\n",
156 | " for x, y in inDest.iteritems():\n",
157 | " out.append(Dest.get_value(y) + 1)\n",
158 | " return out\n",
159 | "\n",
160 | "\n",
161 | "def getOrigin(inOrign):\n",
162 | " out = []\n",
163 | " for x, y in inOrign.iteritems():\n",
164 | " out.append(Origin.get_value(y) + 1)\n",
165 | " return out\n",
166 | "\n",
167 | "\n",
168 | "def getCarrier(inCarrier):\n",
169 | " out = []\n",
170 | " for x, y in inCarrier.iteritems():\n",
171 | " out.append(UniqueCarrier.get_value(y) + 1)\n",
172 | " return out\n",
173 | "\n",
174 | "df['TailNum'] = getTailNum(df['TailNum'])\n",
175 | "print \"TailNum completed.\"\n",
176 | "\n",
177 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n",
178 | "print \"UniqueCarrier completed.\"\n",
179 | "\n",
180 | "df['Dest'] = getDest(df['Dest'])\n",
181 | "print \"Dest completed.\"\n",
182 | "\n",
183 | "df['Origin'] = getOrigin(df['Origin'])\n",
184 | "print \"Origin completed.\"\n",
185 | "\n",
186 | "print \"Conversion to numeric completed.\"\n",
187 | "\n",
188 | "# print \"Pickling converted data...\"\n",
189 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")\n"
190 | ],
191 | "language": "python",
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "output_type": "stream",
196 | "stream": "stdout",
197 | "text": [
198 | "Setting constants...\n",
199 | "Reading into Pandas frame...\n",
200 | "\n",
201 | "C:\\data\\airline\\2006.csv\n",
202 | "500000"
203 | ]
204 | },
205 | {
206 | "output_type": "stream",
207 | "stream": "stdout",
208 | "text": [
209 | "\n",
210 | "Removed cancelled flights, new length - "
211 | ]
212 | },
213 | {
214 | "output_type": "stream",
215 | "stream": "stdout",
216 | "text": [
217 | " 491158\n",
218 | "\n",
219 | "\n",
220 | "C:\\data\\airline\\2008.csv\n",
221 | "500000"
222 | ]
223 | },
224 | {
225 | "output_type": "stream",
226 | "stream": "stdout",
227 | "text": [
228 | "\n",
229 | "Removed cancelled flights, new length - "
230 | ]
231 | },
232 | {
233 | "output_type": "stream",
234 | "stream": "stdout",
235 | "text": [
236 | " 484708\n",
237 | "\n",
238 | "\n",
239 | "C:\\data\\airline\\2007.csv\n",
240 | "500000"
241 | ]
242 | },
243 | {
244 | "output_type": "stream",
245 | "stream": "stdout",
246 | "text": [
247 | "\n",
248 | "Removed cancelled flights, new length - "
249 | ]
250 | },
251 | {
252 | "output_type": "stream",
253 | "stream": "stdout",
254 | "text": [
255 | " 487243\n",
256 | "\n",
257 | "Total length - "
258 | ]
259 | },
260 | {
261 | "output_type": "stream",
262 | "stream": "stdout",
263 | "text": [
264 | " 1463109\n",
265 | "Calculating classification label..."
266 | ]
267 | },
268 | {
269 | "output_type": "stream",
270 | "stream": "stdout",
271 | "text": [
272 | "\n",
273 | "Dataframe shape - (1463109, 10)\n",
274 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n",
275 | "Converting categorical data to numeric...\n",
276 | "Converting..."
277 | ]
278 | },
279 | {
280 | "output_type": "stream",
281 | "stream": "stdout",
282 | "text": [
283 | " Origin\n",
284 | "Converting..."
285 | ]
286 | },
287 | {
288 | "output_type": "stream",
289 | "stream": "stdout",
290 | "text": [
291 | " UniqueCarrier\n",
292 | "Converting..."
293 | ]
294 | },
295 | {
296 | "output_type": "stream",
297 | "stream": "stdout",
298 | "text": [
299 | " Dest\n",
300 | "Converting..."
301 | ]
302 | },
303 | {
304 | "output_type": "stream",
305 | "stream": "stdout",
306 | "text": [
307 | " TailNum\n",
308 | "TailNum completed."
309 | ]
310 | },
311 | {
312 | "output_type": "stream",
313 | "stream": "stdout",
314 | "text": [
315 | "\n",
316 | "UniqueCarrier completed."
317 | ]
318 | },
319 | {
320 | "output_type": "stream",
321 | "stream": "stdout",
322 | "text": [
323 | "\n",
324 | "Dest completed."
325 | ]
326 | },
327 | {
328 | "output_type": "stream",
329 | "stream": "stdout",
330 | "text": [
331 | "\n",
332 | "Origin completed."
333 | ]
334 | },
335 | {
336 | "output_type": "stream",
337 | "stream": "stdout",
338 | "text": [
339 | "\n",
340 | "Conversion to numeric completed.\n"
341 | ]
342 | }
343 | ],
344 | "prompt_number": 13
345 | },
346 | {
347 | "cell_type": "code",
348 | "collapsed": false,
349 | "input": [
350 | "\n",
351 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n",
352 | "\n",
353 | "# add columns to your data frame\n",
354 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n",
355 | "\n",
356 | "# define training and test sets\n",
357 | "train = df[df['is_train'] == True]\n",
358 | "test = df[df['is_train'] == False]\n",
359 | "trainTargets = np.array(train['label']).astype(int)\n",
360 | "testTargets = np.array(test['label']).astype(int)\n",
361 | "features = df.columns[0:9]\n",
362 | "print \"Features - \",features\n",
363 | "print \"Model fitting and prediction started...\"\n",
364 | "gnb = GaussianNB()\n",
365 | "\n",
366 | "# train model\n",
367 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n",
368 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n",
369 | "\n",
370 | "print \"Classification completed.\""
371 | ],
372 | "language": "python",
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "output_type": "stream",
377 | "stream": "stdout",
378 | "text": [
379 | "Begin classification...75% training, 25% testing, randomly chosen\n",
380 | "Features - "
381 | ]
382 | },
383 | {
384 | "output_type": "stream",
385 | "stream": "stdout",
386 | "text": [
387 | " Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')\n",
388 | "Model fitting and prediction started...\n",
389 | "Classification completed."
390 | ]
391 | },
392 | {
393 | "output_type": "stream",
394 | "stream": "stdout",
395 | "text": [
396 | "\n",
397 | "Calculating metrcs...\n",
398 | "Accuracy - 0.798698653544\n",
399 | "Confusion metrics\n",
400 | "[[291966 106]\n",
401 | " [ 73525 178]]"
402 | ]
403 | },
404 | {
405 | "output_type": "stream",
406 | "stream": "stdout",
407 | "text": [
408 | "\n",
409 | "Precision - "
410 | ]
411 | },
412 | {
413 | "output_type": "stream",
414 | "stream": "stdout",
415 | "text": [
416 | "0.62676056338\n",
417 | "Recall - "
418 | ]
419 | },
420 | {
421 | "output_type": "stream",
422 | "stream": "stdout",
423 | "text": [
424 | "0.00241509843561\n"
425 | ]
426 | }
427 | ],
428 | "prompt_number": 14
429 | },
430 | {
431 | "cell_type": "code",
432 | "collapsed": false,
433 | "input": [
434 | "print \"Calculating metrcs...\"\n",
435 | "print \"Accuracy - \", accuracy_score(test['label'], y_gnb)\n",
436 | "print \"Confusion metrics\\n\", metrics.confusion_matrix(test['label'], y_gnb,labels=(0,1))\n",
437 | "print \"Precision - \", precision_score(test['label'], y_gnb)\n",
438 | "print \"Recall - \", recall_score(test['label'], y_gnb)\n"
439 | ],
440 | "language": "python",
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "output_type": "stream",
445 | "stream": "stdout",
446 | "text": [
447 | "Calculating metrcs...\n",
448 | "Accuracy - 0.798698653544\n",
449 | "Confusion metrics\n",
450 | "[[291966 106]\n",
451 | " [ 73525 178]]"
452 | ]
453 | },
454 | {
455 | "output_type": "stream",
456 | "stream": "stdout",
457 | "text": [
458 | "\n",
459 | "Precision - "
460 | ]
461 | },
462 | {
463 | "output_type": "stream",
464 | "stream": "stdout",
465 | "text": [
466 | "0.62676056338\n",
467 | "Recall - "
468 | ]
469 | },
470 | {
471 | "output_type": "stream",
472 | "stream": "stdout",
473 | "text": [
474 | "0.00241509843561\n"
475 | ]
476 | }
477 | ],
478 | "prompt_number": 25
479 | },
480 | {
481 | "cell_type": "code",
482 | "collapsed": false,
483 | "input": [
484 | "testSFO = test[test['Origin'] == Origin['SFO']]\n",
485 | "print len(testSFO)\n",
486 | "\n",
487 | "testOAK = test[test['Origin'] == Origin['OAK']]\n",
488 | "print len(testOAK)\n"
489 | ],
490 | "language": "python",
491 | "metadata": {},
492 | "outputs": [
493 | {
494 | "output_type": "stream",
495 | "stream": "stdout",
496 | "text": [
497 | "3563\n",
498 | "40\n"
499 | ]
500 | }
501 | ],
502 | "prompt_number": 22
503 | },
504 | {
505 | "cell_type": "code",
506 | "collapsed": false,
507 | "input": [
508 | " np.random.randint(2000, size=10)\n",
509 | " "
510 | ],
511 | "language": "python",
512 | "metadata": {},
513 | "outputs": [
514 | {
515 | "metadata": {},
516 | "output_type": "pyout",
517 | "prompt_number": 27,
518 | "text": [
519 | "array([ 437, 1815, 742, 148, 1399, 1171, 205, 1480, 838, 1437])"
520 | ]
521 | }
522 | ],
523 | "prompt_number": 27
524 | }
525 | ],
526 | "metadata": {}
527 | }
528 | ]
529 | }
--------------------------------------------------------------------------------
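Note: the GaussianNB cells above record accuracy 0.798698653544, precision 0.62676056338, and recall 0.00241509843561 next to the confusion matrix [[291966, 106], [73525, 178]]. As a sanity check, a minimal sketch recomputing those figures directly from the recorded counts:

    tn, fp = 291966, 106  # actual on-time flights: predicted on-time / predicted delayed
    fn, tp = 73525, 178   # actual delayed flights: predicted on-time / predicted delayed

    print(float(tn + tp) / (tn + fp + fn + tp))  # accuracy  ~0.79870
    print(float(tp) / (tp + fp))                 # precision ~0.62676
    print(float(tp) / (tp + fn))                 # recall    ~0.00242

The near-zero recall means the Gaussian model flags almost none of the 73,703 actually delayed flights, even though its headline accuracy is close to 80%.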
/.ipynb_checkpoints/Untitled2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:d3fd45c3529abf0b735e3b409e8980ec4b2e4e445277ba0cf2522e16729ae159"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "from __future__ import division\n",
16 | "import sys\n",
17 | "import csv\n",
18 | "import datetime\n",
19 | "import matplotlib.pyplot as plt; plt.rcdefaults()\n",
20 | "\n",
21 | "TIME_DELTA = 3\n",
22 | "\n",
23 | "for arg in sys.argv:\n",
24 | "\tif(arg != 'date_graph.py'):\n",
25 | "\t\tstart_date = datetime.datetime.strptime(arg, '%m-%d-%y')\n",
26 | "\t\tstart_date = datetime.date(start_date.year, start_date.month, start_date.day)\n",
27 | "\n",
28 | "delta = datetime.timedelta(days=TIME_DELTA)\n",
29 | "begin = start_date - delta\n",
30 | "end = start_date + delta\n",
31 | "\n",
32 | "SFO_Hash = {}\n",
33 | "OAK_Hash = {}\n",
34 | "SFO_count = 0\n",
35 | "OAK_count = 0\n",
36 | "with open('_dfTest2008.csv', 'r') as data:\n",
37 | "\tcsv_reader = csv.reader(data, delimiter=',')\n",
38 | "\tfor row in csv_reader:\n",
39 | "\t\tif(row[0] != 'Year'):\n",
40 | "\t\t\tyear = int(row[0])\n",
41 | "\t\t\tmonth = int(row[1])\n",
42 | "\t\t\tdate = int(row[2])\n",
43 | "\t\t\tcurr_date = datetime.date(year, month, date)\n",
44 | "\t\t\tif(curr_date >= begin and curr_date <= end):\n",
45 | "\t\t\t\torigin = row[7]\n",
46 | "\t\t\t\tif(origin == '270'):\n",
47 | "\t\t\t\t\tlabel = int(row[10])\n",
48 | "\t\t\t\t\tSFO_count += 1\n",
49 | "\t\t\t\t\tif(curr_date not in SFO_Hash):\n",
50 | "\t\t\t\t\t\tSFO_Hash[curr_date] = [label]\n",
51 | "\t\t\t\t\telse:\n",
52 | "\t\t\t\t\t\tSFO_Hash[curr_date].append(label)\t\n",
53 | "\t\t\t\tif(origin == '215'):\n",
54 | "\t\t\t\t\tlabel = int(row[10])\n",
55 | "\t\t\t\t\tOAK_count += 1\n",
56 | "\t\t\t\t\tif(curr_date not in OAK_Hash):\n",
57 | "\t\t\t\t\t\tOAK_Hash[curr_date] = [label]\n",
58 | "\t\t\t\t\telse:\n",
59 | "\t\t\t\t\t\tOAK_Hash[curr_date].append(label)\n",
60 | "\n",
61 | "iterator = datetime.timedelta(days=1)\n",
62 | "day_values = []\n",
63 | "SFO_Delays = []\n",
64 | "SFO_On_Time = []\n",
65 | "SFO_Flights = []\n",
66 | "SFO_Pct = []\n",
67 | "OAK_Delays = []\n",
68 | "OAK_On_Time = []\n",
69 | "OAK_Flights = []\n",
70 | "OAK_Pct = []\n",
71 | "\n",
72 | "while begin <= end:\n",
73 | "\tif(begin not in SFO_Hash):\n",
74 | "\t\tSFO_Delays.append(0)\n",
75 | "\t\tSFO_On_Time.append(0)\n",
76 | "\t\tSFO_Pct.append(0.00)\n",
77 | "\telse:\n",
78 | "\t\tSFO_Flights = SFO_Hash[begin]\n",
79 | "\t\tdelays = sum(SFO_Flights)\n",
80 | "\t\tnum_flights = len(SFO_Flights)\n",
81 | "\t\tpct = float(delays) / (num_flights + delays)\n",
82 | "\t\tSFO_Delays.append(delays)\n",
83 | "\t\tSFO_On_Time.append(num_flights - delays)\n",
84 | "\t\tSFO_Pct.append(pct)\n",
85 | "\t\n",
86 | "\tif(begin not in OAK_Hash):\n",
87 | "\t\tOAK_Delays.append(0)\n",
88 | "\t\tOAK_On_Time.append(0)\n",
89 | "\t\tOAK_Pct.append(0.00)\n",
90 | "\telse:\n",
91 | "\t\tOAK_Flights = OAK_Hash[begin]\n",
92 | "\t\tdelays = sum(OAK_Flights)\n",
93 | "\t\tnum_flights = len(OAK_Flights)\n",
94 | "\t\tpct = float(delays) / (num_flights + delays)\n",
95 | "\t\tOAK_Delays.append(delays)\n",
96 | "\t\tOAK_On_Time.append(num_flights - delays)\n",
97 | "\t\tOAK_Pct.append(pct)\n",
98 | "\t\n",
99 | "\tday_values.append(begin)\n",
100 | "\tbegin += iterator\n",
101 | "\n",
102 | "print SFO_Pct\n",
103 | "print OAK_Pct\n",
104 | "\n",
105 | "plt.title('Probability of Flight Delays at SFO vs. OAK Given Specific Date and +/- 3 Days')\n",
106 | "\n",
107 | "ax1 = plt.subplot(211)\n",
108 | "#ax1.bar(day_values, SFO_Delays, bottom = SFO_On_Time, color = 'red')\n",
109 | "#ax1.bar(day_values, SFO_On_Time, color = 'blue')\n",
110 | "ax1.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n",
111 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n",
112 | "ax1.set_yticks([0, 200, 450])\n",
113 | "ax1.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at SFO')\n",
114 | "\n",
115 | "ax2 = plt.subplot(212)\n",
116 | "#ax2.bar(day_values, OAK_Delays, bottom = OAK_On_Time, color = 'red')\n",
117 | "#ax2.bar(day_values, OAK_On_Time, color = 'blue')\n",
118 | "ax2.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n",
119 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n",
120 | "ax2.set_yticks([0, 200, 450])\n",
121 | "ax2.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at OAK')\n",
122 | "\n",
123 | "plt.show()"
124 | ],
125 | "language": "python",
126 | "metadata": {},
127 | "outputs": []
128 | }
129 | ],
130 | "metadata": {}
131 | }
132 | ]
133 | }
--------------------------------------------------------------------------------
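Note: the cell above buckets each flight's 0/1 delay label by date into SFO_Hash/OAK_Hash and then turns each day's list into a delayed fraction. A minimal sketch of that per-day aggregation on toy data, assuming the same 0/1 label convention:

    import datetime
    from collections import defaultdict

    labels_by_day = defaultdict(list)  # date -> list of 0/1 delay labels
    labels_by_day[datetime.date(2008, 1, 1)] += [0, 1, 0, 0]
    labels_by_day[datetime.date(2008, 1, 2)] += [1, 1]

    for day in sorted(labels_by_day):
        flights = labels_by_day[day]
        delays = sum(flights)  # labels are 0/1, so the sum is the delay count
        print("%s %.2f" % (day, float(delays) / len(flights)))
    # 2008-01-01 0.25
    # 2008-01-02 1.00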
/.ipynb_checkpoints/Untitled3-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:43ffcf25a0f9f00fd6bd77f3e24dfb6e62c5a764e70ce742b71da7b69b36310f"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": []
9 | }
--------------------------------------------------------------------------------
/EDA_and_NB_performance_charts.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import os
5 | from IPython.core.display import HTML
6 | from bokeh.plotting import *
7 |
8 |
9 | # load data into pandas
10 | INPUT_FILE = "C:\\data\\airline\\_dfTest2008.csv"
11 |
12 | SKIP_FIRST_LINE = True
13 |
14 | master = []
15 | print "Reading into Pandas frame..."
16 | try:
17 | dfPart = pd.read_csv(INPUT_FILE, skiprows=0, usecols=[ # nrows = 2000
18 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier',
19 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'label', u'pred_label'
20 | ])
21 | print len(dfPart)
22 | master.append(dfPart)
23 | except Exception as e:
24 | print "Data import failed", e
25 |
26 |
27 | dfMaster = pd.concat(master, ignore_index=True)
28 | print "Total length: ", len(dfMaster)
29 |
30 | # change data types
31 | dfMaster['Year'] = dfMaster['Year'].astype('int')
32 | dfMaster['Month'] = dfMaster['Month'].astype('int')
33 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
34 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
35 | dfMaster['UniqueCarrier'] = dfMaster['UniqueCarrier'].astype('int')
36 | dfMaster['TailNum'] = dfMaster['TailNum'].astype('int')
37 | dfMaster['Origin'] = dfMaster['Origin'].astype('int')
38 | dfMaster['Dest'] = dfMaster['Dest'].astype('int')
39 | dfMaster['label'] = dfMaster['label'].astype('int')
40 | dfMaster['pred_label'] = dfMaster['pred_label'].astype('int')
41 |
42 |
43 | df = dfMaster
44 | print "Appneding new variables..."
45 | df['accurate'] = 0
46 | df.accurate[df.label == df.pred_label] = 1
47 | df.accurate[df.label != df.pred_label] = 0
48 |
49 |
50 | df['dep_time'] = 0
51 | df.dep_time[df.DepTime.isin(xrange(700, 1300))] = 1   # morning: 0700-1259
52 | df.dep_time[df.DepTime.isin(xrange(1300, 1800))] = 2  # afternoon: 1300-1759
53 | df.dep_time[df.DepTime.isin(xrange(1800, 2401))] = 3  # night: 1800-2400...
54 | df.dep_time[df.DepTime.isin(xrange(0, 700))] = 3      # ...and 0000-0659
55 |
56 | # compute accuracy rates
57 | month_acc = dfMaster.groupby('Month').accurate.sum() / \
58 | dfMaster.groupby('Month').accurate.count()
59 | df_month_acc = pd.DataFrame(month_acc, columns=[u'Accuracy'])
60 | # print df_month_acc
61 |
62 | day_of_month_acc = dfMaster.groupby(
63 | 'DayofMonth').accurate.sum() / dfMaster.groupby('DayofMonth').accurate.count()
64 | df_day_of_month_acc = pd.DataFrame(day_of_month_acc, columns=[u'Accuracy'])
65 | # print df_day_of_month_acc
66 |
67 | day_of_week_acc = dfMaster.groupby(
68 | 'DayOfWeek').accurate.sum() / dfMaster.groupby('DayOfWeek').accurate.count()
69 | df_day_of_week_acc = pd.DataFrame(day_of_week_acc, columns=[u'Accuracy'])
70 | # print df_day_of_week_acc
71 |
72 | unique_carrier_acc = dfMaster.groupby(
73 | 'UniqueCarrier').accurate.sum() / dfMaster.groupby('UniqueCarrier').accurate.count()
74 | df_unique_carrier_acc = pd.DataFrame(
75 |     unique_carrier_acc, columns=[u'Accuracy'])
76 | # print df_unique_carrier_acc
77 |
78 | tail_num_acc = dfMaster.groupby(
79 | 'TailNum').accurate.sum() / dfMaster.groupby('TailNum').accurate.count()
80 | df_tail_num_acc = pd.DataFrame(tail_num_acc, columns=[u'Accuracy'])
81 | # print df_tail_num_acc
82 |
83 | origin_acc = dfMaster.groupby('Origin').accurate.sum() / \
84 | dfMaster.groupby('Origin').accurate.count()
85 | df_origin_acc = pd.DataFrame(origin_acc, columns=[u'Accuracy'])
86 | # print df_origin_acc
87 |
88 | dest_acc = dfMaster.groupby('Dest').accurate.sum() / \
89 | dfMaster.groupby('Dest').accurate.count()
90 | df_dest_acc = pd.DataFrame(dest_acc, columns=[u'Accuracy'])
91 | # print df_dest_acc
92 |
93 | dep_time_acc = dfMaster.groupby('dep_time').accurate.sum() / \
94 | dfMaster.groupby('dep_time').accurate.count()
95 | df_dep_time_acc = pd.DataFrame(dep_time_acc, columns=[u'Accuracy'])
96 | # print dep_time_acc
97 |
98 |
99 | # compute proportion of delays by each variable
100 |
101 | month_delays = dfMaster.groupby(
102 | 'Month').label.sum() / dfMaster.groupby('Month').label.count()
103 | df_month_delays = pd.DataFrame(month_delays, columns=[u'DelayRate'])
104 | # print df_month_delays
105 |
106 | day_of_month_delays = dfMaster.groupby(
107 | 'DayofMonth').label.sum() / dfMaster.groupby('DayofMonth').label.count()
108 | df_day_of_month_delays = pd.DataFrame(
109 |     day_of_month_delays, columns=[u'DelayRate'])
110 | # print df_day_of_month_delays
111 |
112 | day_of_week_delays = dfMaster.groupby(
113 | 'DayOfWeek').label.sum() / dfMaster.groupby('DayOfWeek').label.count()
114 | df_day_of_week_delays = pd.DataFrame(
115 |     day_of_week_delays, columns=[u'DelayRate'])
116 | # print df_day_of_week_delays
117 |
118 | unique_carrier_delays = dfMaster.groupby(
119 | 'UniqueCarrier').label.sum() / dfMaster.groupby('UniqueCarrier').label.count()
120 | df_unique_carrier_delays = pd.DataFrame(
121 |     unique_carrier_delays, columns=[u'DelayRate'])
122 | # print df_unique_carrier_delays
123 |
124 | tail_num_delays = dfMaster.groupby(
125 | 'TailNum').label.sum() / dfMaster.groupby('TailNum').label.count()
126 | df_tail_num_delays = pd.DataFrame(tail_num_delays, columns=[u'DelayRate'])
127 | # print df_tail_num_delays
128 |
129 | origin_delays = dfMaster.groupby(
130 | 'Origin').label.sum() / dfMaster.groupby('Origin').label.count()
131 | df_origin_delays = pd.DataFrame(origin_delays, columns=[u'DelayRate'])
132 | # print df_origin_delays
133 |
134 | dest_delays = dfMaster.groupby(
135 | 'Dest').label.sum() / dfMaster.groupby('Dest').label.count()
136 | df_dest_delays = pd.DataFrame(dest_delays, columns=[u'DelayRate'])
137 | # print df_dest_delays
138 |
139 | dep_time_delays = dfMaster.groupby(
140 | 'dep_time').label.sum() / dfMaster.groupby('dep_time').label.count()
141 | df_dep_time_delays = pd.DataFrame(dep_time_delays, columns=[u'DelayRate'])
142 | # print df_dep_time_delays
143 |
144 |
145 | # bar charts to see where delays are more likely
146 | df_day_of_month_delays.plot(kind='bar', color='grey', stacked=True)
147 |
148 | # df_day_of_week_delays.plot(kind='bar', color='grey', stacked=True)
149 |
150 | # df_unique_carrier_delays.plot(kind='bar', color='grey', stacked=True)
151 |
152 | # df_tail_num_delays.plot(kind='bar', color='grey', stacked=True)
153 |
154 | # df_origin_delays.plot(kind='bar', color='grey', stacked=True)
155 |
156 | # df_dest_delays.plot(kind='bar', color='grey', stacked=True)
157 |
158 | # df_dep_time_delays.plot(kind='bar', color='grey', stacked=True)
159 |
160 | # df_month_delays.plot(kind='bar', color='grey', stacked=True)
161 |
162 | plt.show()
163 |
164 |
165 | # plot bar charts for accuracy measures
166 | # df_month_acc.plot(kind='bar', color='grey', background_fill="#EAEAF2")
167 |
168 | # df_day_of_month_acc.plot(
169 | # kind='bar', color='grey', background_fill="#EAEAF2")
170 |
171 | # df_day_of_week_acc.plot(kind='bar', color='grey')
172 |
173 | # df_unique_carrier_acc.plot(kind='bar', color='grey')
174 |
175 | # df_tail_num_acc.plot(kind='bar', color='grey')
176 |
177 | # df_origin_acc.plot(kind='bar', color='grey')
178 |
179 | # df_dest_acc.plot(kind='bar', color='grey')
180 |
181 | # df_dep_time_acc.plot(kind='bar', color='grey')
182 |
183 | plt.show()
184 |
--------------------------------------------------------------------------------
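Note: each rate block above divides a groupby sum by a groupby count. Since `accurate` and `label` are 0/1 indicators, the same rate falls out of a single `mean()`; a minimal sketch on a hypothetical two-column frame:

    import pandas as pd

    df = pd.DataFrame({'Month':    [1, 1, 2, 2],
                       'accurate': [1, 0, 1, 1]})

    # The mean of a 0/1 column equals sum()/count(), i.e. the per-group rate.
    print(df.groupby('Month')['accurate'].mean())
    # Month
    # 1    0.5
    # 2    1.0
    # Name: accurate, dtype: float64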
/INFO290T_Final_Project_Presentation_vFINAL.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/INFO290T_Final_Project_Presentation_vFINAL.pptx
--------------------------------------------------------------------------------
/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx
--------------------------------------------------------------------------------
/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf
--------------------------------------------------------------------------------
/Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx
--------------------------------------------------------------------------------
/Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf
--------------------------------------------------------------------------------
/NB_performance_charts.py:
--------------------------------------------------------------------------------
1 | # This code builds some exploratory graphs to see how prediction accuracy
2 | # of the Naive Bayes model varies by each variable used to build the model.
3 |
4 |
5 | # Importing various modules to build graphs
6 | from __future__ import division
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | from pylab import figure, show
10 | from pandas import DataFrame, Series
11 | import pandas as pd
12 | import csv
13 | import os
14 | from bokeh.plotting import *
15 | import seaborn as sns
16 | from bokeh.objects import ColumnDataSource, Range1d
17 | from math import floor
18 | import bokeh
19 |
20 | sns.set_context("talk")
21 |
22 |
23 | # load 2008 test data into pandas
24 | INPUT_FILE = "C:\\Users\\user\\Desktop\\INFO_290T\\Final Project\Visualizations\\SFO_OAK_data\\_dfTest2008.csv"
25 |
26 | SKIP_FIRST_LINE = True
27 |
28 | master = []
29 | print "Reading into Pandas frame..."
30 | try:
31 | dfPart = pd.read_csv(INPUT_FILE, skiprows=0, usecols=[ # nrows = 2000
32 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier',
33 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'label', u'pred_label'
34 | ])
35 | print len(dfPart)
36 | master.append(dfPart)
37 | except Exception as e:
38 | print "Data import failed", e
39 |
40 |
41 | dfMaster = pd.concat(master, ignore_index=True)
42 | print "Total length: ", len(dfMaster)
43 |
44 | # change data types to integers
45 | dfMaster['Year'] = dfMaster['Year'].astype('int')
46 | dfMaster['Month'] = dfMaster['Month'].astype('int')
47 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
48 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
49 | dfMaster['UniqueCarrier'] = dfMaster['UniqueCarrier'].astype('int')
50 | dfMaster['TailNum'] = dfMaster['TailNum'].astype('int')
51 | dfMaster['Origin'] = dfMaster['Origin'].astype('int')
52 | dfMaster['Dest'] = dfMaster['Dest'].astype('int')
53 | dfMaster['label'] = dfMaster['label'].astype('int')
54 | dfMaster['pred_label'] = dfMaster['pred_label'].astype('int')
55 |
56 |
57 | df = dfMaster
58 | print "Appneding new variables..."
59 |
60 | # create a binary variable that indicates accuracy of prediction
61 | # for each record
62 | df['accurate'] = 0
63 | df.accurate[df.label == df.pred_label] = 1
64 | df.accurate[df.label != df.pred_label] = 0
65 |
66 |
67 | # discretize time of day variable and create a categorical variable
68 | # that captures morning (from 7 am to 1 pm), afternoon (1 pm to 6 pm),
69 | # and night (from 6 pm to 7 am)
70 | df['dep_time'] = 0
71 | df.dep_time[df.DepTime.isin(xrange(700, 1300))] = 1
72 | df.dep_time[df.DepTime.isin(xrange(1300, 1800))] = 2
73 | df.dep_time[df.DepTime.isin(xrange(1800, 2401))] = 3
74 | df.dep_time[df.DepTime.isin(xrange(0, 700))] = 3
75 |
76 | # compute accuracy rates for each variable
77 | month_acc = dfMaster.groupby('Month').accurate.sum() / \
78 | dfMaster.groupby('Month').accurate.count()
79 | df_month_acc = pd.DataFrame(month_acc, columns=[u'Accuracy'])
80 |
81 |
82 | day_of_month_acc = dfMaster.groupby(
83 | 'DayofMonth').accurate.sum() / dfMaster.groupby('DayofMonth').accurate.count()
84 | df_day_of_month_acc = pd.DataFrame(day_of_month_acc, columns=[u'Accuracy'])
85 |
86 | day_of_week_acc = dfMaster.groupby(
87 | 'DayOfWeek').accurate.sum() / dfMaster.groupby('DayOfWeek').accurate.count()
88 | df_day_of_week_acc = pd.DataFrame(day_of_week_acc, columns=[u'Accuracy'])
89 |
90 | unique_carrier_acc = dfMaster.groupby(
91 | 'UniqueCarrier').accurate.sum() / dfMaster.groupby('UniqueCarrier').accurate.count()
92 | df_unique_carrier_acc = pd.DataFrame(
93 | unique_carrier_acc, columns=[u'Accuracy'])
94 |
95 | tail_num_acc = dfMaster.groupby(
96 | 'TailNum').accurate.sum() / dfMaster.groupby('TailNum').accurate.count()
97 | df_tail_num_acc = pd.DataFrame(tail_num_acc, columns=[u'Accuracy'])
98 |
99 | origin_acc = dfMaster.groupby('Origin').accurate.sum() / \
100 | dfMaster.groupby('Origin').accurate.count()
101 | df_origin_acc = pd.DataFrame(origin_acc, columns=[u'Accuracy'])
102 |
103 | dest_acc = dfMaster.groupby('Dest').accurate.sum() / \
104 | dfMaster.groupby('Dest').accurate.count()
105 | df_dest_acc = pd.DataFrame(dest_acc, columns=[u'Accuracy'])
106 |
107 | dep_time_acc = dfMaster.groupby('dep_time').accurate.sum() / \
108 | dfMaster.groupby('dep_time').accurate.count()
109 | df_dep_time_acc = pd.DataFrame(dep_time_acc, columns=[u'Accuracy'])
110 |
111 |
112 | # compute proportion of delays by each variable
113 | month_delays = dfMaster.groupby(
114 | 'Month').label.sum() / dfMaster.groupby('Month').label.count()
115 | df_month_delays = pd.DataFrame(month_delays, columns=[u'DelayRate'])
116 |
117 | day_of_month_delays = dfMaster.groupby(
118 | 'DayofMonth').label.sum() / dfMaster.groupby('DayofMonth').label.count()
119 | df_day_of_month_delays = pd.DataFrame(
120 | day_of_month_delays, columns=[u'DelayRate'])
121 |
122 | day_of_week_delays = dfMaster.groupby(
123 | 'DayOfWeek').label.sum() / dfMaster.groupby('DayOfWeek').label.count()
124 | df_day_of_week_delays = pd.DataFrame(
125 | day_of_week_delays, columns=[u'DelayRate'])
126 |
127 | unique_carrier_delays = dfMaster.groupby(
128 | 'UniqueCarrier').label.sum() / dfMaster.groupby('UniqueCarrier').label.count()
129 | df_unique_carrier_delays = pd.DataFrame(
130 | unique_carrier_delays, columns=[u'DelayRate'])
131 |
132 | tail_num_delays = dfMaster.groupby(
133 | 'TailNum').label.sum() / dfMaster.groupby('TailNum').label.count()
134 | df_tail_num_delays = pd.DataFrame(tail_num_delays, columns=[u'DelayRate'])
135 |
136 | origin_delays = dfMaster.groupby(
137 | 'Origin').label.sum() / dfMaster.groupby('Origin').label.count()
138 | df_origin_delays = pd.DataFrame(origin_delays, columns=[u'DelayRate'])
139 |
140 | dest_delays = dfMaster.groupby(
141 | 'Dest').label.sum() / dfMaster.groupby('Dest').label.count()
142 | df_dest_delays = pd.DataFrame(dest_delays, columns=[u'DelayRate'])
143 |
144 | dep_time_delays = dfMaster.groupby(
145 | 'dep_time').label.sum() / dfMaster.groupby('dep_time').label.count()
146 | df_dep_time_delays = pd.DataFrame(dep_time_delays, columns=[u'DelayRate'])
147 |
148 |
149 | ############################################### BUILD GRAPHS ###########################################
150 |
151 | # build accuracy by day of month variable
152 | dfPlot = df_day_of_month_acc
153 | dfPlot.reset_index(inplace=True)
154 |
155 |
156 | fig = plt.figure()
157 | fig.suptitle('Accuracy by Day of Month', fontsize=14, fontweight='bold')
158 | ax = fig.add_subplot(111)
159 | fig.subplots_adjust(top=0.95)
160 | ax.set_xlabel('Day of Month')
161 | ax.set_ylabel('Accuracy')
162 | ax.bar(dfPlot['DayofMonth'], dfPlot['Accuracy'])
163 | plt.xticks(dfPlot['DayofMonth'], xrange(1, 32), rotation=45)
164 | plt.show()
165 |
166 | # build accuracy by month variable
167 | dfPlot = df_month_acc
168 | dfPlot.reset_index(inplace=True)
169 |
170 |
171 | fig = plt.figure()
172 | fig.suptitle('Accuracy by Month', fontsize=14, fontweight='bold')
173 | ax = fig.add_subplot(111)
174 | fig.subplots_adjust(top=0.95)
175 | ax.set_xlabel('Month')
176 | ax.set_ylabel('Accuracy')
177 | ax.bar(dfPlot['Month'], dfPlot['Accuracy'])
178 | plt.xticks(dfPlot['Month'], xrange(1, 13), rotation=45)
179 | plt.show()
180 |
181 | # build accuracy by day of week variable
182 | dfPlot = df_day_of_week_acc
183 | dfPlot.reset_index(inplace=True)
184 |
185 |
186 | fig = plt.figure()
187 | fig.suptitle('Accuracy by Day of Week', fontsize=14, fontweight='bold')
188 | ax = fig.add_subplot(111)
189 | fig.subplots_adjust(top=0.95)
190 | ax.set_xlabel('Day of Week')
191 | ax.set_ylabel('Accuracy')
192 | ax.bar(dfPlot['DayOfWeek'], dfPlot['Accuracy'])
193 | plt.xticks(dfPlot['DayOfWeek'], xrange(1, 8), rotation=45)
194 | plt.show()
195 |
196 | # build accuracy by unique carrier variable
197 | dfPlot = df_unique_carrier_acc
198 | dfPlot.reset_index(inplace=True)
199 |
200 |
201 | fig = plt.figure()
202 | fig.suptitle('Accuracy by Unique Carrier', fontsize=14, fontweight='bold')
203 | ax = fig.add_subplot(111)
204 | fig.subplots_adjust(top=0.95)
205 | ax.set_xlabel('Unique carrier')
206 | ax.set_ylabel('Accuracy')
207 | ax.bar(dfPlot['UniqueCarrier'], dfPlot['Accuracy'])
208 | plt.xticks(rotation=45)
209 | plt.show()
210 |
211 | # build accuracy by tail number variable
212 | dfPlot = df_tail_num_acc
213 | dfPlot.reset_index(inplace=True)
214 |
215 |
216 | fig = plt.figure()
217 | fig.suptitle('Accuracy by Tail Number', fontsize=14, fontweight='bold')
218 | ax = fig.add_subplot(111)
219 | fig.subplots_adjust(top=0.95)
220 | ax.set_xlabel('Tail number')
221 | ax.set_ylabel('Accuracy')
222 | ax.bar(dfPlot['TailNum'], dfPlot['Accuracy'])
223 | plt.xticks(rotation=45)
224 | plt.show()
225 |
226 | # build accuracy by origin variable
227 | dfPlot = df_origin_acc
228 | dfPlot.reset_index(inplace=True)
229 |
230 |
231 | fig = plt.figure()
232 | fig.suptitle('Accuracy by Origin', fontsize=14, fontweight='bold')
233 | ax = fig.add_subplot(111)
234 | fig.subplots_adjust(top=0.95)
235 | ax.set_xlabel('Origin airport')
236 | ax.set_ylabel('Accuracy')
237 | ax.bar(dfPlot['Origin'], dfPlot['Accuracy'])
238 | plt.xticks(rotation=45)
239 | plt.show()
240 |
241 | # build accuracy by destination variable
242 | dfPlot = df_dest_acc
243 | dfPlot.reset_index(inplace=True)
244 |
245 |
246 | fig = plt.figure()
247 | fig.suptitle('Accuracy by Destination', fontsize=14, fontweight='bold')
248 | ax = fig.add_subplot(111)
249 | fig.subplots_adjust(top=0.95)
250 | ax.set_xlabel('Destination airport')
251 | ax.set_ylabel('Accuracy')
252 | ax.bar(dfPlot['Dest'], dfPlot['Accuracy'])
253 | plt.xticks(rotation=45)
254 | plt.show()
255 |
256 | # build accuracy by departure time variable
257 | dfPlot = df_dep_time_acc
258 | dfPlot.reset_index(inplace=True)
259 |
260 |
261 | fig = plt.figure()
262 | fig.suptitle('Accuracy by Departure Time', fontsize=14, fontweight='bold')
263 | ax = fig.add_subplot(111)
264 | fig.subplots_adjust(top=0.95)
265 | ax.set_xlabel('Departure time')
266 | ax.set_ylabel('Accuracy')
267 | ax.bar(dfPlot['dep_time'], dfPlot['Accuracy'])
268 | plt.xticks(dfPlot['dep_time'], ['Morning', 'Afternoon', 'Night'], rotation=45)
269 | plt.show()
270 |
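271 | # ----------------------------------------------------------------------
272 | # Editor's sketch (not part of the original script): the dep_time buckets
273 | # and the eight near-identical chart blocks above can each be written once.
274 | # A minimal sketch, assuming df and the df_*_acc frames built above are in
275 | # scope; bucket and plot_accuracy are helper names introduced here for
276 | # illustration.
277 |
278 |
279 | def bucket(hhmm):
280 |     """Map an HHMM departure time to 1=morning, 2=afternoon, 3=night."""
281 |     if 700 <= hhmm < 1300:
282 |         return 1
283 |     elif 1300 <= hhmm < 1800:
284 |         return 2
285 |     return 3
286 |
287 | df['dep_time'] = df.DepTime.apply(bucket)
288 |
289 |
290 | def plot_accuracy(dfAcc, col, title):
291 |     """Draw one bar chart of per-group prediction accuracy."""
292 |     dfAcc = dfAcc.reset_index()
293 |     fig = plt.figure()
294 |     fig.suptitle(title, fontsize=14, fontweight='bold')
295 |     ax = fig.add_subplot(111)
296 |     ax.set_xlabel(col)
297 |     ax.set_ylabel('Accuracy')
298 |     ax.bar(dfAcc[col], dfAcc['Accuracy'])
299 |     plt.xticks(rotation=45)
300 |     plt.show()
301 |
302 | for frame, col, title in [
303 |         (df_month_acc, 'Month', 'Accuracy by Month'),
304 |         (df_day_of_week_acc, 'DayOfWeek', 'Accuracy by Day of Week'),
305 |         (df_dep_time_acc, 'dep_time', 'Accuracy by Departure Time')]:
306 |     plot_accuracy(frame, col, title)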
--------------------------------------------------------------------------------
/Old Python Code/Basic.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | # -*- coding: utf-8 -*-
4 | # 3.0
5 |
6 | #
7 |
8 | #!/usr/bin/env python
9 |
10 | """This file contains the code for the Data Mining Class. It uses the Airline dataset <>"""
11 |
12 | __author__ = ""
13 | __email__ = ""
14 | __status__ = ""
15 |
16 | #
17 |
18 | # Importing various modules
19 |
20 | import matplotlib.pyplot as plt
21 | import numpy as np
22 | from pylab import figure, show
23 | from pandas import DataFrame, Series
24 | import pandas as pd
25 | import csv
26 | import os
27 | import statsmodels.formula.api as smf
28 | import scipy.stats as stats
29 | import statsmodels.api as sm
30 |
31 | #
32 |
33 | # Setting global constants. Please initialize this before running the code
34 |
35 | TRAINING_LINE_NUMBER = 100000 # Number of lines to be read from the huge file, set to total file length while running for entire file
36 | INPUT_FILE_PATH="C:\\data\\airline\\" # Path of the folder where you have placed your files
37 | SKIP_FIRST_LINE = True # To skip the first line, as it's the header
38 | YEARS = ['2008'] # Add more years in this list and add the files in the INPUT_FILE_PATH
39 |
40 | #
41 |
42 | # Setting the dataframes for Airline, Plane and Carriers
43 |
44 | try:
45 | path = "C:\\data\\airline\\plane-data.csv"
46 | dfPlane = pd.read_csv(path)
47 | path = 'C:\\data\\airline\\airports.csv'
48 | dfAirport = pd.read_csv(path)
49 | path = 'C:\\data\\airline\\carriers.csv'
50 | dfCarrier = pd.read_csv(path)
51 | except Exception as e:
52 | print "Supplemental Data Import failed", e
53 |
54 | #
55 |
56 | # Reading the main file into a Pandas dataframe
57 |
58 | try:
59 | for year in YEARS:
60 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))
61 | dfMaster = pd.read_csv(path, nrows=TRAINING_LINE_NUMBER,skiprows=0)
62 | except Exception as e:
63 | print "Supplemental Data Import failed", e
64 | dfMaster.head()
65 |
66 | #
67 |
68 | dfMaster.fillna(0,inplace=True)
69 |
70 | #
71 |
72 | # TODO: Do this for other dataframes as well
73 |
74 | # Convert all columns to respective datatypes
75 |
76 | dfMaster['Year'] = dfMaster['Year'].astype('int')
77 | dfMaster['Month'] = dfMaster['Month'].astype('int')
78 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
79 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')
80 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
81 | dfMaster['CRSDepTime'] = dfMaster['CRSDepTime'].astype('int')
82 | dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')
83 | dfMaster['CRSArrTime'] = dfMaster['CRSArrTime'].astype('int')
84 | dfMaster['FlightNum'] = dfMaster['FlightNum'].astype('int')
85 | dfMaster['ActualElapsedTime'] = dfMaster['ActualElapsedTime'].astype('int')
86 | dfMaster['CRSElapsedTime'] = dfMaster['CRSElapsedTime'].astype('int')
87 | dfMaster['AirTime'] = dfMaster['AirTime'].astype('int')
88 | dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')
89 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')
90 | dfMaster['Distance'] = dfMaster['Distance'].astype('int')
91 | dfMaster['TaxiIn'] = dfMaster['TaxiIn'].astype('int')
92 | dfMaster['TaxiOut'] = dfMaster['TaxiOut'].astype('int')
93 | dfMaster['Cancelled'] = dfMaster['Cancelled'].astype('int')
94 | dfMaster['Diverted'] = dfMaster['Diverted'].astype('int')
95 | print dfMaster.columns
96 |
97 | #
98 |
99 | # for col in dfMaster.columns:
100 | # print 'dfMaster[\'',col,'\'] = dfMaster[\'',col,'\'].astype(\'int\')'
101 |
102 | #
103 |
104 | results = sm.OLS.from_formula('DepDelay ~ ArrDelay', dfMaster).fit()
105 | print results.summary()
106 |
107 | #
108 |
109 | intercept, slope = results.params
110 | r2 = results.rsquared
111 | print slope, intercept, r2
112 |
113 | plt.plot(dfMaster['ArrDelay'], dfMaster['DepDelay'], 'bo')  # ArrDelay on x to match the fitted line below
114 | x = np.array([min(dfMaster['ArrDelay']), max(dfMaster['ArrDelay'])])
115 | y = intercept + slope * x
116 | plt.plot(x, y, 'r-')
117 | plt.show()
118 |
119 |
120 | from statsmodels.stats.anova import anova_lm
121 |
122 | anova_lm(results)
123 |
124 |
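125 | # ----------------------------------------------------------------------
126 | # Editor's sketch (not part of the original script): the nineteen
127 | # .astype('int') lines above (and the commented-out generator at lines
128 | # 99-100) suggest a loop over a column list. A minimal sketch over the
129 | # same columns; int_cols is a name introduced here for illustration.
130 | int_cols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
131 |             'CRSDepTime', 'ArrTime', 'CRSArrTime', 'FlightNum',
132 |             'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
133 |             'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled',
134 |             'Diverted']
135 | for col in int_cols:
136 |     dfMaster[col] = dfMaster[col].astype('int')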
--------------------------------------------------------------------------------
/Old Python Code/Dest.pkl:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (cpandas.core.series
5 | Series
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_data'
14 | p6
15 | g0
16 | (cpandas.core.internals
17 | SingleBlockManager
18 | p7
19 | g2
20 | Ntp8
21 | Rp9
22 | ((lp10
23 | cnumpy.core.multiarray
24 | _reconstruct
25 | p11
26 | (cpandas.core.index
27 | Index
28 | p12
29 | (I0
30 | tp13
31 | S'b'
32 | p14
33 | tp15
34 | Rp16
35 | ((I1
36 | (L64L
37 | tp17
38 | cnumpy
39 | dtype
40 | p18
41 | (S'O8'
42 | p19
43 | I0
44 | I1
45 | tp20
46 | Rp21
47 | (I3
48 | S'|'
49 | p22
50 | NNNI-1
51 | I-1
52 | I63
53 | tp23
54 | bI00
55 | (lp24
56 | S'ABQ'
57 | p25
58 | aS'ALB'
59 | p26
60 | aS'AMA'
61 | p27
62 | aS'AUS'
63 | p28
64 | aS'BDL'
65 | p29
66 | aS'BHM'
67 | p30
68 | aS'BNA'
69 | p31
70 | aS'BOI'
71 | p32
72 | aS'BUF'
73 | p33
74 | aS'BUR'
75 | p34
76 | aS'BWI'
77 | p35
78 | aS'CLE'
79 | p36
80 | aS'CMH'
81 | p37
82 | aS'CRP'
83 | p38
84 | aS'DAL'
85 | p39
86 | aS'DEN'
87 | p40
88 | aS'DTW'
89 | p41
90 | aS'ELP'
91 | p42
92 | aS'FLL'
93 | p43
94 | aS'GEG'
95 | p44
96 | aS'HOU'
97 | p45
98 | aS'HRL'
99 | p46
100 | aS'IAD'
101 | p47
102 | aS'IND'
103 | p48
104 | aS'ISP'
105 | p49
106 | aS'JAN'
107 | p50
108 | aS'JAX'
109 | p51
110 | aS'LAS'
111 | p52
112 | aS'LAX'
113 | p53
114 | aS'LBB'
115 | p54
116 | aS'LIT'
117 | p55
118 | aS'MAF'
119 | p56
120 | aS'MCI'
121 | p57
122 | aS'MCO'
123 | p58
124 | aS'MDW'
125 | p59
126 | aS'MHT'
127 | p60
128 | aS'MSY'
129 | p61
130 | aS'OAK'
131 | p62
132 | aS'OKC'
133 | p63
134 | aS'OMA'
135 | p64
136 | aS'ONT'
137 | p65
138 | aS'ORF'
139 | p66
140 | aS'PBI'
141 | p67
142 | aS'PDX'
143 | p68
144 | aS'PHL'
145 | p69
146 | aS'PHX'
147 | p70
148 | aS'PIT'
149 | p71
150 | aS'PVD'
151 | p72
152 | aS'RDU'
153 | p73
154 | aS'RNO'
155 | p74
156 | aS'RSW'
157 | p75
158 | aS'SAN'
159 | p76
160 | aS'SAT'
161 | p77
162 | aS'SDF'
163 | p78
164 | aS'SEA'
165 | p79
166 | aS'SFO'
167 | p80
168 | aS'SJC'
169 | p81
170 | aS'SLC'
171 | p82
172 | aS'SMF'
173 | p83
174 | aS'SNA'
175 | p84
176 | aS'STL'
177 | p85
178 | aS'TPA'
179 | p86
180 | aS'TUL'
181 | p87
182 | aS'TUS'
183 | p88
184 | atp89
185 | (Ntp90
186 | tp91
187 | ba(lp92
188 | g11
189 | (cnumpy
190 | ndarray
191 | p93
192 | (I0
193 | tp94
194 | g14
195 | tp95
196 | Rp96
197 | (I1
198 | (L64L
199 | tp97
200 | g18
201 | (S'i8'
202 | p98
203 | I0
204 | I1
205 | tp99
206 | Rp100
207 | (I3
208 | S'<'
209 | p101
210 | NNNI-1
211 | I-1
212 | I0
213 | tp102
214 | bI00
215 | S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\t\x00\x00\x00\x00\x00\x00\x00\n\x00\x00\x00\x00\x00\x00\x00\x0b\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00\x00\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x12\x00\x00\x00\x00\x00\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x15\x00\x00\x00\x00\x00\x00\x00\x16\x00\x00\x00\x00\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00\x19\x00\x00\x00\x00\x00\x00\x00\x1a\x00\x00\x00\x00\x00\x00\x00\x1b\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x00\x00\x00\x00\x00\x00\x1d\x00\x00\x00\x00\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x1f\x00\x00\x00\x00\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00!\x00\x00\x00\x00\x00\x00\x00"\x00\x00\x00\x00\x00\x00\x00#\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00\x00\x00\x00\x00%\x00\x00\x00\x00\x00\x00\x00&\x00\x00\x00\x00\x00\x00\x00\'\x00\x00\x00\x00\x00\x00\x00(\x00\x00\x00\x00\x00\x00\x00)\x00\x00\x00\x00\x00\x00\x00*\x00\x00\x00\x00\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00,\x00\x00\x00\x00\x00\x00\x00-\x00\x00\x00\x00\x00\x00\x00.\x00\x00\x00\x00\x00\x00\x00/\x00\x00\x00\x00\x00\x00\x000\x00\x00\x00\x00\x00\x00\x001\x00\x00\x00\x00\x00\x00\x002\x00\x00\x00\x00\x00\x00\x003\x00\x00\x00\x00\x00\x00\x004\x00\x00\x00\x00\x00\x00\x005\x00\x00\x00\x00\x00\x00\x006\x00\x00\x00\x00\x00\x00\x007\x00\x00\x00\x00\x00\x00\x008\x00\x00\x00\x00\x00\x00\x009\x00\x00\x00\x00\x00\x00\x00:\x00\x00\x00\x00\x00\x00\x00;\x00\x00\x00\x00\x00\x00\x00<\x00\x00\x00\x00\x00\x00\x00=\x00\x00\x00\x00\x00\x00\x00>\x00\x00\x00\x00\x00\x00\x00?\x00\x00\x00\x00\x00\x00\x00'
216 | p103
217 | tp104
218 | ba(lp105
219 | g16
220 | atp106
221 | bsS'name'
222 | p107
223 | Nsb.
--------------------------------------------------------------------------------
/Old Python Code/Julia Code.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:2119dd4eb940c5d56c1cf3c63fe41c2b7d02d5ac902ce8287eaa7c250c822c89"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "import csv\n",
16 | "import pickle\n",
17 | "\n",
18 | "needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]\n",
19 | "years = [2008]\n",
20 | "\n",
21 | "def ComputeDayofYear(row):\n",
22 | " \"\"\"This function will return an integer to represent the day of the year given an integer\n",
23 | " representing month and an integer representing the day of the month. This number will\n",
24 | " correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned\n",
25 | " as 0. Feb 29th will be returned as 59.\"\"\"\n",
26 | "\n",
27 | " if(row[0] == '1'):\n",
28 | " calc = 0 + int(row[1]) - 1\n",
29 | " row[1] = str(calc)\n",
30 | " elif(row[0] == '2'):\n",
31 | " calc = 31 + int(row[1]) - 1\n",
32 | " row[1] = str(calc)\n",
33 | " elif(row[0] == '3'):\n",
34 | " calc = 60 + int(row[1]) - 1\n",
35 | " row[1] = str(calc)\n",
36 | " elif(row[0] == '4'):\n",
37 | " calc = 91 + int(row[1]) - 1\n",
38 | " row[1] = str(calc)\n",
39 | " elif(row[0] == '5'):\n",
40 | " calc = 121 + int(row[1]) - 1\n",
41 | " row[1] = str(calc)\n",
42 | " elif(row[0] == '6'):\n",
43 | " calc = 152 + int(row[1]) - 1\n",
44 | " row[1] = str(calc)\n",
45 | " elif(row[0] == '7'):\n",
46 | " calc = 182 + int(row[1]) - 1\n",
47 | " row[1] = str(calc)\n",
48 | " elif(row[0] == '8'):\n",
49 | " calc = 213 + int(row[1]) - 1\n",
50 | " row[1] = str(calc)\n",
51 | " elif(row[0] == '9'):\n",
52 | " calc = 244 + int(row[1]) - 1\n",
53 | " row[1] = str(calc)\n",
54 | " elif(row[0] == '10'):\n",
55 | " calc = 274 + int(row[1]) - 1\n",
56 | " row[1] = str(calc)\n",
57 | " elif(row[0] == '11'):\n",
58 | " calc = 305 + int(row[1]) - 1\n",
59 | " row[1] = str(calc)\n",
60 | " elif(row[0] == '12'):\n",
61 | " calc = 335 + int(row[1]) - 1\n",
62 | " row[1] = str(calc)\n",
63 | " return row\n",
64 | "\n",
65 | "\n",
66 | "def DiscretizeDepTime(row):\n",
67 | " \"\"\"This function takes a scheduled departure time, classifies the departure time as:\n",
68 | " morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value\n",
69 | " is assumed to be an integer in 24-hour time format. These labels will correspond to\n",
70 | " variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.\n",
71 | " An error time is returned as morning.\"\"\"\n",
72 | "\n",
73 | " if(int(row[3]) <= 559):\n",
74 | " row[3] = '2'\n",
75 | " elif(int(row[3]) >= 600 and int(row[3]) <= 1259):\n",
76 | " row[3] = '0'\n",
77 | " elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):\n",
78 | " row[3] = '1'\n",
79 | " elif(int(row[3]) >= 1800):\n",
80 | " row[3] = '2'\n",
81 | " else:\n",
82 | " row[3] = '0'\n",
83 | " return row\n",
84 | "\n",
85 | "\n",
86 | "def AddDepVar(row):\n",
87 | " \"\"\"This function adds a classification label based on the length of the recorded\n",
88 | " Departure Delay in the data set. It assumes an input integer value of the delay in mins.\n",
89 | " By airline industry standards, flight delays are defined as departure delays greater than\n",
90 | " or equal to 15 minutes. For delayed flights, this variable will have value \"1\".\n",
91 | " For on time flights, it will have value \"0\". Default value will be set at \"0\".\"\"\"\n",
92 | "\n",
93 | " if(row[6] >= '15'):\n",
94 | " row[6] = '1'\n",
95 | " else:\n",
96 | " row[6] = '0'\n",
97 | " return row\n",
98 | "\n",
99 | "def SaveData(data, pickle_file_name):\n",
100 | " \"\"\"This function pickles each file.\"\"\"\n",
101 | "\n",
102 | " f = open (pickle_file_name, \"w\")\n",
103 | " pickle.dump(data, f)\n",
104 | " f.close()\n",
105 | "\n",
106 | "for i in years:\n",
107 | " data = []\n",
108 | " file_path='C:\\\\data\\\\airline\\\\'+str(i) + '.csv'\n",
109 | " pickle_file_name = 'data' + str(i)\n",
110 | " with open(file_path, 'r') as data_csv:\n",
111 | " csv_reader = csv.reader(data_csv, delimiter=',')\n",
112 | " for row in list(csv_reader):\n",
113 | " if row[21] == '0':\n",
114 | " content = list(row[i] for i in needed_cols)\n",
115 | " content2 = ComputeDayofYear(content)\n",
116 | " content3 = DiscretizeDepTime(content2)\n",
117 | " content4 = AddDepVar(content3)\n",
118 | " data.append(content4)\n",
119 | " SaveData(data, pickle_file_name)"
120 | ],
121 | "language": "python",
122 | "metadata": {},
123 | "outputs": []
124 | }
125 | ],
126 | "metadata": {}
127 | }
128 | ]
129 | }
--------------------------------------------------------------------------------
/Old Python Code/NB.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import numpy as np
3 | import pandas as pd
4 | import sklearn
5 | from sklearn.naive_bayes import *
6 | from sklearn.metrics import *
7 | import os
8 | import cPickle
9 |
10 | # Setting up constants
11 | print "Setting constants..."
12 |
13 | TRAINING_LINE_NUMBER = 100
14 | YEARS = ['2008', '2007']
15 | # INPUT_FILE_PATH = "/home/dmenghani/python/" # Unix path
16 | INPUT_FILE_PATH = "C:\\data\\airline\\" # Windows path
17 | # YEARS = ['2008']
18 |
19 | SKIP_FIRST_LINE = True # To skip the first line, as it's the header
20 |
21 | master = []
22 | print "Reading into Pandas frame..."
23 | try:
24 | for year in YEARS:
25 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))
26 | print path
27 | dfPart = pd.read_csv(
28 | path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[
29 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier',
30 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'DepDelay', u'Cancelled'
31 | ])
32 | dfPart = dfPart[dfPart['Cancelled'] == 0]
33 | print len(dfPart)
34 | master.append(dfPart)
35 | except Exception as e:
36 | print "Supplemental Data Import failed", e
37 |
38 | dfMaster = pd.concat(master, ignore_index=True)
39 | print "Total length - ", len(dfMaster)
40 |
41 |
42 | dfMaster.fillna(0, inplace=True)
43 | dfMaster['Year'] = dfMaster['Year'].astype('int')
44 | dfMaster['Month'] = dfMaster['Month'].astype('int')
45 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
46 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')
47 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
48 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')
49 |
50 | print "Length of pandas frame - ", len(dfMaster)
51 | print "Dataframe columns - ", dfMaster.columns
52 |
53 | df = dfMaster
54 |
55 | print "Calculating classification label..."
56 | df['label'] = 0
57 | df.label[df.DepDelay >= 15] = 1
58 | df.label[df.DepDelay < 15] = 0
59 | del df['DepDelay']
60 |
61 | print "Converting categorical data to numeric..."
62 | for col in set(df.columns):
63 | # print col, train[col].dtype
64 | if df[col].dtype == np.dtype('object'):
65 | print "Converting...", col
66 | if col == 'TailNum':
67 | s = np.unique(df[col].values)
68 | TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)
69 | # print TailNum
70 | if col == 'UniqueCarrier':
71 | s = np.unique(df[col].values)
72 | UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)
73 | # print UniqueCarrier
74 | if col == 'Dest':
75 | s = np.unique(df[col].values)
76 | Dest = pd.Series([x[0] for x in enumerate(s)], index=s)
77 | # print Dest
78 | if col == 'Origin':
79 | s = np.unique(df[col].values)
80 | Origin = pd.Series([x[0] for x in enumerate(s)], index=s)
81 | # print Origin
82 |
83 |
84 | def getTailNum(inTailNum):
85 | # print "In...",type(inTailNum)
86 | out = []
87 | for x, y in inTailNum.iteritems():
88 | # print "x,y, out",x,y,TailNum.get_value(y)
89 | out.append(TailNum.get_value(y) + 1)
90 | # print "final out", out
91 | return out
92 |
93 |
94 | def getDest(inDest):
95 | out = []
96 | for x, y in inDest.iteritems():
97 | out.append(Dest.get_value(y) + 1)
98 | return out
99 |
100 |
101 | def getOrigin(inOrign):
102 | out = []
103 | for x, y in inOrign.iteritems():
104 | out.append(Origin.get_value(y) + 1)
105 | return out
106 |
107 |
108 | def getCarrier(inCarrier):
109 | out = []
110 | for x, y in inCarrier.iteritems():
111 | out.append(UniqueCarrier.get_value(y) + 1)
112 | return out
113 |
114 | df['TailNum'] = getTailNum(df['TailNum'])
115 | print "TailNum completed."
116 |
117 | df['Dest'] = getDest(df['Dest'])
118 | print "Dest completed."
119 |
120 | df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])
121 | print "UniqueCarrier completed."
122 |
123 | df['Origin'] = getOrigin(df['Origin'])
124 | print "Origin completed."
125 |
126 | print "Conversion to numeric completed."
127 |
128 | print "Pickling converted data..."
129 | df.to_pickle(os.path.join(INPUT_FILE_PATH, "df.pkl"))
130 |
131 | print "Begin classification...75% training, 25% testing, randomly chosen"
132 | target_names = np.array(['Delayed', 'Not Delayed'])
133 | # add columns to your data frame
134 | df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75
135 | # define training and test sets
136 | train = df[df['is_train'] == True]
137 | test = df[df['is_train'] == False]
138 | trainTargets = np.array(train['label']).astype(int)
139 | testTargets = np.array(test['label']).astype(int)
140 | features = df.columns[0:9]
141 | print "Model fitting and prediction started..."
142 | gnb = MultinomialNB()
143 | # train model
144 | y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])
145 | print "Classification completed."
146 | print "Calculating metrcs..."
147 | test['pred_label'] = y_gnb
148 | test.head()
149 | acc = zip(test['label'], test['pred_label'])
150 | match_count = 0
151 | for i in acc:
152 | if i[0] == i[1]:
153 | match_count += 1
154 | print "Matches - ", match_count
155 | print "Total length - ", len(acc)
156 | print "Accuracy:", float(match_count) / len(acc)
157 |
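158 | # ----------------------------------------------------------------------
159 | # Editor's sketch (not part of the original script): getTailNum, getDest,
160 | # getOrigin and getCarrier above are the same lookup against different
161 | # mapping Series, and the manual match loop recomputes what
162 | # sklearn.metrics (imported above) already provides. A minimal sketch,
163 | # assuming the mapping Series built above are in scope; encode is a name
164 | # introduced here for illustration.
165 |
166 |
167 | def encode(values, mapping):
168 |     """Map each categorical value to its 1-based integer code."""
169 |     return [mapping.get_value(y) + 1 for x, y in values.iteritems()]
170 |
171 | # e.g. df['Origin'] = encode(df['Origin'], Origin)
172 |
173 | print "Accuracy (sklearn):", accuracy_score(test['label'], test['pred_label'])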
--------------------------------------------------------------------------------
/Old Python Code/Origin.pkl:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (cpandas.core.series
5 | Series
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_data'
14 | p6
15 | g0
16 | (cpandas.core.internals
17 | SingleBlockManager
18 | p7
19 | g2
20 | Ntp8
21 | Rp9
22 | ((lp10
23 | cnumpy.core.multiarray
24 | _reconstruct
25 | p11
26 | (cpandas.core.index
27 | Index
28 | p12
29 | (I0
30 | tp13
31 | S'b'
32 | p14
33 | tp15
34 | Rp16
35 | ((I1
36 | (L64L
37 | tp17
38 | cnumpy
39 | dtype
40 | p18
41 | (S'O8'
42 | p19
43 | I0
44 | I1
45 | tp20
46 | Rp21
47 | (I3
48 | S'|'
49 | p22
50 | NNNI-1
51 | I-1
52 | I63
53 | tp23
54 | bI00
55 | (lp24
56 | S'ABQ'
57 | p25
58 | aS'ALB'
59 | p26
60 | aS'AMA'
61 | p27
62 | aS'AUS'
63 | p28
64 | aS'BDL'
65 | p29
66 | aS'BHM'
67 | p30
68 | aS'BNA'
69 | p31
70 | aS'BOI'
71 | p32
72 | aS'BUF'
73 | p33
74 | aS'BUR'
75 | p34
76 | aS'BWI'
77 | p35
78 | aS'CLE'
79 | p36
80 | aS'CMH'
81 | p37
82 | aS'CRP'
83 | p38
84 | aS'DAL'
85 | p39
86 | aS'DEN'
87 | p40
88 | aS'DTW'
89 | p41
90 | aS'ELP'
91 | p42
92 | aS'FLL'
93 | p43
94 | aS'GEG'
95 | p44
96 | aS'HOU'
97 | p45
98 | aS'HRL'
99 | p46
100 | aS'IAD'
101 | p47
102 | aS'IND'
103 | p48
104 | aS'ISP'
105 | p49
106 | aS'JAN'
107 | p50
108 | aS'JAX'
109 | p51
110 | aS'LAS'
111 | p52
112 | aS'LAX'
113 | p53
114 | aS'LBB'
115 | p54
116 | aS'LIT'
117 | p55
118 | aS'MAF'
119 | p56
120 | aS'MCI'
121 | p57
122 | aS'MCO'
123 | p58
124 | aS'MDW'
125 | p59
126 | aS'MHT'
127 | p60
128 | aS'MSY'
129 | p61
130 | aS'OAK'
131 | p62
132 | aS'OKC'
133 | p63
134 | aS'OMA'
135 | p64
136 | aS'ONT'
137 | p65
138 | aS'ORF'
139 | p66
140 | aS'PBI'
141 | p67
142 | aS'PDX'
143 | p68
144 | aS'PHL'
145 | p69
146 | aS'PHX'
147 | p70
148 | aS'PIT'
149 | p71
150 | aS'PVD'
151 | p72
152 | aS'RDU'
153 | p73
154 | aS'RNO'
155 | p74
156 | aS'RSW'
157 | p75
158 | aS'SAN'
159 | p76
160 | aS'SAT'
161 | p77
162 | aS'SDF'
163 | p78
164 | aS'SEA'
165 | p79
166 | aS'SFO'
167 | p80
168 | aS'SJC'
169 | p81
170 | aS'SLC'
171 | p82
172 | aS'SMF'
173 | p83
174 | aS'SNA'
175 | p84
176 | aS'STL'
177 | p85
178 | aS'TPA'
179 | p86
180 | aS'TUL'
181 | p87
182 | aS'TUS'
183 | p88
184 | atp89
185 | (Ntp90
186 | tp91
187 | ba(lp92
188 | g11
189 | (cnumpy
190 | ndarray
191 | p93
192 | (I0
193 | tp94
194 | g14
195 | tp95
196 | Rp96
197 | (I1
198 | (L64L
199 | tp97
200 | g18
201 | (S'i8'
202 | p98
203 | I0
204 | I1
205 | tp99
206 | Rp100
207 | (I3
208 | S'<'
209 | p101
210 | NNNI-1
211 | I-1
212 | I0
213 | tp102
214 | bI00
215 | S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\t\x00\x00\x00\x00\x00\x00\x00\n\x00\x00\x00\x00\x00\x00\x00\x0b\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00\x00\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x12\x00\x00\x00\x00\x00\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x15\x00\x00\x00\x00\x00\x00\x00\x16\x00\x00\x00\x00\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00\x19\x00\x00\x00\x00\x00\x00\x00\x1a\x00\x00\x00\x00\x00\x00\x00\x1b\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x00\x00\x00\x00\x00\x00\x1d\x00\x00\x00\x00\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x1f\x00\x00\x00\x00\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00!\x00\x00\x00\x00\x00\x00\x00"\x00\x00\x00\x00\x00\x00\x00#\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00\x00\x00\x00\x00%\x00\x00\x00\x00\x00\x00\x00&\x00\x00\x00\x00\x00\x00\x00\'\x00\x00\x00\x00\x00\x00\x00(\x00\x00\x00\x00\x00\x00\x00)\x00\x00\x00\x00\x00\x00\x00*\x00\x00\x00\x00\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00,\x00\x00\x00\x00\x00\x00\x00-\x00\x00\x00\x00\x00\x00\x00.\x00\x00\x00\x00\x00\x00\x00/\x00\x00\x00\x00\x00\x00\x000\x00\x00\x00\x00\x00\x00\x001\x00\x00\x00\x00\x00\x00\x002\x00\x00\x00\x00\x00\x00\x003\x00\x00\x00\x00\x00\x00\x004\x00\x00\x00\x00\x00\x00\x005\x00\x00\x00\x00\x00\x00\x006\x00\x00\x00\x00\x00\x00\x007\x00\x00\x00\x00\x00\x00\x008\x00\x00\x00\x00\x00\x00\x009\x00\x00\x00\x00\x00\x00\x00:\x00\x00\x00\x00\x00\x00\x00;\x00\x00\x00\x00\x00\x00\x00<\x00\x00\x00\x00\x00\x00\x00=\x00\x00\x00\x00\x00\x00\x00>\x00\x00\x00\x00\x00\x00\x00?\x00\x00\x00\x00\x00\x00\x00'
216 | p103
217 | tp104
218 | ba(lp105
219 | g16
220 | atp106
221 | bsS'name'
222 | p107
223 | Nsb.
--------------------------------------------------------------------------------
/Old Python Code/UniqueCarrier.pkl:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (cpandas.core.series
5 | Series
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_data'
14 | p6
15 | g0
16 | (cpandas.core.internals
17 | SingleBlockManager
18 | p7
19 | g2
20 | Ntp8
21 | Rp9
22 | ((lp10
23 | cnumpy.core.multiarray
24 | _reconstruct
25 | p11
26 | (cpandas.core.index
27 | Index
28 | p12
29 | (I0
30 | tp13
31 | S'b'
32 | p14
33 | tp15
34 | Rp16
35 | ((I1
36 | (L1L
37 | tp17
38 | cnumpy
39 | dtype
40 | p18
41 | (S'O8'
42 | p19
43 | I0
44 | I1
45 | tp20
46 | Rp21
47 | (I3
48 | S'|'
49 | p22
50 | NNNI-1
51 | I-1
52 | I63
53 | tp23
54 | bI00
55 | (lp24
56 | S'WN'
57 | p25
58 | atp26
59 | (Ntp27
60 | tp28
61 | ba(lp29
62 | g11
63 | (cnumpy
64 | ndarray
65 | p30
66 | (I0
67 | tp31
68 | g14
69 | tp32
70 | Rp33
71 | (I1
72 | (L1L
73 | tp34
74 | g18
75 | (S'i8'
76 | p35
77 | I0
78 | I1
79 | tp36
80 | Rp37
81 | (I3
82 | S'<'
83 | p38
84 | NNNI-1
85 | I-1
86 | I0
87 | tp39
88 | bI00
89 | S'\x00\x00\x00\x00\x00\x00\x00\x00'
90 | p40
91 | tp41
92 | ba(lp42
93 | g16
94 | atp43
95 | bsS'name'
96 | p44
97 | Nsb.
--------------------------------------------------------------------------------
/Old Python Code/Untitled0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:44f7be7f1af03eb634d779b3e3fc1b7473ad8af24b380e9e53f9a15ad5274aaf"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "from __future__ import division\n",
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import sklearn\n",
19 | "from sklearn.naive_bayes import *\n",
20 | "from sklearn.metrics import *\n",
21 | "import os\n",
22 | "import cPickle\n",
23 | "import sys\n",
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "from optparse import OptionParser\n",
27 | "from sklearn import metrics, preprocessing\n",
28 | "from sklearn import svm, naive_bayes, neighbors, tree\n",
29 | "from sklearn.ensemble import AdaBoostClassifier\n",
30 | "from sklearn import cross_validation\n",
31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n",
32 | "from sklearn.svm import SVC # support vector machine classifier\n",
33 | "# hyperparameter grid search to find best model parameters\n",
34 | "from sklearn.grid_search import GridSearchCV\n",
35 | "from sklearn import preprocessing # preprocess string labels into numerics\n",
36 | "from sklearn import *\n",
37 | "from sklearn.metrics import precision_recall_fscore_support\n",
38 | "from sklearn.metrics import classification_report\n",
39 | "\n",
40 | "\n",
41 | "# In[135]:\n",
42 | "\n",
43 | "# Setting up constants\n",
44 | "print \"Setting constants...\"\n",
45 | "\n",
46 | "TRAINING_LINE_NUMBER = 500000\n",
47 | "YEARS = ['2006', '2008', '2007']\n",
48 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n",
49 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n",
50 | "# YEARS = ['2008']\n",
51 | "\n",
52 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n",
53 | "\n",
54 | "master = []\n",
55 | "print \"Reading into Pandas frame...\"\n",
56 | "try:\n",
57 | " for year in YEARS:\n",
58 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n",
59 | " print \"\\n\", path\n",
60 | " dfPart = pd.read_csv(\n",
61 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n",
62 | " u'Year',\n",
63 | " u'Month',\n",
64 | " u'DayofMonth',\n",
65 | " u'DayOfWeek',\n",
66 | " u'UniqueCarrier',\n",
67 | " u'DepTime',\n",
68 | " u'TailNum',\n",
69 | " u'Origin',\n",
70 | " u'Dest',\n",
71 | " u'DepDelay',\n",
72 | " # u'ArrDelay',\n",
73 | " u'Cancelled',\n",
74 | " # u'ArrTime',\n",
75 | " # u'ArrDelay',\n",
76 | " # u'Distance'\n",
77 | " ])\n",
78 | " print len(dfPart)\n",
79 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n",
80 | " print \"Removed cancelled flights, new length - \", len(dfPart)\n",
81 | " master.append(dfPart)\n",
82 | " print\n",
83 | "except Exception as e:\n",
84 | " print \"Supplemental Data Import failed\", e\n",
85 | "\n",
86 | "dfMaster = pd.concat(master, ignore_index=True)\n",
87 | "master = []\n",
88 | "dfPart = []\n",
89 | "\n",
90 | "print \"Total length - \", len(dfMaster)\n",
91 | "del dfMaster['Cancelled']\n",
92 | "\n",
93 | "dfMaster.fillna(0, inplace=True)\n",
94 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n",
95 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n",
96 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n",
97 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n",
98 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n",
99 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n",
100 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n",
101 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n",
102 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n",
103 | "\n",
104 | "df = dfMaster\n",
105 | "\n",
106 | "print \"Calculating classification label...\"\n",
107 | "df['label'] = 0\n",
108 | "df.label[df.DepDelay >= 15] = 1\n",
109 | "df.label[df.DepDelay < 15] = 0\n",
110 | "\n",
111 | "# df['DepDelay'][df.DepDelay < 0] = 0\n",
112 | "del df['DepDelay']\n",
113 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n",
114 | "\n",
115 | "print \"Dataframe shape - \", df.shape\n",
116 | "print \"Columns -\", df.columns\n",
117 | "\n",
118 | "\n",
119 | "# In[136]:\n",
120 | "\n",
121 | "print \"Converting categorical data to numeric...\"\n",
122 | "for col in set(df.columns):\n",
123 | "# print col, train[col].dtype\n",
124 | " if df[col].dtype == np.dtype('object'):\n",
125 | " print \"Converting...\", col\n",
126 | " if col == 'TailNum':\n",
127 | " s = np.unique(df[col].values)\n",
128 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
129 | "# print TailNum\n",
130 | " if col == 'UniqueCarrier':\n",
131 | " s = np.unique(df[col].values)\n",
132 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
133 | "# print UniqueCarrier\n",
134 | " if col == 'Dest':\n",
135 | " s = np.unique(df[col].values)\n",
136 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
137 | "# print Dest\n",
138 | " if col == 'Origin':\n",
139 | " s = np.unique(df[col].values)\n",
140 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
141 | "# print Origin\n",
142 | "\n",
143 | "\n",
144 | "def getTailNum(inTailNum):\n",
145 | "# print \"In...\",type(inTailNum)\n",
146 | " out = []\n",
147 | " for x, y in inTailNum.iteritems():\n",
148 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n",
149 | " out.append(TailNum.get_value(y) + 1)\n",
150 | "# print \"final out\", out\n",
151 | " return out\n",
152 | "\n",
153 | "\n",
154 | "def getDest(inDest):\n",
155 | " out = []\n",
156 | " for x, y in inDest.iteritems():\n",
157 | " out.append(Dest.get_value(y) + 1)\n",
158 | " return out\n",
159 | "\n",
160 | "\n",
161 | "def getOrigin(inOrign):\n",
162 | " out = []\n",
163 | " for x, y in inOrign.iteritems():\n",
164 | " out.append(Origin.get_value(y) + 1)\n",
165 | " return out\n",
166 | "\n",
167 | "\n",
168 | "def getCarrier(inCarrier):\n",
169 | " out = []\n",
170 | " for x, y in inCarrier.iteritems():\n",
171 | " out.append(UniqueCarrier.get_value(y) + 1)\n",
172 | " return out\n",
173 | "\n",
174 | "df['TailNum'] = getTailNum(df['TailNum'])\n",
175 | "print \"TailNum completed.\"\n",
176 | "\n",
177 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n",
178 | "print \"UniqueCarrier completed.\"\n",
179 | "\n",
180 | "df['Dest'] = getDest(df['Dest'])\n",
181 | "print \"Dest completed.\"\n",
182 | "\n",
183 | "df['Origin'] = getOrigin(df['Origin'])\n",
184 | "print \"Origin completed.\"\n",
185 | "\n",
186 | "print \"Conversion to numeric completed.\"\n",
187 | "\n",
188 | "# print \"Pickling converted data...\"\n",
189 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")\n"
190 | ],
191 | "language": "python",
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "output_type": "stream",
196 | "stream": "stdout",
197 | "text": [
198 | "Setting constants...\n",
199 | "Reading into Pandas frame...\n",
200 | "\n",
201 | "C:\\data\\airline\\2006.csv\n",
202 | "500000"
203 | ]
204 | },
205 | {
206 | "output_type": "stream",
207 | "stream": "stdout",
208 | "text": [
209 | "\n",
210 | "Removed cancelled flights, new length - "
211 | ]
212 | },
213 | {
214 | "output_type": "stream",
215 | "stream": "stdout",
216 | "text": [
217 | " 491158\n",
218 | "\n",
219 | "\n",
220 | "C:\\data\\airline\\2008.csv\n",
221 | "500000"
222 | ]
223 | },
224 | {
225 | "output_type": "stream",
226 | "stream": "stdout",
227 | "text": [
228 | "\n",
229 | "Removed cancelled flights, new length - "
230 | ]
231 | },
232 | {
233 | "output_type": "stream",
234 | "stream": "stdout",
235 | "text": [
236 | " 484708\n",
237 | "\n",
238 | "\n",
239 | "C:\\data\\airline\\2007.csv\n",
240 | "500000"
241 | ]
242 | },
243 | {
244 | "output_type": "stream",
245 | "stream": "stdout",
246 | "text": [
247 | "\n",
248 | "Removed cancelled flights, new length - "
249 | ]
250 | },
251 | {
252 | "output_type": "stream",
253 | "stream": "stdout",
254 | "text": [
255 | " 487243\n",
256 | "\n",
257 | "Total length - "
258 | ]
259 | },
260 | {
261 | "output_type": "stream",
262 | "stream": "stdout",
263 | "text": [
264 | " 1463109\n",
265 | "Calculating classification label..."
266 | ]
267 | },
268 | {
269 | "output_type": "stream",
270 | "stream": "stdout",
271 | "text": [
272 | "\n",
273 | "Dataframe shape - (1463109, 10)\n",
274 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n",
275 | "Converting categorical data to numeric...\n",
276 | "Converting..."
277 | ]
278 | },
279 | {
280 | "output_type": "stream",
281 | "stream": "stdout",
282 | "text": [
283 | " Origin\n",
284 | "Converting..."
285 | ]
286 | },
287 | {
288 | "output_type": "stream",
289 | "stream": "stdout",
290 | "text": [
291 | " UniqueCarrier\n",
292 | "Converting..."
293 | ]
294 | },
295 | {
296 | "output_type": "stream",
297 | "stream": "stdout",
298 | "text": [
299 | " Dest\n",
300 | "Converting..."
301 | ]
302 | },
303 | {
304 | "output_type": "stream",
305 | "stream": "stdout",
306 | "text": [
307 | " TailNum\n",
308 | "TailNum completed."
309 | ]
310 | },
311 | {
312 | "output_type": "stream",
313 | "stream": "stdout",
314 | "text": [
315 | "\n",
316 | "UniqueCarrier completed."
317 | ]
318 | },
319 | {
320 | "output_type": "stream",
321 | "stream": "stdout",
322 | "text": [
323 | "\n",
324 | "Dest completed."
325 | ]
326 | },
327 | {
328 | "output_type": "stream",
329 | "stream": "stdout",
330 | "text": [
331 | "\n",
332 | "Origin completed."
333 | ]
334 | },
335 | {
336 | "output_type": "stream",
337 | "stream": "stdout",
338 | "text": [
339 | "\n",
340 | "Conversion to numeric completed.\n"
341 | ]
342 | }
343 | ],
344 | "prompt_number": 13
345 | },
346 | {
347 | "cell_type": "code",
348 | "collapsed": false,
349 | "input": [
350 | "\n",
351 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n",
352 | "\n",
353 | "# add columns to your data frame\n",
354 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n",
355 | "\n",
356 | "# define training and test sets\n",
357 | "train = df[df['is_train'] == True]\n",
358 | "test = df[df['is_train'] == False]\n",
359 | "trainTargets = np.array(train['label']).astype(int)\n",
360 | "testTargets = np.array(test['label']).astype(int)\n",
361 | "features = df.columns[0:9]\n",
362 | "print \"Features - \",features\n",
363 | "print \"Model fitting and prediction started...\"\n",
364 | "gnb = GaussianNB()\n",
365 | "\n",
366 | "# train model\n",
367 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n",
368 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n",
369 | "\n",
370 | "print \"Classification completed.\""
371 | ],
372 | "language": "python",
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "output_type": "stream",
377 | "stream": "stdout",
378 | "text": [
379 | "Begin classification...75% training, 25% testing, randomly chosen\n",
380 | "Features - "
381 | ]
382 | },
383 | {
384 | "output_type": "stream",
385 | "stream": "stdout",
386 | "text": [
387 | " Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')\n",
388 | "Model fitting and prediction started...\n",
389 | "Classification completed."
390 | ]
391 | },
392 | {
393 | "output_type": "stream",
394 | "stream": "stdout",
395 | "text": [
396 | "\n",
397 | "Calculating metrcs...\n",
398 | "Accuracy - 0.798698653544\n",
399 | "Confusion metrics\n",
400 | "[[291966 106]\n",
401 | " [ 73525 178]]"
402 | ]
403 | },
404 | {
405 | "output_type": "stream",
406 | "stream": "stdout",
407 | "text": [
408 | "\n",
409 | "Precision - "
410 | ]
411 | },
412 | {
413 | "output_type": "stream",
414 | "stream": "stdout",
415 | "text": [
416 | "0.62676056338\n",
417 | "Recall - "
418 | ]
419 | },
420 | {
421 | "output_type": "stream",
422 | "stream": "stdout",
423 | "text": [
424 | "0.00241509843561\n"
425 | ]
426 | }
427 | ],
428 | "prompt_number": 14
429 | },
430 | {
431 | "cell_type": "code",
432 | "collapsed": false,
433 | "input": [
434 | "print \"Calculating metrcs...\"\n",
435 | "print \"Accuracy - \", accuracy_score(test['label'], y_gnb)\n",
436 | "print \"Confusion metrics\\n\", metrics.confusion_matrix(test['label'], y_gnb,labels=(0,1))\n",
437 | "print \"Precision - \", precision_score(test['label'], y_gnb)\n",
438 | "print \"Recall - \", recall_score(test['label'], y_gnb)\n"
439 | ],
440 | "language": "python",
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "output_type": "stream",
445 | "stream": "stdout",
446 | "text": [
447 | "Calculating metrcs...\n",
448 | "Accuracy - 0.798698653544\n",
449 | "Confusion metrics\n",
450 | "[[291966 106]\n",
451 | " [ 73525 178]]"
452 | ]
453 | },
454 | {
455 | "output_type": "stream",
456 | "stream": "stdout",
457 | "text": [
458 | "\n",
459 | "Precision - "
460 | ]
461 | },
462 | {
463 | "output_type": "stream",
464 | "stream": "stdout",
465 | "text": [
466 | "0.62676056338\n",
467 | "Recall - "
468 | ]
469 | },
470 | {
471 | "output_type": "stream",
472 | "stream": "stdout",
473 | "text": [
474 | "0.00241509843561\n"
475 | ]
476 | }
477 | ],
478 | "prompt_number": 25
479 | },
480 | {
481 | "cell_type": "code",
482 | "collapsed": false,
483 | "input": [
484 | "testSFO = test[test['Origin'] == Origin['SFO']]\n",
485 | "print len(testSFO)\n",
486 | "\n",
487 | "testOAK = test[test['Origin'] == Origin['OAK']]\n",
488 | "print len(testOAK)\n"
489 | ],
490 | "language": "python",
491 | "metadata": {},
492 | "outputs": [
493 | {
494 | "output_type": "stream",
495 | "stream": "stdout",
496 | "text": [
497 | "3563\n",
498 | "40\n"
499 | ]
500 | }
501 | ],
502 | "prompt_number": 22
503 | },
504 | {
505 | "cell_type": "code",
506 | "collapsed": false,
507 | "input": [
508 | " np.random.randint(2000, size=10)\n",
509 | " "
510 | ],
511 | "language": "python",
512 | "metadata": {},
513 | "outputs": [
514 | {
515 | "metadata": {},
516 | "output_type": "pyout",
517 | "prompt_number": 27,
518 | "text": [
519 | "array([ 437, 1815, 742, 148, 1399, 1171, 205, 1480, 838, 1437])"
520 | ]
521 | }
522 | ],
523 | "prompt_number": 27
524 | }
525 | ],
526 | "metadata": {}
527 | }
528 | ]
529 | }
--------------------------------------------------------------------------------
/Old Python Code/Untitled1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:42405ac43042e4a863e6490ca6e8de6e19a63251aec5c9df6ebb479db0a2da04"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "from __future__ import division\n",
16 | "import pickle\n",
17 | "import sklearn\n",
18 | "from sklearn.naive_bayes import *\n",
19 | "import pandas as pd\n",
20 | "import numpy as np\n",
21 | "from sklearn import *\n",
22 | "import os\n",
23 | "from sklearn.metrics import *\n",
24 | "from sklearn import metrics, preprocessing\n",
25 | "from sklearn import svm, naive_bayes, neighbors, tree\n",
26 | "from sklearn.ensemble import AdaBoostClassifier\n",
27 | "\n",
28 | "\n",
29 | "def createPickle(data, filename):\n",
30 | " with open(filename, 'wb') as f:\n",
31 | " pickle.dump(data, f)\n",
32 | " print \"Pickled\", filename\n",
33 | "\n",
34 | "\n",
35 | "# Setting up constants\n",
36 | "print \"Setting constants...\"\n",
37 | "\n",
38 | "TRAINING_LINE_NUMBER = 10000\n",
39 | "# YEARS = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']\n",
40 | "# YEARS = ['2008', '2006', '2007']\n",
41 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n",
42 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n",
43 | "YEARS = ['2008']\n",
44 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n",
45 | "\n",
46 | "master = []\n",
47 | "print \"Reading into Pandas frame...\"\n",
48 | "try:\n",
49 | " for year in YEARS:\n",
50 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n",
51 | " print \"\\n\", path\n",
52 | " dfPart = pd.read_csv(\n",
53 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n",
54 | " u'Year',\n",
55 | " u'Month',\n",
56 | " u'DayofMonth',\n",
57 | " u'DayOfWeek',\n",
58 | " u'UniqueCarrier',\n",
59 | " u'DepTime',\n",
60 | " u'TailNum',\n",
61 | " u'Origin',\n",
62 | " u'Dest',\n",
63 | " u'DepDelay',\n",
64 | " # u'ArrDelay',\n",
65 | " u'Cancelled',\n",
66 | " # u'ArrTime',\n",
67 | " # u'ArrDelay',\n",
68 | " # u'Distance'\n",
69 | " ])\n",
70 | " print len(dfPart)\n",
71 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n",
72 | " # dfPart['Year'] = year\n",
73 | " # rows = np.random.choice(\n",
74 | " # np.random.permutation(dfPart.index.values), len(dfPart) // 1, replace=False)\n",
75 | " # print rows\n",
76 | " # sampled_dfPart = dfPart.ix[rows]\n",
77 | " sampled_dfPart = dfPart\n",
78 | " print \"Removed cancelled flights, new length - \", len(sampled_dfPart)\n",
79 | " master.append(sampled_dfPart)\n",
80 | " print\n",
81 | "except Exception as e:\n",
82 | " print \"Supplemental Data Import failed\", e\n",
83 | "\n",
84 | "dfMaster = pd.concat(master, ignore_index=True)\n",
85 | "master = []\n",
86 | "dfPart = []\n",
87 | "\n",
88 | "print \"Total length - \", len(dfMaster)\n",
89 | "del dfMaster['Cancelled']\n",
90 | "\n",
91 | "dfMaster.fillna(0, inplace=True)\n",
92 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n",
93 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n",
94 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n",
95 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n",
96 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n",
97 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n",
98 | "\n",
99 | "df = dfMaster\n",
100 | "\n",
101 | "print \"Calculating classification label...\"\n",
102 | "df['label'] = 0\n",
103 | "df.label[df.DepDelay >= 1] = 1\n",
104 | "df.label[df.DepDelay < 1] = 0\n",
105 | "print \"Actual delayed flights -\", np.sum(dfMaster['label']) / len(dfMaster['label'])\n",
106 | "\n",
107 | "# df['DepDelay'][df.DepDelay < 0] = 0\n",
108 | "del df['DepDelay']\n",
109 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n",
110 | "\n",
111 | "print \"Dataframe shape - \", df.shape\n",
112 | "print \"Columns -\", df.columns\n",
113 | "\n",
114 | "\n",
115 | "# In[136]:\n",
116 | "\n",
117 | "print \"Converting categorical data to numeric...\"\n",
118 | "for col in set(df.columns):\n",
119 | "# print col, train[col].dtype\n",
120 | " if df[col].dtype == np.dtype('object'):\n",
121 | " print \"Converting...\", col\n",
122 | " if col == 'TailNum':\n",
123 | " s = np.unique(df[col].values)\n",
124 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
125 | "# print TailNum\n",
126 | " if col == 'UniqueCarrier':\n",
127 | " s = np.unique(df[col].values)\n",
128 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
129 | "# print UniqueCarrier\n",
130 | " if col == 'Dest':\n",
131 | " s = np.unique(df[col].values)\n",
132 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
133 | " # print Dest\n",
134 | " if col == 'Origin':\n",
135 | " s = np.unique(df[col].values)\n",
136 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n",
137 | " # print Origin\n",
138 | "\n",
139 | "# print \"sfo,\", Origin['SFO']\n",
140 | "# print \"oak,\", Origin['OAK']\n",
141 | "\n",
142 | "# createPickle(Dest, 'Dest_2008.pkl')\n",
143 | "# createPickle(Origin, 'Origin_2008.pkl')\n",
144 | "# createPickle(UniqueCarrier, 'UniqueCarrier_2008.pkl')\n",
145 | "# createPickle(TailNum, 'TailNum_2008.pkl')\n",
146 | "\n",
147 | "print \"Pickle completed.\"\n",
148 | "\n",
149 | "\n",
150 | "def getTailNum(inTailNum):\n",
151 | "# print \"In...\",type(inTailNum)\n",
152 | " out = []\n",
153 | " for x, y in inTailNum.iteritems():\n",
154 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n",
155 | " out.append(TailNum.get_value(y) + 1)\n",
156 | "# print \"final out\", out\n",
157 | " return out\n",
158 | "\n",
159 | "\n",
160 | "def getDest(inDest):\n",
161 | " out = []\n",
162 | " for x, y in inDest.iteritems():\n",
163 | " out.append(Dest.get_value(y) + 1)\n",
164 | " return out\n",
165 | "\n",
166 | "\n",
167 | "def getOrigin(inOrign):\n",
168 | " out = []\n",
169 | "# print inOrign\n",
170 | " for x, y in inOrign.iteritems():\n",
171 | " out.append(Origin.get_value(y) + 1)\n",
172 | " return out\n",
173 | "\n",
174 | "\n",
175 | "def getCarrier(inCarrier):\n",
176 | " out = []\n",
177 | " for x, y in inCarrier.iteritems():\n",
178 | " out.append(UniqueCarrier.get_value(y) + 1)\n",
179 | " return out\n",
180 | "\n",
181 | "print \"Before conversion...\"\n",
182 | "print len(dfMaster[dfMaster['Origin'] == 'SFO'])\n",
183 | "print len(dfMaster[dfMaster['Origin'] == 'OAK'])\n",
184 | "# df[df['Origin'] == 'SFO']"
185 | ],
186 | "language": "python",
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "output_type": "stream",
191 | "stream": "stdout",
192 | "text": [
193 | "Setting constants...\n",
194 | "Reading into Pandas frame...\n",
195 | "\n",
196 | "C:\\data\\airline\\2008.csv\n",
197 | "10000\n",
198 | "Removed cancelled flights, new length - 9837\n",
199 | "\n",
200 | "Total length - 9837\n",
201 | "Calculating classification label...\n",
202 | "Actual delayed flights - 0.756429805835\n",
203 | "Dataframe shape - (9837, 10)\n",
204 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n",
205 | "Converting categorical data to numeric...\n",
206 | "Converting... Origin\n",
207 | "Converting... UniqueCarrier\n",
208 | "Converting... Dest\n",
209 | "Converting..."
210 | ]
211 | },
212 | {
213 | "output_type": "stream",
214 | "stream": "stdout",
215 | "text": [
216 | " TailNum\n",
217 | "Pickle completed.\n",
218 | "Before conversion...\n",
219 | "64\n",
220 | "383\n"
221 | ]
222 | }
223 | ],
224 | "prompt_number": 65
225 | },
226 | {
227 | "cell_type": "code",
228 | "collapsed": false,
229 | "input": [
230 | "len(getOrigin(df['Origin']))\n",
231 | "Origin['SFO']+1"
232 | ],
233 | "language": "python",
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "metadata": {},
238 | "output_type": "pyout",
239 | "prompt_number": 69,
240 | "text": [
241 | "56"
242 | ]
243 | }
244 | ],
245 | "prompt_number": 69
246 | },
247 | {
248 | "cell_type": "code",
249 | "collapsed": false,
250 | "input": [
251 | "\n",
252 | "df['TailNum'] = getTailNum(df['TailNum'])\n",
253 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n",
254 | "df['Dest_new'] = getDest(df['Dest'])\n",
255 | "df['Origin_new'] =getOrigin(df['Origin'])\n",
256 | "\n",
257 | "print \"TailNum completed.\"\n",
258 | "print \"UniqueCarrier completed.\"\n",
259 | "print \"Dest completed.\"\n",
260 | "print \"Origin completed.\"\n",
261 | "\n",
262 | "print \"Conversion to numeric completed.\"\n",
263 | "\n",
264 | "print \"After conversion...\"\n",
265 | "# dfSFO = df[df['Origin'].isin([Origin['SFO']])]\n",
266 | "dfSFO = df[df['Origin']==56]\n",
267 | "print \"SFO len - \", len(dfSFO)\n",
268 | "# print Dest[np.unique(dfSFO['Dest'])]\n",
269 | "\n",
270 | "dfOAK = df[df['Origin'].isin([Origin['OAK']])]\n",
271 | "print \"OAK len - \", len(dfOAK)\n",
272 | "# print Dest[np.unique(dfOAK['Dest'])]\n",
273 | "# print Origin+1\n",
274 | "# print Dest+1\n",
275 | "# df[df['Origin'] == 'SFO']\n",
276 | "# df.to_csv(\"why.csv\")"
277 | ],
278 | "language": "python",
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "output_type": "stream",
283 | "stream": "stdout",
284 | "text": [
285 | "TailNum completed.\n",
286 | "UniqueCarrier completed.\n",
287 | "Dest completed.\n",
288 | "Origin completed.\n",
289 | "Conversion to numeric completed.\n",
290 | "After conversion...\n",
291 | "SFO len - 0\n",
292 | "OAK len - 0\n"
293 | ]
294 | }
295 | ],
296 | "prompt_number": 67
297 | },
298 | {
299 | "cell_type": "code",
300 | "collapsed": false,
301 | "input": [
302 | "\n",
303 | "# print \"Begin cross validation...\"\n",
304 | "\n",
305 | "# features = df.columns[0:9]\n",
306 | "# target_names = ['Not Delayed', 'Delayed']\n",
307 | "# accuracy = {}\n",
308 | "# results = {}\n",
309 | "# matrix = {}\n",
310 | "# prec = {}\n",
311 | "# recall = {}\n",
312 | "\n",
313 | "# for year in YEARS:\n",
314 | "# print \"Testing on - \", year\n",
315 | "# train = df[df['Year'] != int(year)]\n",
316 | "# test = df[df['Year'] == int(year)]\n",
317 | "# test = test[test['Origin'].isin([Origin['OAK'], Origin['SFO']])]\n",
318 | "# print len(train), len(test)\n",
319 | "# # rows = np.random.choice(np.random.permutation(\n",
320 | "# # test.index.values), len(test) // 1, replace=False)\n",
321 | "# # print rows\n",
322 | "# # sampled_test = test.ix[rows]\n",
323 | "# sampled_test = test\n",
324 | "# trainTargets = np.array(train['label']).astype(int)\n",
325 | "# testTargets = np.array(sampled_test['label']).astype(int)\n",
326 | "# print \"Train length - \", len(train), \"Test length - \", len(sampled_test)\n",
327 | "# # print train['Year']\n",
328 | "# # print test['Year']\n",
329 | "# print \"Model fitting and prediction started...\"\n",
330 | "# gnb = GaussianNB()\n",
331 | "# y_gnb = gnb.fit(train[features], trainTargets).predict(\n",
332 | "# sampled_test[features])\n",
333 | "# sampled_test['pred_label'] = y_gnb\n",
334 | "# # y_prob = gnb.fit(\n",
335 | "# # train[features], trainTargets).predict_proba(test[features])\n",
336 | "# # print y_prob\n",
337 | "# # test['pred_prob'] = y_prob[1][1]\n",
338 | "# print \"Classification completed.\"\n",
339 | "# createPickle(gnb, INPUT_FILE_PATH + \"classifier_\" + year + \".pkl\")\n",
340 | "# createPickle(y_gnb, INPUT_FILE_PATH + \"label_\" + year + \".pkl\")\n",
341 | "# sampled_test.to_csv(\n",
342 | "# INPUT_FILE_PATH + \"\\dfTest\" + year + \".csv\", index=False)\n",
343 | "\n",
344 | "# print \"\\nCalculating metrcs...\"\n",
345 | "# accuracy[int(year)] = accuracy_score(sampled_test['label'], y_gnb)\n",
346 | "# print \"Accuracy score - \", accuracy[int(year)]\n",
347 | "# prec[int(year)] = precision_score(\n",
348 | "# sampled_test['label'], y_gnb, average='micro')\n",
349 | "# print \"Precision Score - \", prec[int(year)]\n",
350 | "# recall[int(year)] = recall_score(\n",
351 | "# sampled_test['label'], y_gnb, average='micro')\n",
352 | "# print \"Recall Score - \", recall[int(year)]\n",
353 | "# print \"Confusion matrix\"\n",
354 | "# matrix[int(year)] = metrics.confusion_matrix(\n",
355 | "# sampled_test['label'], y_gnb)\n",
356 | "# print matrix[int(year)]\n",
357 | "# results[int(year)] = precision_recall_fscore_support(\n",
358 | "# sampled_test['label'], y_gnb, average='micro')\n",
359 | "# print \"Precision, recall, F-Score, Support - \", results[int(year)]\n",
360 | "# print \"Classification report\"\n",
361 | "# print classification_report(np.array(sampled_test['label']), y_gnb,\n",
362 | "# target_names=target_names)\n",
363 | "# print\n",
364 | "# train = []\n",
365 | "# test = []\n",
366 | "\n",
367 | "# print \"Accuracy\\n\", accuracy\n",
368 | "# print \"\\nPrecision\\n\", prec\n",
369 | "# print \"\\nRecall\\n\", recall\n",
370 | "# print \"\\nMetrics\\n\", results\n",
371 | "# print \"\\nMatrix\\n\", matrix\n",
372 | "\n",
373 | "# print \"\\nMean Cross validation Precision score\", np.mean(pd.Series(prec))\n",
374 | "# print \"\\nMean Cross validation Recall score\", np.mean(pd.Series(recall))\n",
375 | "# print \"\\nMean Cross validation Accuracy score\", np.mean(pd.Series(accuracy))\n",
376 | "\n",
377 | "# # print \"\\nPickling stuff...\"\n",
378 | "# # createPickle(accuracy, 'accuracy.pkl')\n",
379 | "# # createPickle(prec, 'prec.pkl')\n",
380 | "# # createPickle(results, 'results.pkl')\n",
381 | "# # createPickle(matrix, 'matrix.pkl')\n",
382 | "# # createPickle(Dest, 'Dest.pkl')\n",
383 | "# # createPickle(Origin, 'Origin.pkl')\n",
384 | "# # createPickle(UniqueCarrier, 'UniqueCarrier.pkl')\n",
385 | "# # createPickle(TailNum, 'TailNum.pkl')\n"
386 | ],
387 | "language": "python",
388 | "metadata": {},
389 | "outputs": [],
390 | "prompt_number": 33
391 | }
392 | ],
393 | "metadata": {}
394 | }
395 | ]
396 | }
--------------------------------------------------------------------------------
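
A note on the commented-out cell above: it sketches leave-one-year-out cross-validation with scikit-learn's GaussianNB. A minimal runnable version of that loop -- a sketch only, assuming `df` is the numeric-encoded frame with the columns and 0/1 `label` described in Old Python Code/output.txt -- looks like:

    import numpy as np
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score, precision_score, recall_score

    YEARS = range(2001, 2009)
    features = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
                'TailNum', 'Origin', 'Dest', 'Distance']

    scores = {}
    for year in YEARS:
        train = df[df['Year'] != year]   # train on the other seven years
        test = df[df['Year'] == year]    # hold one year out for testing
        gnb = GaussianNB()
        pred = gnb.fit(train[features],
                       np.array(train['label']).astype(int)).predict(test[features])
        scores[year] = (accuracy_score(test['label'], pred),
                        precision_score(test['label'], pred),
                        recall_score(test['label'], pred))
        print year, scores[year]
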
/Old Python Code/Untitled2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:1b2400b379e8920e0aa6061e92e9cd24c52cafda7a0349568949d9c59aa51ae9"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "code",
13 | "collapsed": false,
14 | "input": [
15 | "from __future__ import division\n",
16 | "%matplotlib inline\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "import numpy as np\n",
19 | "from pylab import figure, show\n",
20 | "from pandas import DataFrame, Series\n",
21 | "import pandas as pd\n",
22 | "import csv\n",
23 | "import os\n",
24 | "import statsmodels.formula.api as smf\n",
25 | "import scipy.stats as stats\n",
26 | "import statsmodels.api as sm\n",
27 | "from IPython.core.display import HTML\n",
28 | "from bokeh.plotting import *\n",
29 | "import seaborn as sns\n",
30 | "from bokeh.objects import ColumnDataSource, Range1d\n",
31 | "from math import floor\n",
32 | "import bokeh as bokeh\n",
33 | "import sys\n",
34 | "import csv\n",
35 | "import datetime"
36 | ],
37 | "language": "python",
38 | "metadata": {},
39 | "outputs": [],
40 | "prompt_number": 8
41 | },
42 | {
43 | "cell_type": "code",
44 | "collapsed": false,
45 | "input": [
46 | "\n",
47 | "TIME_DELTA = 3\n",
48 | "\n",
49 | "# for arg in sys.argv:\n",
50 | "# \tif(arg != 'date_graph.py'):\n",
51 | "# \t\tstart_date = datetime.datetime.strptime(arg, '%m-%d-%y')\n",
52 | "# \t\tstart_date = datetime.date(start_date.year, start_date.month, start_date.day)\n",
53 | "\n",
54 | "start_date = datetime.datetime.strptime('05-08-08', '%m-%d-%y')\n",
55 | "print start_date\n",
56 | "\n",
57 | "delta = datetime.timedelta(days=TIME_DELTA)\n",
58 | "begin = start_date - delta\n",
59 | "end = start_date + delta\n",
60 | "\n",
61 | "SFO_Hash = {}\n",
62 | "OAK_Hash = {}\n",
63 | "SFO_count = 0\n",
64 | "OAK_count = 0\n",
65 | "with open('C:\\\\data\\\\airline\\\\_dfTest2008.csv', 'r') as data:\n",
66 | "\tcsv_reader = csv.reader(data, delimiter=',')\n",
67 | "\tfor row in csv_reader:\n",
68 | "\t\tif(row[0] != 'Year'):\n",
69 | "\t\t\tyear = int(row[0])\n",
70 | "\t\t\tmonth = int(row[1])\n",
71 | "\t\t\tdate = int(row[2])\n",
72 | "\t\t\tcurr_date = datetime.datetime(year, month, date)\n",
73 | "\t\t\tif(curr_date >= begin and curr_date <= end):\n",
74 | "\t\t\t\torigin = row[7]\n",
75 | "\t\t\t\tif(origin == '270'):\n",
76 | "\t\t\t\t\tlabel = int(row[10])\n",
77 | "\t\t\t\t\tSFO_count += 1\n",
78 | "\t\t\t\t\tif(curr_date not in SFO_Hash):\n",
79 | "\t\t\t\t\t\tSFO_Hash[curr_date] = [label]\n",
80 | "\t\t\t\t\telse:\n",
81 | "\t\t\t\t\t\tSFO_Hash[curr_date].append(label)\t\n",
82 | "\t\t\t\tif(origin == '215'):\n",
83 | "\t\t\t\t\tlabel = int(row[10])\n",
84 | "\t\t\t\t\tOAK_count += 1\n",
85 | "\t\t\t\t\tif(curr_date not in OAK_Hash):\n",
86 | "\t\t\t\t\t\tOAK_Hash[curr_date] = [label]\n",
87 | "\t\t\t\t\telse:\n",
88 | "\t\t\t\t\t\tOAK_Hash[curr_date].append(label)\n",
89 | "\n",
90 | "iterator = datetime.timedelta(days=1)\n",
91 | "day_values = []\n",
92 | "SFO_Delays = []\n",
93 | "SFO_On_Time = []\n",
94 | "SFO_Flights = []\n",
95 | "SFO_Pct = []\n",
96 | "OAK_Delays = []\n",
97 | "OAK_On_Time = []\n",
98 | "OAK_Flights = []\n",
99 | "OAK_Pct = []\n",
100 | "\n",
101 | "while begin <= end:\n",
102 | "\tif(begin not in SFO_Hash):\n",
103 | "\t\tSFO_Delays.append(0)\n",
104 | "\t\tSFO_On_Time.append(0)\n",
105 | "\t\tSFO_Pct.append(0.00)\n",
106 | "\telse:\n",
107 | "\t\tSFO_Flights = SFO_Hash[begin]\n",
108 | "\t\tdelays = sum(SFO_Flights)\n",
109 | "\t\tnum_flights = len(SFO_Flights)\n",
110 | "\t\tpct = float(delays) / (num_flights + delays)\n",
111 | "\t\tSFO_Delays.append(delays)\n",
112 | "\t\tSFO_On_Time.append(num_flights - delays)\n",
113 | "\t\tSFO_Pct.append(pct)\n",
114 | "\t\n",
115 | "\tif(begin not in OAK_Hash):\n",
116 | "\t\tOAK_Delays.append(0)\n",
117 | "\t\tOAK_On_Time.append(0)\n",
118 | "\t\tOAK_Pct.append(0.00)\n",
119 | "\telse:\n",
120 | "\t\tOAK_Flights = OAK_Hash[begin]\n",
121 | "\t\tdelays = sum(OAK_Flights)\n",
122 | "\t\tnum_flights = len(OAK_Flights)\n",
123 | "\t\tpct = float(delays) / (num_flights + delays)\n",
124 | "\t\tOAK_Delays.append(delays)\n",
125 | "\t\tOAK_On_Time.append(num_flights - delays)\n",
126 | "\t\tOAK_Pct.append(pct)\n",
127 | "\t\n",
128 | "\tday_values.append(begin)\n",
129 | "\tbegin += iterator\n",
130 | "\n",
131 | "print SFO_Pct\n",
132 | "print OAK_Pct"
133 | ],
134 | "language": "python",
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "output_type": "stream",
139 | "stream": "stdout",
140 | "text": [
141 | "2008-05-08 00:00:00\n",
142 | "[0.22568093385214008, 0.23976608187134502, 0.2556390977443609, 0.2560747663551402, 0.263254113345521, 0.2478448275862069, 0.30275229357798167]"
143 | ]
144 | },
145 | {
146 | "output_type": "stream",
147 | "stream": "stdout",
148 | "text": [
149 | "\n",
150 | "[0.24793388429752067, 0.24680851063829787, 0.2697095435684647, 0.27058823529411763, 0.28185328185328185, 0.2613065326633166, 0.3004115226337449]\n"
151 | ]
152 | }
153 | ],
154 | "prompt_number": 4
155 | },
156 | {
157 | "cell_type": "code",
158 | "collapsed": false,
159 | "input": [
160 | "print \"Xastart_date"
161 | ],
162 | "language": "python",
163 | "metadata": {},
164 | "outputs": []
165 | },
166 | {
167 | "cell_type": "code",
168 | "collapsed": false,
169 | "input": [
170 | "plt.title('Probability of Flight Delays at SFO vs. OAK Given Specific Date and +/- 3 Days')\n",
171 | "\n",
172 | "ax1 = plt.subplot(211)\n",
173 | "#ax1.bar(day_values, SFO_Delays, bottom = SFO_On_Time, color = 'red')\n",
174 | "#ax1.bar(day_values, SFO_On_Time, color = 'blue')\n",
175 | "ax1.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n",
176 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n",
177 | "ax1.set_yticks([0, 200, 450])\n",
178 | "ax1.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at SFO')\n",
179 | "\n",
180 | "ax2 = plt.subplot(212)\n",
181 | "#ax2.bar(day_values, OAK_Delays, bottom = OAK_On_Time, color = 'red')\n",
182 | "#ax2.bar(day_values, OAK_On_Time, color = 'blue')\n",
183 | "ax2.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n",
184 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n",
185 | "ax2.set_yticks([0, 200, 450])\n",
186 | "ax2.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at OAK')\n",
187 | "plt.show()"
188 | ],
189 | "language": "python",
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "metadata": {},
194 | "output_type": "display_data",
195 | "text": [
196 | ""
197 | ]
198 | }
199 | ],
200 | "prompt_number": 7
201 | }
202 | ],
203 | "metadata": {}
204 | }
205 | ]
206 | }
--------------------------------------------------------------------------------
/Old Python Code/accuracy.pkl:
--------------------------------------------------------------------------------
1 | (dp0
2 | I2008
3 | cnumpy.core.multiarray
4 | scalar
5 | p1
6 | (cnumpy
7 | dtype
8 | p2
9 | (S'f8'
10 | p3
11 | I0
12 | I1
13 | tp4
14 | Rp5
15 | (I3
16 | S'<'
17 | p6
18 | NNNI-1
19 | I-1
20 | I0
21 | tp7
22 | bS'\x00\x00\x00\x00\x00\x00\xf0?'
23 | p8
24 | tp9
25 | Rp10
26 | sI2001
27 | g1
28 | (g5
29 | S'\x00\x00\x00\x00\x00\x00\xf0?'
30 | p11
31 | tp12
32 | Rp13
33 | sI2007
34 | g1
35 | (g5
36 | S'\x00\x00\x00\x00\x00\x00\xf0?'
37 | p14
38 | tp15
39 | Rp16
40 | s.
--------------------------------------------------------------------------------
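
accuracy.pkl above -- like prec.pkl and matrix.pkl below -- is a protocol-0 (ASCII) pickle of a {test year: metric} dictionary; each '\x00...\xf0?' payload is the little-endian double 1.0. A quick way to inspect them:

    import pickle

    for name in ['accuracy.pkl', 'prec.pkl', 'matrix.pkl']:
        with open(name, 'rb') as f:
            print name, pickle.load(f)  # accuracy.pkl -> {2008: 1.0, 2001: 1.0, 2007: 1.0}
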
/Old Python Code/counter.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | with open('C:\\Dropbox\\Naive Bayes\\Analysis1.csv', 'r') as data:
4 | csv_reader = csv.reader(data, delimiter=',')
5 | SFO_count = 0
6 | OAK_count = 0
7 | for row in csv_reader:
8 | origin = row[1]
9 | if(origin == '270'):
10 | SFO_count += int(row[3])
11 | elif(origin == '215'):
12 | OAK_count += int(row[3])
13 | else:
14 | continue
15 |
16 | print OAK_count
17 | print SFO_count
18 |
--------------------------------------------------------------------------------
/Old Python Code/counter1.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | with open('C:\\Dropbox\\Naive Bayes\\_dfTest2008\\_dfTest2008.csv', 'r') as data:
4 | csv_reader = csv.reader(data, delimiter=',')
5 | SFO_count = 0
6 | OAK_count = 0
7 | for row in csv_reader:
8 | origin = row[7]
9 | if(origin == '270'):
10 | SFO_count += 1
11 | elif(origin == '215'):
12 | OAK_count += 1
13 | else:
14 | continue
15 |
16 | print OAK_count
17 | print SFO_count
18 |
--------------------------------------------------------------------------------
/Old Python Code/data_reader_v2.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import pickle
3 |
4 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]
5 | years = [2008]
6 |
7 | def ComputeDayofYear(row):
8 | """This function will return an integer to represent the day of the year given an integer
9 | representing month and an integer representing the day of the month. This number will
10 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned
11 | as 0. Feb 29th will be returned as 59."""
12 |
13 | if(row[0] == '1'):
14 | calc = 0 + int(row[1]) - 1
15 | row[1] = str(calc)
16 | elif(row[0] == '2'):
17 | calc = 31 + int(row[1]) - 1
18 | row[1] = str(calc)
19 | elif(row[0] == '3'):
20 | calc = 60 + int(row[1]) - 1
21 | row[1] = str(calc)
22 | elif(row[0] == '4'):
23 | calc = 91 + int(row[1]) - 1
24 | row[1] = str(calc)
25 | elif(row[0] == '5'):
26 | calc = 121 + int(row[1]) - 1
27 | row[1] = str(calc)
28 | elif(row[0] == '6'):
29 | calc = 152 + int(row[1]) - 1
30 | row[1] = str(calc)
31 | elif(row[0] == '7'):
32 | calc = 182 + int(row[1]) - 1
33 | row[1] = str(calc)
34 | elif(row[0] == '8'):
35 | calc = 213 + int(row[1]) - 1
36 | row[1] = str(calc)
37 | elif(row[0] == '9'):
38 | calc = 244 + int(row[1]) - 1
39 | row[1] = str(calc)
40 | elif(row[0] == '10'):
41 | calc = 274 + int(row[1]) - 1
42 | row[1] = str(calc)
43 | elif(row[0] == '11'):
44 | calc = 305 + int(row[1]) - 1
45 | row[1] = str(calc)
46 | elif(row[0] == '12'):
47 | calc = 335 + int(row[1]) - 1
48 | row[1] = str(calc)
49 | return row
50 |
51 |
52 | def DiscretizeDepTime(row):
53 | """This function takes a scheduled departure time, classifies the departure time as:
54 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value
55 | is assumed to be an integer in 24-hour time format. These labels will correspond to
56 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.
57 | An error time is returned as morning."""
58 |
59 | if(int(row[3]) <= 559):
60 | row[3] = '2'
61 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259):
62 | row[3] = '0'
63 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):
64 | row[3] = '1'
65 | elif(int(row[3]) >= 1800):
66 | row[3] = '2'
67 | else:
68 | row[3] = '0'
69 | return row
70 |
71 |
72 | def AddDepVar(row):
73 | """This function adds a classification label based on the length of the recorded
74 | Departure Delay in the data set. It assumes an input integer value of the delay in mins.
75 | By airline industry standards, flight delays are defined as departure delays greater than
76 | or equal to 15 minutes. For delayed flights, this variable will have value "1".
77 | For on time flights, it will have value "0". Default value will be set at "0"."""
78 |
79 |     if(int(row[6]) >= 15):  # compare numerically; string comparison mislabels e.g. '9' >= '15'
80 | row[6] = '1'
81 | else:
82 | row[6] = '0'
83 | return row
84 |
85 | def SaveData(data, pickle_file_name):
86 | """This function pickles each file."""
87 |
88 | f = open (pickle_file_name, "w")
89 | pickle.dump(data, f)
90 | f.close()
91 |
92 |
93 |
94 | for i in years:
95 | data = []
96 | file_path='"C:\\data\\airline\\2008.csv\\" ' + str(i) + '.csv'
97 | pickle_file_name = 'data' + str(i)
98 | with open(file_path, 'r') as data_csv:
99 | csv_reader = csv.reader(data_csv, delimiter=',')
100 |         for row in csv_reader:  # stream rows instead of materializing the whole file
101 | if row[21] == '0':
102 | content = list(row[i] for i in needed_cols)
103 | content2 = ComputeDayofYear(content)
104 | content3 = DiscretizeDepTime(content2)
105 | content4 = AddDepVar(content3)
106 | data.append(content4)
107 | SaveData(data, pickle_file_name)
108 |
--------------------------------------------------------------------------------
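
The twelve-branch if/elif chain in ComputeDayofYear is equivalent to an offset lookup; a compact sketch (not used by the scripts here) with the same leap-year layout:

    # cumulative days before each month, matching the branch offsets above
    MONTH_OFFSETS = [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]

    def compute_day_of_year(month, day):
        """0-based day of year: Jan 1 -> 0, Feb 29 -> 59, Dec 31 -> 365."""
        return MONTH_OFFSETS[int(month) - 1] + int(day) - 1
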
/Old Python Code/data_reader_v3.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import pickle
3 |
4 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]
5 | years = [2008]
6 |
7 | def ComputeDayofYear(row):
8 | """This function will return an integer to represent the day of the year given an integer
9 | representing month and an integer representing the day of the month. This number will
10 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned
11 | as 0. Feb 29th will be returned as 59."""
12 |
13 | if(row[0] == '1'):
14 | calc = 0 + int(row[1]) - 1
15 | row[1] = str(calc)
16 | elif(row[0] == '2'):
17 | calc = 31 + int(row[1]) - 1
18 | row[1] = str(calc)
19 | elif(row[0] == '3'):
20 | calc = 60 + int(row[1]) - 1
21 | row[1] = str(calc)
22 | elif(row[0] == '4'):
23 | calc = 91 + int(row[1]) - 1
24 | row[1] = str(calc)
25 | elif(row[0] == '5'):
26 | calc = 121 + int(row[1]) - 1
27 | row[1] = str(calc)
28 | elif(row[0] == '6'):
29 | calc = 152 + int(row[1]) - 1
30 | row[1] = str(calc)
31 | elif(row[0] == '7'):
32 | calc = 182 + int(row[1]) - 1
33 | row[1] = str(calc)
34 | elif(row[0] == '8'):
35 | calc = 213 + int(row[1]) - 1
36 | row[1] = str(calc)
37 | elif(row[0] == '9'):
38 | calc = 244 + int(row[1]) - 1
39 | row[1] = str(calc)
40 | elif(row[0] == '10'):
41 | calc = 274 + int(row[1]) - 1
42 | row[1] = str(calc)
43 | elif(row[0] == '11'):
44 | calc = 305 + int(row[1]) - 1
45 | row[1] = str(calc)
46 | elif(row[0] == '12'):
47 | calc = 335 + int(row[1]) - 1
48 | row[1] = str(calc)
49 | return row
50 |
51 |
52 | def DiscretizeDepTime(row):
53 | """This function takes a scheduled departure time, classifies the departure time as:
54 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value
55 | is assumed to be an integer in 24-hour time format. These labels will correspond to
56 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.
57 | An error time is returned as morning."""
58 |
59 | if(int(row[3]) <= 559):
60 | row[3] = '2'
61 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259):
62 | row[3] = '0'
63 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):
64 | row[3] = '1'
65 | elif(int(row[3]) >= 1800):
66 | row[3] = '2'
67 | else:
68 | row[3] = '0'
69 | return row
70 |
71 |
72 | def AddDepVar(row):
73 | """This function adds a classification label based on the length of the recorded
74 | Departure Delay in the data set. It assumes an input integer value of the delay in mins.
75 | By airline industry standards, flight delays are defined as departure delays greater than
76 | or equal to 15 minutes. For delayed flights, this variable will have value "1".
77 | For on time flights, it will have value "0". Default value will be set at "0"."""
78 |
79 |     if(int(row[6]) >= 15):  # compare numerically; string comparison mislabels e.g. '9' >= '15'
80 | row[6] = '1'
81 | else:
82 | row[6] = '0'
83 | return row
84 |
85 | def SaveData(data, pickle_file_name):
86 | """This function pickles each file."""
87 |
88 | f = open (pickle_file_name, "w")
89 | pickle.dump(data, f)
90 | f.close()
91 |
92 |
93 |
94 | for i in years:
95 | data = []
96 |     file_path = 'C:\\data\\airline\\' + str(i) + '.csv'
97 | pickle_file_name = 'data' + str(i)
98 | with open(file_path, 'r') as data_csv:
99 | csv_reader = csv.reader(data_csv, delimiter=',')
100 |         for row in csv_reader:  # stream rows instead of materializing the whole file
101 | if row[21] == '0':
102 | if (row[16] == 'SFO' or row[16] == 'OAK'):
103 | content = list(row[i] for i in needed_cols)
104 | content2 = ComputeDayofYear(content)
105 | content3 = DiscretizeDepTime(content2)
106 | content4 = AddDepVar(content3)
107 | data.append(content4)
108 | SaveData(data, pickle_file_name)
109 |
110 |
111 |
--------------------------------------------------------------------------------
/Old Python Code/data_reader_v4_ek.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import pickle
3 | import time
4 | import os
5 | from boto.s3.connection import S3Connection
6 | from boto.s3.key import Key
7 |
8 |
9 | timestr = time.strftime("%Y%m%d-%H%M%S")
10 | print timestr
11 |
12 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]
13 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008]
14 | j = 0
15 |
16 | def ComputeDayofYear(row):
17 | """This function will return an integer to represent the day of the year given an integer
18 | representing month and an integer representing the day of the month. This number will
19 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned
20 | as 0. Feb 29th will be returned as 59."""
21 |
22 | if(row[0] == '1'):
23 | calc = 0 + int(row[1]) - 1
24 | row[1] = str(calc)
25 | elif(row[0] == '2'):
26 | calc = 31 + int(row[1]) - 1
27 | row[1] = str(calc)
28 | elif(row[0] == '3'):
29 | calc = 60 + int(row[1]) - 1
30 | row[1] = str(calc)
31 | elif(row[0] == '4'):
32 | calc = 91 + int(row[1]) - 1
33 | row[1] = str(calc)
34 | elif(row[0] == '5'):
35 | calc = 121 + int(row[1]) - 1
36 | row[1] = str(calc)
37 | elif(row[0] == '6'):
38 | calc = 152 + int(row[1]) - 1
39 | row[1] = str(calc)
40 | elif(row[0] == '7'):
41 | calc = 182 + int(row[1]) - 1
42 | row[1] = str(calc)
43 | elif(row[0] == '8'):
44 | calc = 213 + int(row[1]) - 1
45 | row[1] = str(calc)
46 | elif(row[0] == '9'):
47 | calc = 244 + int(row[1]) - 1
48 | row[1] = str(calc)
49 | elif(row[0] == '10'):
50 | calc = 274 + int(row[1]) - 1
51 | row[1] = str(calc)
52 | elif(row[0] == '11'):
53 | calc = 305 + int(row[1]) - 1
54 | row[1] = str(calc)
55 | elif(row[0] == '12'):
56 | calc = 335 + int(row[1]) - 1
57 | row[1] = str(calc)
58 | return row
59 |
60 |
61 | def DiscretizeDepTime(row):
62 | """This function takes a scheduled departure time, classifies the departure time as:
63 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value
64 | is assumed to be an integer in 24-hour time format. These labels will correspond to
65 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.
66 | An error time is returned as morning."""
67 |
68 | if(int(row[3]) <= 559):
69 | row[3] = '2'
70 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259):
71 | row[3] = '0'
72 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):
73 | row[3] = '1'
74 | elif(int(row[3]) >= 1800):
75 | row[3] = '2'
76 | else:
77 | row[3] = '0'
78 | return row
79 |
80 |
81 | def AddDepVar(row):
82 | """This function adds a classification label based on the length of the recorded
83 | Departure Delay in the data set. It assumes an input integer value of the delay in mins.
84 | By airline industry standards, flight delays are defined as departure delays greater than
85 | or equal to 15 minutes. For delayed flights, this variable will have value "1".
86 | For on time flights, it will have value "0". Default value will be set at "0"."""
87 |
88 |     if(int(row[6]) >= 15):  # compare numerically; string comparison mislabels e.g. '9' >= '15'
89 | row[6] = '1'
90 | else:
91 | row[6] = '0'
92 | return row
93 |
94 | def SaveData(data, pickle_file_name):
95 | """This function pickles each file."""
96 |
97 | f = open (pickle_file_name, "wb")
98 | try:
99 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
100 | except Exception as e:
101 | print e
102 | f.close()
103 |
104 |     conn = S3Connection('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')  # credentials redacted
105 | bucket = conn.get_bucket('i290-aero')
106 | k = Key(bucket)
107 | k.key = pickle_file_name
108 | k.set_contents_from_filename(pickle_file_name)
109 |
110 | os.remove(pickle_file_name)
111 |
112 |
113 | for i in years:
114 | data = []
115 | '''
116 |     conn = S3Connection('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')  # credentials redacted
117 | bucket = conn.get_bucket('i290-aero')
118 | k = Key(bucket)
119 | k.key = 'data2001.csv'
120 | file_path = k.get_contents_as_string()
121 | '''
122 | file_path='data' + str(i) + '.csv'
123 | pickle_file_name = timestr+'-data-' + str(i)
124 | with open(file_path, 'r') as data_csv:
125 | csv_reader = csv.reader(data_csv, delimiter=',')
126 | j = 0
127 | for row in csv_reader:
128 | if row[21] == '0': # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'):
129 | # if (row[16] == 'SFO' or row[16] == 'OAK'):
130 | content = [row[i] for i in needed_cols]
131 | content2 = ComputeDayofYear(content)
132 | content3 = DiscretizeDepTime(content2)
133 | content4 = AddDepVar(content3)
134 | data.append(content4)
135 | # print 'content4', content4
136 | # print 'data', data
137 | # fff = raw_input()
138 |                 j = j + 1
139 | if j % 2000000 == 0:
140 | print j
141 | SaveData(data, pickle_file_name + '-' + str(j))
142 | data = []
143 | SaveData(data, pickle_file_name)
144 |
145 |
146 |
--------------------------------------------------------------------------------
/Old Python Code/date_iterator_plot.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import csv
3 | import random
4 | import matplotlib.pyplot as plt; plt.rcdefaults()
5 |
6 | # Eunkwang data: SFO = 1; OAK = 2
7 | # Divya data: SFO = 136; OAK = 141
8 |
9 | # Need to change row indexes to make sure they match data from Eunkwang.
10 |
11 | '''with open('EunkwangSampleData.csv', 'r') as data:
12 | csv_reader = csv.reader(data, delimiter=',')
13 | SFO_EJ_Hash = {}
14 | OAK_EJ_Hash = {}
15 | for row in csv_reader:
16 | origin = row[8]
17 | if(origin == '1'):
18 | year = int(row[0])
19 | month = int(row[1])
20 | date = int(row[2])
21 | key = datetime.date(year, month, date)
22 | label = int(row[9])
23 | if(key not in SFO_EJ_Hash):
24 | SFO_EJ_Hash[key] = [label]
25 | else:
26 | SFO_EJ_Hash[key].append(label)
27 | elif(origin == '2'):
28 | year = int(row[0])
29 | month = int(row[1])
30 | date = int(row[2])
31 | key = datetime.date(year, month, date)
32 | label = int(row[9])
33 | if(key not in OAK_EJ_Hash):
34 | OAK_EJ_Hash[key] = [label]
35 | else:
36 | OAK_EJ_Hash[key].append(label)
37 | else:
38 | continue'''
39 |
40 | with open('DivyaSampleData.csv', 'r') as data:
41 | csv_reader = csv.reader(data, delimiter=',')
42 | SFO_DM_Hash = {}
43 | OAK_DM_Hash = {}
44 | for row in csv_reader:
45 | origin = row[8]
46 | if(origin == '136'):
47 | year = int(row[0])
48 | month = int(row[1])
49 | date = int(row[2])
50 | key = datetime.date(year, month, date)
51 | label = int(row[9])
52 | if(key not in SFO_DM_Hash):
53 | SFO_DM_Hash[key] = [label]
54 | else:
55 | SFO_DM_Hash[key].append(label)
56 | elif(origin == '141'):
57 | year = int(row[0])
58 | month = int(row[1])
59 | date = int(row[2])
60 | key = datetime.date(year, month, date)
61 | label = int(row[9])
62 | if(key not in OAK_DM_Hash):
63 | OAK_DM_Hash[key] = [label]
64 | else:
65 | OAK_DM_Hash[key].append(label)
66 | else:
67 | continue
68 |
69 | start_date = datetime.date(2008, 1, 1)
70 | end_date = datetime.date(2008, 1, 31)
71 | date_values = []
72 | SFO_DM_Delays = []
73 | SFO_DM_On_Time = []
74 | OAK_DM_Delays = []
75 | OAK_DM_On_Time = []
76 | SFO_EJ_Delays = []
77 | SFO_EJ_On_Time = []
78 | OAK_EJ_Delays = []
79 | OAK_EJ_On_Time = []
80 |
81 | d = start_date
82 | delta = datetime.timedelta(days=1)
83 | while d <= end_date:
84 | '''if(d not in SFO_EJ_Hash):
85 | SFO_EJ_Values.append([0,0])
86 | else:
87 | SFO_EJ_Flights = SFO_EJ_Hash[d]
88 | delays = sum(SFO_EJ_Flights)
89 | num_flights = len(SFO_EJ_Flights)
90 | SFO_EJ_Delays.append(delays)
91 | SFO_EJ_On_Time.append(num_flights - delays)
92 |
93 | if(d not in OAK_EJ_Hash):
94 | OAK_EJ_Values.append([0,0])
95 | else:
96 | OAK_EJ_Flights = OAK_EJ_Hash[d]
97 | delays = sum(OAK_EJ_Flights)
98 | num_flights = len(OAK_EJ_Flights)
99 | OAK_EJ_Delays.append(delays)
100 | OAK_EJ_On_Time.append(num_flights - delays)'''
101 |
102 | if(d not in SFO_DM_Hash):
103 |         SFO_DM_Delays.append(0); SFO_DM_On_Time.append(0)  # no flights recorded that day
104 | else:
105 | SFO_DM_Flights = SFO_DM_Hash[d]
106 | delays = sum(SFO_DM_Flights)
107 | num_flights = len(SFO_DM_Flights)
108 | SFO_DM_Delays.append(delays)
109 | SFO_DM_On_Time.append(num_flights - delays)
110 |
111 | if(d not in OAK_DM_Hash):
112 |         OAK_DM_Delays.append(0); OAK_DM_On_Time.append(0)  # no flights recorded that day
113 | else:
114 | OAK_DM_Flights = OAK_DM_Hash[d]
115 | delays = sum(OAK_DM_Flights)
116 | num_flights = len(OAK_DM_Flights)
117 | OAK_DM_Delays.append(delays)
118 | OAK_DM_On_Time.append(num_flights - delays)
119 |
120 | date_values.append(d)
121 | d += delta
122 |
123 | plt.title('Probability of Flight Delays at SFO vs. OAK')
124 |
125 | ax1 = plt.subplot(211)
126 | ax1.bar(date_values, SFO_DM_Delays, bottom = SFO_DM_On_Time, color = 'green')
127 | ax1.bar(date_values, SFO_DM_On_Time, color = 'blue')
128 | ax1.set_xticklabels(['Jan 1 2008', '', '', '', '', '', '','','','','','','','','Jan 15 2008', '','','','','','','','','','','','','','','','Jan 31 2008'])
129 | #ax1.set_xticklabels(['Jan 2008','','','','','Jun 2008','','','','','','Dec 2008'])
130 | ax1.set_yticks([0, 50, 100])
131 | ax1.set_title('On-Time Flights and Delayed Flights at SFO')
132 |
133 | ax2 = plt.subplot(212)
134 | ax2.bar(date_values, OAK_DM_Delays, bottom = OAK_DM_On_Time, color = 'red')
135 | ax2.bar(date_values, OAK_DM_On_Time, color = 'grey')
136 | ax2.set_xticklabels(['Jan 1 2008', '', '', '', '', '', '','','','','','','','','Jan 15 2008', '','','','','','','','','','','','','','','','Jan 31 2008'])
137 | #ax2.set_xticklabels(['Jan 2008','','','','','Jun 2008','','','','','','Dec 2008'])
138 | ax2.set_yticks([0, 50, 100])
139 | ax2.set_title('On-Time Flights and Delayed Flights at OAK')
140 |
141 | plt.show()
--------------------------------------------------------------------------------
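
For reference, each hash in the script above maps a date to the list of 0/1 delay labels for that day's flights, so the per-day counts fall out of sum() and len():

    labels = [0, 1, 0, 0, 1]        # five flights on one day, two delayed
    delays = sum(labels)            # 2
    on_time = len(labels) - delays  # 3
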
/Old Python Code/logisticRegression.py:
--------------------------------------------------------------------------------
1 | # import matplotlib.pyplot as plt
2 | import numpy as np
3 | import random
4 | import pickle
5 | import sys
6 | import os
7 | from boto.s3.connection import S3Connection
8 | from boto.s3.key import Key
9 |
10 | pickle2001 = ['20140428-190051-data-2001',
11 | '20140428-190051-data-2001-2000000',
12 | '20140428-190051-data-2001-4000000']
13 | pickle2002 = ['20140428-190051-data-2002',
14 | '20140428-190051-data-2002-2000000',
15 | '20140428-190051-data-2002-4000000']
16 | pickle2003 = ['20140428-190051-data-2003',
17 | '20140428-190051-data-2003-2000000',
18 | '20140428-190051-data-2003-4000000',
19 | '20140428-190051-data-2003-6000000']
20 | pickle2004 = ['20140428-190051-data-2004',
21 | '20140428-190051-data-2004-2000000',
22 | '20140428-190051-data-2004-4000000',
23 | '20140428-190051-data-2004-6000000']
24 | pickle2005 = ['20140428-190051-data-2005',
25 | '20140428-190051-data-2005-2000000',
26 | '20140428-190051-data-2005-4000000',
27 | '20140428-190051-data-2005-6000000']
28 | pickle2006 = ['20140428-190051-data-2006',
29 | '20140428-190051-data-2006-2000000',
30 | '20140428-190051-data-2006-4000000',
31 | '20140428-190051-data-2006-6000000']
32 | pickle2007 = ['20140428-190051-data-2007',
33 | '20140428-190051-data-2007-2000000',
34 | '20140428-190051-data-2007-4000000',
35 | '20140428-190051-data-2007-6000000']
36 | pickle2008 = ['20140428-190051-data-2008',
37 | '20140428-190051-data-2008-2000000',
38 | '20140428-190051-data-2008-4000000',
39 | '20140428-190051-data-2008-6000000']
40 |
41 |
42 | def loadData(fileName):
43 | if os.path.exists(fileName) == False:
44 | print 'downloading', fileName, 'from s3'
45 | conn = S3Connection('key', 'val')
46 | bucket = conn.get_bucket('i290-aero')
47 | k = Key(bucket)
48 | k.key = fileName
49 | k.get_contents_to_filename(fileName)
50 | print 'downloaded', fileName, 'from s3'
51 |
52 | print 'now unpickle...'
53 | x = pickle.load(open(fileName, "rb"))
54 | x = np.array(x)
55 | print 'x.shape = ', x.shape, x[:, -1:].shape
56 |     y = x[:, -1:].copy()  # assumes the label is the last column (true for trimmed2_*.csv; the v4_ek pickles put the delay at index 6 and Dest last)
57 | x[:, -1:] = 1.
58 | return x, y
59 |
60 | def gradientDescent(x, y, numIterations, dimension, theta):
61 | # theta = np.zeros(dimension)[np.newaxis].transpose()
62 | for i in range(1, numIterations):
63 | randIdx = random.randint(0, len(x) - 1)
64 | xTrans = x[randIdx][np.newaxis].transpose()
65 | # print theta.transpose(), xTrans
66 | u = 1 / (1 + np.exp(np.dot(theta.transpose() * (-1), xTrans)))
67 | loss = y[randIdx] - u
68 | gradient = np.dot(loss[0][0], xTrans)
69 | # update
70 | theta = theta + gradient / i
71 | return theta
72 |
73 | def graph(formula, x_range):
74 | x = np.array(x_range)
75 | y = eval(formula)
76 | plt.plot(x, y)
77 |
78 |
79 | # def getData(fileName):
80 | # f = open(fileName, 'r')
81 | # x = np.array([0,0,0])
82 | # x0 = []
83 | # x1 = []
84 | # y = np.array([0])
85 | # for line in f:
86 | # arr = line.strip().split(' ')
87 | # x = np.vstack((x, [float(arr[0]), float(arr[1]), 1.]))
88 | # y = np.vstack((y, [float(arr[2])]))
89 | # if arr[2] == '0':
90 | # x0.append((float(arr[0]), float(arr[1])))
91 | # else:
92 | # x1.append((float(arr[0]), float(arr[1])))
93 |
94 | # x = np.delete(x, 0, 0)
95 | # y = np.delete(y, 0, 0)
96 | # f.close()
97 |
98 | # return x, x0, x1, y
99 |
100 |
101 |
102 |
103 | def main():
104 | # arg = sys.argv
105 | # if len(arg) < 2:
106 | # print 'USE: $ python logisticRegression.py [dataset_file]'
107 | # return
108 | # x, y = loadData(arg[1])
109 |
110 | # x, x0, x1, y = getData('classification.dat')
111 |
112 | if os.path.exists('pickled_theta') == False:
113 | theta = None
114 | for elem in pickle2001 + pickle2002 + pickle2003 + pickle2004 + pickle2005 + pickle2006 + pickle2007:
115 | x, y = loadData(elem)
116 |             if theta is None:
117 | theta = np.zeros(x.shape[1])[np.newaxis].transpose()
118 | print 'theta == None...... initialize..........', theta.shape
119 | theta = gradientDescent(x, y, 100000, x.shape[1], theta)
120 | print 'finished gradientDescent of ', elem
121 | print 'theta', theta
122 |
123 | f = open('pickled_theta', 'wb')
124 | pickle.dump(theta, f, protocol=pickle.HIGHEST_PROTOCOL)
125 | f.close()
126 |
127 | theta = pickle.load(open('pickled_theta', 'rb'))
128 |
129 | accu = 0
130 | length = 0
131 | for elem in pickle2008:
132 | if os.path.exists('dot-' + elem) == False or os.path.exists('y-' + elem) == False:
133 | x, y = loadData(elem)
134 | dotProduct = np.dot(x, theta)
135 | print '============= dot product ============='
136 | print dotProduct
137 | print '=============y ============='
138 | print y
139 | pickle.dump(dotProduct, open('dot-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
140 | pickle.dump(y, open('y-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
141 | else:
142 | dotProduct = pickle.load(open('dot-' + elem, 'rb'))
143 | y = pickle.load(open('y-' + elem, 'rb'))
144 |
145 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
146 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
147 |
148 | for i in range(len(prob)):
149 | if prob[i] == y[i]:
150 | accu += 1
151 | length += len(prob)
152 |     print 'accuracy = ', accu * 100.0 / length  # float math so the percentage is not floored
153 |
154 | # graph('(-1) * theta[2][0] / theta[1][0] - (theta[0][0] / theta[1][0]) * x', range(-3, 5))
156 |
157 |
158 |
159 | if __name__ == '__main__':
160 | main()
--------------------------------------------------------------------------------
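
gradientDescent() above is one-sample stochastic gradient ascent on the logistic log-likelihood: u = sigmoid(theta . x), and theta moves by (y - u) * x with a 1/i step. A self-contained toy check of that update rule (illustrative data, and assuming numerically encoded features, which the v4_ek pickles do not fully provide):

    import random
    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    x = np.array([[0., 0., 1.], [1., 1., 1.]])  # last column is the bias term
    y = np.array([0., 1.])
    theta = np.zeros(3)
    for i in range(1, 20000):
        idx = random.randint(0, 1)
        u = sigmoid(np.dot(theta, x[idx]))
        theta += (y[idx] - u) * x[idx] / i      # same update as gradientDescent()
    print sigmoid(np.dot(x, theta))             # moves toward [0, 1]
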
/Old Python Code/matrix.pkl:
--------------------------------------------------------------------------------
1 | (dp0
2 | I2008
3 | cnumpy.core.multiarray
4 | _reconstruct
5 | p1
6 | (cnumpy
7 | ndarray
8 | p2
9 | (I0
10 | tp3
11 | S'b'
12 | p4
13 | tp5
14 | Rp6
15 | (I1
16 | (L2L
17 | L2L
18 | tp7
19 | cnumpy
20 | dtype
21 | p8
22 | (S'i4'
23 | p9
24 | I0
25 | I1
26 | tp10
27 | Rp11
28 | (I3
29 | S'<'
30 | p12
31 | NNNI-1
32 | I-1
33 | I0
34 | tp13
35 | bI00
36 | S'\xb9\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xe2\x03\x00\x00'
37 | p14
38 | tp15
39 | bsI2001
40 | g1
41 | (g2
42 | (I0
43 | tp16
44 | g4
45 | tp17
46 | Rp18
47 | (I1
48 | (L2L
49 | L2L
50 | tp19
51 | g11
52 | I00
53 | S'\x9a\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x04\x00\x00'
54 | p20
55 | tp21
56 | bsI2007
57 | g1
58 | (g2
59 | (I0
60 | tp22
61 | g4
62 | tp23
63 | Rp24
64 | (I1
65 | (L2L
66 | L2L
67 | tp25
68 | g11
69 | I00
70 | S'\xa9\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf2\x03\x00\x00'
71 | p26
72 | tp27
73 | bs.
--------------------------------------------------------------------------------
/Old Python Code/model_selector.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt; plt.rcdefaults()
2 |
3 | # Divya and Eunkwang to provide [precision, recall, accuracy] for each of their 8 results.
4 | # This script will graph the models against each other and select the best model.
5 |
6 | TEST_DATA = [[0.4,0.6,0.8] , [0.5,0.3,0.69], [0.8, 0.2, 0.75], [0.3, 0.9, 0.72], [0.8, 0.95, 0.9]]
7 |
8 | def calc_f1_score(precision, recall):
9 |     return 2.0 * precision * recall / (precision + recall)
10 |
11 | precision_array = []
12 | recall_array = []
13 | best_f1 = 0.0
14 | index = 0
15 |
16 | for each in TEST_DATA:
17 | precision_array.append(each[0])
18 | recall_array.append(each[1])
19 |
20 |     f1 = calc_f1_score(each[0], each[1])
21 | #print f1
22 | if(f1 > best_f1):
23 | best_f1 = f1
24 | best_index = index
25 |     index += 1
26 |
27 | print "The Best Model is: Model " + str(best_index)
28 |
29 | fig = plt.subplot(111)
30 | fig.scatter(precision_array, recall_array)
31 | fig.set_xlabel('Precision')
32 | fig.set_ylabel('Recall')
33 |
34 | plt.show()
--------------------------------------------------------------------------------
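
calc_f1_score() above is the harmonic mean of precision and recall, F1 = 2PR / (P + R). For the winning entry in TEST_DATA (P = 0.8, R = 0.95):

    p, r = 0.8, 0.95
    f1 = 2 * p * r / (p + r)
    print f1  # ~0.869, the highest of the five entries, hence "Model 4"
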
/Old Python Code/output.txt:
--------------------------------------------------------------------------------
1 | harbinger:~/python$ python Pickle.py
2 | /home/dmenghani/python_lib/scikit_learn-0.14.1-py2.7-linux-x86_64.egg/sklearn/pls.py:7: DeprecationWarning: This module has been moved to cross_decomposition and will be removed in 0.16
3 | "removed in 0.16", DeprecationWarning)
4 | Setting constants...
5 | Reading into Pandas frame...
6 |
7 | /home/dmenghani/python/2001.csv
8 | Length of original dataset - 5967780
9 | Removing cancelled flights...
10 | Length after random sampling, taking {one - third} of the file - 1912194
11 |
12 |
13 | /home/dmenghani/python/2002.csv
14 | Length of original dataset - 5271359
15 | Removing cancelled flights...
16 | Length after random sampling, taking {one - third} of the file - 1735405
17 |
18 |
19 | /home/dmenghani/python/2003.csv
20 | Length of original dataset - 6488540
21 | Removing cancelled flights...
22 | Length after random sampling, taking {one - third} of the file - 2129023
23 |
24 |
25 | /home/dmenghani/python/2004.csv
26 | Length of original dataset - 7129270
27 | Removing cancelled flights...
28 | Length after random sampling, taking {one - third} of the file - 2333837
29 |
30 |
31 | /home/dmenghani/python/2005.csv
32 | Length of original dataset - 7140596
33 | Removing cancelled flights...
34 | Length after random sampling, taking {one - third} of the file - 2335622
35 |
36 |
37 | /home/dmenghani/python/2006.csv
38 | Length of original dataset - 7141922
39 | Removing cancelled flights...
40 | Length after random sampling, taking {one - third} of the file - 2339996
41 |
42 |
43 | /home/dmenghani/python/2007.csv
44 | Length of original dataset - 7453215
45 | Removing cancelled flights...
46 | Length after random sampling, taking {one - third} of the file - 2430822
47 |
48 |
49 | /home/dmenghani/python/2008.csv
50 | Length of original dataset - 7009728
51 | Removing cancelled flights...
52 | Length after random sampling, taking {one - third} of the file - 2290764
53 |
54 | Total length for all years - 17507663
55 | Calculating classification label...
56 | Dataframe shape - (17507663, 12)
57 | Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'DepDelay', u'Origin', u'Dest', u'Distance', u'label'], dtype='object')
58 | Converting categorical data to numeric...
59 | Converting... Origin
60 | Converting... UniqueCarrier
61 | Converting... Dest
62 | Converting... TailNum
63 | Pickled origin_all.pkl
64 | Pickled tailnum_all.pkl
65 | Pickled dest_all.pkl
66 | Pickled carrier_all.pkl
67 | Conversion to discrete data completed.
68 | Pickled dataframe_all.pkl
69 | harbinger:~/python$
70 |
--------------------------------------------------------------------------------
/Old Python Code/prec.pkl:
--------------------------------------------------------------------------------
1 | (dp0
2 | I2008
3 | cnumpy.core.multiarray
4 | scalar
5 | p1
6 | (cnumpy
7 | dtype
8 | p2
9 | (S'f8'
10 | p3
11 | I0
12 | I1
13 | tp4
14 | Rp5
15 | (I3
16 | S'<'
17 | p6
18 | NNNI-1
19 | I-1
20 | I0
21 | tp7
22 | bS'\x00\x00\x00\x00\x00\x00\xf0?'
23 | p8
24 | tp9
25 | Rp10
26 | sI2001
27 | g1
28 | (g5
29 | S'\x00\x00\x00\x00\x00\x00\xf0?'
30 | p11
31 | tp12
32 | Rp13
33 | sI2007
34 | g1
35 | (g5
36 | S'\x00\x00\x00\x00\x00\x00\xf0?'
37 | p14
38 | tp15
39 | Rp16
40 | s.
--------------------------------------------------------------------------------
/Old Python Code/results.pkl:
--------------------------------------------------------------------------------
1 | (dp0
2 | .
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Predicting Airline Delays - Fly from SFO or OAK?
2 | ===================
3 |
4 | Team
5 |
6 | Divya M
7 |
8 | Eunkwang J
9 |
10 | Ryan J
11 |
12 | Julia K
13 |
14 |
15 | Problem Statement
16 | Simplified version: "Given a destination and a date range, which is a better airport to fly out from - SFO or OAK?"
17 | We wanted to apply machine learning techniques to build a predictive model that helps flyers decide which airport to choose. Our model was built using data for all US domestic flights from 2001-2008. It works for all airports, but we were particularly interested in SFO/OAK. A popular urban myth says to fly from OAK to avoid delays; we find that the myth is not always true.
18 |
19 |
20 | About the Data
21 | We will be working with airline data for individual years found at http://stat-computing.org/dataexpo/2009/the-data.html.
22 |
23 | Techniques
24 | Naive Bayes
25 | Logistic Regression
26 |
27 |
28 | Python Libraries
29 | Pandas, scikit-learn, Matplotlib, Seaborn
30 |
--------------------------------------------------------------------------------
/data_reader_v4_ek.py:
--------------------------------------------------------------------------------
1 | #
2 | # data_reader_v4_ek.py
3 | # author: eunkwang joo
4 | # description: This code prepares the dataset for the hand-written logistic regression algorithm (see logisticRegression.py).
5 | #
6 |
7 |
8 | import csv
9 | import pickle
10 | import time
11 | import os
12 | from boto.s3.connection import S3Connection
13 | from boto.s3.key import Key
14 |
15 |
16 | timestr = time.strftime("%Y%m%d-%H%M%S")
17 | print timestr
18 |
19 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]
20 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008]
21 | j = 0
22 |
23 | #
24 | # function: ComputeDayofYear()
25 | # description: This function will return an integer to represent the day of the year given an integer
26 | # representing month and an integer representing the day of the month. This number will
27 | # correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned
28 | # as 0. Feb 29th will be returned as 59.
29 | # input: a row of the raw csv dataset
30 | # output: the same row with the day of year encoded.
31 | #
32 |
33 |
34 | def ComputeDayofYear(row):
35 | if(row[0] == '1'):
36 | calc = 0 + int(row[1]) - 1
37 | row[1] = str(calc)
38 | elif(row[0] == '2'):
39 | calc = 31 + int(row[1]) - 1
40 | row[1] = str(calc)
41 | elif(row[0] == '3'):
42 | calc = 60 + int(row[1]) - 1
43 | row[1] = str(calc)
44 | elif(row[0] == '4'):
45 | calc = 91 + int(row[1]) - 1
46 | row[1] = str(calc)
47 | elif(row[0] == '5'):
48 | calc = 121 + int(row[1]) - 1
49 | row[1] = str(calc)
50 | elif(row[0] == '6'):
51 | calc = 152 + int(row[1]) - 1
52 | row[1] = str(calc)
53 | elif(row[0] == '7'):
54 | calc = 182 + int(row[1]) - 1
55 | row[1] = str(calc)
56 | elif(row[0] == '8'):
57 | calc = 213 + int(row[1]) - 1
58 | row[1] = str(calc)
59 | elif(row[0] == '9'):
60 | calc = 244 + int(row[1]) - 1
61 | row[1] = str(calc)
62 | elif(row[0] == '10'):
63 | calc = 274 + int(row[1]) - 1
64 | row[1] = str(calc)
65 | elif(row[0] == '11'):
66 | calc = 305 + int(row[1]) - 1
67 | row[1] = str(calc)
68 | elif(row[0] == '12'):
69 | calc = 335 + int(row[1]) - 1
70 | row[1] = str(calc)
71 | return row
72 |
73 |
74 | #
75 | # function: DiscretizeDepTime()
76 | # description: This function takes a scheduled departure time and classifies it as:
77 | # morning (0600 - 1259), afternoon (1300 - 1759), or evening (1800 - 0559). The input value
78 | # is assumed to be an integer in 24-hour time format. These labels will correspond to
79 | # variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.
80 | # An error time is returned as morning.
81 | # input: a row of the raw csv dataset
82 | # output: the same row with the departure time bucket encoded.
83 | #
84 |
85 | def DiscretizeDepTime(row):
86 |
87 | if(int(row[3]) <= 559):
88 | row[3] = '2'
89 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259):
90 | row[3] = '0'
91 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):
92 | row[3] = '1'
93 | elif(int(row[3]) >= 1800):
94 | row[3] = '2'
95 | else:
96 | row[3] = '0'
97 | return row
98 |
99 | #
100 | # function: AddDepVar()
101 | # description: This function adds a classification label based on the length of the recorded
102 | # Departure Delay in the data set. It assumes an input integer value of the delay in mins.
103 | # By airline industry standards, flight delays are defined as departure delays greater than
104 | # or equal to 15 minutes. For delayed flights, this variable will have value "1".
105 | # For on time flights, it will have value "0". Default value will be set at "0".
106 | # input: a row of the raw csv dataset
107 | # output: the same row with the delay encoded as a binary label.
108 | #
109 |
110 |
111 | def AddDepVar(row):
112 |
113 |     if(int(row[6]) >= 15):  # compare numerically; string comparison mislabels e.g. '9' >= '15'
114 | row[6] = '1'
115 | else:
116 | row[6] = '0'
117 | return row
118 |
119 | #
120 | # function: SaveData()
121 | # description: This function pickles each file. Due to limited storage on the local server, it also uploads the pickle to S3 and removes the local copy.
122 | # input: data= data structure which will be stored for future uses
123 | # pickle_file_name= file name to be used to store data
124 | # output: null
125 | #
126 |
127 |
128 | def SaveData(data, pickle_file_name):
129 |
130 | f = open(pickle_file_name, "wb")
131 | try:
132 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
133 | except Exception as e:
134 | print e
135 | f.close()
136 |
137 | conn = S3Connection(
138 |         'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')  # credentials redacted
139 | bucket = conn.get_bucket('i290-aero')
140 | k = Key(bucket)
141 | k.key = pickle_file_name
142 | k.set_contents_from_filename(pickle_file_name)
143 |
144 | os.remove(pickle_file_name)
145 |
146 |
147 | # Read the raw dataset for every year, encode variables, drop unused
148 | # columns, and pickle the trimmed dataset in 2,000,000-row chunks.
149 |
150 | for i in years:
151 | data = []
152 | '''
153 |     conn = S3Connection('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')  # credentials redacted
154 | bucket = conn.get_bucket('i290-aero')
155 | k = Key(bucket)
156 | k.key = 'data2001.csv'
157 | file_path = k.get_contents_as_string()
158 | '''
159 | file_path = 'data' + str(i) + '.csv'
160 | pickle_file_name = timestr + '-data-' + str(i)
161 | with open(file_path, 'r') as data_csv:
162 | csv_reader = csv.reader(data_csv, delimiter=',')
163 | j = 0
164 | for row in csv_reader:
165 | # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'):
166 | if row[21] == '0':
167 | # if (row[16] == 'SFO' or row[16] == 'OAK'):
168 | content = [row[i] for i in needed_cols]
169 | content2 = ComputeDayofYear(content)
170 | content3 = DiscretizeDepTime(content2)
171 | content4 = AddDepVar(content3)
172 | data.append(content4)
173 | # print 'content4', content4
174 | # print 'data', data
175 | # fff = raw_input()
176 | j = j + 1
177 | if j % 2000000 == 0:
178 | print j
179 | SaveData(data, pickle_file_name + '-' + str(j))
180 | data = []
181 | SaveData(data, pickle_file_name)
182 |
--------------------------------------------------------------------------------
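
Applied to a single row (values illustrative), the per-row pipeline above transforms the nine needed_cols -- [Month, DayofMonth, DayOfWeek, DepTime, UniqueCarrier, TailNum, DepDelay, Origin, Dest] -- like so:

    row = ['5', '8', '4', '1430', 'UA', 'N123UA', '22', 'SFO', 'LAX']
    row = ComputeDayofYear(row)   # May 8 -> row[1] = '128' (121 + 8 - 1)
    row = DiscretizeDepTime(row)  # 1430 -> row[3] = '1' (afternoon)
    row = AddDepVar(row)          # 22 min >= 15 -> row[6] = '1' (delayed)
    print row                     # ['5', '128', '4', '1', 'UA', 'N123UA', '1', 'SFO', 'LAX']
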
/data_reader_v4_ek_rj_csv.py:
--------------------------------------------------------------------------------
1 | #
2 | # data_reader_v4_ek_rj_csv.py
3 | # author: eunkwang joo
4 | # description: This code prepares dataset for logistic regression using python pandas.
5 | #
6 |
7 | import csv
8 | import pickle
9 | import time
10 | import os
11 | from boto.s3.connection import S3Connection
12 | from boto.s3.key import Key
13 |
14 |
15 | timestr = time.strftime("%Y%m%d-%H%M%S")
16 | print timestr
17 |
18 | # columns to extract from raw dataset.
19 | needed_cols = [3, 4, 8, 15, 16, 17]
20 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008]
21 |
22 | j = 0
23 |
24 | #
25 | # function: ComputeDayofYear()
26 | # description: This function will return an integer to represent the day of the year given an integer
27 | # representing month and an integer representing the day of the month. This number will
28 | # correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned
29 | # as 0. Feb 29th will be returned as 59.
30 | # input: a row of the raw csv dataset
31 | # output: the same row with the day of year encoded.
32 | #
33 |
34 |
35 | def ComputeDayofYear(row):
36 |
37 | if(row[0] == '1'):
38 | calc = 0 + int(row[1]) - 1
39 | row[1] = float(calc)
40 | elif(row[0] == '2'):
41 | calc = 31 + int(row[1]) - 1
42 | row[1] = float(calc)
43 | elif(row[0] == '3'):
44 | calc = 60 + int(row[1]) - 1
45 | row[1] = float(calc)
46 | elif(row[0] == '4'):
47 | calc = 91 + int(row[1]) - 1
48 | row[1] = float(calc)
49 | elif(row[0] == '5'):
50 | calc = 121 + int(row[1]) - 1
51 | row[1] = float(calc)
52 | elif(row[0] == '6'):
53 | calc = 152 + int(row[1]) - 1
54 | row[1] = float(calc)
55 | elif(row[0] == '7'):
56 | calc = 182 + int(row[1]) - 1
57 | row[1] = float(calc)
58 | elif(row[0] == '8'):
59 | calc = 213 + int(row[1]) - 1
60 | row[1] = float(calc)
61 | elif(row[0] == '9'):
62 | calc = 244 + int(row[1]) - 1
63 | row[1] = float(calc)
64 | elif(row[0] == '10'):
65 | calc = 274 + int(row[1]) - 1
66 | row[1] = float(calc)
67 | elif(row[0] == '11'):
68 | calc = 305 + int(row[1]) - 1
69 | row[1] = float(calc)
70 | elif(row[0] == '12'):
71 | calc = 335 + int(row[1]) - 1
72 | row[1] = float(calc)
73 | return row
74 |
75 |
76 | #
77 | # function: DiscretizeDepTime()
78 | # description: This function takes a scheduled departure time and classifies it as:
79 | # morning (0600 - 1259), afternoon (1300 - 1759), or evening (1800 - 0559). The input value
80 | # is assumed to be an integer in 24-hour time format. These labels will correspond to
81 | # variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.
82 | # An error time is returned as morning.
83 | # input: a row of the raw csv dataset
84 | # output: the same row with the departure time bucket encoded.
85 | #
86 |
87 | def DiscretizeDepTime(row):
88 |
89 | if(int(row[1]) <= 559):
90 | row[1] = 2.
91 | elif(int(row[1]) >= 600 and int(row[1]) <= 1259):
92 | row[1] = 0.
93 | elif(int(row[1]) >= 1300 and int(row[1]) <= 1759):
94 | row[1] = 1.
95 | elif(int(row[1]) >= 1800):
96 | row[1] = 2.
97 | else:
98 | row[1] = 0.
99 | return row
100 |
101 |
102 | #
103 | # function: AddDepVar()
104 | # description: This function adds a classification label based on the length of the recorded
105 | # Departure Delay in the data set. It assumes an input integer value of the delay in mins.
106 | # By airline industry standards, flight delays are defined as departure delays greater than
107 | # or equal to 15 minutes. For delayed flights, this variable will have value "1".
108 | # For on time flights, it will have value "0". Default value will be set at "0".
109 | # input: a row of the raw csv dataset
110 | # output: the same row with the delay encoded as a binary label.
111 | #
112 |
113 | def AddDepVar(row):
114 |
115 | if float(row[3]) >= float(15):
116 | row[3] = 1.
117 | else:
118 | row[3] = 0.
119 | return row
120 |
121 | #
122 | # function: SaveData()
123 | # description: This function pickles each file. Due to limited storage on the local server, it also uploads the pickle to S3 and removes the local copy.
124 | # input: data= data structure which will be stored for future uses
125 | # pickle_file_name= file name to be used to store data
126 | # output: null
127 | #
128 |
129 |
130 | def SaveData(data, pickle_file_name):
131 |
132 | f = open(pickle_file_name, "wb")
133 | try:
134 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
135 | except Exception as e:
136 | print e
137 | f.close()
138 |
139 | conn = S3Connection(
140 |         'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')  # credentials redacted
141 | bucket = conn.get_bucket('i290-aero')
142 | k = Key(bucket)
143 | k.key = pickle_file_name
144 | k.set_contents_from_filename(pickle_file_name)
145 |
146 | os.remove(pickle_file_name)
147 |
148 |
149 | hashs = ['airportHash.dic', 'tailHash.dic', 'carrierHash.dic']
150 |
151 |
152 | # if os.path.exists(hashs[1]):
153 | # tailNumHash = pickle.load(open(hashs[1], "rb"))
154 | # else:
155 | # tailNumHash = {}
156 | #
157 | # function: createHash()
158 | # description: It creates dictionaries that map each airport and each carrier to an integer. The dictionaries are later used to identify the encoded airports and carriers.
159 | # input: null
160 | # output: null
161 | #
162 | def createHash():
163 | airportHash = {}
164 | carrierHash = {}
165 | for i in years:
166 | file_path = '../Airport_Data/data' + str(i) + '.csv'
167 | with open(file_path, 'r') as data_csv:
168 | csv_reader = csv.reader(data_csv, delimiter=',')
169 | j = 0
170 | for row in csv_reader:
171 | if(row[17] not in airportHash):
172 | airportHash[row[17]] = len(airportHash) + 1
173 | if(row[8] not in carrierHash):
174 | carrierHash[row[8]] = len(carrierHash) + 1
175 | pickle.dump(airportHash, open('airportHash.dic', 'wb'),
176 | protocol=pickle.HIGHEST_PROTOCOL)
177 | pickle.dump(carrierHash, open('carrierHash.dic', 'wb'),
178 | protocol=pickle.HIGHEST_PROTOCOL)
179 |
180 | # createHash()
181 |
182 | airportHash = pickle.load(open(hashs[0], "rb"))
183 | carrierHash = pickle.load(open(hashs[2], "rb"))
184 |
185 | # Read the raw dataset for every year, encode variables, drop unused
186 | # columns, and write the trimmed rows to trimmed2_<year>.csv.
187 |
188 | for i in years:
189 | data = []
190 | '''
191 |     conn = S3Connection('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')  # credentials redacted
192 | bucket = conn.get_bucket('i290-aero')
193 | k = Key(bucket)
194 | k.key = 'data2001.csv'
195 | file_path = k.get_contents_as_string()
196 | '''
197 | file_path = '../Airport_Data/data' + str(i) + '.csv'
198 | pickle_file_name = timestr + '-data-' + str(i)
199 | dropped = ''
200 | with open(file_path, 'r') as data_csv:
201 | csv_reader = csv.reader(data_csv, delimiter=',')
202 | j = 0
203 | with open('trimmed2_' + str(i) + '.csv', 'w') as output_csv:
204 | writer = csv.writer(
205 | output_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
206 | writer.writerow(
207 | ['dayOfWeek', 'depTime', 'carrier', 'dest', 'origin', 'delay'])
208 | for row in csv_reader:
209 | # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'):
210 | if row[21] == '0':
211 | # if (row[16] == 'SFO' or row[16] == 'OAK'):
212 | if (row[16] not in ['SFO', 'OAK']):
213 | dropped += row[16] + ' '
214 | continue # airportHash[row[16]] = len(airportHash) + 1
215 | origin = airportHash[row[16]]
216 |
217 | if(row[17] not in airportHash):
218 | airportHash[row[17]] = len(airportHash) + 1
219 | dest = airportHash[row[17]]
220 |
221 | # if(row[10] not in tailNumHash):
222 | # tailNumHash[row[10]] = len(tailNumHash) + 1
223 | # tailNum = tailNumHash[row[10]]
224 |
225 | if(row[8] not in carrierHash):
226 | carrierHash[row[8]] = len(carrierHash) + 1
227 | carrier = carrierHash[row[8]]
228 | # print row[8], carrier, carrierHash
229 | # raw_input()
230 |
231 | content = [row[i] for i in needed_cols]
232 | # content2 = ComputeDayofYear(content)
233 | content3 = DiscretizeDepTime(content)
234 | content4 = AddDepVar(content3)
235 | content4[2] = carrier
236 | # content4[5] = tailNum
237 | content4[4] = origin
238 | content4[5] = dest
239 | for idx in range(len(content4)):
240 | content4[idx] = float(content4[idx])
241 |                     temp = content4[3]  # reorder so the delay label comes last: [dayOfWeek, depTime, carrier, dest, origin, delay]
242 | content4[3] = content4[5]
243 | content4[5] = temp
244 |
245 | writer.writerow(
246 | [content4[0], content4[1], content4[2], content4[3], content4[4], content4[5]])
247 | # print content4
248 | # data.append(content4)
249 | # print 'content4', content4
250 | # print 'data', data
251 | # fff = raw_input()
252 | # j=j+1
253 | # if j % 2000000 == 0:
254 | # print j
255 | # SaveData(data, pickle_file_name + '-' + str(j))
256 | # data = []
257 | # SaveData(data, pickle_file_name)
258 | # print dropped
259 |
260 |
261 | # hashs = ['airportHash.dic', 'tailHash.dic', 'carrierHash.dic']
262 | # hashVals = [airportHash, tailNumHash, carrierHash]
263 | # for idx in range(len(hashs)):
264 | # f = open (hashs[idx], "wb")
265 | # try:
266 | # pickle.dump(hashVals[idx], f, protocol=pickle.HIGHEST_PROTOCOL)
267 | # except Exception as e:
268 | # print e
269 | # f.close()
270 |
--------------------------------------------------------------------------------
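
A minimal, self-contained sketch of the incremental id-assignment that createHash() and the trimming loop above both rely on (the airportHash.dic filename follows the script; the sample airport codes are illustrative):

    from __future__ import print_function
    import pickle

    def encode(value, table):
        # the first occurrence of a category gets the next integer id
        if value not in table:
            table[value] = len(table) + 1
        return table[value]

    airportHash = {}
    for airport in ['SFO', 'OAK', 'SFO', 'LAX']:
        print(airport, '->', encode(airport, airportHash))  # SFO->1, OAK->2, SFO->1, LAX->3

    # round-trip through pickle, as the script does for later runs
    with open('airportHash.dic', 'wb') as f:
        pickle.dump(airportHash, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open('airportHash.dic', 'rb') as f:
        assert pickle.load(f) == airportHash
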
/date_graph2.py:
--------------------------------------------------------------------------------
1 | #
2 | # date_graph2.py
3 | # Author: Ryan Jung
4 | # Description: This function takes a date and calculates the probability of delay at SFO
5 | # and at OAK for the date and the 6 days prior. It then graphs these probabilities as
6 | # side-by-side bars for each day.
7 | # Dependencies: Run the Naive Bayes classification code in Crossval_r.py first. Ensure that the file _dfTest2008.csv is in the
8 | # same folder.
9 | #
10 |
11 | from __future__ import division
12 | import sys
13 | import csv
14 | import datetime
15 | import matplotlib.pyplot as plt
16 | plt.rcdefaults()
17 | import numpy as np
18 |
19 | #
20 | # These are the hard codes of the "look back" period (set at 6 days) and airport codes
21 | # from our Naive Bayes dictionary.
22 | #
23 |
24 | TIME_DELTA = 6
25 | SFO_AIRPORT_CODE = '270'
26 | OAK_AIRPORT_CODE = '215'
27 | JFK_AIRPORT_CODE = '160'
28 | ORD_AIRPORT_CODE = '225'
29 | ATL_AIRPORT_CODE = '25'
30 | LAX_AIRPORT_CODE = '168'
31 | LGA_AIRPORT_CODE = '174'
32 | DFW_AIRPORT_CODE = '85'
33 |
34 | #
35 | # Main Function
36 | # The function first takes an argument from the command line of the form:
37 | # python date_graph2.py m-d-yy
38 | # It then calculates the bounds of our query for probability of delay by day.
39 | #
40 |
41 | # parse the m-d-yy date argument; sys.argv[0] is the script path itself, so
42 | # the date is read from sys.argv[1]
43 | start_date = datetime.datetime.strptime(sys.argv[1], '%m-%d-%y')
44 | start_date = datetime.date(
45 | start_date.year, start_date.month, start_date.day)
46 |
47 | delta = datetime.timedelta(days=TIME_DELTA)
48 | begin = start_date - delta
49 | end = start_date
50 |
51 | #
52 | # This block of code sets up a hash for each airport of the form {key: value} => {day:
53 | # [predict label,...]}. This is a list of the predicted labels for each flight on a
54 | # particular day from the origin airport to the destination airport. It iterates over
55 | # the days in our query range and constructs the hash.
56 | #
57 |
58 | SFO_Hash = {}
59 | OAK_Hash = {}
60 | with open('_dfTest2008.csv', 'r') as data:
61 | csv_reader = csv.reader(data, delimiter=',')
62 | for row in csv_reader:
63 | if(row[0] != 'Year'):
64 | year = int(row[0])
65 | month = int(row[1])
66 | date = int(row[2])
67 | curr_date = datetime.date(year, month, date)
68 | if(curr_date >= begin and curr_date <= end):
69 | origin = row[7]
70 | dest = row[8]
71 | if(origin == SFO_AIRPORT_CODE and dest == LAX_AIRPORT_CODE):
72 | label = int(row[10])
73 | if(curr_date not in SFO_Hash):
74 | SFO_Hash[curr_date] = [label]
75 | else:
76 | SFO_Hash[curr_date].append(label)
77 | if(origin == OAK_AIRPORT_CODE and dest == LAX_AIRPORT_CODE):
78 | label = int(row[10])
79 | if(curr_date not in OAK_Hash):
80 | OAK_Hash[curr_date] = [label]
81 | else:
82 | OAK_Hash[curr_date].append(label)
83 |
84 | #
85 | # This block of code initializes values for day "steps" for our iterator later.
86 | # We also initialize lists which will have the number of delays, on-time flights, and
87 | # percentage of predicted delays for the days in our query.
88 | #
89 |
90 | iterator = datetime.timedelta(days=1)
91 | two_iterator = datetime.timedelta(days=2)
92 | three_iterator = datetime.timedelta(days=3)
93 | four_iterator = datetime.timedelta(days=4)
94 | five_iterator = datetime.timedelta(days=5)
95 | six_iterator = datetime.timedelta(days=6)
96 |
97 | day_values = []
98 | SFO_Delays = []
99 | SFO_On_Time = []
100 | SFO_Flights = []
101 | SFO_Pct = []
102 | SFO_Comp = []
103 | OAK_Delays = []
104 | OAK_On_Time = []
105 | OAK_Flights = []
106 | OAK_Pct = []
107 | OAK_Comp = []
108 |
109 | #
110 | # We then loop through the query date range and populate the lists, counting number of
111 | # delayed flights, number of on-time flights, and percent of flights delayed. Each
112 | # list item corresponds to a date in our query range.
113 | #
114 |
115 | while begin <= end:
116 | if(begin not in SFO_Hash):
117 | SFO_Delays.append(0)
118 | SFO_On_Time.append(0)
119 | SFO_Pct.append(0.00)
120 | else:
121 | SFO_Flights = SFO_Hash[begin]
122 | delays = sum(SFO_Flights)
123 | num_flights = len(SFO_Flights)
124 | pct = float(delays) / num_flights  # num_flights already counts the delayed flights
125 | SFO_Delays.append(delays)
126 | SFO_On_Time.append(num_flights - delays)
127 | SFO_Pct.append(pct)
128 | SFO_Comp.append(1)
129 |
130 | if(begin not in OAK_Hash):
131 | OAK_Delays.append(0)
132 | OAK_On_Time.append(0)
133 | OAK_Pct.append(0.00)
134 | else:
135 | OAK_Flights = OAK_Hash[begin]
136 | delays = sum(OAK_Flights)
137 | num_flights = len(OAK_Flights)
138 | pct = float(delays) / num_flights  # num_flights already counts the delayed flights
139 | OAK_Delays.append(delays)
140 | OAK_On_Time.append(num_flights - delays)
141 | OAK_Pct.append(pct)
142 | OAK_Comp.append(1)
143 |
144 | day_values.append(begin)
145 | begin += iterator
146 |
147 | #
148 | # This block of code then graphs the percentage of delays by day as a side-by-side bar
149 | # graph for each day in the query.
150 | #
151 |
152 | Y1 = SFO_Pct
153 | Y2 = OAK_Pct
154 | Y3 = SFO_Comp
155 | Y4 = OAK_Comp
156 |
157 | N = TIME_DELTA + 1  # seven bars: the query date plus the 6-day look-back
158 | ind = np.arange(N) # the x locations for the groups
159 | width = 0.35 # the width of the bars
160 |
161 | fig, ax = plt.subplots()
162 | rects1 = ax.bar(ind, Y1, width, color='blue')
163 | rects2 = ax.bar(ind + width, Y2, width, color='grey')
164 |
165 | fig.suptitle(
166 | 'Probability of Flight Delays at SFO vs. OAK Given Specific Date Through t-7 Days')
167 | ax.legend((rects1[0], rects2[0]), ('SFO', 'OAK'), loc='upper center')
168 |
169 |
170 | def autolabel(rects):
171 | for rect in rects:
172 | height = rect.get_height()
173 | ax.text(
174 | rect.get_x() + rect.get_width() /
175 | 2., 1.05 * height, '%.2f' % float(height),
176 | ha='center', va='bottom', rotation='vertical')
177 |
178 | autolabel(rects1)
179 | autolabel(rects2)
180 | ax.set_xticks(ind + width)  # center the date labels under each pair of bars
181 | ax.set_xticklabels(
182 | [start_date - six_iterator, start_date - five_iterator, start_date - four_iterator,
183 | start_date - three_iterator, start_date - two_iterator, start_date - iterator, start_date], rotation=45)
184 | ax.set_ylabel('Probability of Delay')
185 |
186 | plt.show()
187 |
--------------------------------------------------------------------------------
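
For example, python date_graph2.py 6-15-08 charts 6-9-08 through 6-15-08. The grouped-bar layout the script builds reduces to this matplotlib pattern (a sketch; the probabilities below are made up):

    import numpy as np
    import matplotlib.pyplot as plt

    sfo = [0.21, 0.18, 0.25, 0.30, 0.22, 0.19, 0.27]  # illustrative delay probabilities
    oak = [0.15, 0.12, 0.20, 0.24, 0.18, 0.14, 0.21]
    ind = np.arange(len(sfo))  # one group per day
    width = 0.35               # offsetting the second series by width pairs the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, sfo, width, color='blue')
    rects2 = ax.bar(ind + width, oak, width, color='grey')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(['t-6', 't-5', 't-4', 't-3', 't-2', 't-1', 't'])
    ax.legend((rects1[0], rects2[0]), ('SFO', 'OAK'), loc='upper center')
    plt.show()
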
/date_iterator_plot2.py:
--------------------------------------------------------------------------------
1 | #
2 | # date_iterator_plot2.py
3 | # Author: Ryan Jung
4 | # Description: This function reads the predicted results from one of our models. It then
5 | # aggregates the probability of delay by week and graphs the probability of delay at
6 | # both airports (SFO and OAK). Lastly, it calculates the t-score of the difference in
7 | # means of both airports to help determine if the difference is statistically significant.
8 | #
9 |
10 | import datetime
11 | import csv
12 | import matplotlib.pyplot as plt
13 | plt.rcdefaults()
14 | import numpy
15 |
16 | # Hard code of airport codes in our dictionary that correspond to Naive
17 | # Bayes model
18 | SFO_AIRPORT_CODE = '270'
19 | OAK_AIRPORT_CODE = '215'
20 |
21 | #
22 | # Function: ComputeDayofYear(month, day)
23 | # Description: This function takes a month and day of month and outputs a number which
24 | # corresponds to the day of year, a number between 0 and 365. The month offsets assume a leap year, which holds for the 2008 data.
25 | # Input: Integer values for month and day
26 | # Output: Integer value for day of year
27 | #
28 |
29 |
30 | def ComputeDayofYear(month, day):
31 | if(month == 1):
32 | numDays = 0
33 | if(month == 2):
34 | numDays = 31
35 | if(month == 3):
36 | numDays = 60
37 | if(month == 4):
38 | numDays = 91
39 | if(month == 5):
40 | numDays = 121
41 | if(month == 6):
42 | numDays = 152
43 | if(month == 7):
44 | numDays = 182
45 | if(month == 8):
46 | numDays = 213
47 | if(month == 9):
48 | numDays = 244
49 | if(month == 10):
50 | numDays = 274
51 | if(month == 11):
52 | numDays = 305
53 | if(month == 12):
54 | numDays = 335
55 |
56 | return (numDays + day - 1)
57 |
58 | #
59 | # Main Function
60 | # This block of code reads from the output of the Naive Bayes model and creates a hash
61 | # for SFO and OAK that corresponds to {key: value} = {week #: [predicted label,...]}.
62 | # The idea here is to create a list of all flights that are scheduled to leave SFO or OAK
63 | # by week (52 weeks in the year). The list will be 1's and 0's based on our prediction of
64 | # whether the flight will be delayed (1) or not delayed (0).
65 | #
66 |
67 | with open('_dfTest2008.csv', 'r') as data:
68 | csv_reader = csv.reader(data, delimiter=',')
69 | SFO_DM_Hash = {}
70 | OAK_DM_Hash = {}
71 | for row in csv_reader:
72 | origin = row[7]
73 | if(origin == SFO_AIRPORT_CODE):
74 | month = int(row[1])
75 | date = int(row[2])
76 | DayofYear = ComputeDayofYear(month, date)
77 | key = DayofYear / 7  # integer division (Python 2): week index
78 | label = int(row[10])
79 | if(key not in SFO_DM_Hash):
80 | SFO_DM_Hash[key] = [label]
81 | else:
82 | SFO_DM_Hash[key].append(label)
83 | elif(origin == OAK_AIRPORT_CODE):
84 | month = int(row[1])
85 | date = int(row[2])
86 | DayofYear = ComputeDayofYear(month, date)
87 | key = DayofYear / 7
88 | label = int(row[10])
89 | if(key not in OAK_DM_Hash):
90 | OAK_DM_Hash[key] = [label]
91 | else:
92 | OAK_DM_Hash[key].append(label)
93 | else:
94 | continue
95 |
96 | #
97 | # This block of code separates out the value list of flights from the previous block of
98 | # code into a list of the number of delays and the number of on-time flights from SFO
99 | # and OAK by week. In other words, SFO_DM_Delays[14] will be the number of delayed
100 | # flights we predict at SFO in week 14. We create a 3rd list which is the percent of
101 | # flights that are delayed by week.
102 | #
103 |
104 | week_values = []
105 | SFO_DM_Delays = []
106 | SFO_DM_On_Time = []
107 | SFO_DM_Pct = []
108 | OAK_DM_Delays = []
109 | OAK_DM_On_Time = []
110 | OAK_DM_Pct = []
111 |
112 | d = 0
113 | while d <= 51:
114 | if(d not in SFO_DM_Hash):
115 | SFO_DM_Delays.append(0)
116 | SFO_DM_On_Time.append(0)
117 | SFO_DM_Pct.append(0.00)
118 | else:
119 | SFO_DM_Flights = SFO_DM_Hash[d]
120 | delays = sum(SFO_DM_Flights)
121 | num_flights = len(SFO_DM_Flights)
122 | pct = float(delays) / num_flights  # num_flights already counts the delayed flights
123 | SFO_DM_Delays.append(delays)
124 | SFO_DM_On_Time.append(num_flights - delays)
125 | SFO_DM_Pct.append(pct)
126 |
127 | if(d not in OAK_DM_Hash):
128 | OAK_DM_Delays.append(0)
129 | OAK_DM_On_Time.append(0)
130 | OAK_DM_Pct.append(0.00)
131 | else:
132 | OAK_DM_Flights = OAK_DM_Hash[d]
133 | delays = sum(OAK_DM_Flights)
134 | num_flights = len(OAK_DM_Flights)
135 | pct = float(delays) / num_flights  # num_flights already counts the delayed flights
136 | OAK_DM_Delays.append(delays)
137 | OAK_DM_On_Time.append(num_flights - delays)
138 | OAK_DM_Pct.append(pct)
139 |
140 | week_values.append(d)
141 | d += 1
142 |
143 | #
144 | # This block of code calculates the mean and standard deviation of the percent of flights
145 | # that are predicted to be delayed. It uses these to calculate a t-score of the
146 | # difference in means which can be used to determine if the difference is statistically
147 | # significant.
148 | #
149 |
150 | SFO_mean = numpy.mean(SFO_DM_Pct)
151 | OAK_mean = numpy.mean(OAK_DM_Pct)  # numpy.mean, for consistency with SFO_mean
152 | SFO_std = numpy.std(SFO_DM_Pct)
153 | OAK_std = numpy.std(OAK_DM_Pct)
154 | SFO_n = len(SFO_DM_Pct)
155 | OAK_n = len(OAK_DM_Pct)
156 | Diff = OAK_mean - SFO_mean
157 | std_err = (((SFO_std ** 2) / SFO_n) + ((OAK_std ** 2) / OAK_n)) ** 0.5
158 |
159 | print "Standard Error", std_err
160 | print "t = ", Diff / std_err
161 |
162 | #
163 | # Graphic visualization of the probability of delay by week at SFO and OAK. SFO will be
164 | # the green line and OAK will be the blue line in the graph. X-axis is the week of 2008
165 | # and y-axis is probability of delay.
166 | #
167 |
168 | ax1 = plt.subplot(111)
169 | p1 = ax1.plot(week_values, SFO_DM_Pct, color='green')
170 | p2 = ax1.plot(week_values, OAK_DM_Pct, color='blue')
171 | ax1.set_title('Proportion of flights delayed in SFO (green) vs. OAK (blue)')
172 | ax1.set_xticklabels(
173 | ['Jan 2008', 'Mar 2008', 'May 2008', 'Jul 2008', 'Sep 2008', 'Nov 2008'])
174 | ax1.set_ylabel('Probability of Delay')
175 | ax1.legend((p1[0], p2[0]), ('SFO', 'OAK'), loc='upper center')
176 |
177 | plt.show()
178 |
--------------------------------------------------------------------------------
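
The significance test at the end is a two-sample t-score over the weekly proportions: t = (mean_OAK - mean_SFO) / sqrt(s_SFO^2/n_SFO + s_OAK^2/n_OAK). Isolated, with illustrative numbers:

    from __future__ import division, print_function
    import numpy as np

    sfo_pct = np.array([0.20, 0.25, 0.22, 0.30])  # illustrative weekly delay proportions
    oak_pct = np.array([0.15, 0.18, 0.16, 0.21])

    diff = oak_pct.mean() - sfo_pct.mean()
    # numpy.std defaults to the population standard deviation (ddof=0), as in the script
    std_err = np.sqrt(sfo_pct.std() ** 2 / len(sfo_pct) +
                      oak_pct.std() ** 2 / len(oak_pct))
    print('t =', diff / std_err)
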
/logisticRegression.py:
--------------------------------------------------------------------------------
1 | #
2 | # logisticRegression.py
3 | # author: eunkwang joo
4 | # description: It loads the pickled dataset in several fragments and runs logistic regression to estimate coefficients. It then measures the accuracy of the estimates on a test dataset.
5 | #
6 |
7 | import numpy as np
8 | import random
9 | import pickle
10 | import sys
11 | import os
12 | from boto.s3.connection import S3Connection
13 | from boto.s3.key import Key
14 |
15 |
16 | # Trimmed datasets are stored in pickle format. Due to memory limits, each
17 | # year's dataset is pickled across several files.
18 |
19 | pickle2001 = ['20140428-190051-data-2001',
20 | '20140428-190051-data-2001-2000000',
21 | '20140428-190051-data-2001-4000000']
22 | pickle2002 = ['20140428-190051-data-2002',
23 | '20140428-190051-data-2002-2000000',
24 | '20140428-190051-data-2002-4000000']
25 | pickle2003 = ['20140428-190051-data-2003',
26 | '20140428-190051-data-2003-2000000',
27 | '20140428-190051-data-2003-4000000',
28 | '20140428-190051-data-2003-6000000']
29 | pickle2004 = ['20140428-190051-data-2004',
30 | '20140428-190051-data-2004-2000000',
31 | '20140428-190051-data-2004-4000000',
32 | '20140428-190051-data-2004-6000000']
33 | pickle2005 = ['20140428-190051-data-2005',
34 | '20140428-190051-data-2005-2000000',
35 | '20140428-190051-data-2005-4000000',
36 | '20140428-190051-data-2005-6000000']
37 | pickle2006 = ['20140428-190051-data-2006',
38 | '20140428-190051-data-2006-2000000',
39 | '20140428-190051-data-2006-4000000',
40 | '20140428-190051-data-2006-6000000']
41 | pickle2007 = ['20140428-190051-data-2007',
42 | '20140428-190051-data-2007-2000000',
43 | '20140428-190051-data-2007-4000000',
44 | '20140428-190051-data-2007-6000000']
45 | pickle2008 = ['20140428-190051-data-2008',
46 | '20140428-190051-data-2008-2000000',
47 | '20140428-190051-data-2008-4000000',
48 | '20140428-190051-data-2008-6000000']
49 |
50 | #
51 | # function: loadData()
52 | # description: It loads the dataset from pickled files and separates the x variables (features) from the y value (delay)
53 | # input: fileName= name of a pickled file
54 | # output: x and y matrices to be used for logistic regression
55 | #
56 |
57 |
58 | def loadData(fileName):
59 | if not os.path.exists(fileName):
60 | print 'downloading', fileName, 'from s3'
61 | conn = S3Connection(
62 | 'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7')
63 | bucket = conn.get_bucket('i290-aero')
64 | k = Key(bucket)
65 | k.key = fileName
66 | k.get_contents_to_filename(fileName)
67 | print 'downloaded', fileName, 'from s3'
68 |
69 | print 'now unpickle...'
70 | x = pickle.load(open(fileName, "rb"))
71 | x = np.array(x)
72 | print 'x.shape = ', x.shape, x[:, -1:].shape
73 | y = x[:, -1:].copy() # last col is y value (delay or not)
74 | x[:, -1:] = 1.
75 | return x, y
76 |
77 |
78 | #
79 | # function: gradientDescent()
80 | # description: Using stochastic gradient descent (one random row per step), it runs logistic regression and estimates coefficients.
81 | # input: x= features to be used for logistic regression
82 | # y= ground truth value of delay
83 | # numIterations= number of iterations to take for logistic regression
84 | # dimension= dimension of x matrix
85 | # theta= coefficient we try to find
86 | # output: theta= coefficient matrix we have found to predict delay
87 | #
88 |
89 | def gradientDescent(x, y, numIterations, dimension, theta):
90 | # theta = np.zeros(dimension)[np.newaxis].transpose()
91 | for i in range(1, numIterations):
92 | randIdx = random.randint(0, len(x) - 1)
93 | xTrans = x[randIdx][np.newaxis].transpose()
94 | # print theta.transpose(), xTrans
95 | u = 1 / (1 + np.exp(np.dot(theta.transpose() * (-1), xTrans)))
96 | loss = y[randIdx] - u
97 | gradient = np.dot(loss[0][0], xTrans)
98 | # update
99 | theta = theta + gradient / i
100 | return theta
101 |
102 |
103 | def main():
104 | # arg = sys.argv
105 | # if len(arg) < 2:
106 | # print 'USE: $ python logisticRegression.py [dataset_file]'
107 | # return
108 | # x, y = loadData(arg[1])
109 |
110 | # x, x0, x1, y = getData('classification.dat')
111 |
112 | # train theta for 7 years of dataset
113 | if not os.path.exists('pickled_theta'):
114 | theta = None
115 | for elem in pickle2001 + pickle2002 + pickle2003 + pickle2004 + pickle2005 + pickle2006 + pickle2008:
116 | x, y = loadData(elem)
117 | if theta is None:
118 | theta = np.zeros(x.shape[1])[np.newaxis].transpose()
119 | print 'theta == None...... initialize..........', theta.shape
120 | theta = gradientDescent(x, y, 100000, x.shape[1], theta)
121 | print 'finished gradientDescent of ', elem
122 | print 'theta', theta
123 |
124 | # pickle trained theta
125 | f = open('pickled_theta', 'wb')
126 | pickle.dump(theta, f, protocol=pickle.HIGHEST_PROTOCOL)
127 | f.close()
128 |
129 | # load pickled theta
130 | theta = pickle.load(open('pickled_theta', 'rb'))
131 |
132 | # predict with test dataset
133 | accu = 0.
134 | length = 0.
135 | tp, tn, fp, fn = 0., 0., 0., 0.
136 | for elem in pickle2007:
137 | if not os.path.exists('dot-' + elem) or not os.path.exists('y-' + elem):
138 | x, y = loadData(elem)
139 | dotProduct = np.dot(x, theta)
140 | print '============= dot product ============='
141 | print dotProduct
142 | print '=============y ============='
143 | print y
144 | pickle.dump(
145 | dotProduct, open('dot-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
146 | pickle.dump(
147 | y, open('y-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
148 | else:
149 | dotProduct = pickle.load(open('dot-' + elem, 'rb'))
150 | y = pickle.load(open('y-' + elem, 'rb'))
151 |
152 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
153 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
154 |
155 | for i in range(len(prob)):
156 | if prob[i] == 1 and y[i] == 1:
157 | accu += 1
158 | tp += 1
159 | elif prob[i] == 1 and y[i] == 0:
160 | fp += 1
161 | elif prob[i] == 0 and y[i] == 1:
162 | fn += 1
163 | elif prob[i] == 0 and y[i] == 0:
164 | accu += 1
165 | tn += 1
166 | else:
167 | raise Exception('unexpected prediction/label pair', prob[i], y[i])
168 | length += len(prob)
169 | # print accuracy, precision, and recall
170 | print 'accuracy = ', accu * 100 / length, (tp + tn) / (tp + fp + fn + tn)
171 | print 'precision = ', tp / (tp + fp)
172 | print 'recall = ', tp / (tp + fn)
173 |
174 | # graph('(-1) * theta[2][0] / theta[1][0] - (theta[0][0] / theta[1][0]) * x', range(-3, 5))
175 | print 'done'
176 |
177 |
178 | if __name__ == '__main__':
179 | main()
180 |
--------------------------------------------------------------------------------
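
gradientDescent() above is stochastic gradient ascent on the log-likelihood: pick a random row x with label y, compute u = sigmoid(theta^T x), and update theta <- theta + (y - u) * x / i with a decaying 1/i step size. The same update on synthetic data (a sketch; the data and iteration count are illustrative):

    from __future__ import division, print_function
    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    rng = np.random.RandomState(0)
    x = rng.randn(1000, 3)
    x[:, -1] = 1.0  # constant intercept column, as loadData() sets up
    true_theta = np.array([1.5, -2.0, 0.3])
    y = (sigmoid(x.dot(true_theta)) > rng.rand(1000)).astype(float)

    theta = np.zeros(3)
    for i in range(1, 100000):
        idx = rng.randint(len(x))
        grad = (y[idx] - sigmoid(theta.dot(x[idx]))) * x[idx]
        theta += grad / i  # 1/i learning rate, as in gradientDescent()
    print('estimated theta:', theta)  # should move toward [1.5, -2.0, 0.3]
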
/lr_app2.py:
--------------------------------------------------------------------------------
1 | #
2 | # lr_app2.py
3 | # author: eunkwang joo
4 | # description: It loads the trimmed datasets stored as csv files and runs logistic regression with pandas/statsmodels to estimate coefficients. It then measures the accuracy of the estimates on a test dataset.
5 | #
6 |
7 | import pandas as pd
8 | import statsmodels.api as sm
9 | # import pylab as pl
10 | import numpy as np
11 | import sys
12 | import random
13 | import os
14 | import pickle
15 |
16 | # df = pd.read_csv('trimmed2_2001.csv')#sys.argv[1])
17 |
18 |
19 | #
20 | # function: reader()
21 | # description: It loads a dataset from a csv file into a dataframe
22 | # input: f= name of a csv file of dataset
23 | # output: d= dataframe loaded from csv dataset
24 | #
25 |
26 | def reader(f):
27 | d = pd.read_csv(f, header=0) # , axis=1)
28 | # d.columns = range(d.shape[1])
29 | return d
30 |
31 |
32 | #
33 | # function: shuffle()
34 | # description: It shuffles data
35 | # input: df= dataframe which holds data
36 | # n= number of shuffles
37 | # axis= shuffle in which axis
38 | # output: df= shuffled dataframe
39 | #
40 |
41 | def shuffle(df, n=1, axis=0):
42 | df = df.copy()
43 | for _ in range(n):
44 | df.apply(np.random.shuffle, axis=axis)
45 | return df
46 |
47 |
48 | # collect file names from the current directory only (stop os.walk at the top level)
49 | for dirpath, dirnames, filenames in os.walk('.'):
50 | break
51 |
52 | filenames = [f for f in filenames if '.csv' in f]
53 | filenames.sort()
54 | print filenames
55 | # concatenate all csv files in one dataframe
56 | #[1532189 rows x 6 columns]
57 | df = pd.concat([reader(f) for f in filenames], keys=filenames)
58 |
59 | print df.head()
60 | print df.columns
61 |
62 | # dumm1 = pd.get_dummies(df['carrier'], prefix='carrier')
63 | # dumm2 = pd.get_dummies(df['dest'], prefix='dest')
64 | # dumm3 = pd.get_dummies(df['origin'], prefix='origin')
65 | # dumm4 = pd.get_dummies(df['tailNum'], prefix='tailNum')
66 |
67 | cols = ['delay', 'dayOfWeek', 'depTime']
68 |
69 | # data = df[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
70 | # data = df[cols].join(dumm1).join(dumm2).join(dumm3)
71 | # data['intercept'] = 1.0
72 | # print data.head() #[5 rows x 123 columns] including delay column
73 |
74 | # data_delay = data[data['delay'] == 1]
75 | # data_nodelay = data[data['delay'] == 0]
76 |
77 | # get delayed data only
78 | data_delay = df[df['delay'] == 1]
79 | rows = random.sample(data_delay.index, len(data_delay) / 2)  # half the delayed rows; the rest go to data_delay_2
80 | data_delay_1 = data_delay.ix[rows]
81 | data_delay_2 = data_delay.drop(rows)
82 |
83 | # get not delayed data only
84 | data_nodelay = df[df['delay'] == 0]
85 | rows = random.sample(data_nodelay.index, len(data_delay))
86 | data_nodelay = data_nodelay.ix[rows]
87 | # get sample dataset of 50% delayed and 50% not delayed data
88 | data_halfhalf = pd.concat([data_delay, data_nodelay])
89 |
90 | rows = random.sample(data_nodelay.index, len(data_delay) / 2)
91 | data_nodelay = data_nodelay.ix[rows]
92 | data_halfhalf_2 = pd.concat([data_delay_2, data_nodelay])
93 |
94 | # make dummy variables of carrier, dest, and origin
95 | dumm1 = pd.get_dummies(data_halfhalf['carrier'], prefix='carrier')
96 | dumm2 = pd.get_dummies(data_halfhalf['dest'], prefix='dest')
97 | dumm3 = pd.get_dummies(data_halfhalf['origin'], prefix='origin')
98 | data_halfhalf = data_halfhalf[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(
99 | dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
100 | data_halfhalf['intercept'] = 1.0 # (552264, 117)
101 | # data_halfhalf = shuffle(data_halfhalf)
102 | # data_halfhalf.reindex(np.random.permutation(data_halfhalf.index))
103 | print 'delay =', len(data_delay), 'nodelay =', len(data_nodelay), 'halfhalf =', len(data_halfhalf)
104 |
105 |
106 | dumm1 = pd.get_dummies(data_halfhalf_2['carrier'], prefix='carrier')
107 | dumm2 = pd.get_dummies(data_halfhalf_2['dest'], prefix='dest')
108 | dumm3 = pd.get_dummies(data_halfhalf_2['origin'], prefix='origin')
109 | data_halfhalf_2 = data_halfhalf_2[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(
110 | dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
111 | data_halfhalf_2['intercept'] = 1.0 # (552264, 117)
112 |
113 |
114 | # train dataset with logistic regression algorithm
115 | train_cols = data_halfhalf.columns[1:]
116 | logit = sm.Logit(data_halfhalf['delay'], data_halfhalf[train_cols])
117 | result = logit.fit(maxiter=1000)
118 |
119 | ff = open('halfhalf_sample_re3', 'w')
120 | ff.write(str(result.summary()))
121 | ff.close()
122 | print result.summary()
123 |
124 |
125 | # finally, we got theta - coefficient.
126 | a = np.array(result.params)
127 | pickle.dump(a, open('theta_half5', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
128 | theta = pickle.load(open('theta_half5', 'rb'))
129 |
130 |
131 | # now k-fold test
132 |
133 | '''
134 | df_test = pd.read_csv('trimmed2_2008.csv')
135 | dumm_test1 = pd.get_dummies(df_test['carrier'], prefix='carrier')
136 | dumm_test2 = pd.get_dummies(df_test['dest'], prefix='dest')
137 | dumm_test3 = pd.get_dummies(df_test['origin'], prefix='origin')
138 | data_test = df_test[cols].join(dumm_test1.ix[:, 'carrier_3.0':]).join(dumm_test2.ix[:, 'dest_6.0':]).join(dumm_test3.ix[:, 'origin_105.0':])
139 | data_test['intercept'] = 1.0
140 | data_test_cal = data_test.drop('delay', 1)
141 | dot = np.dot(data_test_cal, theta)
142 | '''
143 |
144 | rows = random.sample(data_halfhalf.index, len(data_halfhalf) / 10)
145 | df_10 = data_halfhalf.ix[rows]
146 | # df_90 = data_halfhalf.drop(rows)
147 | df_10_cal = df_10.drop('delay', 1)
148 | dotProduct = np.dot(df_10_cal, theta) # m x 122 * 122 x 1
149 |
150 | # get reverse logit
151 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
152 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
153 |
154 | # predict with test dataset and measure accuracy, precision, and recall
155 | y = df_10['delay'].values  # positional array so y[i] matches prob[i] below
156 | tp, tn, fp, fn = 0., 0., 0., 0.
157 | for i in range(len(prob)):
158 | if prob[i] == 1 and y[i] == 1:
159 | tp += 1
160 | elif prob[i] == 1 and y[i] == 0:
161 | fp += 1
162 | elif prob[i] == 0 and y[i] == 1:
163 | fn += 1
164 | elif prob[i] == 0 and y[i] == 0:
165 | tn += 1
166 | else:
167 | raise Exception('unexpected prediction/label pair', prob[i], y[i])
168 |
169 | print 'accuracy = ', (tp + tn) / (tp + fp + fn + tn)
170 | print 'precision = ', tp / (tp + fp)
171 | print 'recall = ', tp / (tp + fn)
172 | print tp, tn, fp, fn
173 |
174 | # >>> print 'accuracy = ', (tp + tn) / (tp + fp + fn + tn)
175 | # accuracy = 0.60288632166
176 | # >>> print 'precision = ', tp / (tp + fp)
177 | # precision = 0.607973048849
178 | # >>> print 'recall = ', tp / (tp + fn)
179 | # recall = 0.586353790614
180 | # >>> print tp, tn, fp, fn
181 | # 16242.0 17053.0 10473.0 11458.0
182 |
183 |
184 | # measure ROC curve
185 |
186 | rlsort = reverseLogit[:]
187 | rlsort.sort()
188 | diff = min([j - i for i, j in zip(rlsort[:-1], rlsort[1:])])  # smallest gap between adjacent sorted scores (not used below)
189 |
190 | p = len([e for e in y if e == 1])
191 | n = len([e for e in y if e == 0])
192 | j = rlsort[0]
193 | r = []
194 | while j <= rlsort[-1]:
195 | prob = [1 if rev >= j else 0 for rev in reverseLogit]
196 | p1 = [x for x in prob if x == 1]
197 | # print p1
198 | # raw_input()
199 | tp, fp = 0., 0.
200 | for i in range(len(prob)):
201 | if prob[i] == 1 and y[i] == 1:
202 | tp += 1
203 | elif prob[i] == 1 and y[i] == 0:
204 | fp += 1
205 | r.append((fp / float(n), tp / float(p)))
206 | # print j, tp, fp, p, n
207 | j += 0.01
208 |
209 | # plot ROC curve
210 | import matplotlib as mpl
211 | mpl.use('Agg')
212 | import matplotlib.pyplot as plt
213 | from matplotlib.backends.backend_pdf import PdfPages
214 |
215 | pickle.dump(r, open('roc.list', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)  # persist the ROC points computed above; they are plotted below
216 | fig = plt.figure()
217 | plt.plot(*zip(*r), marker='o', color='r', ls='')
218 | pp = PdfPages('foo.pdf')
219 | pp.savefig(fig)
220 | pp.close()
221 |
--------------------------------------------------------------------------------
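
The core of the script is one-hot (dummy) encoding of the categorical columns followed by a statsmodels Logit fit; the ROC section then sweeps the 0.5 cutoff across the predicted probabilities. A minimal sketch of the encode-and-fit step (synthetic data; the column names mirror the script):

    from __future__ import print_function
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    rng = np.random.RandomState(0)
    df = pd.DataFrame({'delay': rng.randint(0, 2, 200),
                       'depTime': rng.randint(0, 24, 200),
                       'carrier': rng.choice([3.0, 7.0, 9.0], 200)})

    # one 0/1 column per carrier level; drop the first level to avoid collinearity
    dummies = pd.get_dummies(df['carrier'], prefix='carrier').astype(float)
    data = df[['delay', 'depTime']].join(dummies.iloc[:, 1:])
    data['intercept'] = 1.0

    result = sm.Logit(data['delay'], data[data.columns[1:]]).fit(disp=0)
    print(result.params)
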
/model_selector.py:
--------------------------------------------------------------------------------
1 | #
2 | # model_selector.py
3 | # Author: Ryan Jung
4 | # Description: This script graphs the results of validation tests with precision on the
5 | # y-axis and recall on the x-axis.
6 | # Because we only used 8-fold (leave-one-year-out) validation for the Naive Bayes
7 | # model, this script is only used for the testing results of that validation.
8 | #
9 |
10 | import matplotlib.pyplot as plt
11 | plt.rcdefaults()
12 |
13 | # Hard code of testing results of form [precision, recall, accuracy, title]
14 | DM_TEST_DATA = [
15 | [0.59, 0.61, 0.61, 'NB 2008'], [0.60, 0.61, 0.60, 'NB 2007'], [
16 | 0.60, 0.63, 0.62, 'NB 2006'], [0.62, 0.64, 0.64, 'NB 2005'],
17 | [0.63, 0.66, 0.66, 'NB 2004'], [0.65, 0.70, 0.70, 'NB 2003'], [0.60, 0.65, 0.65, 'NB 2002'], [0.58, 0.62, 0.61, 'NB 2001']]
18 |
19 | #
20 | # Function: calc_f1_score(precision, recall, accuracy)
21 | # Description: This function calculates the F1 score = 2*(precision * recall) / (precision + recall)
22 | # Input: Floating point values of precision, recall, and accuracy (not used)
23 | # Output: Floating point F1 score
24 | #
25 |
26 |
27 | def calc_f1_score(precision, recall, accuracy):
28 | return (float(2 * (precision * recall) / (precision + recall)))
29 |
30 | #
31 | # Main Function
32 | # Description: Creates arrays of precision and recall values, and tracks the
33 | # highest F1 score along with the title of the best-performing test.
34 | #
35 |
36 | precision_dm_array = []
37 | recall_dm_array = []
38 | dm_best_f1 = 0.0
39 | index = 0
40 | dm_best_title = 'None'
41 |
42 | for each in DM_TEST_DATA:
43 | precision_dm_array.append(each[0])
44 | recall_dm_array.append(each[1])
45 |
46 | f1 = calc_f1_score(each[0], each[1], each[2])
47 | if(f1 > dm_best_f1):
48 | dm_best_f1 = f1
49 | best_index = index
50 | dm_best_title = each[3]
51 | index += 1
52 |
53 | # prints title of Best performing model by F1 score
54 | # print "The Best Naive Bayes Model is: Model " + str(dm_best_title)
55 |
56 | # Scatter plot visualization of results with precision on y-axis and
57 | # recall on x-axis
58 | fig = plt.subplot(111)
59 | fig.scatter(recall_dm_array, precision_dm_array, color='blue')  # recall on x, precision on y, matching the axis labels
60 | fig.set_xlabel('Recall')
61 | fig.set_ylabel('Precision')
62 |
63 | plt.show()
64 |
--------------------------------------------------------------------------------
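
calc_f1_score() is the harmonic mean of precision and recall. Checking the best row by hand: NB 2003 gives F1 = 2 * (0.65 * 0.70) / (0.65 + 0.70), which is about 0.674, the highest of the eight models:

    from __future__ import division, print_function

    def f1(precision, recall):
        return 2 * precision * recall / (precision + recall)

    print(round(f1(0.65, 0.70), 3))  # 0.674 for 'NB 2003', the best F1 in DM_TEST_DATA
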
/naive bayes.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | #
4 | # Naive Bayes.py
5 | # Author: Divyakumar Menghani
6 | # Description: This code reads the dataset into pandas dataframes, builds a Naive Bayes classifier, and predicts labels for a subset of the data. It also calculates metrics such as precision/recall/accuracy and F-score after classification. The output is dumped to pickle files that are used later for visualization.
7 | #
8 |
9 | import pickle
10 | import sklearn
11 | from sklearn.naive_bayes import *
12 | import pandas as pd
13 | import numpy as np
14 | from sklearn import *
15 | import os
16 | from sklearn.metrics import *
17 | from sklearn import metrics, preprocessing
18 | from sklearn import svm, naive_bayes, neighbors, tree
19 |
20 | #
21 | # Function: createPickle()
22 | # Description: This function will create a pickle file.
23 | # Input: the data structure to pickle and the output filename
24 | # Output: a pickle file for the data structure. The file is stored in the
25 | # same path the code is running from
26 | #
27 |
28 |
29 | def createPickle(data, filename):
30 | with open(filename, 'wb') as f:
31 | pickle.dump(data, f)
32 | print "Pickled", filename
33 |
34 |
35 | # Global constants for this code
36 | print "Setting constants..."
37 |
38 | TRAINING_LINE_NUMBER = 8000000 # Number of lines to be read from input files
39 | # List of years for training and testing
40 | YEARS = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']
41 | INPUT_FILE_PATH = "/home/dmenghani/python/" # Unix path
42 | # INPUT_FILE_PATH = "C:\\data\\airline\\" # Windows path
43 | SKIP_FIRST_LINE = True  # To skip the first line, as it's the header
44 |
45 | # Creating the master data frame from all years.
46 | master = []
47 | print "Reading into Pandas frame..."
48 | try:
49 | for year in YEARS:
50 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))
51 | print "\n", path
52 | dfPart = pd.read_csv(
53 | path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[
54 | u'Year',
55 | u'Month',
56 | u'DayofMonth',
57 | u'DayOfWeek',
58 | u'UniqueCarrier',
59 | u'DepTime',
60 | u'TailNum',
61 | u'Origin',
62 | u'Dest',
63 | u'DepDelay',
64 | # u'ArrDelay',
65 | u'Cancelled',
66 | # u'ArrTime',
67 | # u'ArrDelay',
68 | # u'Distance'
69 | ])
70 | print len(dfPart)
71 | # Removing cancelled flights from each year
72 | dfPart = dfPart[dfPart['Cancelled'] == 0]
73 | rows = np.random.choice(
74 | np.random.permutation(dfPart.index.values), len(dfPart) // 3, replace=False) # 33% sampling of training data
75 | print rows
76 | sampled_dfPart = dfPart.ix[rows]
77 | # sampled_dfPart = dfPart  # uncomment to train on the full year instead of the 33% sample
78 | master.append(sampled_dfPart)
79 | print
80 | except Exception as e:
81 | print "Supplemental Data Import failed", e
82 |
83 | # Building the master frame by concating it for all years
84 | dfMaster = pd.concat(master, ignore_index=True)
85 | master = []
86 | dfPart = []
87 |
88 | print "Total length - ", len(dfMaster)
89 | del dfMaster['Cancelled'] # Column not needed
90 |
91 | dfMaster.fillna(0, inplace=True)
92 |
93 | # Converting to appropriate datatypes for numeric cols.
94 | dfMaster['Year'] = dfMaster['Year'].astype('int')
95 | dfMaster['Month'] = dfMaster['Month'].astype('int')
96 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
97 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')
98 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
99 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')
100 |
101 | df = dfMaster
102 |
103 | # Since we don't have a classification label in the data, we create one.
104 | # A threshold of 5 minutes was chosen.
105 | print "Calculating classification label..."
106 | df['label'] = 0
107 | df.label[df.DepDelay >= 5] = 1
108 | df.label[df.DepDelay < 5] = 0
109 | print "Actual delayed flights -", np.sum(dfMaster['label']) / len(dfMaster['label'])
110 |
111 | del df['DepDelay']
112 |
113 | print "Dataframe shape - ", df.shape
114 | print "Columns -", df.columns
115 |
116 | # Converting categorical data to numeric for cols - TailNum,
117 | # UniqueCarrier, Dest, Origin
118 | print "Converting categorical data to numeric..."
119 | for col in set(df.columns):
120 | if df[col].dtype == np.dtype('object'):
121 | print "Converting...", col
122 | if col == 'TailNum':
123 | s = np.unique(df[col].values)
124 | TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)
125 | if col == 'UniqueCarrier':
126 | s = np.unique(df[col].values)
127 | UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)
128 | if col == 'Dest':
129 | s = np.unique(df[col].values)
130 | Dest = pd.Series([x[0] for x in enumerate(s)], index=s)
131 | if col == 'Origin':
132 | s = np.unique(df[col].values)
133 | Origin = pd.Series([x[0] for x in enumerate(s)], index=s)
134 |
135 | # Creating Pickle files for the list containing key-value pairs
136 | createPickle(Dest, 'Dest_2008.pkl')
137 | createPickle(Origin, 'Origin_2008.pkl')
138 | createPickle(UniqueCarrier, 'UniqueCarrier_2008.pkl')
139 | createPickle(TailNum, 'TailNum_2008.pkl')
140 | print "Pickle completed."
141 |
142 | #
143 | # Function: getTailNum()
144 | # Description: This function will convert the input categorical value to corresponding numeric key.
145 | # Input: categorical value you want to convert
146 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
147 | #
148 |
149 |
150 | def getTailNum(inTailNum):
151 | out = []
152 | for x, y in inTailNum.iteritems():
153 | out.append(TailNum.get_value(y))
154 | return out
155 |
156 | #
157 | # Function: getDest()
158 | # Description: This function will convert the input categorical value to corresponding numeric key.
159 | # Input: categorical value you want to convert
160 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
161 | #
162 |
163 |
164 | def getDest(inDest):
165 | out = []
166 | for x, y in inDest.iteritems():
167 | out.append(Dest.get_value(y))
168 | return out
169 |
170 | #
171 | # Function: getOrigin()
172 | # Description: This function will convert the input categorical value to corresponding numeric key.
173 | # Input: categorical value you want to convert
174 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
175 | #
176 |
177 |
178 | def getOrigin(inOrign):
179 | out = []
180 | for x, y in inOrign.iteritems():
181 | out.append(Origin.get_value(y))
182 | return out
183 |
184 | #
185 | # Function: getCarrier()
186 | # Description: This function will convert the input categorical value to corresponding numeric key.
187 | # Input: categorical value you want to convert
188 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
189 | #
190 |
191 |
192 | def getCarrier(inCarrier):
193 | out = []
194 | for x, y in inCarrier.iteritems():
195 | out.append(UniqueCarrier.get_value(y))
196 | return out
197 |
198 | # Converting TailNum
199 | df['TailNum'] = getTailNum(df['TailNum'])
200 | print "TailNum completed."
201 |
202 | # Converting UniqueCarrier
203 | df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])
204 | print "UniqueCarrier completed."
205 |
206 | # Converting Dest
207 | df['Dest'] = getDest(df['Dest'])
208 | print "Dest completed."
209 |
210 | # Converting Origin
211 | df['Origin'] = getOrigin(df['Origin'])
212 | print "Origin completed."
213 |
214 | print "Conversion to numeric completed."
215 |
216 | # Building classifier
217 | print "Begin cross validation..."
218 |
219 | # Choosing features for classifier
220 | features = df.columns[0:9]
221 |
222 | # Creating lists for storing results for cross validation.
223 | accuracy = {}
224 | results = {}
225 | matrix = {}
226 | prec = {}
227 | recall = {}
228 |
229 | for year in YEARS:
230 | print "Testing on - ", year
231 | train = df[df['Year'] != int(year)] # Test on 1year, train on other 7years
232 | test = df[df['Year'] == int(year)]
233 | # test = test[test['Origin'].isin([Origin['OAK'], Origin['SFO']])]
234 | print len(train), len(test)
235 | rows = np.random.choice(np.random.permutation(
236 | test.index.values), len(test) // 2, replace=False) # 50% sampling of test data to avoid memory errors faced.
237 | # print rows
238 | sampled_test = test.ix[rows]
239 | # sampled_test = test  # uncomment to test on the full year instead of the 50% sample
240 | # Putting the last column of Training data into a list
241 | trainTargets = np.array(train['label']).astype(int)
242 |
243 | # Putting the last column of Testing data into a list
244 | testTargets = np.array(sampled_test['label']).astype(int)
245 | print "Train length - ", len(train), "Test length - ", len(sampled_test)
246 | print train['Year']
247 | print test['Year']
248 | print "Model fitting and prediction started..."
249 | # Building the classifier and fitting the train data
250 | gnb = GaussianNB()
251 | y_gnb = gnb.fit(train[features], trainTargets).predict(
252 | sampled_test[features])
253 | # Storing results in a new colum in the dataframe.
254 | sampled_test['pred_label'] = y_gnb
255 | print "Classification completed."
256 | # Creating pickle files with the classifier and the results of classifier
257 | createPickle(gnb, INPUT_FILE_PATH + "classifier_" + year + ".pkl")
258 | createPickle(y_gnb, INPUT_FILE_PATH + "label_" + year + ".pkl")
259 | sampled_test.to_csv(
260 | INPUT_FILE_PATH + "_dfTest" + year + ".csv", index=False)  # _dfTest<year>.csv is the name the plotting scripts read
261 | # Calculating metrics using sklearn metrics functions
262 | print "\nCalculating metrics..."
263 | accuracy[int(year)] = accuracy_score(sampled_test['label'], y_gnb)
264 | print "Accuracy score - ", accuracy[int(year)]
265 | prec[int(year)] = precision_score(
266 | sampled_test['label'], y_gnb, average='micro')
267 | print "Precision Score - ", prec[int(year)]
268 | recall[int(year)] = recall_score(
269 | sampled_test['label'], y_gnb, average='micro')
270 | print "Recall Score - ", recall[int(year)]
271 | print "Confusion matrix"
272 | matrix[int(year)] = metrics.confusion_matrix(
273 | sampled_test['label'], y_gnb)
274 | print matrix[int(year)]
275 | results[int(year)] = precision_recall_fscore_support(
276 | sampled_test['label'], y_gnb, average='micro')
277 | print "Precision, recall, F-Score, Support - ", results[int(year)]
278 | print "Classification report"
279 | print classification_report(np.array(sampled_test['label']), y_gnb,
280 | target_names=['on-time', 'delayed'])  # label 0 = on-time, 1 = delayed
281 | print
282 | train = []
283 | test = []
284 |
285 | print "Accuracy\n", accuracy
286 | print "\nPrecision\n", prec
287 | print "\nRecall\n", recall
288 | print "\nMetrics\n", results
289 | print "\nMatrix\n", matrix
290 |
291 | # Finding mean of metrics
292 | print "\nMean Cross validation Precision score", np.mean(pd.Series(prec))
293 | print "\nMean Cross validation Recall score", np.mean(pd.Series(recall))
294 | print "\nMean Cross validation Accuracy score", np.mean(pd.Series(accuracy))
295 |
296 | # Pickling results
297 | print "\nPickling stuff..."
298 | createPickle(accuracy, 'accuracy.pkl')
299 | createPickle(prec, 'prec.pkl')
300 | createPickle(results, 'results.pkl')
301 | createPickle(matrix, 'matrix.pkl')
302 |
--------------------------------------------------------------------------------
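
The loop above is leave-one-year-out cross-validation: GaussianNB is fit on seven years and scored on the held-out eighth, once per year. Stripped to its essentials (synthetic frame; the column names follow the script):

    from __future__ import print_function
    import numpy as np
    import pandas as pd
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score

    rng = np.random.RandomState(0)
    df = pd.DataFrame({'Year': rng.choice([2007, 2008], 500),
                       'DepTime': rng.randint(0, 2400, 500),
                       'DayOfWeek': rng.randint(1, 8, 500),
                       'label': rng.randint(0, 2, 500)})
    features = ['DepTime', 'DayOfWeek']

    for year in [2007, 2008]:
        train, test = df[df['Year'] != year], df[df['Year'] == year]
        pred = GaussianNB().fit(train[features], train['label']).predict(test[features])
        print(year, 'accuracy:', accuracy_score(test['label'], pred))
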