├── .ipynb_checkpoints
│   ├── Basic Code-DM-checkpoint.ipynb
│   ├── Basic Code-checkpoint.ipynb
│   ├── Julia Code-checkpoint.ipynb
│   ├── Naive Bayes - CrossVal-checkpoint.ipynb
│   ├── Naive Bayes Code-checkpoint.ipynb
│   ├── Test_SFO_OAK_FileGeneration-checkpoint.ipynb
│   ├── Untitled0-checkpoint.ipynb
│   ├── Untitled1-checkpoint.ipynb
│   ├── Untitled2-checkpoint.ipynb
│   └── Untitled3-checkpoint.ipynb
├── EDA_and_NB_performance_charts.py
├── INFO290T_Final_Project_Presentation_vFINAL.pptx
├── Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx
├── Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf
├── Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx
├── Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf
├── NB_performance_charts.py
├── Old Python Code
│   ├── Basic Code-DM.ipynb
│   ├── Basic.py
│   ├── Dest.pkl
│   ├── Julia Code.ipynb
│   ├── NB.py
│   ├── Naive Bayes - CrossVal.ipynb
│   ├── Naive Bayes Code.ipynb
│   ├── Origin.pkl
│   ├── TailNum.pkl
│   ├── Test_SFO_OAK_FileGeneration.ipynb
│   ├── UniqueCarrier.pkl
│   ├── Untitled0.ipynb
│   ├── Untitled1.ipynb
│   ├── Untitled2.ipynb
│   ├── Untitled3.ipynb
│   ├── accuracy.pkl
│   ├── counter.py
│   ├── counter1.py
│   ├── data_reader_v2.py
│   ├── data_reader_v3.py
│   ├── data_reader_v4_ek.py
│   ├── date_iterator_plot.py
│   ├── logisticRegression.py
│   ├── matrix.pkl
│   ├── model_selector.py
│   ├── output.txt
│   ├── prec.pkl
│   ├── results.pkl
│   └── why.csv
├── README.md
├── data_reader_v4_ek.py
├── data_reader_v4_ek_rj_csv.py
├── date_graph2.py
├── date_iterator_plot2.py
├── logisticRegression.py
├── lr_app2.py
├── model_selector.py
└── naive bayes.py
/.ipynb_checkpoints/Julia Code-checkpoint.ipynb: --------------------------------------------------------------------------------
1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:da393243e5798034294abbf7f55af08e5d9a8eecbfb718fe2ddd80cd4a4d11b5" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import csv\n", 16 | "import pickle\n", 17 | "\n", 18 | "needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]\n", 19 | "years = [2008]\n", 20 | "\n", 21 | "def ComputeDayofYear(row):\n", 22 | " \"\"\"This function will return an integer to represent the day of the year given an integer\n", 23 | " representing month and an integer representing the day of the month. This number will\n", 24 | " correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned\n", 25 | " as 0.
Feb 29th will be returned as 59.\"\"\"\n", 26 | "\n", 27 | " if(row[0] == '1'):\n", 28 | " calc = 0 + int(row[1]) - 1\n", 29 | " row[1] = str(calc)\n", 30 | " elif(row[0] == '2'):\n", 31 | " calc = 31 + int(row[1]) - 1\n", 32 | " row[1] = str(calc)\n", 33 | " elif(row[0] == '3'):\n", 34 | " calc = 60 + int(row[1]) - 1\n", 35 | " row[1] = str(calc)\n", 36 | " elif(row[0] == '4'):\n", 37 | " calc = 91 + int(row[1]) - 1\n", 38 | " row[1] = str(calc)\n", 39 | " elif(row[0] == '5'):\n", 40 | " calc = 121 + int(row[1]) - 1\n", 41 | " row[1] = str(calc)\n", 42 | " elif(row[0] == '6'):\n", 43 | " calc = 152 + int(row[1]) - 1\n", 44 | " row[1] = str(calc)\n", 45 | " elif(row[0] == '7'):\n", 46 | " calc = 182 + int(row[1]) - 1\n", 47 | " row[1] = str(calc)\n", 48 | " elif(row[0] == '8'):\n", 49 | " calc = 213 + int(row[1]) - 1\n", 50 | " row[1] = str(calc)\n", 51 | " elif(row[0] == '9'):\n", 52 | " calc = 244 + int(row[1]) - 1\n", 53 | " row[1] = str(calc)\n", 54 | " elif(row[0] == '10'):\n", 55 | " calc = 274 + int(row[1]) - 1\n", 56 | " row[1] = str(calc)\n", 57 | " elif(row[0] == '11'):\n", 58 | " calc = 305 + int(row[1]) - 1\n", 59 | " row[1] = str(calc)\n", 60 | " elif(row[0] == '12'):\n", 61 | " calc = 335 + int(row[1]) - 1\n", 62 | " row[1] = str(calc)\n", 63 | " return row\n", 64 | "\n", 65 | "\n", 66 | "def DiscretizeDepTime(row):\n", 67 | " \"\"\"This function takes a scheduled departure time, classifies the departure time as:\n", 68 | " morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value\n", 69 | " is assumed to be an integer in 24-hour time format. These labels will correspond to\n", 70 | " variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.\n", 71 | " An error time is returned as morning.\"\"\"\n", 72 | "\n", 73 | " if(int(row[3]) <= 559):\n", 74 | " row[3] = '2'\n", 75 | " elif(int(row[3]) >= 600 and int(row[3]) <= 1259):\n", 76 | " row[3] = '0'\n", 77 | " elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):\n", 78 | " row[3] = '1'\n", 79 | " elif(int(row[3]) >= 1800):\n", 80 | " row[3] = '2'\n", 81 | " else:\n", 82 | " row[3] = '0'\n", 83 | " return row\n", 84 | "\n", 85 | "\n", 86 | "def AddDepVar(row):\n", 87 | " \"\"\"This function adds a classification label based on the length of the recorded\n", 88 | " Departure Delay in the data set. It assumes an input integer value of the delay in mins.\n", 89 | " By airline industry standards, flight delays are defined as departure delays greater than\n", 90 | " or equal to 15 minutes. For delayed flights, this variable will have value \"1\".\n", 91 | " For on time flights, it will have value \"0\". 
Default value will be set at \"0\".\"\"\"\n", 92 | "\n", 93 | " if(int(row[6]) >= 15):\n", 94 | " row[6] = '1'\n", 95 | " else:\n", 96 | " row[6] = '0'\n", 97 | " return row\n", 98 | "\n", 99 | "def SaveData(data, pickle_file_name):\n", 100 | " \"\"\"This function pickles each file.\"\"\"\n", 101 | "\n", 102 | " f = open(pickle_file_name, \"wb\")\n", 103 | " pickle.dump(data, f)\n", 104 | " f.close()\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "for i in years:\n", 109 | " data = []\n", 110 | " file_path='C:\\\\data\\\\airline\\\\'+str(i) + '.csv'\n", 111 | " pickle_file_name = 'data' + str(i)\n", 112 | " with open(file_path, 'r') as data_csv:\n", 113 | " csv_reader = csv.reader(data_csv, delimiter=',')\n", 114 | " for row in list(csv_reader):\n", 115 | " if row[21] == '0':\n", 116 | " content = list(row[col] for col in needed_cols)\n", 117 | " content2 = ComputeDayofYear(content)\n", 118 | " content3 = DiscretizeDepTime(content2)\n", 119 | " content4 = AddDepVar(content3)\n", 120 | " data.append(content4)\n", 121 | " SaveData(data, pickle_file_name)" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | } 127 | ], 128 | "metadata": {} 129 | } 130 | ] 131 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Naive Bayes Code-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:4dd7867e8934ba7980fd61f1cdbc7df7ff1cccafc3f287e3da0b94562583a3d7" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import *\n", 20 | "from sklearn.metrics import *\n", 21 | "import os\n", 22 | "import cPickle\n", 23 | "import sys\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from optparse import OptionParser\n", 27 | "from sklearn import metrics, preprocessing\n", 28 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 29 | "from sklearn.ensemble import AdaBoostClassifier\n", 30 | "from sklearn import cross_validation\n", 31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n", 32 | "from sklearn.svm import SVC # support vector machine classifier\n", 33 | "from sklearn.grid_search import GridSearchCV # hyperparameter grid search to find best model parameters\n", 34 | "from sklearn import preprocessing # preprocess string labels into numerics\n", 35 | "from sklearn import *\n", 36 | "from sklearn.metrics import precision_recall_fscore_support\n", 37 | "from sklearn.metrics import classification_report" 38 | ], 39 | "language": "python", 40 | "metadata": {}, 41 | "outputs": [], 42 | "prompt_number": 197 43 | }, 44 | { 45 | "cell_type": "code", 46 | "collapsed": false, 47 | "input": [ 48 | "# Setting up constants\n", 49 | "print \"Setting constants...\"\n", 50 | "\n", 51 | "TRAINING_LINE_NUMBER = 1000000\n", 52 | "YEARS = ['2008']\n", 53 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 54 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 55 | "# YEARS = ['2008']\n", 56 | "\n", 57 | "SKIP_FIRST_LINE = True # To skip the first line, as it's the header\n", 58 | "\n", 59 | "master = []\n", 60 | "print \"Reading into Pandas frame...\"\n", 61 | "try:\n", 62 | " for year in YEARS:\n",
63 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 64 | " print \"\\n\",path\n", 65 | " dfPart = pd.read_csv(\n", 66 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 67 | " u'Year', \n", 68 | " u'Month', \n", 69 | " u'DayofMonth', \n", 70 | " u'DayOfWeek', \n", 71 | " u'UniqueCarrier',\n", 72 | " u'DepTime', \n", 73 | " u'TailNum', \n", 74 | " u'Origin', \n", 75 | " u'Dest', \n", 76 | " u'DepDelay', \n", 77 | "# u'ArrDelay', \n", 78 | " u'Cancelled',\n", 79 | "# u'ArrTime',\n", 80 | "# u'ArrDelay',\n", 81 | "# u'Distance'\n", 82 | " ])\n", 83 | " print len(dfPart)\n", 84 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 85 | " print \"Removed cancelled flights, new length - \",len(dfPart)\n", 86 | " master.append(dfPart)\n", 87 | " print\n", 88 | "except Exception as e:\n", 89 | " print \"Supplemental Data Import failed\", e\n", 90 | "\n", 91 | "dfMaster = pd.concat(master, ignore_index=True)\n", 92 | "master=[]\n", 93 | "dfPart=[]\n", 94 | "\n", 95 | "print \"Total length - \", len(dfMaster)\n", 96 | "del dfMaster['Cancelled']\n", 97 | "\n", 98 | "dfMaster.fillna(0, inplace=True)\n", 99 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 100 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 101 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n", 102 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 103 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 104 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n", 105 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n", 106 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 107 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n", 108 | "\n", 109 | "df = dfMaster\n", 110 | "\n", 111 | "print \"Calculating classification label...\"\n", 112 | "df['label'] = 0\n", 113 | "df.label[df.DepDelay >= 15] = 1\n", 114 | "df.label[df.DepDelay < 15] = 0\n", 115 | "\n", 116 | "df['DepDelay'][df.DepDelay < 0]=0\n", 117 | "del df['DepDelay']\n", 118 | "# df['ArrDelay'][df.ArrDelay < 0]=0\n", 119 | "\n", 120 | "print \"Dataframe shape - \",df.shape\n", 121 | "print \"Columns -\", df.columns" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "output_type": "stream", 128 | "stream": "stdout", 129 | "text": [ 130 | "Setting constants...\n", 131 | "Reading into Pandas frame...\n", 132 | "\n", 133 | "C:\\data\\airline\\2008.csv\n", 134 | "1000000" 135 | ] 136 | }, 137 | { 138 | "output_type": "stream", 139 | "stream": "stdout", 140 | "text": [ 141 | "\n", 142 | "Removed cancelled flights, new length - " 143 | ] 144 | }, 145 | { 146 | "output_type": "stream", 147 | "stream": "stdout", 148 | "text": [ 149 | " 967867\n", 150 | "\n", 151 | "Total length - " 152 | ] 153 | }, 154 | { 155 | "output_type": "stream", 156 | "stream": "stdout", 157 | "text": [ 158 | " 967867\n", 159 | "Calculating classification label..." 
160 | ] 161 | }, 162 | { 163 | "output_type": "stream", 164 | "stream": "stdout", 165 | "text": [ 166 | "\n", 167 | "Dataframe shape - (967867, 10)\n", 168 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n" 169 | ] 170 | } 171 | ], 172 | "prompt_number": 198 173 | }, 174 | { 175 | "cell_type": "code", 176 | "collapsed": false, 177 | "input": [ 178 | "print \"Converting categorical data to numeric...\"\n", 179 | "for col in set(df.columns):\n", 180 | "# print col, train[col].dtype\n", 181 | " if df[col].dtype == np.dtype('object'):\n", 182 | " print \"Converting...\", col\n", 183 | " if col == 'TailNum':\n", 184 | " s = np.unique(df[col].values)\n", 185 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 186 | "# print TailNum\n", 187 | " if col == 'UniqueCarrier':\n", 188 | " s = np.unique(df[col].values)\n", 189 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 190 | "# print UniqueCarrier\n", 191 | " if col == 'Dest':\n", 192 | " s = np.unique(df[col].values)\n", 193 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 194 | "# print Dest\n", 195 | " if col == 'Origin':\n", 196 | " s = np.unique(df[col].values)\n", 197 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 198 | "# print Origin\n", 199 | "\n", 200 | "\n", 201 | "def getTailNum(inTailNum):\n", 202 | "# print \"In...\",type(inTailNum)\n", 203 | " out = []\n", 204 | " for x, y in inTailNum.iteritems():\n", 205 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 206 | " out.append(TailNum.get_value(y) + 1)\n", 207 | "# print \"final out\", out\n", 208 | " return out\n", 209 | "\n", 210 | "\n", 211 | "def getDest(inDest):\n", 212 | " out = []\n", 213 | " for x, y in inDest.iteritems():\n", 214 | " out.append(Dest.get_value(y) + 1)\n", 215 | " return out\n", 216 | "\n", 217 | "\n", 218 | "def getOrigin(inOrign):\n", 219 | " out = []\n", 220 | " for x, y in inOrign.iteritems():\n", 221 | " out.append(Origin.get_value(y) + 1)\n", 222 | " return out\n", 223 | "\n", 224 | "\n", 225 | "def getCarrier(inCarrier):\n", 226 | " out = []\n", 227 | " for x, y in inCarrier.iteritems():\n", 228 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 229 | " return out\n", 230 | "\n", 231 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 232 | "print \"TailNum completed.\"\n", 233 | "\n", 234 | "df['Dest'] = getDest(df['Dest'])\n", 235 | "print \"Dest completed.\"\n", 236 | "\n", 237 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 238 | "print \"UniqueCarrier completed.\"\n", 239 | "\n", 240 | "df['Origin'] = getOrigin(df['Origin'])\n", 241 | "print \"Origin completed.\"\n", 242 | "\n", 243 | "print \"Conversion to numeric completed.\"\n", 244 | "\n", 245 | "# print \"Pickling converted data...\"\n", 246 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")" 247 | ], 248 | "language": "python", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | "Converting categorical data to numeric...\n", 256 | "Converting... Origin\n", 257 | "Converting..." 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | " UniqueCarrier\n", 265 | "Converting..." 266 | ] 267 | }, 268 | { 269 | "output_type": "stream", 270 | "stream": "stdout", 271 | "text": [ 272 | " Dest\n", 273 | "Converting..." 
274 | ] 275 | }, 276 | { 277 | "output_type": "stream", 278 | "stream": "stdout", 279 | "text": [ 280 | " TailNum\n", 281 | "TailNum completed." 282 | ] 283 | }, 284 | { 285 | "output_type": "stream", 286 | "stream": "stdout", 287 | "text": [ 288 | "\n", 289 | "Dest completed." 290 | ] 291 | }, 292 | { 293 | "output_type": "stream", 294 | "stream": "stdout", 295 | "text": [ 296 | "\n", 297 | "UniqueCarrier completed." 298 | ] 299 | }, 300 | { 301 | "output_type": "stream", 302 | "stream": "stdout", 303 | "text": [ 304 | "\n", 305 | "Origin completed." 306 | ] 307 | }, 308 | { 309 | "output_type": "stream", 310 | "stream": "stdout", 311 | "text": [ 312 | "\n", 313 | "Conversion to numeric completed.\n" 314 | ] 315 | } 316 | ], 317 | "prompt_number": 199 318 | }, 319 | { 320 | "cell_type": "code", 321 | "collapsed": false, 322 | "input": [ 323 | "Origin['SFO'], Origin['OAK']" 324 | ], 325 | "language": "python", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "metadata": {}, 330 | "output_type": "pyout", 331 | "prompt_number": 200, 332 | "text": [ 333 | "(243, 192)" 334 | ] 335 | } 336 | ], 337 | "prompt_number": 200 338 | }, 339 | { 340 | "cell_type": "code", 341 | "collapsed": false, 342 | "input": [ 343 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n", 344 | "\n", 345 | "# add columns to your data frame\n", 346 | "\n", 347 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n", 348 | "\n", 349 | "# define training and test sets\n", 350 | "train = df[df['is_train'] == True]\n", 351 | "test = df[df['is_train'] == False]\n", 352 | "trainTargets = np.array(train['label']).astype(int)\n", 353 | "testTargets = np.array(test['label']).astype(int)\n", 354 | "features = df.columns[0:9]\n", 355 | "\n", 356 | "testSFO = test[test['Dest']==Origin['SFO']]\n", 357 | "print len(testSFO)\n", 358 | "\n", 359 | "testOAK = test[test['Dest']==Origin['OAK']]\n", 360 | "print len(testOAK)\n", 361 | "\n", 362 | "print \"Model fitting and prediction started...\"\n", 363 | "gnb = tree.DecisionTreeClassifier()\n", 364 | "\n", 365 | "# train model\n", 366 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n", 367 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n", 368 | "\n", 369 | "print \"Classification completed.\"" 370 | ], 371 | "language": "python", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "output_type": "stream", 376 | "stream": "stdout", 377 | "text": [ 378 | "Begin classification...75% training, 25% testing, randomly chosen\n", 379 | "887" 380 | ] 381 | }, 382 | { 383 | "output_type": "stream", 384 | "stream": "stdout", 385 | "text": [ 386 | "\n", 387 | "39\n", 388 | "Model fitting and prediction started...\n", 389 | "Classification completed." 
390 | ] 391 | }, 392 | { 393 | "output_type": "stream", 394 | "stream": "stdout", 395 | "text": [ 396 | "\n" 397 | ] 398 | } 399 | ], 400 | "prompt_number": 215 401 | }, 402 | { 403 | "cell_type": "code", 404 | "collapsed": false, 405 | "input": [ 406 | "features" 407 | ], 408 | "language": "python", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "metadata": {}, 413 | "output_type": "pyout", 414 | "prompt_number": 216, 415 | "text": [ 416 | "Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')" 417 | ] 418 | } 419 | ], 420 | "prompt_number": 216 421 | }, 422 | { 423 | "cell_type": "code", 424 | "collapsed": false, 425 | "input": [ 426 | "print \"Calculating metrics...\"\n", 427 | "# test['pred_label'] = y_gnb\n", 428 | "# test.head()\n", 429 | "acc = zip(test['label'], y_gnb)\n", 430 | "match_count = 0\n", 431 | "for i in acc:\n", 432 | " if i[0] == i[1]:\n", 433 | " match_count += 1\n", 434 | "print \"Matches - \", match_count\n", 435 | "print \"Total length - \", len(acc)\n", 436 | "print \"Accuracy:\", float(match_count) / len(acc)" 437 | ], 438 | "language": "python", 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "output_type": "stream", 443 | "stream": "stdout", 444 | "text": [ 445 | "Calculating metrics...\n", 446 | "Matches - " 447 | ] 448 | }, 449 | { 450 | "output_type": "stream", 451 | "stream": "stdout", 452 | "text": [ 453 | " 184048\n", 454 | "Total length - 242386\n", 455 | "Accuracy: 0.75931778238\n" 456 | ] 457 | } 458 | ], 459 | "prompt_number": 217 460 | }, 461 | { 462 | "cell_type": "code", 463 | "collapsed": false, 464 | "input": [ 465 | "print accuracy_score(test['label'],y_gnb)\n", 466 | "print metrics.confusion_matrix(test['label'],y_gnb)" 467 | ], 468 | "language": "python", 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "output_type": "stream", 473 | "stream": "stdout", 474 | "text": [ 475 | "0.75931778238\n", 476 | "[[157152 29405]\n", 477 | " [ 28933 26896]]" 478 | ] 479 | }, 480 | { 481 | "output_type": "stream", 482 | "stream": "stdout", 483 | "text": [ 484 | "\n" 485 | ] 486 | } 487 | ], 488 | "prompt_number": 218 489 | }, 490 | { 491 | "cell_type": "code", 492 | "collapsed": false, 493 | "input": [ 494 | "gnb.feature_importances_" 495 | ], 496 | "language": "python", 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "metadata": {}, 501 | "output_type": "pyout", 502 | "prompt_number": 219, 503 | "text": [ 504 | "array([ 0.
, 0.01151212, 0.0552584 , 0.03722765, 0.28496385,\n", 505 | " 0.07264084, 0.2130565 , 0.16164198, 0.16369866])" 506 | ] 507 | } 508 | ], 509 | "prompt_number": 219 510 | }, 511 | { 512 | "cell_type": "code", 513 | "collapsed": false, 514 | "input": [ 515 | "features" 516 | ], 517 | "language": "python", 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "metadata": {}, 522 | "output_type": "pyout", 523 | "prompt_number": 222, 524 | "text": [ 525 | "Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')" 526 | ] 527 | } 528 | ], 529 | "prompt_number": 222 530 | }, 531 | { 532 | "cell_type": "code", 533 | "collapsed": false, 534 | "input": [ 535 | "# average_precision_score(test['label'],y_gnb)\n", 536 | "precision_recall_fscore_support(test['label'],y_gnb,average='micro')" 537 | ], 538 | "language": "python", 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "metadata": {}, 543 | "output_type": "pyout", 544 | "prompt_number": 223, 545 | "text": [ 546 | "(0.47771798014244865, 0.48175679306453634, 0.47972888611433151, 55829)" 547 | ] 548 | } 549 | ], 550 | "prompt_number": 223 551 | }, 552 | { 553 | "cell_type": "code", 554 | "collapsed": false, 555 | "input": [ 556 | "# dfMaster['FlightDate'] =pd.to_datetime(dfMaster.Year*10000+dfMaster.Month*100+dfMaster.DayofMonth,format='%Y%m%d')" 557 | ], 558 | "language": "python", 559 | "metadata": {}, 560 | "outputs": [], 561 | "prompt_number": 206 562 | }, 563 | { 564 | "cell_type": "code", 565 | "collapsed": false, 566 | "input": [ 567 | "# dfAirport = dfMaster[['FlightDate','Origin']].groupby([dfMaster['FlightDate'],dfMaster['Origin']]).agg([len])\n", 568 | "# # dfAirport.to_clipboard()\n", 569 | "# dfAirport" 570 | ], 571 | "language": "python", 572 | "metadata": {}, 573 | "outputs": [], 574 | "prompt_number": 207 575 | }, 576 | { 577 | "cell_type": "code", 578 | "collapsed": false, 579 | "input": [ 580 | "print y_gnb[:10]\n", 581 | "print y_prob[:10]" 582 | ], 583 | "language": "python", 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "output_type": "stream", 588 | "stream": "stdout", 589 | "text": [ 590 | "[0 0 0 1 1 0 0 0 1 1]\n", 591 | "[[ 1. 0.]\n", 592 | " [ 1. 0.]\n", 593 | " [ 1. 0.]\n", 594 | " [ 0. 1.]\n", 595 | " [ 0. 1.]\n", 596 | " [ 1. 0.]\n", 597 | " [ 1. 0.]\n", 598 | " [ 1. 0.]\n", 599 | " [ 0. 1.]\n", 600 | " [ 0. 
1.]]\n" 601 | ] 602 | } 603 | ], 604 | "prompt_number": 224 605 | }, 606 | { 607 | "cell_type": "code", 608 | "collapsed": false, 609 | "input": [ 610 | "dfMaster[:100].to_csv(\"C:\\\\data\\\\airline\\\\SampleData.csv\")" 611 | ], 612 | "language": "python", 613 | "metadata": {}, 614 | "outputs": [], 615 | "prompt_number": 227 616 | } 617 | ], 618 | "metadata": {} 619 | } 620 | ] 621 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled0-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:44f7be7f1af03eb634d779b3e3fc1b7473ad8af24b380e9e53f9a15ad5274aaf" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import *\n", 20 | "from sklearn.metrics import *\n", 21 | "import os\n", 22 | "import cPickle\n", 23 | "import sys\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from optparse import OptionParser\n", 27 | "from sklearn import metrics, preprocessing\n", 28 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 29 | "from sklearn.ensemble import AdaBoostClassifier\n", 30 | "from sklearn import cross_validation\n", 31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n", 32 | "from sklearn.svm import SVC # support vector machine classifier\n", 33 | "# hyperparameter grid search to find best model parameters\n", 34 | "from sklearn.grid_search import GridSearchCV\n", 35 | "from sklearn import preprocessing # preprocess string labels into numerics\n", 36 | "from sklearn import *\n", 37 | "from sklearn.metrics import precision_recall_fscore_support\n", 38 | "from sklearn.metrics import classification_report\n", 39 | "\n", 40 | "\n", 41 | "# In[135]:\n", 42 | "\n", 43 | "# Setting up constants\n", 44 | "print \"Setting constants...\"\n", 45 | "\n", 46 | "TRAINING_LINE_NUMBER = 500000\n", 47 | "YEARS = ['2006', '2008', '2007']\n", 48 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 49 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 50 | "# YEARS = ['2008']\n", 51 | "\n", 52 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n", 53 | "\n", 54 | "master = []\n", 55 | "print \"Reading into Pandas frame...\"\n", 56 | "try:\n", 57 | " for year in YEARS:\n", 58 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 59 | " print \"\\n\", path\n", 60 | " dfPart = pd.read_csv(\n", 61 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 62 | " u'Year',\n", 63 | " u'Month',\n", 64 | " u'DayofMonth',\n", 65 | " u'DayOfWeek',\n", 66 | " u'UniqueCarrier',\n", 67 | " u'DepTime',\n", 68 | " u'TailNum',\n", 69 | " u'Origin',\n", 70 | " u'Dest',\n", 71 | " u'DepDelay',\n", 72 | " # u'ArrDelay',\n", 73 | " u'Cancelled',\n", 74 | " # u'ArrTime',\n", 75 | " # u'ArrDelay',\n", 76 | " # u'Distance'\n", 77 | " ])\n", 78 | " print len(dfPart)\n", 79 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 80 | " print \"Removed cancelled flights, new length - \", len(dfPart)\n", 81 | " master.append(dfPart)\n", 82 | " print\n", 83 | "except Exception as e:\n", 84 | " print \"Supplemental Data Import failed\", e\n", 85 | "\n", 86 | "dfMaster = pd.concat(master, 
ignore_index=True)\n", 87 | "master = []\n", 88 | "dfPart = []\n", 89 | "\n", 90 | "print \"Total length - \", len(dfMaster)\n", 91 | "del dfMaster['Cancelled']\n", 92 | "\n", 93 | "dfMaster.fillna(0, inplace=True)\n", 94 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 95 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 96 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n", 97 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 98 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 99 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n", 100 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n", 101 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 102 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n", 103 | "\n", 104 | "df = dfMaster\n", 105 | "\n", 106 | "print \"Calculating classification label...\"\n", 107 | "df['label'] = 0\n", 108 | "df.label[df.DepDelay >= 15] = 1\n", 109 | "df.label[df.DepDelay < 15] = 0\n", 110 | "\n", 111 | "# df['DepDelay'][df.DepDelay < 0] = 0\n", 112 | "del df['DepDelay']\n", 113 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n", 114 | "\n", 115 | "print \"Dataframe shape - \", df.shape\n", 116 | "print \"Columns -\", df.columns\n", 117 | "\n", 118 | "\n", 119 | "# In[136]:\n", 120 | "\n", 121 | "print \"Converting categorical data to numeric...\"\n", 122 | "for col in set(df.columns):\n", 123 | "# print col, train[col].dtype\n", 124 | " if df[col].dtype == np.dtype('object'):\n", 125 | " print \"Converting...\", col\n", 126 | " if col == 'TailNum':\n", 127 | " s = np.unique(df[col].values)\n", 128 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 129 | "# print TailNum\n", 130 | " if col == 'UniqueCarrier':\n", 131 | " s = np.unique(df[col].values)\n", 132 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 133 | "# print UniqueCarrier\n", 134 | " if col == 'Dest':\n", 135 | " s = np.unique(df[col].values)\n", 136 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 137 | "# print Dest\n", 138 | " if col == 'Origin':\n", 139 | " s = np.unique(df[col].values)\n", 140 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 141 | "# print Origin\n", 142 | "\n", 143 | "\n", 144 | "def getTailNum(inTailNum):\n", 145 | "# print \"In...\",type(inTailNum)\n", 146 | " out = []\n", 147 | " for x, y in inTailNum.iteritems():\n", 148 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 149 | " out.append(TailNum.get_value(y) + 1)\n", 150 | "# print \"final out\", out\n", 151 | " return out\n", 152 | "\n", 153 | "\n", 154 | "def getDest(inDest):\n", 155 | " out = []\n", 156 | " for x, y in inDest.iteritems():\n", 157 | " out.append(Dest.get_value(y) + 1)\n", 158 | " return out\n", 159 | "\n", 160 | "\n", 161 | "def getOrigin(inOrign):\n", 162 | " out = []\n", 163 | " for x, y in inOrign.iteritems():\n", 164 | " out.append(Origin.get_value(y) + 1)\n", 165 | " return out\n", 166 | "\n", 167 | "\n", 168 | "def getCarrier(inCarrier):\n", 169 | " out = []\n", 170 | " for x, y in inCarrier.iteritems():\n", 171 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 172 | " return out\n", 173 | "\n", 174 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 175 | "print \"TailNum completed.\"\n", 176 | "\n", 177 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 178 | "print \"UniqueCarrier completed.\"\n", 179 | "\n", 180 | "df['Dest'] = getDest(df['Dest'])\n", 181 | "print \"Dest completed.\"\n", 182 | "\n", 
183 | "df['Origin'] = getOrigin(df['Origin'])\n", 184 | "print \"Origin completed.\"\n", 185 | "\n", 186 | "print \"Conversion to numeric completed.\"\n", 187 | "\n", 188 | "# print \"Pickling converted data...\"\n", 189 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")\n" 190 | ], 191 | "language": "python", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "output_type": "stream", 196 | "stream": "stdout", 197 | "text": [ 198 | "Setting constants...\n", 199 | "Reading into Pandas frame...\n", 200 | "\n", 201 | "C:\\data\\airline\\2006.csv\n", 202 | "500000" 203 | ] 204 | }, 205 | { 206 | "output_type": "stream", 207 | "stream": "stdout", 208 | "text": [ 209 | "\n", 210 | "Removed cancelled flights, new length - " 211 | ] 212 | }, 213 | { 214 | "output_type": "stream", 215 | "stream": "stdout", 216 | "text": [ 217 | " 491158\n", 218 | "\n", 219 | "\n", 220 | "C:\\data\\airline\\2008.csv\n", 221 | "500000" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | "\n", 229 | "Removed cancelled flights, new length - " 230 | ] 231 | }, 232 | { 233 | "output_type": "stream", 234 | "stream": "stdout", 235 | "text": [ 236 | " 484708\n", 237 | "\n", 238 | "\n", 239 | "C:\\data\\airline\\2007.csv\n", 240 | "500000" 241 | ] 242 | }, 243 | { 244 | "output_type": "stream", 245 | "stream": "stdout", 246 | "text": [ 247 | "\n", 248 | "Removed cancelled flights, new length - " 249 | ] 250 | }, 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | " 487243\n", 256 | "\n", 257 | "Total length - " 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | " 1463109\n", 265 | "Calculating classification label..." 266 | ] 267 | }, 268 | { 269 | "output_type": "stream", 270 | "stream": "stdout", 271 | "text": [ 272 | "\n", 273 | "Dataframe shape - (1463109, 10)\n", 274 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n", 275 | "Converting categorical data to numeric...\n", 276 | "Converting..." 277 | ] 278 | }, 279 | { 280 | "output_type": "stream", 281 | "stream": "stdout", 282 | "text": [ 283 | " Origin\n", 284 | "Converting..." 285 | ] 286 | }, 287 | { 288 | "output_type": "stream", 289 | "stream": "stdout", 290 | "text": [ 291 | " UniqueCarrier\n", 292 | "Converting..." 293 | ] 294 | }, 295 | { 296 | "output_type": "stream", 297 | "stream": "stdout", 298 | "text": [ 299 | " Dest\n", 300 | "Converting..." 301 | ] 302 | }, 303 | { 304 | "output_type": "stream", 305 | "stream": "stdout", 306 | "text": [ 307 | " TailNum\n", 308 | "TailNum completed." 309 | ] 310 | }, 311 | { 312 | "output_type": "stream", 313 | "stream": "stdout", 314 | "text": [ 315 | "\n", 316 | "UniqueCarrier completed." 317 | ] 318 | }, 319 | { 320 | "output_type": "stream", 321 | "stream": "stdout", 322 | "text": [ 323 | "\n", 324 | "Dest completed." 325 | ] 326 | }, 327 | { 328 | "output_type": "stream", 329 | "stream": "stdout", 330 | "text": [ 331 | "\n", 332 | "Origin completed." 
333 | ] 334 | }, 335 | { 336 | "output_type": "stream", 337 | "stream": "stdout", 338 | "text": [ 339 | "\n", 340 | "Conversion to numeric completed.\n" 341 | ] 342 | } 343 | ], 344 | "prompt_number": 13 345 | }, 346 | { 347 | "cell_type": "code", 348 | "collapsed": false, 349 | "input": [ 350 | "\n", 351 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n", 352 | "\n", 353 | "# add columns to your data frame\n", 354 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n", 355 | "\n", 356 | "# define training and test sets\n", 357 | "train = df[df['is_train'] == True]\n", 358 | "test = df[df['is_train'] == False]\n", 359 | "trainTargets = np.array(train['label']).astype(int)\n", 360 | "testTargets = np.array(test['label']).astype(int)\n", 361 | "features = df.columns[0:9]\n", 362 | "print \"Features - \",features\n", 363 | "print \"Model fitting and prediction started...\"\n", 364 | "gnb = GaussianNB()\n", 365 | "\n", 366 | "# train model\n", 367 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n", 368 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n", 369 | "\n", 370 | "print \"Classification completed.\"" 371 | ], 372 | "language": "python", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "stream": "stdout", 378 | "text": [ 379 | "Begin classification...75% training, 25% testing, randomly chosen\n", 380 | "Features - " 381 | ] 382 | }, 383 | { 384 | "output_type": "stream", 385 | "stream": "stdout", 386 | "text": [ 387 | " Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')\n", 388 | "Model fitting and prediction started...\n", 389 | "Classification completed." 
390 | ] 391 | }, 392 | { 393 | "output_type": "stream", 394 | "stream": "stdout", 395 | "text": [ 396 | "\n", 397 | "Calculating metrics...\n", 398 | "Accuracy - 0.798698653544\n", 399 | "Confusion matrix\n", 400 | "[[291966 106]\n", 401 | " [ 73525 178]]" 402 | ] 403 | }, 404 | { 405 | "output_type": "stream", 406 | "stream": "stdout", 407 | "text": [ 408 | "\n", 409 | "Precision - " 410 | ] 411 | }, 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "0.62676056338\n", 417 | "Recall - " 418 | ] 419 | }, 420 | { 421 | "output_type": "stream", 422 | "stream": "stdout", 423 | "text": [ 424 | "0.00241509843561\n" 425 | ] 426 | } 427 | ], 428 | "prompt_number": 14 429 | }, 430 | { 431 | "cell_type": "code", 432 | "collapsed": false, 433 | "input": [ 434 | "print \"Calculating metrics...\"\n", 435 | "print \"Accuracy - \", accuracy_score(test['label'], y_gnb)\n", 436 | "print \"Confusion matrix\\n\", metrics.confusion_matrix(test['label'], y_gnb,labels=(0,1))\n", 437 | "print \"Precision - \", precision_score(test['label'], y_gnb)\n", 438 | "print \"Recall - \", recall_score(test['label'], y_gnb)\n" 439 | ], 440 | "language": "python", 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "output_type": "stream", 445 | "stream": "stdout", 446 | "text": [ 447 | "Calculating metrics...\n", 448 | "Accuracy - 0.798698653544\n", 449 | "Confusion matrix\n", 450 | "[[291966 106]\n", 451 | " [ 73525 178]]" 452 | ] 453 | }, 454 | { 455 | "output_type": "stream", 456 | "stream": "stdout", 457 | "text": [ 458 | "\n", 459 | "Precision - " 460 | ] 461 | }, 462 | { 463 | "output_type": "stream", 464 | "stream": "stdout", 465 | "text": [ 466 | "0.62676056338\n", 467 | "Recall - " 468 | ] 469 | }, 470 | { 471 | "output_type": "stream", 472 | "stream": "stdout", 473 | "text": [ 474 | "0.00241509843561\n" 475 | ] 476 | } 477 | ], 478 | "prompt_number": 25 479 | }, 480 | { 481 | "cell_type": "code", 482 | "collapsed": false, 483 | "input": [ 484 | "testSFO = test[test['Origin'] == Origin['SFO']]\n", 485 | "print len(testSFO)\n", 486 | "\n", 487 | "testOAK = test[test['Origin'] == Origin['OAK']]\n", 488 | "print len(testOAK)\n" 489 | ], 490 | "language": "python", 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "output_type": "stream", 495 | "stream": "stdout", 496 | "text": [ 497 | "3563\n", 498 | "40\n" 499 | ] 500 | } 501 | ], 502 | "prompt_number": 22 503 | }, 504 | { 505 | "cell_type": "code", 506 | "collapsed": false, 507 | "input": [ 508 | " np.random.randint(2000, size=10)\n", 509 | " " 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "metadata": {}, 516 | "output_type": "pyout", 517 | "prompt_number": 27, 518 | "text": [ 519 | "array([ 437, 1815, 742, 148, 1399, 1171, 205, 1480, 838, 1437])" 520 | ] 521 | } 522 | ], 523 | "prompt_number": 27 524 | } 525 | ], 526 | "metadata": {} 527 | } 528 | ] 529 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d3fd45c3529abf0b735e3b409e8980ec4b2e4e445277ba0cf2522e16729ae159" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import sys\n", 17 | "import csv\n", 18 | "import datetime\n",
"import matplotlib.pyplot as plt; plt.rcdefaults()\n", 20 | "\n", 21 | "TIME_DELTA = 3\n", 22 | "\n", 23 | "for arg in sys.argv:\n", 24 | "\tif(arg != 'date_graph.py'):\n", 25 | "\t\tstart_date = datetime.datetime.strptime(arg, '%m-%d-%y')\n", 26 | "\t\tstart_date = datetime.date(start_date.year, start_date.month, start_date.day)\n", 27 | "\n", 28 | "delta = datetime.timedelta(days=TIME_DELTA)\n", 29 | "begin = start_date - delta\n", 30 | "end = start_date + delta\n", 31 | "\n", 32 | "SFO_Hash = {}\n", 33 | "OAK_Hash = {}\n", 34 | "SFO_count = 0\n", 35 | "OAK_count = 0\n", 36 | "with open('_dfTest2008.csv', 'r') as data:\n", 37 | "\tcsv_reader = csv.reader(data, delimiter=',')\n", 38 | "\tfor row in csv_reader:\n", 39 | "\t\tif(row[0] != 'Year'):\n", 40 | "\t\t\tyear = int(row[0])\n", 41 | "\t\t\tmonth = int(row[1])\n", 42 | "\t\t\tdate = int(row[2])\n", 43 | "\t\t\tcurr_date = datetime.date(year, month, date)\n", 44 | "\t\t\tif(curr_date >= begin and curr_date <= end):\n", 45 | "\t\t\t\torigin = row[7]\n", 46 | "\t\t\t\tif(origin == '270'):\n", 47 | "\t\t\t\t\tlabel = int(row[10])\n", 48 | "\t\t\t\t\tSFO_count += 1\n", 49 | "\t\t\t\t\tif(curr_date not in SFO_Hash):\n", 50 | "\t\t\t\t\t\tSFO_Hash[curr_date] = [label]\n", 51 | "\t\t\t\t\telse:\n", 52 | "\t\t\t\t\t\tSFO_Hash[curr_date].append(label)\t\n", 53 | "\t\t\t\tif(origin == '215'):\n", 54 | "\t\t\t\t\tlabel = int(row[10])\n", 55 | "\t\t\t\t\tOAK_count += 1\n", 56 | "\t\t\t\t\tif(curr_date not in OAK_Hash):\n", 57 | "\t\t\t\t\t\tOAK_Hash[curr_date] = [label]\n", 58 | "\t\t\t\t\telse:\n", 59 | "\t\t\t\t\t\tOAK_Hash[curr_date].append(label)\n", 60 | "\n", 61 | "iterator = datetime.timedelta(days=1)\n", 62 | "day_values = []\n", 63 | "SFO_Delays = []\n", 64 | "SFO_On_Time = []\n", 65 | "SFO_Flights = []\n", 66 | "SFO_Pct = []\n", 67 | "OAK_Delays = []\n", 68 | "OAK_On_Time = []\n", 69 | "OAK_Flights = []\n", 70 | "OAK_Pct = []\n", 71 | "\n", 72 | "while begin <= end:\n", 73 | "\tif(begin not in SFO_Hash):\n", 74 | "\t\tSFO_Delays.append(0)\n", 75 | "\t\tSFO_On_Time.append(0)\n", 76 | "\t\tSFO_Pct.append(0.00)\n", 77 | "\telse:\n", 78 | "\t\tSFO_Flights = SFO_Hash[begin]\n", 79 | "\t\tdelays = sum(SFO_Flights)\n", 80 | "\t\tnum_flights = len(SFO_Flights)\n", 81 | "\t\tpct = float(delays) / (num_flights + delays)\n", 82 | "\t\tSFO_Delays.append(delays)\n", 83 | "\t\tSFO_On_Time.append(num_flights - delays)\n", 84 | "\t\tSFO_Pct.append(pct)\n", 85 | "\t\n", 86 | "\tif(begin not in OAK_Hash):\n", 87 | "\t\tOAK_Delays.append(0)\n", 88 | "\t\tOAK_On_Time.append(0)\n", 89 | "\t\tOAK_Pct.append(0.00)\n", 90 | "\telse:\n", 91 | "\t\tOAK_Flights = OAK_Hash[begin]\n", 92 | "\t\tdelays = sum(OAK_Flights)\n", 93 | "\t\tnum_flights = len(OAK_Flights)\n", 94 | "\t\tpct = float(delays) / (num_flights + delays)\n", 95 | "\t\tOAK_Delays.append(delays)\n", 96 | "\t\tOAK_On_Time.append(num_flights - delays)\n", 97 | "\t\tOAK_Pct.append(pct)\n", 98 | "\t\n", 99 | "\tday_values.append(begin)\n", 100 | "\tbegin += iterator\n", 101 | "\n", 102 | "print SFO_Pct\n", 103 | "print OAK_Pct\n", 104 | "\n", 105 | "plt.title('Probability of Flight Delays at SFO vs. 
OAK Given Specific Date and +/- 3 Days')\n", 106 | "\n", 107 | "ax1 = plt.subplot(211)\n", 108 | "#ax1.bar(day_values, SFO_Delays, bottom = SFO_On_Time, color = 'red')\n", 109 | "#ax1.bar(day_values, SFO_On_Time, color = 'blue')\n", 110 | "ax1.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 111 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n", 112 | "ax1.set_yticks([0, 200, 450])\n", 113 | "ax1.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at SFO')\n", 114 | "\n", 115 | "ax2 = plt.subplot(212)\n", 116 | "#ax2.bar(day_values, OAK_Delays, bottom = OAK_On_Time, color = 'red')\n", 117 | "#ax2.bar(day_values, OAK_On_Time, color = 'blue')\n", 118 | "ax2.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 119 | "ax2.text(start_date, 250, 'Test', fontsize=15)\n", 120 | "ax2.set_yticks([0, 200, 450])\n", 121 | "ax2.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at OAK')\n", 122 | "\n", 123 | "plt.show()" 124 | ], 125 | "language": "python", 126 | "metadata": {}, 127 | "outputs": [] 128 | } 129 | ], 130 | "metadata": {} 131 | } 132 | ] 133 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:43ffcf25a0f9f00fd6bd77f3e24dfb6e62c5a764e70ce742b71da7b69b36310f" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [] 9 | } -------------------------------------------------------------------------------- /EDA_and_NB_performance_charts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import os 5 | from IPython.core.display import HTML 6 | from bokeh.plotting import * 7 | 8 | 9 | # load data into pandas 10 | INPUT_FILE = "C:\\data\\airline\\_dfTest2008.csv" 11 | 12 | SKIP_FIRST_LINE = True 13 | 14 | master = [] 15 | print "Reading into Pandas frame..." 16 | try: 17 | dfPart = pd.read_csv(INPUT_FILE, skiprows=0, usecols=[ # nrows = 2000 18 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier', 19 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'label', u'pred_label' 20 | ]) 21 | print len(dfPart) 22 | master.append(dfPart) 23 | except Exception as e: 24 | print "Data import failed", e 25 | 26 | 27 | dfMaster = pd.concat(master, ignore_index=True) 28 | print "Total length: ", len(dfMaster) 29 | 30 | # change data types 31 | dfMaster['Year'] = dfMaster['Year'].astype('int') 32 | dfMaster['Month'] = dfMaster['Month'].astype('int') 33 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 34 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 35 | dfMaster['UniqueCarrier'] = dfMaster['UniqueCarrier'].astype('int') 36 | dfMaster['TailNum'] = dfMaster['TailNum'].astype('int') 37 | dfMaster['Origin'] = dfMaster['Origin'].astype('int') 38 | dfMaster['Dest'] = dfMaster['Dest'].astype('int') 39 | dfMaster['label'] = dfMaster['label'].astype('int') 40 | dfMaster['pred_label'] = dfMaster['pred_label'].astype('int') 41 | 42 | 43 | df = dfMaster 44 | print "Appending new variables..."
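# Aside -- a minimal sketch, not part of the original script: every accuracy
# rate computed below follows the same pattern,
# groupby(col).accurate.sum() / groupby(col).accurate.count(). Because
# `accurate` is a 0/1 flag (created just below), that ratio is simply the
# group mean, so a hypothetical helper could replace the repetition:
def accuracy_by(frame, col):
    """Share of correctly predicted rows per value of `col` (mean of a 0/1 flag)."""
    return pd.DataFrame(frame.groupby(col)['accurate'].mean(), columns=[col])
# e.g. accuracy_by(dfMaster, 'Month') would mirror df_month_acc below.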
45 | df['accurate'] = 0 46 | df.accurate[df.label == df.pred_label] = 1 47 | df.accurate[df.label <> df.pred_label] = 0 48 | 49 | 50 | df['dep_time'] = 0 51 | df.dep_time[df.DepTime.isin(xrange(700, 1301))] = 1 52 | df.dep_time[df.DepTime.isin(xrange(1300, 1801))] = 2 53 | df.dep_time[df.DepTime.isin(xrange(1800, 2401))] = 3 54 | df.dep_time[df.DepTime.isin(xrange(0, 701))] = 3 55 | 56 | # compute accuracy rates 57 | month_acc = dfMaster.groupby('Month').accurate.sum() / \ 58 | dfMaster.groupby('Month').accurate.count() 59 | df_month_acc = pd.DataFrame(month_acc, columns=[u'Month']) 60 | # print df_month_acc 61 | 62 | day_of_month_acc = dfMaster.groupby( 63 | 'DayofMonth').accurate.sum() / dfMaster.groupby('DayofMonth').accurate.count() 64 | df_day_of_month_acc = pd.DataFrame(day_of_month_acc, columns=[u'DayofMonth']) 65 | # print df_day_of_month_acc 66 | 67 | day_of_week_acc = dfMaster.groupby( 68 | 'DayOfWeek').accurate.sum() / dfMaster.groupby('DayOfWeek').accurate.count() 69 | df_day_of_week_acc = pd.DataFrame(day_of_week_acc, columns=[u'DayOfWeek']) 70 | # print df_day_of_week_acc 71 | 72 | unique_carrier_acc = dfMaster.groupby( 73 | 'UniqueCarrier').accurate.sum() / dfMaster.groupby('UniqueCarrier').accurate.count() 74 | df_unique_carrier_acc = pd.DataFrame( 75 | unique_carrier_acc, columns=[u'UniqueCarrier']) 76 | # print df_unique_carrier_acc 77 | 78 | tail_num_acc = dfMaster.groupby( 79 | 'TailNum').accurate.sum() / dfMaster.groupby('TailNum').accurate.count() 80 | df_tail_num_acc = pd.DataFrame(tail_num_acc, columns=[u'TailNum']) 81 | # print df_tail_num_acc 82 | 83 | origin_acc = dfMaster.groupby('Origin').accurate.sum() / \ 84 | dfMaster.groupby('Origin').accurate.count() 85 | df_origin_acc = pd.DataFrame(origin_acc, columns=[u'Origin']) 86 | # print df_origin_acc 87 | 88 | dest_acc = dfMaster.groupby('Dest').accurate.sum() / \ 89 | dfMaster.groupby('Dest').accurate.count() 90 | df_dest_acc = pd.DataFrame(dest_acc, columns=[u'Dest']) 91 | # print df_dest_acc 92 | 93 | dep_time_acc = dfMaster.groupby('dep_time').accurate.sum() / \ 94 | dfMaster.groupby('dep_time').accurate.count() 95 | df_dep_time_acc = pd.DataFrame(dep_time_acc, columns=[u'dep_time']) 96 | # print dep_time_acc 97 | 98 | 99 | # compute proportion of delays by each variable 100 | 101 | month_delays = dfMaster.groupby( 102 | 'Month').label.sum() / dfMaster.groupby('Month').label.count() 103 | df_month_delays = pd.DataFrame(month_delays, columns=[u'Month']) 104 | # print df_month_delays 105 | 106 | day_of_month_delays = dfMaster.groupby( 107 | 'DayofMonth').label.sum() / dfMaster.groupby('DayofMonth').label.count() 108 | df_day_of_month_delays = pd.DataFrame( 109 | day_of_month_delays, columns=[u'DayofMonth']) 110 | # print df_day_of_month_delays 111 | 112 | day_of_week_delays = dfMaster.groupby( 113 | 'DayOfWeek').label.sum() / dfMaster.groupby('DayOfWeek').label.count() 114 | df_day_of_week_delays = pd.DataFrame( 115 | day_of_week_delays, columns=[u'DayOfWeek']) 116 | # print df_day_of_week_delays 117 | 118 | unique_carrier_delays = dfMaster.groupby( 119 | 'UniqueCarrier').label.sum() / dfMaster.groupby('UniqueCarrier').label.count() 120 | df_unique_carrier_delays = pd.DataFrame( 121 | unique_carrier_delays, columns=[u'UniqueCarrier']) 122 | # print df_unique_carrier_delays 123 | 124 | tail_num_delays = dfMaster.groupby( 125 | 'TailNum').label.sum() / dfMaster.groupby('TailNum').label.count() 126 | df_tail_num_delays = pd.DataFrame(tail_num_delays, columns=[u'TailNum']) 127 | # print df_tail_num_delays 128 | 129 
| origin_delays = dfMaster.groupby( 130 | 'Origin').label.sum() / dfMaster.groupby('Origin').label.count() 131 | df_origin_delays = pd.DataFrame(origin_delays, columns=[u'Origin']) 132 | # print df_origin_delays 133 | 134 | dest_delays = dfMaster.groupby( 135 | 'Dest').label.sum() / dfMaster.groupby('Dest').label.count() 136 | df_dest_delays = pd.DataFrame(dest_delays, columns=[u'Dest']) 137 | # print df_dest_delays 138 | 139 | dep_time_delays = dfMaster.groupby( 140 | 'dep_time').label.sum() / dfMaster.groupby('dep_time').label.count() 141 | df_dep_time_delays = pd.DataFrame(dep_time_delays, columns=[u'dep_time']) 142 | # print df_dep_time_delays 143 | 144 | 145 | # bar charts to see where delays are more likely 146 | df_day_of_month_delays.plot(kind='bar', color='grey', stacked=True) 147 | 148 | # df_day_of_week_delays.plot(kind='bar', color='grey', stacked=True) 149 | 150 | # df_unique_carrier_delays.plot(kind='bar', color='grey', stacked=True) 151 | 152 | # df_tail_num_delays.plot(kind='bar', color='grey', stacked=True) 153 | 154 | # df_origin_delays.plot(kind='bar', color='grey', stacked=True) 155 | 156 | # df_dest_delays.plot(kind='bar', color='grey', stacked=True) 157 | 158 | # df_dep_time_delays.plot(kind='bar', color='grey', stacked=True) 159 | 160 | # df_month_delays.plot(kind='bar', color='grey', stacked=True) 161 | 162 | plt.show() 163 | 164 | 165 | # plot bar charts for accuracy measures 166 | # df_month_acc.plot(kind='bar', color='grey', background_fill="#EAEAF2") 167 | 168 | # df_day_of_month_acc.plot( 169 | # kind='bar', color='grey', background_fill="#EAEAF2") 170 | 171 | # df_day_of_week_acc.plot(kind='bar', color='grey') 172 | 173 | # df_unique_carrier_acc.plot(kind='bar', color='grey') 174 | 175 | # df_tail_num_acc.plot(kind='bar', color='grey') 176 | 177 | # df_origin_acc.plot(kind='bar', color='grey') 178 | 179 | # df_dest_acc.plot(kind='bar', color='grey') 180 | 181 | # df_dep_time_acc.plot(kind='bar', color='grey') 182 | 183 | plt.show() 184 | -------------------------------------------------------------------------------- /INFO290T_Final_Project_Presentation_vFINAL.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/INFO290T_Final_Project_Presentation_vFINAL.pptx -------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.docx -------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_FinalProjectReport.pdf -------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_Project_Proposal.docx 
-------------------------------------------------------------------------------- /Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divyakkm/Data-Mining-Project/962ac5c99320720afb6c68409dc36d5e0ee7c70f/Joo_Jung_Kosheleva_Menghani_Project_Proposal.pdf -------------------------------------------------------------------------------- /NB_performance_charts.py: -------------------------------------------------------------------------------- 1 | # This code builds some exploratory graphs to see how prediction accuracy 2 | # of the Naive Bayes model varies by each variable used to build the model. 3 | 4 | 5 | # Importing various modules to build graphs 6 | from __future__ import division 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from pylab import figure, show 10 | from pandas import DataFrame, Series 11 | import pandas as pd 12 | import csv 13 | import os 14 | from bokeh.plotting import * 15 | import seaborn as sns 16 | from bokeh.objects import ColumnDataSource, Range1d 17 | from math import floor 18 | import bokeh as bokeh 19 | import seaborn as sns 20 | sns.set_context("talk") 21 | 22 | 23 | # load 2008 test data into pandas 24 | INPUT_FILE = "C:\\Users\\user\\Desktop\\INFO_290T\\Final Project\\Visualizations\\SFO_OAK_data\\_dfTest2008.csv" 25 | 26 | SKIP_FIRST_LINE = True 27 | 28 | master = [] 29 | print "Reading into Pandas frame..." 30 | try: 31 | dfPart = pd.read_csv(INPUT_FILE, skiprows=0, usecols=[ # nrows = 2000 32 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier', 33 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'label', u'pred_label' 34 | ]) 35 | print len(dfPart) 36 | master.append(dfPart) 37 | except Exception as e: 38 | print "Data import failed", e 39 | 40 | 41 | dfMaster = pd.concat(master, ignore_index=True) 42 | print "Total length: ", len(dfMaster) 43 | 44 | # change data types to integers 45 | dfMaster['Year'] = dfMaster['Year'].astype('int') 46 | dfMaster['Month'] = dfMaster['Month'].astype('int') 47 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 48 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 49 | dfMaster['UniqueCarrier'] = dfMaster['UniqueCarrier'].astype('int') 50 | dfMaster['TailNum'] = dfMaster['TailNum'].astype('int') 51 | dfMaster['Origin'] = dfMaster['Origin'].astype('int') 52 | dfMaster['Dest'] = dfMaster['Dest'].astype('int') 53 | dfMaster['label'] = dfMaster['label'].astype('int') 54 | dfMaster['pred_label'] = dfMaster['pred_label'].astype('int') 55 | 56 | 57 | df = dfMaster 58 | print "Appending new variables..."
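# Aside -- a minimal vectorized sketch, not part of the original script: the
# departure-time buckets assembled further below with chained
# .isin(xrange(...)) assignments reduce, after the overwrites, to
# 701-1299 -> 1 (morning), 1300-1799 -> 2 (afternoon), everything else -> 3
# (evening/night). np.select expresses that net effect in one step; the
# column name `dep_time_alt` is illustrative, so the original code is untouched.
morning = (df['DepTime'] > 700) & (df['DepTime'] < 1300)
afternoon = (df['DepTime'] >= 1300) & (df['DepTime'] < 1800)
df['dep_time_alt'] = np.select([morning, afternoon], [1, 2], default=3)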
59 | 60 | # create a binary variable that indicates accuracy of prediction 61 | # for each record 62 | df['accurate'] = 0 63 | df.accurate[df.label == df.pred_label] = 1 64 | df.accurate[df.label <> df.pred_label] = 0 65 | 66 | 67 | # discretize time of day variable and create a categorical variable 68 | # that captures morning (from 7 am to 1 pm), afternoon (1 pm to 6 pm), 69 | # and night (from 6 pm to 7 am) 70 | df['dep_time'] = 0 71 | df.dep_time[df.DepTime.isin(xrange(700, 1301))] = 1 72 | df.dep_time[df.DepTime.isin(xrange(1300, 1801))] = 2 73 | df.dep_time[df.DepTime.isin(xrange(1800, 2401))] = 3 74 | df.dep_time[df.DepTime.isin(xrange(0, 701))] = 3 75 | 76 | # compute accuracy rates for each variable 77 | month_acc = dfMaster.groupby('Month').accurate.sum() / \ 78 | dfMaster.groupby('Month').accurate.count() 79 | df_month_acc = pd.DataFrame(month_acc, columns=[u'Accuracy']) 80 | 81 | 82 | day_of_month_acc = dfMaster.groupby( 83 | 'DayofMonth').accurate.sum() / dfMaster.groupby('DayofMonth').accurate.count() 84 | df_day_of_month_acc = pd.DataFrame(day_of_month_acc, columns=[u'Accuracy']) 85 | 86 | day_of_week_acc = dfMaster.groupby( 87 | 'DayOfWeek').accurate.sum() / dfMaster.groupby('DayOfWeek').accurate.count() 88 | df_day_of_week_acc = pd.DataFrame(day_of_week_acc, columns=[u'Accuracy']) 89 | 90 | unique_carrier_acc = dfMaster.groupby( 91 | 'UniqueCarrier').accurate.sum() / dfMaster.groupby('UniqueCarrier').accurate.count() 92 | df_unique_carrier_acc = pd.DataFrame( 93 | unique_carrier_acc, columns=[u'Accuracy']) 94 | 95 | tail_num_acc = dfMaster.groupby( 96 | 'TailNum').accurate.sum() / dfMaster.groupby('TailNum').accurate.count() 97 | df_tail_num_acc = pd.DataFrame(tail_num_acc, columns=[u'Accuracy']) 98 | 99 | origin_acc = dfMaster.groupby('Origin').accurate.sum() / \ 100 | dfMaster.groupby('Origin').accurate.count() 101 | df_origin_acc = pd.DataFrame(origin_acc, columns=[u'Accuracy']) 102 | 103 | dest_acc = dfMaster.groupby('Dest').accurate.sum() / \ 104 | dfMaster.groupby('Dest').accurate.count() 105 | df_dest_acc = pd.DataFrame(dest_acc, columns=[u'Accuracy']) 106 | 107 | dep_time_acc = dfMaster.groupby('dep_time').accurate.sum() / \ 108 | dfMaster.groupby('dep_time').accurate.count() 109 | df_dep_time_acc = pd.DataFrame(dep_time_acc, columns=[u'Accuracy']) 110 | 111 | 112 | # compute proportion of delays by each variable 113 | month_delays = dfMaster.groupby( 114 | 'Month').label.sum() / dfMaster.groupby('Month').label.count() 115 | df_month_delays = pd.DataFrame(month_delays, columns=[u'Accuracy']) 116 | 117 | day_of_month_delays = dfMaster.groupby( 118 | 'DayofMonth').label.sum() / dfMaster.groupby('DayofMonth').label.count() 119 | df_day_of_month_delays = pd.DataFrame( 120 | day_of_month_delays, columns=[u'Accuracy']) 121 | 122 | day_of_week_delays = dfMaster.groupby( 123 | 'DayOfWeek').label.sum() / dfMaster.groupby('DayOfWeek').label.count() 124 | df_day_of_week_delays = pd.DataFrame( 125 | day_of_week_delays, columns=[u'Accuracy']) 126 | 127 | unique_carrier_delays = dfMaster.groupby( 128 | 'UniqueCarrier').label.sum() / dfMaster.groupby('UniqueCarrier').label.count() 129 | df_unique_carrier_delays = pd.DataFrame( 130 | unique_carrier_delays, columns=[u'Accuracy']) 131 | 132 | tail_num_delays = dfMaster.groupby( 133 | 'TailNum').label.sum() / dfMaster.groupby('TailNum').label.count() 134 | df_tail_num_delays = pd.DataFrame(tail_num_delays, columns=[u'Accuracy']) 135 | 136 | origin_delays = dfMaster.groupby( 137 | 'Origin').label.sum() / 
dfMaster.groupby('Origin').label.count() 138 | df_origin_delays = pd.DataFrame(origin_delays, columns=[u'DelayRate']) 139 | 140 | dest_delays = dfMaster.groupby( 141 | 'Dest').label.sum() / dfMaster.groupby('Dest').label.count() 142 | df_dest_delays = pd.DataFrame(dest_delays, columns=[u'DelayRate']) 143 | 144 | dep_time_delays = dfMaster.groupby( 145 | 'dep_time').label.sum() / dfMaster.groupby('dep_time').label.count() 146 | df_dep_time_delays = pd.DataFrame(dep_time_delays, columns=[u'DelayRate']) 147 | 148 | 149 | ############################################### BUILD GRAPHS ########################################### 150 | 151 | # build accuracy by day of month variable 152 | dfPlot = df_day_of_month_acc 153 | dfPlot.reset_index(inplace=True) 154 | dfPlot.columns 155 | plt.show() 156 | fig = plt.figure() 157 | fig.suptitle('Accuracy by Day of Month', fontsize=14, fontweight='bold') 158 | ax = fig.add_subplot(111) 159 | fig.subplots_adjust(top=0.95) 160 | ax.set_xlabel('Day of Month') 161 | ax.set_ylabel('Accuracy') 162 | ax.bar(dfPlot['DayofMonth'], dfPlot['Accuracy'], label="Label") 163 | plt.xticks(dfPlot['DayofMonth'], xrange(1, 32), rotation=45) 164 | plt.show() 165 | 166 | # build accuracy by month variable 167 | dfPlot = df_month_acc 168 | dfPlot.reset_index(inplace=True) 169 | dfPlot.columns 170 | plt.show() 171 | fig = plt.figure() 172 | fig.suptitle('Accuracy by Month', fontsize=14, fontweight='bold') 173 | ax = fig.add_subplot(111) 174 | fig.subplots_adjust(top=0.95) 175 | ax.set_xlabel('Month') 176 | ax.set_ylabel('Accuracy') 177 | ax.bar(dfPlot['Month'], dfPlot['Accuracy'], label="Label") 178 | plt.xticks(dfPlot['Month'], xrange(1, 13), rotation=45) 179 | plt.show() 180 | 181 | # build accuracy by day of week variable 182 | dfPlot = df_day_of_week_acc 183 | dfPlot.reset_index(inplace=True) 184 | dfPlot.columns 185 | plt.show() 186 | fig = plt.figure() 187 | fig.suptitle('Accuracy by day of week', fontsize=14, fontweight='bold') 188 | ax = fig.add_subplot(111) 189 | fig.subplots_adjust(top=0.95) 190 | ax.set_xlabel('Day of Week') 191 | ax.set_ylabel('Accuracy') 192 | ax.bar(dfPlot['DayOfWeek'], dfPlot['Accuracy'], label="Label") 193 | plt.xticks(dfPlot['DayOfWeek'], xrange(1, 8), rotation=45) 194 | plt.show() 195 | 196 | # build accuracy by unique carrier variable 197 | dfPlot = df_unique_carrier_acc 198 | dfPlot.reset_index(inplace=True) 199 | dfPlot.columns 200 | plt.show() 201 | fig = plt.figure() 202 | fig.suptitle('Accuracy by unique carrier', fontsize=14, fontweight='bold') 203 | ax = fig.add_subplot(111) 204 | fig.subplots_adjust(top=0.95) 205 | ax.set_xlabel('Unique carrier') 206 | ax.set_ylabel('Accuracy') 207 | ax.bar(dfPlot['UniqueCarrier'], dfPlot['Accuracy'], label="Label") 208 | plt.xticks(dfPlot['UniqueCarrier'], dfPlot['UniqueCarrier'], rotation=45) 209 | plt.show() 210 | 211 | # build accuracy by tail number variable 212 | dfPlot = df_tail_num_acc 213 | dfPlot.reset_index(inplace=True) 214 | dfPlot.columns 215 | plt.show() 216 | fig = plt.figure() 217 | fig.suptitle('Accuracy by tail number', fontsize=14, fontweight='bold') 218 | ax = fig.add_subplot(111) 219 | fig.subplots_adjust(top=0.95) 220 | ax.set_xlabel('Tail number') 221 | ax.set_ylabel('Accuracy') 222 | ax.bar(dfPlot['TailNum'], dfPlot['Accuracy'], label="Label") 223 | plt.xticks(dfPlot['TailNum'], dfPlot['TailNum'], rotation=45) 224 | plt.show() 225 | 
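The seven nearly identical bar-chart blocks in this file differ only in the summary frame, the grouping column, and the title, so they could be collapsed into a single helper. A minimal sketch (plot_accuracy_by is a hypothetical name, not something defined in this repo):

def plot_accuracy_by(summary, col, title):
    # summary: a one-column frame with an 'Accuracy' column, indexed by the grouping variable
    d = summary.reset_index()
    fig = plt.figure()
    fig.suptitle(title, fontsize=14, fontweight='bold')
    ax = fig.add_subplot(111)
    ax.set_xlabel(col)
    ax.set_ylabel('Accuracy')
    ax.bar(d[col], d['Accuracy'])
    plt.xticks(d[col], d[col], rotation=45)
    plt.show()

# e.g. plot_accuracy_by(df_month_acc, 'Month', 'Accuracy by Month')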
226 | # build accuracy by origin variable 227 | dfPlot = df_origin_acc 228 | dfPlot.reset_index(inplace=True) 229 | dfPlot.columns 230 | plt.show() 231 | fig = plt.figure() 232 | fig.suptitle('Accuracy by origin', fontsize=14, fontweight='bold') 233 | ax = fig.add_subplot(111) 234 | fig.subplots_adjust(top=0.95) 235 | ax.set_xlabel('Origin airport') 236 | ax.set_ylabel('Accuracy') 237 | ax.bar(dfPlot['Origin'], dfPlot['Accuracy'], label="Label") 238 | plt.xticks(dfPlot['Origin'], dfPlot['Origin'], rotation=45) 239 | plt.show() 240 | 241 | # build accuracy by destination variable 242 | dfPlot = df_dest_acc 243 | dfPlot.reset_index(inplace=True) 244 | dfPlot.columns 245 | plt.show() 246 | fig = plt.figure() 247 | fig.suptitle('Accuracy by destination', fontsize=14, fontweight='bold') 248 | ax = fig.add_subplot(111) 249 | fig.subplots_adjust(top=0.95) 250 | ax.set_xlabel('Destination airport') 251 | ax.set_ylabel('Accuracy') 252 | ax.bar(dfPlot['Dest'], dfPlot['Accuracy'], label="Label") 253 | plt.xticks(dfPlot['Dest'], dfPlot['Dest'], rotation=45) 254 | plt.show() 255 | 256 | # build accuracy by departure time variable 257 | dfPlot = df_dep_time_acc 258 | dfPlot.reset_index(inplace=True) 259 | dfPlot.columns 260 | plt.show() 261 | fig = plt.figure() 262 | fig.suptitle('Accuracy by departure time', fontsize=14, fontweight='bold') 263 | ax = fig.add_subplot(111) 264 | fig.subplots_adjust(top=0.95) 265 | ax.set_xlabel('Departure time') 266 | ax.set_ylabel('Accuracy') 267 | ax.bar(dfPlot['dep_time'], dfPlot['Accuracy'], label="Label") 268 | plt.xticks(dfPlot['dep_time'], dfPlot['dep_time'], rotation=45) 269 | plt.show() 270 | -------------------------------------------------------------------------------- /Old Python Code/Basic.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | # -*- coding: utf-8 -*- 4 | # 3.0 5 | 6 | # 7 | 8 | #!/usr/bin/env python 9 | 10 | """This file contains the code for the Data Mining Class. It uses the Airline dataset <>""" 11 | 12 | __author__ = "" 13 | __email__ = "" 14 | __status__ = "" 15 | 16 | # 17 | 18 | # Importing various modules 19 | 20 | import matplotlib.pyplot as plt 21 | import numpy as np 22 | from pylab import figure, show 23 | from pandas import DataFrame, Series 24 | import pandas as pd 25 | import csv 26 | import os 27 | import statsmodels.formula.api as smf 28 | import scipy.stats as stats 29 | import statsmodels.api as sm 30 | 31 | # 32 | 33 | # Setting global constants. 
Please initialize this before running the code 34 | 35 | TRAINING_LINE_NUMBER = 100000 # Number of lines to be read from the huge file, set to total file length while running for entire file 36 | INPUT_FILE_PATH="C:\\data\\airline\\" # Path of the folder where you have placed your files 37 | SKIP_FIRST_LINE = True # To skip the first line, as it's the header 38 | YEARS = ['2008'] # Add more years in this list and add the files in the INPUT_FILE_PATH 39 | 40 | # 41 | 42 | # Setting the dataframes for Airline, Plane and Carriers 43 | 44 | try: 45 | path = "C:\\data\\airline\\plane-data.csv" 46 | dfPlane = pd.read_csv(path) 47 | path = 'C:\\data\\airline\\airports.csv' 48 | dfAirport = pd.read_csv(path) 49 | path = 'C:\\data\\airline\\carriers.csv' 50 | dfCarrier = pd.read_csv(path) 51 | except Exception as e: 52 | print "Supplemental Data Import failed", e 53 | 54 | # 55 | 56 | # Reading the main file in a Pandas dataframe 57 | 58 | try: 59 | for year in YEARS: 60 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year)) 61 | dfMaster = pd.read_csv(path, nrows=TRAINING_LINE_NUMBER,skiprows=0) 62 | except Exception as e: 63 | print "Main data import failed", e 64 | dfMaster.head() 65 | 66 | # 67 | 68 | dfMaster.fillna(0,inplace=True) 69 | 70 | # 71 | 72 | # TODO: Do this for other dataframes as well 73 | 74 | # Convert all columns to respective datatypes 75 | 76 | dfMaster['Year'] = dfMaster['Year'].astype('int') 77 | dfMaster['Month'] = dfMaster['Month'].astype('int') 78 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 79 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int') 80 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 81 | dfMaster['CRSDepTime'] = dfMaster['CRSDepTime'].astype('int') 82 | dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int') 83 | dfMaster['CRSArrTime'] = dfMaster['CRSArrTime'].astype('int') 84 | dfMaster['FlightNum'] = dfMaster['FlightNum'].astype('int') 85 | dfMaster['ActualElapsedTime'] = dfMaster['ActualElapsedTime'].astype('int') 86 | dfMaster['CRSElapsedTime'] = dfMaster['CRSElapsedTime'].astype('int') 87 | dfMaster['AirTime'] = dfMaster['AirTime'].astype('int') 88 | dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int') 89 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int') 90 | dfMaster['Distance'] = dfMaster['Distance'].astype('int') 91 | dfMaster['TaxiIn'] = dfMaster['TaxiIn'].astype('int') 92 | dfMaster['TaxiOut'] = dfMaster['TaxiOut'].astype('int') 93 | dfMaster['Cancelled'] = dfMaster['Cancelled'].astype('int') 94 | dfMaster['Diverted'] = dfMaster['Diverted'].astype('int') 95 | print dfMaster.columns 96 | 97 | # 98 | 99 | # for col in dfMaster.columns: 100 | # print 'dfMaster[\'',col,'\'] = dfMaster[\'',col,'\'].astype(\'int\')' 101 | 102 | # 103 | 104 | results = sm.OLS.from_formula('DepDelay ~ ArrDelay', dfMaster).fit() 105 | print results.summary() 106 | 107 | # 108 | 109 | intercept, slope = results.params 110 | r2 = results.rsquared 111 | print slope, intercept, r2 112 | 113 | plt.plot(dfMaster['ArrDelay'], dfMaster['DepDelay'], 'bo') 114 | x = np.array([min(dfMaster['ArrDelay']), max(dfMaster['ArrDelay'])]) 115 | y = intercept + slope * x 116 | plt.plot(x, y, 'r-') 117 | plt.show() 118 | 119 | 120 | from statsmodels.stats.anova import anova_lm 121 | 122 | anova_lm(results) 123 | -------------------------------------------------------------------------------- /Old Python Code/Dest.pkl: -------------------------------------------------------------------------------- [binary pickle payload omitted: a pandas Series mapping 64 airport codes (ABQ, ALB, AMA, AUS, BDL, BHM, BNA, BOI, BUF, BUR, BWI, CLE, CMH, CRP, DAL, DEN, DTW, ELP, FLL, GEG, HOU, HRL, IAD, IND, ISP, JAN, JAX, LAS, LAX, LBB, LIT, MAF, MCI, MCO, MDW, MHT, MSY, OAK, OKC, OMA, ONT, ORF, PBI, PDX, PHL, PHX, PIT, PVD, RDU, RNO, RSW, SAN, SAT, SDF, SEA, SFO, SJC, SLC, SMF, SNA, STL, TPA, TUL, TUS) to the integer indices 0-63]
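The .pkl files in this folder are plain pickled pandas Series lookups. Reading one back is a two-liner with the standard library (a sketch, assuming the file sits in the working directory):

import pickle
with open('Dest.pkl', 'rb') as f:
    dest_lookup = pickle.load(f)   # Series: airport code -> integer id
print dest_lookup['OAK']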
-------------------------------------------------------------------------------- /Old Python Code/Julia Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:2119dd4eb940c5d56c1cf3c63fe41c2b7d02d5ac902ce8287eaa7c250c822c89" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import csv\n", 16 | "import pickle\n", 17 | "\n", 18 | "needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17]\n", 19 | "years = [2008]\n", 20 | "\n", 21 | "def ComputeDayofYear(row):\n", 22 | " \"\"\"This function will return an integer to represent the day of the year given an integer\n", 23 | " representing month and an integer representing the day of the month. This number will\n", 24 | " correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned\n", 25 | " as 0. 
Feb 29th will be returned as 59.\"\"\"\n", 26 | "\n", 27 | " if(row[0] == '1'):\n", 28 | " calc = 0 + int(row[1]) - 1\n", 29 | " row[1] = str(calc)\n", 30 | " elif(row[0] == '2'):\n", 31 | " calc = 31 + int(row[1]) - 1\n", 32 | " row[1] = str(calc)\n", 33 | " elif(row[0] == '3'):\n", 34 | " calc = 60 + int(row[1]) - 1\n", 35 | " row[1] = str(calc)\n", 36 | " elif(row[0] == '4'):\n", 37 | " calc = 91 + int(row[1]) - 1\n", 38 | " row[1] = str(calc)\n", 39 | " elif(row[0] == '5'):\n", 40 | " calc = 121 + int(row[1]) - 1\n", 41 | " row[1] = str(calc)\n", 42 | " elif(row[0] == '6'):\n", 43 | " calc = 152 + int(row[1]) - 1\n", 44 | " row[1] = str(calc)\n", 45 | " elif(row[0] == '7'):\n", 46 | " calc = 182 + int(row[1]) - 1\n", 47 | " row[1] = str(calc)\n", 48 | " elif(row[0] == '8'):\n", 49 | " calc = 213 + int(row[1]) - 1\n", 50 | " row[1] = str(calc)\n", 51 | " elif(row[0] == '9'):\n", 52 | " calc = 244 + int(row[1]) - 1\n", 53 | " row[1] = str(calc)\n", 54 | " elif(row[0] == '10'):\n", 55 | " calc = 274 + int(row[1]) - 1\n", 56 | " row[1] = str(calc)\n", 57 | " elif(row[0] == '11'):\n", 58 | " calc = 305 + int(row[1]) - 1\n", 59 | " row[1] = str(calc)\n", 60 | " elif(row[0] == '12'):\n", 61 | " calc = 335 + int(row[1]) - 1\n", 62 | " row[1] = str(calc)\n", 63 | " return row\n", 64 | "\n", 65 | "\n", 66 | "def DiscretizeDepTime(row):\n", 67 | " \"\"\"This function takes a scheduled departure time, classifies the departure time as:\n", 68 | " morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value\n", 69 | " is assumed to be an integer in 24-hour time format. These labels will correspond to\n", 70 | " variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned.\n", 71 | " An error time is returned as morning.\"\"\"\n", 72 | "\n", 73 | " if(int(row[3]) <= 559):\n", 74 | " row[3] = '2'\n", 75 | " elif(int(row[3]) >= 600 and int(row[3]) <= 1259):\n", 76 | " row[3] = '0'\n", 77 | " elif(int(row[3]) >= 1300 and int(row[3]) <= 1759):\n", 78 | " row[3] = '1'\n", 79 | " elif(int(row[3]) >= 1800):\n", 80 | " row[3] = '2'\n", 81 | " else:\n", 82 | " row[3] = '0'\n", 83 | " return row\n", 84 | "\n", 85 | "\n", 86 | "def AddDepVar(row):\n", 87 | " \"\"\"This function adds a classification label based on the length of the recorded\n", 88 | " Departure Delay in the data set. It assumes an input integer value of the delay in mins.\n", 89 | " By airline industry standards, flight delays are defined as departure delays greater than\n", 90 | " or equal to 15 minutes. For delayed flights, this variable will have value \"1\".\n", 91 | " For on time flights, it will have value \"0\". 
Default value will be set at \"0\".\"\"\"\n", 92 | "\n", 93 | " if(int(row[6]) >= 15):\n", 94 | " row[6] = '1'\n", 95 | " else:\n", 96 | " row[6] = '0'\n", 97 | " return row\n", 98 | "\n", 99 | "def SaveData(data, pickle_file_name):\n", 100 | " \"\"\"This function pickles each file.\"\"\"\n", 101 | "\n", 102 | " f = open(pickle_file_name, \"wb\")\n", 103 | " pickle.dump(data, f)\n", 104 | " f.close()\n", 105 | "\n", 106 | "for i in years:\n", 107 | " data = []\n", 108 | " file_path='C:\\\\data\\\\airline\\\\'+str(i) + '.csv'\n", 109 | " pickle_file_name = 'data' + str(i)\n", 110 | " with open(file_path, 'r') as data_csv:\n", 111 | " csv_reader = csv.reader(data_csv, delimiter=',')\n", 112 | " for row in list(csv_reader):\n", 113 | " if row[21] == '0':\n", 114 | " content = list(row[i] for i in needed_cols)\n", 115 | " content2 = ComputeDayofYear(content)\n", 116 | " content3 = DiscretizeDepTime(content2)\n", 117 | " content4 = AddDepVar(content3)\n", 118 | " data.append(content4)\n", 119 | " SaveData(data, pickle_file_name)" 120 | ], 121 | "language": "python", 122 | "metadata": {}, 123 | "outputs": [] 124 | } 125 | ], 126 | "metadata": {} 127 | } 128 | ] 129 | } -------------------------------------------------------------------------------- /Old Python Code/NB.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import pandas as pd 4 | import sklearn 5 | from sklearn.naive_bayes import * 6 | from sklearn.metrics import * 7 | import os 8 | import cPickle 9 | 10 | # Setting up constants 11 | print "Setting constants..." 12 | 13 | TRAINING_LINE_NUMBER = 100 14 | YEARS = ['2008', '2007'] 15 | # INPUT_FILE_PATH = "/home/dmenghani/python/" # Unix path 16 | INPUT_FILE_PATH = "C:\\data\\airline\\" # Windows path 17 | # YEARS = ['2008'] 18 | 19 | SKIP_FIRST_LINE = True # To skip the first line, as it's the header 20 | 21 | master = [] 22 | print "Reading into Pandas frame..." 23 | try: 24 | for year in YEARS: 25 | path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year)) 26 | print path 27 | dfPart = pd.read_csv( 28 | path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[ 29 | u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'UniqueCarrier', 30 | u'DepTime', u'TailNum', u'Origin', u'Dest', u'DepDelay', u'Cancelled' 31 | ]) 32 | dfPart = dfPart[dfPart['Cancelled'] == 0] 33 | print len(dfPart) 34 | master.append(dfPart) 35 | except Exception as e: 36 | print "Data import failed", e 37 | 38 | dfMaster = pd.concat(master, ignore_index=True) 39 | print "Total length - ", len(dfMaster) 40 | 41 | 42 | dfMaster.fillna(0, inplace=True) 43 | dfMaster['Year'] = dfMaster['Year'].astype('int') 44 | dfMaster['Month'] = dfMaster['Month'].astype('int') 45 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int') 46 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int') 47 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int') 48 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int') 49 | 50 | print "Length of pandas frame - ", len(dfMaster) 51 | print "Dataframe columns - ", dfMaster.columns 52 | 53 | df = dfMaster 54 | 55 | print "Calculating classification label..." 56 | df['label'] = 0 57 | df.label[df.DepDelay >= 15] = 1 58 | df.label[df.DepDelay < 15] = 0 59 | del df['DepDelay'] 60 | 61 | print "Converting categorical data to numeric..." 
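The manual np.unique/Series lookups that follow do what pandas' factorize already provides; a compact alternative is sketched below. Note that pd.factorize numbers categories by order of first appearance rather than alphabetically, so the resulting ids would differ from the pickled lookups:

# sketch: integer-code every categorical column in one pass
for col in ['UniqueCarrier', 'TailNum', 'Origin', 'Dest']:
    codes, uniques = pd.factorize(df[col])
    df[col] = codes + 1  # +1 mirrors this script's 1-based ids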
62 | for col in set(df.columns): 63 | # print col, train[col].dtype 64 | if df[col].dtype == np.dtype('object'): 65 | print "Converting...", col 66 | if col == 'TailNum': 67 | s = np.unique(df[col].values) 68 | TailNum = pd.Series([x[0] for x in enumerate(s)], index=s) 69 | # print TailNum 70 | if col == 'UniqueCarrier': 71 | s = np.unique(df[col].values) 72 | UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s) 73 | # print UniqueCarrier 74 | if col == 'Dest': 75 | s = np.unique(df[col].values) 76 | Dest = pd.Series([x[0] for x in enumerate(s)], index=s) 77 | # print Dest 78 | if col == 'Origin': 79 | s = np.unique(df[col].values) 80 | Origin = pd.Series([x[0] for x in enumerate(s)], index=s) 81 | # print Origin 82 | 83 | 84 | def getTailNum(inTailNum): 85 | # print "In...",type(inTailNum) 86 | out = [] 87 | for x, y in inTailNum.iteritems(): 88 | # print "x,y, out",x,y,TailNum.get_value(y) 89 | out.append(TailNum.get_value(y) + 1) 90 | # print "final out", out 91 | return out 92 | 93 | 94 | def getDest(inDest): 95 | out = [] 96 | for x, y in inDest.iteritems(): 97 | out.append(Dest.get_value(y) + 1) 98 | return out 99 | 100 | 101 | def getOrigin(inOrigin): 102 | out = [] 103 | for x, y in inOrigin.iteritems(): 104 | out.append(Origin.get_value(y) + 1) 105 | return out 106 | 107 | 108 | def getCarrier(inCarrier): 109 | out = [] 110 | for x, y in inCarrier.iteritems(): 111 | out.append(UniqueCarrier.get_value(y) + 1) 112 | return out 113 | 114 | df['TailNum'] = getTailNum(df['TailNum']) 115 | print "TailNum completed." 116 | 117 | df['Dest'] = getDest(df['Dest']) 118 | print "Dest completed." 119 | 120 | df['UniqueCarrier'] = getCarrier(df['UniqueCarrier']) 121 | print "UniqueCarrier completed." 122 | 123 | df['Origin'] = getOrigin(df['Origin']) 124 | print "Origin completed." 125 | 126 | print "Conversion to numeric completed." 127 | 128 | print "Pickling converted data..." 129 | df.to_pickle(INPUT_FILE_PATH + "df.pkl") 130 | 131 | print "Begin classification...75% training, 25% testing, randomly chosen" 132 | target_names = np.array(['Not Delayed', 'Delayed']) 133 | # add columns to your data frame 134 | df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75 135 | # define training and test sets 136 | train = df[df['is_train'] == True] 137 | test = df[df['is_train'] == False] 138 | trainTargets = np.array(train['label']).astype(int) 139 | testTargets = np.array(test['label']).astype(int) 140 | features = df.columns[0:9] 141 | print "Model fitting and prediction started..." 142 | gnb = MultinomialNB() 143 | # train model 144 | y_gnb = gnb.fit(train[features], trainTargets).predict(test[features]) 145 | print "Classification completed." 146 | print "Calculating metrics..." 
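The hand-rolled zip-and-count in the lines below is exactly what sklearn's accuracy_score computes, and sklearn.metrics is already star-imported at the top of this file, so the same numbers come from:

print "Accuracy:", accuracy_score(test['label'], y_gnb)
print confusion_matrix(test['label'], y_gnb)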
147 | test['pred_label'] = y_gnb 148 | test.head() 149 | acc = zip(test['label'], test['pred_label']) 150 | match_count = 0 151 | for i in acc: 152 | if i[0] == i[1]: 153 | match_count += 1 154 | print "Matches - ", match_count 155 | print "Total length - ", len(acc) 156 | print "Accuracy:", float(match_count) / len(acc) 157 | -------------------------------------------------------------------------------- /Old Python Code/Origin.pkl: -------------------------------------------------------------------------------- [binary pickle payload omitted: a pandas Series with the same payload as Dest.pkl above, mapping the same 64 airport codes to the integer indices 0-63]
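A caveat on NB.py above: it feeds arbitrary integer ids for carrier, tail number, and airports straight into MultinomialNB, which treats those ids as counts. One-hot encoding the categoricals first is the usual correction; a sketch under the old scikit-learn API of this era (OneHotEncoder lived in sklearn.preprocessing):

from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import BernoulliNB

enc = OneHotEncoder()                     # expands each id into indicator columns
Xtr = enc.fit_transform(train[features])
Xte = enc.transform(test[features])
pred = BernoulliNB().fit(Xtr, trainTargets).predict(Xte)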
-------------------------------------------------------------------------------- /Old Python Code/UniqueCarrier.pkl: -------------------------------------------------------------------------------- [binary pickle payload omitted: a pandas Series mapping the single carrier code WN to index 0]
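The notebooks below split train/test with a uniform-random mask (df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75). scikit-learn's helper from the same era does this in one call with a reproducible seed; a sketch:

from sklearn.cross_validation import train_test_split  # pre-0.18 module path

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df['label'], test_size=0.25, random_state=0)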
-------------------------------------------------------------------------------- /Old Python Code/Untitled0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:44f7be7f1af03eb634d779b3e3fc1b7473ad8af24b380e9e53f9a15ad5274aaf" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import *\n", 20 | "from sklearn.metrics import *\n", 21 | "import os\n", 22 | "import cPickle\n", 23 | "import sys\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from optparse import OptionParser\n", 27 | "from sklearn import metrics, preprocessing\n", 28 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 29 | "from sklearn.ensemble import AdaBoostClassifier\n", 30 | "from sklearn import cross_validation\n", 31 | "from sklearn.ensemble import RandomForestClassifier # random forest\n", 32 | "from sklearn.svm import SVC # support vector machine classifier\n", 33 | "# hyperparameter grid search to find best model parameters\n", 34 | "from sklearn.grid_search import GridSearchCV\n", 35 | "from sklearn import preprocessing # preprocess string labels into numerics\n", 36 | "from sklearn import *\n", 37 | "from sklearn.metrics import precision_recall_fscore_support\n", 38 | "from sklearn.metrics import classification_report\n", 39 | "\n", 40 | "\n", 41 | "# In[135]:\n", 42 | "\n", 43 | "# Setting up constants\n", 44 | "print \"Setting constants...\"\n", 45 | "\n", 46 | "TRAINING_LINE_NUMBER = 500000\n", 47 | "YEARS = ['2006', '2008', '2007']\n", 48 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 49 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 50 | "# YEARS = ['2008']\n", 51 | "\n", 52 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n", 53 | "\n", 54 | "master = []\n", 55 | "print \"Reading into Pandas frame...\"\n", 56 | "try:\n", 57 | " for year in YEARS:\n", 58 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 59 | " print \"\\n\", path\n", 60 | " dfPart = pd.read_csv(\n", 61 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 62 | " u'Year',\n", 63 | " u'Month',\n", 64 | " u'DayofMonth',\n", 65 | " u'DayOfWeek',\n", 66 | " u'UniqueCarrier',\n", 67 | " u'DepTime',\n", 68 | " u'TailNum',\n", 69 | " u'Origin',\n", 70 | " u'Dest',\n", 71 | " u'DepDelay',\n", 72 | " # u'ArrDelay',\n", 73 | " u'Cancelled',\n", 74 | " # u'ArrTime',\n", 75 | " # u'ArrDelay',\n", 76 | " # u'Distance'\n", 77 | " ])\n", 78 | " print len(dfPart)\n", 79 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 80 | " print \"Removed cancelled flights, new length - \", len(dfPart)\n", 81 | " master.append(dfPart)\n", 82 | " print\n", 83 | "except Exception as e:\n", 84 | " print \"Supplemental Data Import failed\", e\n", 85 | "\n", 86 | "dfMaster = pd.concat(master, ignore_index=True)\n", 87 | "master = []\n", 88 | "dfPart = []\n", 89 | "\n", 90 | "print \"Total length - \", len(dfMaster)\n", 91 | "del dfMaster['Cancelled']\n", 92 | "\n", 93 | "dfMaster.fillna(0, inplace=True)\n", 94 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 95 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 96 | "dfMaster['DayofMonth'] = 
dfMaster['DayofMonth'].astype('int')\n", 97 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 98 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 99 | "# dfMaster['ArrTime'] = dfMaster['ArrTime'].astype('int')\n", 100 | "# dfMaster['ArrDelay'] = dfMaster['ArrDelay'].astype('int')\n", 101 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 102 | "# dfMaster['Distance'] = dfMaster['Distance'].astype('int')\n", 103 | "\n", 104 | "df = dfMaster\n", 105 | "\n", 106 | "print \"Calculating classification label...\"\n", 107 | "df['label'] = 0\n", 108 | "df.label[df.DepDelay >= 15] = 1\n", 109 | "df.label[df.DepDelay < 15] = 0\n", 110 | "\n", 111 | "# df['DepDelay'][df.DepDelay < 0] = 0\n", 112 | "del df['DepDelay']\n", 113 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n", 114 | "\n", 115 | "print \"Dataframe shape - \", df.shape\n", 116 | "print \"Columns -\", df.columns\n", 117 | "\n", 118 | "\n", 119 | "# In[136]:\n", 120 | "\n", 121 | "print \"Converting categorical data to numeric...\"\n", 122 | "for col in set(df.columns):\n", 123 | "# print col, train[col].dtype\n", 124 | " if df[col].dtype == np.dtype('object'):\n", 125 | " print \"Converting...\", col\n", 126 | " if col == 'TailNum':\n", 127 | " s = np.unique(df[col].values)\n", 128 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 129 | "# print TailNum\n", 130 | " if col == 'UniqueCarrier':\n", 131 | " s = np.unique(df[col].values)\n", 132 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 133 | "# print UniqueCarrier\n", 134 | " if col == 'Dest':\n", 135 | " s = np.unique(df[col].values)\n", 136 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 137 | "# print Dest\n", 138 | " if col == 'Origin':\n", 139 | " s = np.unique(df[col].values)\n", 140 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 141 | "# print Origin\n", 142 | "\n", 143 | "\n", 144 | "def getTailNum(inTailNum):\n", 145 | "# print \"In...\",type(inTailNum)\n", 146 | " out = []\n", 147 | " for x, y in inTailNum.iteritems():\n", 148 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 149 | " out.append(TailNum.get_value(y) + 1)\n", 150 | "# print \"final out\", out\n", 151 | " return out\n", 152 | "\n", 153 | "\n", 154 | "def getDest(inDest):\n", 155 | " out = []\n", 156 | " for x, y in inDest.iteritems():\n", 157 | " out.append(Dest.get_value(y) + 1)\n", 158 | " return out\n", 159 | "\n", 160 | "\n", 161 | "def getOrigin(inOrign):\n", 162 | " out = []\n", 163 | " for x, y in inOrign.iteritems():\n", 164 | " out.append(Origin.get_value(y) + 1)\n", 165 | " return out\n", 166 | "\n", 167 | "\n", 168 | "def getCarrier(inCarrier):\n", 169 | " out = []\n", 170 | " for x, y in inCarrier.iteritems():\n", 171 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 172 | " return out\n", 173 | "\n", 174 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 175 | "print \"TailNum completed.\"\n", 176 | "\n", 177 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 178 | "print \"UniqueCarrier completed.\"\n", 179 | "\n", 180 | "df['Dest'] = getDest(df['Dest'])\n", 181 | "print \"Dest completed.\"\n", 182 | "\n", 183 | "df['Origin'] = getOrigin(df['Origin'])\n", 184 | "print \"Origin completed.\"\n", 185 | "\n", 186 | "print \"Conversion to numeric completed.\"\n", 187 | "\n", 188 | "# print \"Pickling converted data...\"\n", 189 | "# df.to_pickle(INPUT_FILE_PATH + \"\\df.pkl\")\n" 190 | ], 191 | "language": "python", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | 
"output_type": "stream", 196 | "stream": "stdout", 197 | "text": [ 198 | "Setting constants...\n", 199 | "Reading into Pandas frame...\n", 200 | "\n", 201 | "C:\\data\\airline\\2006.csv\n", 202 | "500000" 203 | ] 204 | }, 205 | { 206 | "output_type": "stream", 207 | "stream": "stdout", 208 | "text": [ 209 | "\n", 210 | "Removed cancelled flights, new length - " 211 | ] 212 | }, 213 | { 214 | "output_type": "stream", 215 | "stream": "stdout", 216 | "text": [ 217 | " 491158\n", 218 | "\n", 219 | "\n", 220 | "C:\\data\\airline\\2008.csv\n", 221 | "500000" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | "\n", 229 | "Removed cancelled flights, new length - " 230 | ] 231 | }, 232 | { 233 | "output_type": "stream", 234 | "stream": "stdout", 235 | "text": [ 236 | " 484708\n", 237 | "\n", 238 | "\n", 239 | "C:\\data\\airline\\2007.csv\n", 240 | "500000" 241 | ] 242 | }, 243 | { 244 | "output_type": "stream", 245 | "stream": "stdout", 246 | "text": [ 247 | "\n", 248 | "Removed cancelled flights, new length - " 249 | ] 250 | }, 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | " 487243\n", 256 | "\n", 257 | "Total length - " 258 | ] 259 | }, 260 | { 261 | "output_type": "stream", 262 | "stream": "stdout", 263 | "text": [ 264 | " 1463109\n", 265 | "Calculating classification label..." 266 | ] 267 | }, 268 | { 269 | "output_type": "stream", 270 | "stream": "stdout", 271 | "text": [ 272 | "\n", 273 | "Dataframe shape - (1463109, 10)\n", 274 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n", 275 | "Converting categorical data to numeric...\n", 276 | "Converting..." 277 | ] 278 | }, 279 | { 280 | "output_type": "stream", 281 | "stream": "stdout", 282 | "text": [ 283 | " Origin\n", 284 | "Converting..." 285 | ] 286 | }, 287 | { 288 | "output_type": "stream", 289 | "stream": "stdout", 290 | "text": [ 291 | " UniqueCarrier\n", 292 | "Converting..." 293 | ] 294 | }, 295 | { 296 | "output_type": "stream", 297 | "stream": "stdout", 298 | "text": [ 299 | " Dest\n", 300 | "Converting..." 301 | ] 302 | }, 303 | { 304 | "output_type": "stream", 305 | "stream": "stdout", 306 | "text": [ 307 | " TailNum\n", 308 | "TailNum completed." 309 | ] 310 | }, 311 | { 312 | "output_type": "stream", 313 | "stream": "stdout", 314 | "text": [ 315 | "\n", 316 | "UniqueCarrier completed." 317 | ] 318 | }, 319 | { 320 | "output_type": "stream", 321 | "stream": "stdout", 322 | "text": [ 323 | "\n", 324 | "Dest completed." 325 | ] 326 | }, 327 | { 328 | "output_type": "stream", 329 | "stream": "stdout", 330 | "text": [ 331 | "\n", 332 | "Origin completed." 
333 | ] 334 | }, 335 | { 336 | "output_type": "stream", 337 | "stream": "stdout", 338 | "text": [ 339 | "\n", 340 | "Conversion to numeric completed.\n" 341 | ] 342 | } 343 | ], 344 | "prompt_number": 13 345 | }, 346 | { 347 | "cell_type": "code", 348 | "collapsed": false, 349 | "input": [ 350 | "\n", 351 | "print \"Begin classification...75% training, 25% testing, randomly chosen\"\n", 352 | "\n", 353 | "# add columns to your data frame\n", 354 | "df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75\n", 355 | "\n", 356 | "# define training and test sets\n", 357 | "train = df[df['is_train'] == True]\n", 358 | "test = df[df['is_train'] == False]\n", 359 | "trainTargets = np.array(train['label']).astype(int)\n", 360 | "testTargets = np.array(test['label']).astype(int)\n", 361 | "features = df.columns[0:9]\n", 362 | "print \"Features - \",features\n", 363 | "print \"Model fitting and prediction started...\"\n", 364 | "gnb = GaussianNB()\n", 365 | "\n", 366 | "# train model\n", 367 | "y_gnb = gnb.fit(train[features], trainTargets).predict(test[features])\n", 368 | "y_prob = gnb.fit(train[features], trainTargets).predict_proba(test[features])\n", 369 | "\n", 370 | "print \"Classification completed.\"" 371 | ], 372 | "language": "python", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "stream": "stdout", 378 | "text": [ 379 | "Begin classification...75% training, 25% testing, randomly chosen\n", 380 | "Features - " 381 | ] 382 | }, 383 | { 384 | "output_type": "stream", 385 | "stream": "stdout", 386 | "text": [ 387 | " Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest'], dtype='object')\n", 388 | "Model fitting and prediction started...\n", 389 | "Classification completed." 
390 | ] 391 | }, 392 | { 393 | "output_type": "stream", 394 | "stream": "stdout", 395 | "text": [ 396 | "\n", 397 | "Calculating metrcs...\n", 398 | "Accuracy - 0.798698653544\n", 399 | "Confusion metrics\n", 400 | "[[291966 106]\n", 401 | " [ 73525 178]]" 402 | ] 403 | }, 404 | { 405 | "output_type": "stream", 406 | "stream": "stdout", 407 | "text": [ 408 | "\n", 409 | "Precision - " 410 | ] 411 | }, 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "0.62676056338\n", 417 | "Recall - " 418 | ] 419 | }, 420 | { 421 | "output_type": "stream", 422 | "stream": "stdout", 423 | "text": [ 424 | "0.00241509843561\n" 425 | ] 426 | } 427 | ], 428 | "prompt_number": 14 429 | }, 430 | { 431 | "cell_type": "code", 432 | "collapsed": false, 433 | "input": [ 434 | "print \"Calculating metrcs...\"\n", 435 | "print \"Accuracy - \", accuracy_score(test['label'], y_gnb)\n", 436 | "print \"Confusion metrics\\n\", metrics.confusion_matrix(test['label'], y_gnb,labels=(0,1))\n", 437 | "print \"Precision - \", precision_score(test['label'], y_gnb)\n", 438 | "print \"Recall - \", recall_score(test['label'], y_gnb)\n" 439 | ], 440 | "language": "python", 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "output_type": "stream", 445 | "stream": "stdout", 446 | "text": [ 447 | "Calculating metrcs...\n", 448 | "Accuracy - 0.798698653544\n", 449 | "Confusion metrics\n", 450 | "[[291966 106]\n", 451 | " [ 73525 178]]" 452 | ] 453 | }, 454 | { 455 | "output_type": "stream", 456 | "stream": "stdout", 457 | "text": [ 458 | "\n", 459 | "Precision - " 460 | ] 461 | }, 462 | { 463 | "output_type": "stream", 464 | "stream": "stdout", 465 | "text": [ 466 | "0.62676056338\n", 467 | "Recall - " 468 | ] 469 | }, 470 | { 471 | "output_type": "stream", 472 | "stream": "stdout", 473 | "text": [ 474 | "0.00241509843561\n" 475 | ] 476 | } 477 | ], 478 | "prompt_number": 25 479 | }, 480 | { 481 | "cell_type": "code", 482 | "collapsed": false, 483 | "input": [ 484 | "testSFO = test[test['Origin'] == Origin['SFO']]\n", 485 | "print len(testSFO)\n", 486 | "\n", 487 | "testOAK = test[test['Origin'] == Origin['OAK']]\n", 488 | "print len(testOAK)\n" 489 | ], 490 | "language": "python", 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "output_type": "stream", 495 | "stream": "stdout", 496 | "text": [ 497 | "3563\n", 498 | "40\n" 499 | ] 500 | } 501 | ], 502 | "prompt_number": 22 503 | }, 504 | { 505 | "cell_type": "code", 506 | "collapsed": false, 507 | "input": [ 508 | " np.random.randint(2000, size=10)\n", 509 | " " 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "metadata": {}, 516 | "output_type": "pyout", 517 | "prompt_number": 27, 518 | "text": [ 519 | "array([ 437, 1815, 742, 148, 1399, 1171, 205, 1480, 838, 1437])" 520 | ] 521 | } 522 | ], 523 | "prompt_number": 27 524 | } 525 | ], 526 | "metadata": {} 527 | } 528 | ] 529 | } -------------------------------------------------------------------------------- /Old Python Code/Untitled1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:42405ac43042e4a863e6490ca6e8de6e19a63251aec5c9df6ebb479db0a2da04" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "import pickle\n", 17 | "import sklearn\n", 18 | "from sklearn.naive_bayes import *\n", 
19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from sklearn import *\n", 22 | "import os\n", 23 | "from sklearn.metrics import *\n", 24 | "from sklearn import metrics, preprocessing\n", 25 | "from sklearn import svm, naive_bayes, neighbors, tree\n", 26 | "from sklearn.ensemble import AdaBoostClassifier\n", 27 | "\n", 28 | "\n", 29 | "def createPickle(data, filename):\n", 30 | " with open(filename, 'wb') as f:\n", 31 | " pickle.dump(data, f)\n", 32 | " print \"Pickled\", filename\n", 33 | "\n", 34 | "\n", 35 | "# Setting up constants\n", 36 | "print \"Setting constants...\"\n", 37 | "\n", 38 | "TRAINING_LINE_NUMBER = 10000\n", 39 | "# YEARS = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']\n", 40 | "# YEARS = ['2008', '2006', '2007']\n", 41 | "# INPUT_FILE_PATH = \"/home/dmenghani/python/\" # Unix path\n", 42 | "INPUT_FILE_PATH = \"C:\\\\data\\\\airline\\\\\" # Windows path\n", 43 | "YEARS = ['2008']\n", 44 | "SKIP_FIRST_LINE = True # To skip the first line, as its the header\n", 45 | "\n", 46 | "master = []\n", 47 | "print \"Reading into Pandas frame...\"\n", 48 | "try:\n", 49 | " for year in YEARS:\n", 50 | " path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))\n", 51 | " print \"\\n\", path\n", 52 | " dfPart = pd.read_csv(\n", 53 | " path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[\n", 54 | " u'Year',\n", 55 | " u'Month',\n", 56 | " u'DayofMonth',\n", 57 | " u'DayOfWeek',\n", 58 | " u'UniqueCarrier',\n", 59 | " u'DepTime',\n", 60 | " u'TailNum',\n", 61 | " u'Origin',\n", 62 | " u'Dest',\n", 63 | " u'DepDelay',\n", 64 | " # u'ArrDelay',\n", 65 | " u'Cancelled',\n", 66 | " # u'ArrTime',\n", 67 | " # u'ArrDelay',\n", 68 | " # u'Distance'\n", 69 | " ])\n", 70 | " print len(dfPart)\n", 71 | " dfPart = dfPart[dfPart['Cancelled'] == 0]\n", 72 | " # dfPart['Year'] = year\n", 73 | " # rows = np.random.choice(\n", 74 | " # np.random.permutation(dfPart.index.values), len(dfPart) // 1, replace=False)\n", 75 | " # print rows\n", 76 | " # sampled_dfPart = dfPart.ix[rows]\n", 77 | " sampled_dfPart = dfPart\n", 78 | " print \"Removed cancelled flights, new length - \", len(sampled_dfPart)\n", 79 | " master.append(sampled_dfPart)\n", 80 | " print\n", 81 | "except Exception as e:\n", 82 | " print \"Supplemental Data Import failed\", e\n", 83 | "\n", 84 | "dfMaster = pd.concat(master, ignore_index=True)\n", 85 | "master = []\n", 86 | "dfPart = []\n", 87 | "\n", 88 | "print \"Total length - \", len(dfMaster)\n", 89 | "del dfMaster['Cancelled']\n", 90 | "\n", 91 | "dfMaster.fillna(0, inplace=True)\n", 92 | "dfMaster['Year'] = dfMaster['Year'].astype('int')\n", 93 | "dfMaster['Month'] = dfMaster['Month'].astype('int')\n", 94 | "dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')\n", 95 | "dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')\n", 96 | "dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')\n", 97 | "dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')\n", 98 | "\n", 99 | "df = dfMaster\n", 100 | "\n", 101 | "print \"Calculating classification label...\"\n", 102 | "df['label'] = 0\n", 103 | "df.label[df.DepDelay >= 1] = 1\n", 104 | "df.label[df.DepDelay < 1] = 0\n", 105 | "print \"Actual delayed flights -\", np.sum(dfMaster['label']) / len(dfMaster['label'])\n", 106 | "\n", 107 | "# df['DepDelay'][df.DepDelay < 0] = 0\n", 108 | "del df['DepDelay']\n", 109 | "# df['ArrDelay'][df.ArrDelay < 0] = 0\n", 110 | "\n", 111 | "print \"Dataframe shape - \", df.shape\n", 112 | "print \"Columns -\", df.columns\n", 113 | "\n", 114 | "\n", 
115 | "# In[136]:\n", 116 | "\n", 117 | "print \"Converting categorical data to numeric...\"\n", 118 | "for col in set(df.columns):\n", 119 | "# print col, train[col].dtype\n", 120 | " if df[col].dtype == np.dtype('object'):\n", 121 | " print \"Converting...\", col\n", 122 | " if col == 'TailNum':\n", 123 | " s = np.unique(df[col].values)\n", 124 | " TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 125 | "# print TailNum\n", 126 | " if col == 'UniqueCarrier':\n", 127 | " s = np.unique(df[col].values)\n", 128 | " UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 129 | "# print UniqueCarrier\n", 130 | " if col == 'Dest':\n", 131 | " s = np.unique(df[col].values)\n", 132 | " Dest = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 133 | " # print Dest\n", 134 | " if col == 'Origin':\n", 135 | " s = np.unique(df[col].values)\n", 136 | " Origin = pd.Series([x[0] for x in enumerate(s)], index=s)\n", 137 | " # print Origin\n", 138 | "\n", 139 | "# print \"sfo,\", Origin['SFO']\n", 140 | "# print \"oak,\", Origin['OAK']\n", 141 | "\n", 142 | "# createPickle(Dest, 'Dest_2008.pkl')\n", 143 | "# createPickle(Origin, 'Origin_2008.pkl')\n", 144 | "# createPickle(UniqueCarrier, 'UniqueCarrier_2008.pkl')\n", 145 | "# createPickle(TailNum, 'TailNum_2008.pkl')\n", 146 | "\n", 147 | "print \"Pickle completed.\"\n", 148 | "\n", 149 | "\n", 150 | "def getTailNum(inTailNum):\n", 151 | "# print \"In...\",type(inTailNum)\n", 152 | " out = []\n", 153 | " for x, y in inTailNum.iteritems():\n", 154 | "# print \"x,y, out\",x,y,TailNum.get_value(y)\n", 155 | " out.append(TailNum.get_value(y) + 1)\n", 156 | "# print \"final out\", out\n", 157 | " return out\n", 158 | "\n", 159 | "\n", 160 | "def getDest(inDest):\n", 161 | " out = []\n", 162 | " for x, y in inDest.iteritems():\n", 163 | " out.append(Dest.get_value(y) + 1)\n", 164 | " return out\n", 165 | "\n", 166 | "\n", 167 | "def getOrigin(inOrign):\n", 168 | " out = []\n", 169 | "# print inOrign\n", 170 | " for x, y in inOrign.iteritems():\n", 171 | " out.append(Origin.get_value(y) + 1)\n", 172 | " return out\n", 173 | "\n", 174 | "\n", 175 | "def getCarrier(inCarrier):\n", 176 | " out = []\n", 177 | " for x, y in inCarrier.iteritems():\n", 178 | " out.append(UniqueCarrier.get_value(y) + 1)\n", 179 | " return out\n", 180 | "\n", 181 | "print \"Before conversion...\"\n", 182 | "print len(dfMaster[dfMaster['Origin'] == 'SFO'])\n", 183 | "print len(dfMaster[dfMaster['Origin'] == 'OAK'])\n", 184 | "# df[df['Origin'] == 'SFO']" 185 | ], 186 | "language": "python", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "output_type": "stream", 191 | "stream": "stdout", 192 | "text": [ 193 | "Setting constants...\n", 194 | "Reading into Pandas frame...\n", 195 | "\n", 196 | "C:\\data\\airline\\2008.csv\n", 197 | "10000\n", 198 | "Removed cancelled flights, new length - 9837\n", 199 | "\n", 200 | "Total length - 9837\n", 201 | "Calculating classification label...\n", 202 | "Actual delayed flights - 0.756429805835\n", 203 | "Dataframe shape - (9837, 10)\n", 204 | "Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'Origin', u'Dest', u'label'], dtype='object')\n", 205 | "Converting categorical data to numeric...\n", 206 | "Converting... Origin\n", 207 | "Converting... UniqueCarrier\n", 208 | "Converting... Dest\n", 209 | "Converting..." 
210 | ] 211 | }, 212 | { 213 | "output_type": "stream", 214 | "stream": "stdout", 215 | "text": [ 216 | " TailNum\n", 217 | "Pickle completed.\n", 218 | "Before conversion...\n", 219 | "64\n", 220 | "383\n" 221 | ] 222 | } 223 | ], 224 | "prompt_number": 65 225 | }, 226 | { 227 | "cell_type": "code", 228 | "collapsed": false, 229 | "input": [ 230 | "len(getOrigin(df['Origin']))\n", 231 | "Origin['SFO']+1" 232 | ], 233 | "language": "python", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "metadata": {}, 238 | "output_type": "pyout", 239 | "prompt_number": 69, 240 | "text": [ 241 | "56" 242 | ] 243 | } 244 | ], 245 | "prompt_number": 69 246 | }, 247 | { 248 | "cell_type": "code", 249 | "collapsed": false, 250 | "input": [ 251 | "\n", 252 | "df['TailNum'] = getTailNum(df['TailNum'])\n", 253 | "df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])\n", 254 | "df['Dest_new'] = getDest(df['Dest'])\n", 255 | "df['Origin_new'] =getOrigin(df['Origin'])\n", 256 | "\n", 257 | "print \"TailNum completed.\"\n", 258 | "print \"UniqueCarrier completed.\"\n", 259 | "print \"Dest completed.\"\n", 260 | "print \"Origin completed.\"\n", 261 | "\n", 262 | "print \"Conversion to numeric completed.\"\n", 263 | "\n", 264 | "print \"After conversion...\"\n", 265 | "# dfSFO = df[df['Origin'].isin([Origin['SFO']])]\n", 266 | "dfSFO = df[df['Origin']==56]\n", 267 | "print \"SFO len - \", len(dfSFO)\n", 268 | "# print Dest[np.unique(dfSFO['Dest'])]\n", 269 | "\n", 270 | "dfOAK = df[df['Origin'].isin([Origin['OAK']])]\n", 271 | "print \"OAK len - \", len(dfOAK)\n", 272 | "# print Dest[np.unique(dfOAK['Dest'])]\n", 273 | "# print Origin+1\n", 274 | "# print Dest+1\n", 275 | "# df[df['Origin'] == 'SFO']\n", 276 | "# df.to_csv(\"why.csv\")" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "stream": "stdout", 284 | "text": [ 285 | "TailNum completed.\n", 286 | "UniqueCarrier completed.\n", 287 | "Dest completed.\n", 288 | "Origin completed.\n", 289 | "Conversion to numeric completed.\n", 290 | "After conversion...\n", 291 | "SFO len - 0\n", 292 | "OAK len - 0\n" 293 | ] 294 | } 295 | ], 296 | "prompt_number": 67 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "\n", 303 | "# print \"Begin cross validation...\"\n", 304 | "\n", 305 | "# features = df.columns[0:9]\n", 306 | "# target_names = ['Not Delayed', 'Delayed']\n", 307 | "# accuracy = {}\n", 308 | "# results = {}\n", 309 | "# matrix = {}\n", 310 | "# prec = {}\n", 311 | "# recall = {}\n", 312 | "\n", 313 | "# for year in YEARS:\n", 314 | "# print \"Testing on - \", year\n", 315 | "# train = df[df['Year'] != int(year)]\n", 316 | "# test = df[df['Year'] == int(year)]\n", 317 | "# test = test[test['Origin'].isin([Origin['OAK'], Origin['SFO']])]\n", 318 | "# print len(train), len(test)\n", 319 | "# # rows = np.random.choice(np.random.permutation(\n", 320 | "# # test.index.values), len(test) // 1, replace=False)\n", 321 | "# # print rows\n", 322 | "# # sampled_test = test.ix[rows]\n", 323 | "# sampled_test = test\n", 324 | "# trainTargets = np.array(train['label']).astype(int)\n", 325 | "# testTargets = np.array(sampled_test['label']).astype(int)\n", 326 | "# print \"Train length - \", len(train), \"Test length - \", len(sampled_test)\n", 327 | "# # print train['Year']\n", 328 | "# # print test['Year']\n", 329 | "# print \"Model fitting and prediction started...\"\n", 330 | "# gnb = GaussianNB()\n", 331 | "# y_gnb = gnb.fit(train[features], 
trainTargets).predict(\n", 332 | "# sampled_test[features])\n", 333 | "# sampled_test['pred_label'] = y_gnb\n", 334 | "# # y_prob = gnb.fit(\n", 335 | "# # train[features], trainTargets).predict_proba(test[features])\n", 336 | "# # print y_prob\n", 337 | "# # test['pred_prob'] = y_prob[1][1]\n", 338 | "# print \"Classification completed.\"\n", 339 | "# createPickle(gnb, INPUT_FILE_PATH + \"classifier_\" + year + \".pkl\")\n", 340 | "# createPickle(y_gnb, INPUT_FILE_PATH + \"label_\" + year + \".pkl\")\n", 341 | "# sampled_test.to_csv(\n", 342 | "# INPUT_FILE_PATH + \"\\dfTest\" + year + \".csv\", index=False)\n", 343 | "\n", 344 | "# print \"\\nCalculating metrcs...\"\n", 345 | "# accuracy[int(year)] = accuracy_score(sampled_test['label'], y_gnb)\n", 346 | "# print \"Accuracy score - \", accuracy[int(year)]\n", 347 | "# prec[int(year)] = precision_score(\n", 348 | "# sampled_test['label'], y_gnb, average='micro')\n", 349 | "# print \"Precision Score - \", prec[int(year)]\n", 350 | "# recall[int(year)] = recall_score(\n", 351 | "# sampled_test['label'], y_gnb, average='micro')\n", 352 | "# print \"Recall Score - \", recall[int(year)]\n", 353 | "# print \"Confusion matrix\"\n", 354 | "# matrix[int(year)] = metrics.confusion_matrix(\n", 355 | "# sampled_test['label'], y_gnb)\n", 356 | "# print matrix[int(year)]\n", 357 | "# results[int(year)] = precision_recall_fscore_support(\n", 358 | "# sampled_test['label'], y_gnb, average='micro')\n", 359 | "# print \"Precision, recall, F-Score, Support - \", results[int(year)]\n", 360 | "# print \"Classification report\"\n", 361 | "# print classification_report(np.array(sampled_test['label']), y_gnb,\n", 362 | "# target_names=target_names)\n", 363 | "# print\n", 364 | "# train = []\n", 365 | "# test = []\n", 366 | "\n", 367 | "# print \"Accuracy\\n\", accuracy\n", 368 | "# print \"\\nPrecision\\n\", prec\n", 369 | "# print \"\\nRecall\\n\", recall\n", 370 | "# print \"\\nMetrics\\n\", results\n", 371 | "# print \"\\nMatrix\\n\", matrix\n", 372 | "\n", 373 | "# print \"\\nMean Cross validation Precision score\", np.mean(pd.Series(prec))\n", 374 | "# print \"\\nMean Cross validation Recall score\", np.mean(pd.Series(recall))\n", 375 | "# print \"\\nMean Cross validation Accuracy score\", np.mean(pd.Series(accuracy))\n", 376 | "\n", 377 | "# # print \"\\nPickling stuff...\"\n", 378 | "# # createPickle(accuracy, 'accuracy.pkl')\n", 379 | "# # createPickle(prec, 'prec.pkl')\n", 380 | "# # createPickle(results, 'results.pkl')\n", 381 | "# # createPickle(matrix, 'matrix.pkl')\n", 382 | "# # createPickle(Dest, 'Dest.pkl')\n", 383 | "# # createPickle(Origin, 'Origin.pkl')\n", 384 | "# # createPickle(UniqueCarrier, 'UniqueCarrier.pkl')\n", 385 | "# # createPickle(TailNum, 'TailNum.pkl')\n" 386 | ], 387 | "language": "python", 388 | "metadata": {}, 389 | "outputs": [], 390 | "prompt_number": 33 391 | } 392 | ], 393 | "metadata": {} 394 | } 395 | ] 396 | } -------------------------------------------------------------------------------- /Old Python Code/Untitled2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:1b2400b379e8920e0aa6061e92e9cd24c52cafda7a0349568949d9c59aa51ae9" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "from __future__ import division\n", 16 | "%matplotlib inline\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import 
numpy as np\n", 19 | "from pylab import figure, show\n", 20 | "from pandas import DataFrame, Series\n", 21 | "import pandas as pd\n", 22 | "import csv\n", 23 | "import os\n", 24 | "import statsmodels.formula.api as smf\n", 25 | "import scipy.stats as stats\n", 26 | "import statsmodels.api as sm\n", 27 | "from IPython.core.display import HTML\n", 28 | "from bokeh.plotting import *\n", 29 | "import seaborn as sns\n", 30 | "from bokeh.objects import ColumnDataSource, Range1d\n", 31 | "from math import floor\n", 32 | "import bokeh as bokeh\n", 33 | "import sys\n", 34 | "import csv\n", 35 | "import datetime" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [], 40 | "prompt_number": 8 41 | }, 42 | { 43 | "cell_type": "code", 44 | "collapsed": false, 45 | "input": [ 46 | "\n", 47 | "TIME_DELTA = 3\n", 48 | "\n", 49 | "# for arg in sys.argv:\n", 50 | "# \tif(arg != 'date_graph.py'):\n", 51 | "# \t\tstart_date = datetime.datetime.strptime(arg, '%m-%d-%y')\n", 52 | "# \t\tstart_date = datetime.date(start_date.year, start_date.month, start_date.day)\n", 53 | "\n", 54 | "start_date = datetime.datetime.strptime('05-08-08', '%m-%d-%y')\n", 55 | "print start_date\n", 56 | "\n", 57 | "delta = datetime.timedelta(days=TIME_DELTA)\n", 58 | "begin = start_date - delta\n", 59 | "end = start_date + delta\n", 60 | "\n", 61 | "SFO_Hash = {}\n", 62 | "OAK_Hash = {}\n", 63 | "SFO_count = 0\n", 64 | "OAK_count = 0\n", 65 | "with open('C:\\\\data\\\\airline\\\\_dfTest2008.csv', 'r') as data:\n", 66 | "\tcsv_reader = csv.reader(data, delimiter=',')\n", 67 | "\tfor row in csv_reader:\n", 68 | "\t\tif(row[0] != 'Year'):\n", 69 | "\t\t\tyear = int(row[0])\n", 70 | "\t\t\tmonth = int(row[1])\n", 71 | "\t\t\tdate = int(row[2])\n", 72 | "\t\t\tcurr_date = datetime.datetime(year, month, date)\n", 73 | "\t\t\tif(curr_date >= begin and curr_date <= end):\n", 74 | "\t\t\t\torigin = row[7]\n", 75 | "\t\t\t\tif(origin == '270'):\n", 76 | "\t\t\t\t\tlabel = int(row[10])\n", 77 | "\t\t\t\t\tSFO_count += 1\n", 78 | "\t\t\t\t\tif(curr_date not in SFO_Hash):\n", 79 | "\t\t\t\t\t\tSFO_Hash[curr_date] = [label]\n", 80 | "\t\t\t\t\telse:\n", 81 | "\t\t\t\t\t\tSFO_Hash[curr_date].append(label)\t\n", 82 | "\t\t\t\tif(origin == '215'):\n", 83 | "\t\t\t\t\tlabel = int(row[10])\n", 84 | "\t\t\t\t\tOAK_count += 1\n", 85 | "\t\t\t\t\tif(curr_date not in OAK_Hash):\n", 86 | "\t\t\t\t\t\tOAK_Hash[curr_date] = [label]\n", 87 | "\t\t\t\t\telse:\n", 88 | "\t\t\t\t\t\tOAK_Hash[curr_date].append(label)\n", 89 | "\n", 90 | "iterator = datetime.timedelta(days=1)\n", 91 | "day_values = []\n", 92 | "SFO_Delays = []\n", 93 | "SFO_On_Time = []\n", 94 | "SFO_Flights = []\n", 95 | "SFO_Pct = []\n", 96 | "OAK_Delays = []\n", 97 | "OAK_On_Time = []\n", 98 | "OAK_Flights = []\n", 99 | "OAK_Pct = []\n", 100 | "\n", 101 | "while begin <= end:\n", 102 | "\tif(begin not in SFO_Hash):\n", 103 | "\t\tSFO_Delays.append(0)\n", 104 | "\t\tSFO_On_Time.append(0)\n", 105 | "\t\tSFO_Pct.append(0.00)\n", 106 | "\telse:\n", 107 | "\t\tSFO_Flights = SFO_Hash[begin]\n", 108 | "\t\tdelays = sum(SFO_Flights)\n", 109 | "\t\tnum_flights = len(SFO_Flights)\n", 110 | "\t\tpct = float(delays) / (num_flights + delays)\n", 111 | "\t\tSFO_Delays.append(delays)\n", 112 | "\t\tSFO_On_Time.append(num_flights - delays)\n", 113 | "\t\tSFO_Pct.append(pct)\n", 114 | "\t\n", 115 | "\tif(begin not in OAK_Hash):\n", 116 | "\t\tOAK_Delays.append(0)\n", 117 | "\t\tOAK_On_Time.append(0)\n", 118 | "\t\tOAK_Pct.append(0.00)\n", 119 | "\telse:\n", 120 | "\t\tOAK_Flights = 
OAK_Hash[begin]\n", 121 | "\t\tdelays = sum(OAK_Flights)\n", 122 | "\t\tnum_flights = len(OAK_Flights)\n", 123 | "\t\tpct = float(delays) / (num_flights + delays)\n", 124 | "\t\tOAK_Delays.append(delays)\n", 125 | "\t\tOAK_On_Time.append(num_flights - delays)\n", 126 | "\t\tOAK_Pct.append(pct)\n", 127 | "\t\n", 128 | "\tday_values.append(begin)\n", 129 | "\tbegin += iterator\n", 130 | "\n", 131 | "print SFO_Pct\n", 132 | "print OAK_Pct" 133 | ], 134 | "language": "python", 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "stream": "stdout", 140 | "text": [ 141 | "2008-05-08 00:00:00\n", 142 | "[0.22568093385214008, 0.23976608187134502, 0.2556390977443609, 0.2560747663551402, 0.263254113345521, 0.2478448275862069, 0.30275229357798167]" 143 | ] 144 | }, 145 | { 146 | "output_type": "stream", 147 | "stream": "stdout", 148 | "text": [ 149 | "\n", 150 | "[0.24793388429752067, 0.24680851063829787, 0.2697095435684647, 0.27058823529411763, 0.28185328185328185, 0.2613065326633166, 0.3004115226337449]\n" 151 | ] 152 | } 153 | ], 154 | "prompt_number": 4 155 | }, 156 | { 157 | "cell_type": "code", 158 | "collapsed": false, 159 | "input": [ 160 | "print \"Xastart_date" 161 | ], 162 | "language": "python", 163 | "metadata": {}, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "collapsed": false, 169 | "input": [ 170 | "plt.title('Probability of Flight Delays at SFO vs. OAK Given Specific Date and +/- 3 Days')\n", 171 | "\n", 172 | "ax1 = plt.subplot(211)\n", 173 | "#ax1.bar(day_values, SFO_Delays, bottom = SFO_On_Time, color = 'red')\n", 174 | "#ax1.bar(day_values, SFO_On_Time, color = 'blue')\n", 175 | "ax1.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 176 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n", 177 | "ax1.set_yticks([0, 200, 450])\n", 178 | "ax1.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at SFO')\n", 179 | "\n", 180 | "ax2 = plt.subplot(212)\n", 181 | "#ax2.bar(day_values, OAK_Delays, bottom = OAK_On_Time, color = 'red')\n", 182 | "#ax2.bar(day_values, OAK_On_Time, color = 'blue')\n", 183 | "ax2.set_xticklabels([start_date - delta, '', '', start_date, '', '', start_date + delta], rotation = 45)\n", 184 | "ax1.text(start_date, 250, 'Test', fontsize=15)\n", 185 | "ax2.set_yticks([0, 200, 450])\n", 186 | "ax2.set_title('On-Time Flights (Blue) and Delayed Flights (Red) at OAK')\n", 187 | "plt.show()" 188 | ], 189 | "language": "python", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "metadata": {}, 194 | "output_type": "display_data", 195 | "text": [ 196 | "" 197 | ] 198 | } 199 | ], 200 | "prompt_number": 7 201 | } 202 | ], 203 | "metadata": {} 204 | } 205 | ] 206 | } -------------------------------------------------------------------------------- /Old Python Code/accuracy.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | I2008 3 | cnumpy.core.multiarray 4 | scalar 5 | p1 6 | (cnumpy 7 | dtype 8 | p2 9 | (S'f8' 10 | p3 11 | I0 12 | I1 13 | tp4 14 | Rp5 15 | (I3 16 | S'<' 17 | p6 18 | NNNI-1 19 | I-1 20 | I0 21 | tp7 22 | bS'\x00\x00\x00\x00\x00\x00\xf0?' 23 | p8 24 | tp9 25 | Rp10 26 | sI2001 27 | g1 28 | (g5 29 | S'\x00\x00\x00\x00\x00\x00\xf0?' 30 | p11 31 | tp12 32 | Rp13 33 | sI2007 34 | g1 35 | (g5 36 | S'\x00\x00\x00\x00\x00\x00\xf0?' 37 | p14 38 | tp15 39 | Rp16 40 | s. 
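The .pkl files in this folder (accuracy.pkl above, and matrix.pkl, prec.pkl and results.pkl below) are protocol-0 pickles of per-year metric dictionaries. A minimal sketch of reading one back, assuming the Python 2 used throughout this repo:

    import pickle

    # accuracy.pkl maps year -> accuracy score, e.g. {2001: 1.0, 2007: 1.0, 2008: 1.0}
    with open('accuracy.pkl', 'rb') as f:
        accuracy = pickle.load(f)
    for year in sorted(accuracy):
        print 'Year %d accuracy: %.4f' % (year, accuracy[year])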
-------------------------------------------------------------------------------- /Old Python Code/counter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | with open('C:\\Dropbox\\Naive Bayes\\Analysis1.csv', 'r') as data: 4 | csv_reader = csv.reader(data, delimiter=',') 5 | SFO_count = 0 6 | OAK_count = 0 7 | for row in csv_reader: 8 | origin = row[1] 9 | if(origin == '270'): 10 | SFO_count += int(row[3]) 11 | elif(origin == '215'): 12 | OAK_count += int(row[3]) 13 | else: 14 | continue 15 | 16 | print OAK_count 17 | print SFO_count 18 | -------------------------------------------------------------------------------- /Old Python Code/counter1.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | with open('C:\\Dropbox\\Naive Bayes\\_dfTest2008\\_dfTest2008.csv', 'r') as data: 4 | csv_reader = csv.reader(data, delimiter=',') 5 | SFO_count = 0 6 | OAK_count = 0 7 | for row in csv_reader: 8 | origin = row[7] 9 | if(origin == '270'): 10 | SFO_count += 1 11 | elif(origin == '215'): 12 | OAK_count += 1 13 | else: 14 | continue 15 | 16 | print OAK_count 17 | print SFO_count 18 | -------------------------------------------------------------------------------- /Old Python Code/data_reader_v2.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pickle 3 | 4 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 5 | years = [2008] 6 | 7 | def ComputeDayofYear(row): 8 | """This function will return an integer to represent the day of the year given an integer 9 | representing month and an integer representing the day of the month. This number will 10 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 11 | as 0. Feb 29th will be returned as 59.""" 12 | 13 | if(row[0] == '1'): 14 | calc = 0 + int(row[1]) - 1 15 | row[1] = str(calc) 16 | elif(row[0] == '2'): 17 | calc = 31 + int(row[1]) - 1 18 | row[1] = str(calc) 19 | elif(row[0] == '3'): 20 | calc = 60 + int(row[1]) - 1 21 | row[1] = str(calc) 22 | elif(row[0] == '4'): 23 | calc = 91 + int(row[1]) - 1 24 | row[1] = str(calc) 25 | elif(row[0] == '5'): 26 | calc = 121 + int(row[1]) - 1 27 | row[1] = str(calc) 28 | elif(row[0] == '6'): 29 | calc = 152 + int(row[1]) - 1 30 | row[1] = str(calc) 31 | elif(row[0] == '7'): 32 | calc = 182 + int(row[1]) - 1 33 | row[1] = str(calc) 34 | elif(row[0] == '8'): 35 | calc = 213 + int(row[1]) - 1 36 | row[1] = str(calc) 37 | elif(row[0] == '9'): 38 | calc = 244 + int(row[1]) - 1 39 | row[1] = str(calc) 40 | elif(row[0] == '10'): 41 | calc = 274 + int(row[1]) - 1 42 | row[1] = str(calc) 43 | elif(row[0] == '11'): 44 | calc = 305 + int(row[1]) - 1 45 | row[1] = str(calc) 46 | elif(row[0] == '12'): 47 | calc = 335 + int(row[1]) - 1 48 | row[1] = str(calc) 49 | return row 50 | 51 | 52 | def DiscretizeDepTime(row): 53 | """This function takes a scheduled departure time, classifies the departure time as: 54 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 55 | is assumed to be an integer in 24-hour time format. These labels will correspond to 56 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 
57 | An error time is returned as morning.""" 58 | 59 | if(int(row[3]) <= 559): 60 | row[3] = '2' 61 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 62 | row[3] = '0' 63 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 64 | row[3] = '1' 65 | elif(int(row[3]) >= 1800): 66 | row[3] = '2' 67 | else: 68 | row[3] = '0' 69 | return row 70 | 71 | 72 | def AddDepVar(row): 73 | """This function adds a classification label based on the length of the recorded 74 | Departure Delay in the data set. It assumes an input integer value of the delay in mins. 75 | By airline industry standards, flight delays are defined as departure delays greater than 76 | or equal to 15 minutes. For delayed flights, this variable will have value "1". 77 | For on time flights, it will have value "0". Default value will be set at "0".""" 78 | 79 | if(float(row[6]) >= 15):  # numeric comparison; the old string compare ordered '9' >= '15' 80 | row[6] = '1' 81 | else: 82 | row[6] = '0' 83 | return row 84 | 85 | def SaveData(data, pickle_file_name): 86 | """This function pickles each file.""" 87 | 88 | f = open(pickle_file_name, "wb")  # binary mode, required for pickle output 89 | pickle.dump(data, f) 90 | f.close() 91 | 92 | 93 | 94 | for i in years: 95 | data = [] 96 | file_path = 'C:\\data\\airline\\' + str(i) + '.csv'  # e.g. C:\data\airline\2008.csv 97 | pickle_file_name = 'data' + str(i) 98 | with open(file_path, 'r') as data_csv: 99 | csv_reader = csv.reader(data_csv, delimiter=',') 100 | for row in csv_reader:  # stream rows rather than loading the whole file into memory 101 | if row[21] == '0': 102 | content = [row[c] for c in needed_cols] 103 | content2 = ComputeDayofYear(content) 104 | content3 = DiscretizeDepTime(content2) 105 | content4 = AddDepVar(content3) 106 | data.append(content4) 107 | SaveData(data, pickle_file_name) 108 | -------------------------------------------------------------------------------- /Old Python Code/data_reader_v3.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pickle 3 | 4 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 5 | years = [2008] 6 | 7 | def ComputeDayofYear(row): 8 | """This function will return an integer to represent the day of the year given an integer 9 | representing month and an integer representing the day of the month. This number will 10 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 11 | as 0.
Feb 29th will be returned as 59.""" 12 | 13 | if(row[0] == '1'): 14 | calc = 0 + int(row[1]) - 1 15 | row[1] = str(calc) 16 | elif(row[0] == '2'): 17 | calc = 31 + int(row[1]) - 1 18 | row[1] = str(calc) 19 | elif(row[0] == '3'): 20 | calc = 60 + int(row[1]) - 1 21 | row[1] = str(calc) 22 | elif(row[0] == '4'): 23 | calc = 91 + int(row[1]) - 1 24 | row[1] = str(calc) 25 | elif(row[0] == '5'): 26 | calc = 121 + int(row[1]) - 1 27 | row[1] = str(calc) 28 | elif(row[0] == '6'): 29 | calc = 152 + int(row[1]) - 1 30 | row[1] = str(calc) 31 | elif(row[0] == '7'): 32 | calc = 182 + int(row[1]) - 1 33 | row[1] = str(calc) 34 | elif(row[0] == '8'): 35 | calc = 213 + int(row[1]) - 1 36 | row[1] = str(calc) 37 | elif(row[0] == '9'): 38 | calc = 244 + int(row[1]) - 1 39 | row[1] = str(calc) 40 | elif(row[0] == '10'): 41 | calc = 274 + int(row[1]) - 1 42 | row[1] = str(calc) 43 | elif(row[0] == '11'): 44 | calc = 305 + int(row[1]) - 1 45 | row[1] = str(calc) 46 | elif(row[0] == '12'): 47 | calc = 335 + int(row[1]) - 1 48 | row[1] = str(calc) 49 | return row 50 | 51 | 52 | def DiscretizeDepTime(row): 53 | """This function takes a scheduled departure time, classifies the departure time as: 54 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 55 | is assumed to be an integer in 24-hour time format. These labels will correspond to 56 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 57 | An error time is returned as morning.""" 58 | 59 | if(int(row[3]) <= 559): 60 | row[3] = '2' 61 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 62 | row[3] = '0' 63 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 64 | row[3] = '1' 65 | elif(int(row[3]) >= 1800): 66 | row[3] = '2' 67 | else: 68 | row[3] = '0' 69 | return row 70 | 71 | 72 | def AddDepVar(row): 73 | """This function adds a classification label based on the length of the recorded 74 | Departure Delay in the data set. It assumes an input integer value of the delay in mins. 75 | By airline industry standards, flight delays are defined as departure delays greater than 76 | or equal to 15 minutes. For delayed flights, this variable will have value "1". 77 | For on time flights, it will have value "0". 
Default value will be set at "0".""" 78 | 79 | if(row[6] >= '15'): 80 | row[6] = '1' 81 | else: 82 | row[6] = '0' 83 | return row 84 | 85 | def SaveData(data, pickle_file_name): 86 | """This function pickles each file.""" 87 | 88 | f = open (pickle_file_name, "w") 89 | pickle.dump(data, f) 90 | f.close() 91 | 92 | 93 | 94 | for i in years: 95 | data = [] 96 | file_path='C:\data\airline' + str(i) + '.csv' 97 | pickle_file_name = 'data' + str(i) 98 | with open(file_path, 'r') as data_csv: 99 | csv_reader = csv.reader(data_csv, delimiter=',') 100 | for row in list(csv_reader): 101 | if row[21] == '0': 102 | if (row[16] == 'SFO' or row[16] == 'OAK'): 103 | content = list(row[i] for i in needed_cols) 104 | content2 = ComputeDayofYear(content) 105 | content3 = DiscretizeDepTime(content2) 106 | content4 = AddDepVar(content3) 107 | data.append(content4) 108 | SaveData(data, pickle_file_name) 109 | 110 | 111 | -------------------------------------------------------------------------------- /Old Python Code/data_reader_v4_ek.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pickle 3 | import time 4 | import os 5 | from boto.s3.connection import S3Connection 6 | from boto.s3.key import Key 7 | 8 | 9 | timestr = time.strftime("%Y%m%d-%H%M%S") 10 | print timestr 11 | 12 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 13 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] 14 | j=0 15 | 16 | def ComputeDayofYear(row): 17 | """This function will return an integer to represent the day of the year given an integer 18 | representing month and an integer representing the day of the month. This number will 19 | correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 20 | as 0. Feb 29th will be returned as 59.""" 21 | 22 | if(row[0] == '1'): 23 | calc = 0 + int(row[1]) - 1 24 | row[1] = str(calc) 25 | elif(row[0] == '2'): 26 | calc = 31 + int(row[1]) - 1 27 | row[1] = str(calc) 28 | elif(row[0] == '3'): 29 | calc = 60 + int(row[1]) - 1 30 | row[1] = str(calc) 31 | elif(row[0] == '4'): 32 | calc = 91 + int(row[1]) - 1 33 | row[1] = str(calc) 34 | elif(row[0] == '5'): 35 | calc = 121 + int(row[1]) - 1 36 | row[1] = str(calc) 37 | elif(row[0] == '6'): 38 | calc = 152 + int(row[1]) - 1 39 | row[1] = str(calc) 40 | elif(row[0] == '7'): 41 | calc = 182 + int(row[1]) - 1 42 | row[1] = str(calc) 43 | elif(row[0] == '8'): 44 | calc = 213 + int(row[1]) - 1 45 | row[1] = str(calc) 46 | elif(row[0] == '9'): 47 | calc = 244 + int(row[1]) - 1 48 | row[1] = str(calc) 49 | elif(row[0] == '10'): 50 | calc = 274 + int(row[1]) - 1 51 | row[1] = str(calc) 52 | elif(row[0] == '11'): 53 | calc = 305 + int(row[1]) - 1 54 | row[1] = str(calc) 55 | elif(row[0] == '12'): 56 | calc = 335 + int(row[1]) - 1 57 | row[1] = str(calc) 58 | return row 59 | 60 | 61 | def DiscretizeDepTime(row): 62 | """This function takes a scheduled departure time, classifies the departure time as: 63 | morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 64 | is assumed to be an integer in 24-hour time format. These labels will correspond to 65 | variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 
66 | An error time is returned as morning.""" 67 | 68 | if(int(row[3]) <= 559): 69 | row[3] = '2' 70 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 71 | row[3] = '0' 72 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 73 | row[3] = '1' 74 | elif(int(row[3]) >= 1800): 75 | row[3] = '2' 76 | else: 77 | row[3] = '0' 78 | return row 79 | 80 | 81 | def AddDepVar(row): 82 | """This function adds a classification label based on the length of the recorded 83 | Departure Delay in the data set. It assumes an input integer value of the delay in mins. 84 | By airline industry standards, flight delays are defined as departure delays greater than 85 | or equal to 15 minutes. For delayed flights, this variable will have value "1". 86 | For on time flights, it will have value "0". Default value will be set at "0".""" 87 | 88 | if(row[6] >= '15'): 89 | row[6] = '1' 90 | else: 91 | row[6] = '0' 92 | return row 93 | 94 | def SaveData(data, pickle_file_name): 95 | """This function pickles each file.""" 96 | 97 | f = open (pickle_file_name, "wb") 98 | try: 99 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 100 | except Exception as e: 101 | print e 102 | f.close() 103 | 104 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 105 | bucket = conn.get_bucket('i290-aero') 106 | k = Key(bucket) 107 | k.key = pickle_file_name 108 | k.set_contents_from_filename(pickle_file_name) 109 | 110 | os.remove(pickle_file_name) 111 | 112 | 113 | for i in years: 114 | data = [] 115 | ''' 116 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 117 | bucket = conn.get_bucket('i290-aero') 118 | k = Key(bucket) 119 | k.key = 'data2001.csv' 120 | file_path = k.get_contents_as_string() 121 | ''' 122 | file_path='data' + str(i) + '.csv' 123 | pickle_file_name = timestr+'-data-' + str(i) 124 | with open(file_path, 'r') as data_csv: 125 | csv_reader = csv.reader(data_csv, delimiter=',') 126 | j = 0 127 | for row in csv_reader: 128 | if row[21] == '0': # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'): 129 | # if (row[16] == 'SFO' or row[16] == 'OAK'): 130 | content = [row[i] for i in needed_cols] 131 | content2 = ComputeDayofYear(content) 132 | content3 = DiscretizeDepTime(content2) 133 | content4 = AddDepVar(content3) 134 | data.append(content4) 135 | # print 'content4', content4 136 | # print 'data', data 137 | # fff = raw_input() 138 | j=j+1 139 | if j % 2000000 == 0: 140 | print j 141 | SaveData(data, pickle_file_name + '-' + str(j)) 142 | data = [] 143 | SaveData(data, pickle_file_name) 144 | 145 | 146 | -------------------------------------------------------------------------------- /Old Python Code/date_iterator_plot.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import csv 3 | import random 4 | import matplotlib.pyplot as plt; plt.rcdefaults() 5 | 6 | # Eunkwang data: SFO = 1; OAK = 2 7 | # Divya data: SFO = 136; OAK = 141 8 | 9 | # Need to change row indexes to make sure they match data from Eunkwang. 
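# A more compact way to build the {date: [label, ...]} hashes constructed below
# is collections.defaultdict. Illustrative sketch only -- the column indexes
# (0-2 = year/month/day, 8 = origin code, 9 = predicted label) match the data
# files this script reads; csv and datetime are imported at the top of the file.
from collections import defaultdict

def build_delay_hash(path, airport_codes):
    """Return {airport_code: {date: [label, ...]}} for the given origin codes."""
    hashes = dict((code, defaultdict(list)) for code in airport_codes)
    with open(path, 'r') as data:
        for row in csv.reader(data, delimiter=','):
            if row[8] in hashes:
                day = datetime.date(int(row[0]), int(row[1]), int(row[2]))
                hashes[row[8]][day].append(int(row[9]))
    return hashes
# e.g. build_delay_hash('DivyaSampleData.csv', ('136', '141'))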
10 | 11 | '''with open('EunkwangSampleData.csv', 'r') as data: 12 | csv_reader = csv.reader(data, delimiter=',') 13 | SFO_EJ_Hash = {} 14 | OAK_EJ_Hash = {} 15 | for row in csv_reader: 16 | origin = row[8] 17 | if(origin == '1'): 18 | year = int(row[0]) 19 | month = int(row[1]) 20 | date = int(row[2]) 21 | key = datetime.date(year, month, date) 22 | label = int(row[9]) 23 | if(key not in SFO_EJ_Hash): 24 | SFO_EJ_Hash[key] = [label] 25 | else: 26 | SFO_EJ_Hash[key].append(label) 27 | elif(origin == '2'): 28 | year = int(row[0]) 29 | month = int(row[1]) 30 | date = int(row[2]) 31 | key = datetime.date(year, month, date) 32 | label = int(row[9]) 33 | if(key not in OAK_EJ_Hash): 34 | OAK_EJ_Hash[key] = [label] 35 | else: 36 | OAK_EJ_Hash[key].append(label) 37 | else: 38 | continue''' 39 | 40 | with open('DivyaSampleData.csv', 'r') as data: 41 | csv_reader = csv.reader(data, delimiter=',') 42 | SFO_DM_Hash = {} 43 | OAK_DM_Hash = {} 44 | for row in csv_reader: 45 | origin = row[8] 46 | if(origin == '136'): 47 | year = int(row[0]) 48 | month = int(row[1]) 49 | date = int(row[2]) 50 | key = datetime.date(year, month, date) 51 | label = int(row[9]) 52 | if(key not in SFO_DM_Hash): 53 | SFO_DM_Hash[key] = [label] 54 | else: 55 | SFO_DM_Hash[key].append(label) 56 | elif(origin == '141'): 57 | year = int(row[0]) 58 | month = int(row[1]) 59 | date = int(row[2]) 60 | key = datetime.date(year, month, date) 61 | label = int(row[9]) 62 | if(key not in OAK_DM_Hash): 63 | OAK_DM_Hash[key] = [label] 64 | else: 65 | OAK_DM_Hash[key].append(label) 66 | else: 67 | continue 68 | 69 | start_date = datetime.date(2008, 1, 1) 70 | end_date = datetime.date(2008, 1,31) 71 | date_values = [] 72 | SFO_DM_Delays = [] 73 | SFO_DM_On_Time = [] 74 | OAK_DM_Delays = [] 75 | OAK_DM_On_Time = [] 76 | SFO_EJ_Delays = [] 77 | SFO_EJ_On_Time = [] 78 | OAK_EJ_Delays = [] 79 | OAK_EJ_On_Time = [] 80 | 81 | d = start_date 82 | delta = datetime.timedelta(days=1) 83 | while d <= end_date: 84 | '''if(d not in SFO_EJ_Hash): 85 | SFO_EJ_Values.append([0,0]) 86 | else: 87 | SFO_EJ_Flights = SFO_EJ_Hash[d] 88 | delays = sum(SFO_EJ_Flights) 89 | num_flights = len(SFO_EJ_Flights) 90 | SFO_EJ_Delays.append(delays) 91 | SFO_EJ_On_Time.append(num_flights - delays) 92 | 93 | if(d not in OAK_EJ_Hash): 94 | OAK_EJ_Values.append([0,0]) 95 | else: 96 | OAK_EJ_Flights = OAK_EJ_Hash[d] 97 | delays = sum(OAK_EJ_Flights) 98 | num_flights = len(OAK_EJ_Flights) 99 | OAK_EJ_Delays.append(delays) 100 | OAK_EJ_On_Time.append(num_flights - delays)''' 101 | 102 | if(d not in SFO_DM_Hash): 103 | SFO_DM_Values.append([0,0]) 104 | else: 105 | SFO_DM_Flights = SFO_DM_Hash[d] 106 | delays = sum(SFO_DM_Flights) 107 | num_flights = len(SFO_DM_Flights) 108 | SFO_DM_Delays.append(delays) 109 | SFO_DM_On_Time.append(num_flights - delays) 110 | 111 | if(d not in OAK_DM_Hash): 112 | OAK_DM_Values.append([0,0]) 113 | else: 114 | OAK_DM_Flights = OAK_DM_Hash[d] 115 | delays = sum(OAK_DM_Flights) 116 | num_flights = len(OAK_DM_Flights) 117 | OAK_DM_Delays.append(delays) 118 | OAK_DM_On_Time.append(num_flights - delays) 119 | 120 | date_values.append(d) 121 | d += delta 122 | 123 | plt.title('Probability of Flight Delays at SFO vs. 
OAK') 124 | 125 | ax1 = plt.subplot(211) 126 | ax1.bar(date_values, SFO_DM_Delays, bottom = SFO_DM_On_Time, color = 'green') 127 | ax1.bar(date_values, SFO_DM_On_Time, color = 'blue') 128 | ax1.set_xticklabels(['Jan 1 2008', '', '', '', '', '', '','','','','','','','','Jan 15 2008', '','','','','','','','','','','','','','','','Jan 31 2008']) 129 | #ax1.set_xticklabels(['Jan 2008','','','','','Jun 2008','','','','','','Dec 2008']) 130 | ax1.set_yticks([0, 50, 100]) 131 | ax1.set_title('On-Time Flights and Delayed Flights at SFO') 132 | 133 | ax2 = plt.subplot(212) 134 | ax2.bar(date_values, OAK_DM_Delays, bottom = OAK_DM_On_Time, color = 'red') 135 | ax2.bar(date_values, OAK_DM_On_Time, color = 'grey') 136 | ax2.set_xticklabels(['Jan 1 2008', '', '', '', '', '', '','','','','','','','','Jan 15 2008', '','','','','','','','','','','','','','','','Jan 31 2008']) 137 | #ax2.set_xticklabels(['Jan 2008','','','','','Jun 2008','','','','','','Dec 2008']) 138 | ax2.set_yticks([0, 50, 100]) 139 | ax2.set_title('On-Time Flights and Delayed Flights at OAK') 140 | 141 | plt.show() -------------------------------------------------------------------------------- /Old Python Code/logisticRegression.py: -------------------------------------------------------------------------------- 1 | # import matplotlib.pyplot as plt 2 | import numpy as np 3 | import random 4 | import pickle 5 | import sys 6 | import os 7 | from boto.s3.connection import S3Connection 8 | from boto.s3.key import Key 9 | 10 | pickle2001 = ['20140428-190051-data-2001', 11 | '20140428-190051-data-2001-2000000', 12 | '20140428-190051-data-2001-4000000'] 13 | pickle2002 = ['20140428-190051-data-2002', 14 | '20140428-190051-data-2002-2000000', 15 | '20140428-190051-data-2002-4000000'] 16 | pickle2003 = ['20140428-190051-data-2003', 17 | '20140428-190051-data-2003-2000000', 18 | '20140428-190051-data-2003-4000000', 19 | '20140428-190051-data-2003-6000000'] 20 | pickle2004 = ['20140428-190051-data-2004', 21 | '20140428-190051-data-2004-2000000', 22 | '20140428-190051-data-2004-4000000', 23 | '20140428-190051-data-2004-6000000'] 24 | pickle2005 = ['20140428-190051-data-2005', 25 | '20140428-190051-data-2005-2000000', 26 | '20140428-190051-data-2005-4000000', 27 | '20140428-190051-data-2005-6000000'] 28 | pickle2006 = ['20140428-190051-data-2006', 29 | '20140428-190051-data-2006-2000000', 30 | '20140428-190051-data-2006-4000000', 31 | '20140428-190051-data-2006-6000000'] 32 | pickle2007 = ['20140428-190051-data-2007', 33 | '20140428-190051-data-2007-2000000', 34 | '20140428-190051-data-2007-4000000', 35 | '20140428-190051-data-2007-6000000'] 36 | pickle2008 = ['20140428-190051-data-2008', 37 | '20140428-190051-data-2008-2000000', 38 | '20140428-190051-data-2008-4000000', 39 | '20140428-190051-data-2008-6000000'] 40 | 41 | 42 | def loadData(fileName): 43 | if os.path.exists(fileName) == False: 44 | print 'downloading', fileName, 'from s3' 45 | conn = S3Connection('key', 'val') 46 | bucket = conn.get_bucket('i290-aero') 47 | k = Key(bucket) 48 | k.key = fileName 49 | k.get_contents_to_filename(fileName) 50 | print 'downloaded', fileName, 'from s3' 51 | 52 | print 'now unpickle...' 53 | x = pickle.load(open(fileName, "rb")) 54 | x = np.array(x) 55 | print 'x.shape = ', x.shape, x[:, -1:].shape 56 | y = x[:, -1:].copy() # last col is y value (delay or not) 57 | x[:, -1:] = 1. 
58 | return x, y 59 | 60 | def gradientDescent(x, y, numIterations, dimension, theta): 61 | # theta = np.zeros(dimension)[np.newaxis].transpose() 62 | for i in range(1, numIterations): 63 | randIdx = random.randint(0, len(x) - 1) 64 | xTrans = x[randIdx][np.newaxis].transpose() 65 | # print theta.transpose(), xTrans 66 | u = 1 / (1 + np.exp(np.dot(theta.transpose() * (-1), xTrans))) 67 | loss = y[randIdx] - u 68 | gradient = np.dot(loss[0][0], xTrans) 69 | # update 70 | theta = theta + gradient / i 71 | return theta 72 | 73 | def graph(formula, x_range): 74 | x = np.array(x_range) 75 | y = eval(formula) 76 | plt.plot(x, y) 77 | 78 | 79 | # def getData(fileName): 80 | # f = open(fileName, 'r') 81 | # x = np.array([0,0,0]) 82 | # x0 = [] 83 | # x1 = [] 84 | # y = np.array([0]) 85 | # for line in f: 86 | # arr = line.strip().split(' ') 87 | # x = np.vstack((x, [float(arr[0]), float(arr[1]), 1.])) 88 | # y = np.vstack((y, [float(arr[2])])) 89 | # if arr[2] == '0': 90 | # x0.append((float(arr[0]), float(arr[1]))) 91 | # else: 92 | # x1.append((float(arr[0]), float(arr[1]))) 93 | 94 | # x = np.delete(x, 0, 0) 95 | # y = np.delete(y, 0, 0) 96 | # f.close() 97 | 98 | # return x, x0, x1, y 99 | 100 | 101 | 102 | 103 | def main(): 104 | # arg = sys.argv 105 | # if len(arg) < 2: 106 | # print 'USE: $ python logisticRegression.py [dataset_file]' 107 | # return 108 | # x, y = loadData(arg[1]) 109 | 110 | # x, x0, x1, y = getData('classification.dat') 111 | 112 | if os.path.exists('pickled_theta') == False: 113 | theta = None 114 | for elem in pickle2001 + pickle2002 + pickle2003 + pickle2004 + pickle2005 + pickle2006 + pickle2007: 115 | x, y = loadData(elem) 116 | if theta == None: 117 | theta = np.zeros(x.shape[1])[np.newaxis].transpose() 118 | print 'theta == None...... 
initialize..........', theta.shape 119 | theta = gradientDescent(x, y, 100000, x.shape[1], theta) 120 | print 'finished gradientDescent of ', elem 121 | print 'theta', theta 122 | 123 | f = open('pickled_theta', 'wb') 124 | pickle.dump(theta, f, protocol=pickle.HIGHEST_PROTOCOL) 125 | f.close() 126 | 127 | theta = pickle.load(open('pickled_theta', 'rb')) 128 | 129 | accu = 0 130 | length = 0 131 | for elem in pickle2008: 132 | if os.path.exists('dot-' + elem) == False or os.path.exists('y-' + elem) == False: 133 | x, y = loadData(elem) 134 | dotProduct = np.dot(x, theta) 135 | print '============= dot product =============' 136 | print dotProduct 137 | print '=============y =============' 138 | print y 139 | pickle.dump(dotProduct, open('dot-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) 140 | pickle.dump(y, open('y-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) 141 | else: 142 | dotProduct = pickle.load(open('dot-' + elem, 'rb')) 143 | y = pickle.load(open('y-' + elem, 'rb')) 144 | 145 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct] 146 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit] 147 | 148 | for i in range(len(prob)): 149 | if prob[i] == y[i]: 150 | accu += 1 151 | length += len(prob) 152 | print 'accuracy = ', accu * 100 / length 153 | 154 | # graph('(-1) * theta[2][0] / theta[1][0] - (theta[0][0] / theta[1][0]) * x', range(-3, 5)) 155 | print 'asdf' 156 | 157 | 158 | 159 | if __name__ == '__main__': 160 | main() -------------------------------------------------------------------------------- /Old Python Code/matrix.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | I2008 3 | cnumpy.core.multiarray 4 | _reconstruct 5 | p1 6 | (cnumpy 7 | ndarray 8 | p2 9 | (I0 10 | tp3 11 | S'b' 12 | p4 13 | tp5 14 | Rp6 15 | (I1 16 | (L2L 17 | L2L 18 | tp7 19 | cnumpy 20 | dtype 21 | p8 22 | (S'i4' 23 | p9 24 | I0 25 | I1 26 | tp10 27 | Rp11 28 | (I3 29 | S'<' 30 | p12 31 | NNNI-1 32 | I-1 33 | I0 34 | tp13 35 | bI00 36 | S'\xb9\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xe2\x03\x00\x00' 37 | p14 38 | tp15 39 | bsI2001 40 | g1 41 | (g2 42 | (I0 43 | tp16 44 | g4 45 | tp17 46 | Rp18 47 | (I1 48 | (L2L 49 | L2L 50 | tp19 51 | g11 52 | I00 53 | S'\x9a\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x04\x00\x00' 54 | p20 55 | tp21 56 | bsI2007 57 | g1 58 | (g2 59 | (I0 60 | tp22 61 | g4 62 | tp23 63 | Rp24 64 | (I1 65 | (L2L 66 | L2L 67 | tp25 68 | g11 69 | I00 70 | S'\xa9\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf2\x03\x00\x00' 71 | p26 72 | tp27 73 | bs. -------------------------------------------------------------------------------- /Old Python Code/model_selector.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt; plt.rcdefaults() 2 | 3 | # Divya and Eunkwang to provide [precision, recall, accuracy] for each of their 8 results. 4 | # This script will graph the models against each other and select the best model. 
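# Reference: the F1 score computed below is the harmonic mean of precision and
# recall, F1 = 2 * P * R / (P + R). Worked example with the assumed values in
# TEST_DATA: the fifth model has P = 0.8, R = 0.95, so
# F1 = 2 * 0.76 / 1.75 = 0.869 -- the highest of the five, which is why it is
# selected below.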
5 | 6 | TEST_DATA = [[0.4,0.6,0.8] , [0.5,0.3,0.69], [0.8, 0.2, 0.75], [0.3, 0.9, 0.72], [0.8, 0.95, 0.9]] 7 | 8 | def calc_f1_score(precision, recall): 9 | return (float(2 * (precision * recall) / (precision + recall))) 10 | 11 | precision_array = [] 12 | recall_array = [] 13 | best_f1 = 0.0 14 | index = best_index = 0 15 | 16 | for each in TEST_DATA: 17 | precision_array.append(each[0]) 18 | recall_array.append(each[1]) 19 | 20 | f1 = calc_f1_score(each[0], each[1]) 21 | #print f1 22 | if(f1 > best_f1): 23 | best_f1 = f1 24 | best_index = index 25 | index +=1 26 | 27 | print "The Best Model is: Model " + str(best_index) 28 | 29 | fig = plt.subplot(111) 30 | fig.scatter(precision_array, recall_array) 31 | fig.set_xlabel('Precision')  # x data is precision_array 32 | fig.set_ylabel('Recall')     # y data is recall_array 33 | 34 | plt.show() -------------------------------------------------------------------------------- /Old Python Code/output.txt: -------------------------------------------------------------------------------- 1 | harbinger:~/python$ python Pickle.py 2 | /home/dmenghani/python_lib/scikit_learn-0.14.1-py2.7-linux-x86_64.egg/sklearn/pls.py:7: DeprecationWarning: This module has been moved to cross_decomposition and will be removed in 0.16 3 | "removed in 0.16", DeprecationWarning) 4 | Setting constants... 5 | Reading into Pandas frame... 6 | 7 | /home/dmenghani/python/2001.csv 8 | Length of original dataset - 5967780 9 | Removing cancelled flights... 10 | Length after random sampling, taking {one - third} of the file - 1912194 11 | 12 | 13 | /home/dmenghani/python/2002.csv 14 | Length of original dataset - 5271359 15 | Removing cancelled flights... 16 | Length after random sampling, taking {one - third} of the file - 1735405 17 | 18 | 19 | /home/dmenghani/python/2003.csv 20 | Length of original dataset - 6488540 21 | Removing cancelled flights... 22 | Length after random sampling, taking {one - third} of the file - 2129023 23 | 24 | 25 | /home/dmenghani/python/2004.csv 26 | Length of original dataset - 7129270 27 | Removing cancelled flights... 28 | Length after random sampling, taking {one - third} of the file - 2333837 29 | 30 | 31 | /home/dmenghani/python/2005.csv 32 | Length of original dataset - 7140596 33 | Removing cancelled flights... 34 | Length after random sampling, taking {one - third} of the file - 2335622 35 | 36 | 37 | /home/dmenghani/python/2006.csv 38 | Length of original dataset - 7141922 39 | Removing cancelled flights... 40 | Length after random sampling, taking {one - third} of the file - 2339996 41 | 42 | 43 | /home/dmenghani/python/2007.csv 44 | Length of original dataset - 7453215 45 | Removing cancelled flights... 46 | Length after random sampling, taking {one - third} of the file - 2430822 47 | 48 | 49 | /home/dmenghani/python/2008.csv 50 | Length of original dataset - 7009728 51 | Removing cancelled flights... 52 | Length after random sampling, taking {one - third} of the file - 2290764 53 | 54 | Total length for all years - 17507663 55 | Calculating classification label... 56 | Dataframe shape - (17507663, 12) 57 | Columns - Index([u'Year', u'Month', u'DayofMonth', u'DayOfWeek', u'DepTime', u'UniqueCarrier', u'TailNum', u'DepDelay', u'Origin', u'Dest', u'Distance', u'label'], dtype='object') 58 | Converting categorical data to numeric... 59 | Converting... Origin 60 | Converting... UniqueCarrier 61 | Converting... Dest 62 | Converting...
TailNum 63 | Pickled origin_all.pkl 64 | Pickled tailnum_all.pkl 65 | Pickled dest_all.pkl 66 | Pickled carrier_all.pkl 67 | Conversion to discrete data completed. 68 | Pickled dataframe_all.pkl 69 | harbinger:~/python$ 70 | -------------------------------------------------------------------------------- /Old Python Code/prec.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | I2008 3 | cnumpy.core.multiarray 4 | scalar 5 | p1 6 | (cnumpy 7 | dtype 8 | p2 9 | (S'f8' 10 | p3 11 | I0 12 | I1 13 | tp4 14 | Rp5 15 | (I3 16 | S'<' 17 | p6 18 | NNNI-1 19 | I-1 20 | I0 21 | tp7 22 | bS'\x00\x00\x00\x00\x00\x00\xf0?' 23 | p8 24 | tp9 25 | Rp10 26 | sI2001 27 | g1 28 | (g5 29 | S'\x00\x00\x00\x00\x00\x00\xf0?' 30 | p11 31 | tp12 32 | Rp13 33 | sI2007 34 | g1 35 | (g5 36 | S'\x00\x00\x00\x00\x00\x00\xf0?' 37 | p14 38 | tp15 39 | Rp16 40 | s. -------------------------------------------------------------------------------- /Old Python Code/results.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | . -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Predicting Airline Delays - Fly from SFO or OAK? 2 | =================== 3 | 4 | Team 5 |
6 | Divya M 7 |
8 | Eunkwang J 9 |
10 | Ryan J 11 |
12 | Julia K 13 |
14 | 15 | Problem Statement 16 | Simplified version: "Given a destination and a date range, which is the better airport to fly out of - SFO or OAK?" 17 | We applied machine learning techniques to build a predictive model that helps flyers decide which airport to choose. The model was trained on data for all US domestic flights from 2001-08 and works for any origin airport, but we were particularly interested in SFO and OAK: popular wisdom holds that flying from OAK avoids delays, and we find that this is not always true. 18 |
19 | 20 | About the Data 21 | We work with the airline on-time performance data for individual years, available at http://stat-computing.org/dataexpo/2009/the-data.html.
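A minimal sketch of loading one year with pandas (column names follow the data expo's documented schema; adjust the path to wherever the CSVs are unpacked):

```python
import pandas as pd

cols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
        'UniqueCarrier', 'DepDelay', 'Origin', 'Dest', 'Cancelled']
df = pd.read_csv('2008.csv', usecols=cols)
df = df[df['Cancelled'] == 0]                      # drop cancelled flights
df['label'] = (df['DepDelay'] >= 15).astype(int)   # industry definition of "delayed"
```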

22 | 23 | Techniques 24 | Naive Bayes 25 | Logistic Regression 26 |
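Both are standard supervised classifiers; below is a self-contained sketch of the scikit-learn versions on toy stand-in features (the real pipeline first encodes the categorical columns to integers, as the data readers in this repo do):

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Toy stand-in rows: [day_of_week, dep_time_bucket, origin_id]; label 1 = delayed.
X = np.array([[1, 0, 270], [5, 2, 270], [3, 1, 215], [5, 2, 215], [2, 0, 215]])
y = np.array([0, 1, 0, 1, 0])

for clf in (GaussianNB(), LogisticRegression()):
    clf.fit(X, y)
    print clf.__class__.__name__, clf.predict(np.array([[5, 2, 270]]))
```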

27 | 28 | Python Libraries 29 | Pandas, Scikit, Matplotlib, Seaborn 30 | -------------------------------------------------------------------------------- /data_reader_v4_ek.py: -------------------------------------------------------------------------------- 1 | # 2 | # data_reader_v4_ek.py 3 | # author: eunkwang joo 4 | # description: This code prepares dataset for logistic regression algorithm, which is written by myself. 5 | # 6 | 7 | 8 | import csv 9 | import pickle 10 | import time 11 | import os 12 | from boto.s3.connection import S3Connection 13 | from boto.s3.key import Key 14 | 15 | 16 | timestr = time.strftime("%Y%m%d-%H%M%S") 17 | print timestr 18 | 19 | needed_cols = [1, 2, 3, 4, 8, 10, 15, 16, 17] 20 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] 21 | j = 0 22 | 23 | # 24 | # function: ComputeDayofYear() 25 | # description: This function will return an integer to represent the day of the year given an integer 26 | # representing month and an integer representing the day of the month. This number will 27 | # correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 28 | # as 0. Feb 29th will be returned as 59. 29 | # input: row of csv file, a raw dataset 30 | # output: row of csv file, date of year value of which is encoded. 31 | # 32 | 33 | 34 | def ComputeDayofYear(row): 35 | if(row[0] == '1'): 36 | calc = 0 + int(row[1]) - 1 37 | row[1] = str(calc) 38 | elif(row[0] == '2'): 39 | calc = 31 + int(row[1]) - 1 40 | row[1] = str(calc) 41 | elif(row[0] == '3'): 42 | calc = 60 + int(row[1]) - 1 43 | row[1] = str(calc) 44 | elif(row[0] == '4'): 45 | calc = 91 + int(row[1]) - 1 46 | row[1] = str(calc) 47 | elif(row[0] == '5'): 48 | calc = 121 + int(row[1]) - 1 49 | row[1] = str(calc) 50 | elif(row[0] == '6'): 51 | calc = 152 + int(row[1]) - 1 52 | row[1] = str(calc) 53 | elif(row[0] == '7'): 54 | calc = 182 + int(row[1]) - 1 55 | row[1] = str(calc) 56 | elif(row[0] == '8'): 57 | calc = 213 + int(row[1]) - 1 58 | row[1] = str(calc) 59 | elif(row[0] == '9'): 60 | calc = 244 + int(row[1]) - 1 61 | row[1] = str(calc) 62 | elif(row[0] == '10'): 63 | calc = 274 + int(row[1]) - 1 64 | row[1] = str(calc) 65 | elif(row[0] == '11'): 66 | calc = 305 + int(row[1]) - 1 67 | row[1] = str(calc) 68 | elif(row[0] == '12'): 69 | calc = 335 + int(row[1]) - 1 70 | row[1] = str(calc) 71 | return row 72 | 73 | 74 | # 75 | # function: DiscretizeDepTime() 76 | # description: This function takes a scheduled departure time, classifies the departure time as: 77 | # morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 78 | # is assumed to be an integer in 24-hour time format. These labels will correspond to 79 | # variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 80 | # An error time is returned as morning. 81 | # input: row of csv file, a raw dataset 82 | # output: row of csv file, departure time value of which is encoded. 83 | # 84 | 85 | def DiscretizeDepTime(row): 86 | 87 | if(int(row[3]) <= 559): 88 | row[3] = '2' 89 | elif(int(row[3]) >= 600 and int(row[3]) <= 1259): 90 | row[3] = '0' 91 | elif(int(row[3]) >= 1300 and int(row[3]) <= 1759): 92 | row[3] = '1' 93 | elif(int(row[3]) >= 1800): 94 | row[3] = '2' 95 | else: 96 | row[3] = '0' 97 | return row 98 | 99 | # 100 | # function: AddDepVar() 101 | # description: This function adds a classification label based on the length of the recorded 102 | # Departure Delay in the data set. It assumes an input integer value of the delay in mins. 
103 | # By airline industry standards, flight delays are defined as departure delays greater than 104 | # or equal to 15 minutes. For delayed flights, this variable will have value "1". 105 | # For on time flights, it will have value "0". Default value will be set at "0". 106 | # input: row of csv file, a raw dataset 107 | # output: row of csv file, delay value of which is encoded as binary. 108 | # 109 | 110 | 111 | def AddDepVar(row): 112 | 113 | if(row[6] >= '15'): 114 | row[6] = '1' 115 | else: 116 | row[6] = '0' 117 | return row 118 | 119 | # 120 | # function: SaveData() 121 | # description: This function pickles each file. Also, due to the lack of storage space on local server, it stores data to S3 server as well. 122 | # input: data= data structure which will be stored for future uses 123 | # pickle_file_name= file name to be used to store data 124 | # output: null 125 | # 126 | 127 | 128 | def SaveData(data, pickle_file_name): 129 | 130 | f = open(pickle_file_name, "wb") 131 | try: 132 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 133 | except Exception as e: 134 | print e 135 | f.close() 136 | 137 | conn = S3Connection( 138 | 'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 139 | bucket = conn.get_bucket('i290-aero') 140 | k = Key(bucket) 141 | k.key = pickle_file_name 142 | k.set_contents_from_filename(pickle_file_name) 143 | 144 | os.remove(pickle_file_name) 145 | 146 | 147 | # it reads raw datset of every year, encodes variables, drop unused 148 | # variables, and pickle trimmed dataset in file system. 149 | 150 | for i in years: 151 | data = [] 152 | ''' 153 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 154 | bucket = conn.get_bucket('i290-aero') 155 | k = Key(bucket) 156 | k.key = 'data2001.csv' 157 | file_path = k.get_contents_as_string() 158 | ''' 159 | file_path = 'data' + str(i) + '.csv' 160 | pickle_file_name = timestr + '-data-' + str(i) 161 | with open(file_path, 'r') as data_csv: 162 | csv_reader = csv.reader(data_csv, delimiter=',') 163 | j = 0 164 | for row in csv_reader: 165 | # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'): 166 | if row[21] == '0': 167 | # if (row[16] == 'SFO' or row[16] == 'OAK'): 168 | content = [row[i] for i in needed_cols] 169 | content2 = ComputeDayofYear(content) 170 | content3 = DiscretizeDepTime(content2) 171 | content4 = AddDepVar(content3) 172 | data.append(content4) 173 | # print 'content4', content4 174 | # print 'data', data 175 | # fff = raw_input() 176 | j = j + 1 177 | if j % 2000000 == 0: 178 | print j 179 | SaveData(data, pickle_file_name + '-' + str(j)) 180 | data = [] 181 | SaveData(data, pickle_file_name) 182 | -------------------------------------------------------------------------------- /data_reader_v4_ek_rj_csv.py: -------------------------------------------------------------------------------- 1 | # 2 | # data_reader_v4_ek_rj_csv.py 3 | # author: eunkwang joo 4 | # description: This code prepares dataset for logistic regression using python pandas. 5 | # 6 | 7 | import csv 8 | import pickle 9 | import time 10 | import os 11 | from boto.s3.connection import S3Connection 12 | from boto.s3.key import Key 13 | 14 | 15 | timestr = time.strftime("%Y%m%d-%H%M%S") 16 | print timestr 17 | 18 | # columns to extract from raw dataset. 
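# For reference, in the data expo schema these 0-based indexes are:
#   3 = DayOfWeek, 4 = DepTime, 8 = UniqueCarrier,
#   15 = DepDelay, 16 = Origin, 17 = Dest.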
19 | needed_cols = [3, 4, 8, 15, 16, 17] 20 | years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] 21 | 22 | j = 0 23 | 24 | # 25 | # function: ComputeDayofYear() 26 | # description: This function will return an integer to represent the day of the year given an integer 27 | # representing month and an integer representing the day of the month. This number will 28 | # correspond to the ordered day of the year [0-365]. For instance, Jan 1st will be returned 29 | # as 0. Feb 29th will be returned as 59. 30 | # input: row of csv file, a raw dataset 31 | # output: row of csv file, date of year value of which is encoded. 32 | # 33 | 34 | 35 | def ComputeDayofYear(row): 36 | 37 | if(row[0] == '1'): 38 | calc = 0 + int(row[1]) - 1 39 | row[1] = float(calc) 40 | elif(row[0] == '2'): 41 | calc = 31 + int(row[1]) - 1 42 | row[1] = float(calc) 43 | elif(row[0] == '3'): 44 | calc = 60 + int(row[1]) - 1 45 | row[1] = float(calc) 46 | elif(row[0] == '4'): 47 | calc = 91 + int(row[1]) - 1 48 | row[1] = float(calc) 49 | elif(row[0] == '5'): 50 | calc = 121 + int(row[1]) - 1 51 | row[1] = float(calc) 52 | elif(row[0] == '6'): 53 | calc = 152 + int(row[1]) - 1 54 | row[1] = float(calc) 55 | elif(row[0] == '7'): 56 | calc = 182 + int(row[1]) - 1 57 | row[1] = float(calc) 58 | elif(row[0] == '8'): 59 | calc = 213 + int(row[1]) - 1 60 | row[1] = float(calc) 61 | elif(row[0] == '9'): 62 | calc = 244 + int(row[1]) - 1 63 | row[1] = float(calc) 64 | elif(row[0] == '10'): 65 | calc = 274 + int(row[1]) - 1 66 | row[1] = float(calc) 67 | elif(row[0] == '11'): 68 | calc = 305 + int(row[1]) - 1 69 | row[1] = float(calc) 70 | elif(row[0] == '12'): 71 | calc = 335 + int(row[1]) - 1 72 | row[1] = float(calc) 73 | return row 74 | 75 | 76 | # 77 | # function: DiscretizeDepTime() 78 | # description: This function takes a scheduled departure time, classifies the departure time as: 79 | # morning (0700 - 1259), afternoon (1300 - 1759), or evening (1800-0659). The input value 80 | # is assumed to be an integer in 24-hour time format. These labels will correspond to 81 | # variable values of 0 = morning, 1 = afternoon, 2 = evening. The value is then returned. 82 | # An error time is returned as morning. 83 | # input: row of csv file, a raw dataset 84 | # output: row of csv file, departure time value of which is encoded. 85 | # 86 | 87 | def DiscretizeDepTime(row): 88 | 89 | if(int(row[1]) <= 559): 90 | row[1] = 2. 91 | elif(int(row[1]) >= 600 and int(row[1]) <= 1259): 92 | row[1] = 0. 93 | elif(int(row[1]) >= 1300 and int(row[1]) <= 1759): 94 | row[1] = 1. 95 | elif(int(row[1]) >= 1800): 96 | row[1] = 2. 97 | else: 98 | row[1] = 0. 99 | return row 100 | 101 | 102 | # 103 | # function: AddDepVar() 104 | # description: This function adds a classification label based on the length of the recorded 105 | # Departure Delay in the data set. It assumes an input integer value of the delay in mins. 106 | # By airline industry standards, flight delays are defined as departure delays greater than 107 | # or equal to 15 minutes. For delayed flights, this variable will have value "1". 108 | # For on time flights, it will have value "0". Default value will be set at "0". 109 | # input: row of csv file, a raw dataset 110 | # output: row of csv file, delay value of which is encoded as binary. 111 | # 112 | 113 | def AddDepVar(row): 114 | 115 | if float(row[3]) >= float(15): 116 | row[3] = 1. 117 | else: 118 | row[3] = 0. 119 | return row 120 | 121 | # 122 | # function: SaveData() 123 | # description: This function pickles each file. 
Also, due to the lack of storage space on local server, it stores data to S3 server as well. 124 | # input: data= data structure which will be stored for future uses 125 | # pickle_file_name= file name to be used to store data 126 | # output: null 127 | # 128 | 129 | 130 | def SaveData(data, pickle_file_name): 131 | 132 | f = open(pickle_file_name, "wb") 133 | try: 134 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 135 | except Exception as e: 136 | print e 137 | f.close() 138 | 139 | conn = S3Connection( 140 | 'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 141 | bucket = conn.get_bucket('i290-aero') 142 | k = Key(bucket) 143 | k.key = pickle_file_name 144 | k.set_contents_from_filename(pickle_file_name) 145 | 146 | os.remove(pickle_file_name) 147 | 148 | 149 | hashs = ['airportHash.dic', 'tailHash.dic', 'carrierHash.dic'] 150 | 151 | 152 | # if os.path.exists(hashs[1]): 153 | # tailNumHash = pickle.load(open(hashs[1], "rb")) 154 | # else: 155 | # tailNumHash = {} 156 | # 157 | # function: createHash() 158 | # description: It creates dictionaries which matches an airport and a carrier to an integer. The dictionaries will be used to identify encoded airport and carrier. 159 | # input: null 160 | # output: null 161 | # 162 | def createHash(): 163 | airportHash = {} 164 | carrierHash = {} 165 | for i in years: 166 | file_path = '../Airport_Data/data' + str(i) + '.csv' 167 | with open(file_path, 'r') as data_csv: 168 | csv_reader = csv.reader(data_csv, delimiter=',') 169 | j = 0 170 | for row in csv_reader: 171 | if(row[17] not in airportHash): 172 | airportHash[row[17]] = len(airportHash) + 1 173 | if(row[8] not in carrierHash): 174 | carrierHash[row[8]] = len(carrierHash) + 1 175 | pickle.dump(airportHash, open('airportHash.dic', 'wb'), 176 | protocol=pickle.HIGHEST_PROTOCOL) 177 | pickle.dump(carrierHash, open('carrierHash.dic', 'wb'), 178 | protocol=pickle.HIGHEST_PROTOCOL) 179 | 180 | # createHash() 181 | 182 | airportHash = pickle.load(open(hashs[0], "rb")) 183 | carrierHash = pickle.load(open(hashs[2], "rb")) 184 | 185 | # it reads raw datset of every year, encodes variables, drop unused 186 | # variables, and pickle trimmed dataset in file system. 
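# Output format sketch: one all-numeric row per flight is written to
# trimmed2_<year>.csv under the header [dayOfWeek, depTime, carrier, dest,
# origin, delay]. An illustrative (not real) row -- 4.0, 0.0, 3.0, 12.0, 1.0, 0.0 --
# would be a Thursday morning flight on carrier id 3 to airport id 12 from
# airport id 1, departing on time. The integer ids come from the
# airportHash/carrierHash dictionaries loaded above.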
187 | 188 | for i in years: 189 | data = [] 190 | ''' 191 | conn = S3Connection('AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7') 192 | bucket = conn.get_bucket('i290-aero') 193 | k = Key(bucket) 194 | k.key = 'data2001.csv' 195 | file_path = k.get_contents_as_string() 196 | ''' 197 | file_path = '../Airport_Data/data' + str(i) + '.csv' 198 | pickle_file_name = timestr + '-data-' + str(i) 199 | dropped = '' 200 | with open(file_path, 'r') as data_csv: 201 | csv_reader = csv.reader(data_csv, delimiter=',') 202 | j = 0 203 | with open('trimmed2_' + str(i) + '.csv', 'w') as output_csv: 204 | writer = csv.writer( 205 | output_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 206 | writer.writerow( 207 | ['dayOfWeek', 'depTime', 'carrier', 'dest', 'origin', 'delay']) 208 | for row in csv_reader: 209 | # and j<80000000: #and (row[16] == 'SFO' or row[16] == 'OAK'): 210 | if row[21] == '0': 211 | # if (row[16] == 'SFO' or row[16] == 'OAK'): 212 | if (row[16] not in ['SFO', 'OAK']): 213 | dropped += row[16] + ' ' 214 | continue # airportHash[row[16]] = len(airportHash) + 1 215 | origin = airportHash[row[16]] 216 | 217 | if(row[17] not in airportHash): 218 | airportHash[row[17]] = len(airportHash) + 1 219 | dest = airportHash[row[17]] 220 | 221 | # if(row[10] not in tailNumHash): 222 | # tailNumHash[row[10]] = len(tailNumHash) + 1 223 | # tailNum = tailNumHash[row[10]] 224 | 225 | if(row[8] not in carrierHash): 226 | carrierHash[row[8]] = len(carrierHash) + 1 227 | carrier = carrierHash[row[8]] 228 | # print row[8], carrier, carrierHash 229 | # raw_input() 230 | 231 | content = [row[i] for i in needed_cols] 232 | # content2 = ComputeDayofYear(content) 233 | content3 = DiscretizeDepTime(content) 234 | content4 = AddDepVar(content3) 235 | content4[2] = carrier 236 | # content4[5] = tailNum 237 | content4[4] = origin 238 | content4[5] = dest 239 | for idx in range(len(content4)): 240 | content4[idx] = float(content4[idx]) 241 | temp = content4[3] 242 | content4[3] = content4[5] 243 | content4[5] = temp 244 | 245 | writer.writerow( 246 | [content4[0], content4[1], content4[2], content4[3], content4[4], content4[5]]) 247 | # print content4 248 | # data.append(content4) 249 | # print 'content4', content4 250 | # print 'data', data 251 | # fff = raw_input() 252 | # j=j+1 253 | # if j % 2000000 == 0: 254 | # print j 255 | # SaveData(data, pickle_file_name + '-' + str(j)) 256 | # data = [] 257 | # SaveData(data, pickle_file_name) 258 | # print dropped 259 | 260 | 261 | # hashs = ['airportHash.dic', 'tailHash.dic', 'carrierHash.dic'] 262 | # hashVals = [airportHash, tailNumHash, carrierHash] 263 | # for idx in range(len(hashs)): 264 | # f = open (hashs[idx], "wb") 265 | # try: 266 | # pickle.dump(hashVals[idx], f, protocol=pickle.HIGHEST_PROTOCOL) 267 | # except Exception as e: 268 | # print e 269 | # f.close() 270 | -------------------------------------------------------------------------------- /date_graph2.py: -------------------------------------------------------------------------------- 1 | # 2 | # date_graph2.py 3 | # Author: Ryan Jung 4 | # Description: This function takes a date and calculates the probability of delay at SFO 5 | # and at OAK for the date and the 6 days prior. It then graphs these probabilities as 6 | # side-by-side bars for each day. 7 | # Dependencies: Run the Naive Bayes classification code in Crossval_r.py file. Ensure that the file _dfTest2008.csv is in the 8 | # same folder. 
9 | # 10 | 11 | from __future__ import division 12 | import sys 13 | import csv 14 | import datetime 15 | import matplotlib.pyplot as plt 16 | plt.rcdefaults() 17 | import numpy as np 18 | 19 | # 20 | # These are the hard codes of the "look back" period (set at 6 days) and airport codes 21 | # from our Naive Bayes dictionary. 22 | # 23 | 24 | TIME_DELTA = 6 25 | SFO_AIRPORT_CODE = '270' 26 | OAK_AIRPORT_CODE = '215' 27 | JFK_AIRPORT_CODE = '160' 28 | ORD_AIRPORT_CODE = '225' 29 | ATL_AIRPORT_CODE = '25' 30 | LAX_AIRPORT_CODE = '168' 31 | LGA_AIRPORT_CODE = '174' 32 | DFW_AIRPORT_CODE = '85' 33 | 34 | # 35 | # Main Function 36 | # The function first takes an argument from the command line of the form: 37 | # python date_graph2.py m-d-yy 38 | # It then calculates the bounds of our query for probability of delay by day. 39 | # 40 | 41 | for arg in sys.argv: 42 | if(arg != 'date_graph2.py'): 43 | start_date = datetime.datetime.strptime(arg, '%m-%d-%y') 44 | start_date = datetime.date( 45 | start_date.year, start_date.month, start_date.day) 46 | 47 | delta = datetime.timedelta(days=TIME_DELTA) 48 | begin = start_date - delta 49 | end = start_date 50 | 51 | # 52 | # This block of code sets up a hash for each airport of the form {key: value} => {day: 53 | # [predict label,...]}. This is a list of the predicted labels for each flight on a 54 | # particular day from the origin airport to the destination airport. It iterates over 55 | # the days in our query range and constructs the hash. 56 | # 57 | 58 | SFO_Hash = {} 59 | OAK_Hash = {} 60 | with open('_dfTest2008.csv', 'r') as data: 61 | csv_reader = csv.reader(data, delimiter=',') 62 | for row in csv_reader: 63 | if(row[0] != 'Year'): 64 | year = int(row[0]) 65 | month = int(row[1]) 66 | date = int(row[2]) 67 | curr_date = datetime.date(year, month, date) 68 | if(curr_date >= begin and curr_date <= end): 69 | origin = row[7] 70 | dest = row[8] 71 | if(origin == SFO_AIRPORT_CODE and dest == LAX_AIRPORT_CODE): 72 | label = int(row[10]) 73 | if(curr_date not in SFO_Hash): 74 | SFO_Hash[curr_date] = [label] 75 | else: 76 | SFO_Hash[curr_date].append(label) 77 | if(origin == OAK_AIRPORT_CODE and dest == LAX_AIRPORT_CODE): 78 | label = int(row[10]) 79 | if(curr_date not in OAK_Hash): 80 | OAK_Hash[curr_date] = [label] 81 | else: 82 | OAK_Hash[curr_date].append(label) 83 | 84 | # 85 | # This block of code initializes values for day "steps" for our iterator later. 86 | # We also initialize lists which will have the number of delays, on-time flights, and 87 | # percentage of predicted delays for the days in our query. 88 | # 89 | 90 | iterator = datetime.timedelta(days=1) 91 | two_iterator = datetime.timedelta(days=2) 92 | three_iterator = datetime.timedelta(days=3) 93 | four_iterator = datetime.timedelta(days=4) 94 | five_iterator = datetime.timedelta(days=5) 95 | six_iterator = datetime.timedelta(days=6) 96 | 97 | day_values = [] 98 | SFO_Delays = [] 99 | SFO_On_Time = [] 100 | SFO_Flights = [] 101 | SFO_Pct = [] 102 | SFO_Comp = [] 103 | OAK_Delays = [] 104 | OAK_On_Time = [] 105 | OAK_Flights = [] 106 | OAK_Pct = [] 107 | OAK_Comp = [] 108 | 109 | # 110 | # We then loop through the query date range and populate the lists, counting number of 111 | # delayed flights, number of on-time flights, and percent of flights delayed. Each 112 | # list item corresponds to a date in our query range. 
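# Worked micro-example: if SFO_Hash[d] == [1, 0, 0, 1] for some day d, then
# delays = 2, num_flights = 4, and the fraction of flights predicted delayed
# on day d is 2 / 4 = 0.50.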
/date_iterator_plot2.py:
--------------------------------------------------------------------------------
1 | #
2 | # date_iterator_plot2.py
3 | # Author: Ryan Jung
4 | # Description: This script reads the predicted results from one of our models. It then
5 | # aggregates the probability of delay by week and graphs the probability of delay at
6 | # both airports (SFO and OAK). Lastly, it calculates the t-score of the difference in
7 | # means of both airports to help determine if the difference is statistically significant.
8 | #
9 |
10 | import datetime
11 | import csv
12 | import matplotlib.pyplot as plt
13 | plt.rcdefaults()
14 | import numpy
15 |
16 | # Hard code of airport codes in our dictionary that correspond to Naive
17 | # Bayes model
18 | SFO_AIRPORT_CODE = '270'
19 | OAK_AIRPORT_CODE = '215'
20 |
21 | #
22 | # Function: ComputeDayofYear(month, day)
23 | # Description: This function takes a month and day of month and outputs a number which
24 | # corresponds to the day of year. This will be a number between 0 and 365.
25 | # Input: Integer values for month and day
26 | # Output: Integer value for day of year
27 | #
28 |
29 |
30 | def ComputeDayofYear(month, day):
31 |     if(month == 1):
32 |         numDays = 0
33 |     if(month == 2):
34 |         numDays = 31
35 |     if(month == 3):
36 |         numDays = 60
37 |     if(month == 4):
38 |         numDays = 91
39 |     if(month == 5):
40 |         numDays = 121
41 |     if(month == 6):
42 |         numDays = 152
43 |     if(month == 7):
44 |         numDays = 182
45 |     if(month == 8):
46 |         numDays = 213
47 |     if(month == 9):
48 |         numDays = 244
49 |     if(month == 10):
50 |         numDays = 274
51 |     if(month == 11):
52 |         numDays = 305
53 |     if(month == 12):
54 |         numDays = 335
55 |
56 |     return (numDays + day - 1)
57 |
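# [Editor's note] The lookup table above assumes a leap year (February
# contributes 31 + 29 days, so March starts at offset 60), which matches the
# 2008 test data. The sanity check below is an editor's aside, not part of the
# original file: it verifies the table against Python's own day-of-year
# arithmetic.
import datetime

def _day_of_year(month, day, year=2008):
    # 0-based day of year; equivalent to ComputeDayofYear for leap years
    return datetime.date(year, month, day).timetuple().tm_yday - 1

assert _day_of_year(1, 1) == 0      # Jan 1
assert _day_of_year(2, 29) == 59    # Feb 29
assert _day_of_year(12, 31) == 365  # Dec 31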
58 | #
59 | # Main Function
60 | # This block of code reads from the output of the Naive Bayes model and creates a hash
61 | # for SFO and OAK that corresponds to {key: value} = {week #: [predicted label,...]}.
62 | # The idea here is to create a list of all flights that are scheduled to leave SFO or OAK
63 | # by week (52 weeks in the year). The list will be 1's and 0's based on our prediction of
64 | # whether the flight will be delayed (1) or not delayed (0).
65 | #
66 |
67 | with open('_dfTest2008.csv', 'r') as data:
68 |     csv_reader = csv.reader(data, delimiter=',')
69 |     SFO_DM_Hash = {}
70 |     OAK_DM_Hash = {}
71 |     for row in csv_reader:
72 |         origin = row[7]
73 |         if(origin == SFO_AIRPORT_CODE):
74 |             month = int(row[1])
75 |             date = int(row[2])
76 |             DayofYear = ComputeDayofYear(month, date)
77 |             key = DayofYear / 7
78 |             label = int(row[10])
79 |             if(key not in SFO_DM_Hash):
80 |                 SFO_DM_Hash[key] = [label]
81 |             else:
82 |                 SFO_DM_Hash[key].append(label)
83 |         elif(origin == OAK_AIRPORT_CODE):
84 |             month = int(row[1])
85 |             date = int(row[2])
86 |             DayofYear = ComputeDayofYear(month, date)
87 |             key = DayofYear / 7
88 |             label = int(row[10])
89 |             if(key not in OAK_DM_Hash):
90 |                 OAK_DM_Hash[key] = [label]
91 |             else:
92 |                 OAK_DM_Hash[key].append(label)
93 |         else:
94 |             continue
95 |
96 | #
97 | # This block of code separates out the value list of flights from the previous block of
98 | # code into a list of the number of delays and the number of on-time flights from SFO
99 | # and OAK by week. In other words, SFO_DM_Delays[14] will be the number of delayed
100 | # flights we predict at SFO in week 14. We create a 3rd list which is the percent of
101 | # flights that are delayed by week.
102 | #
103 |
104 | week_values = []
105 | SFO_DM_Delays = []
106 | SFO_DM_On_Time = []
107 | SFO_DM_Pct = []
108 | OAK_DM_Delays = []
109 | OAK_DM_On_Time = []
110 | OAK_DM_Pct = []
111 |
112 | d = 0
113 | while d <= 52:  # weeks 0-52; key 52 holds the partial week of Dec 30-31
114 |     if(d not in SFO_DM_Hash):
115 |         SFO_DM_Delays.append(0)
116 |         SFO_DM_On_Time.append(0)
117 |         SFO_DM_Pct.append(0.00)
118 |     else:
119 |         SFO_DM_Flights = SFO_DM_Hash[d]
120 |         delays = sum(SFO_DM_Flights)
121 |         num_flights = len(SFO_DM_Flights)
122 |         pct = float(delays) / num_flights  # num_flights already counts the delayed flights
123 |         SFO_DM_Delays.append(delays)
124 |         SFO_DM_On_Time.append(num_flights - delays)
125 |         SFO_DM_Pct.append(pct)
126 |
127 |     if(d not in OAK_DM_Hash):
128 |         OAK_DM_Delays.append(0)
129 |         OAK_DM_On_Time.append(0)
130 |         OAK_DM_Pct.append(0.00)
131 |     else:
132 |         OAK_DM_Flights = OAK_DM_Hash[d]
133 |         delays = sum(OAK_DM_Flights)
134 |         num_flights = len(OAK_DM_Flights)
135 |         pct = float(delays) / num_flights
136 |         OAK_DM_Delays.append(delays)
137 |         OAK_DM_On_Time.append(num_flights - delays)
138 |         OAK_DM_Pct.append(pct)
139 |
140 |     week_values.append(d)
141 |     d += 1
142 |
143 | #
144 | # This block of code calculates the mean and standard deviation of the percent of flights
145 | # that are predicted to be delayed. It uses these to calculate a t-score of the
146 | # difference in means which can be used to determine if the difference is statistically
147 | # significant.
148 | #
149 |
150 | SFO_mean = numpy.mean(SFO_DM_Pct)
151 | OAK_mean = numpy.mean(OAK_DM_Pct)  # same computation as SFO_mean, for consistency
152 | SFO_std = numpy.std(SFO_DM_Pct)
153 | OAK_std = numpy.std(OAK_DM_Pct)
154 | SFO_n = len(SFO_DM_Pct)
155 | OAK_n = len(OAK_DM_Pct)
156 | Diff = OAK_mean - SFO_mean
157 | std_err = (((SFO_std ** 2) / SFO_n) + ((OAK_std ** 2) / OAK_n)) ** 0.5
158 |
159 | print "Standard Error", std_err
160 | print "t = ", Diff / std_err
161 |
162 | #
163 | # Graphic visualization of the probability of delay by week at SFO and OAK. SFO will be
164 | # the green line and OAK will be the blue line in the graph. X-axis is the week of 2008
165 | # and y-axis is probability of delay.
166 | #
167 |
168 | ax1 = plt.subplot(111)
169 | p1 = ax1.plot(week_values, SFO_DM_Pct, color='green')
170 | p2 = ax1.plot(week_values, OAK_DM_Pct, color='blue')
171 | ax1.set_title('Proportion of flights delayed in SFO (green) vs. OAK (blue)')
172 | ax1.set_xticklabels(
173 |     ['Jan 2008', 'Mar 2008', 'May 2008', 'Jul 2008', 'Sep 2008', 'Nov 2008'])
174 | ax1.set_ylabel('Probability of Delay')
175 | ax1.legend((p1[0], p2[0]), ('SFO', 'OAK'), loc='upper center')
176 |
177 | plt.show()
178 |
--------------------------------------------------------------------------------
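[Editor's note] The hand-rolled statistic above is Welch's unequal-variance two-sample t: t = (mean_OAK - mean_SFO) / sqrt(s_SFO^2 / n_SFO + s_OAK^2 / n_OAK). If SciPy is available (an assumption; the script itself does not use it), the result can be cross-checked against the two weekly lists computed in the script, up to the population-versus-sample variance convention (numpy.std defaults to ddof=0):

from scipy import stats

# Welch's t-test on the two weekly delay-percentage series;
# equal_var=False matches the standard-error formula in the script.
t, p = stats.ttest_ind(OAK_DM_Pct, SFO_DM_Pct, equal_var=False)
print(t, p)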
/logisticRegression.py:
--------------------------------------------------------------------------------
1 | #
2 | # logisticRegression.py
3 | # author: eunkwang joo
4 | # description: Loads the pickled dataset in several fragments and fits a logistic regression by stochastic gradient descent. It then predicts delays on a held-out test year and reports accuracy, precision, and recall.
5 | #
6 |
7 | import numpy as np
8 | import random
9 | import pickle
10 | import sys
11 | import os
12 | from boto.s3.connection import S3Connection
13 | from boto.s3.key import Key
14 |
15 |
16 | # Trimmed datasets are stored in pickle format. Due to memory limits, each
17 | # year was pickled across several files.
18 |
19 | pickle2001 = ['20140428-190051-data-2001',
20 |               '20140428-190051-data-2001-2000000',
21 |               '20140428-190051-data-2001-4000000']
22 | pickle2002 = ['20140428-190051-data-2002',
23 |               '20140428-190051-data-2002-2000000',
24 |               '20140428-190051-data-2002-4000000']
25 | pickle2003 = ['20140428-190051-data-2003',
26 |               '20140428-190051-data-2003-2000000',
27 |               '20140428-190051-data-2003-4000000',
28 |               '20140428-190051-data-2003-6000000']
29 | pickle2004 = ['20140428-190051-data-2004',
30 |               '20140428-190051-data-2004-2000000',
31 |               '20140428-190051-data-2004-4000000',
32 |               '20140428-190051-data-2004-6000000']
33 | pickle2005 = ['20140428-190051-data-2005',
34 |               '20140428-190051-data-2005-2000000',
35 |               '20140428-190051-data-2005-4000000',
36 |               '20140428-190051-data-2005-6000000']
37 | pickle2006 = ['20140428-190051-data-2006',
38 |               '20140428-190051-data-2006-2000000',
39 |               '20140428-190051-data-2006-4000000',
40 |               '20140428-190051-data-2006-6000000']
41 | pickle2007 = ['20140428-190051-data-2007',
42 |               '20140428-190051-data-2007-2000000',
43 |               '20140428-190051-data-2007-4000000',
44 |               '20140428-190051-data-2007-6000000']
45 | pickle2008 = ['20140428-190051-data-2008',
46 |               '20140428-190051-data-2008-2000000',
47 |               '20140428-190051-data-2008-4000000',
48 |               '20140428-190051-data-2008-6000000']
49 |
50 | #
51 | # function: loadData()
52 | # description: Loads a dataset from a pickled file and separates the x variables (features) from the y value (delay).
53 | # input: fileName= name of a pickled file
54 | # output: x and y matrices to be used for logistic regression
55 | #
56 |
57 |
58 | def loadData(fileName):
59 |     if not os.path.exists(fileName):
60 |         print 'downloading', fileName, 'from s3'
61 |         conn = S3Connection(
62 |             'AKIAJ3S6FFCVZ7NZPPPA', 'egDauV1C6HY3Q31tjpQg4IiMwSq/Sm4ATASYVl+7')
63 |         bucket = conn.get_bucket('i290-aero')
64 |         k = Key(bucket)
65 |         k.key = fileName
66 |         k.get_contents_to_filename(fileName)
67 |         print 'downloaded', fileName, 'from s3'
68 |
69 |     print 'now unpickle...'
70 |     x = pickle.load(open(fileName, "rb"))
71 |     x = np.array(x)
72 |     print 'x.shape = ', x.shape, x[:, -1:].shape
73 |     y = x[:, -1:].copy()  # last col is y value (delay or not)
74 |     x[:, -1:] = 1.  # reuse the label column as the intercept term
75 |     return x, y
76 |
77 |
78 | #
79 | # function: gradientDescent()
80 | # description: Runs logistic regression by stochastic gradient descent and estimates the coefficients.
81 | # input: x= features to be used for logistic regression
82 | #        y= ground truth value of delay
83 | #        numIterations= number of iterations to take for logistic regression
84 | #        dimension= dimension of x matrix
85 | #        theta= coefficient we try to find
86 | # output: theta= coefficient matrix we have found to predict delay
87 | #
88 |
89 | def gradientDescent(x, y, numIterations, dimension, theta):
90 |     # theta = np.zeros(dimension)[np.newaxis].transpose()
91 |     for i in range(1, numIterations):
92 |         randIdx = random.randint(0, len(x) - 1)
93 |         xTrans = x[randIdx][np.newaxis].transpose()
94 |         # print theta.transpose(), xTrans
95 |         u = 1 / (1 + np.exp(np.dot(theta.transpose() * (-1), xTrans)))  # sigmoid of theta'x
96 |         loss = y[randIdx] - u
97 |         gradient = np.dot(loss[0][0], xTrans)
98 |         # update with a decaying 1/i step size
99 |         theta = theta + gradient / i
100 |     return theta
101 |
102 |
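# [Editor's note] gradientDescent() above is stochastic gradient ascent on the
# logistic log-likelihood: theta <- theta + (y - sigmoid(theta'x)) * x / i.
# The snippet below is an editor's aside (synthetic, illustrative data; not
# part of the original file) that sanity-checks it on a separable problem.
# It is left commented out so the script's behavior is unchanged:
#
# np.random.seed(0)
# random.seed(0)
# n = 2000
# x_demo = np.hstack([np.random.randn(n, 1), np.ones((n, 1))])  # feature + intercept
# y_demo = (x_demo[:, :1] > 0).astype(float)  # 'delayed' iff the feature is positive
# theta_demo = gradientDescent(x_demo, y_demo, 100000, x_demo.shape[1],
#                              np.zeros((2, 1)))
# print theta_demo  # the first coefficient should come out clearly positive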
103 | def main():
104 |     # arg = sys.argv
105 |     # if len(arg) < 2:
106 |     #     print 'USE: $ python logisticRegression.py [dataset_file]'
107 |     #     return
108 |     # x, y = loadData(arg[1])
109 |
110 |     # x, x0, x1, y = getData('classification.dat')
111 |
112 |     # train theta on seven years of data (2001-2006 and 2008; 2007 is held out for testing)
113 |     if not os.path.exists('pickled_theta'):
114 |         theta = None
115 |         for elem in pickle2001 + pickle2002 + pickle2003 + pickle2004 + pickle2005 + pickle2006 + pickle2008:
116 |             x, y = loadData(elem)
117 |             if theta is None:  # 'is None' avoids NumPy's elementwise == comparison
118 |                 theta = np.zeros(x.shape[1])[np.newaxis].transpose()
119 |                 print 'theta is None...... initialize..........', theta.shape
120 |             theta = gradientDescent(x, y, 100000, x.shape[1], theta)
121 |             print 'finished gradientDescent of ', elem
122 |             print 'theta', theta
123 |
124 |         # pickle trained theta
125 |         f = open('pickled_theta', 'wb')
126 |         pickle.dump(theta, f, protocol=pickle.HIGHEST_PROTOCOL)
127 |         f.close()
128 |
129 |     # load pickled theta
130 |     theta = pickle.load(open('pickled_theta', 'rb'))
131 |
132 |     # predict with test dataset
133 |     accu = 0.
134 |     length = 0.
135 |     tp, tn, fp, fn = 0., 0., 0., 0.
136 |     for elem in pickle2007:
137 |         if not os.path.exists('dot-' + elem) or not os.path.exists('y-' + elem):
138 |             x, y = loadData(elem)
139 |             dotProduct = np.dot(x, theta)
140 |             print '============= dot product ============='
141 |             print dotProduct
142 |             print '============= y ============='
143 |             print y
144 |             pickle.dump(
145 |                 dotProduct, open('dot-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
146 |             pickle.dump(
147 |                 y, open('y-' + elem, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
148 |         else:
149 |             dotProduct = pickle.load(open('dot-' + elem, 'rb'))
150 |             y = pickle.load(open('y-' + elem, 'rb'))
151 |
152 |         reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
153 |         prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
154 |
155 |         for i in range(len(prob)):
156 |             if prob[i] == 1 and y[i] == 1:
157 |                 accu += 1
158 |                 tp += 1
159 |             elif prob[i] == 1 and y[i] == 0:
160 |                 fp += 1
161 |             elif prob[i] == 0 and y[i] == 1:
162 |                 fn += 1
163 |             elif prob[i] == 0 and y[i] == 0:
164 |                 accu += 1
165 |                 tn += 1
166 |             else:
167 |                 raise Exception('unexpected prediction/label pair', prob[i], y[i])
168 |         length += len(prob)
169 |     # print accuracy, precision, and recall
170 |     print 'accuracy = ', accu * 100 / length, (tp + tn) / (tp + fp + fn + tn)
171 |     print 'precision = ', tp / (tp + fp)
172 |     print 'recall = ', tp / (tp + fn)
173 |
174 |     # graph('(-1) * theta[2][0] / theta[1][0] - (theta[0][0] / theta[1][0]) * x', range(-3, 5))
175 |
176 |
177 |
178 | if __name__ == '__main__':
179 |     main()
180 |
--------------------------------------------------------------------------------
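[Editor's note] The counter loop above implements the textbook definitions: accuracy = (tp + tn) / (tp + tn + fp + fn), precision = tp / (tp + fp), recall = tp / (tp + fn). For 0/1 label lists, scikit-learn (already a dependency of naive bayes.py below) computes the same quantities; a tiny self-contained check with made-up labels:

from sklearn.metrics import accuracy_score, precision_score, recall_score

y_true = [1, 0, 1, 1, 0]  # illustrative ground truth
y_pred = [1, 0, 0, 1, 1]  # illustrative predictions: tp=2, tn=1, fp=1, fn=1
print(accuracy_score(y_true, y_pred))   # 3/5 = 0.6
print(precision_score(y_true, y_pred))  # 2/3
print(recall_score(y_true, y_pred))     # 2/3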
/lr_app2.py:
--------------------------------------------------------------------------------
1 | #
2 | # lr_app2.py
3 | # author: eunkwang joo
4 | # description: Loads the trimmed datasets stored as CSV files and fits a logistic regression with pandas and statsmodels to estimate the coefficients. It then evaluates prediction accuracy on a held-out sample.
5 | #
6 |
7 | import pandas as pd
8 | import statsmodels.api as sm
9 | # import pylab as pl
10 | import numpy as np
11 | import sys
12 | import random
13 | import os
14 | import pickle
15 |
16 | # df = pd.read_csv('trimmed2_2001.csv')#sys.argv[1])
17 |
18 |
19 | #
20 | # function: reader()
21 | # description: Loads a CSV dataset into a pandas dataframe.
22 | # input: f= name of a csv file of dataset
23 | # output: d= dataframe loaded from csv dataset
24 | #
25 |
26 | def reader(f):
27 |     d = pd.read_csv(f, header=0)  # , axis=1)
28 |     # d.columns = range(d.shape[1])
29 |     return d
30 |
31 |
32 | #
33 | # function: shuffle()
34 | # description: Shuffles the data.
35 | # input: df= dataframe which holds data
36 | #        n= number of shuffles
37 | #        axis= axis along which to shuffle
38 | # output: df= shuffled dataframe
39 | #
40 |
41 | def shuffle(df, n=1, axis=0):
42 |     df = df.copy()
43 |     for _ in range(n):
44 |         df.apply(np.random.shuffle, axis=axis)
45 |     return df
46 |
47 |
48 | # search for csv files in the current directory
49 | for dirpath, dirnames, filenames in os.walk('.'):
50 |     break  # stop after the top level; the original bare 'pass' kept only the last directory visited
51 |
52 | filenames = [f for f in filenames if '.csv' in f]
53 | filenames.sort()
54 | print filenames
55 | # concatenate all csv files in one dataframe
56 | #[1532189 rows x 6 columns]
57 | df = pd.concat([reader(f) for f in filenames], keys=filenames)
58 |
59 | print df.head()
60 | print df.columns
61 |
62 | # dumm1 = pd.get_dummies(df['carrier'], prefix='carrier')
63 | # dumm2 = pd.get_dummies(df['dest'], prefix='dest')
64 | # dumm3 = pd.get_dummies(df['origin'], prefix='origin')
65 | # dumm4 = pd.get_dummies(df['tailNum'], prefix='tailNum')
66 |
67 | cols = ['delay', 'dayOfWeek', 'depTime']
68 |
69 | # data = df[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
70 | # data = df[cols].join(dumm1).join(dumm2).join(dumm3)
71 | # data['intercept'] = 1.0
72 | # print data.head() #[5 rows x 123 columns] including delay column
73 |
74 | # data_delay = data[data['delay'] == 1]
75 | # data_nodelay = data[data['delay'] == 0]
76 |
77 | # get delayed data only
78 | data_delay = df[df['delay'] == 1]
79 | rows = random.sample(data_delay.index, len(data_delay))  # samples every row, so data_delay_2 below is empty
80 | data_delay_1 = data_delay.ix[rows]
81 | data_delay_2 = data_delay.drop(rows)
82 |
83 | # get not delayed data only
84 | data_nodelay = df[df['delay'] == 0]
85 | rows = random.sample(data_nodelay.index, len(data_delay))
86 | data_nodelay = data_nodelay.ix[rows]
87 | # get sample dataset of 50% delayed and 50% not delayed data
88 | data_halfhalf = pd.concat([data_delay, data_nodelay])
89 |
90 | rows = random.sample(data_nodelay.index, len(data_delay) / 2)
91 | data_nodelay = data_nodelay.ix[rows]
92 | data_halfhalf_2 = pd.concat([data_delay_2, data_nodelay])  # leftover split; not used for training below
93 |
94 | # make dummy variables of carrier, dest, and origin
95 | dumm1 = pd.get_dummies(data_halfhalf['carrier'], prefix='carrier')
96 | dumm2 = pd.get_dummies(data_halfhalf['dest'], prefix='dest')
97 | dumm3 = pd.get_dummies(data_halfhalf['origin'], prefix='origin')
98 | data_halfhalf = data_halfhalf[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(
99 |     dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
100 | data_halfhalf['intercept'] = 1.0  # (552264, 117)
101 | # data_halfhalf = shuffle(data_halfhalf)
102 | # data_halfhalf.reindex(np.random.permutation(data_halfhalf.index))
103 | print 'delay = ', len(data_delay), len(data_nodelay), len(data_halfhalf)
104 |
105 |
106 | dumm1 = pd.get_dummies(data_halfhalf_2['carrier'], prefix='carrier')
107 | dumm2 = pd.get_dummies(data_halfhalf_2['dest'], prefix='dest')
108 | dumm3 = pd.get_dummies(data_halfhalf_2['origin'], prefix='origin')
109 | data_halfhalf_2 = data_halfhalf_2[cols].join(dumm1.ix[:, 'carrier_3.0':]).join(
110 |     dumm2.ix[:, 'dest_6.0':]).join(dumm3.ix[:, 'origin_105.0':])
111 | data_halfhalf_2['intercept'] = 1.0  # (552264, 117)
112 |
113 |
114 | # train dataset with logistic regression algorithm
115 | train_cols = data_halfhalf.columns[1:]
116 | logit = sm.Logit(data_halfhalf['delay'], data_halfhalf[train_cols])
117 | result = logit.fit(maxiter=1000)
118 |
119 | ff = open('halfhalf_sample_re3', 'w')
120 | ff.write(str(result.summary()))
121 | ff.close()
122 | print result.summary()
123 |
124 |
125 | # finally, we have theta - the fitted coefficients.
126 | a = np.array(result.params)
127 | pickle.dump(a, open('theta_half5', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
128 | theta = pickle.load(open('theta_half5', 'rb'))
129 |
130 |
131 | # hold out 10% of the balanced sample for testing (a single fold rather than full k-fold)
132 |
133 | '''
134 | df_test = pd.read_csv('trimmed2_2008.csv')
135 | dumm_test1 = pd.get_dummies(df_test['carrier'], prefix='carrier')
136 | dumm_test2 = pd.get_dummies(df_test['dest'], prefix='dest')
137 | dumm_test3 = pd.get_dummies(df_test['origin'], prefix='origin')
138 | data_test = df_test[cols].join(dumm_test1.ix[:, 'carrier_3.0':]).join(dumm_test2.ix[:, 'dest_6.0':]).join(dumm_test3.ix[:, 'origin_105.0':])
139 | data_test['intercept'] = 1.0
140 | data_test_cal = data_test.drop('delay', 1)
141 | dot = np.dot(data_test_cal, theta)
142 | '''
143 |
144 | rows = random.sample(data_halfhalf.index, len(data_halfhalf) / 10)
145 | df_10 = data_halfhalf.ix[rows]
146 | # df_90 = data_halfhalf.drop(rows)
147 | df_10_cal = df_10.drop('delay', 1)
148 | dotProduct = np.dot(df_10_cal, theta)  # m x 122 * 122 x 1
149 |
150 | # get inverse logit (sigmoid) of the linear predictor
151 | reverseLogit = [np.exp(dot) / (1 + np.exp(dot)) for dot in dotProduct]
152 | prob = [1 if rev >= 0.5 else 0 for rev in reverseLogit]
153 |
154 | # predict with test dataset and measure accuracy, precision, and recall
155 | y = df_10['delay'].values  # use positional values; the sampled index is not 0..n-1
156 | tp, tn, fp, fn = 0., 0., 0., 0.
157 | for i in range(len(prob)):
158 |     if prob[i] == 1 and y[i] == 1:
159 |         tp += 1
160 |     elif prob[i] == 1 and y[i] == 0:
161 |         fp += 1
162 |     elif prob[i] == 0 and y[i] == 1:
163 |         fn += 1
164 |     elif prob[i] == 0 and y[i] == 0:
165 |         tn += 1
166 |     else:
167 |         raise Exception('unexpected prediction/label pair', prob[i], y[i])
168 |
169 | print 'accuracy = ', (tp + tn) / (tp + fp + fn + tn)
170 | print 'precision = ', tp / (tp + fp)
171 | print 'recall = ', tp / (tp + fn)
172 | print tp, tn, fp, fn
173 |
174 | # >>> print 'accuracy = ', (tp + tn) / (tp + fp + fn + tn)
175 | # accuracy = 0.60288632166
176 | # >>> print 'precision = ', tp / (tp + fp)
177 | # precision = 0.607973048849
178 | # >>> print 'recall = ', tp / (tp + fn)
179 | # recall = 0.586353790614
180 | # >>> print tp, tn, fp, fn
181 | # 16242.0 17053.0 10473.0 11458.0
182 |
183 |
184 | # measure ROC curve
185 |
186 | rlsort = reverseLogit[:]
187 | rlsort.sort()
188 | diff = min([j - i for i, j in zip(rlsort[:-1], rlsort[1:])])  # smallest gap between sorted scores (unused below; the sweep steps by 0.01 instead)
189 |
190 | p = len([e for e in y if e == 1])
191 | n = len([e for e in y if e == 0])
192 | j = rlsort[0]
193 | r = []
194 | while j <= rlsort[-1]:
195 |     prob = [1 if rev >= j else 0 for rev in reverseLogit]
196 |     p1 = [x for x in prob if x == 1]
197 |     # print p1
198 |     # raw_input()
199 |     tp, fp = 0., 0.
200 |     for i in range(len(prob)):
201 |         if prob[i] == 1 and y[i] == 1:
202 |             tp += 1
203 |         elif prob[i] == 1 and y[i] == 0:
204 |             fp += 1
205 |     r.append((fp / float(n), tp / float(p)))
206 |     # print j, tp, fp, p, n
207 |     j += 0.01
208 |
209 | # plot ROC curve
210 | import matplotlib as mpl
211 | mpl.use('Agg')
212 | import matplotlib.pyplot as plt
213 | from matplotlib.backends.backend_pdf import PdfPages
214 |
215 | pickle.dump(r, open('roc.list', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)  # persist the ROC points computed above, then plot them
216 | fig = plt.figure()
217 | plt.plot(*zip(*r), marker='o', color='r', ls='')
218 | pp = PdfPages('foo.pdf')
219 | pp.savefig(fig)
220 | pp.close()
221 |
--------------------------------------------------------------------------------
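[Editor's note] The modeling pattern in lr_app2.py — one-hot encode the categorical columns with pd.get_dummies, drop one level as the reference, add an explicit intercept, and fit sm.Logit — in a minimal end-to-end sketch on made-up data (all names below are illustrative, not project code):

import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(0)
toy = pd.DataFrame({
    'delay': np.random.randint(0, 2, 200),
    'depTime': np.random.randint(0, 3, 200),
    'carrier': np.random.choice(['AA', 'UA', 'WN'], 200),
})
dumm = pd.get_dummies(toy['carrier'], prefix='carrier')
X = toy[['depTime']].join(dumm.iloc[:, 1:])  # drop the first dummy level as the reference category
X['intercept'] = 1.0
result = sm.Logit(toy['delay'], X.astype(float)).fit(disp=0)
print(result.params)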
/model_selector.py:
--------------------------------------------------------------------------------
1 | #
2 | # model_selector.py
3 | # Author: Ryan Jung
4 | # Description: This script graphs the results of validation tests with precision on the
5 | # y-axis and recall on the x-axis.
6 | # Because we only used 8-fold validation for the Naive Bayes model, this script is only
7 | # used for the testing results of that validation.
8 | #
9 |
10 | import matplotlib.pyplot as plt
11 | plt.rcdefaults()
12 |
13 | # Hard code of testing results of form [precision, recall, accuracy, title]
14 | DM_TEST_DATA = [
15 |     [0.59, 0.61, 0.61, 'NB 2008'], [0.60, 0.61, 0.60, 'NB 2007'], [
16 |         0.60, 0.63, 0.62, 'NB 2006'], [0.62, 0.64, 0.64, 'NB 2005'],
17 |     [0.63, 0.66, 0.66, 'NB 2004'], [0.65, 0.70, 0.70, 'NB 2003'], [0.60, 0.65, 0.65, 'NB 2002'], [0.58, 0.62, 0.61, 'NB 2001']]
18 |
19 | #
20 | # Function: calc_f1_score(precision, recall, accuracy)
21 | # Description: This function calculates the F1 score = 2*(precision * recall) / (precision + recall)
22 | # Input: Floating point values of precision, recall, and accuracy (not used)
23 | # Output: Floating point F1 score
24 | #
25 |
26 |
27 | def calc_f1_score(precision, recall, accuracy):
28 |     return (float(2 * (precision * recall) / (precision + recall)))
29 |
30 | #
31 | # Main Function
32 | # Description: Creates array of precision and array of recall values. Uses best values to
33 | # track highest F1 score and title of test with best result.
34 | #
35 |
36 | precision_dm_array = []
37 | recall_dm_array = []
38 | dm_best_f1 = 0.0
39 | index = 0
40 | dm_best_title = 'None'
41 |
42 | for each in DM_TEST_DATA:
43 |     precision_dm_array.append(each[0])
44 |     recall_dm_array.append(each[1])
45 |
46 |     f1 = calc_f1_score(each[0], each[1], each[2])
47 |     if(f1 > dm_best_f1):
48 |         dm_best_f1 = f1
49 |         best_index = index
50 |         dm_best_title = each[3]
51 |     index += 1
52 |
53 | # prints title of Best performing model by F1 score
54 | # print "The Best Naive Bayes Model is: Model " + str(dm_best_title)
55 |
56 | # Scatter plot visualization of results with precision on y-axis and
57 | # recall on x-axis
58 | fig = plt.subplot(111)
59 | fig.scatter(recall_dm_array, precision_dm_array, color='blue')  # x=recall, y=precision, matching the axis labels below
60 | fig.set_xlabel('Recall')
61 | fig.set_ylabel('Precision')
62 |
63 | plt.show()
64 |
--------------------------------------------------------------------------------
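[Editor's note] Working the F1 formula through the hard-coded results: the NB 2003 run (precision 0.65, recall 0.70) gives F1 = 2(0.65)(0.70) / (0.65 + 0.70) = 0.91 / 1.35 ≈ 0.674, the highest of the eight entries, so dm_best_title ends up as 'NB 2003'. For comparison:

print(calc_f1_score(0.65, 0.70, 0.70))  # NB 2003 -> ~0.674, the best
print(calc_f1_score(0.63, 0.66, 0.66))  # NB 2004 -> ~0.645, the runner-up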
/naive bayes.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | #
4 | # Naive Bayes.py
5 | # Author: Divyakumar Menghani
6 | # Description: This code reads the dataset into pandas dataframes, builds a Naive Bayes classifier, and predicts labels for a subset of the data. It also calculates metrics such as precision/recall/accuracy and F-score after classification. The output is dumped into pickle files which are used later for visualization.
7 | #
8 |
9 | import pickle
10 | import sklearn
11 | from sklearn.naive_bayes import *
12 | import pandas as pd
13 | import numpy as np
14 | from sklearn import *
15 | import os
16 | from sklearn.metrics import *
17 | from sklearn import metrics, preprocessing
18 | from sklearn import svm, naive_bayes, neighbors, tree
19 |
20 | #
21 | # Function: createPickle()
22 | # Description: This function will create a pickle file.
23 | # Input: data structure that you want to pickle
24 | # Output: a pickle file for the data structure. The file is stored in the
25 | # same path the code is running from
26 | #
27 |
28 |
29 | def createPickle(data, filename):
30 |     with open(filename, 'wb') as f:
31 |         pickle.dump(data, f)
32 |     print "Pickled", filename
33 |
34 |
35 | # Global constants for this code
36 | print "Setting constants..."
37 |
38 | TRAINING_LINE_NUMBER = 8000000  # Number of lines to be read from input files
39 | # List of years for training and testing
40 | YEARS = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']
41 | INPUT_FILE_PATH = "/home/dmenghani/python/"  # Unix path
42 | # INPUT_FILE_PATH = "C:\\data\\airline\\"  # Windows path
43 | SKIP_FIRST_LINE = True  # To skip the first line, as it's the header
44 |
45 | # Creating the master data frame from all years.
46 | master = []
47 | print "Reading into Pandas frame..."
48 | try:
49 |     for year in YEARS:
50 |         path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))
51 |         print "\n", path
52 |         dfPart = pd.read_csv(
53 |             path, nrows=TRAINING_LINE_NUMBER, skiprows=0, usecols=[
54 |                 u'Year',
55 |                 u'Month',
56 |                 u'DayofMonth',
57 |                 u'DayOfWeek',
58 |                 u'UniqueCarrier',
59 |                 u'DepTime',
60 |                 u'TailNum',
61 |                 u'Origin',
62 |                 u'Dest',
63 |                 u'DepDelay',
64 |                 # u'ArrDelay',
65 |                 u'Cancelled',
66 |                 # u'ArrTime',
67 |                 # u'ArrDelay',
68 |                 # u'Distance'
69 |             ])
70 |         print len(dfPart)
71 |         # Removing cancelled flights from each year
72 |         dfPart = dfPart[dfPart['Cancelled'] == 0]
73 |         rows = np.random.choice(
74 |             np.random.permutation(dfPart.index.values), len(dfPart) // 3, replace=False)  # 33% sampling of training data
75 |         print rows
76 |         sampled_dfPart = dfPart.ix[rows]
77 |         sampled_dfPart = dfPart  # note: this overrides the 33% sample; the full year is used
78 |         master.append(sampled_dfPart)
79 |         print
80 | except Exception as e:
81 |     print "Supplemental Data Import failed", e
82 |
83 | # Building the master frame by concatenating all years
84 | dfMaster = pd.concat(master, ignore_index=True)
85 | master = []
86 | dfPart = []
87 |
88 | print "Total length - ", len(dfMaster)
89 | del dfMaster['Cancelled']  # Column not needed
90 |
91 | dfMaster.fillna(0, inplace=True)
92 |
93 | # Converting to appropriate datatypes for numeric cols.
94 | dfMaster['Year'] = dfMaster['Year'].astype('int')
95 | dfMaster['Month'] = dfMaster['Month'].astype('int')
96 | dfMaster['DayofMonth'] = dfMaster['DayofMonth'].astype('int')
97 | dfMaster['DayOfWeek'] = dfMaster['DayOfWeek'].astype('int')
98 | dfMaster['DepTime'] = dfMaster['DepTime'].astype('int')
99 | dfMaster['DepDelay'] = dfMaster['DepDelay'].astype('int')
100 |
101 | df = dfMaster
102 |
103 | # Since we don't have a classification label in the data, we create
104 | # one. A threshold of 5 minutes was chosen.
105 | print "Calculating classification label..."
106 | df['label'] = 0
107 | df.label[df.DepDelay >= 5] = 1
108 | df.label[df.DepDelay < 5] = 0
109 | print "Actual delayed flights -", np.sum(dfMaster['label']) / len(dfMaster['label'])
110 |
111 | del df['DepDelay']
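# [Editor's note] The two indexed assignments above use pandas' chained-
# assignment idiom, which newer pandas versions warn about. The equivalent
# vectorized one-liner (an editor's aside, not a behavioral change):
#
#     df['label'] = (df['DepDelay'] >= 5).astype(int)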
112 |
113 | print "Dataframe shape - ", df.shape
114 | print "Columns -", df.columns
115 |
116 | # Converting categorical data to numeric for cols - TailNum,
117 | # UniqueCarrier, Dest, Origin
118 | print "Converting categorical data to numeric..."
119 | for col in set(df.columns):
120 |     if df[col].dtype == np.dtype('object'):
121 |         print "Converting...", col
122 |         if col == 'TailNum':
123 |             s = np.unique(df[col].values)
124 |             TailNum = pd.Series([x[0] for x in enumerate(s)], index=s)
125 |         if col == 'UniqueCarrier':
126 |             s = np.unique(df[col].values)
127 |             UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)
128 |         if col == 'Dest':
129 |             s = np.unique(df[col].values)
130 |             Dest = pd.Series([x[0] for x in enumerate(s)], index=s)
131 |         if col == 'Origin':
132 |             s = np.unique(df[col].values)
133 |             Origin = pd.Series([x[0] for x in enumerate(s)], index=s)
134 |
135 | # Creating Pickle files for the list containing key-value pairs
136 | createPickle(Dest, 'Dest_2008.pkl')
137 | createPickle(Origin, 'Origin_2008.pkl')
138 | createPickle(UniqueCarrier, 'UniqueCarrier_2008.pkl')
139 | createPickle(TailNum, 'TailNum_2008.pkl')
140 | print "Pickle completed."
141 |
142 | #
143 | # Function: getTailNum()
144 | # Description: This function converts each input categorical value to its corresponding numeric key.
145 | # Input: categorical value you want to convert
146 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
147 | #
148 |
149 |
150 | def getTailNum(inTailNum):
151 |     out = []
152 |     for x, y in inTailNum.iteritems():
153 |         out.append(TailNum.get_value(y))
154 |     return out
155 |
156 | #
157 | # Function: getDest()
158 | # Description: This function converts each input categorical value to its corresponding numeric key.
159 | # Input: categorical value you want to convert
160 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
161 | #
162 |
163 |
164 | def getDest(inDest):
165 |     out = []
166 |     for x, y in inDest.iteritems():
167 |         out.append(Dest.get_value(y))
168 |     return out
169 |
170 | #
171 | # Function: getOrigin()
172 | # Description: This function converts each input categorical value to its corresponding numeric key.
173 | # Input: categorical value you want to convert
174 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
175 | #
176 |
177 |
178 | def getOrigin(inOrign):
179 |     out = []
180 |     for x, y in inOrign.iteritems():
181 |         out.append(Origin.get_value(y))
182 |     return out
183 |
184 | #
185 | # Function: getCarrier()
186 | # Description: This function converts each input categorical value to its corresponding numeric key.
187 | # Input: categorical value you want to convert
188 | # Output: a numeric value corresponding to the value passed. It uses the list created previously for lookup.
189 | #
190 |
191 |
192 | def getCarrier(inCarrier):
193 |     out = []
194 |     for x, y in inCarrier.iteritems():
195 |         out.append(UniqueCarrier.get_value(y))
196 |     return out
197 |
198 | # Converting TailNum
199 | df['TailNum'] = getTailNum(df['TailNum'])
200 | print "TailNum completed."
201 |
202 | # Converting UniqueCarrier
203 | df['UniqueCarrier'] = getCarrier(df['UniqueCarrier'])
204 | print "UniqueCarrier completed."
205 |
206 | # Converting Dest
207 | df['Dest'] = getDest(df['Dest'])
208 | print "Dest completed."
209 |
210 | # Converting Origin
211 | df['Origin'] = getOrigin(df['Origin'])
212 | print "Origin completed."
213 |
214 | print "Conversion to numeric completed."
215 |
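# [Editor's note] Each getX() helper above performs the same row-by-row Series
# lookup. Because TailNum/UniqueCarrier/Dest/Origin are pd.Series keyed by the
# raw strings, pandas' map() would collapse each conversion to a single line,
# e.g. (an equivalent sketch, not a behavioral change):
#
#     df['Dest'] = df['Dest'].map(Dest)
#     df['Origin'] = df['Origin'].map(Origin)
#     df['UniqueCarrier'] = df['UniqueCarrier'].map(UniqueCarrier)
#     df['TailNum'] = df['TailNum'].map(TailNum)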
216 | # Building classifier
217 | print "Begin cross validation..."
218 |
219 | # Choosing features for classifier
220 | features = df.columns[0:9]  # all columns except the label
221 |
222 | # Creating lists for storing results for cross validation.
223 | accuracy = {}
224 | results = {}
225 | matrix = {}
226 | prec = {}
227 | recall = {}
228 |
229 | for year in YEARS:
230 |     print "Testing on - ", year
231 |     train = df[df['Year'] != int(year)]  # Test on 1 year, train on the other 7 years
232 |     test = df[df['Year'] == int(year)]
233 |     # test = test[test['Origin'].isin([Origin['OAK'], Origin['SFO']])]
234 |     print len(train), len(test)
235 |     rows = np.random.choice(np.random.permutation(
236 |         test.index.values), len(test) // 2, replace=False)  # 50% sampling of test data to avoid memory errors
237 |     # print rows
238 |     sampled_test = test.ix[rows]
239 |     sampled_test = test  # note: this overrides the 50% sample; the full test year is used
240 |     # Putting the last column of Training data into a list
241 |     trainTargets = np.array(train['label']).astype(int)
242 |
243 |     # Putting the last column of Testing data into a list
244 |     testTargets = np.array(sampled_test['label']).astype(int)
245 |     print "Train length - ", len(train), "Test length - ", len(sampled_test)
246 |     print train['Year']
247 |     print test['Year']
248 |     print "Model fitting and prediction started..."
249 |     # Building the classifier and fitting the train data
250 |     gnb = GaussianNB()
251 |     y_gnb = gnb.fit(train[features], trainTargets).predict(
252 |         sampled_test[features])
253 |     # Storing results in a new column in the dataframe.
254 |     sampled_test['pred_label'] = y_gnb
255 |     print "Classification completed."
256 |     # Creating pickle files with the classifier and the results of classifier
257 |     createPickle(gnb, INPUT_FILE_PATH + "classifier_" + year + ".pkl")
258 |     createPickle(y_gnb, INPUT_FILE_PATH + "label_" + year + ".pkl")
259 |     sampled_test.to_csv(
260 |         os.path.join(INPUT_FILE_PATH, "_dfTest" + year + ".csv"), index=False)  # '_dfTest<year>.csv' is the name the plotting scripts expect
261 |     # Calculating metrics using sklearn metrics functions
262 |     print "\nCalculating metrics..."
263 |     accuracy[int(year)] = accuracy_score(sampled_test['label'], y_gnb)
264 |     print "Accuracy score - ", accuracy[int(year)]
265 |     prec[int(year)] = precision_score(
266 |         sampled_test['label'], y_gnb, average='micro')
267 |     print "Precision Score - ", prec[int(year)]
268 |     recall[int(year)] = recall_score(
269 |         sampled_test['label'], y_gnb, average='micro')
270 |     print "Recall Score - ", recall[int(year)]
271 |     print "Confusion matrix"
272 |     matrix[int(year)] = metrics.confusion_matrix(
273 |         sampled_test['label'], y_gnb)
274 |     print matrix[int(year)]
275 |     results[int(year)] = precision_recall_fscore_support(
276 |         sampled_test['label'], y_gnb, average='micro')
277 |     print "Precision, recall, F-Score, Support - ", results[int(year)]
278 |     print "Classification report"
279 |     print classification_report(np.array(sampled_test['label']), y_gnb,
280 |                                 target_names=['on time', 'delayed'])  # names for the 0/1 labels; 'target_names' was previously undefined
281 |     print
282 |     train = []
283 |     test = []
284 |
285 | print "Accuracy\n", accuracy
286 | print "\nPrecision\n", prec
287 | print "\nRecall\n", recall
288 | print "\nMetrics\n", results
289 | print "\nMatrix\n", matrix
290 |
291 | # Finding mean of metrics
292 | print "\nMean Cross validation Precision score", np.mean(pd.Series(prec))
293 | print "\nMean Cross validation Recall score", np.mean(pd.Series(recall))
294 | print "\nMean Cross validation Accuracy score", np.mean(pd.Series(accuracy))
295 |
296 | # Pickling results
297 | print "\nPickling stuff..."
298 | createPickle(accuracy, 'accuracy.pkl')
299 | createPickle(prec, 'prec.pkl')
300 | createPickle(results, 'results.pkl')
301 | createPickle(matrix, 'matrix.pkl')
302 |
--------------------------------------------------------------------------------