├── Figures ├── 2.Average trip distance by hour.png ├── 3.Median trip distance by hour.png ├── 4.Average trip speeds by week.png ├── 5.Average trip speeds by hour.png └── 1.Histogram of the trip distance.png ├── README.md └── Capital One Data Challenge.py /Figures/2.Average trip distance by hour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/2.Average trip distance by hour.png -------------------------------------------------------------------------------- /Figures/3.Median trip distance by hour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/3.Median trip distance by hour.png -------------------------------------------------------------------------------- /Figures/4.Average trip speeds by week.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/4.Average trip speeds by week.png -------------------------------------------------------------------------------- /Figures/5.Average trip speeds by hour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/5.Average trip speeds by hour.png -------------------------------------------------------------------------------- /Figures/1.Histogram of the trip distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/1.Histogram of the trip distance.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Capital_One 2 | 3 | To run the code: 4 | 5 | 1) Change the 
'Path' to local path where data is stored 6 | 2) Use python (version 2) preferably. 7 | -------------------------------------------------------------------------------- /Capital One Data Challenge.py: -------------------------------------------------------------------------------- 1 | ######################################################################################################### 2 | 3 | ''' 4 | Import necessary libraries 5 | ''' 6 | 7 | global fig #To keep track of figure counts 8 | fig = 0 9 | 10 | import collections 11 | import pandas as pd 12 | #Import module for plotting 13 | import matplotlib.pyplot as plt 14 | from sklearn.linear_model import LinearRegression 15 | import numpy as np 16 | from sklearn.preprocessing import normalize 17 | 18 | ######################################################################################################### 19 | 20 | ''' 21 | 1. Importing the given data 22 | ''' 23 | 24 | path = r'E:\Users\Dell\Desktop\green_tripdata_2015-09.csv' #Change this to local path where data is stored 25 | df = pd.read_csv(path) #df is a DataFrame 26 | #The DataFrame 'df' has the given data 27 | 28 | ######################################################################################################### 29 | 30 | ''' 31 | 2. Plotting the histogram of the trip distance ("Trip Distance") 32 | ''' 33 | 34 | plt.figure(fig) 35 | fig = fig + 1 36 | #Number of bins are taken as 300. 
#The minimum Trip_distance is 0 and maximum is 603.1 (values noted by the original author)

#########################################################################################################

#Number of days in each week of september 2015 (1st-5th, 6th-12th, 13th-19th, 20th-26th, 27th-30th)
days_in_weeks = [5, 7, 7, 7, 4]
labels = ['First', 'Second', 'Third', 'Fourth', 'Fifth']

#Columns fed to the tip model. Tip amount is excluded, otherwise prediction
#would be a mere division problem.
#NOTE(review): 'Total_amount' is both a feature and the denominator of the
#target - confirm that this is intended.
columns_to_keep = ['RateCodeID', 'Pickup_longitude', 'Pickup_latitude',
                   'Dropoff_longitude', 'Dropoff_latitude', 'Passenger_count',
                   'Trip_distance', 'Fare_amount', 'Extra', 'improvement_surcharge',
                   'Total_amount', 'Payment_type', 'Trip_type ', 'MTA_tax']


def add_pickup_columns(frame):
    """Add 'pickup_times', 'hours' and 'day' columns to `frame` in place.

    All three are derived from the 'lpep_pickup_datetime' column; the pick up
    time is the reference time for every hourly / daily analysis below.
    Returns `frame` for convenience.
    """
    frame['pickup_times'] = pd.to_datetime(frame['lpep_pickup_datetime'])
    frame['hours'] = frame['pickup_times'].dt.hour
    frame['day'] = frame['pickup_times'].dt.day
    return frame


def hourly_stats(frame, value_column):
    """Return (means, medians) of `value_column` grouped by frame['hours'].

    Both results are lists of length 24 where position h holds the statistic
    for hour h (NaN when no row falls in that hour), so the position never
    depends on row order or dict ordering. 'inf' values (produced by a
    division by 0) are replaced with 0 before aggregating; NaN is ignored.
    """
    values = frame[value_column].replace(np.inf, 0)
    grouped = values.groupby(frame['hours'])
    all_hours = list(range(24))
    means = grouped.mean().reindex(all_hours)
    medians = grouped.median().reindex(all_hours)
    return list(means), list(medians)


def add_tip_percentage(frame):
    """Add 'tip_out_of_total_fare' (tip as a percentage of the total fare) in place."""
    frame['tip_out_of_total_fare'] = (frame['Tip_amount'] / frame['Total_amount']) * 100
    return frame


def tip_targets(frame):
    """Return the regression targets as a numpy array with NaN and +/-inf set to 0.

    np.nan_to_num alone would turn 'inf' (tip > 0 with a total of 0) into the
    largest representable float, which would dominate a least-squares fit.
    """
    percentages = frame['tip_out_of_total_fare'].replace([np.inf, -np.inf], np.nan)
    return percentages.fillna(0).values


def build_features(frame, feature_columns):
    """Return the feature matrix: NaN replaced with 0, every sample scaled to a unit vector.

    The per-sample l2 normalisation is used because the features have very
    different ranges.
    """
    raw = np.nan_to_num(frame[feature_columns].values)
    return normalize(raw, norm='l2', axis=1)


def add_trip_speed(frame):
    """Add 'time_taken' (trip duration in hours) and 'Avg_speed' (miles per hour) in place.

    Trips with a duration of 0 get an 'inf' (or NaN for 0 / 0) speed; the
    aggregation helpers deal with those values.
    """
    pickup = pd.to_datetime(frame['lpep_pickup_datetime'])
    dropoff = pd.to_datetime(frame['Lpep_dropoff_datetime'])
    #total_seconds() gives float seconds on every pandas version
    frame['time_taken'] = (dropoff - pickup).dt.total_seconds() / 3600
    #Average speed = Total distance / Total Time Taken
    frame['Avg_speed'] = frame['Trip_distance'] / frame['time_taken']
    return frame


def weekly_mean_speed(frame, week_lengths):
    """Return the mean 'Avg_speed' per week as a list with one entry per week.

    `week_lengths` holds the number of days in each consecutive week of the
    month; each row is assigned to a week from its own frame['day'] value, so
    the result does not depend on the order of the rows. Weeks without any
    trip give NaN. 'inf' speeds are replaced with 0, NaN is ignored.
    """
    last_day_of_week = np.cumsum(week_lengths)
    #side='left' keeps the last day of a week inside that week
    week_index = np.searchsorted(last_day_of_week, frame['day'].values, side='left')
    speeds = frame['Avg_speed'].replace(np.inf, 0)
    by_week = speeds.groupby(week_index).mean()
    return list(by_week.reindex(list(range(len(week_lengths)))))


def save_stem_plot(values, figure_number, xlabel, ylabel, title, filename,
                   tick_labels=None, ylim=None):
    """Draw `values` as a stem plot on figure `figure_number` and save it to `filename`."""
    plt.figure(figure_number)
    plt.stem(values)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if tick_labels is not None:
        plt.xticks(list(range(len(tick_labels))), tick_labels)
    plt.margins(0.1, 0.1)
    if ylim is not None:
        #applied after margins() so the explicit limits win
        plt.ylim(ylim)
    plt.title(title)
    plt.savefig(filename, dpi=900, bbox_inches='tight')


#The steps below run only when the file is executed as a script, so the
#helpers above can be imported on their own.
if __name__ == "__main__":

    #2. Histogram of the trip distance, drawn on the figure opened just before this section
    counts, bins, bars = plt.hist(df['Trip_distance'], bins=300, color='black', label='Trip_distance')
    plt.legend(loc='upper right')
    plt.xlabel('Trip Distance')
    plt.ylabel('Counts in the Corresponding Bins')
    plt.title('Histogram of Trip_distance (bins = 300)')
    plt.savefig('1.Histogram of the trip distance.png', dpi=900, bbox_inches='tight')

    #3. Report mean and median trip distance grouped by hour of day.
    add_pickup_columns(df)
    dist_avg_by_hour, dist_median_by_hour = hourly_stats(df, 'Trip_distance')

    save_stem_plot(dist_avg_by_hour, fig, 'Time of the day',
                   'Average trip distance in the hour',
                   'Average trip distance by hour',
                   '2.Average trip distance by hour.png')
    fig = fig + 1

    save_stem_plot(dist_median_by_hour, fig, 'Time of the day',
                   'Median trip distance in the hour',
                   'Median trip distance by hour',
                   '3.Median trip distance by hour.png')
    fig = fig + 1

    #4. Derived variable: tip as a percentage of the total fare
    add_tip_percentage(df)

    #4 (continued). Predictive model for tip as a percentage of the total fare
    train_targets = tip_targets(df)
    train = build_features(df, columns_to_keep)
    model_LR = LinearRegression()
    model_LR.fit(train, train_targets)
    #'model_LR' is the required model.
    #For validating, put the data in exact order as 'columns_to_keep',
    #assign that to to_be_tested and run:
    #model_LR.predict(normalize(np.nan_to_num(to_be_tested), norm='l2', axis=1))

    #5. Option A1: average speed (miles per hour) over the course of a trip
    add_trip_speed(df)

    #5. Option A2: average trip speeds in each week of September
    avg_by_week = weekly_mean_speed(df, days_in_weeks)
    save_stem_plot(avg_by_week, fig, 'Weeks',
                   'Average trip speeds in the week',
                   'Average trip speeds by week',
                   '4.Average trip speeds by week.png',
                   tick_labels=labels, ylim=[13, 17])
    fig = fig + 1

    #5. Option A3: average trip speed as a function of time of day
    avg_by_hour, _ = hourly_stats(df, 'Avg_speed')
    save_stem_plot(avg_by_hour, fig, 'Time of the day',
                   'Average trip speeds in the hour',
                   'Average trip speeds by hour',
                   '5.Average trip speeds by hour.png')
    fig = fig + 1

#########################################################################################################