├── Figures
    ├── 2.Average trip distance by hour.png
    ├── 3.Median trip distance by hour.png
    ├── 4.Average trip speeds by week.png
    ├── 5.Average trip speeds by hour.png
    └── 1.Histogram of the trip distance.png
├── README.md
└── Capital One Data Challenge.py


/Figures/2.Average trip distance by hour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/2.Average trip distance by hour.png


--------------------------------------------------------------------------------
/Figures/3.Median trip distance by hour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/3.Median trip distance by hour.png


--------------------------------------------------------------------------------
/Figures/4.Average trip speeds by week.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/4.Average trip speeds by week.png


--------------------------------------------------------------------------------
/Figures/5.Average trip speeds by hour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/5.Average trip speeds by hour.png


--------------------------------------------------------------------------------
/Figures/1.Histogram of the trip distance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Capital_One/master/Figures/1.Histogram of the trip distance.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Capital_One
2 | 
3 | To run the code: 
4 | 
5 | 1) Change the `path` variable in `Capital One Data Challenge.py` to the local path where the data is stored.
6 | 2) Preferably use Python 2.
7 | 


--------------------------------------------------------------------------------
/Capital One Data Challenge.py:
--------------------------------------------------------------------------------
  1 | #########################################################################################################
  2 | 
  3 | '''
  4 | Import necessary libraries
  5 | '''
  6 | 
  7 | global fig    #To keep track of figure counts
  8 | fig = 0
  9 | 
 10 | import collections
 11 | import pandas as pd
 12 | #Import module for plotting
 13 | import matplotlib.pyplot as plt
 14 | from sklearn.linear_model import LinearRegression
 15 | import numpy as np
 16 | from sklearn.preprocessing import normalize
 17 | 
 18 | #########################################################################################################
 19 | 
 20 | '''
 21 | 1. Importing the given data
 22 | '''
 23 | 
 24 | path = r'E:\Users\Dell\Desktop\green_tripdata_2015-09.csv'    #Change this to local path where data is stored
 25 | df = pd.read_csv(path)          #df is a DataFrame
 26 | #The DataFrame 'df' has the given data
 27 | 
 28 | #########################################################################################################
 29 | 
 30 | '''
 31 | 2. Plotting the histogram of the trip distance ("Trip Distance")
 32 | '''
 33 | 
 34 | plt.figure(fig)
 35 | fig = fig + 1
 36 | #Number of bins are taken as 300. The minimum Trip_distance is 0 and maximum is 603.10000000000002
 37 | counts, bins, bars = plt.hist(df['Trip_distance'], bins=300, color='black', label='Trip_distance')
 38 | plt.legend(loc='upper right')
 39 | plt.xlabel('Trip Distance')
 40 | plt.ylabel('Counts in the Corresponding Bins')
 41 | plt.title('Histogram of Trip_distance (bins = 300)')
 42 | plt.savefig('1.Histogram of the trip distance.png', dpi=900, bbox_inches='tight')
 43 | 
 44 | #########################################################################################################
 45 | 
 46 | '''
 47 | 3. Report mean and median trip distance grouped by hour of day.
 48 | '''
 49 | 
 50 | #I have considered the pick up datetime as the time to work upon here, analysing on hourly basis
 51 | df['pickup_times'] = pd.to_datetime(df['lpep_pickup_datetime'])
 52 | #extract the hour counterpart
 53 | df['hours'] = pd.DatetimeIndex(df['pickup_times']).hour
 54 | 
 55 | #Temporary dataframe used, not to tamper the original dataframe
 56 | temp1 = df
 57 | 
 58 | #Sorting by the 'hours' column, to facilitate the calculation of average
 59 | temp1 = temp1.sort_values(['hours'], ascending=True)
 60 | temp1 = temp1.reset_index()
 61 | hours_dict = dict(collections.Counter(temp1['hours']))
 62 | 
 63 | dist_avg_by_hour = []
 64 | dist_median_by_hour = []
 65 | j = 0
 66 | 
 67 | #Due to division by 0, some have 'inf' value. Replacing that with '0'
 68 | #Averaging Calculation
 69 | for i in hours_dict.values():
 70 |     temp = temp1['Trip_distance'][j:(j+i)].replace(np.inf, 0)
 71 |     dist_avg_by_hour.append(np.mean(temp))
 72 |     dist_median_by_hour.append(np.median(temp))
 73 |     j = j + i
 74 | #The list 'dist_avg_by_hour' represents average trip distance as a function of each hour of day
 75 | 
 76 | #Plotting average trip distance each hour for better visualization
 77 | plt.figure(fig)
 78 | fig = fig + 1
 79 | plt.stem(dist_avg_by_hour)
 80 | plt.xlabel('Time of the day')
 81 | plt.ylabel('Average trip distance in the hour')
 82 | plt.margins(0.1, 0.1)
 83 | plt.title('Average trip distance by hour')
 84 | plt.savefig('2.Average trip distance by hour.png', dpi=900, bbox_inches='tight')
 85 | 
 86 | #Plotting median trip distance each hour for better visualization
 87 | plt.figure(fig)
 88 | fig = fig + 1
 89 | plt.stem(dist_median_by_hour)
 90 | plt.xlabel('Time of the day')
 91 | plt.ylabel('Median trip distance in the hour')
 92 | plt.margins(0.1, 0.1)
 93 | plt.title('Median trip distance by hour')
 94 | plt.savefig('3.Median trip distance by hour.png', dpi=900, bbox_inches='tight')
 95 | 
 96 | #########################################################################################################
 97 | 
 98 | '''
 99 | 4. Build a derived variable for tip_out_of_total_fare, 
100 |    which is tip as a percentage of the total fare.
101 | '''
102 | 
103 | #Add a new column named 'tip_out_of_total_fare' to the DataFrame 'df'
104 | df['tip_out_of_total_fare'] = (df['Tip_amount'] / df['Total_amount']) * 100
105 | #The column 'tip_out_of_total_fare' of 'df' DataFrame has the tip as a percentage of the total fare.
106 | 
107 | '''
108 | 4. Build a predictive model for tip as a percentage 
109 |    of the total fare. Use as much of the data as you 
110 |    like (or all of it). We will validate a sample.
111 | '''
112 | 
113 | #makiing a temporary duplicate
114 | temp = df
115 | 
116 | #Extract Predictions
117 | train_targets = np.nan_to_num(temp['tip_out_of_total_fare'])
118 | 
119 | #Delete unnecesary columns that lack the capability to predict
120 | #Tip amount is also excluded, and if its kept, prediction remains a mere division problem!
121 | columns_to_keep = ['RateCodeID', 'Pickup_longitude', 'Pickup_latitude', 
122 |                    'Dropoff_longitude', 'Dropoff_latitude', 'Passenger_count',
123 |                    'Trip_distance', 'Fare_amount', 'Extra', 'improvement_surcharge',
124 |                    'Total_amount', 'Payment_type', 'Trip_type ', 'MTA_tax']
125 | temp = temp[columns_to_keep]
126 | 
127 | #To make every sample, a unit vector. This is due to different ranges of feature values
128 | temp = normalize(np.nan_to_num(temp), norm='l2', axis=1)
129 | 
130 | #Replace all the nan values with '0' 
131 | train = np.nan_to_num(np.array(temp))
132 | 
133 | #fit a linear regression model
134 | model_LR = LinearRegression()
135 | model_LR.fit(train, train_targets)
136 | '''
137 | 'model_LR' is the required model
138 | Now, for validating, put the data in exact order as 'columns_to_keep',
139 | and then assign that to to_be_tested. Then un-comment the following line
140 | model_LR.predict(normalize(to_be_tested), norm='l2', axis=1)
141 | '''
142 | 
143 | #########################################################################################################
144 | 
145 | '''
146 | 5. Option A1: Build a derived variable representing the average speed over the course of a trip.
147 | '''
148 | 
149 | #Calculated the total time (in seconds)
150 | df['time_taken'] = (pd.to_datetime(df['Lpep_dropoff_datetime']) - pd.to_datetime(df['lpep_pickup_datetime'])).astype('timedelta64[s]')
151 | #convert seconds to hours
152 | df['time_taken'] =df['time_taken'] / 3600     
153 | #The column 'time_taken' of 'df' DataFrame has the average speed over the course of a trip.
154 | 
155 | #Average speed = Total distance / Total Time Taken
156 | #I have calculated speed in miles per hours
157 | df['Avg_speed'] = df['Trip_distance'] / df['time_taken']
158 | #The column 'Avg_speed' of 'df' DataFrame has the average speed over the course of a trip.
159 | 
160 | 
161 | '''
162 | 5. Option A2: Perform a test to determine if the average trip speeds are 
163 | materially the same in all weeks of September. If you decide they 
164 | are not the same, can you form a hypothesis regarding why they differ?
165 | '''
166 | 
167 | #I have considered the pick up datetime as the time to work upon here
168 | df['pickup_times'] = pd.to_datetime(df['lpep_pickup_datetime'])
169 | #extract the day counterpart
170 | df['day'] = pd.DatetimeIndex(df['pickup_times']).day
171 | day_dict = dict(collections.Counter(df['day']))
172 | #day_dict represents number of rides per day
173 | days_in_weeks = [5, 7, 7, 7, 4]   #number of days in each week in september 2015
174 | j = 0
175 | weektrips = []
176 | 
177 | #For calculating trips in each week
178 | for i in days_in_weeks:
179 |     weektrips.append(np.sum(day_dict.values()[j:(j+i)]))
180 |     j = j + i
181 | 
182 | j = 0
183 | avg_by_week = []
184 | #Due to division by 0, some have 'inf' value. Replacing that with '0'
185 | #Averaging Calculation
186 | for i in weektrips:
187 |     temp = df['Avg_speed'][j:(j+i)].replace(np.inf, 0)
188 |     avg_by_week.append(np.mean(temp))
189 |     j = j + i
190 | #The list 'avg_by_week' represents average trip speed as a function of week
191 | 
192 | labels = ['First', 'Second', 'Third', 'Fourth', 'Fifth']
193 | 
194 | plt.figure(fig)
195 | fig = fig + 1
196 | plt.stem(avg_by_week)
197 | plt.xlabel('Weeks')
198 | plt.ylabel('Average trip speeds in the week')
199 | plt.xticks([0, 1, 2, 3, 4], labels)
200 | plt.margins(0.1, 0.1)
201 | plt.ylim([13, 17])
202 | plt.title('Average trip speeds by week')
203 | plt.savefig('4.Average trip speeds by week.png', dpi=900, bbox_inches='tight')
204 | 
205 | 
206 | '''
207 | 5. Option A3: Build a hypothesis of average trip speed as a function of time of day
208 | '''
209 | 
210 | 
211 | #I have considered the pick up datetime as the time to work upon here, analysing on hourly basis
212 | df['pickup_times'] = pd.to_datetime(df['lpep_pickup_datetime'])
213 | #extract the hour counterpart
214 | df['hours'] = pd.DatetimeIndex(df['pickup_times']).hour
215 | 
216 | #Temporary dataframe used, not to tamper the original dataframe
217 | temp1 = df
218 | 
219 | #Sorting by the 'hours' column, to facilitate the calculation of average
220 | temp1 = temp1.sort_values(['hours'], ascending=True)
221 | temp1 = temp1.reset_index()
222 | hours_dict = dict(collections.Counter(temp1['hours']))
223 | 
224 | avg_by_hour = []
225 | j = 0
226 | 
227 | #Due to division by 0, some have 'inf' value. Replacing that with '0'
228 | #Averaging Calculation
229 | for i in hours_dict.values():
230 |     temp = temp1['Avg_speed'][j:(j+i)].replace(np.inf, 0)
231 |     avg_by_hour.append(np.mean(temp))
232 |     j = j + i
233 | #The list 'avg_by_hour' represents average trip speed as a function of each hour of day
234 | 
235 | #Plotting for better visualization
236 | plt.figure(fig)
237 | fig = fig + 1
238 | plt.stem(avg_by_hour)
239 | plt.xlabel('Time of the day')
240 | plt.ylabel('Average trip speeds in the hour')
241 | plt.margins(0.1, 0.1)
242 | plt.title('Average trip speeds by hour')
243 | plt.savefig('5.Average trip speeds by hour.png', dpi=900, bbox_inches='tight')
244 | 
245 | #########################################################################################################
246 | 


--------------------------------------------------------------------------------