├── Question9-Solution.py
├── Question3-Solution.py
├── Question10-Solution.py
├── Question1-Solution.py
├── Question8-Solution.py
├── Question4-Solution.py
├── Question5-Solution.py
├── Question2-Solution.py
├── Question6-Solution.py
├── dataset.csv
└── README.md
/Question9-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import functions as F
2 | 
3 | # Calculate requests per unique driver for each hour
4 | requests_per_driver = (df.groupBy('Time (Local)').agg(
5 |     (F.sum('Requests') / F.sum('Unique Drivers')).alias('requests_per_driver'))
6 | )
7 | 
8 | # Show the hour with the highest ratio
9 | requests_per_driver.orderBy(F.desc('requests_per_driver')).show(1)
10 | 
--------------------------------------------------------------------------------
/Question3-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import col, sum
2 | 
3 | hourly_requests = df.groupBy(col("Time (Local)").alias("hour")).agg(sum("Requests").alias("total_requests")).orderBy("total_requests", ascending=False)
4 |
5 | most_requested_hour = hourly_requests.select("hour").first()[0]
6 | print("The hour with the most requests is:", most_requested_hour)
7 |
8 | #The hour with the most requests is: 23
9 |
--------------------------------------------------------------------------------
/Question10-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import functions as F
2 | 
3 | # Calculate average completed trips and unique drivers for each hour
4 | avg_trips_and_drivers = (df.groupBy('Time (Local)').agg(
5 |     F.mean('Completed Trips').alias('avg_completed_trips'),
6 |     F.mean('Unique Drivers').alias('avg_unique_drivers')
7 | ))
8 | 
9 | # Show the hour with the lowest average completed trips and unique drivers
10 | avg_trips_and_drivers.orderBy('avg_completed_trips', 'avg_unique_drivers').show(1)
11 | 
--------------------------------------------------------------------------------
/Question1-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import max
2 |
3 | # Read the data from CSV file
4 | uber = spark.read.csv("dataset.csv", header=True, inferSchema=True)
5 |
6 | # Group the data by date and sum the completed trips
7 | completed_trips_by_date = uber.groupBy("Date").sum("Completed Trips")
8 |
9 | # Find the date with the most completed trips
10 | date_with_most_completed_trips = completed_trips_by_date \
11 | .orderBy("sum(Completed Trips)", ascending=False) \
12 | .select("Date") \
13 | .first()["Date"]
14 |
15 | print(date_with_most_completed_trips)
16 |
17 | #Output: 22-Sep-12
18 |
--------------------------------------------------------------------------------
/Question8-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import col, sum, to_date
2 |
3 | # Group the data by 72-hour periods and calculate the ratio of zeroes to eyeballs for each period
4 | period_ratios = (df
5 |     .groupBy(((to_date(col("Date"), "d-MMM-yy").cast("timestamp").cast("long") / (72*3600)).cast("int")).alias("period"))
6 | .agg(sum("Zeroes").alias("zeroes"), sum("Eyeballs").alias("eyeballs"))
7 | .withColumn("ratio", col("zeroes") / col("eyeballs"))
8 | )
9 |
10 | # Find the period with the highest ratio
11 | highest_ratio_period = period_ratios.orderBy(col("ratio").desc()).limit(1)
12 |
13 | # Print the result
14 | highest_ratio_period.show()
15 |
--------------------------------------------------------------------------------
/Question4-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import col, dayofweek, sum, to_date
2 |
3 | weekend_zeros = df.filter(((dayofweek(to_date(col("Date"), "d-MMM-yy")) == 6) & (col("Time (Local)") >= 17)) | (dayofweek(to_date(col("Date"), "d-MMM-yy")) == 7) | ((dayofweek(to_date(col("Date"), "d-MMM-yy")) == 1) & (col("Time (Local)") < 3))).agg(sum("Zeroes").alias("weekend_zeros")).collect()[0]["weekend_zeros"]
4 |
5 | total_zeros = df.agg(sum("Zeroes").alias("total_zeros")).collect()[0]["total_zeros"]
6 |
7 | percent_weekend_zeros = weekend_zeros / total_zeros * 100
8 |
9 | print("The percentage of zeros that occurred on weekends is:", percent_weekend_zeros, "%")
10 |
11 | #The percentage of zeros that occurred on weekends is: 41.333414829040026 %
12 |
--------------------------------------------------------------------------------
/Question5-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import avg, col, sum
2 |
3 | weighted_avg = df.withColumn("completed_per_driver", df["Completed Trips"] / df["Unique Drivers"]) \
4 | .groupBy("Date", "Time (Local)") \
5 | .agg(avg("completed_per_driver").alias("avg_completed_per_driver"), sum("Completed Trips").alias("total_completed_trips")) \
6 | .withColumn("weighted_ratio", col("avg_completed_per_driver") * col("total_completed_trips")) \
7 | .agg(sum("weighted_ratio") / sum("total_completed_trips")).collect()[0][0]
8 |
9 | print("The weighted average ratio of completed trips per driver is:", weighted_avg)
10 |
11 | #Output: The weighted average ratio of completed trips per driver is: 1.2869201507713425
12 |
--------------------------------------------------------------------------------
/Question2-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import col, concat_ws, sum, to_timestamp, window
2 |
3 | # Read the data from CSV file
4 | uber = spark.read.csv("dataset.csv", header=True, inferSchema=True)
5 |
6 | # Group the data by 24-hour windows and sum the completed trips
7 | completed_trips_by_window = uber \
8 |     .groupBy(window(to_timestamp(concat_ws(" ", col("Date"), col("Time (Local)").cast("string")), "d-MMM-yy H"), "24 hours", "1 hour")) \
9 | .agg(sum("Completed Trips").alias("Total Completed Trips")) \
10 | .orderBy("Total Completed Trips", ascending=False)
11 |
12 | # Get the highest number of completed trips within a 24-hour period
13 | highest_completed_trips_in_24_hours = completed_trips_by_window \
14 | .select("Total Completed Trips") \
15 | .first()["Total Completed Trips"]
16 |
17 | print(highest_completed_trips_in_24_hours)
18 |
19 | #Output 2102
20 |
--------------------------------------------------------------------------------
/Question6-Solution.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import col, countDistinct, sum
2 | from pyspark.sql.window import Window
3 |
4 | # Calculate the number of unique requests for each hour of the day
5 | hourly_unique_requests = (df
6 |     .groupBy(col("Time (Local)").alias("hour"))
7 | .agg(countDistinct("Requests").alias("unique_requests"))
8 | )
9 |
10 | # Slide a window of 8 hours to find the busiest 8 consecutive hours
11 | window = Window.orderBy("hour").rowsBetween(0, 7)  # 8-hour block starting at each hour
12 | busiest_8_consecutive_hours = (hourly_unique_requests
13 | .select("*", sum("unique_requests").over(window).alias("consecutive_sum"))
14 | .orderBy(col("consecutive_sum").desc())
15 | .limit(1)
16 | )
17 |
18 | # Print the result
19 | busiest_8_consecutive_hours.show()
20 |
--------------------------------------------------------------------------------
/dataset.csv:
--------------------------------------------------------------------------------
1 | Date,Time (Local),Eyeballs,Zeroes,Completed Trips,Requests,Unique Drivers
2 | 10-Sep-12,7,5,0,2,2,9
3 | ,8,6,0,2,2,14
4 | ,9,8,3,0,0,14
5 | ,10,9,2,0,1,14
6 | ,11,11,1,4,4,11
7 | ,12,12,0,2,2,11
8 | ,13,9,1,0,0,9
9 | ,14,12,1,0,0,9
10 | ,15,11,2,1,2,7
11 | ,16,11,2,3,4,6
12 | ,17,12,2,3,4,4
13 | ,18,11,1,3,4,7
14 | ,19,13,2,2,3,7
15 | ,20,11,1,0,0,5
16 | ,21,11,0,1,1,3
17 | ,22,16,3,0,2,4
18 | ,23,21,5,3,3,4
19 | 11-Sep-12,0,9,3,1,1,3
20 | ,1,3,2,0,1,3
21 | ,2,1,1,0,0,1
22 | ,3,1,1,0,0,1
23 | ,4,1,1,0,0,1
24 | ,5,1,1,0,0,0
25 | ,6,7,3,2,3,3
26 | ,7,10,0,2,2,5
27 | ,8,11,2,0,0,6
28 | ,9,15,2,0,0,6
29 | ,10,12,1,1,1,8
30 | ,11,16,1,0,0,9
31 | ,12,5,1,0,0,8
32 | ,13,11,2,1,1,7
33 | ,14,19,1,1,2,5
34 | ,15,19,1,2,2,9
35 | ,16,19,3,1,1,7
36 | ,17,23,5,0,0,9
37 | ,18,21,1,7,8,11
38 | ,19,28,2,8,9,12
39 | ,20,22,2,7,7,10
40 | ,21,19,6,5,5,5
41 | ,22,26,10,2,6,2
42 | ,23,15,12,0,3,1
43 | 12-Sep-12,0,9,2,0,1,2
44 | ,1,6,0,1,1,2
45 | ,2,3,3,0,1,0
46 | ,3,2,2,0,0,0
47 | ,4,1,1,0,0,0
48 | ,5,1,1,0,1,1
49 | ,6,7,2,3,4,3
50 | ,7,10,5,2,5,4
51 | ,8,28,2,8,8,11
52 | ,9,25,1,5,6,12
53 | ,10,25,3,3,4,15
54 | ,11,29,2,7,8,16
55 | ,12,32,2,8,8,14
56 | ,13,23,0,9,9,14
57 | ,14,25,1,5,6,15
58 | ,15,26,4,7,11,16
59 | ,16,27,4,7,7,14
60 | ,17,19,1,5,7,13
61 | ,18,24,3,4,4,12
62 | ,19,20,2,5,6,11
63 | ,20,20,2,2,3,9
64 | ,21,29,4,4,4,7
65 | ,22,18,2,4,5,7
66 | ,23,14,6,2,5,3
67 | 13-Sep-12,0,11,11,0,2,0
68 | ,1,6,6,0,1,0
69 | ,2,5,5,0,3,0
70 | ,3,2,2,0,2,0
71 | ,4,1,1,0,0,0
72 | ,5,1,1,0,1,0
73 | ,6,4,1,2,2,2
74 | ,7,10,1,1,1,4
75 | ,8,9,1,1,1,8
76 | ,9,14,4,0,0,9
77 | ,10,15,3,2,2,10
78 | ,11,14,5,1,2,9
79 | ,12,18,3,4,5,10
80 | ,13,18,7,3,4,8
81 | ,14,12,3,0,1,10
82 | ,15,17,4,3,3,12
83 | ,16,23,1,2,3,14
84 | ,17,21,3,1,3,14
85 | ,18,21,3,3,3,11
86 | ,19,19,3,7,8,10
87 | ,20,13,0,3,3,9
88 | ,21,25,4,6,7,9
89 | ,22,25,2,2,3,7
90 | ,23,21,6,4,7,3
91 | 14-Sep-12,0,10,1,3,4,3
92 | ,1,10,9,2,5,2
93 | ,2,5,3,1,3,1
94 | ,3,3,3,0,1,0
95 | ,4,1,1,0,1,0
96 | ,5,5,4,1,2,1
97 | ,6,7,2,2,3,4
98 | ,7,9,1,2,2,5
99 | ,8,10,1,1,1,7
100 | ,9,22,2,4,4,7
101 | ,10,11,5,1,2,10
102 | ,11,22,2,3,4,12
103 | ,12,18,2,3,4,11
104 | ,13,20,2,1,1,10
105 | ,14,23,3,1,2,7
106 | ,15,20,5,2,3,10
107 | ,16,18,3,3,4,11
108 | ,17,34,4,3,5,13
109 | ,18,40,2,8,9,14
110 | ,19,46,6,9,10,15
111 | ,20,38,4,8,9,14
112 | ,21,49,6,8,9,17
113 | ,22,60,3,18,20,19
114 | ,23,68,18,24,29,18
115 | 15-Sep-12,0,45,2,23,24,19
116 | ,1,37,1,12,14,18
117 | ,2,38,17,22,27,12
118 | ,3,17,2,5,7,8
119 | ,4,5,1,0,0,1
120 | ,5,1,1,0,1,0
121 | ,6,3,3,0,0,0
122 | ,7,9,7,1,1,1
123 | ,8,13,10,2,3,2
124 | ,9,14,9,1,3,2
125 | ,10,20,5,2,3,2
126 | ,11,23,5,2,2,3
127 | ,12,28,11,2,2,3
128 | ,13,34,6,6,7,4
129 | ,14,34,11,6,10,6
130 | ,15,31,15,3,8,5
131 | ,16,20,3,4,5,8
132 | ,17,42,9,8,8,10
133 | ,18,59,25,15,25,10
134 | ,19,73,38,17,34,14
135 | ,20,53,15,13,20,17
136 | ,21,59,4,15,20,19
137 | ,22,69,26,24,34,16
138 | ,23,61,32,16,24,13
139 | 16-Sep-12,0,44,2,17,20,15
140 | ,1,38,3,17,17,15
141 | ,2,29,2,13,15,12
142 | ,3,14,2,4,6,7
143 | ,4,6,6,0,3,2
144 | ,5,5,5,0,1,1
145 | ,6,3,3,0,2,0
146 | ,7,3,0,0,0,1
147 | ,8,8,2,1,2,3
148 | ,9,11,3,3,3,3
149 | ,10,20,2,1,1,4
150 | ,11,19,5,2,2,4
151 | ,12,18,3,3,4,7
152 | ,13,21,4,0,0,5
153 | ,14,23,13,4,7,5
154 | ,15,14,3,3,3,7
155 | ,16,25,5,3,3,10
156 | ,17,17,2,2,3,10
157 | ,18,18,1,3,3,10
158 | ,19,16,2,1,2,8
159 | ,20,25,10,7,10,7
160 | ,21,25,11,5,5,6
161 | ,22,13,2,2,3,4
162 | ,23,11,2,2,3,4
163 | 17-Sep-12,0,11,5,0,2,2
164 | ,1,6,6,0,2,0
165 | ,2,3,3,0,1,0
166 | ,3,1,1,0,0,0
167 | ,4,2,1,0,1,0
168 | ,5,4,1,1,1,4
169 | ,6,7,0,4,4,5
170 | ,7,11,4,0,0,8
171 | ,8,6,1,1,1,10
172 | ,9,16,0,4,4,10
173 | ,10,21,4,1,3,10
174 | ,11,11,4,0,0,9
175 | ,12,13,1,2,2,9
176 | ,13,19,3,1,2,8
177 | ,14,19,3,0,1,8
178 | ,15,20,3,1,1,8
179 | ,16,23,7,3,5,7
180 | ,17,23,3,8,10,9
181 | ,18,35,7,8,9,12
182 | ,19,31,6,8,9,13
183 | ,20,21,2,4,4,11
184 | ,21,27,3,2,3,8
185 | ,22,18,1,6,6,8
186 | ,23,29,7,3,7,5
187 | 18-Sep-12,0,28,18,3,13,4
188 | ,1,4,2,2,2,1
189 | ,2,9,9,0,7,0
190 | ,3,4,4,0,3,0
191 | ,4,1,1,0,0,0
192 | ,5,2,0,0,0,0
193 | ,6,2,0,0,0,3
194 | ,7,9,1,3,3,5
195 | ,8,11,1,2,2,9
196 | ,9,15,1,0,1,11
197 | ,10,16,2,0,1,9
198 | ,11,12,2,2,4,9
199 | ,12,18,2,3,3,10
200 | ,13,17,3,0,0,11
201 | ,14,20,2,1,4,7
202 | ,15,17,4,1,3,7
203 | ,16,20,5,3,3,6
204 | ,17,18,4,3,4,7
205 | ,18,17,4,1,1,9
206 | ,19,26,4,6,8,8
207 | ,20,21,3,2,5,5
208 | ,21,24,6,4,5,5
209 | ,22,17,5,5,6,4
210 | ,23,14,11,1,3,4
211 | 19-Sep-12,0,9,1,0,0,3
212 | ,1,3,0,0,0,3
213 | ,2,3,0,2,2,1
214 | ,3,1,1,0,1,0
215 | ,4,1,1,0,0,0
216 | ,5,2,0,0,0,1
217 | ,6,7,1,2,3,5
218 | ,7,8,0,2,2,7
219 | ,8,11,1,2,2,6
220 | ,9,14,3,0,0,10
221 | ,10,12,3,0,2,12
222 | ,11,16,3,4,5,12
223 | ,12,19,3,1,1,11
224 | ,13,14,4,3,4,7
225 | ,14,9,2,1,1,8
226 | ,15,17,2,0,0,9
227 | ,16,22,5,2,4,10
228 | ,17,22,1,2,3,16
229 | ,18,28,5,3,3,13
230 | ,19,18,2,2,2,13
231 | ,20,19,1,2,2,12
232 | ,21,25,2,2,5,10
233 | ,22,23,3,9,10,9
234 | ,23,16,2,2,2,8
235 | 20-Sep-12,0,10,4,2,3,4
236 | ,1,1,0,0,1,1
237 | ,2,2,2,0,1,0
238 | ,3,2,2,0,2,0
239 | ,4,2,2,0,0,0
240 | ,5,1,1,0,0,0
241 | ,6,3,1,1,1,2
242 | ,7,5,2,2,2,4
243 | ,8,7,1,1,2,6
244 | ,9,11,2,1,2,7
245 | ,10,16,2,2,2,9
246 | ,11,15,3,2,3,12
247 | ,12,16,2,3,3,12
248 | ,13,15,2,0,0,11
249 | ,14,19,4,1,2,11
250 | ,15,18,5,2,3,12
251 | ,16,21,2,4,5,9
252 | ,17,37,4,5,9,8
253 | ,18,26,5,3,4,11
254 | ,19,36,7,14,16,13
255 | ,20,26,3,4,6,11
256 | ,21,30,5,6,8,13
257 | ,22,40,7,12,15,13
258 | ,23,16,0,5,5,10
259 | 21-Sep-12,0,22,1,10,11,9
260 | ,1,9,1,5,5,7
261 | ,2,5,0,1,2,5
262 | ,3,3,3,0,0,0
263 | ,4,0,0,0,0,0
264 | ,5,1,1,0,0,0
265 | ,6,6,3,0,2,2
266 | ,7,7,3,0,0,4
267 | ,8,3,0,0,0,6
268 | ,9,8,0,1,1,7
269 | ,10,17,4,0,1,8
270 | ,11,23,5,1,1,10
271 | ,12,22,3,1,1,9
272 | ,13,34,4,5,7,9
273 | ,14,35,5,9,10,12
274 | ,15,30,3,4,7,12
275 | ,16,49,4,11,12,15
276 | ,17,68,25,26,26,20
277 | ,18,62,6,15,22,26
278 | ,19,47,3,16,19,30
279 | ,20,63,8,14,21,28
280 | ,21,59,8,12,17,27
281 | ,22,73,4,23,29,26
282 | ,23,94,30,36,46,26
283 | 22-Sep-12,0,62,4,19,27,27
284 | ,1,39,5,17,24,24
285 | ,2,30,0,9,18,12
286 | ,3,11,3,3,3,8
287 | ,4,6,4,2,3,1
288 | ,5,5,4,1,2,0
289 | ,6,5,5,0,1,0
290 | ,7,8,3,0,0,1
291 | ,8,15,5,3,5,3
292 | ,9,15,5,1,1,6
293 | ,10,30,7,3,3,9
294 | ,11,33,6,5,7,9
295 | ,12,34,5,12,14,9
296 | ,13,61,25,12,17,10
297 | ,14,62,10,16,21,15
298 | ,15,75,15,18,23,16
299 | ,16,58,17,15,18,16
300 | ,17,41,2,8,10,20
301 | ,18,49,7,11,13,16
302 | ,19,43,10,21,24,15
303 | ,20,49,4,7,14,18
304 | ,21,53,3,15,19,21
305 | ,22,77,11,25,33,19
306 | ,23,99,59,25,44,17
307 | 23-Sep-12,0,62,25,23,32,18
308 | ,1,38,1,15,23,18
309 | ,2,34,4,13,18,18
310 | ,3,19,1,9,9,16
311 | ,4,1,0,0,1,4
312 | ,5,9,9,0,4,0
313 | ,6,8,7,1,1,1
314 | ,7,7,3,2,2,2
315 | ,8,13,3,0,0,4
316 | ,9,14,4,0,1,6
317 | ,10,19,3,2,2,9
318 | ,11,20,3,3,5,8
319 | ,12,25,3,2,4,8
320 | ,13,15,4,3,3,9
321 | ,14,20,2,4,4,7
322 | ,15,22,4,0,2,9
323 | ,16,22,0,7,8,11
324 | ,17,31,9,4,6,12
325 | ,18,36,4,8,11,12
326 | ,19,32,9,4,6,11
327 | ,20,29,6,3,3,7
328 | ,21,24,6,3,4,5
329 | ,22,17,2,2,2,6
330 | ,23,9,3,3,3,3
331 | 24-Sep-12,0,7,2,1,2,2
332 | ,1,7,7,0,0,0
333 | ,2,3,3,0,2,0
334 | ,3,3,3,0,1,0
335 | ,4,1,1,0,0,0
336 | ,5,4,2,1,1,3
337 | ,6,9,1,2,2,7
338 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Uber-Data-Analysis-Project-in-Pyspark
2 |
3 |
4 | This data project can be used as a take-home assignment to learn PySpark and data engineering.
5 |
6 | ## Insights from City Supply and Demand Data
7 |
8 | ## Data Description
9 |
10 | To answer the questions, use the dataset from the file dataset.csv. For example, consider a row from this dataset:
11 |
12 |
13 | | Date | Time (Local) | Eyeballs | Zeroes | Completed Trips | Requests | Unique Drivers |
14 | |------|--------------|----------|--------|-----------------|----------|----------------|
15 | | 10-Sep-12 | 16 | 11 | 2 | 3 | 4 | 6 |
16 | 
17 |
18 | This means that during the hour beginning at 4 pm (hour 16) on September 10th, 2012, 11 people opened the Uber app (Eyeballs). 2 of them did not see any car (Zeroes) and 4 of them requested a car (Requests). Of the 4 requests, only 3 resulted in completed trips (Completed Trips). During this hour, a total of 6 drivers were logged in (Unique Drivers).
19 |
20 | ## Assignment
21 |
22 | Using the provided dataset, answer the following questions:
23 |
24 | - ⚡ Which date had the most completed trips during the two-week period?
25 | - ⚡ What was the highest number of completed trips within a 24-hour period?
26 | - ⚡ Which hour of the day had the most requests during the two-week period?
27 | - ⚡ What percentage of all zeroes during the two-week period occurred on the weekend (Friday at 5 pm to Sunday at 3 am)? Tip: The local time value is the start of the hour (e.g. 15 is the hour from 3:00 pm to 4:00 pm).
28 | - ⚡ What is the weighted average ratio of completed trips per driver during the two-week period? Tip: "Weighted average" means your answer should account for the total trip volume in each hour to determine the most accurate number for the whole period.
29 | - ⚡ In drafting a driver schedule in terms of 8-hour shifts, when are the busiest 8 consecutive hours over the two-week period in terms of unique requests? A new shift starts every 8 hours. Assume that a driver will work the same shift each day.
30 | - ⚡ True or False: Driver supply always increases when demand increases during the two-week period. Tip: Visualize the data to confirm your answer if needed.
31 | - ⚡ In which 72-hour period is the ratio of Zeroes to Eyeballs the highest?
32 | - ⚡ If you could add 5 drivers to any single hour of every day during the two-week period, which hour should you add them to? Hint: Consider both rider eyeballs and driver supply when choosing.
33 | - ⚡ Looking at the data from all two weeks, which time might make the most sense to consider a true "end day" instead of midnight? (i.e. when supply and demand are both at their natural minimums) Tip: Visualize the data to confirm your answer if needed.
34 |
35 | ## Solution:
36 | To solve the questions using PySpark, we need to first create a SparkSession and load the dataset into a DataFrame. Here's how we can do it:
37 | ```python
38 | from pyspark.sql import SparkSession
39 | from pyspark.sql.functions import *
40 |
41 | # Create a SparkSession
42 | spark = SparkSession.builder.appName("UberDataAnalysis").getOrCreate()
43 |
44 | # Load the dataset into a DataFrame
45 | df = spark.read.csv("dataset.csv", header=True, inferSchema=True)
46 | ```
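Note that in dataset.csv only the first row of each day carries a value in the `Date` column; the remaining hourly rows of that day leave it blank. Any answer that groups by `Date` therefore assumes the column has been forward-filled first. A minimal sketch of that fill (an assumption of this write-up, relying on the rows being read in file order, i.e. from a single input partition) could look like this:
```python
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Forward-fill the Date column: carry the last non-null date down to the
# blank rows that follow it. monotonically_increasing_id() follows file
# order only when the CSV is read as a single partition, so treat this as
# a sketch rather than a guaranteed-stable recipe.
order_window = (Window.orderBy(F.monotonically_increasing_id())
                      .rowsBetween(Window.unboundedPreceding, 0))
df = df.withColumn("Date", F.last("Date", ignorenulls=True).over(order_window))
```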
47 | Now that we have loaded the dataset into a DataFrame, we can start answering the questions.
48 | - Which date had the most completed trips during the two-week period?
49 | 
50 | To find the date with the most completed trips, you can group the data by date and sum the completed trips column. Then, sort the results in descending order and select the top row.
51 | ```python
52 | from pyspark.sql.functions import max
53 |
54 | # Read the data from CSV file
55 | uber = spark.read.csv("dataset.csv", header=True, inferSchema=True)
56 |
57 | # Group the data by date and sum the completed trips
58 | completed_trips_by_date = uber.groupBy("Date").sum("Completed Trips")
59 |
60 | # Find the date with the most completed trips
61 | date_with_most_completed_trips = completed_trips_by_date \
62 | .orderBy("sum(Completed Trips)", ascending=False) \
63 | .select("Date") \
64 | .first()["Date"]
65 |
66 | print(date_with_most_completed_trips)
67 |
68 | #Output: 22-Sep-12
69 | ```
70 | - What was the highest number of completed trips within a 24-hour period?
71 |
72 | To find the highest number of completed trips within a 24-hour period, you can build a timestamp for each hourly row and use a sliding window to sum the completed trips over every rolling 24-hour period. Then, you can sort the results in descending order and select the top row.
73 |
74 | ```python
75 | from pyspark.sql.functions import col, concat_ws, sum, to_timestamp, window
76 |
77 | # Read the data from CSV file
78 | uber = spark.read.csv("dataset.csv", header=True, inferSchema=True)
79 |
80 | # Group the data by 24-hour windows and sum the completed trips
81 | completed_trips_by_window = uber \
82 |     .groupBy(window(to_timestamp(concat_ws(" ", col("Date"), col("Time (Local)").cast("string")), "d-MMM-yy H"), "24 hours", "1 hour")) \
83 | .agg(sum("Completed Trips").alias("Total Completed Trips")) \
84 | .orderBy("Total Completed Trips", ascending=False)
85 |
86 | # Get the highest number of completed trips within a 24-hour period
87 | highest_completed_trips_in_24_hours = completed_trips_by_window \
88 | .select("Total Completed Trips") \
89 | .first()["Total Completed Trips"]
90 |
91 | print(highest_completed_trips_in_24_hours)
92 |
93 | #Output 2102
94 | ```
95 | - Which hour of the day had the most requests during the two-week period?
96 |
97 | To answer this question, we need to group the data by hour and sum the "Requests" column for each hour. We can then sort the result by the total requests and select the hour with the highest total.
98 |
99 | ```python
100 | from pyspark.sql.functions import col, sum
101 | 
102 | hourly_requests = df.groupBy(col("Time (Local)").alias("hour")).agg(sum("Requests").alias("total_requests")).orderBy("total_requests", ascending=False)
103 |
104 | most_requested_hour = hourly_requests.select("hour").first()[0]
105 | print("The hour with the most requests is:", most_requested_hour)
106 |
107 | #The hour with the most requests is: 23
108 | ```
109 |
110 | - What percentage of all zeroes during the two-week period occurred on the weekend (Friday at 5 pm to Sunday at 3 am)?
111 |
112 | To answer this question, we need to filter the data to the weekend window (Friday hours 17-23, all of Saturday, and Sunday hours 0-2, since each local time value marks the start of an hour), sum the Zeroes in that window, and divide by the total number of Zeroes over the whole period.
113 |
114 | ```python
115 | from pyspark.sql.functions import col, dayofweek, sum, to_date
116 |
117 | weekend_zeros = df.filter(((dayofweek(to_date(col("Date"), "d-MMM-yy")) == 6) & (col("Time (Local)") >= 17)) | (dayofweek(to_date(col("Date"), "d-MMM-yy")) == 7) | ((dayofweek(to_date(col("Date"), "d-MMM-yy")) == 1) & (col("Time (Local)") < 3))).agg(sum("Zeroes").alias("weekend_zeros")).collect()[0]["weekend_zeros"]
118 |
119 | total_zeros = df.agg(sum("Zeroes").alias("total_zeros")).collect()[0]["total_zeros"]
120 |
121 | percent_weekend_zeros = weekend_zeros / total_zeros * 100
122 |
123 | print("The percentage of zeros that occurred on weekends is:", percent_weekend_zeros, "%")
124 |
125 | #The percentage of zeros that occurred on weekends is: 41.333414829040026 %
126 | ```
127 |
128 | - What is the weighted average ratio of completed trips per driver during the two-week period?
129 |
130 | To answer this question, we need to calculate the ratio of completed trips to unique drivers for each hour, multiply the ratio by the total number of completed trips for that hour, and then sum the results. We can then divide this sum by the total number of completed trips for the entire period.
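In symbols, writing $t_h$ for the completed trips and $d_h$ for the unique drivers in hour $h$, the quantity computed below is

$$\text{weighted average} = \frac{\sum_h (t_h / d_h)\, t_h}{\sum_h t_h},$$

i.e. each hour's trips-per-driver ratio weighted by that hour's completed-trip volume.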
131 |
132 | ```python
133 | from pyspark.sql.functions import avg, col, sum
134 |
135 | weighted_avg = df.withColumn("completed_per_driver", df["Completed Trips"] / df["Unique Drivers"]) \
136 | .groupBy("Date", "Time (Local)") \
137 | .agg(avg("completed_per_driver").alias("avg_completed_per_driver"), sum("Completed Trips").alias("total_completed_trips")) \
138 | .withColumn("weighted_ratio", col("avg_completed_per_driver") * col("total_completed_trips")) \
139 | .agg(sum("weighted_ratio") / sum("total_completed_trips")).collect()[0][0]
140 |
141 | print("The weighted average ratio of completed trips per driver is:", weighted_avg)
142 |
143 | #Output: The weighted average ratio of completed trips per driver is: 1.2869201507713425
144 | ```
145 | - In drafting a driver schedule in terms of 8-hour shifts, when are the busiest 8 consecutive hours over the two-week period in terms of unique requests? A new shift starts every 8 hours. Assume that a driver will work the same shift each day.
146 |
147 | To solve this, we can first calculate the number of unique requests for each hour of the day, and then slide a window of 8 hours across the hours to find the 8 consecutive hours with the highest number of unique requests. Here's the code:
148 |
149 | ```python
150 | from pyspark.sql.functions import col, countDistinct, sum
151 | from pyspark.sql.window import Window
152 |
153 | # Calculate the number of unique requests for each hour of the day
154 | hourly_unique_requests = (df
155 |     .groupBy(col("Time (Local)").alias("hour"))
156 | .agg(countDistinct("Requests").alias("unique_requests"))
157 | )
158 |
159 | # Slide a window of 8 hours to find the busiest 8 consecutive hours
160 | window = Window.orderBy("hour").rowsBetween(0, 7)  # 8-hour block starting at each hour
161 | busiest_8_consecutive_hours = (hourly_unique_requests
162 | .select("*", sum("unique_requests").over(window).alias("consecutive_sum"))
163 | .orderBy(col("consecutive_sum").desc())
164 | .limit(1)
165 | )
166 |
167 | # Print the result
168 | busiest_8_consecutive_hours.show()
169 | ```
170 | This will output the starting hour of the busiest 8 consecutive hours, along with the total number of unique requests over that 8-hour block.
171 |
172 |
173 | - True or False: Driver supply always increases when demand increases during the two-week period.
174 |
175 | This statement is false. There are multiple reasons why driver supply might not always increase when demand increases. For example, some drivers might choose not to work during peak demand times, or there might be external factors that affect driver availability (such as traffic, weather conditions, or events in the city). To confirm this, we would need to analyze the data and identify instances where demand increased but driver supply did not.
176 |
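One rough way to run that check (a sketch, assuming the hourly rows are in chronological file order, and treating `Requests` as demand and `Unique Drivers` as supply) is to compare each hour with the previous one and count the hours where demand rose but supply did not:

```python
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Compare each hourly row with the previous one. monotonically_increasing_id()
# follows file order only for a single input partition, so this is a rough
# check rather than a definitive test.
w = Window.orderBy(F.monotonically_increasing_id())

changes = (df
    .withColumn("demand_up", F.col("Requests") > F.lag("Requests").over(w))
    .withColumn("supply_up", F.col("Unique Drivers") > F.lag("Unique Drivers").over(w))
)

# Any non-zero count is a counterexample to "supply always increases with demand".
print(changes.filter(F.col("demand_up") & ~F.col("supply_up")).count())
```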
177 |
178 | - In which 72-hour period is the ratio of Zeroes to Eyeballs the highest?
179 |
180 | To answer this question, we can group the data by 72-hour periods and calculate the ratio of zeroes to eyeballs for each period. We can then find the period with the highest ratio. Here's the code:
181 |
182 | ```python
183 | from pyspark.sql.functions import col, sum
184 |
185 | # Group the data by 72-hour periods and calculate the ratio of zeroes to eyeballs for each period
186 | period_ratios = (df
187 | .groupBy(((col("Date").cast("timestamp").cast("long") / (72*3600)).cast("int")).alias("period"))
188 | .agg(sum("Zeroes").alias("zeroes"), sum("Eyeballs").alias("eyeballs"))
189 | .withColumn("ratio", col("zeroes") / col("eyeballs"))
190 | )
191 |
192 | # Find the period with the highest ratio
193 | highest_ratio_period = period_ratios.orderBy(col("ratio").desc()).limit(1)
194 |
195 | # Print the result
196 | highest_ratio_period.show()
197 | ```
198 |
199 | This will output the 72-hour period with the highest ratio of zeroes to eyeballs (the `period` index counts consecutive 72-hour blocks since the Unix epoch).
200 |
201 | - If you could add 5 drivers to any single hour of every day during the two-week period, which hour should you add them to? Hint: Consider both rider eyeballs and driver supply when choosing.
202 |
203 |
204 | To determine which hour to add 5 drivers to, we want to look for an hour with a high number of rider eyeballs and a low number of unique drivers. One way to approach this is to calculate the ratio of requests to unique drivers for each hour and then choose the hour with the highest ratio. The idea is that adding more drivers to an hour with a high ratio should result in more completed trips.
205 | We can use the following PySpark code to calculate the ratio for each hour:
206 |
207 | ```python
208 | # Calculate requests per unique driver for each hour
209 | requests_per_driver = (df.groupBy('Time (Local)').agg(
210 |     (sum('Requests') / sum('Unique Drivers')).alias('requests_per_driver'))
211 | )
212 |
213 | # Show the hour with the highest ratio
214 | requests_per_driver.orderBy(desc('requests_per_driver')).show(1)
215 | ```
216 |
217 | This will output the hour with the highest requests per unique driver ratio, which is where we should add 5 drivers.
218 |
219 | - Looking at the data from all two weeks, which time might make the most sense to consider a true "end day" instead of midnight? (i.e. when supply and demand are both at their natural minimums)
220 |
221 | One way to approach this question is to calculate the average number of completed trips and unique drivers for each hour of the day over the entire two-week period. We can then look for the hour with the lowest number of completed trips and unique drivers to find the time when supply and demand are at their natural minimums.
222 | We can use the following PySpark code to calculate the average number of completed trips and unique drivers for each hour:
223 |
224 | ```python
225 | # Calculate average completed trips and unique drivers for each hour
226 | avg_trips_and_drivers = (df.groupBy('Time (Local)').agg(
227 |     mean('Completed Trips').alias('avg_completed_trips'),
228 |     mean('Unique Drivers').alias('avg_unique_drivers')
229 | ))
230 |
231 | # Show the hour with the lowest average completed trips and unique drivers
232 | avg_trips_and_drivers.orderBy('avg_completed_trips', 'avg_unique_drivers').show(1)
233 | ```
234 |
235 | This will output the hour with the lowest average number of completed trips and unique drivers, which is when supply and demand are at their natural minimums and might make the most sense to consider as the "end day".
236 |
237 |
238 | ### Connect with me:
239 |
240 | [Stack Overflow](https://stackoverflow.com/users/9249743/ayush-dixit) · [LinkedIn](www.linkedin.com/in/ayush-dixit-856067104) · [LeetCode](https://leetcode.com/ayushdixit661/) · [Email](ayushdixit661@gmail.com) · [GitHub](https://github.com/ayushdixit487/)
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
--------------------------------------------------------------------------------