├── README.md
├── LICENSE
├── read-data.py
└── write-data.py

/README.md:
--------------------------------------------------------------------------------
# Transport-Public-databricks

Databricks notebooks for a small public-transport data pipeline on Azure Blob
Storage: `write-data.py` generates a synthetic month (January 2023) of trip
records and saves them as CSV to the raw zone, and `read-data.py` loads those
CSVs back and enriches them with date parts and trip durations.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 yassine essadi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/read-data.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC # Imports

# COMMAND ----------

from pyspark.sql.functions import col, unix_timestamp, when, year, month, dayofmonth, dayofweek
from pyspark.sql.types import IntegerType

# COMMAND ----------

# Connection configuration.
# The account key is hardcoded for simplicity; in a real workspace it belongs in a
# secret scope (dbutils.secrets.get) rather than in source control.
spark.conf.set(
    "fs.azure.account.key.yassineessadidatalakeg2.blob.core.windows.net",
    "gWYEfszXt9mbYAwRbZP0hE3Bo1rZUFJoFw71LWPsENPoEPb5CzWeN28ukbQV6/o3vm6mlyg31lim+ASt3uGX5A==")

# Read every raw CSV file from the blob container.
spark_df = (spark.read.format("csv")
            .option("header", True)
            .load("wasbs://data@yassineessadidatalakeg2.blob.core.windows.net/public_transport_data/raw/*.csv"))

display(spark_df)

# COMMAND ----------

# Add Year, Month, DayOfMonth and DayOfWeek columns derived from the Date column.
spark_df = spark_df.withColumn("Date", col("Date").cast("date"))
spark_df = spark_df.withColumn("Year", year(col("Date")).cast(IntegerType()))
spark_df = spark_df.withColumn("Month", month(col("Date")).cast(IntegerType()))
spark_df = spark_df.withColumn("DayOfMonth", dayofmonth(col("Date")).cast(IntegerType()))
spark_df = spark_df.withColumn("DayOfWeek", dayofweek(col("Date")).cast(IntegerType()))

display(spark_df)

# COMMAND ----------

# DepartureTime and ArrivalTime are "HH:mm" strings, so casting them directly to
# timestamp yields null; parse them with unix_timestamp and an explicit format.
duration_seconds = (unix_timestamp(col("ArrivalTime"), "HH:mm")
                    - unix_timestamp(col("DepartureTime"), "HH:mm"))
# Trips that wrap past midnight come out negative; shift them forward by one day.
duration_seconds = when(duration_seconds < 0, duration_seconds + 24 * 3600).otherwise(duration_seconds)
spark_df = spark_df.withColumn("Duration (M)", (duration_seconds / 60).cast("int"))
spark_df = spark_df.withColumn("Duration (H)", (duration_seconds / 3600).cast("int"))

display(spark_df)
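# COMMAND ----------

# Illustrative extra cell, not part of the original notebook: persist the enriched
# DataFrame back to blob storage. A minimal sketch assuming a "processed" zone next
# to the existing "raw" folder; that path is a hypothetical convention, not one the
# repo defines.
(spark_df.coalesce(1).write.format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("wasbs://data@yassineessadidatalakeg2.blob.core.windows.net/public_transport_data/processed"))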
col("DepartureTime").cast("timestamp")) / 3600).cast("int")) 47 | 48 | 49 | display(spark_df) 50 | -------------------------------------------------------------------------------- /write-data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import pandas as ps 3 | import random 4 | from datetime import datetime, timedelta 5 | 6 | # Generate data for January 2023 7 | start_date = datetime(2023, 1, 1) 8 | end_date = datetime(2023, 1, 30) 9 | date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days)] 10 | 11 | transport_types = ["Bus", "Train", "Tram", "Metro"] 12 | routes = ["Route_" + str(i) for i in range(1, 11)] 13 | stations = ["Station_" + str(i) for i in range(1, 21)] 14 | 15 | # Randomly select 5 days as extreme weather days 16 | extreme_weather_days = random.sample(date_generated, 5) 17 | 18 | data = [] 19 | 20 | for date in date_generated: 21 | for _ in range(32): # 32 records per day to get a total of 992 records for January 22 | transport = random.choice(transport_types) 23 | route = random.choice(routes) 24 | 25 | # Normal operating hours 26 | departure_hour = random.randint(5, 22) 27 | departure_minute = random.randint(0, 59) 28 | 29 | # Introducing Unusual Operating Hours for buses 30 | if transport == "Bus" and random.random() < 0.05: # 5% chance 31 | departure_hour = 3 32 | 33 | departure_time = f"{departure_hour:02}:{departure_minute:02}" 34 | 35 | # Normal duration 36 | duration = random.randint(10, 120) 37 | 38 | # Introducing Short Turnarounds 39 | if random.random() < 0.05: # 5% chance 40 | duration = random.randint(1, 5) 41 | 42 | # General delay 43 | delay = random.randint(0, 15) 44 | 45 | # Weather Impact 46 | if date in extreme_weather_days: 47 | # Increase delay by 10 to 60 minutes 48 | delay += random.randint(10, 60) 49 | 50 | # 10% chance to change the route 51 | if random.random() < 0.10: 52 | route = random.choice(routes) 53 | 54 | total_minutes = departure_minute + duration + delay 55 | arrival_hour = departure_hour + total_minutes // 60 56 | arrival_minute = total_minutes % 60 57 | arrival_time = f"{arrival_hour:02}:{arrival_minute:02}" 58 | 59 | passengers = random.randint(1, 100) 60 | departure_station = random.choice(stations) 61 | arrival_station = random.choice(stations) 62 | 63 | data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay]) 64 | 65 | df = ps.DataFrame(data, columns=["Date", "TransportType", "Route", "DepartureTime", "ArrivalTime", "Passengers", "DepartureStation", "ArrivalStation", "Delay"]) 66 | 67 | spark_df = spark.createDataFrame(df) 68 | 69 | session = spark.builder.getOrCreate() 70 | session.conf.set( 71 | "fs.azure.account.key.yassineessadidatalakeg2.blob.core.windows.net", "gWYEfszXt9mbYAwRbZP0hE3Bo1rZUFJoFw71LWPsENPoEPb5CzWeN28ukbQV6/o3vm6mlyg31lim+ASt3uGX5A==") 72 | spark_df.toPandas() 73 | 74 | spark_df.coalesce(1).write.format("csv").option('header', True).mode("overwrite").save("wasbs://data@yassineessadidatalakeg2.blob.core.windows.net/public_transport_data/raw") 75 | --------------------------------------------------------------------------------