├── README.md
├── LICENSE
├── read-data.py
└── write-data.py


/README.md:
--------------------------------------------------------------------------------
1 | # Transport-Public-databricks


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 yassine essadi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/read-data.py:
--------------------------------------------------------------------------------
 1 | # Databricks notebook source
 2 | # MAGIC %md 
 3 | # MAGIC #imports
 4 | # MAGIC
 5 | 
 6 | # COMMAND ----------
 7 | 
 8 | from pyspark.sql.functions import col, unix_timestamp
 9 | from pyspark.sql.functions import year, month,dayofmonth,dayofweek
10 | from pyspark.sql.types import IntegerType
11 | from datetime import datetime
12 | from pyspark.sql.functions import col
13 | 
14 | # COMMAND ----------
15 | 
16 | #Connection configuration
17 | spark.conf.set(
18 | "fs.azure.account.key.yassineessadidatalakeg2.blob.core.windows.net", "gWYEfszXt9mbYAwRbZP0hE3Bo1rZUFJoFw71LWPsENPoEPb5CzWeN28ukbQV6/o3vm6mlyg31lim+ASt3uGX5A==")
19 | 
20 | spark_df = spark.read.format('csv').option('header', True).load("wasbs://data@yassineessadidatalakeg2.blob.core.windows.net/public_transport_data/raw/*.csv")
21 | 
22 | display(spark_df)
23 | 
24 | # COMMAND ----------
25 | 
26 | 
27 | 
28 | #Add columns year, month, day, and day of the week
29 | #spark_df  = spark.createDataFrame(spark_df, ["Date"])
30 | spark_df = spark_df.withColumn("Date", col("Date").cast("Date"))
31 | spark_df = spark_df.withColumn("Year", year(spark_df["Date"]).cast(IntegerType()))
32 | spark_df = spark_df.withColumn("Month", month(spark_df["Date"]).cast(IntegerType()))
33 | spark_df = spark_df.withColumn("DayOfMonth", dayofmonth(spark_df["Date"]).cast(IntegerType()))
34 | spark_df = spark_df.withColumn("DayOfWeek", dayofweek(spark_df["Date"]).cast(IntegerType()))
35 | 
36 | display(spark_df)
37 | 
38 | 
39 | 
40 | # COMMAND ----------
41 | 
42 | 
43 | #spark_df.select('DepartureTime','ArrivalTime').show()
44 | 
45 | spark_df = spark_df.withColumn("Duration (M)", ((col("ArrivalTime").cast("timestamp") - col("DepartureTime").cast("timestamp")) / 60).cast("int"))
46 | spark_df = spark_df.withColumn("Duration (H)", ((col("ArrivalTime").cast("timestamp") - col("DepartureTime").cast("timestamp")) / 3600).cast("int"))
47 | 
48 | 
49 | display(spark_df)
50 | 


--------------------------------------------------------------------------------
/write-data.py:
--------------------------------------------------------------------------------
 1 | # Databricks notebook source
 2 | import pandas as ps
 3 | import random
 4 | from datetime import datetime, timedelta
 5 | 
 6 | # Generate data for January 2023
 7 | start_date = datetime(2023, 1, 1)
 8 | end_date = datetime(2023, 1, 30)
 9 | date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days)]
10 | 
11 | transport_types = ["Bus", "Train", "Tram", "Metro"]
12 | routes = ["Route_" + str(i) for i in range(1, 11)]
13 | stations = ["Station_" + str(i) for i in range(1, 21)]
14 | 
15 | # Randomly select 5 days as extreme weather days
16 | extreme_weather_days = random.sample(date_generated, 5)
17 | 
18 | data = []
19 | 
20 | for date in date_generated:
21 |     for _ in range(32):  # 32 records per day to get a total of 992 records for January
22 |         transport = random.choice(transport_types)
23 |         route = random.choice(routes)
24 | 
25 |         # Normal operating hours
26 |         departure_hour = random.randint(5, 22)
27 |         departure_minute = random.randint(0, 59)
28 | 
29 |         # Introducing Unusual Operating Hours for buses
30 |         if transport == "Bus" and random.random() < 0.05:  # 5% chance
31 |             departure_hour = 3
32 | 
33 |         departure_time = f"{departure_hour:02}:{departure_minute:02}"
34 | 
35 |         # Normal duration
36 |         duration = random.randint(10, 120)
37 | 
38 |         # Introducing Short Turnarounds
39 |         if random.random() < 0.05:  # 5% chance
40 |             duration = random.randint(1, 5)
41 | 
42 |         # General delay
43 |         delay = random.randint(0, 15)
44 | 
45 |         # Weather Impact
46 |         if date in extreme_weather_days:
47 |             # Increase delay by 10 to 60 minutes
48 |             delay += random.randint(10, 60)
49 | 
50 |             # 10% chance to change the route
51 |             if random.random() < 0.10:
52 |                 route = random.choice(routes)
53 | 
54 |         total_minutes = departure_minute + duration + delay
55 |         arrival_hour = departure_hour + total_minutes // 60
56 |         arrival_minute = total_minutes % 60
57 |         arrival_time = f"{arrival_hour:02}:{arrival_minute:02}"
58 | 
59 |         passengers = random.randint(1, 100)
60 |         departure_station = random.choice(stations)
61 |         arrival_station = random.choice(stations)
62 | 
63 |         data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay])
64 | 
65 | df = ps.DataFrame(data, columns=["Date", "TransportType", "Route", "DepartureTime", "ArrivalTime", "Passengers", "DepartureStation", "ArrivalStation", "Delay"])
66 | 
67 | spark_df = spark.createDataFrame(df) 
68 | 
69 | session = spark.builder.getOrCreate()
70 | session.conf.set(
71 | "fs.azure.account.key.yassineessadidatalakeg2.blob.core.windows.net", "gWYEfszXt9mbYAwRbZP0hE3Bo1rZUFJoFw71LWPsENPoEPb5CzWeN28ukbQV6/o3vm6mlyg31lim+ASt3uGX5A==")
72 | spark_df.toPandas()
73 | 
74 | spark_df.coalesce(1).write.format("csv").option('header', True).mode("overwrite").save("wasbs://data@yassineessadidatalakeg2.blob.core.windows.net/public_transport_data/raw")
75 | 


--------------------------------------------------------------------------------