├── File generator.py
├── Integration.py
├── README.md
├── conservation.py
└── manifest.mf

/File generator.py:
--------------------------------------------------------------------------------
# Databricks notebook source
import pandas as pd
import random
from datetime import datetime, timedelta

# Connection configuration
spark.conf.set(
    "fs.azure.account.key.hnadirstg.blob.core.windows.net", "iyMubLxTlSB0r/ZL+bQr2H9LLDVc20DunWUDOVQekQWG8W1xeb5pAwTpaeozRni2AM0Ak8po1V/s+AStnE1kcQ=="
)

# Generate data for January 2023
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 1, 31)
date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

transport_types = ["Bus", "Train", "Tram", "Metro"]
routes = ["Route_" + str(i) for i in range(1, 11)]
stations = ["Station_" + str(i) for i in range(1, 21)]

# Randomly select 5 days as extreme weather days
extreme_weather_days = random.sample(date_generated, 5)

data = []

for date in date_generated:
    for _ in range(32):  # 32 records per day, for a total of 992 records in January
        transport = random.choice(transport_types)
        route = random.choice(routes)

        # Normal operating hours
        departure_hour = random.randint(5, 22)
        departure_minute = random.randint(0, 59)

        # Introduce unusual operating hours for buses
        if transport == "Bus" and random.random() < 0.05:  # 5% chance
            departure_hour = 3

        departure_time = f"{departure_hour:02}:{departure_minute:02}"

        # Normal duration
        duration = random.randint(10, 120)

        # Introduce short turnarounds
        if random.random() < 0.05:  # 5% chance
            duration = random.randint(1, 5)

        # General delay
        delay = random.randint(0, 15)

        # Weather impact
        if date in extreme_weather_days:
            # Increase the delay by 10 to 60 minutes
            delay += random.randint(10, 60)

            # 10% chance that the route is changed
            if random.random() < 0.10:
                route = random.choice(routes)

        total_minutes = departure_minute + duration + delay
        arrival_hour = (departure_hour + total_minutes // 60) % 24  # wrap past midnight so the arrival time stays a valid HH:MM
        arrival_minute = total_minutes % 60
        arrival_time = f"{arrival_hour:02}:{arrival_minute:02}"

        passengers = random.randint(1, 100)
        departure_station = random.choice(stations)
        arrival_station = random.choice(stations)

        data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay])

df = pd.DataFrame(data, columns=["Date", "TransportType", "Route", "DepartureTime", "ArrivalTime", "Passengers", "DepartureStation", "ArrivalStation", "Delay"])

# Create a Spark DataFrame from the pandas DataFrame
spark_df = spark.createDataFrame(df)

# Save the generated file in the raw folder
spark_df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("wasbs://public-transport-data@hnadirstg.blob.core.windows.net/raw/01-2023")
--------------------------------------------------------------------------------
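A quick way to sanity-check the generated month is to read the saved CSV back and confirm the row count and the transport mix. This is a minimal sketch, not part of the repo; it assumes the same container and account key configured in File generator.py above are already set on the cluster.

# Minimal sketch: read the generated January file back and verify it
check_df = spark.read.format('csv').option('header', True).load(
    "wasbs://public-transport-data@hnadirstg.blob.core.windows.net/raw/01-2023/*.csv"
)
print(check_df.count())                            # expected 992 rows (31 days x 32 records)
check_df.groupBy("TransportType").count().show()   # rough distribution across Bus/Train/Tram/Metro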
/Integration.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL process with Azure Databricks

# COMMAND ----------

# ETL process with Azure Databricks
from pyspark.sql.functions import year, month, dayofmonth, dayofweek, to_date, unix_timestamp, from_unixtime, split, when, hour
from pyspark.sql.types import IntegerType, StringType
import datetime


# Connection configuration
spark.conf.set(
    "fs.azure.account.key.hnadirstg.blob.core.windows.net", "iyMubLxTlSB0r/ZL+bQr2H9LLDVc20DunWUDOVQekQWG8W1xeb5pAwTpaeozRni2AM0Ak8po1V/s+AStnE1kcQ=="
)


# Transform the raw CSV files found under the given folder name
def transformationData(filePath):
    # ======> Date transformations <======= #
    spark_df = spark.read.format('csv').option('header', True).load("wasbs://public-transport-data@hnadirstg.blob.core.windows.net/raw/" + filePath + "/*.csv")

    # Convert the Date column to date type
    spark_df = spark_df.withColumn('Date', to_date(spark_df['Date']))

    # Add year, month, day-of-month, and day-of-week columns
    spark_df = spark_df.withColumn("Year", year(spark_df["Date"]).cast(IntegerType()))
    spark_df = spark_df.withColumn("Month", month(spark_df["Date"]).cast(IntegerType()))
    spark_df = spark_df.withColumn("DayOfMonth", dayofmonth(spark_df["Date"]).cast(IntegerType()))
    spark_df = spark_df.withColumn("DayOfWeek", dayofweek(spark_df["Date"]).cast(IntegerType()))

    # Cast the Delay column to integer
    spark_df = spark_df.withColumn("Delay", spark_df.Delay.cast(IntegerType()))

    # Cast the Passengers column to integer
    spark_df = spark_df.withColumn("Passengers", spark_df.Passengers.cast(IntegerType()))

    # ======> Time calculations <======= #
    def getMinuts(c):
        # Convert an "HH:MM" string column to minutes since midnight
        hours = split(c, ':')[0].cast(IntegerType())
        minutes = split(c, ':')[1].cast(IntegerType())
        return hours * 60 + minutes

    duration = getMinuts(spark_df['ArrivalTime']) - getMinuts(spark_df['DepartureTime'])
    # Trips that cross midnight would otherwise come out negative
    spark_df = spark_df.withColumn("Duration", when(duration < 0, duration + 24 * 60).otherwise(duration))

    # ======> Delay analysis <======= #
    spark_df = spark_df.withColumn("Retard", when(spark_df["Delay"] <= 0, 'Pas de Retard').when(spark_df["Delay"] <= 10, "Retard Court").when(spark_df["Delay"] <= 20, "Retard Moyen").otherwise('Long Retard'))

    spark_df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("wasbs://public-transport-data@hnadirstg.blob.core.windows.net/processed/transformed/" + filePath)

    # Passenger analysis
    analyseDesPassagers(spark_df, filePath)

    # Route analysis
    analyseDesItineraires(spark_df, filePath)

    return spark_df


# ======> Passenger analysis <======= #
def analyseDesPassagers(sparkDF, filePath):
    # Average passengers per departure hour (DepartureTime is an "HH:MM" string, so extract the hour by splitting)
    df = sparkDF.groupby(split(sparkDF['DepartureTime'], ':')[0].cast(IntegerType()).alias('Hour')).agg({"Passengers": "avg"})
    df = df.withColumn('Pointe', when(df["avg(Passengers)"] >= 50, 'heures de pointe').when(df["avg(Passengers)"] < 50, 'hors pointe'))
    # Rename columns
    df = df.withColumnRenamed('avg(Passengers)', 'AVG Passengers')

    # Save the files under analyse_passagers
    df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("wasbs://public-transport-data@hnadirstg.blob.core.windows.net/processed/analyse_passagers/" + filePath)

    return df


# ======> Route analysis <======= #
def analyseDesItineraires(sparkDF, filePath):
    df = sparkDF.groupby('Route').agg({'Delay': 'avg', 'Passengers': 'avg', 'Route': 'count'})
    # Rename columns
    df = df.withColumnRenamed('avg(Passengers)', 'AVG Passengers')
    df = df.withColumnRenamed('avg(Delay)', 'AVG Delay')

    # Save the files under analyse_itineraires
    df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("wasbs://public-transport-data@hnadirstg.blob.core.windows.net/processed/analyse_itineraires/" + filePath)
    return df


# COMMAND ----------

# Connection configuration
spark.conf.set(
    "fs.azure.account.key.hnadirstg.dfs.core.windows.net", "iyMubLxTlSB0r/ZL+bQr2H9LLDVc20DunWUDOVQekQWG8W1xeb5pAwTpaeozRni2AM0Ak8po1V/s+AStnE1kcQ=="
)

raw = "abfss://public-transport-data@hnadirstg.dfs.core.windows.net/raw/"
processed = "abfss://public-transport-data@hnadirstg.dfs.core.windows.net/processed/transformed"

processed_files = dbutils.fs.ls(processed)
raw_files = dbutils.fs.ls(raw)

processed_files_csv = [f.name for f in processed_files]

processed_items_count = 0

# Transform at most two new raw folders per run, skipping folders that were already processed
for f_raw in raw_files:
    if processed_items_count == 2:
        break
    if f_raw.name not in processed_files_csv:
        processed_items_count += 1
        # Transform and save the CSV file
        transformationData(f_raw.name)
--------------------------------------------------------------------------------
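To make the time arithmetic and the delay buckets in transformationData concrete, here is a minimal, self-contained sketch. It is not part of the repo: the two rows and the to_minutes helper name are invented, and it only assumes an active Spark session.

# Minimal sketch: the same duration and delay-bucket logic as transformationData, on made-up rows
from pyspark.sql.functions import when, split
from pyspark.sql.types import IntegerType

sample = spark.createDataFrame(
    [("08:15", "09:05", 0), ("23:40", "00:20", 35)],
    ["DepartureTime", "ArrivalTime", "Delay"],
)

def to_minutes(c):
    # "HH:MM" string -> minutes since midnight, as in getMinuts above
    return split(c, ':')[0].cast(IntegerType()) * 60 + split(c, ':')[1].cast(IntegerType())

dur = to_minutes(sample["ArrivalTime"]) - to_minutes(sample["DepartureTime"])
sample = sample.withColumn("Duration", when(dur < 0, dur + 24 * 60).otherwise(dur))
sample = sample.withColumn("Retard", when(sample["Delay"] <= 0, 'Pas de Retard')
                                     .when(sample["Delay"] <= 10, "Retard Court")
                                     .when(sample["Delay"] <= 20, "Retard Moyen")
                                     .otherwise('Long Retard'))
sample.show()   # first row: 50 min, "Pas de Retard"; second row: 40 min (crosses midnight), "Long Retard"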
/README.md:
--------------------------------------------------------------------------------
# public-transport-data-databricks
--------------------------------------------------------------------------------
/conservation.py:
--------------------------------------------------------------------------------
# Databricks notebook source
from datetime import datetime
from pyspark.sql.functions import dayofmonth

# Connection configuration
spark.conf.set(
    "fs.azure.account.key.hnadirstg.dfs.core.windows.net", "iyMubLxTlSB0r/ZL+bQr2H9LLDVc20DunWUDOVQekQWG8W1xeb5pAwTpaeozRni2AM0Ak8po1V/s+AStnE1kcQ=="
)

raw = "abfss://public-transport-data@hnadirstg.dfs.core.windows.net/raw/"
archived = "abfss://public-transport-data@hnadirstg.dfs.core.windows.net/archived/"

raw_files = dbutils.fs.ls(raw)
archived_files = dbutils.fs.ls(archived)

# Move raw folders older than 15 days to the archived folder
# (a standalone sketch of this retention-window check appears after the file listing)
for r in raw_files:
    modification_time_ms = r.modificationTime
    modification_time = datetime.fromtimestamp(modification_time_ms / 1000)  # modificationTime is in milliseconds
    datenow = datetime.now()
    duration = datenow - modification_time
    if duration.days > 15:
        # Archive the data
        dbutils.fs.cp(r.path, 'abfss://public-transport-data@hnadirstg.dfs.core.windows.net/archived/' + r.name, recurse=True)
        # Delete the data after archiving
        dbutils.fs.rm(r.path, recurse=True)
        print('Folder ' + r.name + ' archived successfully!')

# Delete archived folders older than 30 days
for r in archived_files:
    modification_time_ms = r.modificationTime
    modification_time = datetime.fromtimestamp(modification_time_ms / 1000)  # modificationTime is in milliseconds
    datenow = datetime.now()
    duration = datenow - modification_time
    if duration.days > 30:
        # Delete the data from the archived folder
        dbutils.fs.rm(r.path, recurse=True)
        print('Folder ' + r.name + ' deleted successfully!')


# COMMAND ----------
--------------------------------------------------------------------------------
/manifest.mf:
--------------------------------------------------------------------------------
{"version":"Manifest","guid":"5199d717-ff67-4619-86d3-ed30c6b30351","origId":-1,"name":"manifest.mf"}
--------------------------------------------------------------------------------
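The retention policy in conservation.py above (archive raw folders after 15 days, purge archived folders after 30) hinges on converting the millisecond modificationTime returned by dbutils.fs.ls into an age in days. The following minimal sketch shows just that check with an invented timestamp; it runs without Databricks and is not part of the repo.

# Minimal sketch (invented timestamp): the age computation used by conservation.py
from datetime import datetime

modification_time_ms = 1_688_000_000_000   # example value, shaped like a dbutils.fs.ls() modificationTime
modification_time = datetime.fromtimestamp(modification_time_ms / 1000)   # milliseconds -> seconds
age_days = (datetime.now() - modification_time).days

print(age_days > 15)   # True -> a raw/ folder this old would be archived
print(age_days > 30)   # True -> an archived/ folder this old would be deleted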