├── Automated_ETL.py
├── ETL_public_transport_data.ipynb
├── LICENSE
├── README.md
├── azure_databricks
│   ├── linkedService
│   │   └── AzureDatabricks1.json
│   ├── pipeline
│   │   └── pipeline1.json
│   └── trigger
│       └── trigger1.json
├── public_tansport_databricks
│   ├── README.md
│   ├── politiques_conservation_automatise.py
│   └── transformations_automatiques.py
└── public_transport_data_generated.ipynb

/Automated_ETL.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # Connection configuration
3 | spark.conf.set(
4 |     "fs.azure.account.key.aminbenstorage.blob.core.windows.net", "B2vg1vuvYtkcygLcTLXhErl9DccZRYrGkrtXROsTvfIes2c/QM4vfyFfJdTSXv0riqXi/0iiNucV+ASt0IgRgw=="
5 | )
6 | 
7 | # COMMAND ----------
8 | 
9 | from pyspark.sql.functions import avg, count, year, month, dayofmonth, dayofweek, to_date, col, expr, unix_timestamp, when
10 | 
11 | # Process a single CSV file
12 | def process_csv(input_df):
13 |     # Convert the "Date" column to a date type
14 |     input_df = input_df.withColumn("Date", to_date("Date", "yyyy-MM-dd"))
15 | 
16 |     # Extract the year, month, day and day of week
17 |     input_df = input_df.withColumn("Year", year("Date"))
18 |     input_df = input_df.withColumn("Month", month("Date"))
19 |     input_df = input_df.withColumn("Day", dayofmonth("Date"))
20 |     input_df = input_df.withColumn("DayOfWeek", dayofweek("Date"))
21 | 
22 |     # Drop rows where the hour part of "ArrivalTime" or "DepartureTime" is "24" or greater (invalid times)
23 |     input_df = input_df.filter(~(col("ArrivalTime").substr(1, 2) >= "24"))
24 |     input_df = input_df.filter(~(col("DepartureTime").substr(1, 2) >= "24"))
25 | 
26 |     # Compute the "Duration" column
27 |     input_df = input_df.withColumn("Duration", expr(
28 |         "from_unixtime(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm'), 'HH:mm')"
29 |     ))
30 | 
31 |     # Categorize delays based on the "Delay" column
32 |     input_df = input_df.withColumn("DelayCategory",
33 |                                    when(col("Delay") <= 0, "No Delay")
34 |                                    .when((col("Delay") > 0) & (col("Delay") <= 10), "Short Delay")
35 |                                    .when((col("Delay") > 10) & (col("Delay") <= 20), "Medium Delay")
36 |                                    .otherwise("Long Delay"))
37 | 
38 |     # Flag peak and off-peak trips based on the passenger count
39 |     average_passengers = input_df.select(avg("Passengers")).first()[0]
40 |     input_df = input_df.withColumn("HeureDePointe", when(col("Passengers") > average_passengers, "peak").otherwise("off-peak"))
41 | 
42 |     return input_df
43 | 
44 | # Aggregate the data per route
45 | def aggregate_data(input_df):
46 |     result_df = input_df.groupBy("Route").agg(
47 |         avg("Delay").alias("RetardMoyen"),
48 |         avg("Passengers").alias("NombrePassagersMoyen"),
49 |         count("*").alias("NombreTotalVoyages")
50 |     )
51 |     return result_df
52 | 
53 | # COMMAND ----------
54 | 
55 | # Read the CSV file
56 | spark_df = spark.read.format('csv').option('header', True).load("wasbs://data@aminbenstorage.blob.core.windows.net/public_transport_data/raw/public-transport-data.csv")
57 | 
58 | # Apply the first function to preprocess the data
59 | processed_df = process_csv(spark_df)
60 | 
61 | # Apply the second function to aggregate the data
62 | aggregated_df = aggregate_data(processed_df)
63 | 
64 | # Display the results
65 | display(aggregated_df)
66 | 
67 | # COMMAND ----------
68 | 
69 | 
70 | 
--------------------------------------------------------------------------------
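Automated_ETL.py only displays the aggregated DataFrame and never persists it. Below is a minimal sketch of how the per-route summary could be written back to storage; the "processed/aggregated" sub-folder name and the write options are assumptions for illustration, not something defined in this repository.

# Sketch only: persist the per-route summary produced by aggregate_data().
# The "processed/aggregated" location is hypothetical and not used elsewhere in this repo.
output_path = (
    "wasbs://data@aminbenstorage.blob.core.windows.net/"
    "public_transport_data/processed/aggregated/route_stats"
)
(
    aggregated_df
    .coalesce(1)                 # the summary is small; a single output file is easier to consume
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)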
/ETL_public_transport_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "752d8845-7cbb-4f6d-bea5-2e3f04d56704", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "#Connection configuration\n", 21 | "spark.conf.set(\n", 22 | "\"fs.azure.account.key.aminbenstorage.blob.core.windows.net\", \"+6pERcvu8lJiDee3AMSByWKJMc3bYKLCeo9/r4d9hIcz0YyNoDpKTO4muOdjKqwWxlOwEd3dGWru+ASth3iE9w==\"\n", 23 | ")\n", 24 | "\n", 25 | "#affichage de données\n", 26 | "spark_df = spark.read.format('csv').option('header', True).load(\"wasbs://data@aminbenstorage.blob.core.windows.net/public_transport_data/raw/public-transport-data.csv\")\n", 27 | "\n", 28 | "display(spark_df)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 0, 34 | "metadata": { 35 | "application/vnd.databricks.v1+cell": { 36 | "cellMetadata": { 37 | "byteLimit": 2048000, 38 | "rowLimit": 10000 39 | }, 40 | "inputWidgets": {}, 41 | "nuid": "8a0ca201-c451-4242-9f75-6cc5d60b1af4", 42 | "showTitle": false, 43 | "title": "" 44 | } 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "#infos sur les colonnes\n", 49 | "spark_df.printSchema()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 0, 55 | "metadata": { 56 | "application/vnd.databricks.v1+cell": { 57 | "cellMetadata": { 58 | "byteLimit": 2048000, 59 | "rowLimit": 10000 60 | }, 61 | "inputWidgets": {}, 62 | "nuid": "d4a24fc1-5d96-417f-9f52-f5a0f641310e", 63 | "showTitle": false, 64 | "title": "" 65 | } 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "unique_dates = spark_df.select(\"Date\").distinct()\n", 70 | "display(unique_dates)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 0, 76 | "metadata": { 77 | "application/vnd.databricks.v1+cell": { 78 | "cellMetadata": { 79 | "byteLimit": 2048000, 80 | "rowLimit": 10000 81 | }, 82 | "inputWidgets": {}, 83 | "nuid": "cdc21430-c8c7-4400-9429-f7f5af86dc9f", 84 | "showTitle": false, 85 | "title": "" 86 | } 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "from pyspark.sql.functions import year, month, dayofmonth, dayofweek, to_date, col\n", 91 | "\n", 92 | "# Convertir la colonne \"Date\" en un format de date\n", 93 | "spark_df = spark_df.withColumn(\"Date\", to_date(\"Date\", \"yyyy-MM-dd\"))\n", 94 | "\n", 95 | "# Extraire l'année, le mois, le jour et le jour de la semaine\n", 96 | "spark_df = spark_df.withColumn(\"Year\", year(\"Date\"))\n", 97 | "spark_df = spark_df.withColumn(\"Month\", month(\"Date\"))\n", 98 | "spark_df = spark_df.withColumn(\"Day\", dayofmonth(\"Date\"))\n", 99 | "spark_df = spark_df.withColumn(\"DayOfWeek\", dayofweek(\"Date\"))\n", 100 | "\n", 101 | "# Afficher les résultats\n", 102 | "display(spark_df)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 0, 108 | "metadata": { 109 | "application/vnd.databricks.v1+cell": { 110 | "cellMetadata": { 111 | "byteLimit": 2048000, 112 | "rowLimit": 10000 113 | }, 114 | "inputWidgets": {}, 115 | "nuid": "dbcd678a-2c49-4e3b-96dc-bca700fc8a43", 116 | "showTitle": false, 117 | "title": "" 118 | } 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "\n", 123 | "# Supprimer les lignes où les deux premiers caractères de la colonne \"ArrivalTime\" sont 
\"24\" ou supérieurs à \"24\"\n", 124 | "spark_df = spark_df.filter(~(col(\"ArrivalTime\").substr(1, 2) >= \"24\"))\n", 125 | "spark_df = spark_df.filter(~(col(\"DepartureTime\").substr(1, 2) >= \"24\"))\n", 126 | "\n", 127 | "display(spark_df)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 0, 133 | "metadata": { 134 | "application/vnd.databricks.v1+cell": { 135 | "cellMetadata": { 136 | "byteLimit": 2048000, 137 | "rowLimit": 10000 138 | }, 139 | "inputWidgets": {}, 140 | "nuid": "f9d1b775-2af4-46f7-8c12-4c02b5c7b21a", 141 | "showTitle": false, 142 | "title": "" 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "\n", 148 | "from pyspark.sql.functions import expr, unix_timestamp \n", 149 | "spark_df = spark_df.withColumn(\"Duration\", expr(\n", 150 | " \"from_unixtime(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm'), 'HH:mm')\"\n", 151 | "))\n", 152 | "\n", 153 | "display(spark_df)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 0, 159 | "metadata": { 160 | "application/vnd.databricks.v1+cell": { 161 | "cellMetadata": { 162 | "byteLimit": 2048000, 163 | "rowLimit": 10000 164 | }, 165 | "inputWidgets": {}, 166 | "nuid": "eba6adb1-71a3-4310-9901-9c7f3fac5ab1", 167 | "showTitle": false, 168 | "title": "" 169 | } 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "\n", 174 | "from pyspark.sql.functions import when \n", 175 | "\n", 176 | "# Catégoriser les retards en fonction de la colonne \"Delay\"\n", 177 | "\n", 178 | "spark_df = spark_df.withColumn(\"DelayCategory\", \n", 179 | " when(col(\"Delay\") <= 0, \"No Delay\")\n", 180 | " .when((col(\"Delay\") > 0) & (col(\"Delay\") <= 10), \"Short Delay\")\n", 181 | " .when((col(\"Delay\") > 10) & (col(\"Delay\") <= 20), \"Medium Delay\")\n", 182 | " .otherwise(\"Long Delay\"))\n", 183 | "display(spark_df)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 0, 189 | "metadata": { 190 | "application/vnd.databricks.v1+cell": { 191 | "cellMetadata": { 192 | "byteLimit": 2048000, 193 | "rowLimit": 10000 194 | }, 195 | "inputWidgets": {}, 196 | "nuid": "bfe2968f-2946-4db9-ae41-de14ed0cd8fd", 197 | "showTitle": false, 198 | "title": "" 199 | } 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "\n", 204 | "from pyspark.sql.functions import avg, col\n", 205 | "\n", 206 | "average_passengers = spark_df.select(avg(\"Passengers\")).first()[0]\n", 207 | "\n", 208 | "# Identifier les heures de pointe et heures hors pointe en fonction du nombre de passagers :\n", 209 | "\n", 210 | "spark_df = spark_df.withColumn(\"HeureDePointe\", when(col(\"Passengers\") > average_passengers, \"peak\").otherwise(\"off-peak\"))\n", 211 | "\n", 212 | "# Afficher le DataFrame avec les heures de pointe identifiées :\n", 213 | "display(spark_df)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 0, 219 | "metadata": { 220 | "application/vnd.databricks.v1+cell": { 221 | "cellMetadata": { 222 | "byteLimit": 2048000, 223 | "rowLimit": 10000 224 | }, 225 | "inputWidgets": {}, 226 | "nuid": "dabe3c76-2076-40da-9d82-f9b33ee4216e", 227 | "showTitle": false, 228 | "title": "" 229 | } 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "\n", 234 | "from pyspark.sql.functions import count\n", 235 | "\n", 236 | "result_df = spark_df.groupBy(\"Route\").agg(\n", 237 | " avg(\"Delay\").alias(\"RetardMoyen\"),\n", 238 | " avg(\"Passengers\").alias(\"NombrePassagersMoyen\"),\n", 239 | " count(\"*\").alias(\"NombreTotalVoyages\")\n", 240 
| ")\n", 241 | "\n", 242 | "#Afficher le DataFrame résultant :\n", 243 | "display(result_df)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 0, 249 | "metadata": { 250 | "application/vnd.databricks.v1+cell": { 251 | "cellMetadata": { 252 | "byteLimit": 2048000, 253 | "rowLimit": 10000 254 | }, 255 | "inputWidgets": {}, 256 | "nuid": "08f6f103-c07d-4b00-b9cc-d90c55d3d5f9", 257 | "showTitle": false, 258 | "title": "" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "spark.conf.set(\n", 264 | "\"fs.azure.account.key.aminbenstorage.dfs.core.windows.net\", \"+6pERcvu8lJiDee3AMSByWKJMc3bYKLCeo9/r4d9hIcz0YyNoDpKTO4muOdjKqwWxlOwEd3dGWru+ASth3iE9w==\"\n", 265 | ")\n", 266 | "\n", 267 | "raw = \"abfss://data@aminbenstorage.dfs.core.windows.net/public_transport_data/raw/\"\n", 268 | "processed = \"abfss://data@aminbenstorage.dfs.core.windows.net/public_transport_data/processed/\"\n", 269 | "\n", 270 | "raw_files = dbutils.fs.ls(raw)\n", 271 | "#raw_csv_files = [f.path for f in raw_files if f.name.endswith(\".csv\")] # Lit of CSV Files\n", 272 | "#raw_file_count = len(raw_csv_files)\n", 273 | "display(raw_files)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 0, 279 | "metadata": { 280 | "application/vnd.databricks.v1+cell": { 281 | "cellMetadata": {}, 282 | "inputWidgets": {}, 283 | "nuid": "bef07fa9-c814-4a92-a483-345ee7254569", 284 | "showTitle": false, 285 | "title": "" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [] 290 | } 291 | ], 292 | "metadata": { 293 | "application/vnd.databricks.v1+notebook": { 294 | "dashboards": [], 295 | "language": "python", 296 | "notebookMetadata": { 297 | "pythonIndentUnit": 4 298 | }, 299 | "notebookName": "ETL_public_transport_data", 300 | "widgets": {} 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 0 305 | } 306 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 aminscientist 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # azure_databricks -------------------------------------------------------------------------------- /azure_databricks/linkedService/AzureDatabricks1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks1", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "annotations": [], 6 | "type": "AzureDatabricks", 7 | "typeProperties": { 8 | "domain": "https://adb-7139156790856287.7.azuredatabricks.net", 9 | "existingClusterId": "0927-075712-jp35wqf7", 10 | "encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEIzMEI1Qzg0LTMxQ0YtNDVGQy05NDQwLURFRkRFMTUxMDYxNF8xOWNmYTMwNi04YzIyLTRlY2QtOWNkYS0yNTMzZjM2OTAxYzkiDQp9" 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /azure_databricks/pipeline/pipeline1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pipeline1", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Notebook1", 7 | "type": "DatabricksNotebook", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "0.12:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [], 17 | "typeProperties": { 18 | "notebookPath": "/Repos/abenazzouz.ext@simplonformations.onmicrosoft.com/azure_databricks/Transport_Data_Integration_and_Management_with_Azure_Databrick-main/ETL auto" 19 | }, 20 | "linkedServiceName": { 21 | "referenceName": "AzureDatabricks1", 22 | "type": "LinkedServiceReference" 23 | } 24 | }, 25 | { 26 | "name": "Notebook2", 27 | "type": "DatabricksNotebook", 28 | "dependsOn": [ 29 | { 30 | "activity": "Notebook1", 31 | "dependencyConditions": [ 32 | "Succeeded" 33 | ] 34 | } 35 | ], 36 | "policy": { 37 | "timeout": "0.12:00:00", 38 | "retry": 0, 39 | "retryIntervalInSeconds": 30, 40 | "secureOutput": false, 41 | "secureInput": false 42 | }, 43 | "userProperties": [], 44 | "typeProperties": { 45 | "notebookPath": "/Repos/abenazzouz.ext@simplonformations.onmicrosoft.com/azure_databricks/Transport_Data_Integration_and_Management_with_Azure_Databrick-main/Automatisation_Politiques_Conservation" 46 | }, 47 | "linkedServiceName": { 48 | "referenceName": "AzureDatabricks1", 49 | "type": "LinkedServiceReference" 50 | } 51 | } 52 | ], 53 | "annotations": [] 54 | } 55 | } -------------------------------------------------------------------------------- /azure_databricks/trigger/trigger1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "trigger1", 3 | "properties": { 4 | "annotations": [], 5 | "runtimeState": "Started", 6 | "pipelines": [ 7 | { 8 | "pipelineReference": { 9 | "referenceName": "pipeline1", 10 | "type": "PipelineReference" 11 | } 12 | } 13 | ], 14 | "type": "ScheduleTrigger", 15 | "typeProperties": { 16 | "recurrence": { 17 | "frequency": "Minute", 18 | "interval": 1, 19 | "startTime": "2023-09-27T16:16:00", 20 | "timeZone": "Morocco Standard Time" 21 | } 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /public_tansport_databricks/README.md: 
--------------------------------------------------------------------------------
1 | # Public_Transport_Data_Integration_and_Management_with_Azure
--------------------------------------------------------------------------------
/public_tansport_databricks/politiques_conservation_automatise.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | from pyspark.sql.functions import year, month, dayofmonth,\
3 |     dayofweek, col, date_format, regexp_extract, when, expr,\
4 |     unix_timestamp, from_unixtime, avg, to_timestamp, col, sum, count
5 | 
6 | from datetime import datetime
7 | 
8 | # COMMAND ----------
9 | 
10 | account_name = "aminbenstorage"
11 | container_name = "data"
12 | Access_keys = "B2vg1vuvYtkcygLcTLXhErl9DccZRYrGkrtXROsTvfIes2c/QM4vfyFfJdTSXv0riqXi/0iiNucV+ASt0IgRgw=="
13 | 
14 | spark.conf.set(
15 |     f"fs.azure.account.key.{account_name}.dfs.core.windows.net",
16 |     f"{Access_keys}"
17 | )
18 | 
19 | # COMMAND ----------
20 | 
21 | # List the files in the processed folder:
22 | processed_data = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/processed/")
23 | 
24 | # List the files in the raw folder:
25 | row_data = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/")
26 | 
27 | # Collect (name, modificationTime) for each file in raw:
28 | row_data_info = [(info.name, info.modificationTime) for info in row_data]
29 | 
30 | # Collect (name, modificationTime) for each file in processed:
31 | processed_data_info = [(info.name, info.modificationTime) for info in processed_data]
32 | 
33 | 
34 | # COMMAND ----------
35 | 
36 | # Archive raw files modified less than a day ago (modificationTime is in milliseconds):
37 | 
38 | for i in row_data_info:
39 |     timestamp_datetime = datetime.fromtimestamp(i[1] / 1000)
40 | 
41 |     # Time elapsed since the file was last modified
42 |     duration = datetime.now() - timestamp_datetime
43 |     duration_day = duration.days
44 |     print(duration_day)
45 | 
46 |     if duration_day == 0:
47 |         # File name:
48 |         filenam = i[0]
49 | 
50 |         # Source path in the raw folder:
51 |         fishier_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/{filenam}"
52 | 
53 |         # Destination path in the archive folder:
54 |         archive_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/Archive/{filenam}"
55 | 
56 |         dbutils.fs.cp(fishier_path, archive_path, recurse=True)
--------------------------------------------------------------------------------
/public_tansport_databricks/transformations_automatiques.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | from pyspark.sql.functions import year, month, dayofmonth,\
3 |     dayofweek, col, date_format, regexp_extract, when, expr,\
4 |     unix_timestamp, from_unixtime, avg, to_timestamp, col, sum, count
5 | 
6 | # COMMAND ----------
7 | 
8 | account_name = "aminbenstorage"
9 | container_name = "data"
10 | Access_keys = "B2vg1vuvYtkcygLcTLXhErl9DccZRYrGkrtXROsTvfIes2c/QM4vfyFfJdTSXv0riqXi/0iiNucV+ASt0IgRgw=="
11 | 
12 | spark.conf.set(
13 |     f"fs.azure.account.key.{account_name}.dfs.core.windows.net",
14 |     f"{Access_keys}"
15 | )
16 | 
17 | # COMMAND ----------
18 | 
19 | # List the files already present in the processed folder:
20 | processed_data = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/processed/")
21 | 
22 | processed_data = [file.name for file in processed_data]
23 | 
24 | # List the CSV files in the raw folder:
25 | file_list = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/")
26 | 
27 | # Keep only the file names:
28 | file_names = [file.name for file in file_list]
29 | 
30 | # COMMAND ----------
31 | 
32 | def Cleaning(i, container_name, account_name):
33 | 
34 |     # Select the current file:
35 |     file_location = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/{file_names[i]}"
36 | 
37 |     # Load the data:
38 |     df = spark.read.format("csv").option("inferSchema", "True").option("header",
39 |         "True").option("delimiter", ",").load(file_location)
40 | 
41 |     # Normalize DepartureTime and ArrivalTime to HH:mm:
42 |     df = df.withColumn("DepartureTime", date_format(col("DepartureTime"), "HH:mm"))
43 |     df = df.withColumn("ArrivalTime", date_format(col("ArrivalTime"), "HH:mm"))
44 | 
45 |     # Fix invalid time values in the ArrivalTime column:
46 |     time_pattern = r'^([01][0-9]|2[0-3]):[0-5][0-9]$'
47 | 
48 |     df = df.withColumn("ArrivalTime", when(~col("ArrivalTime").rlike(time_pattern), "00:00").otherwise(col("ArrivalTime")))
49 | 
50 |     # Add the year, month, day and day_of_week columns:
51 |     df = df.withColumn("year", year("Date"))
52 |     df = df.withColumn("month", month("Date"))
53 |     df = df.withColumn("day", dayofmonth("Date"))
54 |     df = df.withColumn("day_of_week", dayofweek("Date"))
55 |     df = df.drop("date")
56 | 
57 |     # Compute the trip duration:
58 |     df = df.withColumn("Duration", expr(
59 |         "from_unixtime(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm'), 'HH:mm')"
60 |     ))
61 | 
62 |     # Categorize delays based on the "Delay" column:
63 |     df = df.withColumn("DelayCategory",
64 |                        when(col("Delay") <= 0, "No Delay")
65 |                        .when((col("Delay") > 0) & (col("Delay") <= 10), "Short Delay")
66 |                        .when((col("Delay") > 10) & (col("Delay") <= 20), "Medium Delay")
67 |                        .otherwise("Long Delay"))
68 | 
69 |     # Flag peak and off-peak trips:
70 |     average_passengers = df.select(avg("Passengers")).first()[0]
71 | 
72 |     df = df.withColumn("HeureDePointe", when(col("Passengers") > average_passengers, True).otherwise(False))
73 | 
74 |     # Define the path to save the CSV:
75 |     output_file_location = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/processed/{file_names[i]}"
76 | 
77 |     # Save the DataFrame as a CSV file to the specified location:
78 |     df.write.csv(output_file_location, header=True, mode="overwrite")
79 | 
80 | # COMMAND ----------
81 | 
82 | # Process at most one new raw file per run: the first file not yet present in processed/.
83 | i = 0
84 | for fishie_csv in file_names:
85 | 
86 |     if len(processed_data) == 0:
87 |         print("processing the first raw file")
88 |         Cleaning(i, container_name, account_name)
89 |         break
90 | 
91 |     if fishie_csv + '/' not in processed_data:
92 |         i = file_names.index(fishie_csv)
93 |         Cleaning(i, container_name, account_name)
94 |         break
95 | 
96 | # COMMAND ----------
97 | 
98 | 
99 | 
--------------------------------------------------------------------------------
/public_transport_data_generated.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "75102fd6-90d4-44a0-9196-3355ba75a5e8",
14 | "showTitle": false,
15 | "title": ""
16 | },
17 | "id": "DMjYPGtNBMMx"
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import random\n",
23 | "from datetime import datetime, timedelta\n",
24 |
"\n", 25 | "#Connection configuration\n", 26 | "spark.conf.set(\n", 27 | "\"fs.azure.account.key.aminbenstorage.blob.core.windows.net\", \"+6pERcvu8lJiDee3AMSByWKJMc3bYKLCeo9/r4d9hIcz0YyNoDpKTO4muOdjKqwWxlOwEd3dGWru+ASth3iE9w==\"\n", 28 | ")\n", 29 | "\n", 30 | "# Définissez les noms des mois\n", 31 | "mois = [\"janvier\", \"février\", \"mars\", \"avril\", \"mai\"]\n", 32 | "\n", 33 | "for m in mois:\n", 34 | " # Generate data for the current month\n", 35 | " start_date = datetime(2023, mois.index(m) + 1, 1)\n", 36 | " if mois.index(m) == 3: # Avril a 30 jours\n", 37 | " end_date = datetime(2023, mois.index(m) + 1, 30)\n", 38 | " elif mois.index(m) == 1: # Février a 28 jours\n", 39 | " end_date = datetime(2023, mois.index(m) + 1, 28)\n", 40 | " else:\n", 41 | " end_date = datetime(2023, mois.index(m) + 1, 31)\n", 42 | "\n", 43 | " date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days)]\n", 44 | "\n", 45 | " # Define transportation-related data\n", 46 | " transport_types = [\"Bus\", \"Train\", \"Tram\", \"Metro\"]\n", 47 | " routes = [\"Route_\" + str(i) for i in range(1, 11)]\n", 48 | " stations = [\"Station_\" + str(i) for i in range(1, 21)]\n", 49 | "\n", 50 | " # Randomly select 5 days as extreme weather days\n", 51 | " extreme_weather_days = random.sample(date_generated, 5)\n", 52 | "\n", 53 | " data = []\n", 54 | "\n", 55 | " for date in date_generated:\n", 56 | " for _ in range(32): # 32 records per day to get a total of 992 records for the current month\n", 57 | " transport = random.choice(transport_types)\n", 58 | " route = random.choice(routes)\n", 59 | "\n", 60 | " # Normal operating hours\n", 61 | " departure_hour = random.randint(5, 22)\n", 62 | " departure_minute = random.randint(0, 59)\n", 63 | "\n", 64 | " # Introducing Unusual Operating Hours for buses\n", 65 | " if transport == \"Bus\" and random.random() < 0.05: # 5% chance\n", 66 | " departure_hour = 3\n", 67 | "\n", 68 | " departure_time = f\"{departure_hour:02}:{departure_minute:02}\"\n", 69 | "\n", 70 | " # Normal duration\n", 71 | " duration = random.randint(10, 120)\n", 72 | "\n", 73 | " # Introducing Short Turnarounds\n", 74 | " if random.random() < 0.05: # 5% chance\n", 75 | " duration = random.randint(1, 5)\n", 76 | "\n", 77 | " # General delay\n", 78 | " delay = random.randint(0, 15)\n", 79 | "\n", 80 | " # Weather Impact\n", 81 | " if date in extreme_weather_days:\n", 82 | " # Increase delay by 10 to 60 minutes\n", 83 | " delay += random.randint(10, 60)\n", 84 | "\n", 85 | " # 10% chance to change the route\n", 86 | " if random.random() < 0.10:\n", 87 | " route = random.choice(routes)\n", 88 | "\n", 89 | " total_minutes = departure_minute + duration + delay\n", 90 | " arrival_hour = departure_hour + total_minutes // 60\n", 91 | " arrival_minute = total_minutes % 60\n", 92 | " arrival_time = f\"{arrival_hour:02}:{arrival_minute:02}\"\n", 93 | "\n", 94 | " passengers = random.randint(1, 100)\n", 95 | " departure_station = random.choice(stations)\n", 96 | " arrival_station = random.choice(stations)\n", 97 | "\n", 98 | " data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay])\n", 99 | "\n", 100 | " df = pd.DataFrame(data, columns=[\"Date\", \"TransportType\", \"Route\", \"DepartureTime\", \"ArrivalTime\", \"Passengers\", \"DepartureStation\", \"ArrivalStation\", \"Delay\"])\n", 101 | "\n", 102 | " # Modifiez le chemin de destination du fichier CSV pour le mois actuel\n", 103 | " destination_path = 
f\"wasbs://data@aminbenstorage.blob.core.windows.net/public_transport_data/raw/{m.capitalize()}\"\n", 104 | "\n", 105 | " # Écrivez les données dans un fichier CSV\n", 106 | " spark_df = spark.createDataFrame(df)\n", 107 | " spark_df.coalesce(1).write.format(\"com.databricks.spark.csv\").option(\"header\", \"true\").save(destination_path)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 0, 113 | "metadata": { 114 | "application/vnd.databricks.v1+cell": { 115 | "cellMetadata": {}, 116 | "inputWidgets": {}, 117 | "nuid": "6ac51858-5a0f-4893-a638-8d8df64cbd70", 118 | "showTitle": false, 119 | "title": "" 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "application/vnd.databricks.v1+notebook": { 128 | "dashboards": [], 129 | "language": "python", 130 | "notebookMetadata": { 131 | "pythonIndentUnit": 4 132 | }, 133 | "notebookName": "public_transport_data_generated", 134 | "widgets": {} 135 | }, 136 | "colab": { 137 | "authorship_tag": "ABX9TyNEcAPisy+UgH2pdAMa2tgd", 138 | "provenance": [] 139 | }, 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "name": "python" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } 151 | --------------------------------------------------------------------------------