├── Automated_ETL.py
├── ETL_public_transport_data.ipynb
├── LICENSE
├── README.md
├── azure_databricks
│   ├── linkedService
│   │   └── AzureDatabricks1.json
│   ├── pipeline
│   │   └── pipeline1.json
│   └── trigger
│       └── trigger1.json
├── public_tansport_databricks
│   ├── README.md
│   ├── politiques_conservation_automatise.py
│   └── transformations_automatiques.py
└── public_transport_data_generated.ipynb

/Automated_ETL.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # Connection configuration
3 | spark.conf.set(
4 |     "fs.azure.account.key.aminbenstorage.blob.core.windows.net", "B2vg1vuvYtkcygLcTLXhErl9DccZRYrGkrtXROsTvfIes2c/QM4vfyFfJdTSXv0riqXi/0iiNucV+ASt0IgRgw=="
5 | )
6 | 
7 | # COMMAND ----------
8 | 
9 | from pyspark.sql.functions import avg, count, year, month, dayofmonth, dayofweek, to_date, col, expr, unix_timestamp, when
10 | 
11 | # Process a single CSV file
12 | def process_csv(input_df):
13 |     # Convert the "Date" column to a date type
14 |     input_df = input_df.withColumn("Date", to_date("Date", "yyyy-MM-dd"))
15 | 
16 |     # Extract the year, month, day and day of week
17 |     input_df = input_df.withColumn("Year", year("Date"))
18 |     input_df = input_df.withColumn("Month", month("Date"))
19 |     input_df = input_df.withColumn("Day", dayofmonth("Date"))
20 |     input_df = input_df.withColumn("DayOfWeek", dayofweek("Date"))
21 | 
22 |     # Drop rows where the hour part of "ArrivalTime" or "DepartureTime" is "24" or greater (invalid times)
23 |     input_df = input_df.filter(~(col("ArrivalTime").substr(1, 2) >= "24"))
24 |     input_df = input_df.filter(~(col("DepartureTime").substr(1, 2) >= "24"))
25 | 
26 |     # Compute the "Duration" column
27 |     input_df = input_df.withColumn("Duration", expr(
28 |         "from_unixtime(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm'), 'HH:mm')"
29 |     ))
30 | 
31 |     # Categorize delays based on the "Delay" column
32 |     input_df = input_df.withColumn("DelayCategory",
33 |                                    when(col("Delay") <= 0, "No Delay")
34 |                                    .when((col("Delay") > 0) & (col("Delay") <= 10), "Short Delay")
35 |                                    .when((col("Delay") > 10) & (col("Delay") <= 20), "Medium Delay")
36 |                                    .otherwise("Long Delay"))
37 | 
38 |     # Flag peak and off-peak trips based on the passenger count
39 |     average_passengers = input_df.select(avg("Passengers")).first()[0]
40 |     input_df = input_df.withColumn("HeureDePointe", when(col("Passengers") > average_passengers, "peak").otherwise("off-peak"))
41 | 
42 |     return input_df
43 | 
44 | # Aggregate the data per route
45 | def aggregate_data(input_df):
46 |     result_df = input_df.groupBy("Route").agg(
47 |         avg("Delay").alias("RetardMoyen"),
48 |         avg("Passengers").alias("NombrePassagersMoyen"),
49 |         count("*").alias("NombreTotalVoyages")
50 |     )
51 |     return result_df
52 | 
53 | # COMMAND ----------
54 | 
55 | # Read the CSV file
56 | spark_df = spark.read.format('csv').option('header', True).load("wasbs://data@aminbenstorage.blob.core.windows.net/public_transport_data/raw/public-transport-data.csv")
57 | 
58 | # Apply the first function to preprocess the data
59 | processed_df = process_csv(spark_df)
60 | 
61 | # Apply the second function to aggregate the data
62 | aggregated_df = aggregate_data(processed_df)
63 | 
64 | # Display the results
65 | display(aggregated_df)
66 | 
67 | # COMMAND ----------
68 | 
69 | 
70 | 
--------------------------------------------------------------------------------
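Automated_ETL.py only displays the aggregated DataFrame and never persists it. Below is a minimal sketch of how the per-route summary could be written back to storage; the "processed/aggregated" sub-folder name and the write options are assumptions for illustration, not something defined in this repository.

# Sketch only: persist the per-route summary produced by aggregate_data().
# The "processed/aggregated" location is hypothetical and not used elsewhere in this repo.
output_path = (
    "wasbs://data@aminbenstorage.blob.core.windows.net/"
    "public_transport_data/processed/aggregated/route_stats"
)
(
    aggregated_df
    .coalesce(1)                 # the summary is small; a single output file is easier to consume
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)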
/ETL_public_transport_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "752d8845-7cbb-4f6d-bea5-2e3f04d56704", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "#Connection configuration\n", 21 | "spark.conf.set(\n", 22 | "\"fs.azure.account.key.aminbenstorage.blob.core.windows.net\", \"+6pERcvu8lJiDee3AMSByWKJMc3bYKLCeo9/r4d9hIcz0YyNoDpKTO4muOdjKqwWxlOwEd3dGWru+ASth3iE9w==\"\n", 23 | ")\n", 24 | "\n", 25 | "#affichage de données\n", 26 | "spark_df = spark.read.format('csv').option('header', True).load(\"wasbs://data@aminbenstorage.blob.core.windows.net/public_transport_data/raw/public-transport-data.csv\")\n", 27 | "\n", 28 | "display(spark_df)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 0, 34 | "metadata": { 35 | "application/vnd.databricks.v1+cell": { 36 | "cellMetadata": { 37 | "byteLimit": 2048000, 38 | "rowLimit": 10000 39 | }, 40 | "inputWidgets": {}, 41 | "nuid": "8a0ca201-c451-4242-9f75-6cc5d60b1af4", 42 | "showTitle": false, 43 | "title": "" 44 | } 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "#infos sur les colonnes\n", 49 | "spark_df.printSchema()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 0, 55 | "metadata": { 56 | "application/vnd.databricks.v1+cell": { 57 | "cellMetadata": { 58 | "byteLimit": 2048000, 59 | "rowLimit": 10000 60 | }, 61 | "inputWidgets": {}, 62 | "nuid": "d4a24fc1-5d96-417f-9f52-f5a0f641310e", 63 | "showTitle": false, 64 | "title": "" 65 | } 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "unique_dates = spark_df.select(\"Date\").distinct()\n", 70 | "display(unique_dates)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 0, 76 | "metadata": { 77 | "application/vnd.databricks.v1+cell": { 78 | "cellMetadata": { 79 | "byteLimit": 2048000, 80 | "rowLimit": 10000 81 | }, 82 | "inputWidgets": {}, 83 | "nuid": "cdc21430-c8c7-4400-9429-f7f5af86dc9f", 84 | "showTitle": false, 85 | "title": "" 86 | } 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "from pyspark.sql.functions import year, month, dayofmonth, dayofweek, to_date, col\n", 91 | "\n", 92 | "# Convertir la colonne \"Date\" en un format de date\n", 93 | "spark_df = spark_df.withColumn(\"Date\", to_date(\"Date\", \"yyyy-MM-dd\"))\n", 94 | "\n", 95 | "# Extraire l'année, le mois, le jour et le jour de la semaine\n", 96 | "spark_df = spark_df.withColumn(\"Year\", year(\"Date\"))\n", 97 | "spark_df = spark_df.withColumn(\"Month\", month(\"Date\"))\n", 98 | "spark_df = spark_df.withColumn(\"Day\", dayofmonth(\"Date\"))\n", 99 | "spark_df = spark_df.withColumn(\"DayOfWeek\", dayofweek(\"Date\"))\n", 100 | "\n", 101 | "# Afficher les résultats\n", 102 | "display(spark_df)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 0, 108 | "metadata": { 109 | "application/vnd.databricks.v1+cell": { 110 | "cellMetadata": { 111 | "byteLimit": 2048000, 112 | "rowLimit": 10000 113 | }, 114 | "inputWidgets": {}, 115 | "nuid": "dbcd678a-2c49-4e3b-96dc-bca700fc8a43", 116 | "showTitle": false, 117 | "title": "" 118 | } 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "\n", 123 | "# Supprimer les lignes où les deux premiers caractères de la colonne \"ArrivalTime\" sont 
\"24\" ou supérieurs à \"24\"\n", 124 | "spark_df = spark_df.filter(~(col(\"ArrivalTime\").substr(1, 2) >= \"24\"))\n", 125 | "spark_df = spark_df.filter(~(col(\"DepartureTime\").substr(1, 2) >= \"24\"))\n", 126 | "\n", 127 | "display(spark_df)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 0, 133 | "metadata": { 134 | "application/vnd.databricks.v1+cell": { 135 | "cellMetadata": { 136 | "byteLimit": 2048000, 137 | "rowLimit": 10000 138 | }, 139 | "inputWidgets": {}, 140 | "nuid": "f9d1b775-2af4-46f7-8c12-4c02b5c7b21a", 141 | "showTitle": false, 142 | "title": "" 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "\n", 148 | "from pyspark.sql.functions import expr, unix_timestamp \n", 149 | "spark_df = spark_df.withColumn(\"Duration\", expr(\n", 150 | " \"from_unixtime(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm'), 'HH:mm')\"\n", 151 | "))\n", 152 | "\n", 153 | "display(spark_df)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 0, 159 | "metadata": { 160 | "application/vnd.databricks.v1+cell": { 161 | "cellMetadata": { 162 | "byteLimit": 2048000, 163 | "rowLimit": 10000 164 | }, 165 | "inputWidgets": {}, 166 | "nuid": "eba6adb1-71a3-4310-9901-9c7f3fac5ab1", 167 | "showTitle": false, 168 | "title": "" 169 | } 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "\n", 174 | "from pyspark.sql.functions import when \n", 175 | "\n", 176 | "# Catégoriser les retards en fonction de la colonne \"Delay\"\n", 177 | "\n", 178 | "spark_df = spark_df.withColumn(\"DelayCategory\", \n", 179 | " when(col(\"Delay\") <= 0, \"No Delay\")\n", 180 | " .when((col(\"Delay\") > 0) & (col(\"Delay\") <= 10), \"Short Delay\")\n", 181 | " .when((col(\"Delay\") > 10) & (col(\"Delay\") <= 20), \"Medium Delay\")\n", 182 | " .otherwise(\"Long Delay\"))\n", 183 | "display(spark_df)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 0, 189 | "metadata": { 190 | "application/vnd.databricks.v1+cell": { 191 | "cellMetadata": { 192 | "byteLimit": 2048000, 193 | "rowLimit": 10000 194 | }, 195 | "inputWidgets": {}, 196 | "nuid": "bfe2968f-2946-4db9-ae41-de14ed0cd8fd", 197 | "showTitle": false, 198 | "title": "" 199 | } 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "\n", 204 | "from pyspark.sql.functions import avg, col\n", 205 | "\n", 206 | "average_passengers = spark_df.select(avg(\"Passengers\")).first()[0]\n", 207 | "\n", 208 | "# Identifier les heures de pointe et heures hors pointe en fonction du nombre de passagers :\n", 209 | "\n", 210 | "spark_df = spark_df.withColumn(\"HeureDePointe\", when(col(\"Passengers\") > average_passengers, \"peak\").otherwise(\"off-peak\"))\n", 211 | "\n", 212 | "# Afficher le DataFrame avec les heures de pointe identifiées :\n", 213 | "display(spark_df)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 0, 219 | "metadata": { 220 | "application/vnd.databricks.v1+cell": { 221 | "cellMetadata": { 222 | "byteLimit": 2048000, 223 | "rowLimit": 10000 224 | }, 225 | "inputWidgets": {}, 226 | "nuid": "dabe3c76-2076-40da-9d82-f9b33ee4216e", 227 | "showTitle": false, 228 | "title": "" 229 | } 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "\n", 234 | "from pyspark.sql.functions import count\n", 235 | "\n", 236 | "result_df = spark_df.groupBy(\"Route\").agg(\n", 237 | " avg(\"Delay\").alias(\"RetardMoyen\"),\n", 238 | " avg(\"Passengers\").alias(\"NombrePassagersMoyen\"),\n", 239 | " count(\"*\").alias(\"NombreTotalVoyages\")\n", 240 
| ")\n", 241 | "\n", 242 | "#Afficher le DataFrame résultant :\n", 243 | "display(result_df)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 0, 249 | "metadata": { 250 | "application/vnd.databricks.v1+cell": { 251 | "cellMetadata": { 252 | "byteLimit": 2048000, 253 | "rowLimit": 10000 254 | }, 255 | "inputWidgets": {}, 256 | "nuid": "08f6f103-c07d-4b00-b9cc-d90c55d3d5f9", 257 | "showTitle": false, 258 | "title": "" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "spark.conf.set(\n", 264 | "\"fs.azure.account.key.aminbenstorage.dfs.core.windows.net\", \"+6pERcvu8lJiDee3AMSByWKJMc3bYKLCeo9/r4d9hIcz0YyNoDpKTO4muOdjKqwWxlOwEd3dGWru+ASth3iE9w==\"\n", 265 | ")\n", 266 | "\n", 267 | "raw = \"abfss://data@aminbenstorage.dfs.core.windows.net/public_transport_data/raw/\"\n", 268 | "processed = \"abfss://data@aminbenstorage.dfs.core.windows.net/public_transport_data/processed/\"\n", 269 | "\n", 270 | "raw_files = dbutils.fs.ls(raw)\n", 271 | "#raw_csv_files = [f.path for f in raw_files if f.name.endswith(\".csv\")] # Lit of CSV Files\n", 272 | "#raw_file_count = len(raw_csv_files)\n", 273 | "display(raw_files)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 0, 279 | "metadata": { 280 | "application/vnd.databricks.v1+cell": { 281 | "cellMetadata": {}, 282 | "inputWidgets": {}, 283 | "nuid": "bef07fa9-c814-4a92-a483-345ee7254569", 284 | "showTitle": false, 285 | "title": "" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [] 290 | } 291 | ], 292 | "metadata": { 293 | "application/vnd.databricks.v1+notebook": { 294 | "dashboards": [], 295 | "language": "python", 296 | "notebookMetadata": { 297 | "pythonIndentUnit": 4 298 | }, 299 | "notebookName": "ETL_public_transport_data", 300 | "widgets": {} 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 0 305 | } 306 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 aminscientist 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # azure_databricks -------------------------------------------------------------------------------- /azure_databricks/linkedService/AzureDatabricks1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks1", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "annotations": [], 6 | "type": "AzureDatabricks", 7 | "typeProperties": { 8 | "domain": "https://adb-7139156790856287.7.azuredatabricks.net", 9 | "existingClusterId": "0927-075712-jp35wqf7", 10 | "encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEIzMEI1Qzg0LTMxQ0YtNDVGQy05NDQwLURFRkRFMTUxMDYxNF8xOWNmYTMwNi04YzIyLTRlY2QtOWNkYS0yNTMzZjM2OTAxYzkiDQp9" 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /azure_databricks/pipeline/pipeline1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pipeline1", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Notebook1", 7 | "type": "DatabricksNotebook", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "0.12:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [], 17 | "typeProperties": { 18 | "notebookPath": "/Repos/abenazzouz.ext@simplonformations.onmicrosoft.com/azure_databricks/Transport_Data_Integration_and_Management_with_Azure_Databrick-main/ETL auto" 19 | }, 20 | "linkedServiceName": { 21 | "referenceName": "AzureDatabricks1", 22 | "type": "LinkedServiceReference" 23 | } 24 | }, 25 | { 26 | "name": "Notebook2", 27 | "type": "DatabricksNotebook", 28 | "dependsOn": [ 29 | { 30 | "activity": "Notebook1", 31 | "dependencyConditions": [ 32 | "Succeeded" 33 | ] 34 | } 35 | ], 36 | "policy": { 37 | "timeout": "0.12:00:00", 38 | "retry": 0, 39 | "retryIntervalInSeconds": 30, 40 | "secureOutput": false, 41 | "secureInput": false 42 | }, 43 | "userProperties": [], 44 | "typeProperties": { 45 | "notebookPath": "/Repos/abenazzouz.ext@simplonformations.onmicrosoft.com/azure_databricks/Transport_Data_Integration_and_Management_with_Azure_Databrick-main/Automatisation_Politiques_Conservation" 46 | }, 47 | "linkedServiceName": { 48 | "referenceName": "AzureDatabricks1", 49 | "type": "LinkedServiceReference" 50 | } 51 | } 52 | ], 53 | "annotations": [] 54 | } 55 | } -------------------------------------------------------------------------------- /azure_databricks/trigger/trigger1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "trigger1", 3 | "properties": { 4 | "annotations": [], 5 | "runtimeState": "Started", 6 | "pipelines": [ 7 | { 8 | "pipelineReference": { 9 | "referenceName": "pipeline1", 10 | "type": "PipelineReference" 11 | } 12 | } 13 | ], 14 | "type": "ScheduleTrigger", 15 | "typeProperties": { 16 | "recurrence": { 17 | "frequency": "Minute", 18 | "interval": 1, 19 | "startTime": "2023-09-27T16:16:00", 20 | "timeZone": "Morocco Standard Time" 21 | } 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /public_tansport_databricks/README.md: 
--------------------------------------------------------------------------------
1 | # Public_Transport_Data_Integration_and_Management_with_Azure
--------------------------------------------------------------------------------
/public_tansport_databricks/politiques_conservation_automatise.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | from pyspark.sql.functions import year, month, dayofmonth,\
3 |     dayofweek, col, date_format, regexp_extract, when, expr,\
4 |     unix_timestamp, from_unixtime, avg, to_timestamp, col, sum, count
5 | 
6 | from datetime import datetime
7 | 
8 | # COMMAND ----------
9 | 
10 | account_name = "aminbenstorage"
11 | container_name = "data"
12 | Access_keys = "B2vg1vuvYtkcygLcTLXhErl9DccZRYrGkrtXROsTvfIes2c/QM4vfyFfJdTSXv0riqXi/0iiNucV+ASt0IgRgw=="
13 | 
14 | spark.conf.set(
15 |     f"fs.azure.account.key.{account_name}.dfs.core.windows.net",
16 |     f"{Access_keys}"
17 | )
18 | 
19 | # COMMAND ----------
20 | 
21 | # List the files in the processed folder:
22 | processed_data = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/processed/")
23 | 
24 | # List the files in the raw folder:
25 | row_data = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/")
26 | 
27 | # Collect (name, modificationTime) for each file in raw:
28 | row_data_info = [(info.name, info.modificationTime) for info in row_data]
29 | 
30 | # Collect (name, modificationTime) for each file in processed:
31 | processed_data_info = [(info.name, info.modificationTime) for info in processed_data]
32 | 
33 | 
34 | # COMMAND ----------
35 | 
36 | # Archive raw files modified less than a day ago (modificationTime is in milliseconds):
37 | 
38 | for i in row_data_info:
39 |     timestamp_datetime = datetime.fromtimestamp(i[1] / 1000)
40 | 
41 |     # Time elapsed since the file was last modified
42 |     duration = datetime.now() - timestamp_datetime
43 |     duration_day = duration.days
44 |     print(duration_day)
45 | 
46 |     if duration_day == 0:
47 |         # File name:
48 |         filenam = i[0]
49 | 
50 |         # Source path in the raw folder:
51 |         fishier_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/{filenam}"
52 | 
53 |         # Destination path in the archive folder:
54 |         archive_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/Archive/{filenam}"
55 | 
56 |         dbutils.fs.cp(fishier_path, archive_path, recurse=True)
--------------------------------------------------------------------------------
/public_tansport_databricks/transformations_automatiques.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | from pyspark.sql.functions import year, month, dayofmonth,\
3 |     dayofweek, col, date_format, regexp_extract, when, expr,\
4 |     unix_timestamp, from_unixtime, avg, to_timestamp, col, sum, count
5 | 
6 | # COMMAND ----------
7 | 
8 | account_name = "aminbenstorage"
9 | container_name = "data"
10 | Access_keys = "B2vg1vuvYtkcygLcTLXhErl9DccZRYrGkrtXROsTvfIes2c/QM4vfyFfJdTSXv0riqXi/0iiNucV+ASt0IgRgw=="
11 | 
12 | spark.conf.set(
13 |     f"fs.azure.account.key.{account_name}.dfs.core.windows.net",
14 |     f"{Access_keys}"
15 | )
16 | 
17 | # COMMAND ----------
18 | 
19 | # List the files already present in the processed folder:
20 | processed_data = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/processed/")
21 | 
22 | processed_data = [file.name for file in processed_data]
23 | 
24 | # List the CSV files in the raw folder:
25 | file_list = dbutils.fs.ls(f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/")
26 | 
27 | # Keep only the file names:
28 | file_names = [file.name for file in file_list]
29 | 
30 | # COMMAND ----------
31 | 
32 | def Cleaning(i, container_name, account_name):
33 | 
34 |     # Select the current file:
35 |     file_location = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/raw/{file_names[i]}"
36 | 
37 |     # Load the data:
38 |     df = spark.read.format("csv").option("inferSchema", "True").option("header",
39 |         "True").option("delimiter", ",").load(file_location)
40 | 
41 |     # Normalize DepartureTime and ArrivalTime to HH:mm:
42 |     df = df.withColumn("DepartureTime", date_format(col("DepartureTime"), "HH:mm"))
43 |     df = df.withColumn("ArrivalTime", date_format(col("ArrivalTime"), "HH:mm"))
44 | 
45 |     # Fix invalid time values in the ArrivalTime column:
46 |     time_pattern = r'^([01][0-9]|2[0-3]):[0-5][0-9]$'
47 | 
48 |     df = df.withColumn("ArrivalTime", when(~col("ArrivalTime").rlike(time_pattern), "00:00").otherwise(col("ArrivalTime")))
49 | 
50 |     # Add the year, month, day and day_of_week columns:
51 |     df = df.withColumn("year", year("Date"))
52 |     df = df.withColumn("month", month("Date"))
53 |     df = df.withColumn("day", dayofmonth("Date"))
54 |     df = df.withColumn("day_of_week", dayofweek("Date"))
55 |     df = df.drop("date")
56 | 
57 |     # Compute the trip duration:
58 |     df = df.withColumn("Duration", expr(
59 |         "from_unixtime(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm'), 'HH:mm')"
60 |     ))
61 | 
62 |     # Categorize delays based on the "Delay" column:
63 |     df = df.withColumn("DelayCategory",
64 |                        when(col("Delay") <= 0, "No Delay")
65 |                        .when((col("Delay") > 0) & (col("Delay") <= 10), "Short Delay")
66 |                        .when((col("Delay") > 10) & (col("Delay") <= 20), "Medium Delay")
67 |                        .otherwise("Long Delay"))
68 | 
69 |     # Flag peak and off-peak trips:
70 |     average_passengers = df.select(avg("Passengers")).first()[0]
71 | 
72 |     df = df.withColumn("HeureDePointe", when(col("Passengers") > average_passengers, True).otherwise(False))
73 | 
74 |     # Define the path to save the CSV:
75 |     output_file_location = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/public_transport_data/processed/{file_names[i]}"
76 | 
77 |     # Save the DataFrame as a CSV file to the specified location:
78 |     df.write.csv(output_file_location, header=True, mode="overwrite")
79 | 
80 | # COMMAND ----------
81 | 
82 | # Process at most one new raw file per run: the first file not yet present in processed/.
83 | i = 0
84 | for fishie_csv in file_names:
85 | 
86 |     if len(processed_data) == 0:
87 |         print("processing the first raw file")
88 |         Cleaning(i, container_name, account_name)
89 |         break
90 | 
91 |     if fishie_csv + '/' not in processed_data:
92 |         i = file_names.index(fishie_csv)
93 |         Cleaning(i, container_name, account_name)
94 |         break
95 | 
96 | # COMMAND ----------
97 | 
98 | 
99 | 
--------------------------------------------------------------------------------
/public_transport_data_generated.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "75102fd6-90d4-44a0-9196-3355ba75a5e8",
14 | "showTitle": false,
15 | "title": ""
16 | },
17 | "id": "DMjYPGtNBMMx"
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import random\n",
23 | "from datetime import datetime, timedelta\n",
24 |
"\n", 25 | "#Connection configuration\n", 26 | "spark.conf.set(\n", 27 | "\"fs.azure.account.key.aminbenstorage.blob.core.windows.net\", \"+6pERcvu8lJiDee3AMSByWKJMc3bYKLCeo9/r4d9hIcz0YyNoDpKTO4muOdjKqwWxlOwEd3dGWru+ASth3iE9w==\"\n", 28 | ")\n", 29 | "\n", 30 | "# Définissez les noms des mois\n", 31 | "mois = [\"janvier\", \"février\", \"mars\", \"avril\", \"mai\"]\n", 32 | "\n", 33 | "for m in mois:\n", 34 | " # Generate data for the current month\n", 35 | " start_date = datetime(2023, mois.index(m) + 1, 1)\n", 36 | " if mois.index(m) == 3: # Avril a 30 jours\n", 37 | " end_date = datetime(2023, mois.index(m) + 1, 30)\n", 38 | " elif mois.index(m) == 1: # Février a 28 jours\n", 39 | " end_date = datetime(2023, mois.index(m) + 1, 28)\n", 40 | " else:\n", 41 | " end_date = datetime(2023, mois.index(m) + 1, 31)\n", 42 | "\n", 43 | " date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days)]\n", 44 | "\n", 45 | " # Define transportation-related data\n", 46 | " transport_types = [\"Bus\", \"Train\", \"Tram\", \"Metro\"]\n", 47 | " routes = [\"Route_\" + str(i) for i in range(1, 11)]\n", 48 | " stations = [\"Station_\" + str(i) for i in range(1, 21)]\n", 49 | "\n", 50 | " # Randomly select 5 days as extreme weather days\n", 51 | " extreme_weather_days = random.sample(date_generated, 5)\n", 52 | "\n", 53 | " data = []\n", 54 | "\n", 55 | " for date in date_generated:\n", 56 | " for _ in range(32): # 32 records per day to get a total of 992 records for the current month\n", 57 | " transport = random.choice(transport_types)\n", 58 | " route = random.choice(routes)\n", 59 | "\n", 60 | " # Normal operating hours\n", 61 | " departure_hour = random.randint(5, 22)\n", 62 | " departure_minute = random.randint(0, 59)\n", 63 | "\n", 64 | " # Introducing Unusual Operating Hours for buses\n", 65 | " if transport == \"Bus\" and random.random() < 0.05: # 5% chance\n", 66 | " departure_hour = 3\n", 67 | "\n", 68 | " departure_time = f\"{departure_hour:02}:{departure_minute:02}\"\n", 69 | "\n", 70 | " # Normal duration\n", 71 | " duration = random.randint(10, 120)\n", 72 | "\n", 73 | " # Introducing Short Turnarounds\n", 74 | " if random.random() < 0.05: # 5% chance\n", 75 | " duration = random.randint(1, 5)\n", 76 | "\n", 77 | " # General delay\n", 78 | " delay = random.randint(0, 15)\n", 79 | "\n", 80 | " # Weather Impact\n", 81 | " if date in extreme_weather_days:\n", 82 | " # Increase delay by 10 to 60 minutes\n", 83 | " delay += random.randint(10, 60)\n", 84 | "\n", 85 | " # 10% chance to change the route\n", 86 | " if random.random() < 0.10:\n", 87 | " route = random.choice(routes)\n", 88 | "\n", 89 | " total_minutes = departure_minute + duration + delay\n", 90 | " arrival_hour = departure_hour + total_minutes // 60\n", 91 | " arrival_minute = total_minutes % 60\n", 92 | " arrival_time = f\"{arrival_hour:02}:{arrival_minute:02}\"\n", 93 | "\n", 94 | " passengers = random.randint(1, 100)\n", 95 | " departure_station = random.choice(stations)\n", 96 | " arrival_station = random.choice(stations)\n", 97 | "\n", 98 | " data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay])\n", 99 | "\n", 100 | " df = pd.DataFrame(data, columns=[\"Date\", \"TransportType\", \"Route\", \"DepartureTime\", \"ArrivalTime\", \"Passengers\", \"DepartureStation\", \"ArrivalStation\", \"Delay\"])\n", 101 | "\n", 102 | " # Modifiez le chemin de destination du fichier CSV pour le mois actuel\n", 103 | " destination_path = 
f\"wasbs://data@aminbenstorage.blob.core.windows.net/public_transport_data/raw/{m.capitalize()}\"\n", 104 | "\n", 105 | " # Écrivez les données dans un fichier CSV\n", 106 | " spark_df = spark.createDataFrame(df)\n", 107 | " spark_df.coalesce(1).write.format(\"com.databricks.spark.csv\").option(\"header\", \"true\").save(destination_path)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 0, 113 | "metadata": { 114 | "application/vnd.databricks.v1+cell": { 115 | "cellMetadata": {}, 116 | "inputWidgets": {}, 117 | "nuid": "6ac51858-5a0f-4893-a638-8d8df64cbd70", 118 | "showTitle": false, 119 | "title": "" 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "application/vnd.databricks.v1+notebook": { 128 | "dashboards": [], 129 | "language": "python", 130 | "notebookMetadata": { 131 | "pythonIndentUnit": 4 132 | }, 133 | "notebookName": "public_transport_data_generated", 134 | "widgets": {} 135 | }, 136 | "colab": { 137 | "authorship_tag": "ABX9TyNEcAPisy+UgH2pdAMa2tgd", 138 | "provenance": [] 139 | }, 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "name": "python" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } 151 | --------------------------------------------------------------------------------