├── README.md
├── script conservation.py
├── create csv files.ipynb
└── script import csv.ipynb

/README.md:
--------------------------------------------------------------------------------
# Public_Transport_DataBricks
--------------------------------------------------------------------------------
/script conservation.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %run "./script import csv"

# COMMAND ----------

# Calculate how long ago a file was last modified, in minutes
def get_file_duration(path):
    modification_time_ms = path.modificationTime
    modification_time = datetime.fromtimestamp(modification_time_ms / 1000)  # divide by 1000 to convert milliseconds to seconds
    duration = (datetime.now() - modification_time).total_seconds() / 60
    return duration

# COMMAND ----------

# Archive files from the raw directory
def archived_raw_files(raw_paths):
    for path in raw_paths:
        file_duration = get_file_duration(path)
        # check whether the file is at least 5 minutes old
        if file_duration >= 5:
            # source: the file in the raw directory
            source_directory = path.path
            # destination: the archived directory
            destination_directory = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/archived/{path.name}"
            dbutils.fs.mv(source_directory, destination_directory, recurse=True)

# COMMAND ----------

# Delete files from the archived directory
def delete_archived_files(archived_paths):
    for path in archived_paths:
        file_duration = get_file_duration(path)
        # check whether the file has been archived for at least 10 minutes
        if file_duration >= 10:
            # path of the file in the archived directory
            archived_file = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/archived/{path.name}"
            dbutils.fs.rm(archived_file, recurse=True)

# COMMAND ----------

from datetime import datetime

storage_account_name = "tarifihicham1cs"
storage_account_access_key = "OCGL4AOQKWaFu6lezWKGDCVXDe7534tiifLMFUgdrPm6YJ3Vff3CMX5EGbxwIXGgBkdqnO6xomBP+ASti5On2w=="
container_name = "tarifihichamcontainer"

# get the file listings of the raw, processed and archived directories
files_paths = get_file_path(storage_account_name, storage_account_access_key, container_name)

# Apply the conservation (retention) policies
archived_raw_files(files_paths[0])
delete_archived_files(files_paths[2])
--------------------------------------------------------------------------------
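
Note: all three files hard-code the storage account access key. Below is a minimal sketch of reading it from a Databricks secret scope instead; the scope name "transport-scope" and key name "storage-key" are hypothetical placeholders, not something the project defines.

# Sketch only (not part of the original scripts): fetch the access key from a
# Databricks secret scope instead of embedding it in the notebook.
# The scope and key names are hypothetical.
storage_account_name = "tarifihicham1cs"
storage_account_access_key = dbutils.secrets.get(scope="transport-scope", key="storage-key")

spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_access_key,
)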
"spark.conf.set(f\"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net\",\n", 30 | " storage_account_access_key)\n", 31 | "\n", 32 | "month = 1\n", 33 | "year = 2023\n", 34 | "\n", 35 | "while month < 6:\n", 36 | " num_days = calendar.monthrange(year, month)[1]\n", 37 | " # Generate data for each month\n", 38 | " start_date = datetime(year, month, 1)\n", 39 | " end_date = datetime(year, month, num_days)\n", 40 | " date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days+1)]\n", 41 | "\n", 42 | " transport_types = [\"Bus\", \"Train\", \"Tram\", \"Metro\"]\n", 43 | " routes = [\"Route_\" + str(i) for i in range(1, 11)]\n", 44 | " stations = [\"Station_\" + str(i) for i in range(1, 21)]\n", 45 | "\n", 46 | " # Randomly select 5 days as extreme weather days\n", 47 | " extreme_weather_days = random.sample(date_generated, 5)\n", 48 | "\n", 49 | " data = []\n", 50 | "\n", 51 | " for date in date_generated:\n", 52 | " for _ in range(32): # 32 records per day to get a total of 992 records for January\n", 53 | " transport = random.choice(transport_types)\n", 54 | " route = random.choice(routes)\n", 55 | "\n", 56 | " # Normal operating hours\n", 57 | " departure_hour = random.randint(5, 22)\n", 58 | " departure_minute = random.randint(0, 59)\n", 59 | "\n", 60 | " # Introducing Unusual Operating Hours for buses\n", 61 | " if transport == \"Bus\" and random.random() < 0.05: # 5% chance\n", 62 | " departure_hour = 3\n", 63 | "\n", 64 | " departure_time = f\"{departure_hour:02}:{departure_minute:02}\"\n", 65 | "\n", 66 | " # Normal duration\n", 67 | " duration = random.randint(10, 120)\n", 68 | "\n", 69 | " # Introducing Short Turnarounds\n", 70 | " if random.random() < 0.05: # 5% chance\n", 71 | " duration = random.randint(1, 5)\n", 72 | "\n", 73 | " # General delay\n", 74 | " delay = random.randint(0, 15)\n", 75 | "\n", 76 | " # Weather Impact\n", 77 | " if date in extreme_weather_days:\n", 78 | " # Increase delay by 10 to 60 minutes\n", 79 | " delay += random.randint(10, 60)\n", 80 | "\n", 81 | " # 10% chance to change the route\n", 82 | " if random.random() < 0.10:\n", 83 | " route = random.choice(routes)\n", 84 | "\n", 85 | " total_minutes = departure_minute + duration + delay\n", 86 | " arrival_hour = departure_hour + total_minutes // 60\n", 87 | " arrival_minute = total_minutes % 60\n", 88 | " arrival_time = f\"{arrival_hour:02}:{arrival_minute:02}\"\n", 89 | "\n", 90 | " passengers = random.randint(1, 100)\n", 91 | " departure_station = random.choice(stations)\n", 92 | " arrival_station = random.choice(stations)\n", 93 | "\n", 94 | " data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay])\n", 95 | "\n", 96 | " df = pd.DataFrame(data, columns=[\"Date\", \"TransportType\", \"Route\", \"DepartureTime\", \"ArrivalTime\", \"Passengers\", \"DepartureStation\", \"ArrivalStation\", \"Delay\"])\n", 97 | " \n", 98 | " spark_df = spark.createDataFrame(df)\n", 99 | "\n", 100 | " # Reduce the number of partitions to one\n", 101 | " spark_df = spark_df.coalesce(1)\n", 102 | "\n", 103 | " spark_df.write.format(\"csv\")\\\n", 104 | " .option(\"header\", \"true\")\\\n", 105 | " .mode(\"overwrite\")\\\n", 106 | " .save(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/public_transport_data{month}\")\n", 107 | " month = month + 1\n" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "application/vnd.databricks.v1+notebook": { 113 | "dashboards": [], 114 
| "language": "python", 115 | "notebookMetadata": { 116 | "pythonIndentUnit": 4 117 | }, 118 | "notebookName": "create csv files", 119 | "widgets": {} 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 0 124 | } 125 | -------------------------------------------------------------------------------- /script import csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "c339e251-71fd-4c81-888c-b631a406d1a2", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "def transform_dataframe(df):\n", 21 | " # Extract year\n", 22 | " df = df.withColumn(\"Year\", year(df[\"Date\"]))\n", 23 | " # Extract month\n", 24 | " df = df.withColumn(\"Month\", month(df[\"Date\"]))\n", 25 | " # Extract day\n", 26 | " df = df.withColumn(\"Day\", dayofmonth(df[\"Date\"]))\n", 27 | " # Extract day of week\n", 28 | " df = df.withColumn(\"DayOfWeek\", dayofweek(df[\"Date\"]))\n", 29 | " # Extract duration (calculate the duration in minutes)\n", 30 | " df = df.withColumn(\"Duration\", expr(\"(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm')) / 60\"))\n", 31 | " # Calculate average passengers\n", 32 | " avgPassengers = df.select(avg(\"Passengers\")).first()[0]\n", 33 | " # Extract passengers traffic condition\n", 34 | " df = df.withColumn(\"PassengersTraffic\", expr(\"CASE WHEN Passengers <= {0} THEN 'Non' ELSE 'Oui' END\".format(avgPassengers)))\n", 35 | " return df\n", 36 | "\n", 37 | "# Calculate the average of Route, Passengers, Delay and the count of the Route\n", 38 | "def calcul_avg(df):\n", 39 | " df = df.groupBy(\"Route\").agg(avg(\"Passengers\").alias(\"AvgPassengers\"),avg(\"Delay\").alias(\"AvgDelay\"),count(\"Route\").alias(\"RouteCount\"))\n", 40 | " return df\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 0, 46 | "metadata": { 47 | "application/vnd.databricks.v1+cell": { 48 | "cellMetadata": { 49 | "byteLimit": 2048000, 50 | "rowLimit": 10000 51 | }, 52 | "inputWidgets": {}, 53 | "nuid": "aec21fe7-d33f-4bed-92d9-54cf320451e1", 54 | "showTitle": false, 55 | "title": "" 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "# get the destinations of the directories in our container (Raw, Processed, Archived)\n", 61 | "def get_file_path(storage_account_name,storage_account_access_key,container_name):\n", 62 | "\n", 63 | " spark.conf.set(f\"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net\",\n", 64 | " storage_account_access_key)\n", 65 | "\n", 66 | " raw = f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/\"\n", 67 | " processed = f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/processed/\"\n", 68 | " archived = f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/archived/\"\n", 69 | " \n", 70 | " raw_files = dbutils.fs.ls(raw)\n", 71 | " processed_files = dbutils.fs.ls(processed)\n", 72 | " archived_files = dbutils.fs.ls(archived)\n", 73 | "\n", 74 | " return [raw_files, processed_files, archived_files]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 0, 80 | "metadata": { 81 | "application/vnd.databricks.v1+cell": { 82 | 
"cellMetadata": { 83 | "byteLimit": 2048000, 84 | "rowLimit": 10000 85 | }, 86 | "inputWidgets": {}, 87 | "nuid": "3cca86ac-fab6-4a6e-91df-2fe17234d862", 88 | "showTitle": false, 89 | "title": "" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from pyspark.sql.functions import year,month,dayofmonth,dayofweek,to_timestamp,from_unixtime,unix_timestamp,expr,avg,count,when,col\n", 95 | "\n", 96 | "storage_account_name = \"tarifihicham1cs\"\n", 97 | "storage_account_access_key = \"OCGL4AOQKWaFu6lezWKGDCVXDe7534tiifLMFUgdrPm6YJ3Vff3CMX5EGbxwIXGgBkdqnO6xomBP+ASti5On2w==\"\n", 98 | "container_name = \"tarifihichamcontainer\"\n", 99 | "\n", 100 | "# get all the files destinations\n", 101 | "files_paths = get_file_path(storage_account_name,storage_account_access_key,container_name)\n", 102 | "files_processed = []\n", 103 | "# get all the files processed\n", 104 | "for processed in files_paths[1]:\n", 105 | " files_processed.append(processed.name)\n", 106 | "# this counter is to apply the transformation on juste 2 files\n", 107 | "counter = 0\n", 108 | "for raw in files_paths[0]:\n", 109 | " if (raw.name not in files_processed) and (counter < 2):\n", 110 | " filepath = dbutils.fs.ls(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/\"+raw.name)\n", 111 | " for filename in filepath:\n", 112 | " if filename.name.endswith(\".csv\"):\n", 113 | " # get the csv file\n", 114 | " df = spark.read.format(\"csv\")\\\n", 115 | " .option(\"header\", \"true\")\\\n", 116 | " .load(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/{raw.name}/{filename.name}\")\n", 117 | " # apply the transformations\n", 118 | " df = transform_dataframe(df)\n", 119 | " dfm = calcul_avg(df)\n", 120 | " # Reduce the number of partitions to one\n", 121 | " df = df.coalesce(1)\n", 122 | " dfm = dfm.coalesce(1)\n", 123 | " # Export csv file processed\n", 124 | " df.write.format(\"csv\")\\\n", 125 | " .option(\"header\", \"true\")\\\n", 126 | " .mode(\"overwrite\")\\\n", 127 | " .save(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/processed/{raw.name}data\")\n", 128 | " # Export csv file of analysing\n", 129 | " dfm.write.format(\"csv\")\\\n", 130 | " .option(\"header\", \"true\")\\\n", 131 | " .mode(\"overwrite\")\\\n", 132 | " .save(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/processed/{raw.name}analyse\")\n", 133 | " counter = counter + 1" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "application/vnd.databricks.v1+notebook": { 139 | "dashboards": [], 140 | "language": "python", 141 | "notebookMetadata": { 142 | "pythonIndentUnit": 4 143 | }, 144 | "notebookName": "script import csv", 145 | "widgets": {} 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } 151 | --------------------------------------------------------------------------------