├── README.md
├── script conservation.py
├── create csv files.ipynb
└── script import csv.ipynb

/README.md:
--------------------------------------------------------------------------------
# Public_Transport_DataBricks
--------------------------------------------------------------------------------
/script conservation.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %run "./script import csv"

# COMMAND ----------

# Calculate how long ago a file was last modified, in minutes
def get_file_duration(path):
    modification_time_ms = path.modificationTime
    modification_time = datetime.fromtimestamp(modification_time_ms / 1000)  # divide by 1000 to convert milliseconds to seconds
    duration = (datetime.now() - modification_time).total_seconds() / 60
    return duration

# COMMAND ----------

# Archive files from the raw directory
def archived_raw_files(raw_paths):
    for path in raw_paths:
        file_duration = get_file_duration(path)
        # check whether the file is at least 5 minutes old
        if file_duration >= 5:
            # source: the file in the raw directory
            source_directory = path.path
            # destination: the archived directory
            destination_directory = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/archived/{path.name}"
            dbutils.fs.mv(source_directory, destination_directory, recurse=True)

# COMMAND ----------

# Delete files from the archived directory
def delete_archived_files(archived_paths):
    for path in archived_paths:
        file_duration = get_file_duration(path)
        # check whether the file has been archived for at least 10 minutes
        if file_duration >= 10:
            # path of the file in the archived directory
            archived_file = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/archived/{path.name}"
            dbutils.fs.rm(archived_file, recurse=True)

# COMMAND ----------

from datetime import datetime

storage_account_name = "tarifihicham1cs"
storage_account_access_key = "OCGL4AOQKWaFu6lezWKGDCVXDe7534tiifLMFUgdrPm6YJ3Vff3CMX5EGbxwIXGgBkdqnO6xomBP+ASti5On2w=="
container_name = "tarifihichamcontainer"

# get the file listings of the raw, processed and archived directories
files_paths = get_file_path(storage_account_name, storage_account_access_key, container_name)

# Apply the conservation (retention) policies
archived_raw_files(files_paths[0])
delete_archived_files(files_paths[2])
--------------------------------------------------------------------------------
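
Note: all three files hard-code the storage account access key. Below is a minimal sketch of reading it from a Databricks secret scope instead; the scope name "transport-scope" and key name "storage-key" are hypothetical placeholders, not something the project defines.

# Sketch only (not part of the original scripts): fetch the access key from a
# Databricks secret scope instead of embedding it in the notebook.
# The scope and key names are hypothetical.
storage_account_name = "tarifihicham1cs"
storage_account_access_key = dbutils.secrets.get(scope="transport-scope", key="storage-key")

spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_access_key,
)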
"spark.conf.set(f\"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net\",\n", 30 | " storage_account_access_key)\n", 31 | "\n", 32 | "month = 1\n", 33 | "year = 2023\n", 34 | "\n", 35 | "while month < 6:\n", 36 | " num_days = calendar.monthrange(year, month)[1]\n", 37 | " # Generate data for each month\n", 38 | " start_date = datetime(year, month, 1)\n", 39 | " end_date = datetime(year, month, num_days)\n", 40 | " date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days+1)]\n", 41 | "\n", 42 | " transport_types = [\"Bus\", \"Train\", \"Tram\", \"Metro\"]\n", 43 | " routes = [\"Route_\" + str(i) for i in range(1, 11)]\n", 44 | " stations = [\"Station_\" + str(i) for i in range(1, 21)]\n", 45 | "\n", 46 | " # Randomly select 5 days as extreme weather days\n", 47 | " extreme_weather_days = random.sample(date_generated, 5)\n", 48 | "\n", 49 | " data = []\n", 50 | "\n", 51 | " for date in date_generated:\n", 52 | " for _ in range(32): # 32 records per day to get a total of 992 records for January\n", 53 | " transport = random.choice(transport_types)\n", 54 | " route = random.choice(routes)\n", 55 | "\n", 56 | " # Normal operating hours\n", 57 | " departure_hour = random.randint(5, 22)\n", 58 | " departure_minute = random.randint(0, 59)\n", 59 | "\n", 60 | " # Introducing Unusual Operating Hours for buses\n", 61 | " if transport == \"Bus\" and random.random() < 0.05: # 5% chance\n", 62 | " departure_hour = 3\n", 63 | "\n", 64 | " departure_time = f\"{departure_hour:02}:{departure_minute:02}\"\n", 65 | "\n", 66 | " # Normal duration\n", 67 | " duration = random.randint(10, 120)\n", 68 | "\n", 69 | " # Introducing Short Turnarounds\n", 70 | " if random.random() < 0.05: # 5% chance\n", 71 | " duration = random.randint(1, 5)\n", 72 | "\n", 73 | " # General delay\n", 74 | " delay = random.randint(0, 15)\n", 75 | "\n", 76 | " # Weather Impact\n", 77 | " if date in extreme_weather_days:\n", 78 | " # Increase delay by 10 to 60 minutes\n", 79 | " delay += random.randint(10, 60)\n", 80 | "\n", 81 | " # 10% chance to change the route\n", 82 | " if random.random() < 0.10:\n", 83 | " route = random.choice(routes)\n", 84 | "\n", 85 | " total_minutes = departure_minute + duration + delay\n", 86 | " arrival_hour = departure_hour + total_minutes // 60\n", 87 | " arrival_minute = total_minutes % 60\n", 88 | " arrival_time = f\"{arrival_hour:02}:{arrival_minute:02}\"\n", 89 | "\n", 90 | " passengers = random.randint(1, 100)\n", 91 | " departure_station = random.choice(stations)\n", 92 | " arrival_station = random.choice(stations)\n", 93 | "\n", 94 | " data.append([date, transport, route, departure_time, arrival_time, passengers, departure_station, arrival_station, delay])\n", 95 | "\n", 96 | " df = pd.DataFrame(data, columns=[\"Date\", \"TransportType\", \"Route\", \"DepartureTime\", \"ArrivalTime\", \"Passengers\", \"DepartureStation\", \"ArrivalStation\", \"Delay\"])\n", 97 | " \n", 98 | " spark_df = spark.createDataFrame(df)\n", 99 | "\n", 100 | " # Reduce the number of partitions to one\n", 101 | " spark_df = spark_df.coalesce(1)\n", 102 | "\n", 103 | " spark_df.write.format(\"csv\")\\\n", 104 | " .option(\"header\", \"true\")\\\n", 105 | " .mode(\"overwrite\")\\\n", 106 | " .save(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/public_transport_data{month}\")\n", 107 | " month = month + 1\n" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "application/vnd.databricks.v1+notebook": { 113 | "dashboards": [], 114 
| "language": "python", 115 | "notebookMetadata": { 116 | "pythonIndentUnit": 4 117 | }, 118 | "notebookName": "create csv files", 119 | "widgets": {} 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 0 124 | } 125 | -------------------------------------------------------------------------------- /script import csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "c339e251-71fd-4c81-888c-b631a406d1a2", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "def transform_dataframe(df):\n", 21 | " # Extract year\n", 22 | " df = df.withColumn(\"Year\", year(df[\"Date\"]))\n", 23 | " # Extract month\n", 24 | " df = df.withColumn(\"Month\", month(df[\"Date\"]))\n", 25 | " # Extract day\n", 26 | " df = df.withColumn(\"Day\", dayofmonth(df[\"Date\"]))\n", 27 | " # Extract day of week\n", 28 | " df = df.withColumn(\"DayOfWeek\", dayofweek(df[\"Date\"]))\n", 29 | " # Extract duration (calculate the duration in minutes)\n", 30 | " df = df.withColumn(\"Duration\", expr(\"(unix_timestamp(ArrivalTime, 'HH:mm') - unix_timestamp(DepartureTime, 'HH:mm')) / 60\"))\n", 31 | " # Calculate average passengers\n", 32 | " avgPassengers = df.select(avg(\"Passengers\")).first()[0]\n", 33 | " # Extract passengers traffic condition\n", 34 | " df = df.withColumn(\"PassengersTraffic\", expr(\"CASE WHEN Passengers <= {0} THEN 'Non' ELSE 'Oui' END\".format(avgPassengers)))\n", 35 | " return df\n", 36 | "\n", 37 | "# Calculate the average of Route, Passengers, Delay and the count of the Route\n", 38 | "def calcul_avg(df):\n", 39 | " df = df.groupBy(\"Route\").agg(avg(\"Passengers\").alias(\"AvgPassengers\"),avg(\"Delay\").alias(\"AvgDelay\"),count(\"Route\").alias(\"RouteCount\"))\n", 40 | " return df\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 0, 46 | "metadata": { 47 | "application/vnd.databricks.v1+cell": { 48 | "cellMetadata": { 49 | "byteLimit": 2048000, 50 | "rowLimit": 10000 51 | }, 52 | "inputWidgets": {}, 53 | "nuid": "aec21fe7-d33f-4bed-92d9-54cf320451e1", 54 | "showTitle": false, 55 | "title": "" 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "# get the destinations of the directories in our container (Raw, Processed, Archived)\n", 61 | "def get_file_path(storage_account_name,storage_account_access_key,container_name):\n", 62 | "\n", 63 | " spark.conf.set(f\"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net\",\n", 64 | " storage_account_access_key)\n", 65 | "\n", 66 | " raw = f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/\"\n", 67 | " processed = f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/processed/\"\n", 68 | " archived = f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/archived/\"\n", 69 | " \n", 70 | " raw_files = dbutils.fs.ls(raw)\n", 71 | " processed_files = dbutils.fs.ls(processed)\n", 72 | " archived_files = dbutils.fs.ls(archived)\n", 73 | "\n", 74 | " return [raw_files, processed_files, archived_files]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 0, 80 | "metadata": { 81 | "application/vnd.databricks.v1+cell": { 82 | 
"cellMetadata": { 83 | "byteLimit": 2048000, 84 | "rowLimit": 10000 85 | }, 86 | "inputWidgets": {}, 87 | "nuid": "3cca86ac-fab6-4a6e-91df-2fe17234d862", 88 | "showTitle": false, 89 | "title": "" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from pyspark.sql.functions import year,month,dayofmonth,dayofweek,to_timestamp,from_unixtime,unix_timestamp,expr,avg,count,when,col\n", 95 | "\n", 96 | "storage_account_name = \"tarifihicham1cs\"\n", 97 | "storage_account_access_key = \"OCGL4AOQKWaFu6lezWKGDCVXDe7534tiifLMFUgdrPm6YJ3Vff3CMX5EGbxwIXGgBkdqnO6xomBP+ASti5On2w==\"\n", 98 | "container_name = \"tarifihichamcontainer\"\n", 99 | "\n", 100 | "# get all the files destinations\n", 101 | "files_paths = get_file_path(storage_account_name,storage_account_access_key,container_name)\n", 102 | "files_processed = []\n", 103 | "# get all the files processed\n", 104 | "for processed in files_paths[1]:\n", 105 | " files_processed.append(processed.name)\n", 106 | "# this counter is to apply the transformation on juste 2 files\n", 107 | "counter = 0\n", 108 | "for raw in files_paths[0]:\n", 109 | " if (raw.name not in files_processed) and (counter < 2):\n", 110 | " filepath = dbutils.fs.ls(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/\"+raw.name)\n", 111 | " for filename in filepath:\n", 112 | " if filename.name.endswith(\".csv\"):\n", 113 | " # get the csv file\n", 114 | " df = spark.read.format(\"csv\")\\\n", 115 | " .option(\"header\", \"true\")\\\n", 116 | " .load(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/raw/{raw.name}/{filename.name}\")\n", 117 | " # apply the transformations\n", 118 | " df = transform_dataframe(df)\n", 119 | " dfm = calcul_avg(df)\n", 120 | " # Reduce the number of partitions to one\n", 121 | " df = df.coalesce(1)\n", 122 | " dfm = dfm.coalesce(1)\n", 123 | " # Export csv file processed\n", 124 | " df.write.format(\"csv\")\\\n", 125 | " .option(\"header\", \"true\")\\\n", 126 | " .mode(\"overwrite\")\\\n", 127 | " .save(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/processed/{raw.name}data\")\n", 128 | " # Export csv file of analysing\n", 129 | " dfm.write.format(\"csv\")\\\n", 130 | " .option(\"header\", \"true\")\\\n", 131 | " .mode(\"overwrite\")\\\n", 132 | " .save(f\"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/public_transport_data/processed/{raw.name}analyse\")\n", 133 | " counter = counter + 1" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "application/vnd.databricks.v1+notebook": { 139 | "dashboards": [], 140 | "language": "python", 141 | "notebookMetadata": { 142 | "pythonIndentUnit": 4 143 | }, 144 | "notebookName": "script import csv", 145 | "widgets": {} 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } 151 | --------------------------------------------------------------------------------