├── .gitignore ├── CyclingERD.sql ├── README.md ├── airflow ├── .env.example ├── Dockerfile ├── README.md ├── dags │ ├── init_0_ingestion_to_s3_dag.py │ ├── init_1_spark_emr_dag.py │ ├── init_2_s3_to_redshifht_dag.py │ ├── init_3_web_scraping_dag.py │ ├── proc_0_ingestion_to_s3_dag.py │ ├── proc_1_spark_emr_dag.py │ ├── proc_2_s3_to_redshifht_dag.py │ └── scripts │ │ ├── init-data-transformation.py │ │ └── journey-data-transformation.py ├── docker-compose.yaml ├── logs │ └── scheduler │ │ └── latest └── requirements.txt ├── images ├── CyclingERD.png ├── batch-on-aws.png ├── dags │ ├── init_0.png │ ├── init_1.png │ ├── init_2.png │ ├── init_3.png │ ├── inits.png │ ├── proc_0.png │ ├── proc_1.png │ └── proc_2.png ├── final-dashboard.png └── redshift-metabase.png ├── metabase └── README.md ├── notebook ├── data-exploration │ ├── Exploration.ipynb │ └── Scraping.ipynb └── data-transformation │ ├── experiment.ipynb │ ├── init-data-transformation.ipynb │ └── journey-data-transformation.ipynb ├── services.md └── terraform ├── main.tf ├── services.md └── variables.tf /.gitignore: -------------------------------------------------------------------------------- 1 | log/* 2 | 3 | *.log 4 | 5 | .terraform* 6 | 7 | terraform.tfstate* 8 | 9 | __pycache__ 10 | 11 | airflow/.env 12 | 13 | 14 | .ipynb_checkpoints -------------------------------------------------------------------------------- /CyclingERD.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS "fact_journey"; 2 | DROP TABLE IF EXISTS "dim_station"; 3 | DROP TABLE IF EXISTS "dim_weather"; 4 | DROP TABLE IF EXISTS "dim_datetime"; 5 | 6 | CREATE TABLE "fact_journey" ( 7 | "rental_id" int PRIMARY KEY, 8 | "bike_id" int, 9 | "end_date" timestamp, 10 | "end_station" int, 11 | "start_date" timestamp, 12 | "start_station" int, 13 | "weather_date" date 14 | ); 15 | 16 | CREATE TABLE "dim_station" ( 17 | "station_id" int PRIMARY KEY, 18 | "station_name" varchar, 19 | "longitude" double precision, 20 | "latitude" double precision, 21 | "easting" double precision, 22 | "northing" double precision 23 | ); 24 | 25 | CREATE TABLE "dim_weather" ( 26 | "weather_date" date PRIMARY KEY, 27 | "feelslike" double precision, 28 | "feelslikemax" double precision, 29 | "feelslikemin" double precision, 30 | "humidity" double precision, 31 | "moonphase" double precision, 32 | "precip" double precision, 33 | "pressure" double precision, 34 | "solarenergy" double precision, 35 | "solarradiation" double precision, 36 | "sunrise" varchar, 37 | "sunset" varchar, 38 | "temp" double precision, 39 | "tempmax" double precision, 40 | "tempmin" double precision, 41 | "tzoffset" double precision, 42 | "uvindex" double precision, 43 | "visibility" double precision, 44 | "winddir" double precision, 45 | "windgust" double precision, 46 | "windspeed" double precision 47 | ); 48 | 49 | CREATE TABLE "dim_datetime" ( 50 | "datetime_id" timestamp PRIMARY KEY, 51 | "second" int, 52 | "minute" int, 53 | "hour" int, 54 | "day" int, 55 | "month" int, 56 | "week_day" int, 57 | "year" int 58 | ); 59 | 60 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("start_date") REFERENCES "dim_datetime" ("datetime_id"); 61 | 62 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("end_date") REFERENCES "dim_datetime" ("datetime_id"); 63 | 64 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("start_station") REFERENCES "dim_station" ("station_id"); 65 | 66 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("end_station") REFERENCES "dim_station" ("station_id"); 67 | 68 
| ALTER TABLE "fact_journey" ADD FOREIGN KEY ("weather_date") REFERENCES "dim_weather" ("weather_date");
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Batch processing on AWS
2 | This project shows one way to perform batch processing using mainly AWS services and a few open-source tools.
3 | 
4 | ## Table of contents
5 | - [Overview](#overview)
6 | - [The Goal](#the-goal)
7 | - [The dataset](#the-dataset)
8 | - [Data modeling](#data-modeling)
9 | - [Tools](#tools)
10 | - [Scalability](#scalability)
11 | - [Running the project](#running-the-project)
12 |   * [1. Requirements](#1-requirements)
13 |   * [2. Clone the repository](#2-clone-the-repository)
14 |   * [3. Run Terraform](#3-run-terraform)
15 |   * [4. Create the Data Warehouse](#4-create-the-data-warehouse)
16 |   * [5. Run Airflow](#5-run-airflow)
17 |   * [6. Run the Airflow DAGs](#6-run-the-airflow-dags)
18 |   * [7. Visualise data on Metabase](#7-visualise-data-on-metabase)
19 | - [Project limitations](#project-limitations)
20 |   * [Manual DAGs triggering](#manual-dags-triggering)
21 | 
22 | 
23 | ## Overview
24 | 
25 | The current work aims to answer business questions concerning bicycle rentals in the city of London from January 2021 to January 2022. To do so, we are going to build a data pipeline which collects data from multiple sources, applies transformations and displays the preprocessed data on a dashboard.
26 | 
27 | The following diagram illustrates the high-level structure of the pipeline, where data flows from different sources to the final visualisation tool.
28 | 
29 | ![The ELT](/images/batch-on-aws.png "High-level structure of the batch pipeline on AWS")
30 | 
31 | 
32 | ## The Goal
33 | The end goal of this project is to preprocess the data on the AWS platform and get useful insights from it. We can learn more from the data by answering the following business questions on the final dashboard.
34 | 
35 | - Which hour of the day has the most active rentals on average?
36 | 
37 | - Which area has the most active bike rentals in London?
38 | 
39 | - Which day of the week is the most active in general?
40 | 
41 | - What is the overall trend for daily rentals over the year?
42 | 
43 | ## The dataset
44 | We are going to process 3 datasets throughout this project.
45 | 
46 | 1. The __Cycling journey__ dataset covers January 2021 to January 2022. It is spread across multiple files on the [Transport for London (TFL)](https://cycling.data.tfl.gov.uk/) website. We will scrape the web page to extract all the relevant links, then download each file. This dataset contains the main features of every cycling journey, including the start/end location of each journey, the timestamps for both departure and arrival, etc.
47 | 
48 | 2. The __Stations__ dataset encompasses the details of every station involved in a journey. This dataset is quite outdated, as it does not include stations added after 2016. To solve this issue, we will add to this old dataset all the new stations we encounter in the journey data. The stations file was found on the [What Do They Know](https://www.whatdotheyknow.com) forum and can be downloaded directly from [here](https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt).
49 | 
50 | 3. The __Weather__ dataset includes daily weather data for the city of London from January 2021 to January 2022.
It was originally retrieved from the [Visual Crossing](https://www.visualcrossing.com/) website and is available to download from [this link](https://drive.google.com/file/d/13LWAH93xxEvOukCnPhrfXH7rZZq_-mss/view?usp=sharing).
51 | 
52 | In total, the cycling journey dataset contains 10,925,928 entries, the stations dataset 808 and the weather dataset 396.
53 | 
54 | ## Data modeling
55 | We are going to build a **Star Schema**, comprising one fact table and multiple dimension tables, for our Data Warehouse.
56 | 
57 | The Entity Relationship Diagram (ERD) for the final Data Warehouse is represented in the following image:
58 | ![The ERD](/images/CyclingERD.png "ERD edited from dbdiagram.io")
59 | 
60 | In the transformation phase, several columns will be removed from both the weather and journey data. We will also add a dimension table, `dim_datetime`, which will contain the reference for all datetime-related columns.
61 | 
62 | The resulting schema will facilitate exploration of the whole dataset in order to answer the relevant business questions about it.
63 | 
64 | ## Tools
65 | 1. **Terraform**: an open-source tool which provides `Infrastructure as Code (IaC)`. It allows us to build and maintain our AWS infrastructure, including `Redshift`, `S3` and an `EC2 instance`. We will not include our `EMR clusters` in Terraform, as they will be created and terminated from `Airflow` when we need them.
66 | 
67 | 2. **Apache Airflow**: an open-source tool to programmatically author, schedule and monitor workflows. The majority of data tasks in the project will be orchestrated and monitored on Airflow.
68 | 
69 | 3. **Selenium** and **BeautifulSoup**: packages which help us perform web scraping. BeautifulSoup cannot scrape a webpage that loads data lazily; this is where Selenium comes into the picture, as it can wait for specific content to load on the page before further processing.
70 | 
71 | 4. **AWS Simple Storage Service (S3)**: provides large-scale storage for us to create a __Data Lake__. We will store all the raw data in this location. The preprocessed data will also be stored in S3 before being loaded into Redshift.
72 | 
73 | 5. **Apache Spark**: open-source software that can efficiently process Big Data on a distributed or parallel system. We will use PySpark (Spark with Python) to transform the raw data and prepare it for the Data Warehouse on Redshift.
74 | 
75 | 6. **AWS Elastic MapReduce (EMR)**: a managed cluster platform for running big data tools such as Spark and Hadoop. We will employ AWS EMR to run our Spark jobs during the transformation phase.
76 | 
77 | 7. **AWS Redshift**: a fully managed and highly scalable data warehouse solution offered by Amazon. We will build our Data Warehouse on Redshift and make the data available to visualisation tools from there.
78 | 
79 | 8. **Metabase**: another open-source tool that allows easy visualisation and analytics of structured data. We will build a dashboard with Metabase to better visualise our data stored in Redshift.
80 | 
81 | 9. **Docker**: a platform which containerises software, allowing it to behave the same way across multiple environments. In this project, we will run Airflow and Metabase on Docker.
82 | 
83 | ## Scalability
84 | It is always good practice to consider scalability scenarios when building a data pipeline, as a significant increase in data volume is to be expected in the future.
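In this project, the main scaling knob is the EMR job flow that the Spark DAGs spin up on demand. As a rough illustration of the horizontal and vertical options described just below, here is a hedged sketch of a scaled-up job-flow configuration in the same style as the `JOB_FLOW_OVERRIDES` dictionaries used by the DAGs (the instance types and counts are illustrative assumptions, not the project's defaults):

```python
# A hedged sketch (not the project's actual config): scaling the EMR job flow
# used by the Spark DAGs. Instance types and counts here are illustrative only.
SCALED_JOB_FLOW_OVERRIDES = {
    'Name': 'JourneyDataTransformer-scaled',
    'ReleaseLabel': 'emr-5.34.0',
    'Applications': [{'Name': 'Spark'}, {'Name': 'Hadoop'}],
    'Instances': {
        'InstanceGroups': [
            {
                'Name': 'Primary node',
                'Market': 'SPOT',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.2xlarge',  # vertical: a larger master node
                'InstanceCount': 1,
            },
            {
                'Name': 'Core nodes',
                'Market': 'SPOT',
                'InstanceRole': 'CORE',
                'InstanceType': 'm5.2xlarge',  # vertical: more CPU/RAM per node
                'InstanceCount': 10,           # horizontal: more nodes than the default 2
            },
        ],
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
```

Because `KeepJobFlowAliveWhenNoSteps` stays `False`, the bigger cluster still only exists for the duration of the Spark step, so scaling up mainly affects the per-run cost rather than leaving a permanently larger footprint.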
85 | 
86 | For instance, if the volume of the data increases 500x or even as much as 1000x, that should not break our pipeline.
87 | 
88 | We would need to scale our EMR cluster nodes either horizontally, or both vertically and horizontally:
89 | 
90 | - Horizontal scaling refers to adding more cluster nodes to process the high-volume data.
91 | 
92 | - Vertical and horizontal scaling means that we increase the performance of the existing nodes and then also add new nodes to the cluster.
93 | 
94 | 
95 | ## Running the project
96 | ### 1. Requirements
97 | In order to run the project smoothly, a few requirements should be met:
98 | - An AWS account with sufficient permissions to access and work on S3, Redshift, and EMR.
99 | To do so:
100 |   * Go to [IAM](https://console.aws.amazon.com/iam/home) in the AWS console.
101 |   * Create a new user.
102 |   * Add permissions to that new user: `AmazonS3FullAccess`, `AmazonRedshiftFullAccess`, `AdministratorAccess`, `AmazonEMRFullAccessPolicy_v2`, `AmazonEMRServicePolicy_v2`, `AmazonEC2FullAccess`.
103 |   * In the "Security credentials" tab, create an access key and download the `.csv` file.
104 | 
105 | - It is also necessary to have the AWS account preconfigured (i.e. having `~/.aws/credentials` and `~/.aws/config` available in your local environment). [This AWS Doc](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-credentials.html) shows the essential steps to set up a local environment with AWS.
106 | 
107 | 
108 | - Docker and Docker Compose, preinstalled in your local environment. Otherwise, they can be installed from [Get Docker](https://docs.docker.com/get-docker/).
109 | 
110 | - Terraform, preinstalled in your local environment. If not, please install it by following the instructions given on the [official download page](https://www.terraform.io/downloads).
111 | 
112 | 
113 | ### 2. Clone the repository
114 | ```bash
115 | git clone https://github.com/HoracioSoldman/batch-processing-on-aws.git
116 | ```
117 | 
118 | ### 3. Run Terraform
119 | We are going to use Terraform to build our AWS infrastructure.
120 | 
121 | From the project root folder, move to the `./terraform` directory:
122 | ```bash
123 | cd terraform
124 | ```
125 | Then run the Terraform commands one by one.
126 | 
127 | - Initialization
128 | ```bash
129 | terraform init
130 | ```
131 | 
132 | - Planning
133 | ```bash
134 | terraform plan
135 | ```
136 | - Applying
137 | ```bash
138 | terraform apply
139 | ```
140 | 
141 | ### 4. Create the Data Warehouse
142 | 
143 | - Go to the [AWS Redshift](https://console.aws.amazon.com/redshiftv2/home) cluster which was freshly created by Terraform.
144 | 
145 | - Connect to your database, then go to `Query Data`.
146 | 
147 | - Manually copy the content of [CyclingERD.sql](/CyclingERD.sql) into the query field and `RUN` the command. This will create the tables and attach constraints to them.
148 | 
149 | 
150 | ### 5. Run Airflow
151 | 
152 | - From the project root folder, move to the `./airflow` directory:
153 | ```bash
154 | cd airflow
155 | ```
156 | - Create environment variables in the `.env` file for our future Docker containers:
157 | ```bash
158 | cp .env.example .env
159 | ```
160 | 
161 | - Fill in the content of the `.env` file.
162 | The value for `AIRFLOW_UID` is obtained from the following command:
163 | ```bash
164 | echo -e "AIRFLOW_UID=$(id -u)"
165 | ```
166 | Then the value for `AIRFLOW_GID` can be left as `0`.
167 | 
168 | - Build our extended Airflow Docker image:
169 | ```bash
170 | docker build -t airflow-img .
171 | ```
172 | If you would prefer another tag, replace `airflow-img` with whatever you like. Then make sure that you also change the image tag in [docker-compose.yaml](/airflow/docker-compose.yaml) at line `48`, i.e. `image: <your-tag>:latest`.
173 | 
174 | This process might take up to 15 minutes or even more, depending on your internet speed. At this stage, Docker also installs several packages defined in [requirements.txt](/airflow/requirements.txt).
175 | 
176 | - Run docker-compose to launch Airflow
177 | 
178 | Initialise Airflow:
179 | ```bash
180 | docker-compose up airflow-init
181 | ```
182 | 
183 | Launch Airflow:
184 | ```bash
185 | docker-compose up
186 | ```
187 | This last command launches the internal `Airflow Postgres` database, the `Airflow Scheduler` and the `Airflow Webserver`, which would otherwise have to be launched separately if we did not use Docker.
188 | 
189 | ### 6. Run the Airflow DAGs
190 | 
191 | Once Airflow is up and running, we can proceed to the most exciting part of the project.
192 | 
193 | The initialisation DAGs (`init_?_*_dag`) are interdependent. In essence, each DAG waits for the successful run of its predecessor before starting its own tasks.
194 | For instance, `init_1_spark_emr_dag` will not start until `init_0_ingestion_to_s3_dag` has completed successfully.
195 | In order to trigger these DAGs, please enable all 4 of them _SIMULTANEOUSLY_.
196 | 
197 | The processor DAGs (`proc_?_*_dag`), on the other hand, need to be started individually.
198 | __It is necessary to wait for the 4 initialisation DAGs to complete before starting the processor ones__.
199 | To run these last 3 DAGs, please enable `proc_0_ingestion_to_s3_dag` and wait for it to finish its tasks before enabling the next DAG: `proc_1_spark_emr_dag`.
200 | Likewise, it is necessary to wait until `proc_1_spark_emr_dag` has finished before enabling the last DAG: `proc_2_s3_to_redshift_dag`.
201 | 
202 | 
203 | The following screenshot shows a successful run of the first DAG.
204 | 
205 | ![Ingestion DAG (init_0_ingestion_to_s3_dag)](/images/dags/init_0.png "Ingestion DAG in the Graph view")
206 | 
207 | 
208 | After all the DAG runs have completed, we can move on to Metabase to visualise the data.
209 | 
210 | ### 7. Visualise data on Metabase
211 | 
212 | Again, we will install and run Metabase in a Docker container:
213 | ```bash
214 | docker run -d -p 3033:3000 --name metabase metabase/metabase
215 | ```
216 | 
217 | On its very first execution, the above command downloads the latest Docker image available for Metabase before exposing the application on port `3033`.
218 | 
219 | Once the command finishes, Metabase should be available at [http://localhost:3033](http://localhost:3033).
220 | 
221 | We can now connect our Redshift database to this platform and visualise the data in multiple charts.
222 | 
223 | The following screenshot displays part of our final dashboard, which clearly shows some useful insights about bicycle rides across different dimensions.
224 | 
225 | ![Final Dashboard](/images/final-dashboard.png "The final dashboard on Metabase")
226 | 
--------------------------------------------------------------------------------
/airflow/.env.example:
--------------------------------------------------------------------------------
1 | # Custom
2 | AIRFLOW_CONN_AWS_DEFAULT="aws://:@"
3 | AIRFLOW_UID=
4 | AIRFLOW_GID=
5 | AWS_DEFAULT_REGION=""
6 | AWS_PROFILE=
7 | S3_BUCKET=
8 | 
9 | AIRFLOW_CONN_REDSHIFT_DEFAULT=
10 | AIRFLOW_CONN_EMR_DEFAULT="aws://:@"
11 | 
12 | # Postgres
13 | POSTGRES_USER=airflow
14 | POSTGRES_PASSWORD=airflow
15 | POSTGRES_DB=airflow
16 | 
17 | # Airflow
18 | AIRFLOW__CORE__EXECUTOR=LocalExecutor
19 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10
20 | 
21 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB}
22 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow
23 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow
24 | 
25 | _AIRFLOW_WWW_USER_CREATE=True
26 | _AIRFLOW_WWW_USER_USERNAME=airflow
27 | _AIRFLOW_WWW_USER_PASSWORD=airflow
28 | 
--------------------------------------------------------------------------------
/airflow/Dockerfile:
--------------------------------------------------------------------------------
1 | # First-time build can take up to 10 mins.
2 | 
3 | FROM apache/airflow:2.2.3
4 | 
5 | ENV AIRFLOW_HOME=/opt/airflow
6 | 
7 | USER root
8 | 
9 | RUN apt-get update -qq \
10 |     && apt-get install firefox-esr -y -qq \
11 |     && apt-get install wget -y -qq
12 | 
13 | COPY requirements.txt .
14 | 
15 | RUN pip install -q -r requirements.txt
16 | 
17 | # workaround to fix selenium module not found
18 | RUN curl -sSLf "https://files.pythonhosted.org/packages/58/76/705b5c776f783d1ba7c630347463d4ae323282bbd859a8e9420c7ff79581/selenium-4.1.0-py3-none-any.whl" > ~/selenium-4.1.0-py3-none-any.whl \
19 |     && chmod +x ~/selenium-4.1.0-py3-none-any.whl \
20 |     && sudo pip install -q ~/selenium-4.1.0-py3-none-any.whl webdriver_manager
21 | 
22 | WORKDIR $AIRFLOW_HOME
23 | 
24 | USER $AIRFLOW_UID
--------------------------------------------------------------------------------
/airflow/README.md:
--------------------------------------------------------------------------------
1 | ## Running DAGs on Airflow
2 | In this project, we are running Airflow in a Docker container.
3 | 
4 | ### Requirements
5 | In order to run Airflow and the pipeline in this project, you need to have:
6 | 
7 | * [Docker](https://www.docker.com) and [docker compose](https://docs.docker.com/compose/install) installed. This can be checked by running
8 | ```bash
9 | docker -v
10 | ```
11 | * An [AWS Account](https://aws.amazon.com/account/) which has access to an [S3 bucket](https://aws.amazon.com/s3/)
12 | * An **AWS_ACCESS_KEY_ID** and an **AWS_SECRET_ACCESS_KEY** associated with the [AWS Account](https://aws.amazon.com/account/).
13 | 
14 | ### Set ENVIRONMENT VARIABLES
15 | Before building the Airflow Docker image, it is necessary to set ENVIRONMENT VARIABLES in a `.env` file.
16 | 
17 | To do so, rename the `.env.example` file located in this directory to `.env`, then add the correct values for your own environment.
18 | 
19 | The `AIRFLOW_UID` value can be obtained and appended to the file with the following command:
20 | ```bash
21 | echo -e "AIRFLOW_UID=$(id -u)" >> .env
22 | ```
23 | Then `AIRFLOW_GID` can be set to `0`.
24 | 
25 | ### Dockerfile and docker-compose.yaml
26 | In the Dockerfile, we download several packages such as:
27 | 1.
[firefox esr](https://www.mozilla.org/en-US/firefox/enterprise/) for web scraping
28 | 
29 | 2. [selenium](https://pypi.org/project/selenium/) for web scraping
30 | 
31 | 3. [webdriver_manager](https://pypi.org/project/webdriver-manager/) for web scraping
32 | 
33 | 4. [apache-airflow-providers-amazon](https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/index.html) to communicate and work with AWS
34 | 
35 | 5. [pyarrow](https://pypi.org/project/pyarrow/) to convert `.csv` to `.parquet` files
36 | 
37 | 6. [bs4](https://pypi.org/project/beautifulsoup4/) for web scraping
38 | 
39 | Also, the `docker-compose.yaml` is a modified version of the original [docker-compose.yaml](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml).
40 | 
41 | 
42 | ### Run Airflow
43 | 
44 | 1. Build the Docker image with the current Dockerfile:
45 | ```bash
46 | docker build -t airflow-img .
47 | ```
48 | This command should be run only once, or again after editing the content of the Dockerfile.
49 | 
50 | 2. Initialize Airflow:
51 | ```bash
52 | docker-compose up airflow-init
53 | ```
54 | This command should terminate with `exit code 0` if everything went well.
55 | 
56 | 3. Launch Airflow:
57 | ```bash
58 | docker-compose up
59 | ```
60 | 
61 | 4. Visit [http://localhost:8080](http://localhost:8080) to access the Airflow GUI.
62 | 
63 | 5. In order to stop the Airflow containers, run:
64 | ```bash
65 | docker-compose down
66 | ```
67 | 
68 | ### Note:
69 | It is highly recommended to manually trigger the `web_scraping_dag` prior to enabling the `s3_ingestion_dag`.
70 | 
71 | In order to do this, open the `web_scraping_dag` on Airflow and click on the **Play** button on the right, then select **Trigger DAG**. All the scraping tasks must be completed before starting the ingestion with the other DAG.
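If you prefer a script over the UI, the same manual trigger can be sent through Airflow's stable REST API, which is enabled by the basic-auth backend configured in `docker-compose.yaml`. A minimal sketch, assuming the webserver on `localhost:8080`, the default `airflow`/`airflow` credentials from `.env.example`, and the `init_3__web_scraping_dag` dag_id declared in `dags/init_3_web_scraping_dag.py`:

```python
# Minimal sketch: trigger the scraping DAG via the Airflow 2.x stable REST API.
# Assumes the webserver from docker-compose on localhost:8080 and the default
# airflow/airflow credentials from .env.example.
import requests

AIRFLOW_API = "http://localhost:8080/api/v1"
DAG_ID = "init_3__web_scraping_dag"  # dag_id as declared in the DAG file

response = requests.post(
    f"{AIRFLOW_API}/dags/{DAG_ID}/dagRuns",
    auth=("airflow", "airflow"),
    json={"conf": {}},  # empty conf -> a plain manual run
)
response.raise_for_status()
print("Triggered run:", response.json()["dag_run_id"])
```

Note that the DAG still has to be unpaused first (via the toggle in the UI, or a `PATCH /dags/{dag_id}` request with `{"is_paused": false}`) before the triggered run will actually be scheduled.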
72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /airflow/dags/init_0_ingestion_to_s3_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | import json 5 | import pandas as pd 6 | 7 | from airflow import DAG 8 | from airflow.utils.dates import days_ago 9 | from airflow.operators.dummy import DummyOperator 10 | from airflow.operators.bash import BashOperator 11 | from airflow.operators.python import PythonOperator 12 | from airflow.utils.task_group import TaskGroup 13 | from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator 14 | 15 | 16 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow") 17 | S3_DESTINATION = "raw/cycling-extras" 18 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 19 | S3_SCRIPT_DESTINATION = "utils/scripts" 20 | download_links= [ 21 | { 22 | 'name': 'stations', 23 | 'link': 'https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt', 24 | 'output': 'stations.csv' 25 | }, 26 | { 27 | 'name': 'weather', 28 | 'link': '--no-check-certificate "https://docs.google.com/uc?export=download&id=13LWAH93xxEvOukCnPhrfXH7rZZq_-mss"', 29 | 'output': 'weather.json' 30 | }, 31 | { 32 | 'name': 'journey', 33 | 'link': 'https://cycling.data.tfl.gov.uk/usage-stats/246JourneyDataExtract30Dec2020-05Jan2021.csv', 34 | 'output': 'journey.csv' 35 | } 36 | ] 37 | local_scripts = [ 'init-data-transformation.py', 'journey-data-transformation.py' ] 38 | 39 | 40 | # extract days value from the weather data 41 | def preprocess_data(filepath): 42 | 43 | filename= filepath.split('/')[-1] 44 | 45 | if filename != 'weather.json': 46 | print(f'No preprocessing needed for {filename}') 47 | return 48 | 49 | with open(filepath, 'r') as f: 50 | weather = json.load(f) 51 | 52 | daily_weather= weather['days'] 53 | 54 | with open(filepath, 'w') as f: 55 | json.dump(daily_weather, f) 56 | 57 | 58 | default_args = { 59 | "owner": "airflow", 60 | "start_date": days_ago(1), 61 | "depends_on_past": False, 62 | "retries": 1, 63 | } 64 | 65 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 66 | with DAG( 67 | dag_id="init_0_ingestion_to_s3_dag", 68 | description=""" 69 | This dag ingests extra files for the cycling journey including: the docking stations, 70 | the weather data and an example file for cycling journey. 
71 | """, 72 | schedule_interval="@once", 73 | default_args=default_args, 74 | catchup=False, 75 | max_active_runs=3, 76 | tags=['weather', 'stations', 'docking stations', 'london', '2021', 'journey'], 77 | ) as dag: 78 | 79 | start = DummyOperator(task_id="start") 80 | 81 | 82 | with TaskGroup(f"Download_files", tooltip="Download - Preprocess") as download_section: 83 | 84 | for index, item in enumerate(download_links): 85 | download_task = BashOperator( 86 | task_id=f"download_{item['name']}_task", 87 | bash_command=f"wget {item['link']} -O {path_to_local_home}/{item['output']}" 88 | ) 89 | 90 | if item['output'] == 'weather.json': 91 | preprocessing_task = PythonOperator( 92 | task_id=f"extract_daily_weather_data", 93 | python_callable=preprocess_data, 94 | provide_context=True, 95 | op_kwargs={ 96 | "filepath": f"{path_to_local_home}/{item['output']}" 97 | } 98 | ) 99 | 100 | download_task >> preprocessing_task 101 | 102 | 103 | 104 | with TaskGroup("upload_files_to_s3") as upload_section: 105 | 106 | for index, item in enumerate(download_links): 107 | 108 | upload_to_s3_task = LocalFilesystemToS3Operator( 109 | task_id=f"upload_{item['name']}_to_s3_task", 110 | filename=item['output'], 111 | dest_key=f"{S3_DESTINATION}/{item['output']}", 112 | dest_bucket=S3_BUCKET, 113 | ) 114 | 115 | 116 | cleanup = BashOperator( 117 | task_id="cleanup_local_storage", 118 | bash_command=f"rm {path_to_local_home}/*.json {path_to_local_home}/*.csv " 119 | ) 120 | 121 | # upload scripts 122 | with TaskGroup("upload_scripts_to_s3") as upload_scripts_section: 123 | for index, item in enumerate(local_scripts): 124 | upload_scripts_to_s3_task = LocalFilesystemToS3Operator( 125 | task_id=f"upload_scritps_{index}_to_s3_task", 126 | filename=f"dags/scripts/{item}", 127 | dest_key=f"{S3_SCRIPT_DESTINATION}/{item}", 128 | dest_bucket=S3_BUCKET, 129 | ) 130 | 131 | end = DummyOperator(task_id="end") 132 | 133 | start >> download_section >> upload_section >> cleanup >> end 134 | start >> upload_scripts_section >> end 135 | 136 | -------------------------------------------------------------------------------- /airflow/dags/init_1_spark_emr_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | from airflow.operators.dummy import DummyOperator 4 | 5 | from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator 6 | from airflow.providers.amazon.aws.operators.emr_create_job_flow import EmrCreateJobFlowOperator 7 | from airflow.providers.amazon.aws.operators.emr_terminate_job_flow import EmrTerminateJobFlowOperator 8 | from airflow.providers.amazon.aws.sensors.emr_step import EmrStepSensor 9 | from airflow.sensors.external_task import ExternalTaskSensor 10 | 11 | 12 | SPARK_STEPS = [ 13 | { 14 | "Name": "One-time data transformation", 15 | "ActionOnFailure": "CANCEL_AND_WAIT", 16 | "HadoopJarStep": { 17 | "Jar": "command-runner.jar", 18 | "Args": [ 19 | "spark-submit", 20 | "--deploy-mode", 21 | "client", 22 | "s3://hrc-de-data/utils/scripts/init-data-transformation.py", 23 | ], 24 | }, 25 | } 26 | ] 27 | 28 | JOB_FLOW_OVERRIDES = { 29 | 'Name': 'ExtrasDataTransformer', 30 | 'ReleaseLabel': 'emr-5.34.0', 31 | 'Applications': [{'Name': 'Spark'}, {'Name': 'Hadoop'}], 32 | 'LogUri': 's3n://hrc-de-data/emr/logs', 33 | 'Instances': { 34 | 'InstanceGroups': [ 35 | { 36 | 'Name': 'Primary node', 37 | 'Market': 'SPOT', 38 | 'InstanceRole': 'MASTER', 39 | 'InstanceType': 'm5.xlarge', 40 | 
'InstanceCount': 1, 41 | } 42 | ], 43 | 'KeepJobFlowAliveWhenNoSteps': False, 44 | 'TerminationProtected': False, 45 | }, 46 | 'Steps': SPARK_STEPS, 47 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 48 | 'ServiceRole': 'EMR_DefaultRole', 49 | } 50 | 51 | 52 | 53 | default_args = { 54 | "owner": "airflow", 55 | "start_date": days_ago(1), 56 | "depends_on_past": False, 57 | "retries": 1, 58 | } 59 | 60 | with DAG( 61 | dag_id="init_1_spark_emr_dag", 62 | description=""" 63 | This dag perform a manually triggered and one-time-running spark jobs which processes extra files in s3. 64 | """, 65 | schedule_interval="@once", 66 | default_args=default_args, 67 | catchup=False, 68 | max_active_runs=1, 69 | tags=['spark', 'emr', 'weather', 'stations', 'docking stations', 'london', '2021', 'journey'], 70 | ) as dag: 71 | 72 | 73 | external_task_sensor = ExternalTaskSensor( 74 | task_id='sensor_for_init_0_ingestion_dag', 75 | poke_interval=30, 76 | soft_fail=False, 77 | retries=2, 78 | allowed_states=['success'], 79 | failed_states=['failed', 'skipped'], 80 | external_task_id='end', 81 | external_dag_id='init_0_ingestion_to_s3_dag', 82 | ) 83 | 84 | start = DummyOperator(task_id="start") 85 | 86 | cluster_creator = EmrCreateJobFlowOperator( 87 | task_id='create_job_flow', 88 | job_flow_overrides=JOB_FLOW_OVERRIDES, 89 | 90 | ) 91 | 92 | step_adder = EmrAddStepsOperator( 93 | task_id='add_steps', 94 | job_flow_id=cluster_creator.output, 95 | steps=SPARK_STEPS, 96 | 97 | ) 98 | 99 | step_checker = EmrStepSensor( 100 | task_id='watch_step', 101 | job_flow_id=cluster_creator.output, 102 | step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", 103 | 104 | ) 105 | 106 | cluster_remover = EmrTerminateJobFlowOperator( 107 | task_id='remove_cluster', job_flow_id=cluster_creator.output, 108 | 109 | ) 110 | 111 | 112 | end = DummyOperator(task_id="end") 113 | 114 | external_task_sensor >> start >> cluster_creator >> step_adder >> step_checker >> cluster_remover >> end -------------------------------------------------------------------------------- /airflow/dags/init_2_s3_to_redshifht_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.utils.task_group import TaskGroup 7 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator 8 | from airflow.sensors.external_task import ExternalTaskSensor 9 | 10 | 11 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 12 | S3_KEY_EXTRAS = f"processed/cycling-dimension" 13 | S3_KEY_JOURNEY = f"processed/test" 14 | 15 | s3_objects= [ 16 | # we will only load the processed weather dimensional data in this dag, 17 | # as the stations and datetime data still receives updates from proc_2_spark_emr_dag.py. 18 | { 19 | 'type': 'weather', 20 | 'key': S3_KEY_EXTRAS, 21 | 'filename': 'weather/', 22 | 'table': 'dim_weather', 23 | 'file_type': 'parquet', 24 | 'upsert_key': 'weather_date' 25 | } 26 | ] 27 | 28 | 29 | default_args = { 30 | "owner": "airflow", 31 | "start_date": days_ago(1), 32 | "depends_on_past": False, 33 | "retries": 1, 34 | } 35 | 36 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 37 | with DAG( 38 | dag_id="init_2_s3_to_redshifht_dag", 39 | description=""" 40 | This dag transfers extra files for dimensions from S3 to Redshift. 
41 | """, 42 | schedule_interval="@once", 43 | default_args=default_args, 44 | catchup=False, 45 | max_active_runs=3, 46 | tags=['weather', 'stations', '2021', 's3 to redshift'], 47 | ) as dag: 48 | 49 | 50 | external_task_sensor = ExternalTaskSensor( 51 | task_id='sensor_for_init_1_spark_dag', 52 | poke_interval=30, 53 | soft_fail=False, 54 | retries=2, 55 | allowed_states=['success'], 56 | failed_states=['failed', 'skipped'], 57 | external_task_id='end', 58 | external_dag_id='init_1_spark_emr_dag', 59 | ) 60 | 61 | start = DummyOperator(task_id="start") 62 | 63 | with TaskGroup("load_files_to_redshift") as transfer_section: 64 | for item in s3_objects: 65 | transfer_task = S3ToRedshiftOperator( 66 | s3_bucket=S3_BUCKET, 67 | s3_key=f"{item['key']}/{item['filename']}", 68 | schema="PUBLIC", 69 | table=item['table'], 70 | copy_options=[item['file_type']], 71 | method='UPSERT', 72 | upsert_keys= [item['upsert_key']], 73 | task_id=f"transfer_{item['type']}_s3_to_redshift", 74 | ) 75 | 76 | end = DummyOperator(task_id="end") 77 | 78 | external_task_sensor >> start >> transfer_section >> end 79 | -------------------------------------------------------------------------------- /airflow/dags/init_3_web_scraping_dag.py: -------------------------------------------------------------------------------- 1 | # imports 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | from airflow.utils.dates import datetime 5 | from airflow.utils.dates import days_ago 6 | 7 | from bs4 import BeautifulSoup 8 | 9 | # selenium will be used to scrap dynamic content of the webpage, our data source of our data 10 | from selenium import webdriver 11 | from webdriver_manager.firefox import GeckoDriverManager 12 | from selenium.webdriver.firefox.options import Options as FirefoxOptions 13 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 14 | 15 | from selenium.webdriver.common.by import By 16 | from selenium.webdriver.support.ui import WebDriverWait 17 | from selenium.webdriver.support import expected_conditions as EC 18 | from airflow.sensors.external_task import ExternalTaskSensor 19 | 20 | import json 21 | 22 | 23 | url= "https://cycling.data.tfl.gov.uk" 24 | dictionary_file= "links_dictionary.json" 25 | 26 | def contents_downloader(**kwargs): 27 | cap = DesiredCapabilities().FIREFOX 28 | cap["marionette"] = False 29 | 30 | options = FirefoxOptions() 31 | options = webdriver.FirefoxOptions() 32 | options.log.level = "TRACE" 33 | options.add_argument('--no-sandbox') 34 | options.add_argument('--headless') 35 | options.add_argument('--disable-gpu') 36 | 37 | browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options) 38 | browser.get(url) 39 | 40 | # wait until at least a single element of the table exists 41 | wait = WebDriverWait(browser, 20) 42 | wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/table/tbody/tr[1]/td[1]'))) 43 | content= browser.page_source 44 | 45 | kwargs['ti'].xcom_push(key='html_content', value=content) 46 | 47 | 48 | def links_extractor(**kwargs): 49 | task_instance= kwargs['ti'] 50 | html_element= task_instance.xcom_pull(key='html_content', task_ids='download_contents_task') 51 | 52 | bsoup= BeautifulSoup(html_element, "html.parser") 53 | 54 | table= bsoup.find('table') 55 | tbody= table.find('tbody') 56 | folder_name= "usage-stats/" 57 | capture_files= False 58 | years= [2021, 2022] 59 | filetype= 'csv' 60 | extracted_files= {} 61 | 62 | for row in 
tbody.find_all('tr'): 63 | columns= row.find_all('td') 64 | 65 | if capture_files == False: 66 | col_values= [col.text.strip() for col in columns] 67 | 68 | if col_values[0] == folder_name: 69 | capture_files= True 70 | continue 71 | 72 | else: 73 | col= columns[0] 74 | filename= col.text.strip() 75 | filename_without_extension= filename.split('.')[-2] 76 | year_in_filename= filename_without_extension[-4:] 77 | 78 | if not year_in_filename.isdigit() or not int(year_in_filename) in years: 79 | continue 80 | 81 | # extract the date (e.g 257JourneyDataExtract17Mar2021-23Mar2021.csv --> 23Mar2021) 82 | 83 | filename_last_date= filename_without_extension.split('-')[-1] 84 | extracted_files[filename_last_date]= col.a['href'] 85 | 86 | kwargs['ti'].xcom_push(key="dictionary", value=extracted_files) 87 | 88 | 89 | def dico_exporter(**kwargs): 90 | task_instance= kwargs['ti'] 91 | links_dictionary= task_instance.xcom_pull(key="dictionary", task_ids="extract_links_task") 92 | 93 | # serialize json 94 | links_json_object = json.dumps(links_dictionary, indent = 4) 95 | 96 | # save into a dico file 97 | with open(dictionary_file, 'w', encoding='utf-8') as f: 98 | f.write(links_json_object) 99 | 100 | 101 | 102 | ''' 103 | TODO: We need to manually trigger this dag for the very first time in order 104 | for the s3_ingestion_dag to have links dictionary to work with. 105 | After the first run, this dag will run every Tuesday at 11:50pm, 106 | only 5 minutes before the ingestion dag runs. 107 | ''' 108 | default_args = { 109 | "owner": "airflow", 110 | "start_date": days_ago(1), 111 | "depends_on_past": False, 112 | "retries": 1 113 | } 114 | 115 | with DAG( 116 | dag_id="init_3__web_scraping_dag", 117 | schedule_interval="@once", 118 | default_args=default_args, 119 | catchup=True, 120 | max_active_runs=1, 121 | tags=['web', 'scraping', 'links', 'source'], 122 | ) as dag: 123 | 124 | 125 | external_task_sensor = ExternalTaskSensor( 126 | task_id='sensor_for_init_2_s3_to_redshift_dag', 127 | poke_interval=30, 128 | soft_fail=False, 129 | retries=2, 130 | allowed_states=['success'], 131 | failed_states=['failed', 'skipped'], 132 | external_task_id='end', 133 | external_dag_id='init_2_s3_to_redshifht_dag', 134 | ) 135 | 136 | download_web_contents_task = PythonOperator( 137 | task_id="download_contents_task", 138 | provide_context=True, 139 | python_callable=contents_downloader 140 | ) 141 | 142 | extract_links_task = PythonOperator( 143 | task_id="extract_links_task", 144 | provide_context=True, 145 | python_callable=links_extractor, 146 | 147 | ) 148 | 149 | export_links_task = PythonOperator( 150 | task_id="exporter_links_task", 151 | provide_context=True, 152 | python_callable=dico_exporter 153 | ) 154 | 155 | 156 | external_task_sensor >> download_web_contents_task >> extract_links_task >> export_links_task -------------------------------------------------------------------------------- /airflow/dags/proc_0_ingestion_to_s3_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | 5 | from airflow import DAG 6 | from airflow.operators.bash import BashOperator 7 | from airflow.operators.python import PythonOperator 8 | from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator 9 | from airflow.utils.dates import datetime 10 | 11 | 12 | import json 13 | 14 | # https://cycling.data.tfl.gov.uk/usage-stats/cycling-load.json 15 | 16 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 17 | 
S3_DESTINATION= 'raw/cycling-journey/{{logical_date.strftime(\'%b%Y\')}}' 18 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 19 | 20 | dictionary_file= "links_dictionary.json" 21 | 22 | 23 | def get_file_link(exec_date, s3_destination_folder, **kwargs): 24 | links= {} 25 | with open(dictionary_file) as dico_file: 26 | links= json.load(dico_file) 27 | 28 | file_link= links[exec_date] 29 | filename= file_link.split('/')[-1] 30 | 31 | kwargs['ti'].xcom_push(key="remote_file_link", value=file_link) 32 | kwargs['ti'].xcom_push(key="filename", value=filename) 33 | kwargs['ti'].xcom_push(key="local_file_link", value=f"{path_to_local_home}/{filename}") 34 | kwargs['ti'].xcom_push(key="s3_filepath_destination", value=f"{s3_destination_folder}/{filename}") 35 | 36 | 37 | download_cmd= "curl -sSLf $link > $destination" 38 | 39 | 40 | default_args = { 41 | "owner": "airflow", 42 | "start_date": datetime(2021, 1, 1), 43 | "depends_on_past": True, # the previous task instance needs to have succeeded for the current one to run 44 | "retries": 1, 45 | } 46 | 47 | with DAG( 48 | dag_id="proc_1_ingestion_to_s3_dag", 49 | schedule_interval="55 23 * * 2", # run this dag every Tuesday at 11:55pm 50 | max_active_runs=3, 51 | catchup=True, 52 | tags=['s3', 'aws', 'ingestion', 'cycling'], 53 | default_args=default_args 54 | ) as dag: 55 | 56 | get_file_link_task = PythonOperator( 57 | task_id="get_file_link_task", 58 | provide_context=True, 59 | python_callable=get_file_link, 60 | op_kwargs={ 61 | "exec_date": "{{execution_date.strftime('%d%b%Y')}}", 62 | "s3_destination_folder": S3_DESTINATION 63 | } 64 | ) 65 | 66 | 67 | download_dataset_task = BashOperator( 68 | task_id="download_dataset_task", 69 | bash_command=download_cmd, 70 | env={ 71 | "link": "{{ti.xcom_pull(key='remote_file_link')}}", 72 | "destination": "{{ti.xcom_pull(key='local_file_link')}}" 73 | } 74 | ) 75 | 76 | 77 | upload_to_s3_task = LocalFilesystemToS3Operator( 78 | task_id="upload_to_s3", 79 | filename="{{ti.xcom_pull(key='filename')}}", 80 | dest_key="{{ti.xcom_pull(key='s3_filepath_destination')}}", 81 | dest_bucket=S3_BUCKET, 82 | ) 83 | 84 | cleanup_local_storage_task = BashOperator( 85 | task_id="cleanup_local_storage_task", 86 | bash_command="rm {{ti.xcom_pull(key='local_file_link')}}" 87 | ) 88 | 89 | get_file_link_task >> download_dataset_task >> upload_to_s3_task >> cleanup_local_storage_task -------------------------------------------------------------------------------- /airflow/dags/proc_1_spark_emr_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.operators.bash import BashOperator 7 | from airflow.operators.python import PythonOperator 8 | from airflow.utils.task_group import TaskGroup 9 | 10 | from airflow import DAG 11 | from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator 12 | from airflow.providers.amazon.aws.operators.emr_create_job_flow import EmrCreateJobFlowOperator 13 | from airflow.providers.amazon.aws.operators.emr_terminate_job_flow import EmrTerminateJobFlowOperator 14 | 15 | from airflow.providers.amazon.aws.sensors.emr_step import EmrStepSensor 16 | 17 | BUCKET_NAME = os.environ.get("S3_BUCKET", "s3_no_bucket") 18 | local_scripts = "dags/scripts" 19 | s3_script = "utils/scripts/" 20 | 21 | 22 | SPARK_STEPS = [ 23 | { 24 | "Name": "Journey data transformation", 25 | 
"ActionOnFailure": "CANCEL_AND_WAIT", 26 | "HadoopJarStep": { 27 | "Jar": "command-runner.jar", 28 | "Args": [ 29 | "spark-submit", 30 | "--deploy-mode", 31 | "client", 32 | "s3://hrc-de-data/utils/scripts/journey-data-transformation.py", 33 | ], 34 | }, 35 | } 36 | ] 37 | 38 | JOB_FLOW_OVERRIDES = { 39 | 'Name': 'ExtrasDataTransformer', 40 | 'ReleaseLabel': 'emr-5.34.0', 41 | 'Applications': [{'Name': 'Spark'}, {'Name': 'Hadoop'}], 42 | 'LogUri': 's3n://hrc-de-data/emr/logs', 43 | 'Instances': { 44 | 'InstanceGroups': [ 45 | { 46 | 'Name': 'Primary node', 47 | 'Market': 'SPOT', 48 | 'InstanceRole': 'MASTER', 49 | 'InstanceType': 'm5.xlarge', 50 | 'InstanceCount': 1, 51 | }, 52 | { 53 | "Name": "Core node", 54 | "Market": "SPOT", 55 | "InstanceRole": "CORE", 56 | "InstanceType": "m5.xlarge", 57 | "InstanceCount": 2, 58 | }, 59 | ], 60 | 'KeepJobFlowAliveWhenNoSteps': False, 61 | 'TerminationProtected': False, 62 | }, 63 | 'Steps': SPARK_STEPS, 64 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 65 | 'ServiceRole': 'EMR_DefaultRole', 66 | } 67 | 68 | 69 | 70 | 71 | default_args = { 72 | "owner": "airflow", 73 | "start_date": days_ago(1), 74 | "depends_on_past": False, 75 | "retries": 1, 76 | } 77 | 78 | with DAG( 79 | dag_id="proc_2_spark_emr_dag", 80 | description=""" 81 | This dag perform a manually triggered spark jobs which processes extra files in s3. 82 | """, 83 | schedule_interval="@once", 84 | default_args=default_args, 85 | catchup=False, 86 | max_active_runs=1, 87 | tags=['spark', 'emr', 'weather', 'stations', 'docking stations', 'london', '2021', 'journey'], 88 | ) as dag: 89 | 90 | start = DummyOperator(task_id="start") 91 | 92 | cluster_creator = EmrCreateJobFlowOperator( 93 | task_id='create_job_flow', 94 | job_flow_overrides=JOB_FLOW_OVERRIDES, 95 | aws_conn_id='aws_default' 96 | ) 97 | 98 | step_adder = EmrAddStepsOperator( 99 | task_id='add_steps', 100 | job_flow_id=cluster_creator.output, 101 | steps=SPARK_STEPS, 102 | params={ 103 | "BUCKET": BUCKET_NAME, 104 | "s3_script": s3_script 105 | }, 106 | aws_conn_id='aws_default' 107 | ) 108 | 109 | step_checker = EmrStepSensor( 110 | task_id='watch_step', 111 | job_flow_id=cluster_creator.output, 112 | step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", 113 | aws_conn_id='aws_default' 114 | ) 115 | 116 | cluster_remover = EmrTerminateJobFlowOperator( 117 | task_id='remove_cluster', job_flow_id=cluster_creator.output, 118 | aws_conn_id='aws_default' 119 | ) 120 | 121 | 122 | end = DummyOperator(task_id="end") 123 | 124 | start >> cluster_creator >> step_adder >> step_checker >> cluster_remover >> end -------------------------------------------------------------------------------- /airflow/dags/proc_2_s3_to_redshifht_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.utils.task_group import TaskGroup 7 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator 8 | 9 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 10 | S3_KEY_DIMS = f"processed/cycling-dimension" 11 | S3_KEY_JOURNEY = f"processed/cycling-fact" 12 | 13 | s3_objects= [ 14 | { 15 | 'type': 'stations', 16 | 'key': S3_KEY_DIMS, 17 | 'filename': 'stations/', 18 | 'table': 'dim_station', 19 | 'file_type': 'parquet', 20 | 'upsert_key': 'station_id' 21 | }, 22 | { 23 | 'type': 'datetime', 24 | 'key': 
S3_KEY_DIMS, 25 | 'filename': 'datetime/', 26 | 'table': 'dim_datetime', 27 | 'file_type': 'parquet', 28 | 'upsert_key': 'datetime_id' 29 | }, 30 | { 31 | 'type': 'journey', 32 | 'key': S3_KEY_JOURNEY, 33 | 'filename': 'journey/', 34 | 'table': 'fact_journey', 35 | 'file_type': 'parquet', 36 | 'upsert_key': 'rental_id' 37 | } 38 | 39 | ] 40 | 41 | 42 | default_args = { 43 | "owner": "airflow", 44 | "start_date": days_ago(1), 45 | "depends_on_past": False, 46 | "retries": 1, 47 | } 48 | 49 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 50 | with DAG( 51 | dag_id="proc_3_s3_to_redshifht_dag", 52 | description=""" 53 | This dag transfers extra files for dimensions from S3 to Redshift. 54 | """, 55 | schedule_interval="@once", 56 | default_args=default_args, 57 | catchup=False, 58 | max_active_runs=3, 59 | tags=['weather', 'stations', '2021', 's3 to redshift'], 60 | ) as dag: 61 | 62 | start = DummyOperator(task_id="start") 63 | 64 | with TaskGroup("load_files_to_redshift") as transfer_section: 65 | for item in s3_objects: 66 | transfer_task = S3ToRedshiftOperator( 67 | s3_bucket=S3_BUCKET, 68 | s3_key=f"{item['key']}/{item['filename']}", 69 | schema="PUBLIC", 70 | table=item['table'], 71 | copy_options=[item['file_type']], 72 | method='UPSERT', 73 | upsert_keys= [item['upsert_key']], 74 | task_id=f"transfer_{item['type']}_s3_to_redshift", 75 | ) 76 | 77 | end = DummyOperator(task_id="end") 78 | 79 | start >> transfer_section >> end 80 | -------------------------------------------------------------------------------- /airflow/dags/scripts/init-data-transformation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # ## One time data transformation 5 | # In this notebook, we are going to transform the stations and weather data in such a way that they will be conformed to the redshift schema for their corresponding tables. 6 | # 7 | # The preprocessed data will be saved back to S3 before getting loaded to Redshift. 8 | 9 | import pyspark 10 | import os 11 | 12 | pyspark.__version__ 13 | 14 | from pyspark.sql import SparkSession 15 | 16 | spark = SparkSession.builder\ 17 | .master('local[*]')\ 18 | .appName('data-transformer')\ 19 | .getOrCreate() 20 | 21 | sc = spark.sparkContext 22 | 23 | df_stations = spark.read.csv("s3a://hrc-de-data/raw/cycling-extras/stations.csv", inferSchema=True, header=True) 24 | df_stations.take(2) 25 | 26 | df_stations.printSchema() 27 | 28 | 29 | from pyspark.sql import functions as F, types as T 30 | 31 | # rename columns 32 | stations= df_stations.withColumnRenamed('Station.Id', 'station_id')\ 33 | .withColumnRenamed('StationName', 'station_name')\ 34 | .withColumnRenamed('easting', 'easting')\ 35 | .withColumnRenamed('northing', 'northing') 36 | 37 | stations.show(5) 38 | 39 | 40 | # count missing values in each column 41 | stations.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in stations.columns]).show() 42 | 43 | stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='overwrite') 44 | 45 | 46 | # ### 2. 
Weather data 47 | 48 | df_weather = spark.read.json("s3a://hrc-de-data/raw/cycling-extras/weather.json") 49 | 50 | df_weather.take(2) 51 | 52 | df_weather.printSchema() 53 | 54 | # drop some columns that we won't need 55 | weather= df_weather.drop('cloudcover', 'conditions', 'datetimeEpoch', 'description', 'dew', 'icon', 56 | 'precipcover', 'preciptype', 'source', 'stations', 'sunriseEpoch', 'sunsetEpoch') 57 | 58 | 59 | # transform datetime 60 | weather= weather.withColumnRenamed('datetime', 'weather_date') 61 | weather= weather.withColumn('weather_date', weather.weather_date.cast(T.DateType())) 62 | 63 | weather.printSchema() 64 | print(len(weather.columns), 'columns') 65 | 66 | 67 | # count missing values in each column 68 | cols= weather.columns 69 | cols.remove('weather_date') 70 | 71 | missing_values= weather.select([F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in cols]) 72 | 73 | missing_values.show() 74 | 75 | 76 | perc_missing_values= weather.select([(F.count(F.when(F.isnan(c) | F.col(c).isNull(), c))/F.count(F.lit(1))).alias(c) for c in cols]) 77 | perc_missing_values.show() 78 | 79 | 80 | # drop columns where missing values are more than 70% 81 | 82 | weather= weather.drop('precipprob', 'snow', 'snowdepth') 83 | 84 | if 'severerisk' in weather.columns: 85 | weather= weather.drop('severerisk') 86 | 87 | 88 | weather.columns 89 | 90 | weather= weather.repartition(10) 91 | 92 | weather.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/weather/', mode='overwrite') 93 | -------------------------------------------------------------------------------- /airflow/dags/scripts/journey-data-transformation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # ## Transformation for rental journey data 5 | # This notebook is responsible for transforming journey data by performing the following tasks: 6 | # 7 | # 1. Renaming columns (removing spaces and lowercasing) 8 | # 9 | # 2. Convert data types from string to timestamps 10 | # 11 | # 3. Attach weather dates 12 | # 13 | # 4. Drop unnecessary columns 14 | # 15 | # 5. 
Update extra files for dimension tables 16 | 17 | import pyspark 18 | import os 19 | 20 | from pyspark.sql import SparkSession 21 | 22 | spark = SparkSession.builder\ 23 | .master('local[*]') \ 24 | .appName('journey-and-stations-data-transformer')\ 25 | .getOrCreate() 26 | 27 | # get journey data 28 | df_journey = spark.read.csv("s3a://hrc-de-data/raw/cycling-journey/*/*", inferSchema=True, header=True) 29 | 30 | df_journey.take(2) 31 | 32 | df_journey.printSchema() 33 | 34 | from pyspark.sql.functions import * 35 | from pyspark.sql.types import * 36 | 37 | # rename columns 38 | df_journey= df_journey.withColumnRenamed('Rental Id', 'rental_id').withColumnRenamed('Bike Id', 'bike_id').withColumnRenamed('Start Date', 'start_date').withColumnRenamed('End Date', 'end_date').withColumnRenamed('StartStation Id', 'start_station').withColumnRenamed('EndStation Id', 'end_station') 39 | 40 | # convert data types 41 | df_journey= df_journey.withColumn('start_date', to_timestamp(col('start_date'), 'dd/MM/yyy HH:mm')) 42 | 43 | df_journey= df_journey.withColumn('end_date', to_timestamp(col('end_date'), 'dd/MM/yyy HH:mm')) 44 | 45 | # add weather_date column 46 | df_journey= df_journey.withColumn('weather_date', to_date(col("start_date"), 'dd/MM/yyy HH:mm')) 47 | 48 | 49 | df_journey.show(5) 50 | df_journey.printSchema() 51 | 52 | 53 | # ### Stations data 54 | # We are going to update the stations data (previously saved by another process) with some additional stations that are not present in the original stations data but are seen in some journey. 55 | 56 | # read previously saved stations data from parquet 57 | df_processed_stations= spark.read.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/') 58 | 59 | # create temporary table for both stations and journey 60 | df_journey.createOrReplaceTempView('journey') 61 | df_processed_stations.createOrReplaceTempView('station') 62 | 63 | 64 | # we keep all the stations which are not found in the temp view station table 65 | additional_stations= spark.sql(''' 66 | with station_ids as ( 67 | select 68 | station_id 69 | from 70 | station 71 | ) 72 | 73 | select 74 | distinct(start_station) as station_id, 75 | `StartStation Name` as station_name 76 | from 77 | journey 78 | where 79 | start_station not in (table station_ids) 80 | 81 | union 82 | 83 | select 84 | distinct(end_station) as station_id, 85 | `EndStation Name` as station_name 86 | from 87 | journey 88 | where 89 | end_station not in (table station_ids) 90 | ''') 91 | additional_stations.show() 92 | 93 | 94 | # add columns to the additional stations to avoid errors when merging it to the previous one (df_processed_stations) 95 | additional_stations= additional_stations.withColumn('longitude', lit(0).cast(DoubleType())).withColumn('latitude', lit(0).cast(DoubleType())).withColumn('easting', lit(0).cast(DoubleType())).withColumn('northing', lit(0).cast(DoubleType())) 96 | 97 | additional_stations.show(5) 98 | additional_stations.printSchema() 99 | 100 | 101 | # remove duplicate values 102 | additional_stations= additional_stations.dropDuplicates(['station_id']) 103 | 104 | 105 | # save additional stations data into parquet files in s3 106 | additional_stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='append') 107 | 108 | 109 | # drop other unnecessary journey columns 110 | df_journey= df_journey.drop('StartStation Name', 'EndStation Name', 'Duration') 111 | 112 | 113 | # ### Datetime 114 | # We are going to create/update datetime data from the start and 
end date of each journey. 115 | 116 | # extract datetime values from the start and the end date 117 | df_datetime_from_start= ( 118 | df_journey.select( 119 | col('start_date').alias('datetime_id'), 120 | year(col('start_date')).alias('year'), 121 | dayofweek(col('start_date')).alias('week_day'), 122 | month(col('start_date')).alias('month'), 123 | dayofmonth(col('start_date')).alias('day'), 124 | hour(col('start_date')).alias('hour'), 125 | minute(col('start_date')).alias('minute'), 126 | second(col('start_date')).alias('second'), 127 | ) 128 | ) 129 | df_datetime_from_end= ( 130 | df_journey.select( 131 | col('end_date').alias('datetime_id'), 132 | year(col('end_date')).alias('year'), 133 | dayofweek(col('end_date')).alias('week_day'), 134 | month(col('end_date')).alias('month'), 135 | dayofmonth(col('end_date')).alias('day'), 136 | hour(col('end_date')).alias('hour'), 137 | minute(col('end_date')).alias('minute'), 138 | second(col('end_date')).alias('second'), 139 | ) 140 | ) 141 | 142 | df_datetime_from_start.show(3) 143 | df_datetime_from_end.show(3) 144 | 145 | 146 | # combine the dataframes 147 | df_datetime= df_datetime_from_start.union(df_datetime_from_end) 148 | 149 | # remove duplicate entries 150 | df_datetime= df_datetime.dropDuplicates(['datetime_id']) 151 | 152 | df_datetime.show(10) 153 | 154 | 155 | # save datetime data into parquet files in s3 156 | df_datetime.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/datetime/', mode='append') 157 | 158 | 159 | # finally, save journey data into parquet files in s3 160 | df_journey.write.parquet('s3a://hrc-de-data/processed/cycling-fact/journey/', mode='append') 161 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.2.3 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 
33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.3} 48 | image: airflow-img:latest # can be replaced by any airflow image that was built from the Dockerfile 49 | env_file: 50 | - .env 51 | # build: . 52 | environment: 53 | &airflow-common-env 54 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 55 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 56 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 57 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 58 | AIRFLOW__CORE__FERNET_KEY: '' 59 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 60 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 61 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 62 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 63 | volumes: 64 | - ./dags:/opt/airflow/dags 65 | - ./logs:/opt/airflow/logs 66 | - ./plugins:/opt/airflow/plugins 67 | - ~/.aws:/home/airflow/.aws 68 | user: "${AIRFLOW_UID:-50000}:0" 69 | depends_on: 70 | &airflow-common-depends-on 71 | postgres: 72 | condition: service_healthy 73 | 74 | services: 75 | postgres: 76 | image: postgres:13 77 | environment: 78 | POSTGRES_USER: airflow 79 | POSTGRES_PASSWORD: airflow 80 | POSTGRES_DB: airflow 81 | volumes: 82 | - postgres-db-volume:/var/lib/postgresql/data 83 | healthcheck: 84 | test: ["CMD", "pg_isready", "-U", "airflow"] 85 | interval: 5s 86 | retries: 5 87 | restart: always 88 | 89 | airflow-webserver: 90 | <<: *airflow-common 91 | command: webserver 92 | ports: 93 | - 8080:8080 94 | healthcheck: 95 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 96 | interval: 10s 97 | timeout: 10s 98 | retries: 5 99 | restart: always 100 | depends_on: 101 | <<: *airflow-common-depends-on 102 | airflow-init: 103 | condition: service_completed_successfully 104 | 105 | airflow-scheduler: 106 | <<: *airflow-common 107 | command: scheduler 108 | healthcheck: 109 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 110 | interval: 10s 111 | timeout: 10s 112 | retries: 5 113 | restart: always 114 | depends_on: 115 | <<: *airflow-common-depends-on 116 | airflow-init: 117 | condition: service_completed_successfully 118 | 119 | 120 | airflow-init: 121 | <<: *airflow-common 122 | entrypoint: /bin/bash 123 | # yamllint disable rule:line-length 124 | command: 125 | - -c 126 | - | 127 | function ver() { 128 | printf "%04d%04d%04d%04d" $${1//./ } 129 | } 130 | airflow_version=$$(gosu airflow airflow version) 131 | airflow_version_comparable=$$(ver $${airflow_version}) 132 | min_airflow_version=2.2.0 133 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 134 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 135 | echo 136 | echo -e "\033[1;31mERROR!!!: Too old Airflow version 
$${airflow_version}!\e[0m" 137 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 138 | echo 139 | exit 1 140 | fi 141 | if [[ -z "${AIRFLOW_UID}" ]]; then 142 | echo 143 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 144 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 145 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 146 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 147 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 148 | echo 149 | fi 150 | one_meg=1048576 151 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 152 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 153 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 154 | warning_resources="false" 155 | if (( mem_available < 4000 )) ; then 156 | echo 157 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 158 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 159 | echo 160 | warning_resources="true" 161 | fi 162 | if (( cpus_available < 2 )); then 163 | echo 164 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 165 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 166 | echo 167 | warning_resources="true" 168 | fi 169 | if (( disk_available < one_meg * 10 )); then 170 | echo 171 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 172 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 173 | echo 174 | warning_resources="true" 175 | fi 176 | if [[ $${warning_resources} == "true" ]]; then 177 | echo 178 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 179 | echo "Please follow the instructions to increase amount of resources available:" 180 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 181 | echo 182 | fi 183 | mkdir -p /sources/logs /sources/dags /sources/plugins 184 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 185 | exec /entrypoint airflow version 186 | # yamllint enable rule:line-length 187 | environment: 188 | <<: *airflow-common-env 189 | _AIRFLOW_DB_UPGRADE: 'true' 190 | _AIRFLOW_WWW_USER_CREATE: 'true' 191 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 192 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 193 | user: "0:0" 194 | volumes: 195 | - .:/sources 196 | 197 | airflow-cli: 198 | <<: *airflow-common 199 | profiles: 200 | - debug 201 | environment: 202 | <<: *airflow-common-env 203 | CONNECTION_CHECK_MAX_COUNT: "0" 204 | # Workaround for entrypoint issue. 
See: https://github.com/apache/airflow/issues/16252 205 | command: 206 | - bash 207 | - -c 208 | - airflow 209 | 210 | volumes: 211 | postgres-db-volume: 212 | -------------------------------------------------------------------------------- /airflow/logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2022-03-11 -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-amazon 2 | bs4 3 | pandas -------------------------------------------------------------------------------- /images/CyclingERD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/CyclingERD.png -------------------------------------------------------------------------------- /images/batch-on-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/batch-on-aws.png -------------------------------------------------------------------------------- /images/dags/init_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_0.png -------------------------------------------------------------------------------- /images/dags/init_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_1.png -------------------------------------------------------------------------------- /images/dags/init_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_2.png -------------------------------------------------------------------------------- /images/dags/init_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_3.png -------------------------------------------------------------------------------- /images/dags/inits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/inits.png -------------------------------------------------------------------------------- /images/dags/proc_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/proc_0.png -------------------------------------------------------------------------------- /images/dags/proc_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/proc_1.png 
-------------------------------------------------------------------------------- /images/dags/proc_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/proc_2.png -------------------------------------------------------------------------------- /images/final-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/final-dashboard.png -------------------------------------------------------------------------------- /images/redshift-metabase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/redshift-metabase.png -------------------------------------------------------------------------------- /metabase/README.md: -------------------------------------------------------------------------------- 1 | ## Running Metabase 2 | In this project, we are using Metabase inside a Docker container. The [official documentation](https://www.metabase.com/docs/latest/operations-guide/running-metabase-on-docker.html) clearly describes a simple way to install Metabase in a container. 3 | 4 | It is as simple as running: 5 | 6 | ```bash 7 | docker run -d -p 3033:3000 --name metabase metabase/metabase 8 | ``` 9 | 10 | On its first run, the above command downloads the latest available Metabase Docker image before exposing the application on port `3033`. 11 | 12 | In Metabase, we can set up a connection to the Redshift database as follows: 13 | 14 | ![Metabase - Redshift connection](/images/redshift-metabase.png "Connecting Metabase to Redshift") 15 | 16 | Note: replace the database credentials with the values relevant to your environment. 17 | 18 | -------------------------------------------------------------------------------- /notebook/data-exploration/Exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exploration of the dataset\n", 8 | "The TFL website contains **Santander cycling data** structured in different directories.\n", 9 | "In this notebook, we are going to read a single file as an example of the cycling journey data.\n", 10 | "\n", 11 | "Additionally, we will read the **docking stations data**, which was found outside the main TFL website. \n", 12 | "The stations data contains the list of departure and destination stations mentioned in each cycling journey.\n", 13 | "\n", 14 | "Our third dataset consists of the **historical weather data** in London for the year 2021. The data is recorded daily, with 36 weather attributes per day. It was originally retrieved from the www.visualcrossing.com website, then stored in Google Drive for easy access."
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# import packages\n", 24 | "import pandas as pd\n", 25 | "import json" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Cycling journey data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "--2022-02-21 09:24:34-- https://cycling.data.tfl.gov.uk/usage-stats/252JourneyDataExtract10Feb2021-16Feb2021.csv\n", 45 | "Resolving cycling.data.tfl.gov.uk (cycling.data.tfl.gov.uk)... 54.230.115.31, 54.230.115.80, 54.230.115.35, ...\n", 46 | "Connecting to cycling.data.tfl.gov.uk (cycling.data.tfl.gov.uk)|54.230.115.31|:443... connected.\n", 47 | "HTTP request sent, awaiting response... 200 OK\n", 48 | "Length: 11036049 (11M) [text/csv]\n", 49 | "Saving to: ‘journey10Feb2021-16Feb2021.csv’\n", 50 | "\n", 51 | "journey10Feb2021-16 100%[===================>] 10.52M 2.59MB/s in 4.1s \n", 52 | "\n", 53 | "2022-02-21 09:24:40 (2.59 MB/s) - ‘journey10Feb2021-16Feb2021.csv’ saved [11036049/11036049]\n", 54 | "\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "# download an example file\n", 60 | "!wget https://cycling.data.tfl.gov.uk/usage-stats/252JourneyDataExtract10Feb2021-16Feb2021.csv -O journey10Feb2021-16Feb2021.csv" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 12, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
Rental IdDurationBike IdEnd DateEndStation IdEndStation NameStart DateStartStation IdStartStation Name
010540128533601749715/02/2021 20:55785Aquatic Centre, Queen Elizabeth Olympic Park15/02/2021 19:59785Aquatic Centre, Queen Elizabeth Olympic Park
11053222261020467710/02/2021 08:03194Hop Exchange, The Borough10/02/2021 07:4614Belgrove Street , King's Cross
21053518464801804612/02/2021 15:2627Bouverie Street, Temple12/02/2021 15:18196Union Street, The Borough
31053242291801978510/02/2021 10:46195Milroy Walk, South Bank10/02/2021 10:43196Union Street, The Borough
41053506967201424312/02/2021 14:17274Warwick Road, Olympia12/02/2021 14:05219Bramham Gardens, Earl's Court
\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " Rental Id Duration Bike Id End Date EndStation Id \\\n", 167 | "0 105401285 3360 17497 15/02/2021 20:55 785 \n", 168 | "1 105322226 1020 4677 10/02/2021 08:03 194 \n", 169 | "2 105351846 480 18046 12/02/2021 15:26 27 \n", 170 | "3 105324229 180 19785 10/02/2021 10:46 195 \n", 171 | "4 105350696 720 14243 12/02/2021 14:17 274 \n", 172 | "\n", 173 | " EndStation Name Start Date \\\n", 174 | "0 Aquatic Centre, Queen Elizabeth Olympic Park 15/02/2021 19:59 \n", 175 | "1 Hop Exchange, The Borough 10/02/2021 07:46 \n", 176 | "2 Bouverie Street, Temple 12/02/2021 15:18 \n", 177 | "3 Milroy Walk, South Bank 10/02/2021 10:43 \n", 178 | "4 Warwick Road, Olympia 12/02/2021 14:05 \n", 179 | "\n", 180 | " StartStation Id StartStation Name \n", 181 | "0 785 Aquatic Centre, Queen Elizabeth Olympic Park \n", 182 | "1 14 Belgrove Street , King's Cross \n", 183 | "2 196 Union Street, The Borough \n", 184 | "3 196 Union Street, The Borough \n", 185 | "4 219 Bramham Gardens, Earl's Court " 186 | ] 187 | }, 188 | "execution_count": 12, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "df= pd.read_csv('journey10Feb2021-16Feb2021.csv')\n", 195 | "df.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 13, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "(89405, 9)" 207 | ] 208 | }, 209 | "execution_count": 13, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "df.shape" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 14, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "CREATE TABLE \"journey_staging\" (\n", 228 | "\"Rental Id\" INTEGER,\n", 229 | " \"Duration\" INTEGER,\n", 230 | " \"Bike Id\" INTEGER,\n", 231 | " \"End Date\" TEXT,\n", 232 | " \"EndStation Id\" INTEGER,\n", 233 | " \"EndStation Name\" TEXT,\n", 234 | " \"Start Date\" TEXT,\n", 235 | " \"StartStation Id\" INTEGER,\n", 236 | " \"StartStation Name\" TEXT,\n", 237 | " CONSTRAINT journey_staging_pk PRIMARY KEY (\"Rental Id\")\n", 238 | ")\n" 239 | ] 240 | }, 241 | { 242 | "name": "stderr", 243 | "output_type": "stream", 244 | "text": [ 245 | "/tmp/ipykernel_234257/265231394.py:2: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores.\n", 246 | " journey_table= pd.io.sql.get_schema(frame=df, name='journey_staging', keys='Rental Id')\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# infer a sql table schema for journey data\n", 252 | "journey_table= pd.io.sql.get_schema(frame=df, name='journey_staging', keys='Rental Id')\n", 253 | "print(journey_table)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Docking stations" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "--2022-02-21 09:41:08-- https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt\n", 273 | "Resolving www.whatdotheyknow.com (www.whatdotheyknow.com)... 46.43.39.108\n", 274 | "Connecting to www.whatdotheyknow.com (www.whatdotheyknow.com)|46.43.39.108|:443... 
connected.\n", 275 | "HTTP request sent, awaiting response... 200 OK\n", 276 | "Length: unspecified [text/plain]\n", 277 | "Saving to: ‘stations.csv’\n", 278 | "\n", 279 | "stations.csv [ <=> ] 57.09K 97.3KB/s in 0.6s \n", 280 | "\n", 281 | "2022-02-21 09:41:11 (97.3 KB/s) - ‘stations.csv’ saved [58461]\n", 282 | "\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "!wget https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt -O stations.csv" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 16, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 312 | "\n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | "
Station.IdStationNamelongitudelatitudeEastingNorthing
01River Street, Clerkenwell-0.10997151.5292531202.520182832.020
12Phillimore Gardens, Kensington-0.19757451.4996525207.070179391.860
23Christopher Street, Liverpool Street-0.08460651.5213532984.810182001.530
34St. Chad's Street, King's Cross-0.12097451.5301530436.760182911.990
45Sedding Street, Sloane Square-0.15687651.4931528051.649178742.097
\n", 372 | "
" 373 | ], 374 | "text/plain": [ 375 | " Station.Id StationName longitude latitude \\\n", 376 | "0 1 River Street, Clerkenwell -0.109971 51.5292 \n", 377 | "1 2 Phillimore Gardens, Kensington -0.197574 51.4996 \n", 378 | "2 3 Christopher Street, Liverpool Street -0.084606 51.5213 \n", 379 | "3 4 St. Chad's Street, King's Cross -0.120974 51.5301 \n", 380 | "4 5 Sedding Street, Sloane Square -0.156876 51.4931 \n", 381 | "\n", 382 | " Easting Northing \n", 383 | "0 531202.520 182832.020 \n", 384 | "1 525207.070 179391.860 \n", 385 | "2 532984.810 182001.530 \n", 386 | "3 530436.760 182911.990 \n", 387 | "4 528051.649 178742.097 " 388 | ] 389 | }, 390 | "execution_count": 16, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "df_stations= pd.read_csv('stations.csv')\n", 397 | "df_stations.head()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 19, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "CREATE TABLE \"stations_staging\" (\n", 410 | "\"Station.Id\" INTEGER,\n", 411 | " \"StationName\" TEXT,\n", 412 | " \"longitude\" REAL,\n", 413 | " \"latitude\" REAL,\n", 414 | " \"Easting\" REAL,\n", 415 | " \"Northing\" REAL,\n", 416 | " CONSTRAINT stations_staging_pk PRIMARY KEY (\"Station.Id\")\n", 417 | ")\n" 418 | ] 419 | } 420 | ], 421 | "source": [ 422 | "# infer a sql table schema for stations data\n", 423 | "stations_table= pd.io.sql.get_schema(frame=df_stations, name='stations_staging', keys='Station.Id')\n", 424 | "print(stations_table)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "### Historical weather data in 2021" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 20, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "name": "stdout", 441 | "output_type": "stream", 442 | "text": [ 443 | "--2022-02-21 09:42:22-- https://docs.google.com/uc?export=download&id=1Aa2mP5CwLele94GkJWqvpCmlm6GXeu8c\n", 444 | "Resolving docs.google.com (docs.google.com)... 216.58.223.78, 2a00:1450:401a:804::200e\n", 445 | "Connecting to docs.google.com (docs.google.com)|216.58.223.78|:443... connected.\n", 446 | "HTTP request sent, awaiting response... 303 See Other\n", 447 | "Location: https://doc-0s-2g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/oh5pmqielfjkamj3j5h4htd9undhtio4/1645425675000/00305885236840532660/*/1Aa2mP5CwLele94GkJWqvpCmlm6GXeu8c?e=download [following]\n", 448 | "Warning: wildcards not supported in HTTP.\n", 449 | "--2022-02-21 09:42:25-- https://doc-0s-2g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/oh5pmqielfjkamj3j5h4htd9undhtio4/1645425675000/00305885236840532660/*/1Aa2mP5CwLele94GkJWqvpCmlm6GXeu8c?e=download\n", 450 | "Resolving doc-0s-2g-docs.googleusercontent.com (doc-0s-2g-docs.googleusercontent.com)... 172.217.170.161, 2a00:1450:401a:800::2001\n", 451 | "Connecting to doc-0s-2g-docs.googleusercontent.com (doc-0s-2g-docs.googleusercontent.com)|172.217.170.161|:443... connected.\n", 452 | "HTTP request sent, awaiting response... 
200 OK\n", 453 | "Length: 379443 (371K) [application/json]\n", 454 | "Saving to: ‘weather-2021.json’\n", 455 | "\n", 456 | "weather-2021.json 100%[===================>] 370.55K 507KB/s in 0.7s \n", 457 | "\n", 458 | "2022-02-21 09:42:27 (507 KB/s) - ‘weather-2021.json’ saved [379443/379443]\n", 459 | "\n" 460 | ] 461 | } 462 | ], 463 | "source": [ 464 | "!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13LWAH93xxEvOukCnPhrfXH7rZZq_-mss' -O weather-2021.json" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 21, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "{\n", 477 | " \"latitude\" : 51.5064,\n", 478 | " \"longitude\" : -0.12721,\n", 479 | " \"resolvedAddress\" : \"London, England, United Kingdom\",\n", 480 | " \"address\" : \"London,UK\",\n", 481 | " \"timezone\" : \"Europe/London\",\n", 482 | " \"tzoffset\" : 0.0,\n", 483 | " \"name\" : \"London,UK\",\n", 484 | " \"days\" : [ {\n", 485 | " \"datetime\" : \"2021-01-01\",\n", 486 | " \"datetimeEpoch\" : 1609459200,\n", 487 | " \"tempmax\" : 5.0,\n", 488 | " \"tempmin\" : -0.5,\n", 489 | " \"temp\" : 2.1,\n", 490 | " \"feelslikemax\" : 2.9,\n", 491 | " \"feelslikemin\" : -3.6,\n", 492 | " \"feelslike\" : -0.2,\n", 493 | " \"dew\" : 0.8,\n", 494 | " \"humidity\" : 91.03,\n", 495 | " \"precip\" : 0.22,\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "!head -n 20 weather-2021.json" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 22, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/html": [ 511 | "
\n", 512 | "\n", 525 | "\n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | "
datetimedatetimeEpochtempmaxtempmintempfeelslikemaxfeelslikeminfeelslikedewhumidity...sunriseEpochsunsetsunsetEpochmoonphaseconditionsdescriptioniconstationssourcetzoffset
02021-01-0116094592005.0-0.52.12.9-3.6-0.20.891.03...160948837416:02:2216095169420.53RainClear conditions throughout the day with late ...rain[03769099999, 03680099999, D5621, 03672099999,...obsNaN
12021-01-0216095456005.11.53.83.1-1.51.51.082.51...160957476516:03:2816096034080.56RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
22021-01-0316096320006.01.13.85.6-2.50.91.786.02...160966115416:04:3616096898760.60RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
32021-01-0416097184005.63.54.34.1-0.70.51.481.43...160974753816:05:4616097763460.65RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
42021-01-0516098048004.62.53.70.8-1.8-0.41.082.39...160983392016:06:5916098628190.70RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
\n", 675 | "

5 rows × 36 columns

\n", 676 | "
" 677 | ], 678 | "text/plain": [ 679 | " datetime datetimeEpoch tempmax tempmin temp feelslikemax \\\n", 680 | "0 2021-01-01 1609459200 5.0 -0.5 2.1 2.9 \n", 681 | "1 2021-01-02 1609545600 5.1 1.5 3.8 3.1 \n", 682 | "2 2021-01-03 1609632000 6.0 1.1 3.8 5.6 \n", 683 | "3 2021-01-04 1609718400 5.6 3.5 4.3 4.1 \n", 684 | "4 2021-01-05 1609804800 4.6 2.5 3.7 0.8 \n", 685 | "\n", 686 | " feelslikemin feelslike dew humidity ... sunriseEpoch sunset \\\n", 687 | "0 -3.6 -0.2 0.8 91.03 ... 1609488374 16:02:22 \n", 688 | "1 -1.5 1.5 1.0 82.51 ... 1609574765 16:03:28 \n", 689 | "2 -2.5 0.9 1.7 86.02 ... 1609661154 16:04:36 \n", 690 | "3 -0.7 0.5 1.4 81.43 ... 1609747538 16:05:46 \n", 691 | "4 -1.8 -0.4 1.0 82.39 ... 1609833920 16:06:59 \n", 692 | "\n", 693 | " sunsetEpoch moonphase conditions \\\n", 694 | "0 1609516942 0.53 Rain \n", 695 | "1 1609603408 0.56 Rain \n", 696 | "2 1609689876 0.60 Rain \n", 697 | "3 1609776346 0.65 Rain \n", 698 | "4 1609862819 0.70 Rain \n", 699 | "\n", 700 | " description icon \\\n", 701 | "0 Clear conditions throughout the day with late ... rain \n", 702 | "1 Clear conditions throughout the day with rain. rain \n", 703 | "2 Clear conditions throughout the day with rain. rain \n", 704 | "3 Clear conditions throughout the day with rain. rain \n", 705 | "4 Clear conditions throughout the day with rain. rain \n", 706 | "\n", 707 | " stations source tzoffset \n", 708 | "0 [03769099999, 03680099999, D5621, 03672099999,... obs NaN \n", 709 | "1 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 710 | "2 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 711 | "3 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 712 | "4 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 713 | "\n", 714 | "[5 rows x 36 columns]" 715 | ] 716 | }, 717 | "execution_count": 22, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "# we will only extract the day items\n", 724 | "with open('weather-2021.json', 'r') as f:\n", 725 | " weather = json.load(f)\n", 726 | "\n", 727 | "df_weather = pd.DataFrame.from_dict(weather[\"days\"])\n", 728 | "df_weather.head()" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 24, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "\n", 741 | "RangeIndex: 365 entries, 0 to 364\n", 742 | "Data columns (total 36 columns):\n", 743 | " # Column Non-Null Count Dtype \n", 744 | "--- ------ -------------- ----- \n", 745 | " 0 datetime 365 non-null object \n", 746 | " 1 datetimeEpoch 365 non-null int64 \n", 747 | " 2 tempmax 365 non-null float64\n", 748 | " 3 tempmin 365 non-null float64\n", 749 | " 4 temp 365 non-null float64\n", 750 | " 5 feelslikemax 365 non-null float64\n", 751 | " 6 feelslikemin 365 non-null float64\n", 752 | " 7 feelslike 365 non-null float64\n", 753 | " 8 dew 365 non-null float64\n", 754 | " 9 humidity 365 non-null float64\n", 755 | " 10 precip 365 non-null float64\n", 756 | " 11 precipprob 0 non-null object \n", 757 | " 12 precipcover 365 non-null float64\n", 758 | " 13 preciptype 0 non-null object \n", 759 | " 14 snow 0 non-null object \n", 760 | " 15 snowdepth 9 non-null float64\n", 761 | " 16 windgust 139 non-null float64\n", 762 | " 17 windspeed 365 non-null float64\n", 763 | " 18 winddir 365 non-null float64\n", 764 | " 19 pressure 364 non-null float64\n", 765 | " 20 cloudcover 365 non-null float64\n", 766 | " 21 visibility 365 non-null float64\n", 
767 | " 22 solarradiation 365 non-null float64\n", 768 | " 23 solarenergy 365 non-null float64\n", 769 | " 24 uvindex 365 non-null float64\n", 770 | " 25 sunrise 365 non-null object \n", 771 | " 26 sunriseEpoch 365 non-null int64 \n", 772 | " 27 sunset 365 non-null object \n", 773 | " 28 sunsetEpoch 365 non-null int64 \n", 774 | " 29 moonphase 365 non-null float64\n", 775 | " 30 conditions 365 non-null object \n", 776 | " 31 description 365 non-null object \n", 777 | " 32 icon 365 non-null object \n", 778 | " 33 stations 365 non-null object \n", 779 | " 34 source 365 non-null object \n", 780 | " 35 tzoffset 217 non-null float64\n", 781 | "dtypes: float64(22), int64(3), object(11)\n", 782 | "memory usage: 102.8+ KB\n" 783 | ] 784 | } 785 | ], 786 | "source": [ 787 | "df_weather.info()" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 25, 793 | "metadata": {}, 794 | "outputs": [ 795 | { 796 | "name": "stdout", 797 | "output_type": "stream", 798 | "text": [ 799 | "Columns: Index(['datetime', 'datetimeEpoch', 'tempmax', 'tempmin', 'temp',\n", 800 | " 'feelslikemax', 'feelslikemin', 'feelslike', 'dew', 'humidity',\n", 801 | " 'precip', 'precipprob', 'precipcover', 'preciptype', 'snow',\n", 802 | " 'snowdepth', 'windgust', 'windspeed', 'winddir', 'pressure',\n", 803 | " 'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex',\n", 804 | " 'sunrise', 'sunriseEpoch', 'sunset', 'sunsetEpoch', 'moonphase',\n", 805 | " 'conditions', 'description', 'icon', 'stations', 'source', 'tzoffset'],\n", 806 | " dtype='object') \n", 807 | "Shape: (365, 36)\n" 808 | ] 809 | } 810 | ], 811 | "source": [ 812 | "print('Columns: ', df_weather.columns, '\\nShape: ', df_weather.shape)" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 26, 818 | "metadata": {}, 819 | "outputs": [ 820 | { 821 | "name": "stdout", 822 | "output_type": "stream", 823 | "text": [ 824 | "CREATE TABLE \"weather_staging\" (\n", 825 | "\"datetime\" TEXT,\n", 826 | " \"datetimeEpoch\" INTEGER,\n", 827 | " \"tempmax\" REAL,\n", 828 | " \"tempmin\" REAL,\n", 829 | " \"temp\" REAL,\n", 830 | " \"feelslikemax\" REAL,\n", 831 | " \"feelslikemin\" REAL,\n", 832 | " \"feelslike\" REAL,\n", 833 | " \"dew\" REAL,\n", 834 | " \"humidity\" REAL,\n", 835 | " \"precip\" REAL,\n", 836 | " \"precipprob\" TEXT,\n", 837 | " \"precipcover\" REAL,\n", 838 | " \"preciptype\" TEXT,\n", 839 | " \"snow\" TEXT,\n", 840 | " \"snowdepth\" REAL,\n", 841 | " \"windgust\" REAL,\n", 842 | " \"windspeed\" REAL,\n", 843 | " \"winddir\" REAL,\n", 844 | " \"pressure\" REAL,\n", 845 | " \"cloudcover\" REAL,\n", 846 | " \"visibility\" REAL,\n", 847 | " \"solarradiation\" REAL,\n", 848 | " \"solarenergy\" REAL,\n", 849 | " \"uvindex\" REAL,\n", 850 | " \"sunrise\" TEXT,\n", 851 | " \"sunriseEpoch\" INTEGER,\n", 852 | " \"sunset\" TEXT,\n", 853 | " \"sunsetEpoch\" INTEGER,\n", 854 | " \"moonphase\" REAL,\n", 855 | " \"conditions\" TEXT,\n", 856 | " \"description\" TEXT,\n", 857 | " \"icon\" TEXT,\n", 858 | " \"stations\" TEXT,\n", 859 | " \"source\" TEXT,\n", 860 | " \"tzoffset\" REAL,\n", 861 | " CONSTRAINT weather_staging_pk PRIMARY KEY (\"datetime\")\n", 862 | ")\n" 863 | ] 864 | } 865 | ], 866 | "source": [ 867 | "# infer a sql table schema for weather data\n", 868 | "weather_table= pd.io.sql.get_schema(frame=df_weather, name='weather_staging', keys='datetime')\n", 869 | "print(weather_table)" 870 | ] 871 | } 872 | ], 873 | "metadata": { 874 | "interpreter": { 875 | "hash": 
"916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 876 | }, 877 | "kernelspec": { 878 | "display_name": "Python 3.8.10 64-bit", 879 | "language": "python", 880 | "name": "python3" 881 | }, 882 | "language_info": { 883 | "codemirror_mode": { 884 | "name": "ipython", 885 | "version": 3 886 | }, 887 | "file_extension": ".py", 888 | "mimetype": "text/x-python", 889 | "name": "python", 890 | "nbconvert_exporter": "python", 891 | "pygments_lexer": "ipython3", 892 | "version": "3.8.10" 893 | }, 894 | "orig_nbformat": 4 895 | }, 896 | "nbformat": 4, 897 | "nbformat_minor": 2 898 | } 899 | -------------------------------------------------------------------------------- /notebook/data-exploration/Scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Web Scrapping\n", 8 | "Scraping the TFL website: (https://cycling.data.tfl.gov.uk) in order to get the links of the files we are interested in." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 7, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "/usr/lib/python3/dist-packages/secretstorage/dhcrypto.py:15: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 21 | " from cryptography.utils import int_from_bytes\n", 22 | "/usr/lib/python3/dist-packages/secretstorage/util.py:19: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 23 | " from cryptography.utils import int_from_bytes\n", 24 | "Defaulting to user installation because normal site-packages is not writeable\n", 25 | "Requirement already satisfied: selenium in /home/hrc/.local/lib/python3.8/site-packages (4.1.0)\n", 26 | "Requirement already satisfied: trio-websocket~=0.9 in /home/hrc/.local/lib/python3.8/site-packages (from selenium) (0.9.2)\n", 27 | "Requirement already satisfied: trio~=0.17 in /home/hrc/.local/lib/python3.8/site-packages (from selenium) (0.19.0)\n", 28 | "Requirement already satisfied: urllib3[secure]~=1.26 in /home/hrc/.local/lib/python3.8/site-packages (from selenium) (1.26.7)\n", 29 | "Requirement already satisfied: async-generator>=1.9 in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (1.10)\n", 30 | "Requirement already satisfied: sniffio in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (1.2.0)\n", 31 | "Requirement already satisfied: outcome in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (1.1.0)\n", 32 | "Requirement already satisfied: idna in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (3.3)\n", 33 | "Requirement already satisfied: attrs>=19.2.0 in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (20.3.0)\n", 34 | "Requirement already satisfied: sortedcontainers in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (2.4.0)\n", 35 | "Requirement already satisfied: wsproto>=0.14 in /home/hrc/.local/lib/python3.8/site-packages (from trio-websocket~=0.9->selenium) (1.0.0)\n", 36 | "Requirement already satisfied: cryptography>=1.3.4 in /home/hrc/.local/lib/python3.8/site-packages (from urllib3[secure]~=1.26->selenium) (36.0.1)\n", 37 | "Requirement already satisfied: certifi in /home/hrc/.local/lib/python3.8/site-packages (from urllib3[secure]~=1.26->selenium) (2021.10.8)\n", 38 | "Requirement 
already satisfied: pyOpenSSL>=0.14 in /home/hrc/.local/lib/python3.8/site-packages (from urllib3[secure]~=1.26->selenium) (22.0.0)\n", 39 | "Requirement already satisfied: cffi>=1.12 in /home/hrc/.local/lib/python3.8/site-packages (from cryptography>=1.3.4->urllib3[secure]~=1.26->selenium) (1.15.0)\n", 40 | "Requirement already satisfied: h11<1,>=0.9.0 in /home/hrc/.local/lib/python3.8/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.12.0)\n", 41 | "Requirement already satisfied: pycparser in /home/hrc/.local/lib/python3.8/site-packages (from cffi>=1.12->cryptography>=1.3.4->urllib3[secure]~=1.26->selenium) (2.21)\n", 42 | "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.\n", 43 | "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", 44 | "/usr/lib/python3/dist-packages/secretstorage/dhcrypto.py:15: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 45 | " from cryptography.utils import int_from_bytes\n", 46 | "/usr/lib/python3/dist-packages/secretstorage/util.py:19: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 47 | " from cryptography.utils import int_from_bytes\n", 48 | "Defaulting to user installation because normal site-packages is not writeable\n", 49 | "Requirement already satisfied: bs4 in /home/hrc/.local/lib/python3.8/site-packages (0.0.1)\n", 50 | "Requirement already satisfied: beautifulsoup4 in /home/hrc/.local/lib/python3.8/site-packages (from bs4) (4.10.0)\n", 51 | "Requirement already satisfied: soupsieve>1.2 in /home/hrc/.local/lib/python3.8/site-packages (from beautifulsoup4->bs4) (2.3.1)\n", 52 | "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.\n", 53 | "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# install dependencies\n", 59 | "!pip install selenium\n", 60 | "!pip install bs4" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 8, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# imports\n", 70 | "from bs4 import BeautifulSoup\n", 71 | "\n", 72 | "# selenium will be used to scrap dynamic content of the webpage source of our data\n", 73 | "from selenium import webdriver\n", 74 | "from webdriver_manager.firefox import GeckoDriverManager\n", 75 | "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n", 76 | "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n", 77 | "\n", 78 | "from selenium.webdriver.common.by import By\n", 79 | "from selenium.webdriver.support.ui import WebDriverWait\n", 80 | "from selenium.webdriver.support import expected_conditions as EC" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 9, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "\n", 93 | "\n", 94 | "====== WebDriver manager ======\n", 95 | "Current firefox version is 96.0\n", 96 | "Get LATEST geckodriver version for 96.0 firefox\n", 97 | "Getting latest mozilla release info for v0.30.0\n", 98 | "Trying to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz\n", 99 | "Driver has been saved in cache [/home/hrc/.wdm/drivers/geckodriver/linux64/v0.30.0]\n", 100 | 
"/tmp/ipykernel_373988/1338734867.py:11: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n", 101 | " browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options)\n", 102 | "/tmp/ipykernel_373988/1338734867.py:11: DeprecationWarning: capabilities and desired_capabilities have been deprecated, please pass in a Service object\n", 103 | " browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options)\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "# get the webpage contents in html format\n", 109 | "\n", 110 | "url= \"https://cycling.data.tfl.gov.uk\"\n", 111 | "\n", 112 | "cap = DesiredCapabilities().FIREFOX\n", 113 | "cap[\"marionette\"] = False\n", 114 | "\n", 115 | "options = FirefoxOptions()\n", 116 | "options.add_argument(\"--headless\")\n", 117 | "\n", 118 | "browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options)\n", 119 | "browser.get(url)\n", 120 | "\n", 121 | "# wait until at least a single element of the table exists\n", 122 | "wait = WebDriverWait(browser, 20)\n", 123 | "html = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/table/tbody/tr[1]/td[1]')))\n", 124 | "\n", 125 | "html_element= browser.page_source\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# scrap the html contents\n", 135 | "bsoup= BeautifulSoup(html_element, \"html.parser\")" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 11, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Display 2 items in the dictionary\n", 148 | "{'05Jan2021': 'https://cycling.data.tfl.gov.uk/usage-stats/246JourneyDataExtract30Dec2020-05Jan2021.csv', '12Jan2021': 'https://cycling.data.tfl.gov.uk/usage-stats/247JourneyDataExtract06Jan2021-12Jan2021.csv'}\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# find the relevant files with their links\n", 154 | "table= bsoup.find('table')\n", 155 | "tbody= table.find('tbody')\n", 156 | "folder_name= \"usage-stats/\"\n", 157 | "capture_files= False\n", 158 | "year= 2021\n", 159 | "filetype= 'csv'\n", 160 | "extracted_files= {}\n", 161 | "\n", 162 | "for row in tbody.find_all('tr'):\n", 163 | " columns= row.find_all('td')\n", 164 | "\n", 165 | " if capture_files == False:\n", 166 | " col_values= [col.text.strip() for col in columns]\n", 167 | "\n", 168 | " if col_values[0] == folder_name:\n", 169 | " capture_files= True\n", 170 | " continue\n", 171 | "\n", 172 | " else:\n", 173 | " col= columns[0]\n", 174 | " filename= col.text.strip()\n", 175 | " \n", 176 | " if not filename.endswith(f'{year}.{filetype}'):\n", 177 | " continue\n", 178 | " \n", 179 | " # extract the date\n", 180 | " filename_without_extension= filename.replace(f'.{filetype}', '') \n", 181 | " filename_last_date= filename_without_extension.split('-')[-1]\n", 182 | " extracted_files[filename_last_date]= col.a['href']\n", 183 | "\n", 184 | "print('Display 2 items in the dictionary')\n", 185 | "print(dict(list(extracted_files.items())[0:2]))" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "interpreter": { 191 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 192 | }, 193 | "kernelspec": { 194 | "display_name": "Python 3.8.10 64-bit", 195 | "language": "python", 196 | "name": 
"python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.8.10" 209 | }, 210 | "orig_nbformat": 4 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /notebook/data-transformation/experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d20b3aba", 6 | "metadata": {}, 7 | "source": [ 8 | "## One time data transformation\n", 9 | "In this notebook, we are going to transform the stations and weather data in such a way that they will be conformed to the redshift schema for their corresponding tables.\n", 10 | "\n", 11 | "The preprocessed data will be saved back to S3 before getting loaded to Redshift." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "d18f0fe2", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pyspark\n", 22 | "import os" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "fc380d3d", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'3.2.1'" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "pyspark.__version__" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "42c52993", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from pyspark.sql import SparkSession" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "id": "3f382bf9", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "WARNING: An illegal reflective access operation has occurred\n", 67 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/hrc/Documents/de-aws/data-venv/lib/python3.8/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 68 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 69 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 70 | "WARNING: All illegal access operations will be denied in a future release\n", 71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 72 | "Setting default log level to \"WARN\".\n", 73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 74 | "22/03/05 23:56:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "spark = SparkSession.builder \\\n", 80 | " .master('local[*]') \\\n", 81 | " .appName('data-transformer') \\\n", 82 | " .config(\"spark.hadoop.fs.s3a.access.key\", os.environ.get('AWS_ACCESS_KEY'))\\\n", 83 | " .config(\"spark.hadoop.fs.s3a.secret.key\", os.environ.get('AWS_SECRET_ACCESS_KEY'))\\\n", 84 | " .getOrCreate()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "id": "388ae2f2", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "'3.3.1'" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "sc = spark.sparkContext\n", 106 | "sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "50ba85e8", 112 | "metadata": {}, 113 | "source": [ 114 | "### 1. Stations data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "id": "ae6e918b", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "22/03/01 18:05:51 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n", 128 | " \r" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "df_stations = spark.read.csv(\"s3a://hrc-de-data/raw/cycling-extras/stations.csv\", inferSchema=True, header=True)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "id": "0ceaf5ce", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "[Row(Station.Id=1, StationName='River Street, Clerkenwell', longitude=-0.109971, latitude=51.5292, Easting=531202.52, Northing=182832.02),\n", 146 | " Row(Station.Id=2, StationName='Phillimore Gardens, Kensington', longitude=-0.197574, latitude=51.4996, Easting=525207.07, Northing=179391.86)]" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df_stations.take(2)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "id": "4533a36e", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "root\n", 169 | " |-- Station.Id: integer (nullable = true)\n", 170 | " |-- StationName: string (nullable = true)\n", 171 | " |-- longitude: double (nullable = true)\n", 172 | " |-- latitude: double (nullable = true)\n", 173 | " |-- Easting: double (nullable = true)\n", 174 | " |-- Northing: double (nullable = true)\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "df_stations.printSchema()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | "id": "766eb9a2", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from pyspark.sql import functions as F, types as T" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "id": "6cbf6baf", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# rename columns\n", 201 | "stations= df_stations.withColumnRenamed('Station.Id', 'station_id') \\\n", 202 | " .withColumnRenamed('StationName', 'station_name') \\\n", 203 | " .withColumnRenamed('easting', 'easting') \\\n", 204 | " .withColumnRenamed('northing', 'northing') " 205 | ] 206 | }, 207 | { 
208 | "cell_type": "code", 209 | "execution_count": 11, 210 | "id": "7d9360c5", 211 | "metadata": { 212 | "scrolled": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+----------+--------------------+----------+--------+----------+----------+\n", 220 | "|station_id| station_name| longitude|latitude| easting| northing|\n", 221 | "+----------+--------------------+----------+--------+----------+----------+\n", 222 | "| 1|River Street, Cle...| -0.109971| 51.5292| 531202.52| 182832.02|\n", 223 | "| 2|Phillimore Garden...| -0.197574| 51.4996| 525207.07| 179391.86|\n", 224 | "| 3|Christopher Stree...|-0.0846057| 51.5213| 532984.81| 182001.53|\n", 225 | "| 4|St. Chad's Street...| -0.120974| 51.5301| 530436.76| 182911.99|\n", 226 | "| 5|Sedding Street, S...| -0.156876| 51.4931|528051.649|178742.097|\n", 227 | "+----------+--------------------+----------+--------+----------+----------+\n", 228 | "only showing top 5 rows\n", 229 | "\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "stations.show(5)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "id": "88468f5d", 241 | "metadata": { 242 | "scrolled": true 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+----------+------------+---------+--------+-------+--------+\n", 250 | "|station_id|station_name|longitude|latitude|easting|northing|\n", 251 | "+----------+------------+---------+--------+-------+--------+\n", 252 | "| 0| 0| 0| 0| 0| 0|\n", 253 | "+----------+------------+---------+--------+-------+--------+\n", 254 | "\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "# count missing values in each column\n", 260 | "stations.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in stations.columns]\n", 261 | " ).show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 17, 267 | "id": "d0ee0f23", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | " \r" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='overwrite')" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "ef3f5de0", 285 | "metadata": {}, 286 | "source": [ 287 | "### 2. Weather data" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 6, 293 | "id": "c41101c2", 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "22/03/05 23:57:03 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n", 301 | " \r" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "df_weather = spark.read.json(\"s3a://hrc-de-data/raw/cycling-extras/weather.json\")" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 7, 312 | "id": "3203cde3", 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stderr", 317 | "output_type": "stream", 318 | "text": [ 319 | "22/03/05 23:57:19 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", 320 | " \r" 321 | ] 322 | }, 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "[Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-01', datetimeEpoch=1609459200, description='Clear conditions throughout the day with late afternoon rain.', dew=0.8, feelslike=-0.2, feelslikemax=2.9, feelslikemin=-3.6, humidity=91.03, icon='rain', moonphase=0.53, precip=0.22, precipcover=4.17, precipprob=None, preciptype=None, pressure=1011.6, severerisk=None, snow=None, snowdepth=None, solarenergy=0.8, solarradiation=29.4, source='obs', stations=['03769099999', '03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:14', sunriseEpoch=1609488374, sunset='16:02:22', sunsetEpoch=1609516942, temp=2.1, tempmax=5.0, tempmin=-0.5, tzoffset=None, uvindex=0.0, visibility=2.6, winddir=304.0, windgust=None, windspeed=6.6),\n", 327 | " Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-02', datetimeEpoch=1609545600, description='Clear conditions throughout the day with rain.', dew=1.0, feelslike=1.5, feelslikemax=3.1, feelslikemin=-1.5, humidity=82.51, icon='rain', moonphase=0.56, precip=0.6, precipcover=8.33, precipprob=None, preciptype=None, pressure=1015.9, severerisk=None, snow=None, snowdepth=None, solarenergy=1.3, solarradiation=43.9, source='obs', stations=['03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:05', sunriseEpoch=1609574765, sunset='16:03:28', sunsetEpoch=1609603408, temp=3.8, tempmax=5.1, tempmin=1.5, tzoffset=None, uvindex=1.0, visibility=15.1, winddir=299.0, windgust=None, windspeed=7.8)]" 328 | ] 329 | }, 330 | "execution_count": 7, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "df_weather.take(2)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 8, 342 | "id": "5474298d", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "root\n", 350 | " |-- cloudcover: double (nullable = true)\n", 351 | " |-- conditions: string (nullable = true)\n", 352 | " |-- datetime: string (nullable = true)\n", 353 | " |-- datetimeEpoch: long (nullable = true)\n", 354 | " |-- description: string (nullable = true)\n", 355 | " |-- dew: double (nullable = true)\n", 356 | " |-- feelslike: double (nullable = true)\n", 357 | " |-- feelslikemax: double (nullable = true)\n", 358 | " |-- feelslikemin: double (nullable = true)\n", 359 | " |-- humidity: double (nullable = true)\n", 360 | " |-- icon: string (nullable = true)\n", 361 | " |-- moonphase: double (nullable = true)\n", 362 | " |-- precip: double (nullable = true)\n", 363 | " |-- precipcover: double (nullable = true)\n", 364 | " |-- precipprob: double (nullable = true)\n", 365 | " |-- preciptype: array (nullable = true)\n", 366 | " | |-- element: string (containsNull = true)\n", 367 | " |-- pressure: double (nullable = true)\n", 368 | " |-- severerisk: double (nullable = true)\n", 369 | " |-- snow: double (nullable = true)\n", 370 | " |-- snowdepth: double (nullable = true)\n", 371 | " |-- solarenergy: double (nullable = true)\n", 372 | " |-- solarradiation: double (nullable = true)\n", 373 | " |-- source: string (nullable = true)\n", 374 | " |-- stations: array (nullable = true)\n", 375 | " | |-- element: string (containsNull = true)\n", 376 | " |-- sunrise: string (nullable = true)\n", 377 | " |-- sunriseEpoch: long (nullable = 
true)\n", 378 | " |-- sunset: string (nullable = true)\n", 379 | " |-- sunsetEpoch: long (nullable = true)\n", 380 | " |-- temp: double (nullable = true)\n", 381 | " |-- tempmax: double (nullable = true)\n", 382 | " |-- tempmin: double (nullable = true)\n", 383 | " |-- tzoffset: double (nullable = true)\n", 384 | " |-- uvindex: double (nullable = true)\n", 385 | " |-- visibility: double (nullable = true)\n", 386 | " |-- winddir: double (nullable = true)\n", 387 | " |-- windgust: double (nullable = true)\n", 388 | " |-- windspeed: double (nullable = true)\n", 389 | "\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "df_weather.printSchema()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "id": "2e952690", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# drop some columns that we won't need\n", 405 | "weather= df_weather.drop('cloudcover', 'conditions', 'datetimeEpoch', 'description', 'dew', 'icon', \n", 406 | " 'precipcover', 'preciptype', 'source', 'stations', 'sunriseEpoch', 'sunsetEpoch')" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 12, 412 | "id": "22b9368f", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "# transform datetime\n", 417 | "weather= weather.withColumnRenamed('datetime', 'weather_date') \n", 418 | "weather= weather.withColumn('weather_date', weather.weather_date.cast(T.DateType()))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "id": "b7d8f370", 425 | "metadata": { 426 | "scrolled": true 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "root\n", 434 | " |-- weather_date: date (nullable = true)\n", 435 | " |-- feelslike: double (nullable = true)\n", 436 | " |-- feelslikemax: double (nullable = true)\n", 437 | " |-- feelslikemin: double (nullable = true)\n", 438 | " |-- humidity: double (nullable = true)\n", 439 | " |-- moonphase: double (nullable = true)\n", 440 | " |-- precip: double (nullable = true)\n", 441 | " |-- precipprob: double (nullable = true)\n", 442 | " |-- pressure: double (nullable = true)\n", 443 | " |-- severerisk: double (nullable = true)\n", 444 | " |-- snow: double (nullable = true)\n", 445 | " |-- snowdepth: double (nullable = true)\n", 446 | " |-- solarenergy: double (nullable = true)\n", 447 | " |-- solarradiation: double (nullable = true)\n", 448 | " |-- sunrise: string (nullable = true)\n", 449 | " |-- sunset: string (nullable = true)\n", 450 | " |-- temp: double (nullable = true)\n", 451 | " |-- tempmax: double (nullable = true)\n", 452 | " |-- tempmin: double (nullable = true)\n", 453 | " |-- tzoffset: double (nullable = true)\n", 454 | " |-- uvindex: double (nullable = true)\n", 455 | " |-- visibility: double (nullable = true)\n", 456 | " |-- winddir: double (nullable = true)\n", 457 | " |-- windgust: double (nullable = true)\n", 458 | " |-- windspeed: double (nullable = true)\n", 459 | "\n", 460 | "25 columns\n" 461 | ] 462 | } 463 | ], 464 | "source": [ 465 | "weather.printSchema()\n", 466 | "print(len(weather.columns), 'columns')" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "id": "e065026c", 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 18, 480 | "id": "ac976766", 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | "\r", 488 | 
"[Stage 11:> (0 + 1) / 1]\r" 489 | ] 490 | }, 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "+------------+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+--------+--------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 496 | "|weather_date|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|pressure|severerisk|snow|snowdepth|solarenergy|solarradiation| sunrise| sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 497 | "+------------+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+--------+--------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 498 | "| 2021-01-01| -0.2| 2.9| -3.6| 91.03| 0.53| 0.22| null| 1011.6| null|null| null| 0.8| 29.4|08:06:14|16:02:22| 2.1| 5.0| -0.5| null| 0.0| 2.6| 304.0| null| 6.6|\n", 499 | "| 2021-01-02| 1.5| 3.1| -1.5| 82.51| 0.56| 0.6| null| 1015.9| null|null| null| 1.3| 43.9|08:06:05|16:03:28| 3.8| 5.1| 1.5| null| 1.0| 15.1| 299.0| null| 7.8|\n", 500 | "+------------+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+--------+--------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 501 | "only showing top 2 rows\n", 502 | "\n" 503 | ] 504 | }, 505 | { 506 | "name": "stderr", 507 | "output_type": "stream", 508 | "text": [ 509 | "\r", 510 | " \r" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "weather.show(2)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 27, 521 | "id": "1d53f1c4", 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "name": "stderr", 526 | "output_type": "stream", 527 | "text": [ 528 | "\r", 529 | "[Stage 27:> (0 + 1) / 1]\r" 530 | ] 531 | }, 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "+----------------+\n", 537 | "|missing_tzoffset|\n", 538 | "+----------------+\n", 539 | "| 179|\n", 540 | "+----------------+\n", 541 | "\n" 542 | ] 543 | }, 544 | { 545 | "name": "stderr", 546 | "output_type": "stream", 547 | "text": [ 548 | "\r", 549 | " \r" 550 | ] 551 | } 552 | ], 553 | "source": [ 554 | "# count missing values in windgust\n", 555 | "missing_windgust= (\n", 556 | " weather.select(\n", 557 | " F.count(F.when(F.col('tzoffset').isNull() | F.isnan(F.col('tzoffset')), ''))\n", 558 | " .alias('missing_tzoffset'))\n", 559 | ")\n", 560 | "missing_windgust.show()" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 25, 566 | "id": "e116dc28", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "# count missing values in each column\n", 571 | "cols= weather.columns\n", 572 | "cols.remove('weather_date')\n", 573 | "missing_values= weather.select([F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in cols])" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 26, 579 | "id": "e279bebd", 580 | "metadata": { 581 | "scrolled": true 582 | }, 583 | "outputs": [ 584 | { 585 | "name": "stderr", 586 | "output_type": "stream", 587 | "text": [ 588 | "\r", 589 | "[Stage 24:> (0 + 1) / 1]\r" 590 | ] 591 | }, 592 | { 593 | "name": "stdout", 594 | "output_type": "stream", 595 | "text": [ 596 | 
"+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 597 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|pressure|severerisk|snow|snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 598 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 599 | "| 0| 0| 0| 0| 0| 0| 374| 1| 374| 374| 365| 0| 0| 0| 0| 0| 0| 0| 179| 0| 0| 0| 229| 0|\n", 600 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 601 | "\n" 602 | ] 603 | }, 604 | { 605 | "name": "stderr", 606 | "output_type": "stream", 607 | "text": [ 608 | "\r", 609 | " \r" 610 | ] 611 | } 612 | ], 613 | "source": [ 614 | "missing_values.show()" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 32, 620 | "id": "49d42f95", 621 | "metadata": { 622 | "scrolled": true 623 | }, 624 | "outputs": [ 625 | { 626 | "name": "stderr", 627 | "output_type": "stream", 628 | "text": [ 629 | "\r", 630 | "[Stage 33:> (0 + 1) / 1]\r" 631 | ] 632 | }, 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 638 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|pressure|severerisk|snow|snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 639 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 640 | "| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.94| 0.0| 0.94|0.94| 0.92| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.45| 0.0| 0.0| 0.0| 0.58| 0.0|\n", 641 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 642 | "\n" 643 | ] 644 | }, 645 | { 646 | "name": "stderr", 647 | "output_type": "stream", 648 | "text": [ 649 | "\r", 650 | " \r" 651 | ] 652 | } 653 | ], 654 | "source": [ 655 | "perc_missing_values= (\n", 656 | " weather.select([\n", 657 | " F.round(F.count(F.when(F.isnan(c) | F.col(c).isNull(), c))/F.count(F.lit(1)),2)\n", 658 | " .alias(c) for c in cols\n", 659 | " ])\n", 660 | ")\n", 661 | "perc_missing_values.show()" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 28, 667 | "id": "61e4c2fd", 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/plain": [ 673 | "['weather_date',\n", 674 | " 'feelslike',\n", 675 | " 'feelslikemax',\n", 676 | " 'feelslikemin',\n", 677 | " 'humidity',\n", 678 | " 'moonphase',\n", 679 | " 'precip',\n", 680 | " 
'pressure',\n", 681 | " 'solarenergy',\n", 682 | " 'solarradiation',\n", 683 | " 'sunrise',\n", 684 | " 'sunset',\n", 685 | " 'temp',\n", 686 | " 'tempmax',\n", 687 | " 'tempmin',\n", 688 | " 'tzoffset',\n", 689 | " 'uvindex',\n", 690 | " 'visibility',\n", 691 | " 'winddir',\n", 692 | " 'windgust',\n", 693 | " 'windspeed']" 694 | ] 695 | }, 696 | "execution_count": 28, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "# drop columns where missing values are more than 70%\n", 703 | "\n", 704 | "weather= weather.drop('precipprob', 'snow', 'snowdepth')\n", 705 | "\n", 706 | "if 'severerisk' in weather.columns:\n", 707 | " weather= weather.drop('severerisk')\n", 708 | "\n", 709 | "weather.columns" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 29, 715 | "id": "c1714e54", 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stderr", 720 | "output_type": "stream", 721 | "text": [ 722 | " \r" 723 | ] 724 | } 725 | ], 726 | "source": [ 727 | "weather= weather.repartition(10)\n", 728 | "\n", 729 | "weather.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/weather/', mode='overwrite')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "id": "2bbb74b9", 736 | "metadata": {}, 737 | "outputs": [], 738 | "source": [] 739 | } 740 | ], 741 | "metadata": { 742 | "kernelspec": { 743 | "display_name": "Python 3 (ipykernel)", 744 | "language": "python", 745 | "name": "python3" 746 | }, 747 | "language_info": { 748 | "codemirror_mode": { 749 | "name": "ipython", 750 | "version": 3 751 | }, 752 | "file_extension": ".py", 753 | "mimetype": "text/x-python", 754 | "name": "python", 755 | "nbconvert_exporter": "python", 756 | "pygments_lexer": "ipython3", 757 | "version": "3.8.10" 758 | } 759 | }, 760 | "nbformat": 4, 761 | "nbformat_minor": 5 762 | } 763 | -------------------------------------------------------------------------------- /notebook/data-transformation/init-data-transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d20b3aba", 6 | "metadata": {}, 7 | "source": [ 8 | "## One time data transformation\n", 9 | "In this notebook, we are going to transform the stations and weather data in such a way that they will be conformed to the redshift schema for their corresponding tables.\n", 10 | "\n", 11 | "The preprocessed data will be saved back to S3 before getting loaded to Redshift." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "d18f0fe2", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pyspark\n", 22 | "import os" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "fc380d3d", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'3.2.1'" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "pyspark.__version__" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "42c52993", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from pyspark.sql import SparkSession" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "id": "3f382bf9", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "WARNING: An illegal reflective access operation has occurred\n", 67 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/hrc/anaconda3/lib/python3.9/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 68 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 69 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 70 | "WARNING: All illegal access operations will be denied in a future release\n", 71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 72 | "Setting default log level to \"WARN\".\n", 73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 74 | "22/03/01 18:05:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "spark = SparkSession.builder \\\n", 80 | " .master('local[*]') \\\n", 81 | " .appName('data-transformer') \\\n", 82 | " .config(\"spark.hadoop.fs.s3a.access.key\", os.environ.get('AWS_ACCESS_KEY'))\\\n", 83 | " .config(\"spark.hadoop.fs.s3a.secret.key\", os.environ.get('AWS_SECRET_ACCESS_KEY'))\\\n", 84 | " .getOrCreate()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "id": "388ae2f2", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "'3.3.1'" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "sc = spark.sparkContext\n", 106 | "sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "50ba85e8", 112 | "metadata": {}, 113 | "source": [ 114 | "### 1. 
Stations data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "id": "ae6e918b", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "22/03/01 18:05:51 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n", 128 | " \r" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "df_stations = spark.read.csv(\"s3a://hrc-de-data/raw/cycling-extras/stations.csv\", inferSchema=True, header=True)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "id": "0ceaf5ce", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "[Row(Station.Id=1, StationName='River Street, Clerkenwell', longitude=-0.109971, latitude=51.5292, Easting=531202.52, Northing=182832.02),\n", 146 | " Row(Station.Id=2, StationName='Phillimore Gardens, Kensington', longitude=-0.197574, latitude=51.4996, Easting=525207.07, Northing=179391.86)]" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df_stations.take(2)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "id": "4533a36e", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "root\n", 169 | " |-- Station.Id: integer (nullable = true)\n", 170 | " |-- StationName: string (nullable = true)\n", 171 | " |-- longitude: double (nullable = true)\n", 172 | " |-- latitude: double (nullable = true)\n", 173 | " |-- Easting: double (nullable = true)\n", 174 | " |-- Northing: double (nullable = true)\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "df_stations.printSchema()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 9, 186 | "id": "766eb9a2", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from pyspark.sql import functions as F, types as T" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "id": "6cbf6baf", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# rename columns\n", 201 | "stations= df_stations.withColumnRenamed('Station.Id', 'station_id') \\\n", 202 | " .withColumnRenamed('StationName', 'station_name') \\\n", 203 | " .withColumnRenamed('easting', 'easting') \\\n", 204 | " .withColumnRenamed('northing', 'northing') " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 11, 210 | "id": "7d9360c5", 211 | "metadata": { 212 | "scrolled": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+----------+--------------------+----------+--------+----------+----------+\n", 220 | "|station_id| station_name| longitude|latitude| easting| northing|\n", 221 | "+----------+--------------------+----------+--------+----------+----------+\n", 222 | "| 1|River Street, Cle...| -0.109971| 51.5292| 531202.52| 182832.02|\n", 223 | "| 2|Phillimore Garden...| -0.197574| 51.4996| 525207.07| 179391.86|\n", 224 | "| 3|Christopher Stree...|-0.0846057| 51.5213| 532984.81| 182001.53|\n", 225 | "| 4|St. 
Chad's Street...| -0.120974| 51.5301| 530436.76| 182911.99|\n", 226 | "| 5|Sedding Street, S...| -0.156876| 51.4931|528051.649|178742.097|\n", 227 | "+----------+--------------------+----------+--------+----------+----------+\n", 228 | "only showing top 5 rows\n", 229 | "\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "stations.show(5)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "id": "88468f5d", 241 | "metadata": { 242 | "scrolled": true 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+----------+------------+---------+--------+-------+--------+\n", 250 | "|station_id|station_name|longitude|latitude|easting|northing|\n", 251 | "+----------+------------+---------+--------+-------+--------+\n", 252 | "| 0| 0| 0| 0| 0| 0|\n", 253 | "+----------+------------+---------+--------+-------+--------+\n", 254 | "\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "# count missing values in each column\n", 260 | "stations.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in stations.columns]\n", 261 | " ).show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 17, 267 | "id": "d0ee0f23", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | " \r" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='overwrite')" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "ef3f5de0", 285 | "metadata": {}, 286 | "source": [ 287 | "### 2. Weather data" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 19, 293 | "id": "c41101c2", 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | " \r" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "df_weather = spark.read.json(\"s3a://hrc-de-data/raw/cycling-extras/weather.json\")" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 20, 311 | "id": "3203cde3", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "name": "stderr", 316 | "output_type": "stream", 317 | "text": [ 318 | "22/03/01 18:13:58 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" 319 | ] 320 | }, 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "[Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-01', datetimeEpoch=1609459200, description='Clear conditions throughout the day with late afternoon rain.', dew=0.8, feelslike=-0.2, feelslikemax=2.9, feelslikemin=-3.6, humidity=91.03, icon='rain', moonphase=0.53, precip=0.22, precipcover=4.17, precipprob=None, preciptype=None, pressure=1011.6, snow=None, snowdepth=None, solarenergy=0.8, solarradiation=29.4, source='obs', stations=['03769099999', '03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:14', sunriseEpoch=1609488374, sunset='16:02:22', sunsetEpoch=1609516942, temp=2.1, tempmax=5.0, tempmin=-0.5, tzoffset=None, uvindex=0.0, visibility=4.1, winddir=304.0, windgust=None, windspeed=10.6),\n", 325 | " Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-02', datetimeEpoch=1609545600, description='Clear conditions throughout the day with rain.', dew=1.0, feelslike=1.5, feelslikemax=3.1, feelslikemin=-1.5, humidity=82.51, icon='rain', moonphase=0.56, precip=0.6, precipcover=8.33, precipprob=None, preciptype=None, pressure=1015.9, snow=None, snowdepth=None, solarenergy=1.3, solarradiation=43.9, source='obs', stations=['03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:05', sunriseEpoch=1609574765, sunset='16:03:28', sunsetEpoch=1609603408, temp=3.8, tempmax=5.1, tempmin=1.5, tzoffset=None, uvindex=1.0, visibility=24.4, winddir=299.0, windgust=None, windspeed=12.5)]" 326 | ] 327 | }, 328 | "execution_count": 20, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "df_weather.take(2)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 21, 340 | "id": "5474298d", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "root\n", 348 | " |-- cloudcover: double (nullable = true)\n", 349 | " |-- conditions: string (nullable = true)\n", 350 | " |-- datetime: string (nullable = true)\n", 351 | " |-- datetimeEpoch: long (nullable = true)\n", 352 | " |-- description: string (nullable = true)\n", 353 | " |-- dew: double (nullable = true)\n", 354 | " |-- feelslike: double (nullable = true)\n", 355 | " |-- feelslikemax: double (nullable = true)\n", 356 | " |-- feelslikemin: double (nullable = true)\n", 357 | " |-- humidity: double (nullable = true)\n", 358 | " |-- icon: string (nullable = true)\n", 359 | " |-- moonphase: double (nullable = true)\n", 360 | " |-- precip: double (nullable = true)\n", 361 | " |-- precipcover: double (nullable = true)\n", 362 | " |-- precipprob: string (nullable = true)\n", 363 | " |-- preciptype: string (nullable = true)\n", 364 | " |-- pressure: double (nullable = true)\n", 365 | " |-- snow: string (nullable = true)\n", 366 | " |-- snowdepth: double (nullable = true)\n", 367 | " |-- solarenergy: double (nullable = true)\n", 368 | " |-- solarradiation: double (nullable = true)\n", 369 | " |-- source: string (nullable = true)\n", 370 | " |-- stations: array (nullable = true)\n", 371 | " | |-- element: string (containsNull = true)\n", 372 | " |-- sunrise: string (nullable = true)\n", 373 | " |-- sunriseEpoch: long (nullable = true)\n", 374 | " |-- sunset: string (nullable = true)\n", 375 | " |-- sunsetEpoch: long (nullable = true)\n", 376 | " |-- temp: double (nullable = 
true)\n", 377 | " |-- tempmax: double (nullable = true)\n", 378 | " |-- tempmin: double (nullable = true)\n", 379 | " |-- tzoffset: double (nullable = true)\n", 380 | " |-- uvindex: double (nullable = true)\n", 381 | " |-- visibility: double (nullable = true)\n", 382 | " |-- winddir: double (nullable = true)\n", 383 | " |-- windgust: double (nullable = true)\n", 384 | " |-- windspeed: double (nullable = true)\n", 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "df_weather.printSchema()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 22, 396 | "id": "2e952690", 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "# drop some columns that we won't need\n", 401 | "weather= df_weather.drop('cloudcover', 'conditions', 'datetimeEpoch', 'description', 'dew', 'icon', \n", 402 | " 'precipcover', 'preciptype', 'source', 'stations', 'sunriseEpoch', 'sunsetEpoch')" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 23, 408 | "id": "22b9368f", 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "# transform datetime\n", 413 | "weather= weather.withColumnRenamed('datetime', 'weather_date') \n", 414 | "weather= weather.withColumn('weather_date', weather.weather_date.cast(T.DateType()))" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 24, 420 | "id": "b7d8f370", 421 | "metadata": { 422 | "scrolled": true 423 | }, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "root\n", 430 | " |-- weather_date: date (nullable = true)\n", 431 | " |-- feelslike: double (nullable = true)\n", 432 | " |-- feelslikemax: double (nullable = true)\n", 433 | " |-- feelslikemin: double (nullable = true)\n", 434 | " |-- humidity: double (nullable = true)\n", 435 | " |-- moonphase: double (nullable = true)\n", 436 | " |-- precip: double (nullable = true)\n", 437 | " |-- precipprob: string (nullable = true)\n", 438 | " |-- preciptype: string (nullable = true)\n", 439 | " |-- pressure: double (nullable = true)\n", 440 | " |-- snow: string (nullable = true)\n", 441 | " |-- snowdepth: double (nullable = true)\n", 442 | " |-- solarenergy: double (nullable = true)\n", 443 | " |-- solarradiation: double (nullable = true)\n", 444 | " |-- sunrise: string (nullable = true)\n", 445 | " |-- sunset: string (nullable = true)\n", 446 | " |-- temp: double (nullable = true)\n", 447 | " |-- tempmax: double (nullable = true)\n", 448 | " |-- tempmin: double (nullable = true)\n", 449 | " |-- tzoffset: double (nullable = true)\n", 450 | " |-- uvindex: double (nullable = true)\n", 451 | " |-- visibility: double (nullable = true)\n", 452 | " |-- winddir: double (nullable = true)\n", 453 | " |-- windgust: double (nullable = true)\n", 454 | " |-- windspeed: double (nullable = true)\n", 455 | "\n", 456 | "25 columns\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "weather.printSchema()\n", 462 | "print(len(weather.columns), 'columns')" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 25, 468 | "id": "e116dc28", 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "# count missing values in each column\n", 473 | "cols= weather.columns\n", 474 | "cols.remove('weather_date')\n", 475 | "missing_values= weather.select([F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in cols])" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 26, 481 | "id": "e279bebd", 482 | "metadata": { 483 | "scrolled": true 
484 | }, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "+---------+------------+------------+--------+---------+------+----------+----------+--------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 491 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|preciptype|pressure|snow|snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 492 | "+---------+------------+------------+--------+---------+------+----------+----------+--------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 493 | "| 0| 0| 0| 0| 0| 0| 365| 365| 1| 365| 356| 0| 0| 0| 0| 0| 0| 0| 148| 0| 0| 0| 226| 0|\n", 494 | "+---------+------------+------------+--------+---------+------+----------+----------+--------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 495 | "\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "missing_values.show()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 27, 506 | "id": "49d42f95", 507 | "metadata": { 508 | "scrolled": true 509 | }, 510 | "outputs": [ 511 | { 512 | "name": "stdout", 513 | "output_type": "stream", 514 | "text": [ 515 | "+---------+------------+------------+--------+---------+------+----------+----------+--------------------+----+------------------+-----------+--------------+-------+------+----+-------+-------+------------------+-------+----------+-------+------------------+---------+\n", 516 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|preciptype| pressure|snow| snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin| tzoffset|uvindex|visibility|winddir| windgust|windspeed|\n", 517 | "+---------+------------+------------+--------+---------+------+----------+----------+--------------------+----+------------------+-----------+--------------+-------+------+----+-------+-------+------------------+-------+----------+-------+------------------+---------+\n", 518 | "| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 1.0| 1.0|0.002739726027397...| 1.0|0.9753424657534246| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0|0.4054794520547945| 0.0| 0.0| 0.0|0.6191780821917808| 0.0|\n", 519 | "+---------+------------+------------+--------+---------+------+----------+----------+--------------------+----+------------------+-----------+--------------+-------+------+----+-------+-------+------------------+-------+----------+-------+------------------+---------+\n", 520 | "\n" 521 | ] 522 | } 523 | ], 524 | "source": [ 525 | "perc_missing_values= weather.select([(F.count(F.when(F.isnan(c) | F.col(c).isNull(), c))/F.count(F.lit(1))).alias(c) for c in cols])\n", 526 | "perc_missing_values.show()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 28, 532 | "id": "61e4c2fd", 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/plain": [ 538 | "['weather_date',\n", 539 | " 'feelslike',\n", 540 | " 'feelslikemax',\n", 541 | " 'feelslikemin',\n", 542 | " 'humidity',\n", 543 | " 'moonphase',\n", 544 | " 'precip',\n", 545 | " 'pressure',\n", 546 | " 'solarenergy',\n", 547 | " 'solarradiation',\n", 548 | " 'sunrise',\n", 549 | " 'sunset',\n", 550 | " 'temp',\n", 551 | " 'tempmax',\n", 552 | " 
'tempmin',\n", 553 | " 'tzoffset',\n", 554 | " 'uvindex',\n", 555 | " 'visibility',\n", 556 | " 'winddir',\n", 557 | " 'windgust',\n", 558 | " 'windspeed']" 559 | ] 560 | }, 561 | "execution_count": 28, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "# drop columns where missing values are more than 70%\n", 568 | "\n", 569 | "weather= weather.drop('precipprob', 'snow', 'snowdepth')\n", 570 | "\n", 571 | "if 'severerisk' in weather.columns:\n", 572 | " weather= weather.drop('severerisk')\n", 573 | "\n", 574 | "weather.columns" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 29, 580 | "id": "c1714e54", 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "name": "stderr", 585 | "output_type": "stream", 586 | "text": [ 587 | " \r" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "weather= weather.repartition(10)\n", 593 | "\n", 594 | "weather.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/weather/', mode='overwrite')" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "id": "2bbb74b9", 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [] 604 | } 605 | ], 606 | "metadata": { 607 | "kernelspec": { 608 | "display_name": "Python 3 (ipykernel)", 609 | "language": "python", 610 | "name": "python3" 611 | }, 612 | "language_info": { 613 | "codemirror_mode": { 614 | "name": "ipython", 615 | "version": 3 616 | }, 617 | "file_extension": ".py", 618 | "mimetype": "text/x-python", 619 | "name": "python", 620 | "nbconvert_exporter": "python", 621 | "pygments_lexer": "ipython3", 622 | "version": "3.9.7" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 5 627 | } 628 | -------------------------------------------------------------------------------- /notebook/data-transformation/journey-data-transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "deda5766", 6 | "metadata": {}, 7 | "source": [ 8 | "## Transformation for rental journey data \n", 9 | "This notebook is responsible for transforming journey data by performing the following tasks:\n", 10 | "\n", 11 | " 1. Renaming columns (removing spaces and lowercasing)\n", 12 | "\n", 13 | " 2. Convert data types from string to timestamps\n", 14 | " \n", 15 | " 3. Attach weather dates\n", 16 | " \n", 17 | " 4. Drop unnecessary columns\n", 18 | " \n", 19 | " 5. 
Update extra files for dimension tables" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "763a90f8", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pyspark\n", 30 | "import os" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "25914d7c", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pyspark.sql import SparkSession" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "a9382918", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stderr", 51 | "output_type": "stream", 52 | "text": [ 53 | "WARNING: An illegal reflective access operation has occurred\n", 54 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/hrc/anaconda3/lib/python3.9/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 55 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 56 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 57 | "WARNING: All illegal access operations will be denied in a future release\n", 58 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 59 | "Setting default log level to \"WARN\".\n", 60 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 61 | "22/03/01 21:00:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "spark = SparkSession.builder \\\n", 67 | " .master('local[*]') \\\n", 68 | " .appName('journey-and-stations-data-transformer') \\\n", 69 | " .config(\"spark.hadoop.fs.s3a.access.key\", os.environ.get('AWS_ACCESS_KEY'))\\\n", 70 | " .config(\"spark.hadoop.fs.s3a.secret.key\", os.environ.get('AWS_SECRET_ACCESS_KEY'))\\\n", 71 | " .getOrCreate()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "30406f25", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stderr", 82 | "output_type": "stream", 83 | "text": [ 84 | " \r" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "# get journey data\n", 90 | "df_journey = spark.read.csv(\"s3a://hrc-de-data/raw/cycling-journey/*/*\", inferSchema=True, header=True)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "id": "56579ae0", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "[Row(Rental Id=109096951, Duration=540, Bike Id=13318, End Date='15/06/2021 20:19', EndStation Id=661, EndStation Name='All Saints Church, Portobello', Start Date='15/06/2021 20:10', StartStation Id=105, StartStation Name='Westbourne Grove, Bayswater'),\n", 103 | " Row(Rental Id=108982015, Duration=780, Bike Id=18991, End Date='13/06/2021 13:03', EndStation Id=312, EndStation Name=\"Grove End Road, St. 
John's Wood\", Start Date='13/06/2021 12:50', StartStation Id=106, StartStation Name='Woodstock Street, Mayfair')]" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "df_journey.take(2)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "id": "9ef2dc88", 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "root\n", 126 | " |-- Rental Id: integer (nullable = true)\n", 127 | " |-- Duration: integer (nullable = true)\n", 128 | " |-- Bike Id: integer (nullable = true)\n", 129 | " |-- End Date: string (nullable = true)\n", 130 | " |-- EndStation Id: integer (nullable = true)\n", 131 | " |-- EndStation Name: string (nullable = true)\n", 132 | " |-- Start Date: string (nullable = true)\n", 133 | " |-- StartStation Id: integer (nullable = true)\n", 134 | " |-- StartStation Name: string (nullable = true)\n", 135 | "\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "df_journey.printSchema()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "id": "aea050fa", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "from pyspark.sql.functions import *\n", 151 | "from pyspark.sql.types import *" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "id": "78224fd1", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# rename columns\n", 162 | "df_journey= df_journey.withColumnRenamed('Rental Id', 'rental_id')\\\n", 163 | ".withColumnRenamed('Bike Id', 'bike_id')\\\n", 164 | ".withColumnRenamed('Start Date', 'start_date')\\\n", 165 | ".withColumnRenamed('End Date', 'end_date')\\\n", 166 | ".withColumnRenamed('StartStation Id', 'start_station')\\\n", 167 | ".withColumnRenamed('EndStation Id', 'end_station')" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 10, 173 | "id": "b1ad54a8", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# convert data types\n", 178 | "df_journey= df_journey.withColumn('start_date', to_timestamp(col('start_date'), 'dd/MM/yyy HH:mm'))\n", 179 | "\n", 180 | "df_journey= df_journey.withColumn('end_date', to_timestamp(col('end_date'), 'dd/MM/yyy HH:mm'))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | "id": "2989d91b", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# add weather_date column\n", 191 | "df_journey= df_journey.withColumn('weather_date', to_date(col(\"start_date\"), 'dd/MM/yyy HH:mm'))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 12, 197 | "id": "1e64f016", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+\n", 205 | "|rental_id|Duration|bike_id| end_date|end_station| EndStation Name| start_date|start_station| StartStation Name|weather_date|\n", 206 | "+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+\n", 207 | "|109096951| 540| 13318|2021-06-15 20:19:00| 661|All Saints Church...|2021-06-15 20:10:00| 105|Westbourne Grove,...| 2021-06-15|\n", 208 | "|108982015| 780| 18991|2021-06-13 13:03:00| 312|Grove End Road, 
S...|2021-06-13 12:50:00| 106|Woodstock Street,...| 2021-06-13|\n", 209 | "|108839141| 840| 16736|2021-06-10 15:28:00| 333|Palace Gardens Te...|2021-06-10 15:14:00| 106|Woodstock Street,...| 2021-06-10|\n", 210 | "|108816591| 1380| 913|2021-06-09 22:37:00| 51|Finsbury Library ...|2021-06-09 22:14:00| 123|St. John Street, ...| 2021-06-09|\n", 211 | "|108919084| 1200| 6682|2021-06-12 11:29:00| 732|Duke Street Hill,...|2021-06-12 11:09:00| 123|St. John Street, ...| 2021-06-12|\n", 212 | "+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+\n", 213 | "only showing top 5 rows\n", 214 | "\n", 215 | "root\n", 216 | " |-- rental_id: integer (nullable = true)\n", 217 | " |-- Duration: integer (nullable = true)\n", 218 | " |-- bike_id: integer (nullable = true)\n", 219 | " |-- end_date: timestamp (nullable = true)\n", 220 | " |-- end_station: integer (nullable = true)\n", 221 | " |-- EndStation Name: string (nullable = true)\n", 222 | " |-- start_date: timestamp (nullable = true)\n", 223 | " |-- start_station: integer (nullable = true)\n", 224 | " |-- StartStation Name: string (nullable = true)\n", 225 | " |-- weather_date: date (nullable = true)\n", 226 | "\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "df_journey.show(5)\n", 232 | "df_journey.printSchema()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "71328ed9", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | " " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "7b5e4e8d", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "id": "784a2702", 256 | "metadata": {}, 257 | "source": [ 258 | "### Stations data\n", 259 | "We are going to update the stations data (previously saved by another process) with some additional stations that are not present in the original stations data but are seen in some journey." 
260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "f8da8a97", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# read previously saved stations data from parquet\n", 270 | "df_processed_stations= spark.read.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "id": "52c496c2", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# create temporary table for both stations and journey\n", 281 | "df_journey.createOrReplaceTempView('journey')\n", 282 | "df_processed_stations.createOrReplaceTempView('station')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "44f574f0", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# we keep all the stations which are not found in the temp view station table\n", 293 | "additional_stations= spark.sql('''\n", 294 | "with station_ids as (\n", 295 | " select \n", 296 | " station_id\n", 297 | " from\n", 298 | " station\n", 299 | ")\n", 300 | "\n", 301 | "select \n", 302 | " distinct(start_station) as station_id, \n", 303 | " `StartStation Name` as station_name \n", 304 | "from \n", 305 | " journey\n", 306 | "where \n", 307 | " start_station not in (table station_ids)\n", 308 | "\n", 309 | "union\n", 310 | "\n", 311 | "select \n", 312 | " distinct(end_station) as station_id, \n", 313 | " `EndStation Name` as station_name \n", 314 | "from \n", 315 | " journey\n", 316 | "where \n", 317 | " end_station not in (table station_ids)\n", 318 | "''')\n", 319 | "additional_stations.show()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "c95d0712", 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# add columns to the additional stations to avoid errors when merging it to the previous one (df_processed_stations)\n", 330 | "additional_stations= additional_stations.withColumn('longitude', lit(0).cast(DoubleType()))\\\n", 331 | ".withColumn('latitude', lit(0).cast(DoubleType()))\\\n", 332 | ".withColumn('easting', lit(0).cast(DoubleType()))\\\n", 333 | ".withColumn('northing', lit(0).cast(DoubleType()))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "id": "ccd5a72c", 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "additional_stations.show(5)\n", 344 | "additional_stations.printSchema()" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "68e2adc5", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# remove duplicate values\n", 355 | "additional_stations= additional_stations.dropDuplicates(['station_id'])" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "9361ec15", 362 | "metadata": { 363 | "scrolled": false 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "# save additional stations data into parquet files in s3\n", 368 | "additional_stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='append')" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "bd2a8529", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# drop other unnecessary journey columns\n", 379 | "df_journey= df_journey.drop('StartStation Name', 'EndStation Name', 'Duration')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | 
"id": "e70a0ba5", 385 | "metadata": {}, 386 | "source": [ 387 | "### Datetime\n", 388 | "We are going to create/update datetime data from the start and end date of each journey." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 13, 394 | "id": "1c813157", 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "+-------------------+----+--------+-----+---+----+------+------+\n", 402 | "| datetime_id|year|week_day|month|day|hour|minute|second|\n", 403 | "+-------------------+----+--------+-----+---+----+------+------+\n", 404 | "|2021-06-15 20:10:00|2021| 3| 6| 15| 20| 10| 0|\n", 405 | "|2021-06-13 12:50:00|2021| 1| 6| 13| 12| 50| 0|\n", 406 | "|2021-06-10 15:14:00|2021| 5| 6| 10| 15| 14| 0|\n", 407 | "+-------------------+----+--------+-----+---+----+------+------+\n", 408 | "only showing top 3 rows\n", 409 | "\n", 410 | "+-------------------+----+--------+-----+---+----+------+------+\n", 411 | "| datetime_id|year|week_day|month|day|hour|minute|second|\n", 412 | "+-------------------+----+--------+-----+---+----+------+------+\n", 413 | "|2021-06-15 20:19:00|2021| 3| 6| 15| 20| 19| 0|\n", 414 | "|2021-06-13 13:03:00|2021| 1| 6| 13| 13| 3| 0|\n", 415 | "|2021-06-10 15:28:00|2021| 5| 6| 10| 15| 28| 0|\n", 416 | "+-------------------+----+--------+-----+---+----+------+------+\n", 417 | "only showing top 3 rows\n", 418 | "\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "# extract datetime values from the start and the end date\n", 424 | "df_datetime_from_start= (\n", 425 | " df_journey.select(\n", 426 | " col('start_date').alias('datetime_id'), \n", 427 | " year(col('start_date')).alias('year'),\n", 428 | " dayofweek(col('start_date')).alias('week_day'),\n", 429 | " month(col('start_date')).alias('month'), \n", 430 | " dayofmonth(col('start_date')).alias('day'),\n", 431 | " hour(col('start_date')).alias('hour'),\n", 432 | " minute(col('start_date')).alias('minute'),\n", 433 | " second(col('start_date')).alias('second'),\n", 434 | " )\n", 435 | ")\n", 436 | "df_datetime_from_end= (\n", 437 | " df_journey.select(\n", 438 | " col('end_date').alias('datetime_id'), \n", 439 | " year(col('end_date')).alias('year'), \n", 440 | " dayofweek(col('end_date')).alias('week_day'),\n", 441 | " month(col('end_date')).alias('month'), \n", 442 | " dayofmonth(col('end_date')).alias('day'),\n", 443 | " hour(col('end_date')).alias('hour'),\n", 444 | " minute(col('end_date')).alias('minute'),\n", 445 | " second(col('end_date')).alias('second'),\n", 446 | " )\n", 447 | ")\n", 448 | "\n", 449 | "df_datetime_from_start.show(3)\n", 450 | "df_datetime_from_end.show(3)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 14, 456 | "id": "057c1e45", 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "+-------------------+----+--------+-----+---+----+------+------+\n", 464 | "| datetime_id|year|week_day|month|day|hour|minute|second|\n", 465 | "+-------------------+----+--------+-----+---+----+------+------+\n", 466 | "|2021-06-15 20:10:00|2021| 3| 6| 15| 20| 10| 0|\n", 467 | "|2021-06-13 12:50:00|2021| 1| 6| 13| 12| 50| 0|\n", 468 | "|2021-06-10 15:14:00|2021| 5| 6| 10| 15| 14| 0|\n", 469 | "|2021-06-09 22:14:00|2021| 4| 6| 9| 22| 14| 0|\n", 470 | "|2021-06-12 11:09:00|2021| 7| 6| 12| 11| 9| 0|\n", 471 | "|2021-06-10 22:33:00|2021| 5| 6| 10| 22| 33| 0|\n", 472 | "|2021-06-13 14:48:00|2021| 1| 6| 13| 14| 48| 0|\n", 473 | 
"|2021-06-14 18:06:00|2021| 2| 6| 14| 18| 6| 0|\n", 474 | "|2021-06-14 18:06:00|2021| 2| 6| 14| 18| 6| 0|\n", 475 | "|2021-06-09 16:06:00|2021| 4| 6| 9| 16| 6| 0|\n", 476 | "+-------------------+----+--------+-----+---+----+------+------+\n", 477 | "only showing top 10 rows\n", 478 | "\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "# combine the dataframes\n", 484 | "df_datetime= df_datetime_from_start.union(df_datetime_from_end)\n", 485 | "\n", 486 | "# remove duplicate entries\n", 487 | "df_datetime= df_datetime.dropDuplicates(['datetime_id'])\n", 488 | "\n", 489 | "df_datetime.show(10)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 15, 495 | "id": "38b8b2ac", 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stderr", 500 | "output_type": "stream", 501 | "text": [ 502 | " \r" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "# save datetime data into parquet files in s3\n", 508 | "df_datetime.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/datetime/', mode='overwrite')" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 26, 514 | "id": "028eb71d", 515 | "metadata": { 516 | "scrolled": true 517 | }, 518 | "outputs": [ 519 | { 520 | "name": "stderr", 521 | "output_type": "stream", 522 | "text": [ 523 | " \r" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "# finally, save journey data into parquet files in s3\n", 529 | "df_journey.write.parquet('s3a://hrc-de-data/processed/cycling-fact/journey/', mode='append')" 530 | ] 531 | } 532 | ], 533 | "metadata": { 534 | "kernelspec": { 535 | "display_name": "Python 3 (ipykernel)", 536 | "language": "python", 537 | "name": "python3" 538 | }, 539 | "language_info": { 540 | "codemirror_mode": { 541 | "name": "ipython", 542 | "version": 3 543 | }, 544 | "file_extension": ".py", 545 | "mimetype": "text/x-python", 546 | "name": "python", 547 | "nbconvert_exporter": "python", 548 | "pygments_lexer": "ipython3", 549 | "version": "3.9.7" 550 | } 551 | }, 552 | "nbformat": 4, 553 | "nbformat_minor": 5 554 | } 555 | -------------------------------------------------------------------------------- /services.md: -------------------------------------------------------------------------------- 1 | ### Creating an EC2 default role 2 | 3 | ```bash 4 | aws emr create-default-roles --region 5 | ``` 6 | 7 | In case the role was already created before but become invalid, checkout [this link](https://aws.amazon.com/premiumsupport/knowledge-center/emr-default-role-invalid/) -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.27" 6 | } 7 | } 8 | 9 | required_version = ">= 0.14.9" 10 | } 11 | 12 | provider "aws" { 13 | profile = "default" 14 | region = "${var.region}" 15 | } 16 | 17 | resource "aws_instance" "de-ec2" { 18 | ami = "${var.ec2_ami}" 19 | instance_type = "t2.micro" 20 | ebs_block_device { 21 | device_name = "/dev/sda1" 22 | volume_size = 10 23 | } 24 | 25 | tags = { 26 | Name = "EC2forDEprojects" 27 | } 28 | } 29 | 30 | resource "aws_s3_bucket" "de-s3" { 31 | bucket = "${var.s3_bucket_name}" 32 | acl = "private" 33 | 34 | tags = { 35 | Name = "S3forDEprojects " 36 | Environment = "Dev" 37 | } 38 | } 39 | 40 | 41 | resource "aws_redshift_cluster" "de-redshift" { 42 | cluster_identifier = "${var.cluster_id}" 43 | database_name = "dev" 44 | 
master_username = "${var.db_credentials_uname}" 45 | master_password = "${var.db_credentials_pwd}" 46 | node_type = "${var.node_type}" 47 | cluster_type = "${var.cluster_type}" 48 | publicly_accessible = false 49 | } -------------------------------------------------------------------------------- /terraform/services.md: -------------------------------------------------------------------------------- 1 | ### Running Terraform 2 | 3 | 4 | #### 1. Initialization 5 | ```bash 6 | terraform init 7 | ``` 8 | 9 | #### 2. Planning 10 | ```bash 11 | terraform plan 12 | ``` 13 | 14 | 15 | #### 3. Applying 16 | ```bash 17 | terraform apply 18 | ``` 19 | -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | default= "eu-west-2" 3 | } 4 | 5 | 6 | variable "ec2_ami" { 7 | default= "ami-0015a39e4b7c0966f" 8 | } 9 | 10 | variable "s3_bucket_name" { 11 | default= "hrc-de-data" 12 | } 13 | 14 | variable "cluster_id" { 15 | default= "redshift-cluster-0" 16 | } 17 | 18 | variable "node_type" { 19 | default= "dc2.large" 20 | } 21 | 22 | variable "cluster_type" { 23 | default= "single-node" 24 | } 25 | 26 | variable "db_credentials_uname" { 27 | default= "awsusr" 28 | } 29 | 30 | variable "db_credentials_pwd" { 31 | default= "Mustbe8charsAndInside.EnvFile" 32 | } 33 | 34 | --------------------------------------------------------------------------------